Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions Lib/test/test_bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,6 +584,87 @@ def test_hex_separator_six_bytes(self):
self.assertEqual(six_bytes.hex(':', -6), '0306090c0f12')
self.assertEqual(six_bytes.hex(' ', -95), '0306090c0f12')

def test_hex_simd_boundaries(self):
# Test lengths around the SIMD threshold (16 bytes).
# SIMD processes 16 bytes at a time; smaller inputs use scalar code.
for length in (14, 15, 16, 17, 31, 32, 33, 64, 65):
data = self.type2test(bytes(range(length)))
expected = ''.join(f'{b:02x}' for b in range(length))
with self.subTest(length=length):
self.assertEqual(data.hex(), expected)

def test_hex_nibble_boundaries(self):
# Test the nibble value boundary at 9/10 (where '9' becomes 'a').
# SIMD uses signed comparison for efficiency; verify correctness
# at this boundary for various nibble combinations.
boundary_bytes = self.type2test(bytes([
0x09, # both nibbles: 0, 9
0x0a, # both nibbles: 0, 10
0x90, # both nibbles: 9, 0
0x99, # both nibbles: 9, 9 (max all-digit)
0x9a, # both nibbles: 9, 10
0xa0, # both nibbles: 10, 0
0xa9, # both nibbles: 10, 9
0xaa, # both nibbles: 10, 10 (min all-letter)
0x00, # min value
0xff, # max value
]))
self.assertEqual(boundary_bytes.hex(), '090a90999aa0a9aa00ff')

# Repeat with 16+ bytes to exercise SIMD path
simd_boundary = self.type2test(boundary_bytes * 2)
self.assertEqual(simd_boundary.hex(), '090a90999aa0a9aa00ff' * 2)

def test_hex_simd_separator(self):
# Test SIMD path for separator insertion (sep >= 8 bytes, len >= 16).
# SIMD hexlifies then shuffles in-place to insert separators.

# 32 bytes exercises SIMD; test various separator group sizes
data = self.type2test(bytes(range(32)))

# bytes_per_sep=8: 4 groups of 8 bytes, 3 separators
self.assertEqual(
data.hex('-', 8),
'0001020304050607-08090a0b0c0d0e0f-'
'1011121314151617-18191a1b1c1d1e1f'
)
# bytes_per_sep=9: groups of 9 from start, 5 byte remainder at end
self.assertEqual(
data.hex('.', 9),
'0001020304.05060708090a0b0c0d.'
'0e0f10111213141516.1718191a1b1c1d1e1f'
)
# bytes_per_sep=16: 2 groups of 16 bytes
self.assertEqual(
data.hex(' ', 16),
'000102030405060708090a0b0c0d0e0f '
'101112131415161718191a1b1c1d1e1f'
)
# Negative bytes_per_sep: groups from end, remainder at start
self.assertEqual(
data.hex('|', -8),
'0001020304050607|08090a0b0c0d0e0f|'
'1011121314151617|18191a1b1c1d1e1f'
)
self.assertEqual(
data.hex('_', -9),
'000102030405060708_090a0b0c0d0e0f1011_'
'12131415161718191a_1b1c1d1e1f'
)

# 20 bytes: SIMD (16) + 4 byte scalar remainder
data20 = self.type2test(bytes(range(20)))
# Positive: groups from start, remainder at end
self.assertEqual(
data20.hex('#', 8),
'00010203#0405060708090a0b#0c0d0e0f10111213'
)
# Negative: groups from end, remainder at start
self.assertEqual(
data20.hex('@', -8),
'0001020304050607@08090a0b0c0d0e0f@10111213'
)

def test_join(self):
self.assertEqual(self.type2test(b"").join([]), b"")
self.assertEqual(self.type2test(b"").join([b""]), b"")
Expand Down
183 changes: 177 additions & 6 deletions Python/pystrhex.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,126 @@
#include "pycore_strhex.h" // _Py_strhex_with_sep()
#include "pycore_unicodeobject.h" // _PyUnicode_CheckConsistency()

/* Scalar hexlify: convert len bytes to 2*len hex characters.
   Uses table lookup via Py_hexdigits for the conversion. */
static inline void
_Py_hexlify_scalar(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    /* Alternatives (arithmetic instead of a table lookup, manual loop
       unrolling, caching the global table pointer locally, wider dst
       stores, a 256-entry uint16_t table) were benchmarked; on gcc 15
       none beat this simple form, so keep the obvious code. */
    for (Py_ssize_t i = 0; i < len; i++) {
        unsigned char byte = src[i];
        dst[0] = Py_hexdigits[byte >> 4];
        dst[1] = Py_hexdigits[byte & 0x0f];
        dst += 2;
    }
}

/* Portable SIMD optimization for hexlify using GCC/Clang vector extensions.
   Uses __builtin_shufflevector for portable interleave that compiles to
   native SIMD instructions (SSE2 punpcklbw/punpckhbw on x86-64 [always],
   NEON zip1/zip2 on ARM64 [always], & vzip on ARM32 when compiler flags
   for the target microarch allow it [try -march=native if running 32-bit
   on an rpi3 or later]).
   Requirements:
     - GCC 12+ or Clang 3.0+ (for __builtin_shufflevector)
     - x86-64, ARM64, or ARM32 with NEON
   Performance:
     - Up to 11x faster on larger data than the scalar code.
     - For more common small data it varies between 1.1-3x faster.
   Even faster is possible for big data using AVX2 or AVX512 but
   that adds complication.  Honestly, who really hexes _huge_ data?!
   Speeding up the 16-64 byte cases fits nicely with md5 through sha512.
*/
#if (defined(__x86_64__) || defined(__aarch64__) || \
     (defined(__arm__) && defined(__ARM_NEON))) && \
    (defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 12))
#  define PY_HEXLIFY_CAN_COMPILE_SIMD 1
#else
#  define PY_HEXLIFY_CAN_COMPILE_SIMD 0
#endif

#if PY_HEXLIFY_CAN_COMPILE_SIMD

/* 128-bit vector of 16 unsigned bytes */
typedef unsigned char v16u8 __attribute__((vector_size(16)));
/* 128-bit vector of 16 signed bytes - for efficient comparison.
   Using signed comparison generates pcmpgtb on x86-64 instead of
   the slower psubusb+pcmpeqb sequence from unsigned comparison.
   ARM NEON performs the same either way. */
typedef signed char v16s8 __attribute__((vector_size(16)));

/* Broadcast one byte value into every lane of an unsigned vector. */
static inline v16u8
v16u8_splat(unsigned char x)
{
    const v16u8 v = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
    return v;
}

/* Broadcast one byte value into every lane of a signed vector. */
static inline v16s8
v16s8_splat(signed char x)
{
    const v16s8 v = {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x};
    return v;
}

/* Portable SIMD hexlify: converts 16 bytes to 32 hex chars per iteration.
   Compiles to native SSE2 on x86-64, NEON on ARM64 (and some ARM32).
   Writes exactly 2*len output bytes (no NUL terminator); any tail of
   fewer than 16 input bytes is handled by the scalar helper. */
static void
_Py_hexlify_simd(const unsigned char *src, Py_UCS1 *dst, Py_ssize_t len)
{
    const v16u8 mask_0f = v16u8_splat(0x0f);
    const v16u8 ascii_0 = v16u8_splat('0');
    /* Gap between the digit run ('0'-'9') and the letter run ('a'-'f'). */
    const v16u8 offset = v16u8_splat('a' - '0' - 10); /* 0x27 */
    const v16s8 nine = v16s8_splat(9);

    Py_ssize_t i = 0;

    /* Process 16 bytes at a time */
    for (; i + 16 <= len; i += 16, dst += 32) {
        /* Load 16 bytes (memcpy for safe unaligned access) */
        v16u8 data;
        memcpy(&data, src + i, 16);

        /* Extract high and low nibbles using vector operators */
        v16u8 hi = (data >> 4) & mask_0f;
        v16u8 lo = data & mask_0f;

        /* Compare > 9 using signed comparison for efficient codegen.
           Nibble values 0-15 are safely in signed byte range.
           This generates pcmpgtb on x86-64, avoiding the slower
           psubusb+pcmpeqb sequence from unsigned comparison.
           Per the vector-extension semantics, true lanes compare to
           all-ones (0xff) and false lanes to 0, so the masks below
           select `offset` per lane. */
        v16u8 hi_gt9 = (v16u8)((v16s8)hi > nine);
        v16u8 lo_gt9 = (v16u8)((v16s8)lo > nine);

        /* Convert nibbles to hex ASCII: '0'+n for 0-9, and an extra
           0x27 only where the lane's nibble exceeded 9 ('a'-'f'). */
        hi = hi + ascii_0 + (hi_gt9 & offset);
        lo = lo + ascii_0 + (lo_gt9 & offset);

        /* Interleave hi/lo nibbles using portable shufflevector.
           This compiles to punpcklbw/punpckhbw on x86-64, zip1/zip2 on ARM64,
           or vzip on ARM32.  Indices 0-15 pick from `hi`, 16-31 from `lo`,
           alternating so each output pair is (hi digit, lo digit). */
        v16u8 result0 = __builtin_shufflevector(hi, lo,
            0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
        v16u8 result1 = __builtin_shufflevector(hi, lo,
            8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);

        /* Store 32 hex characters */
        memcpy(dst, &result0, 16);
        memcpy(dst + 16, &result1, 16);
    }

    /* Scalar fallback for remaining 0-15 bytes */
    _Py_hexlify_scalar(src + i, dst, len - i);
}

#endif /* PY_HEXLIFY_CAN_COMPILE_SIMD */

static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
PyObject* sep, int bytes_per_sep_group,
const int return_bytes)
Expand Down Expand Up @@ -82,20 +202,67 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
unsigned char c;

if (bytes_per_sep_group == 0) {
for (i = j = 0; i < arglen; ++i) {
assert((j + 1) < resultlen);
c = argbuf[i];
retbuf[j++] = Py_hexdigits[c >> 4];
retbuf[j++] = Py_hexdigits[c & 0x0f];
#if PY_HEXLIFY_CAN_COMPILE_SIMD
if (arglen >= 16) {
// little vector units go brrrr...
_Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);
}
else
#endif
{
_Py_hexlify_scalar((const unsigned char *)argbuf, retbuf, arglen);
}
assert(j == resultlen);
}
else {
/* The number of complete chunk+sep periods */
Py_ssize_t chunks = (arglen - 1) / abs_bytes_per_sep;
Py_ssize_t chunk;
unsigned int k;

#if PY_HEXLIFY_CAN_COMPILE_SIMD
/* SIMD path for separator groups >= 8 bytes.
SIMD hexlify to output buffer, then shuffle in-place to insert
separators. Working backwards avoids overlap issues since we're
expanding (destination index >= source index). */
if (abs_bytes_per_sep >= 8 && arglen >= 16) {
/* SIMD hexlify all bytes to start of output buffer */
_Py_hexlify_simd((const unsigned char *)argbuf, retbuf, arglen);

/* Shuffle in-place, working backwards */
Py_ssize_t hex_chunk_size = 2 * (Py_ssize_t)abs_bytes_per_sep;
Py_ssize_t remainder_bytes = arglen - chunks * (Py_ssize_t)abs_bytes_per_sep;
Py_ssize_t remainder_hex_len = 2 * remainder_bytes;
Py_ssize_t hex_pos = 2 * arglen; /* End of hex data */
Py_ssize_t out_pos = resultlen; /* End of output */

if (bytes_per_sep_group < 0) {
/* Forward: remainder at end, separators after each chunk */
if (remainder_hex_len > 0) {
hex_pos -= remainder_hex_len;
out_pos -= remainder_hex_len;
memmove(retbuf + out_pos, retbuf + hex_pos, remainder_hex_len);
}
for (Py_ssize_t c = chunks - 1; c >= 0; c--) {
retbuf[--out_pos] = sep_char;
hex_pos -= hex_chunk_size;
out_pos -= hex_chunk_size;
memmove(retbuf + out_pos, retbuf + hex_pos, hex_chunk_size);
}
}
else {
/* Backward: remainder at start, separators before each chunk */
for (Py_ssize_t c = chunks - 1; c >= 0; c--) {
hex_pos -= hex_chunk_size;
out_pos -= hex_chunk_size;
memmove(retbuf + out_pos, retbuf + hex_pos, hex_chunk_size);
retbuf[--out_pos] = sep_char;
}
/* Remainder at start stays in place (hex_pos == out_pos == remainder_hex_len) */
}
goto done_hexlify;
}
#endif /* PY_HEXLIFY_CAN_COMPILE_SIMD */

if (bytes_per_sep_group < 0) {
i = j = 0;
for (chunk = 0; chunk < chunks; chunk++) {
Expand Down Expand Up @@ -133,6 +300,10 @@ static PyObject *_Py_strhex_impl(const char* argbuf, const Py_ssize_t arglen,
}
}

#if PY_HEXLIFY_CAN_COMPILE_SIMD
done_hexlify:
#endif

#ifdef Py_DEBUG
if (!return_bytes) {
assert(_PyUnicode_CheckConsistency(retval, 1));
Expand Down
Loading