Skip to content

Commit a882f5a

Browse files
committed
Learnings from truffle.
1 parent a730338 commit a882f5a

File tree

1 file changed (+23, −18 lines)

libc-top-half/musl/src/string/strspn_simd.c

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,34 +14,37 @@
1414
// SIMDized check which bytes are in a set (Geoff Langdale)
1515
// http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html
1616

// This is the same algorithm as truffle from Hyperscan:
// https://github.com/intel/hyperscan/blob/v5.4.2/src/nfa/truffle.c#L64-L81
// https://github.com/intel/hyperscan/blob/v5.4.2/src/nfa/trufflecompile.cpp
1721
typedef struct {
1822
__u8x16 lo;
1923
__u8x16 hi;
2024
} __wasm_v128_bitmap256_t;
2125

2226
__attribute__((always_inline))
23-
static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, int i) {
24-
uint8_t hi_nibble = (uint8_t)i >> 4;
25-
uint8_t lo_nibble = (uint8_t)i & 0xf;
26-
bitmap->lo[lo_nibble] |= (uint8_t)((uint32_t)1 << (hi_nibble - 0));
27-
bitmap->hi[lo_nibble] |= (uint8_t)((uint32_t)1 << (hi_nibble - 8));
27+
static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, uint8_t i) {
28+
uint8_t hi_nibble = i >> 4;
29+
uint8_t lo_nibble = i & 0xf;
30+
bitmap->lo[lo_nibble] |= (uint8_t)(1u << (hi_nibble - 0));
31+
bitmap->hi[lo_nibble] |= (uint8_t)(1u << (hi_nibble - 8));
2832
}
2933

3034
__attribute__((always_inline))
3135
static v128_t __wasm_v128_chkbits(__wasm_v128_bitmap256_t bitmap, v128_t v) {
3236
v128_t hi_nibbles = wasm_u8x16_shr(v, 4);
33-
v128_t bitmask_lookup = wasm_u8x16_const(1, 2, 4, 8, 16, 32, 64, 128, //
34-
1, 2, 4, 8, 16, 32, 64, 128);
37+
v128_t bitmask_lookup = wasm_u64x2_const_splat(0x8040201008040201);
3538
v128_t bitmask = wasm_i8x16_relaxed_swizzle(bitmask_lookup, hi_nibbles);
3639

3740
v128_t indices_0_7 = v & wasm_u8x16_const_splat(0x8f);
3841
v128_t indices_8_15 = indices_0_7 ^ wasm_u8x16_const_splat(0x80);
3942

40-
v128_t row_0_7 = wasm_i8x16_swizzle(bitmap.lo, indices_0_7);
41-
v128_t row_8_15 = wasm_i8x16_swizzle(bitmap.hi, indices_8_15);
43+
v128_t row_0_7 = wasm_i8x16_swizzle((v128_t)bitmap.lo, indices_0_7);
44+
v128_t row_8_15 = wasm_i8x16_swizzle((v128_t)bitmap.hi, indices_8_15);
4245

4346
v128_t bitsets = row_0_7 | row_8_15;
44-
return wasm_i8x16_eq(bitsets & bitmask, bitmask);
47+
return bitsets & bitmask;
4548
}
4649

4750
size_t strspn(const char *s, const char *c)
@@ -90,7 +93,7 @@ size_t strspn(const char *s, const char *c)
9093

9194
for (; *c; c++) {
9295
// Terminator IS NOT on the bitmap.
93-
__wasm_v128_setbit(&bitmap, *c);
96+
__wasm_v128_setbit(&bitmap, (uint8_t)*c);
9497
}
9598

9699
for (;;) {
@@ -102,12 +105,13 @@ size_t strspn(const char *s, const char *c)
102105
: "=r"(v)
103106
: "r"(addr)
104107
: "memory");
105-
v128_t cmp = __wasm_v128_chkbits(bitmap, v);
108+
v128_t found = __wasm_v128_chkbits(bitmap, v);
106109
// Bitmask is slow on AArch64, all_true is much faster.
107-
if (!wasm_i8x16_all_true(cmp)) {
110+
if (!wasm_i8x16_all_true(found)) {
111+
v128_t cmp = wasm_i8x16_eq(found, (v128_t){});
108112
// Clear the bits corresponding to align (little-endian)
109113
// so we can count trailing zeros.
110-
int mask = (uint16_t)~wasm_i8x16_bitmask(cmp) >> align << align;
114+
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
111115
// At least one bit will be set, unless align cleared them.
112116
// Knowing this helps the compiler if it unrolls the loop.
113117
__builtin_assume(mask || align);
@@ -138,7 +142,7 @@ size_t strcspn(const char *s, const char *c)
138142

139143
do {
140144
// Terminator IS on the bitmap.
141-
__wasm_v128_setbit(&bitmap, *c);
145+
__wasm_v128_setbit(&bitmap, (uint8_t)*c);
142146
} while (*c++);
143147

144148
for (;;) {
@@ -150,12 +154,13 @@ size_t strcspn(const char *s, const char *c)
150154
: "=r"(v)
151155
: "r"(addr)
152156
: "memory");
153-
v128_t cmp = __wasm_v128_chkbits(bitmap, v);
157+
v128_t found = __wasm_v128_chkbits(bitmap, v);
154158
// Bitmask is slow on AArch64, any_true is much faster.
155-
if (wasm_v128_any_true(cmp)) {
159+
if (wasm_v128_any_true(found)) {
160+
v128_t cmp = wasm_i8x16_eq(found, (v128_t){});
156161
// Clear the bits corresponding to align (little-endian)
157162
// so we can count trailing zeros.
158-
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
163+
int mask = (uint16_t)~wasm_i8x16_bitmask(cmp) >> align << align;
159164
// At least one bit will be set, unless align cleared them.
160165
// Knowing this helps the compiler if it unrolls the loop.
161166
__builtin_assume(mask || align);

0 commit comments

Comments
 (0)