// SIMDized check which bytes are in a set (Geoff Langdale)
// http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html

// This is the same algorithm as truffle from Hyperscan:
// https://github.com/intel/hyperscan/blob/v5.4.2/src/nfa/truffle.c#L64-L81
// https://github.com/intel/hyperscan/blob/v5.4.2/src/nfa/trufflecompile.cpp

1721typedef struct {
1822 __u8x16 lo ;
1923 __u8x16 hi ;
2024} __wasm_v128_bitmap256_t ;
2125
2226__attribute__((always_inline ))
23- static void __wasm_v128_setbit (__wasm_v128_bitmap256_t * bitmap , int i ) {
24- uint8_t hi_nibble = ( uint8_t ) i >> 4 ;
25- uint8_t lo_nibble = ( uint8_t ) i & 0xf ;
26- bitmap -> lo [lo_nibble ] |= (uint8_t )(( uint32_t ) 1 << (hi_nibble - 0 ));
27- bitmap -> hi [lo_nibble ] |= (uint8_t )(( uint32_t ) 1 << (hi_nibble - 8 ));
27+ static void __wasm_v128_setbit (__wasm_v128_bitmap256_t * bitmap , uint8_t i ) {
28+ uint8_t hi_nibble = i >> 4 ;
29+ uint8_t lo_nibble = i & 0xf ;
30+ bitmap -> lo [lo_nibble ] |= (uint8_t )(1u << (hi_nibble - 0 ));
31+ bitmap -> hi [lo_nibble ] |= (uint8_t )(1u << (hi_nibble - 8 ));
2832}
2933
3034__attribute__((always_inline ))
3135static v128_t __wasm_v128_chkbits (__wasm_v128_bitmap256_t bitmap , v128_t v ) {
3236 v128_t hi_nibbles = wasm_u8x16_shr (v , 4 );
33- v128_t bitmask_lookup = wasm_u8x16_const (1 , 2 , 4 , 8 , 16 , 32 , 64 , 128 , //
34- 1 , 2 , 4 , 8 , 16 , 32 , 64 , 128 );
37+ v128_t bitmask_lookup = wasm_u64x2_const_splat (0x8040201008040201 );
3538 v128_t bitmask = wasm_i8x16_relaxed_swizzle (bitmask_lookup , hi_nibbles );
3639
3740 v128_t indices_0_7 = v & wasm_u8x16_const_splat (0x8f );
3841 v128_t indices_8_15 = indices_0_7 ^ wasm_u8x16_const_splat (0x80 );
3942
40- v128_t row_0_7 = wasm_i8x16_swizzle (bitmap .lo , indices_0_7 );
41- v128_t row_8_15 = wasm_i8x16_swizzle (bitmap .hi , indices_8_15 );
43+ v128_t row_0_7 = wasm_i8x16_swizzle (( v128_t ) bitmap .lo , indices_0_7 );
44+ v128_t row_8_15 = wasm_i8x16_swizzle (( v128_t ) bitmap .hi , indices_8_15 );
4245
4346 v128_t bitsets = row_0_7 | row_8_15 ;
44- return wasm_i8x16_eq ( bitsets & bitmask , bitmask ) ;
47+ return bitsets & bitmask ;
4548}
4649
4750size_t strspn (const char * s , const char * c )
@@ -90,7 +93,7 @@ size_t strspn(const char *s, const char *c)
9093
9194 for (; * c ; c ++ ) {
9295 // Terminator IS NOT on the bitmap.
93- __wasm_v128_setbit (& bitmap , * c );
96+ __wasm_v128_setbit (& bitmap , ( uint8_t ) * c );
9497 }
9598
9699 for (;;) {
@@ -102,12 +105,13 @@ size_t strspn(const char *s, const char *c)
102105 : "=r" (v )
103106 : "r" (addr )
104107 : "memory" );
105- v128_t cmp = __wasm_v128_chkbits (bitmap , v );
108+ v128_t found = __wasm_v128_chkbits (bitmap , v );
106109 // Bitmask is slow on AArch64, all_true is much faster.
107- if (!wasm_i8x16_all_true (cmp )) {
110+ if (!wasm_i8x16_all_true (found )) {
111+ v128_t cmp = wasm_i8x16_eq (found , (v128_t ){});
108112 // Clear the bits corresponding to align (little-endian)
109113 // so we can count trailing zeros.
110- int mask = ( uint16_t )~ wasm_i8x16_bitmask (cmp ) >> align << align ;
114+ int mask = wasm_i8x16_bitmask (cmp ) >> align << align ;
111115 // At least one bit will be set, unless align cleared them.
112116 // Knowing this helps the compiler if it unrolls the loop.
113117 __builtin_assume (mask || align );
@@ -138,7 +142,7 @@ size_t strcspn(const char *s, const char *c)
138142
139143 do {
140144 // Terminator IS on the bitmap.
141- __wasm_v128_setbit (& bitmap , * c );
145+ __wasm_v128_setbit (& bitmap , ( uint8_t ) * c );
142146 } while (* c ++ );
143147
144148 for (;;) {
@@ -150,12 +154,13 @@ size_t strcspn(const char *s, const char *c)
150154 : "=r" (v )
151155 : "r" (addr )
152156 : "memory" );
153- v128_t cmp = __wasm_v128_chkbits (bitmap , v );
157+ v128_t found = __wasm_v128_chkbits (bitmap , v );
154158 // Bitmask is slow on AArch64, any_true is much faster.
155- if (wasm_v128_any_true (cmp )) {
159+ if (wasm_v128_any_true (found )) {
160+ v128_t cmp = wasm_i8x16_eq (found , (v128_t ){});
156161 // Clear the bits corresponding to align (little-endian)
157162 // so we can count trailing zeros.
158- int mask = wasm_i8x16_bitmask (cmp ) >> align << align ;
163+ int mask = ( uint16_t )~ wasm_i8x16_bitmask (cmp ) >> align << align ;
159164 // At least one bit will be set, unless align cleared them.
160165 // Knowing this helps the compiler if it unrolls the loop.
161166 __builtin_assume (mask || align );
0 commit comments