Skip to content

Commit a882f5a

Browse files
committed
Learnings from truffle.
1 parent a730338 commit a882f5a

File tree

1 file changed (+23, −18 lines)

libc-top-half/musl/src/string/strspn_simd.c

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -14,34 +14,37 @@
1414
// SIMDized check which bytes are in a set (Geoff Langdale)
1515
// http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html
1616

// This is the same algorithm as truffle from Hyperscan:
// https://github.com/intel/hyperscan/blob/v5.4.2/src/nfa/truffle.c#L64-L81
// https://github.com/intel/hyperscan/blob/v5.4.2/src/nfa/trufflecompile.cpp
1721
typedef struct {
1822
__u8x16 lo;
1923
__u8x16 hi;
2024
} __wasm_v128_bitmap256_t;
2125

2226
__attribute__((always_inline))
23-
static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, int i) {
24-
uint8_t hi_nibble = (uint8_t)i >> 4;
25-
uint8_t lo_nibble = (uint8_t)i & 0xf;
26-
bitmap->lo[lo_nibble] |= (uint8_t)((uint32_t)1 << (hi_nibble - 0));
27-
bitmap->hi[lo_nibble] |= (uint8_t)((uint32_t)1 << (hi_nibble - 8));
27+
static void __wasm_v128_setbit(__wasm_v128_bitmap256_t *bitmap, uint8_t i) {
28+
uint8_t hi_nibble = i >> 4;
29+
uint8_t lo_nibble = i & 0xf;
30+
bitmap->lo[lo_nibble] |= (uint8_t)(1u << (hi_nibble - 0));
31+
bitmap->hi[lo_nibble] |= (uint8_t)(1u << (hi_nibble - 8));
2832
}
2933

3034
__attribute__((always_inline))
3135
static v128_t __wasm_v128_chkbits(__wasm_v128_bitmap256_t bitmap, v128_t v) {
3236
v128_t hi_nibbles = wasm_u8x16_shr(v, 4);
33-
v128_t bitmask_lookup = wasm_u8x16_const(1, 2, 4, 8, 16, 32, 64, 128, //
34-
1, 2, 4, 8, 16, 32, 64, 128);
37+
v128_t bitmask_lookup = wasm_u64x2_const_splat(0x8040201008040201);
3538
v128_t bitmask = wasm_i8x16_relaxed_swizzle(bitmask_lookup, hi_nibbles);
3639

3740
v128_t indices_0_7 = v & wasm_u8x16_const_splat(0x8f);
3841
v128_t indices_8_15 = indices_0_7 ^ wasm_u8x16_const_splat(0x80);
3942

40-
v128_t row_0_7 = wasm_i8x16_swizzle(bitmap.lo, indices_0_7);
41-
v128_t row_8_15 = wasm_i8x16_swizzle(bitmap.hi, indices_8_15);
43+
v128_t row_0_7 = wasm_i8x16_swizzle((v128_t)bitmap.lo, indices_0_7);
44+
v128_t row_8_15 = wasm_i8x16_swizzle((v128_t)bitmap.hi, indices_8_15);
4245

4346
v128_t bitsets = row_0_7 | row_8_15;
44-
return wasm_i8x16_eq(bitsets & bitmask, bitmask);
47+
return bitsets & bitmask;
4548
}
4649

4750
size_t strspn(const char *s, const char *c)
@@ -90,7 +93,7 @@ size_t strspn(const char *s, const char *c)
9093

9194
for (; *c; c++) {
9295
// Terminator IS NOT on the bitmap.
93-
__wasm_v128_setbit(&bitmap, *c);
96+
__wasm_v128_setbit(&bitmap, (uint8_t)*c);
9497
}
9598

9699
for (;;) {
@@ -102,12 +105,13 @@ size_t strspn(const char *s, const char *c)
102105
: "=r"(v)
103106
: "r"(addr)
104107
: "memory");
105-
v128_t cmp = __wasm_v128_chkbits(bitmap, v);
108+
v128_t found = __wasm_v128_chkbits(bitmap, v);
106109
// Bitmask is slow on AArch64, all_true is much faster.
107-
if (!wasm_i8x16_all_true(cmp)) {
110+
if (!wasm_i8x16_all_true(found)) {
111+
v128_t cmp = wasm_i8x16_eq(found, (v128_t){});
108112
// Clear the bits corresponding to align (little-endian)
109113
// so we can count trailing zeros.
110-
int mask = (uint16_t)~wasm_i8x16_bitmask(cmp) >> align << align;
114+
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
111115
// At least one bit will be set, unless align cleared them.
112116
// Knowing this helps the compiler if it unrolls the loop.
113117
__builtin_assume(mask || align);
@@ -138,7 +142,7 @@ size_t strcspn(const char *s, const char *c)
138142

139143
do {
140144
// Terminator IS on the bitmap.
141-
__wasm_v128_setbit(&bitmap, *c);
145+
__wasm_v128_setbit(&bitmap, (uint8_t)*c);
142146
} while (*c++);
143147

144148
for (;;) {
@@ -150,12 +154,13 @@ size_t strcspn(const char *s, const char *c)
150154
: "=r"(v)
151155
: "r"(addr)
152156
: "memory");
153-
v128_t cmp = __wasm_v128_chkbits(bitmap, v);
157+
v128_t found = __wasm_v128_chkbits(bitmap, v);
154158
// Bitmask is slow on AArch64, any_true is much faster.
155-
if (wasm_v128_any_true(cmp)) {
159+
if (wasm_v128_any_true(found)) {
160+
v128_t cmp = wasm_i8x16_eq(found, (v128_t){});
156161
// Clear the bits corresponding to align (little-endian)
157162
// so we can count trailing zeros.
158-
int mask = wasm_i8x16_bitmask(cmp) >> align << align;
163+
int mask = (uint16_t)~wasm_i8x16_bitmask(cmp) >> align << align;
159164
// At least one bit will be set, unless align cleared them.
160165
// Knowing this helps the compiler if it unrolls the loop.
161166
__builtin_assume(mask || align);

0 commit comments

Comments
 (0)