Skip to content

Commit b7a3309

Browse files
Subhadeep Karan and facebook-github-bot
authored and committed
Split ScalarQuantizer code into independent parts (facebookresearch#4557)
Summary: Pull Request resolved: facebookresearch#4557 Pull Request resolved: facebookresearch#4296 Splits the ScalarQuantizer code into parts so that the AVX2 and AVX512 variants can be compiled independently. Reviewed By: mnorris11 Differential Revision: D73037185
1 parent 6604082 commit b7a3309

File tree

12 files changed

+1901
-1731
lines changed

12 files changed

+1901
-1731
lines changed

faiss/impl/ScalarQuantizer.cpp

Lines changed: 37 additions & 557 deletions
Large diffs are not rendered by default.

faiss/impl/code_distance/code_distance-avx512.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ struct PQCodeDistance<PQDecoder8, SIMDLevel::AVX512> {
192192
};
193193

194194
// explicit template instanciations
195-
// template struct PQCodeDistance<PQDecoder8, SIMDLevel::AVX512F>;
195+
// template struct PQCodeDistance<PQDecoder8, SIMDLevel::AVX512>;
196196

197197
// these two will automatically use the generic implementation
198198
template struct PQCodeDistance<PQDecoder16, SIMDLevel::AVX512>;

faiss/impl/scalar_quantizer/codecs.h

Lines changed: 16 additions & 206 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#pragma once
99

1010
#include <faiss/impl/ScalarQuantizer.h>
11+
#include <faiss/utils/simd_levels.h>
1112

1213
namespace faiss {
1314

@@ -19,7 +20,17 @@ namespace scalar_quantizer {
1920
* index).
2021
*/
2122

22-
struct Codec8bit {
23+
template <SIMDLevel>
24+
struct Codec8bit {};
25+
26+
template <SIMDLevel>
27+
struct Codec4bit {};
28+
29+
template <SIMDLevel>
30+
struct Codec6bit {};
31+
32+
template <>
33+
struct Codec8bit<SIMDLevel::NONE> {
2334
static FAISS_ALWAYS_INLINE void encode_component(
2435
float x,
2536
uint8_t* code,
@@ -32,45 +43,9 @@ struct Codec8bit {
3243
int i) {
3344
return (code[i] + 0.5f) / 255.0f;
3445
}
35-
36-
#if defined(__AVX512F__)
37-
static FAISS_ALWAYS_INLINE simd16float32
38-
decode_16_components(const uint8_t* code, int i) {
39-
const __m128i c16 = _mm_loadu_si128((__m128i*)(code + i));
40-
const __m512i i32 = _mm512_cvtepu8_epi32(c16);
41-
const __m512 f16 = _mm512_cvtepi32_ps(i32);
42-
const __m512 half_one_255 = _mm512_set1_ps(0.5f / 255.f);
43-
const __m512 one_255 = _mm512_set1_ps(1.f / 255.f);
44-
return simd16float32(_mm512_fmadd_ps(f16, one_255, half_one_255));
45-
}
46-
#elif defined(__AVX2__)
47-
static FAISS_ALWAYS_INLINE simd8float32
48-
decode_8_components(const uint8_t* code, int i) {
49-
const uint64_t c8 = *(uint64_t*)(code + i);
50-
51-
const __m128i i8 = _mm_set1_epi64x(c8);
52-
const __m256i i32 = _mm256_cvtepu8_epi32(i8);
53-
const __m256 f8 = _mm256_cvtepi32_ps(i32);
54-
const __m256 half_one_255 = _mm256_set1_ps(0.5f / 255.f);
55-
const __m256 one_255 = _mm256_set1_ps(1.f / 255.f);
56-
return simd8float32(_mm256_fmadd_ps(f8, one_255, half_one_255));
57-
}
58-
#endif
59-
60-
#ifdef USE_NEON
61-
static FAISS_ALWAYS_INLINE decode_8_components(const uint8_t* code, int i) {
62-
float32_t result[8] = {};
63-
for (size_t j = 0; j < 8; j++) {
64-
result[j] = decode_component(code, i + j);
65-
}
66-
float32x4_t res1 = vld1q_f32(result);
67-
float32x4_t res2 = vld1q_f32(result + 4);
68-
return simd8float32(float32x4x2_t{res1, res2});
69-
}
70-
#endif
7146
};
72-
73-
struct Codec4bit {
47+
template <>
48+
struct Codec4bit<SIMDLevel::NONE> {
7449
static FAISS_ALWAYS_INLINE void encode_component(
7550
float x,
7651
uint8_t* code,
@@ -83,64 +58,10 @@ struct Codec4bit {
8358
int i) {
8459
return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f;
8560
}
86-
87-
#if defined(__AVX512F__)
88-
static FAISS_ALWAYS_INLINE simd16float32
89-
decode_16_components(const uint8_t* code, int i) {
90-
uint64_t c8 = *(uint64_t*)(code + (i >> 1));
91-
uint64_t mask = 0x0f0f0f0f0f0f0f0f;
92-
uint64_t c8ev = c8 & mask;
93-
uint64_t c8od = (c8 >> 4) & mask;
94-
95-
__m128i c16 =
96-
_mm_unpacklo_epi8(_mm_set1_epi64x(c8ev), _mm_set1_epi64x(c8od));
97-
__m256i c8lo = _mm256_cvtepu8_epi32(c16);
98-
__m256i c8hi = _mm256_cvtepu8_epi32(_mm_srli_si128(c16, 8));
99-
__m512i i16 = _mm512_castsi256_si512(c8lo);
100-
i16 = _mm512_inserti32x8(i16, c8hi, 1);
101-
__m512 f16 = _mm512_cvtepi32_ps(i16);
102-
const __m512 half_one_255 = _mm512_set1_ps(0.5f / 15.f);
103-
const __m512 one_255 = _mm512_set1_ps(1.f / 15.f);
104-
return simd16float32(_mm512_fmadd_ps(f16, one_255, half_one_255));
105-
}
106-
#elif defined(__AVX2__)
107-
static FAISS_ALWAYS_INLINE simd8float32
108-
decode_8_components(const uint8_t* code, int i) {
109-
uint32_t c4 = *(uint32_t*)(code + (i >> 1));
110-
uint32_t mask = 0x0f0f0f0f;
111-
uint32_t c4ev = c4 & mask;
112-
uint32_t c4od = (c4 >> 4) & mask;
113-
114-
// the 8 lower bytes of c8 contain the values
115-
__m128i c8 =
116-
_mm_unpacklo_epi8(_mm_set1_epi32(c4ev), _mm_set1_epi32(c4od));
117-
__m128i c4lo = _mm_cvtepu8_epi32(c8);
118-
__m128i c4hi = _mm_cvtepu8_epi32(_mm_srli_si128(c8, 4));
119-
__m256i i8 = _mm256_castsi128_si256(c4lo);
120-
i8 = _mm256_insertf128_si256(i8, c4hi, 1);
121-
__m256 f8 = _mm256_cvtepi32_ps(i8);
122-
__m256 half = _mm256_set1_ps(0.5f);
123-
f8 = _mm256_add_ps(f8, half);
124-
__m256 one_255 = _mm256_set1_ps(1.f / 15.f);
125-
return simd8float32(_mm256_mul_ps(f8, one_255));
126-
}
127-
#endif
128-
129-
#ifdef USE_NEON
130-
static FAISS_ALWAYS_INLINE simd8float32
131-
decode_8_components(const uint8_t* code, int i) {
132-
float32_t result[8] = {};
133-
for (size_t j = 0; j < 8; j++) {
134-
result[j] = decode_component(code, i + j);
135-
}
136-
float32x4_t res1 = vld1q_f32(result);
137-
float32x4_t res2 = vld1q_f32(result + 4);
138-
return simd8float32({res1, res2});
139-
}
140-
#endif
14161
};
14262

143-
struct Codec6bit {
63+
template <>
64+
struct Codec6bit<SIMDLevel::NONE> {
14465
static FAISS_ALWAYS_INLINE void encode_component(
14566
float x,
14667
uint8_t* code,
@@ -188,117 +109,6 @@ struct Codec6bit {
188109
}
189110
return (bits + 0.5f) / 63.0f;
190111
}
191-
192-
#if defined(__AVX512F__)
193-
194-
static FAISS_ALWAYS_INLINE simd16float32
195-
decode_16_components(const uint8_t* code, int i) {
196-
// pure AVX512 implementation (not necessarily the fastest).
197-
// see:
198-
// https://github.com/zilliztech/knowhere/blob/main/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h
199-
200-
// clang-format off
201-
202-
// 16 components, 16x6 bit=12 bytes
203-
const __m128i bit_6v =
204-
_mm_maskz_loadu_epi8(0b0000111111111111, code + (i >> 2) * 3);
205-
const __m256i bit_6v_256 = _mm256_broadcast_i32x4(bit_6v);
206-
207-
// 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
208-
// 00 01 02 03
209-
const __m256i shuffle_mask = _mm256_setr_epi16(
210-
0xFF00, 0x0100, 0x0201, 0xFF02,
211-
0xFF03, 0x0403, 0x0504, 0xFF05,
212-
0xFF06, 0x0706, 0x0807, 0xFF08,
213-
0xFF09, 0x0A09, 0x0B0A, 0xFF0B);
214-
const __m256i shuffled = _mm256_shuffle_epi8(bit_6v_256, shuffle_mask);
215-
216-
// 0: xxxxxxxx xx543210
217-
// 1: xxxx5432 10xxxxxx
218-
// 2: xxxxxx54 3210xxxx
219-
// 3: xxxxxxxx 543210xx
220-
const __m256i shift_right_v = _mm256_setr_epi16(
221-
0x0U, 0x6U, 0x4U, 0x2U,
222-
0x0U, 0x6U, 0x4U, 0x2U,
223-
0x0U, 0x6U, 0x4U, 0x2U,
224-
0x0U, 0x6U, 0x4U, 0x2U);
225-
__m256i shuffled_shifted = _mm256_srlv_epi16(shuffled, shift_right_v);
226-
227-
// remove unneeded bits
228-
shuffled_shifted =
229-
_mm256_and_si256(shuffled_shifted, _mm256_set1_epi16(0x003F));
230-
231-
// scale
232-
const __m512 f8 =
233-
_mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(shuffled_shifted));
234-
const __m512 half_one_255 = _mm512_set1_ps(0.5f / 63.f);
235-
const __m512 one_255 = _mm512_set1_ps(1.f / 63.f);
236-
return simd16float32(_mm512_fmadd_ps(f8, one_255, half_one_255));
237-
238-
// clang-format on
239-
}
240-
241-
#elif defined(__AVX2__)
242-
243-
/* Load 6 bytes that represent 8 6-bit values, return them as a
244-
* 8*32 bit vector register */
245-
static FAISS_ALWAYS_INLINE __m256i load6(const uint16_t* code16) {
246-
const __m128i perm = _mm_set_epi8(
247-
-1, 5, 5, 4, 4, 3, -1, 3, -1, 2, 2, 1, 1, 0, -1, 0);
248-
const __m256i shifts = _mm256_set_epi32(2, 4, 6, 0, 2, 4, 6, 0);
249-
250-
// load 6 bytes
251-
__m128i c1 =
252-
_mm_set_epi16(0, 0, 0, 0, 0, code16[2], code16[1], code16[0]);
253-
254-
// put in 8 * 32 bits
255-
__m128i c2 = _mm_shuffle_epi8(c1, perm);
256-
__m256i c3 = _mm256_cvtepi16_epi32(c2);
257-
258-
// shift and mask out useless bits
259-
__m256i c4 = _mm256_srlv_epi32(c3, shifts);
260-
__m256i c5 = _mm256_and_si256(_mm256_set1_epi32(63), c4);
261-
return c5;
262-
}
263-
264-
static FAISS_ALWAYS_INLINE simd8float32
265-
decode_8_components(const uint8_t* code, int i) {
266-
// // Faster code for Intel CPUs or AMD Zen3+, just keeping it here
267-
// // for the reference, maybe, it becomes used oned day.
268-
// const uint16_t* data16 = (const uint16_t*)(code + (i >> 2) * 3);
269-
// const uint32_t* data32 = (const uint32_t*)data16;
270-
// const uint64_t val = *data32 + ((uint64_t)data16[2] << 32);
271-
// const uint64_t vext = _pdep_u64(val, 0x3F3F3F3F3F3F3F3FULL);
272-
// const __m128i i8 = _mm_set1_epi64x(vext);
273-
// const __m256i i32 = _mm256_cvtepi8_epi32(i8);
274-
// const __m256 f8 = _mm256_cvtepi32_ps(i32);
275-
// const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f);
276-
// const __m256 one_255 = _mm256_set1_ps(1.f / 63.f);
277-
// return _mm256_fmadd_ps(f8, one_255, half_one_255);
278-
279-
__m256i i8 = load6((const uint16_t*)(code + (i >> 2) * 3));
280-
__m256 f8 = _mm256_cvtepi32_ps(i8);
281-
// this could also be done with bit manipulations but it is
282-
// not obviously faster
283-
const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f);
284-
const __m256 one_255 = _mm256_set1_ps(1.f / 63.f);
285-
return simd8float32(_mm256_fmadd_ps(f8, one_255, half_one_255));
286-
}
287-
288-
#endif
289-
290-
#ifdef USE_NEON
291-
static FAISS_ALWAYS_INLINE simd8float32
292-
decode_8_components(const uint8_t* code, int i) {
293-
float32_t result[8] = {};
294-
for (size_t j = 0; j < 8; j++) {
295-
result[j] = decode_component(code, i + j);
296-
}
297-
float32x4_t res1 = vld1q_f32(result);
298-
float32x4_t res2 = vld1q_f32(result + 4);
299-
return simd8float32(float32x4x2_t({res1, res2}));
300-
}
301-
#endif
302112
};
303113

304114
} // namespace scalar_quantizer

0 commit comments

Comments (0)