
Commit b3bc6e2

mdouze authored and facebook-github-bot committed
Use simdlib abstraction in ScalarQuantizer implementation, split off training code, split quantizer code into headers, make headers more independent
Summary: Move the interface of the SIMD functions to the simdXfloat32 API so the code can be shared across instruction sets. Begin splitting up ScalarQuantizer.cpp; the split-off quantizer code is purely in header files for now. Differential Revision: D72945865
1 parent d2058fc commit b3bc6e2
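As context for the simdXfloat32 change: the decode kernels in this commit return the simdlib wrapper types (simd8float32, simd16float32) instead of raw __m256/__m512/float32x4x2_t registers, so the AVX2, AVX-512 and NEON variants can share one interface. Below is a minimal sketch of the pattern, using a hypothetical toy wrapper rather than faiss's actual simdlib API:

#include <cstdint>

#if defined(__AVX2__)
#include <immintrin.h>
// Toy stand-in for simd8float32: one wrapper type per register width.
struct toy_simd8float32 {
    __m256 v;
    explicit toy_simd8float32(__m256 x) : v(x) {}
};
#else
// Scalar fallback with the same shape, so callers compile unchanged.
struct toy_simd8float32 {
    float v[8];
};
#endif

// Decode kernels follow this pattern: the per-ISA intrinsics stay inside,
// and the shared wrapper type is all that callers ever see.
inline toy_simd8float32 toy_decode8_uniform(const uint8_t* code, int i) {
#if defined(__AVX2__)
    __m128i i8 = _mm_loadl_epi64((const __m128i*)(code + i));
    __m256i i32 = _mm256_cvtepu8_epi32(i8);
    __m256 f8 = _mm256_cvtepi32_ps(i32);
    // same affine map as Codec8bit::decode_component: (q + 0.5) / 255
    return toy_simd8float32(_mm256_fmadd_ps(
            f8, _mm256_set1_ps(1.f / 255.f), _mm256_set1_ps(0.5f / 255.f)));
#else
    toy_simd8float32 r;
    for (int j = 0; j < 8; j++) {
        r.v[j] = (code[i + j] + 0.5f) / 255.0f;
    }
    return r;
#endif
}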

File tree

8 files changed (+1959, -1853 lines)

faiss/impl/ScalarQuantizer.cpp

Lines changed: 114 additions & 1851 deletions
Large diffs are not rendered by default.

faiss/impl/ScalarQuantizer.h

Lines changed: 0 additions & 2 deletions
@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
-
 #pragma once
 
 #include <faiss/impl/AuxIndexStructures.h>
Lines changed: 305 additions & 0 deletions
@@ -0,0 +1,305 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <faiss/impl/ScalarQuantizer.h>

namespace faiss {

namespace scalar_quantizer {

/*******************************************************************
 * Codec: converts between values in [0, 1] and an index in a code
 * array. The "i" parameter is the vector component index (not byte
 * index).
 */

struct Codec8bit {
    static FAISS_ALWAYS_INLINE void encode_component(
            float x,
            uint8_t* code,
            int i) {
        code[i] = (int)(255 * x);
    }

    static FAISS_ALWAYS_INLINE float decode_component(
            const uint8_t* code,
            int i) {
        return (code[i] + 0.5f) / 255.0f;
    }

#if defined(__AVX512F__)
    static FAISS_ALWAYS_INLINE simd16float32
    decode_16_components(const uint8_t* code, int i) {
        const __m128i c16 = _mm_loadu_si128((__m128i*)(code + i));
        const __m512i i32 = _mm512_cvtepu8_epi32(c16);
        const __m512 f16 = _mm512_cvtepi32_ps(i32);
        const __m512 half_one_255 = _mm512_set1_ps(0.5f / 255.f);
        const __m512 one_255 = _mm512_set1_ps(1.f / 255.f);
        return simd16float32(_mm512_fmadd_ps(f16, one_255, half_one_255));
    }
#elif defined(__AVX2__)
    static FAISS_ALWAYS_INLINE simd8float32
    decode_8_components(const uint8_t* code, int i) {
        const uint64_t c8 = *(uint64_t*)(code + i);

        const __m128i i8 = _mm_set1_epi64x(c8);
        const __m256i i32 = _mm256_cvtepu8_epi32(i8);
        const __m256 f8 = _mm256_cvtepi32_ps(i32);
        const __m256 half_one_255 = _mm256_set1_ps(0.5f / 255.f);
        const __m256 one_255 = _mm256_set1_ps(1.f / 255.f);
        return simd8float32(_mm256_fmadd_ps(f8, one_255, half_one_255));
    }
#endif

#ifdef USE_NEON
    static FAISS_ALWAYS_INLINE simd8float32
    decode_8_components(const uint8_t* code, int i) {
        float32_t result[8] = {};
        for (size_t j = 0; j < 8; j++) {
            result[j] = decode_component(code, i + j);
        }
        float32x4_t res1 = vld1q_f32(result);
        float32x4_t res2 = vld1q_f32(result + 4);
        return simd8float32(float32x4x2_t{res1, res2});
    }
#endif
};

struct Codec4bit {
    static FAISS_ALWAYS_INLINE void encode_component(
            float x,
            uint8_t* code,
            int i) {
        code[i / 2] |= (int)(x * 15.0) << ((i & 1) << 2);
    }

    static FAISS_ALWAYS_INLINE float decode_component(
            const uint8_t* code,
            int i) {
        return (((code[i / 2] >> ((i & 1) << 2)) & 0xf) + 0.5f) / 15.0f;
    }

#if defined(__AVX512F__)
    static FAISS_ALWAYS_INLINE simd16float32
    decode_16_components(const uint8_t* code, int i) {
        uint64_t c8 = *(uint64_t*)(code + (i >> 1));
        uint64_t mask = 0x0f0f0f0f0f0f0f0f;
        uint64_t c8ev = c8 & mask;
        uint64_t c8od = (c8 >> 4) & mask;

        __m128i c16 =
                _mm_unpacklo_epi8(_mm_set1_epi64x(c8ev), _mm_set1_epi64x(c8od));
        __m256i c8lo = _mm256_cvtepu8_epi32(c16);
        __m256i c8hi = _mm256_cvtepu8_epi32(_mm_srli_si128(c16, 8));
        __m512i i16 = _mm512_castsi256_si512(c8lo);
        i16 = _mm512_inserti32x8(i16, c8hi, 1);
        __m512 f16 = _mm512_cvtepi32_ps(i16);
        const __m512 half_one_255 = _mm512_set1_ps(0.5f / 15.f);
        const __m512 one_255 = _mm512_set1_ps(1.f / 15.f);
        return simd16float32(_mm512_fmadd_ps(f16, one_255, half_one_255));
    }
#elif defined(__AVX2__)
    static FAISS_ALWAYS_INLINE simd8float32
    decode_8_components(const uint8_t* code, int i) {
        uint32_t c4 = *(uint32_t*)(code + (i >> 1));
        uint32_t mask = 0x0f0f0f0f;
        uint32_t c4ev = c4 & mask;
        uint32_t c4od = (c4 >> 4) & mask;

        // the 8 lower bytes of c8 contain the values
        __m128i c8 =
                _mm_unpacklo_epi8(_mm_set1_epi32(c4ev), _mm_set1_epi32(c4od));
        __m128i c4lo = _mm_cvtepu8_epi32(c8);
        __m128i c4hi = _mm_cvtepu8_epi32(_mm_srli_si128(c8, 4));
        __m256i i8 = _mm256_castsi128_si256(c4lo);
        i8 = _mm256_insertf128_si256(i8, c4hi, 1);
        __m256 f8 = _mm256_cvtepi32_ps(i8);
        __m256 half = _mm256_set1_ps(0.5f);
        f8 = _mm256_add_ps(f8, half);
        __m256 one_255 = _mm256_set1_ps(1.f / 15.f);
        return simd8float32(_mm256_mul_ps(f8, one_255));
    }
#endif

#ifdef USE_NEON
    static FAISS_ALWAYS_INLINE simd8float32
    decode_8_components(const uint8_t* code, int i) {
        float32_t result[8] = {};
        for (size_t j = 0; j < 8; j++) {
            result[j] = decode_component(code, i + j);
        }
        float32x4_t res1 = vld1q_f32(result);
        float32x4_t res2 = vld1q_f32(result + 4);
        return simd8float32({res1, res2});
    }
#endif
};

struct Codec6bit {
    static FAISS_ALWAYS_INLINE void encode_component(
            float x,
            uint8_t* code,
            int i) {
        int bits = (int)(x * 63.0);
        code += (i >> 2) * 3;
        switch (i & 3) {
            case 0:
                code[0] |= bits;
                break;
            case 1:
                code[0] |= bits << 6;
                code[1] |= bits >> 2;
                break;
            case 2:
                code[1] |= bits << 4;
                code[2] |= bits >> 4;
                break;
            case 3:
                code[2] |= bits << 2;
                break;
        }
    }

    static FAISS_ALWAYS_INLINE float decode_component(
            const uint8_t* code,
            int i) {
        uint8_t bits;
        code += (i >> 2) * 3;
        switch (i & 3) {
            case 0:
                bits = code[0] & 0x3f;
                break;
            case 1:
                bits = code[0] >> 6;
                bits |= (code[1] & 0xf) << 2;
                break;
            case 2:
                bits = code[1] >> 4;
                bits |= (code[2] & 3) << 4;
                break;
            case 3:
                bits = code[2] >> 2;
                break;
        }
        return (bits + 0.5f) / 63.0f;
    }

#if defined(__AVX512F__)

    static FAISS_ALWAYS_INLINE simd16float32
    decode_16_components(const uint8_t* code, int i) {
        // pure AVX512 implementation (not necessarily the fastest).
        // see:
        // https://github.com/zilliztech/knowhere/blob/main/thirdparty/faiss/faiss/impl/ScalarQuantizerCodec_avx512.h

        // clang-format off

        // 16 components, 16x6 bit = 12 bytes
        const __m128i bit_6v =
                _mm_maskz_loadu_epi8(0b0000111111111111, code + (i >> 2) * 3);
        const __m256i bit_6v_256 = _mm256_broadcast_i32x4(bit_6v);

        // 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
        // 00 01 02 03
        const __m256i shuffle_mask = _mm256_setr_epi16(
                0xFF00, 0x0100, 0x0201, 0xFF02,
                0xFF03, 0x0403, 0x0504, 0xFF05,
                0xFF06, 0x0706, 0x0807, 0xFF08,
                0xFF09, 0x0A09, 0x0B0A, 0xFF0B);
        const __m256i shuffled = _mm256_shuffle_epi8(bit_6v_256, shuffle_mask);

        // 0: xxxxxxxx xx543210
        // 1: xxxx5432 10xxxxxx
        // 2: xxxxxx54 3210xxxx
        // 3: xxxxxxxx 543210xx
        const __m256i shift_right_v = _mm256_setr_epi16(
                0x0U, 0x6U, 0x4U, 0x2U,
                0x0U, 0x6U, 0x4U, 0x2U,
                0x0U, 0x6U, 0x4U, 0x2U,
                0x0U, 0x6U, 0x4U, 0x2U);
        __m256i shuffled_shifted = _mm256_srlv_epi16(shuffled, shift_right_v);

        // remove unneeded bits
        shuffled_shifted =
                _mm256_and_si256(shuffled_shifted, _mm256_set1_epi16(0x003F));

        // scale
        const __m512 f8 =
                _mm512_cvtepi32_ps(_mm512_cvtepi16_epi32(shuffled_shifted));
        const __m512 half_one_255 = _mm512_set1_ps(0.5f / 63.f);
        const __m512 one_255 = _mm512_set1_ps(1.f / 63.f);
        return simd16float32(_mm512_fmadd_ps(f8, one_255, half_one_255));

        // clang-format on
    }

#elif defined(__AVX2__)

    /* Load 6 bytes that represent 8 6-bit values, return them as an
     * 8*32 bit vector register */
    static FAISS_ALWAYS_INLINE __m256i load6(const uint16_t* code16) {
        const __m128i perm = _mm_set_epi8(
                -1, 5, 5, 4, 4, 3, -1, 3, -1, 2, 2, 1, 1, 0, -1, 0);
        const __m256i shifts = _mm256_set_epi32(2, 4, 6, 0, 2, 4, 6, 0);

        // load 6 bytes
        __m128i c1 =
                _mm_set_epi16(0, 0, 0, 0, 0, code16[2], code16[1], code16[0]);

        // put in 8 * 32 bits
        __m128i c2 = _mm_shuffle_epi8(c1, perm);
        __m256i c3 = _mm256_cvtepi16_epi32(c2);

        // shift and mask out useless bits
        __m256i c4 = _mm256_srlv_epi32(c3, shifts);
        __m256i c5 = _mm256_and_si256(_mm256_set1_epi32(63), c4);
        return c5;
    }

    static FAISS_ALWAYS_INLINE simd8float32
    decode_8_components(const uint8_t* code, int i) {
        // // Faster code for Intel CPUs or AMD Zen3+, just keeping it here
        // // for the reference, maybe it becomes used one day.
        // const uint16_t* data16 = (const uint16_t*)(code + (i >> 2) * 3);
        // const uint32_t* data32 = (const uint32_t*)data16;
        // const uint64_t val = *data32 + ((uint64_t)data16[2] << 32);
        // const uint64_t vext = _pdep_u64(val, 0x3F3F3F3F3F3F3F3FULL);
        // const __m128i i8 = _mm_set1_epi64x(vext);
        // const __m256i i32 = _mm256_cvtepi8_epi32(i8);
        // const __m256 f8 = _mm256_cvtepi32_ps(i32);
        // const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f);
        // const __m256 one_255 = _mm256_set1_ps(1.f / 63.f);
        // return _mm256_fmadd_ps(f8, one_255, half_one_255);

        __m256i i8 = load6((const uint16_t*)(code + (i >> 2) * 3));
        __m256 f8 = _mm256_cvtepi32_ps(i8);
        // this could also be done with bit manipulations but it is
        // not obviously faster
        const __m256 half_one_255 = _mm256_set1_ps(0.5f / 63.f);
        const __m256 one_255 = _mm256_set1_ps(1.f / 63.f);
        return simd8float32(_mm256_fmadd_ps(f8, one_255, half_one_255));
    }

#endif

#ifdef USE_NEON
    static FAISS_ALWAYS_INLINE simd8float32
    decode_8_components(const uint8_t* code, int i) {
        float32_t result[8] = {};
        for (size_t j = 0; j < 8; j++) {
            result[j] = decode_component(code, i + j);
        }
        float32x4_t res1 = vld1q_f32(result);
        float32x4_t res2 = vld1q_f32(result + 4);
        return simd8float32(float32x4x2_t({res1, res2}));
    }
#endif
};

} // namespace scalar_quantizer
} // namespace faiss
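As a usage note for the codecs above (a sketch; it assumes the new header, whose path is not shown on this page, is included so that FAISS_ALWAYS_INLINE and the codec structs are in scope): each codec maps a component x in [0, 1] to a b-bit integer q = floor(x * (2^b - 1)) and decodes back to the bin center (q + 0.5) / (2^b - 1), so the round-trip error is at most half a bin width.

#include <cstdint>
#include <cstdio>

int main() {
    using faiss::scalar_quantizer::Codec8bit;
    using faiss::scalar_quantizer::Codec6bit;

    // encode_component uses |=, so the code arrays must start zeroed
    uint8_t code8[4] = {0, 0, 0, 0};
    uint8_t code6[3] = {0, 0, 0}; // 4 components x 6 bits = 3 bytes

    float x = 0.7f;
    for (int i = 0; i < 4; i++) {
        Codec8bit::encode_component(x, code8, i);
        Codec6bit::encode_component(x, code6, i);
    }

    // 8 bit: q = 178, decodes to (178 + 0.5) / 255  = 0.7000
    // 6 bit: q = 44,  decodes to (44 + 0.5) / 63   ~= 0.7063
    std::printf("8 bit: %f\n", Codec8bit::decode_component(code8, 0));
    std::printf("6 bit: %f\n", Codec6bit::decode_component(code6, 0));
    return 0;
}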

0 commit comments