diff --git a/faiss/gpu/impl/PQCodeDistances-inl.cuh b/faiss/gpu/impl/PQCodeDistances-inl.cuh index e2e66f30c7..087f390bf7 100644 --- a/faiss/gpu/impl/PQCodeDistances-inl.cuh +++ b/faiss/gpu/impl/PQCodeDistances-inl.cuh @@ -20,11 +20,9 @@ namespace faiss { namespace gpu { -#if defined(USE_AMD_ROCM) && __AMDGCN_WAVEFRONT_SIZE == 64u -#define LAUNCH_BOUND 320 -#else -#define LAUNCH_BOUND 288 -#endif +__device__ constexpr inline int getLaunchBound() { + return kWarpSize == 32 ? 288 : 320; +} // Kernel responsible for calculating distance from residual vector to // each product quantizer code centroid @@ -33,7 +31,7 @@ template < typename CentroidT, int DimsPerSubQuantizer, bool L2Distance> -__global__ void __launch_bounds__(LAUNCH_BOUND, 3) pqCodeDistances( +__global__ void __launch_bounds__(getLaunchBound(), 3) pqCodeDistances( Tensor queries, int queriesPerBlock, Tensor coarseCentroids, diff --git a/faiss/gpu/utils/DeviceDefs.cuh b/faiss/gpu/utils/DeviceDefs.cuh index dc4c469a1b..ddca0b5312 100644 --- a/faiss/gpu/utils/DeviceDefs.cuh +++ b/faiss/gpu/utils/DeviceDefs.cuh @@ -8,17 +8,28 @@ #pragma once #include +#ifdef USE_AMD_ROCM +#include // ROCm version macros +#if ROCM_VERSION_MAJOR >= 7 +#include // rocprim::arch::wavefront +#endif +#endif namespace faiss { namespace gpu { #ifdef USE_AMD_ROCM +#if ROCM_VERSION_MAJOR < 7 #if __AMDGCN_WAVEFRONT_SIZE == 32u constexpr int kWarpSize = 32; #else constexpr int kWarpSize = 64; #endif +#else +// ROCm 7.0 and above +constexpr __device__ int kWarpSize = rocprim::arch::wavefront::max_size(); +#endif // This is a memory barrier for intra-warp writes to shared memory. __forceinline__ __device__ void warpFence() {