diff --git a/CMakeLists.txt b/CMakeLists.txt index 4a689e51c..c902e1efa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,9 +30,15 @@ if(BUILD_UCM_MINDIE) add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${UCM_CXX11_ABI}) endif() -set(FLAGS_PUBLIC "-Wall -Werror -fPIC -Wl,-z,relro,-z,now") -set(FLAGS_DEBUG "-O0 -g") -set(FLAGS_RELEASE "-s -O2 -fstack-protector-strong -D_FORTIFY_SOURCE=2") +if(WIN32) + set(FLAGS_PUBLIC "/W3") + set(FLAGS_DEBUG "/Od /Zi") + set(FLAGS_RELEASE "/O2") +else() + set(FLAGS_PUBLIC "-Wall -Werror -fPIC -Wl,-z,relro,-z,now") + set(FLAGS_DEBUG "-O0 -g") + set(FLAGS_RELEASE "-s -O2 -fstack-protector-strong -D_FORTIFY_SOURCE=2") +endif() string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LOWER) if(CMAKE_BUILD_TYPE_LOWER STREQUAL "debug") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS_PUBLIC} ${FLAGS_DEBUG}") diff --git a/docs/source/getting-started/quickstart_sglang.md b/docs/source/getting-started/quickstart_sglang.md index cab1a57b3..8b50430e5 100644 --- a/docs/source/getting-started/quickstart_sglang.md +++ b/docs/source/getting-started/quickstart_sglang.md @@ -85,6 +85,8 @@ docker build --build-arg INSTALL_MODE=package \ pip install -v -e . --no-build-isolation ``` + > **Note:** On AMD GPUs set `export PLATFORM=rocm` instead of `cuda` to select the ROCm device backend (the KV-transfer and Hamming-distance kernels built with HIP). It requires a ROCm installation and a ROCm build of PyTorch; pass `-DCMAKE_HIP_ARCHITECTURES=` (e.g. `gfx90a`, `gfx1100`) if your GPU is not auto-detected. Validated on gfx90a and gfx1100. + ### Option 3: Install by pip 1. Prepare SGLang Environment diff --git a/docs/source/getting-started/quickstart_vllm.md b/docs/source/getting-started/quickstart_vllm.md index 719b3dea2..c0b3f2894 100644 --- a/docs/source/getting-started/quickstart_vllm.md +++ b/docs/source/getting-started/quickstart_vllm.md @@ -93,6 +93,8 @@ docker build --build-arg INSTALL_MODE=package \ pip install -v -e . --no-build-isolation ``` + > **Note:** On AMD GPUs set `export PLATFORM=rocm` instead of `cuda`. This selects the ROCm device backend, which builds the same KV-transfer and Hamming-distance kernels with HIP. It requires a ROCm installation and a ROCm build of PyTorch in the environment; pass `-DCMAKE_HIP_ARCHITECTURES=` (e.g. `gfx90a`, `gfx1100`) if CMake does not detect your GPU. Validated on gfx90a and gfx1100. + 3. Apply vLLM Integration Patches (Not required for versions > 0.11.0) To integrate UCM with vLLM 0.11.0, you can choose between a dynamic **monkey patch** (recommended) and a manual **git patch**. diff --git a/docs/source/user-guide/support-matrix/support_matrix.md b/docs/source/user-guide/support-matrix/support_matrix.md index f3f8ff9ee..1cc08f36b 100644 --- a/docs/source/user-guide/support-matrix/support_matrix.md +++ b/docs/source/user-guide/support-matrix/support_matrix.md @@ -58,6 +58,7 @@ This section presents the currently supported compute platforms and devices. |:----------------:|:------:|:------:| | CANN | Ascend | 910C, 910B | | CUDA | NVIDIA | H100, H20, L40, L20 | +| ROCm | AMD | MI250X (gfx90a), Radeon Pro W7800 (gfx1100), Radeon PRO V710 (gfx1101), Radeon RX 9070 XT (gfx1201), Radeon 8060S (gfx1151) | | MUSA | Mthreads | S5000 | | MACA | MetaX | C500 | diff --git a/setup.py b/setup.py index 478adfc61..817ba38e8 100644 --- a/setup.py +++ b/setup.py @@ -68,7 +68,7 @@ def print_platform_warning(): {RED}{'=' * 80} {BOLD}⚠️ WARNING: PLATFORM environment variable is not set! ⚠️{RESET} {RED}{'=' * 80}{RESET} -{YELLOW}Please set PLATFORM to one of: cuda, ascend, ascend-a3, musa, maca{RESET} +{YELLOW}Please set PLATFORM to one of: cuda, rocm, ascend, ascend-a3, musa, maca{RESET} Example: {BOLD}export PLATFORM=cuda{RESET} # For CUDA platform {YELLOW}In CI scenarios only, you don't need to specify PLATFORM. If it's not a CI scenario, please uninstall and then reinstall with PLATFORM specified.{RESET} @@ -174,6 +174,8 @@ def build_cmake(self, ext: CMakeExtension): match PLATFORM: case "cuda": cmake_args += ["-DRUNTIME_ENVIRONMENT=cuda"] + case "rocm": + cmake_args += ["-DRUNTIME_ENVIRONMENT=rocm"] case "ascend" | "ascend-a3": cmake_args += ["-DRUNTIME_ENVIRONMENT=ascend"] case "musa": diff --git a/ucm/shared/infra/CMakeLists.txt b/ucm/shared/infra/CMakeLists.txt index 80a3bf065..546f96f74 100644 --- a/ucm/shared/infra/CMakeLists.txt +++ b/ucm/shared/infra/CMakeLists.txt @@ -5,14 +5,11 @@ target_link_libraries(infra_status PUBLIC fmt) add_subdirectory(logger) -file(GLOB_RECURSE UCMINFRA_TEMPLATE_SOURCE_FILES "template/*.*") -add_library(infra_template OBJECT ${UCMINFRA_TEMPLATE_SOURCE_FILES}) -target_include_directories(infra_template PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +add_library(infra_template INTERFACE) +target_include_directories(infra_template INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) -file(GLOB_RECURSE UCMINFRA_THREAD_SOURCE_FILES "thread/*.*") -add_library(infra_thread OBJECT ${UCMINFRA_THREAD_SOURCE_FILES}) -target_include_directories(infra_thread PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +add_library(infra_thread INTERFACE) +target_include_directories(infra_thread INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) -file(GLOB_RECURSE UCMINFRA_TIME_SOURCE_FILES "time/*.*") -add_library(infra_time OBJECT ${UCMINFRA_TIME_SOURCE_FILES}) -target_include_directories(infra_time PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +add_library(infra_time INTERFACE) +target_include_directories(infra_time INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/ucm/shared/infra/logger/cc/spdlog_logger.cc b/ucm/shared/infra/logger/cc/spdlog_logger.cc index acc0cb71a..90b22829c 100644 --- a/ucm/shared/infra/logger/cc/spdlog_logger.cc +++ b/ucm/shared/infra/logger/cc/spdlog_logger.cc @@ -32,6 +32,12 @@ #include #include "compress_rotate_file_sink.h" #include "logger.h" +#ifdef _WIN32 +#include +#define getpid _getpid +#else +#include +#endif namespace UC::Logger { constexpr uint32_t kRateLimitCountBits = 2; constexpr uint64_t kRateLimitCountMask = (1u << kRateLimitCountBits) - 1u; diff --git a/ucm/shared/infra/logger/logger.cc b/ucm/shared/infra/logger/logger.cc index c96ff531a..d51360273 100644 --- a/ucm/shared/infra/logger/logger.cc +++ b/ucm/shared/infra/logger/logger.cc @@ -24,7 +24,9 @@ #include "logger.h" #include +#ifndef _WIN32 #include +#endif namespace UC::Logger { void Log(Level lv, std::string file, std::string func, int line, std::string msg) diff --git a/ucm/shared/metrics/CMakeLists.txt b/ucm/shared/metrics/CMakeLists.txt index 906959720..25973fcbe 100644 --- a/ucm/shared/metrics/CMakeLists.txt +++ b/ucm/shared/metrics/CMakeLists.txt @@ -1,5 +1,8 @@ file(GLOB_RECURSE UCMMETRICS_CC_SOURCE_FILES "./cc/*.cc") add_library(metrics SHARED ${UCMMETRICS_CC_SOURCE_FILES}) +if(WIN32) + set_target_properties(metrics PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) +endif() target_include_directories(metrics PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/cc/api ${CMAKE_CURRENT_SOURCE_DIR}/cc/domain diff --git a/ucm/shared/test/CMakeLists.txt b/ucm/shared/test/CMakeLists.txt index e608fe173..08d412fdc 100644 --- a/ucm/shared/test/CMakeLists.txt +++ b/ucm/shared/test/CMakeLists.txt @@ -1,6 +1,9 @@ if(BUILD_UNIT_TESTS) include(GoogleTest) file(GLOB_RECURSE UCMSHARED_TEST_SOURCE_FILES "./case/*.cc") + if(WIN32) + list(FILTER UCMSHARED_TEST_SOURCE_FILES EXCLUDE REGEX "thread_pool_test\\.cc$") + endif() add_executable(ucmshared.test ${UCMSHARED_TEST_SOURCE_FILES}) target_include_directories(ucmshared.test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/case) target_link_libraries(ucmshared.test PRIVATE diff --git a/ucm/shared/test/case/metrics/metrics_test.cc b/ucm/shared/test/case/metrics/metrics_test.cc index fa975ef74..1c2107b52 100644 --- a/ucm/shared/test/case/metrics/metrics_test.cc +++ b/ucm/shared/test/case/metrics/metrics_test.cc @@ -27,7 +27,9 @@ #include #include #include +#ifndef _WIN32 #include +#endif #include #include "metrics_api.h" diff --git a/ucm/shared/trans/CMakeLists.txt b/ucm/shared/trans/CMakeLists.txt index 6dfe24f33..c91c375c4 100644 --- a/ucm/shared/trans/CMakeLists.txt +++ b/ucm/shared/trans/CMakeLists.txt @@ -7,6 +7,9 @@ endif() if(RUNTIME_ENVIRONMENT STREQUAL "cuda") add_subdirectory(cuda) endif() +if(RUNTIME_ENVIRONMENT STREQUAL "rocm") + add_subdirectory(rocm) +endif() if(RUNTIME_ENVIRONMENT STREQUAL "simu") add_subdirectory(simu) endif() diff --git a/ucm/shared/trans/cuda/cuda_sm_kernel.cu b/ucm/shared/trans/cuda/cuda_sm_kernel.cu index 595092525..f633b9d2c 100644 --- a/ucm/shared/trans/cuda/cuda_sm_kernel.cu +++ b/ucm/shared/trans/cuda/cuda_sm_kernel.cu @@ -34,6 +34,7 @@ namespace UC::Trans { inline __device__ void CudaCopyUnit(const uint8_t* __restrict__ src, volatile uint8_t* __restrict__ dst) { +#if defined(__CUDA_ARCH__) uint4 lo, hi; asm volatile("ld.global.cs.v4.b32 {%0,%1,%2,%3}, [%4];" : "=r"(lo.x), "=r"(lo.y), "=r"(lo.z), "=r"(lo.w) @@ -47,6 +48,20 @@ inline __device__ void CudaCopyUnit(const uint8_t* __restrict__ src, asm volatile("st.volatile.global.v4.b32 [%0+16], {%1,%2,%3,%4};" : : "l"(dst), "r"(hi.x), "r"(hi.y), "r"(hi.z), "r"(hi.w)); +#else + // ROCm has no ld.global.cs/st.volatile.global PTX or __ldcs/__stcg builtins; + // this is a plain vectorized 32-byte copy. Dropping `volatile` is correct on + // AMD: host visibility comes from the fine-grained-coherent host registration + // plus the per-transfer hipStreamSynchronize (GPU caches flush at kernel + // completion), not from the qualifier. On AMD `volatile` only forces an L1 + // bypass (glc, GPU-L2 scope), which is neither necessary nor sufficient for + // host visibility; system-scope ordering, if ever needed, is + // __threadfence_system. + const uint4* src4 = reinterpret_cast(src); + uint4* dst4 = reinterpret_cast(const_cast(dst)); + dst4[0] = src4[0]; + dst4[1] = src4[1]; +#endif } __global__ void CudaCopyKernel(const void** src, void** dst, size_t size, size_t num) diff --git a/ucm/shared/trans/rocm/CMakeLists.txt b/ucm/shared/trans/rocm/CMakeLists.txt new file mode 100644 index 000000000..36a934c76 --- /dev/null +++ b/ucm/shared/trans/rocm/CMakeLists.txt @@ -0,0 +1,30 @@ +# enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise auto-detects the +# host GPU(s) via rocm_agent_enumerator and errors if none is found (a no-GPU +# build host must then set the arch explicitly). +enable_language(HIP) +find_package(hip REQUIRED) + +set(HIP_COMPAT_DIR ${CMAKE_CURRENT_LIST_DIR}/../../vendor/hip_compat) + +set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_kernel.cu + PROPERTIES LANGUAGE HIP) +add_library(kernel OBJECT ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_kernel.cu) +target_include_directories(kernel PRIVATE ${HIP_COMPAT_DIR} ${CMAKE_CURRENT_LIST_DIR}/../cuda) +if(WIN32) + target_compile_options(kernel PRIVATE -Wall) +else() + target_compile_options(kernel PRIVATE -Wall -fPIC) +endif() + +add_library(trans STATIC + ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_device.cc + ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_buffer.cc + ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_stream.cc + ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_stream.cc +) +target_include_directories(trans PUBLIC ${HIP_COMPAT_DIR} ${CMAKE_CURRENT_LIST_DIR}/../cuda) +target_link_libraries(trans PUBLIC + fmt + hip::host + kernel +) diff --git a/ucm/shared/vendor/hip_compat/cuda.h b/ucm/shared/vendor/hip_compat/cuda.h new file mode 100644 index 000000000..59b12e3e5 --- /dev/null +++ b/ucm/shared/vendor/hip_compat/cuda.h @@ -0,0 +1,40 @@ +/** + * MIT License + * + * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Author: Jeff Daily + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * */ + +/* + * ROCm/HIP compatibility shim for the CUDA driver-API header. The sparse + * Hamming-distance extension includes only to pull in the runtime + * declarations it shares with ; it uses no driver-API entry + * points. On a ROCm build we map it onto the runtime shim so the include + * resolves without the (absent) NVIDIA driver header. + */ +#ifndef UNIFIEDCACHE_HIP_COMPAT_CUDA_H +#define UNIFIEDCACHE_HIP_COMPAT_CUDA_H + +#include "cuda_runtime.h" + +#endif diff --git a/ucm/shared/vendor/hip_compat/cuda_runtime.h b/ucm/shared/vendor/hip_compat/cuda_runtime.h new file mode 100644 index 000000000..a84ff95b1 --- /dev/null +++ b/ucm/shared/vendor/hip_compat/cuda_runtime.h @@ -0,0 +1,73 @@ +/** + * MIT License + * + * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. + * + * Author: Jeff Daily + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * */ + +/* + * ROCm/HIP compatibility shim. On a ROCm build the per-backend CMake puts this + * directory ahead of the toolchain includes, so every existing + * `#include ` resolves here instead of the (absent) NVIDIA + * header. We pull in the HIP runtime and alias the small set of cuda* runtime + * symbols the KV-transfer backend uses to their hip* equivalents, so the + * device-backend sources compile unchanged. The NVIDIA path never sees this + * file (its include dir points at the real CUDA toolkit). + */ +#ifndef UNIFIEDCACHE_HIP_COMPAT_CUDA_RUNTIME_H +#define UNIFIEDCACHE_HIP_COMPAT_CUDA_RUNTIME_H + +#include +#include + +#include + +using cudaError_t = hipError_t; +using cudaStream_t = hipStream_t; +using cudaEvent_t = hipEvent_t; + +static constexpr hipError_t cudaSuccess = hipSuccess; +static constexpr hipMemcpyKind cudaMemcpyHostToDevice = hipMemcpyHostToDevice; +static constexpr hipMemcpyKind cudaMemcpyDeviceToHost = hipMemcpyDeviceToHost; +static constexpr unsigned int cudaStreamNonBlocking = hipStreamNonBlocking; +static constexpr unsigned int cudaHostRegisterDefault = hipHostRegisterDefault; + +#define cudaMalloc hipMalloc +#define cudaFree hipFree +#define cudaMallocHost hipHostMalloc +#define cudaFreeHost hipHostFree +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaSetDevice hipSetDevice +#define cudaStreamCreate hipStreamCreate +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamAddCallback hipStreamAddCallback +#define cudaStreamWaitEvent hipStreamWaitEvent +#define cudaGetLastError hipGetLastError +#define cudaGetErrorString hipGetErrorString +#define cudaHostRegister hipHostRegister +#define cudaHostUnregister hipHostUnregister +#define cudaHostGetDevicePointer hipHostGetDevicePointer + +#endif diff --git a/ucm/sparse/gsa_on_device/CMakeLists.txt b/ucm/sparse/gsa_on_device/CMakeLists.txt index d0d1f218e..7ade00967 100644 --- a/ucm/sparse/gsa_on_device/CMakeLists.txt +++ b/ucm/sparse/gsa_on_device/CMakeLists.txt @@ -52,4 +52,8 @@ if(RUNTIME_ENVIRONMENT STREQUAL "cuda") message(STATUS "Building GSAOnDevice for CUDA...") add_subdirectory(csrc/cuda/hash_retrieval) add_subdirectory(csrc/cuda/ham_dist) +elseif(RUNTIME_ENVIRONMENT STREQUAL "rocm") + message(STATUS "Building GSAOnDevice for ROCm...") + add_subdirectory(csrc/cuda/hash_retrieval) + add_subdirectory(csrc/rocm/ham_dist) endif() diff --git a/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/operator.h b/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/operator.h index ef3fce15d..783d0b9fe 100644 --- a/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/operator.h +++ b/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/operator.h @@ -1,7 +1,14 @@ #pragma once +#ifdef USE_ROCM +// torch keeps the cuda spelling for its public symbols on ROCm; the hipified +// context header provides c10::cuda::getCurrentCUDAStream backed by HIP, while +// the cuda-spelled header pulls in NVIDIA-only cuda_runtime_api.h/cusparse.h. +#include +#else #include #include +#endif #include #include diff --git a/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/paged_ham_dist_mla.cu b/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/paged_ham_dist_mla.cu index eeef537b9..a882518f4 100644 --- a/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/paged_ham_dist_mla.cu +++ b/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/paged_ham_dist_mla.cu @@ -1,4 +1,6 @@ +#ifndef USE_ROCM #include +#endif #include #include diff --git a/ucm/sparse/gsa_on_device/csrc/rocm/ham_dist/CMakeLists.txt b/ucm/sparse/gsa_on_device/csrc/rocm/ham_dist/CMakeLists.txt new file mode 100644 index 000000000..182a45bd3 --- /dev/null +++ b/ucm/sparse/gsa_on_device/csrc/rocm/ham_dist/CMakeLists.txt @@ -0,0 +1,115 @@ +message(STATUS "Building ham_dist (hamming HIP/ROCm extension)...") + +# The project-global release flags strip all symbols (-s), which would remove +# the CPython module init entry point from this extension and make it +# unloadable. Drop -s for this subtree so the .so keeps PyInit_hamming. +string(REPLACE "-s " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +string(REGEX REPLACE "(^| )-s$" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + +# enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise auto-detects the +# host GPU(s) via rocm_agent_enumerator and errors if none is found (a no-GPU +# build host must then set the arch explicitly). +enable_language(HIP) +find_package(hip REQUIRED) + +set(CUDA_SRC_DIR ${CMAKE_CURRENT_LIST_DIR}/../../cuda/ham_dist) +set(HIP_COMPAT_DIR ${UCM_ROOT_DIR}/ucm/shared/vendor/hip_compat) + +# ---- Python ---- +find_package(Python COMPONENTS Interpreter Development.Module REQUIRED) + +execute_process( + COMMAND ${Python_EXECUTABLE} -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX') or '')" + OUTPUT_VARIABLE PY_EXT_SUFFIX + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +execute_process( + COMMAND ${Python_EXECUTABLE} -c "import torch, os; print(os.path.dirname(os.path.abspath(torch.__file__)))" + OUTPUT_VARIABLE PYTORCH_PATH + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE PYTORCH_RESULT +) +if(NOT PYTORCH_RESULT EQUAL 0) + message(FATAL_ERROR "Failed to find PyTorch installation via Python") +endif() +message(STATUS "Found PyTorch at: ${PYTORCH_PATH}") + +set(CXX11_ABI "1") +execute_process( + COMMAND ${Python_EXECUTABLE} -c "import torch; print(int(getattr(torch._C, '_GLIBCXX_USE_CXX11_ABI', 1)))" + OUTPUT_VARIABLE TORCH_ABI + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE TORCH_ABI_RESULT +) +if(TORCH_ABI_RESULT EQUAL 0) + set(CXX11_ABI "${TORCH_ABI}") +endif() + +set(INCLUDE_DIRS + ${HIP_COMPAT_DIR} + ${CUDA_SRC_DIR} + ${PYTORCH_PATH}/include + ${PYTORCH_PATH}/include/torch/csrc/api/include +) + +set(LIBRARY_DIRS + ${PYTORCH_PATH}/lib +) + +set(LIBRARIES + torch + c10 + c10_hip + torch_cpu + torch_hip + torch_python + pthread + hip::host +) + +set_source_files_properties(${CUDA_SRC_DIR}/paged_ham_dist_mla.cu PROPERTIES LANGUAGE HIP) +# NO_EXTRAS disables pybind11's default LTO + strip, which under +# -fvisibility=hidden would drop the CPython module init symbol and make the +# extension unloadable. +pybind11_add_module(hamming NO_EXTRAS + ${CUDA_SRC_DIR}/cpy/hamming.cpp + ${CUDA_SRC_DIR}/paged_ham_dist_mla.cu +) + +set_target_properties(hamming PROPERTIES + PREFIX "" + SUFFIX "${PY_EXT_SUFFIX}" + POSITION_INDEPENDENT_CODE ON + CXX_STANDARD 20 + HIP_STANDARD 20 +) + +target_include_directories(hamming PRIVATE ${INCLUDE_DIRS}) +target_link_directories(hamming PRIVATE ${LIBRARY_DIRS}) +target_link_libraries(hamming PRIVATE ${LIBRARIES} Python::Module) + +# torch on ROCm keeps the cuda spelling and selects the HIP backend via these +# macros; the half-operator unmacros are nvcc-only and unneeded under hipcc. +target_compile_definitions(hamming PRIVATE + TORCH_EXTENSION_NAME=hamming + __HIP_PLATFORM_AMD__ + USE_ROCM + _GLIBCXX_USE_CXX11_ABI=${CXX11_ABI} +) + +target_compile_options(hamming PRIVATE + $<$:-O3> + $<$:-O3> + # The kernel uses INFINITY as a masking sentinel, so do not enable clang + # -ffast-math (it implies -ffinite-math-only and would discard the inf). + # torch 2.x headers use C++20 constructs; do not let the project-global + # -Werror promote warnings from third-party headers to errors here. + -Wno-error +) + +file(RELATIVE_PATH INSTALL_REL_PATH + ${UCM_ROOT_DIR} + ${CUDA_SRC_DIR} +) +install(TARGETS hamming LIBRARY DESTINATION ${INSTALL_REL_PATH} COMPONENT ucm) diff --git a/ucm/sparse/test/gsa/test_hamming_rocm_ref.py b/ucm/sparse/test/gsa/test_hamming_rocm_ref.py new file mode 100644 index 000000000..526993954 --- /dev/null +++ b/ucm/sparse/test/gsa/test_hamming_rocm_ref.py @@ -0,0 +1,189 @@ +"""Cross-checked pass/fail test for the Hamming-distance scoring kernel. + +The original test_cuda_hamming_{mla,gqa}.py drivers only print/benchmark the +kernel output. This test instead computes an independent CPU reference for the +paged (block-mode) Hamming score and asserts the GPU kernel matches it, giving +a deterministic correctness gate that is valid on both NVIDIA and ROCm. Run it +with one GPU visible, e.g. HIP_VISIBLE_DEVICES=0 / CUDA_VISIBLE_DEVICES=0. +""" + +import os +import sys + +import torch + +try: + from ucm.sparse.gsa_on_device.csrc.cuda.ham_dist import hamming +except ModuleNotFoundError: + # Allow running against the freshly built extension without an editable + # install: point HAMMING_DIR at the directory holding hamming*.so. + sys.path.insert(0, os.environ.get("HAMMING_DIR", os.getcwd())) + import hamming + + +def num_chunk_for(hd: int) -> int: + return hd // 32 + + +def popcount32(x: torch.Tensor) -> torch.Tensor: + x = x.to(torch.int64) & 0xFFFFFFFF + count = torch.zeros_like(x) + for i in range(32): + count += (x >> i) & 1 + return count + + +def reference_block_score( + key_i32, query_i32, block_table, seqlen, max_seqlen, sink, recent, reduce_kvhead +): + # key: (num_blocks, block_size, num_kv_head, num_chunk) int32 + # query: (b, 1, num_head, num_chunk) int32 + # + # Exact integer reference (popcount of key XOR query, summed over the GQA + # group and over chunks). The kernel stores the score in fp16, so the + # comparison below allows for fp16 rounding of these large integer sums. + num_blocks, block_size, num_kv_head, num_chunk = key_i32.shape + b = query_i32.shape[0] + num_head = query_i32.shape[2] + kv_group = num_head // num_kv_head + + key = key_i32.cpu() + query = query_i32.cpu() + bt = block_table.cpu() + sl = seqlen.cpu() + + if reduce_kvhead: + out = torch.full((b, 1, max_seqlen), float("inf"), dtype=torch.float32) + else: + out = torch.full((b, num_kv_head, max_seqlen), float("inf"), dtype=torch.float32) + + for bi in range(b): + actual = int(sl[bi].item()) + for pos in range(max_seqlen): + is_inf = pos >= actual + is_sink_or_recent = (pos < sink) or ( + (pos >= actual - recent) and (pos < actual) + ) + if is_inf: + continue + block_slot = pos // block_size + offset = pos % block_size + phys = int(bt[bi, block_slot].item()) + # The kernel reads the key block as (num_kv_head, block_size, + # num_chunk): base = phys*(num_kv_head*block_size*num_chunk), + # element (kv*block_size + offset)*num_chunk + chunk. Index a flat + # view of the key tensor the same way so the reference matches the + # kernel's layout regardless of the host tensor's nominal shape. + block = key[phys].reshape(-1) + per_kv = [] + for kv in range(num_kv_head): + base = (kv * block_size + offset) * num_chunk + k_chunks = block[base : base + num_chunk] + s = 0 + for g in range(kv_group): + head = kv * kv_group + g + q_chunks = query[bi, 0, head, :] + s += int(popcount32(k_chunks ^ q_chunks).sum().item()) + per_kv.append(s) + if reduce_kvhead: + val = 0.0 if is_sink_or_recent else float(min(per_kv)) + out[bi, 0, pos] = val + else: + for kv in range(num_kv_head): + val = 0.0 if is_sink_or_recent else float(per_kv[kv]) + out[bi, kv, pos] = val + return out + + +def build_inputs(b, h, hk, hd, block_size, seqlen_list, seed=42): + torch.manual_seed(seed) + max_seqlen = max(seqlen_list) + seqlen = torch.tensor(seqlen_list, dtype=torch.int32).cuda() + num_blocks_per_seq = (seqlen + block_size - 1) // block_size + num_blocks = int(num_blocks_per_seq.sum().item()) + 1 + max_num_block_per_seq = (max_seqlen + block_size - 1) // block_size + max_seqlen = int(max_num_block_per_seq * block_size) + + block_table = torch.zeros((b, max_num_block_per_seq), dtype=torch.int32) + start = 1 + for i, n in enumerate(num_blocks_per_seq): + block_table[i, :n] = torch.arange(start, start + n, dtype=torch.int32) + start += int(n) + block_table = block_table.cuda() + + key = torch.randn(num_blocks, block_size, hk, hd // 32).to(torch.float32) + query = torch.randn(b, 1, h, hd // 32).to(torch.float32) + key = key.view(torch.int32).cuda() + query = query.view(torch.int32).cuda() + return key, query, block_table, seqlen, max_seqlen + + +def run_case(name, b, h, hk, hd, block_size, seqlen_list, sink, recent, reduce_kvhead): + key, query, block_table, seqlen, max_seqlen = build_inputs( + b, h, hk, hd, block_size, seqlen_list + ) + out = hamming.hamming_score( + key, query, block_table, seqlen, max_seqlen, sink, recent, reduce_kvhead + ) + out_cpu = out.detach().float().cpu() + ref = reference_block_score( + key, query, block_table, seqlen, max_seqlen, sink, recent, reduce_kvhead + ) + + finite = torch.isfinite(ref) + mismatch_inf = torch.isinf(ref) ^ torch.isinf(out_cpu) + if mismatch_inf.any(): + print(f"[{name}] FAIL: inf mask mismatch at {int(mismatch_inf.sum())} positions") + return False + # The kernel sums the per-chunk integer popcounts into an fp16 accumulator, + # so each finite score carries fp16 rounding (relative ~2^-10) accumulated + # over the chunks. Compare with a relative tolerance plus a small absolute + # floor; an order-of-magnitude error (a wrong popcount/index) is far outside + # this band and would still fail. + if finite.any(): + diff = (out_cpu[finite] - ref[finite]).abs() + tol = (2 ** -10) * ref[finite].abs() * num_chunk_for(hd) + 1.0 + max_abs = diff.max().item() + max_rel = (diff / (ref[finite].abs() + 1.0)).max().item() + ok = bool((diff <= tol).all()) + else: + max_abs = max_rel = 0.0 + ok = True + print( + f"[{name}] {'PASS' if ok else 'FAIL'}: max_abs_err={max_abs:.1f} " + f"max_rel_err={max_rel:.2e} shape={tuple(out_cpu.shape)} " + f"reduce_kvhead={reduce_kvhead}" + ) + return ok + + +def main(): + torch.cuda.set_device(0) + results = [] + # MLA-style: many heads, single kv head, no kv reduction. + results.append( + run_case("mla", b=2, h=128, hk=1, hd=576, block_size=64, + seqlen_list=[513, 320], sink=1, recent=1, reduce_kvhead=False) + ) + # GQA-style: grouped heads, multiple kv heads, kv reduction (min over kv). + results.append( + run_case("gqa", b=3, h=32, hk=8, hd=128, block_size=128, + seqlen_list=[640, 512, 384], sink=1, recent=1, reduce_kvhead=True) + ) + # Determinism: two runs must be bit-identical. + key, query, bt, sl, ms = build_inputs(2, 128, 1, 576, 64, [513, 320]) + o1 = hamming.hamming_score(key, query, bt, sl, ms, 1, 1, False).detach().cpu() + o2 = hamming.hamming_score(key, query, bt, sl, ms, 1, 1, False).detach().cpu() + det = torch.equal(o1, o2) + print(f"[determinism] {'PASS' if det else 'FAIL'}: two-run bit-identical={det}") + results.append(det) + + if all(results): + print("ALL HAMMING TESTS PASSED") + return 0 + print("HAMMING TESTS FAILED") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ucm/store/nfsstore/device/CMakeLists.txt b/ucm/store/nfsstore/device/CMakeLists.txt index a424119ee..a9de10b46 100644 --- a/ucm/store/nfsstore/device/CMakeLists.txt +++ b/ucm/store/nfsstore/device/CMakeLists.txt @@ -6,10 +6,12 @@ elseif(RUNTIME_ENVIRONMENT STREQUAL "maca") add_subdirectory(maca) elseif(RUNTIME_ENVIRONMENT STREQUAL "cuda") add_subdirectory(cuda) +elseif(RUNTIME_ENVIRONMENT STREQUAL "rocm") + add_subdirectory(rocm) elseif(RUNTIME_ENVIRONMENT STREQUAL "simu") add_subdirectory(simu) else() - message(FATAL_ERROR "RUNTIME_ENVIRONMENT must be one of: ascend, musa, cuda, simu. Current value: ${RUNTIME_ENVIRONMENT}") + message(FATAL_ERROR "RUNTIME_ENVIRONMENT must be one of: ascend, musa, cuda, rocm, simu. Current value: ${RUNTIME_ENVIRONMENT}") endif() if(TARGET storedevice) diff --git a/ucm/store/nfsstore/device/cuda/cuda_device.cu b/ucm/store/nfsstore/device/cuda/cuda_device.cu index 235b860cb..32b3e7119 100644 --- a/ucm/store/nfsstore/device/cuda/cuda_device.cu +++ b/ucm/store/nfsstore/device/cuda/cuda_device.cu @@ -32,16 +32,32 @@ inline __device__ void H2DUnit(uint8_t* __restrict__ dst, const volatile uint8_t* __restrict__ src) { +#if defined(__CUDA_ARCH__) uint64_t a, b; asm volatile("ld.global.cs.v2.u64 {%0, %1}, [%2];" : "=l"(a), "=l"(b) : "l"(src)); asm volatile("st.global.cg.v2.u64 [%0], {%1, %2};" ::"l"(dst), "l"(a), "l"(b)); +#else + // Plain 16-byte copy; see cuda_sm_kernel.cu for why dropping `volatile` is + // correct on AMD (coherent host registration + the per-transfer stream sync + // provide host visibility; AMD `volatile` is only an L1-bypass/glc hint at + // GPU-L2 scope, neither necessary nor sufficient here). + *reinterpret_cast(dst) = + *reinterpret_cast(const_cast(src)); +#endif } inline __device__ void D2HUnit(volatile uint8_t* __restrict__ dst, const uint8_t* __restrict__ src) { +#if defined(__CUDA_ARCH__) uint64_t a, b; asm volatile("ld.global.cs.v2.u64 {%0, %1}, [%2];" : "=l"(a), "=l"(b) : "l"(src)); asm volatile("st.volatile.global.v2.u64 [%0], {%1, %2};" ::"l"(dst), "l"(a), "l"(b)); +#else + // Plain 16-byte copy; volatile not needed on AMD (see H2DUnit and the + // cuda_sm_kernel.cu note). + *reinterpret_cast(const_cast(dst)) = + *reinterpret_cast(src); +#endif } __global__ void H2DKernel(uintptr_t* dst, const volatile uintptr_t* src, size_t num, size_t size) diff --git a/ucm/store/nfsstore/device/rocm/CMakeLists.txt b/ucm/store/nfsstore/device/rocm/CMakeLists.txt new file mode 100644 index 000000000..5fc1507ba --- /dev/null +++ b/ucm/store/nfsstore/device/rocm/CMakeLists.txt @@ -0,0 +1,18 @@ +# enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise auto-detects the +# host GPU(s) via rocm_agent_enumerator and errors if none is found (a no-GPU +# build host must then set the arch explicitly). +enable_language(HIP) +find_package(hip REQUIRED) + +set(HIP_COMPAT_DIR ${CMAKE_CURRENT_LIST_DIR}/../../../../shared/vendor/hip_compat) + +set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_device.cu + PROPERTIES LANGUAGE HIP) +add_library(storedevice STATIC ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_device.cu) +target_include_directories(storedevice PUBLIC ${HIP_COMPAT_DIR} ${CMAKE_CURRENT_LIST_DIR}/..) +target_link_libraries(storedevice PUBLIC infra_status infra_logger fmt hip::host) +if(WIN32) + target_compile_options(storedevice PRIVATE -Wall) +else() + target_compile_options(storedevice PRIVATE -Wall -fPIC) +endif()