diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4a689e51c..c902e1efa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,9 +30,15 @@ if(BUILD_UCM_MINDIE)
     add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${UCM_CXX11_ABI})
 endif()
 
-set(FLAGS_PUBLIC "-Wall -Werror -fPIC -Wl,-z,relro,-z,now")
-set(FLAGS_DEBUG "-O0 -g")
-set(FLAGS_RELEASE "-s -O2 -fstack-protector-strong -D_FORTIFY_SOURCE=2")
+if(WIN32)
+    set(FLAGS_PUBLIC "/W3")
+    set(FLAGS_DEBUG "/Od /Zi")
+    set(FLAGS_RELEASE "/O2")
+else()
+    set(FLAGS_PUBLIC "-Wall -Werror -fPIC -Wl,-z,relro,-z,now")
+    set(FLAGS_DEBUG "-O0 -g")
+    set(FLAGS_RELEASE "-s -O2 -fstack-protector-strong -D_FORTIFY_SOURCE=2")
+endif()
 string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LOWER)
 if(CMAKE_BUILD_TYPE_LOWER STREQUAL "debug")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS_PUBLIC} ${FLAGS_DEBUG}")
diff --git a/docs/source/getting-started/quickstart_sglang.md b/docs/source/getting-started/quickstart_sglang.md
index cab1a57b3..8b50430e5 100644
--- a/docs/source/getting-started/quickstart_sglang.md
+++ b/docs/source/getting-started/quickstart_sglang.md
@@ -85,6 +85,8 @@ docker build --build-arg INSTALL_MODE=package \
     pip install -v -e . --no-build-isolation
     ```
 
+    > **Note:** On AMD GPUs set `export PLATFORM=rocm` instead of `cuda` to select the ROCm device backend (the KV-transfer and Hamming-distance kernels built with HIP). It requires a ROCm installation and a ROCm build of PyTorch; pass `-DCMAKE_HIP_ARCHITECTURES=<arch>` (e.g. `gfx90a`, `gfx1100`) if your GPU is not auto-detected. Validated on gfx90a and gfx1100.
+
 
 ### Option 3: Install by pip
 1. Prepare SGLang Environment
diff --git a/docs/source/getting-started/quickstart_vllm.md b/docs/source/getting-started/quickstart_vllm.md
index 719b3dea2..c0b3f2894 100644
--- a/docs/source/getting-started/quickstart_vllm.md
+++ b/docs/source/getting-started/quickstart_vllm.md
@@ -93,6 +93,8 @@ docker build --build-arg INSTALL_MODE=package \
     pip install -v -e . --no-build-isolation
     ```
 
+    > **Note:** On AMD GPUs set `export PLATFORM=rocm` instead of `cuda`. This selects the ROCm device backend, which builds the same KV-transfer and Hamming-distance kernels with HIP. It requires a ROCm installation and a ROCm build of PyTorch in the environment; pass `-DCMAKE_HIP_ARCHITECTURES=<arch>` (e.g. `gfx90a`, `gfx1100`) if CMake does not detect your GPU. Validated on gfx90a and gfx1100.
+
 3. Apply vLLM Integration Patches (Not required for versions > 0.11.0)
 
     To integrate UCM with vLLM 0.11.0, you can choose between a dynamic **monkey patch** (recommended) and a manual **git patch**.
diff --git a/docs/source/user-guide/support-matrix/support_matrix.md b/docs/source/user-guide/support-matrix/support_matrix.md
index f3f8ff9ee..1cc08f36b 100644
--- a/docs/source/user-guide/support-matrix/support_matrix.md
+++ b/docs/source/user-guide/support-matrix/support_matrix.md
@@ -58,6 +58,7 @@ This section presents the currently supported compute platforms and devices.
 |:----------------:|:------:|:------:|
 | CANN | Ascend | 910C, 910B |
 | CUDA | NVIDIA | H100, H20, L40, L20 |
+| ROCm | AMD | MI250X (gfx90a), Radeon Pro W7800 (gfx1100), Radeon PRO V710 (gfx1101), Radeon RX 9070 XT (gfx1201), Radeon 8060S (gfx1151) |
 | MUSA | Mthreads | S5000 |
 | MACA | MetaX | C500 |
 
diff --git a/setup.py b/setup.py
index 478adfc61..817ba38e8 100644
--- a/setup.py
+++ b/setup.py
@@ -68,7 +68,7 @@ def print_platform_warning():
 {RED}{'=' * 80}
 {BOLD}⚠️  WARNING: PLATFORM environment variable is not set! ⚠️{RESET}
 {RED}{'=' * 80}{RESET}
-{YELLOW}Please set PLATFORM to one of: cuda, ascend, ascend-a3, musa, maca{RESET}
+{YELLOW}Please set PLATFORM to one of: cuda, rocm, ascend, ascend-a3, musa, maca{RESET}
 Example:
   {BOLD}export PLATFORM=cuda{RESET}    # For CUDA platform
 {YELLOW}In CI scenarios only, you don't need to specify PLATFORM. If it's not a CI scenario, please uninstall and then reinstall with PLATFORM specified.{RESET}
@@ -174,6 +174,8 @@ def build_cmake(self, ext: CMakeExtension):
         match PLATFORM:
             case "cuda":
                 cmake_args += ["-DRUNTIME_ENVIRONMENT=cuda"]
+            case "rocm":
+                cmake_args += ["-DRUNTIME_ENVIRONMENT=rocm"]
             case "ascend" | "ascend-a3":
                 cmake_args += ["-DRUNTIME_ENVIRONMENT=ascend"]
             case "musa":
diff --git a/ucm/shared/infra/CMakeLists.txt b/ucm/shared/infra/CMakeLists.txt
index 80a3bf065..546f96f74 100644
--- a/ucm/shared/infra/CMakeLists.txt
+++ b/ucm/shared/infra/CMakeLists.txt
@@ -5,14 +5,11 @@ target_link_libraries(infra_status PUBLIC fmt)
 
 add_subdirectory(logger)
 
-file(GLOB_RECURSE UCMINFRA_TEMPLATE_SOURCE_FILES "template/*.*")
-add_library(infra_template OBJECT ${UCMINFRA_TEMPLATE_SOURCE_FILES})
-target_include_directories(infra_template PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+add_library(infra_template INTERFACE)
+target_include_directories(infra_template INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
 
-file(GLOB_RECURSE UCMINFRA_THREAD_SOURCE_FILES "thread/*.*")
-add_library(infra_thread OBJECT ${UCMINFRA_THREAD_SOURCE_FILES})
-target_include_directories(infra_thread PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+add_library(infra_thread INTERFACE)
+target_include_directories(infra_thread INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
 
-file(GLOB_RECURSE UCMINFRA_TIME_SOURCE_FILES "time/*.*")
-add_library(infra_time OBJECT ${UCMINFRA_TIME_SOURCE_FILES})
-target_include_directories(infra_time PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+add_library(infra_time INTERFACE)
+target_include_directories(infra_time INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
diff --git a/ucm/shared/infra/logger/cc/spdlog_logger.cc b/ucm/shared/infra/logger/cc/spdlog_logger.cc
index acc0cb71a..90b22829c 100644
--- a/ucm/shared/infra/logger/cc/spdlog_logger.cc
+++ b/ucm/shared/infra/logger/cc/spdlog_logger.cc
@@ -32,6 +32,12 @@
 #include <spdlog/spdlog.h>
 #include "compress_rotate_file_sink.h"
 #include "logger.h"
+#ifdef _WIN32
+#include <process.h>
+#define getpid _getpid
+#else
+#include <unistd.h>
+#endif
 namespace UC::Logger {
 constexpr uint32_t kRateLimitCountBits = 2;
 constexpr uint64_t kRateLimitCountMask = (1u << kRateLimitCountBits) - 1u;
diff --git a/ucm/shared/infra/logger/logger.cc b/ucm/shared/infra/logger/logger.cc
index c96ff531a..d51360273 100644
--- a/ucm/shared/infra/logger/logger.cc
+++ b/ucm/shared/infra/logger/logger.cc
@@ -24,7 +24,9 @@
 
 #include "logger.h"
 #include <iostream>
+#ifndef _WIN32
 #include <unistd.h>
+#endif
 namespace UC::Logger {
 
 void Log(Level lv, std::string file, std::string func, int line, std::string msg)
diff --git a/ucm/shared/metrics/CMakeLists.txt b/ucm/shared/metrics/CMakeLists.txt
index 906959720..25973fcbe 100644
--- a/ucm/shared/metrics/CMakeLists.txt
+++ b/ucm/shared/metrics/CMakeLists.txt
@@ -1,5 +1,8 @@
 file(GLOB_RECURSE UCMMETRICS_CC_SOURCE_FILES "./cc/*.cc")
 add_library(metrics SHARED ${UCMMETRICS_CC_SOURCE_FILES})
+if(WIN32)
+    set_target_properties(metrics PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
+endif()
 target_include_directories(metrics PUBLIC
     ${CMAKE_CURRENT_SOURCE_DIR}/cc/api
     ${CMAKE_CURRENT_SOURCE_DIR}/cc/domain
diff --git a/ucm/shared/test/CMakeLists.txt b/ucm/shared/test/CMakeLists.txt
index e608fe173..08d412fdc 100644
--- a/ucm/shared/test/CMakeLists.txt
+++ b/ucm/shared/test/CMakeLists.txt
@@ -1,6 +1,9 @@
 if(BUILD_UNIT_TESTS)
     include(GoogleTest)
     file(GLOB_RECURSE UCMSHARED_TEST_SOURCE_FILES "./case/*.cc")
+    if(WIN32)
+        list(FILTER UCMSHARED_TEST_SOURCE_FILES EXCLUDE REGEX "thread_pool_test\\.cc$")
+    endif()
     add_executable(ucmshared.test ${UCMSHARED_TEST_SOURCE_FILES})
     target_include_directories(ucmshared.test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/case)
     target_link_libraries(ucmshared.test PRIVATE
diff --git a/ucm/shared/test/case/metrics/metrics_test.cc b/ucm/shared/test/case/metrics/metrics_test.cc
index fa975ef74..1c2107b52 100644
--- a/ucm/shared/test/case/metrics/metrics_test.cc
+++ b/ucm/shared/test/case/metrics/metrics_test.cc
@@ -27,7 +27,9 @@
 #include <iostream>
 #include <numeric>
 #include <thread>
+#ifndef _WIN32
 #include <unistd.h>
+#endif
 #include <vector>
 #include "metrics_api.h"
 
diff --git a/ucm/shared/trans/CMakeLists.txt b/ucm/shared/trans/CMakeLists.txt
index 6dfe24f33..c91c375c4 100644
--- a/ucm/shared/trans/CMakeLists.txt
+++ b/ucm/shared/trans/CMakeLists.txt
@@ -7,6 +7,9 @@ endif()
 if(RUNTIME_ENVIRONMENT STREQUAL "cuda")
     add_subdirectory(cuda)
 endif()
+if(RUNTIME_ENVIRONMENT STREQUAL "rocm")
+    add_subdirectory(rocm)
+endif()
 if(RUNTIME_ENVIRONMENT STREQUAL "simu")
     add_subdirectory(simu)
 endif()
diff --git a/ucm/shared/trans/cuda/cuda_sm_kernel.cu b/ucm/shared/trans/cuda/cuda_sm_kernel.cu
index 595092525..f633b9d2c 100644
--- a/ucm/shared/trans/cuda/cuda_sm_kernel.cu
+++ b/ucm/shared/trans/cuda/cuda_sm_kernel.cu
@@ -34,6 +34,7 @@ namespace UC::Trans {
 inline __device__ void CudaCopyUnit(const uint8_t* __restrict__ src,
                                     volatile uint8_t* __restrict__ dst)
 {
+#if defined(__CUDA_ARCH__)
     uint4 lo, hi;
     asm volatile("ld.global.cs.v4.b32 {%0,%1,%2,%3}, [%4];"
                  : "=r"(lo.x), "=r"(lo.y), "=r"(lo.z), "=r"(lo.w)
@@ -47,6 +48,20 @@ inline __device__ void CudaCopyUnit(const uint8_t* __restrict__ src,
     asm volatile("st.volatile.global.v4.b32 [%0+16], {%1,%2,%3,%4};"
                  :
                  : "l"(dst), "r"(hi.x), "r"(hi.y), "r"(hi.z), "r"(hi.w));
+#else
+    // ROCm has no ld.global.cs/st.volatile.global PTX or __ldcs/__stcg builtins;
+    // this is a plain vectorized 32-byte copy. Dropping `volatile` is correct on
+    // AMD: host visibility comes from the fine-grained-coherent host registration
+    // plus the per-transfer hipStreamSynchronize (GPU caches flush at kernel
+    // completion), not from the qualifier. On AMD `volatile` only forces an L1
+    // bypass (glc, GPU-L2 scope), which is neither necessary nor sufficient for
+    // host visibility; system-scope ordering, if ever needed, is
+    // __threadfence_system.
+    const uint4* src4 = reinterpret_cast<const uint4*>(src);
+    uint4* dst4 = reinterpret_cast<uint4*>(const_cast<uint8_t*>(dst));
+    dst4[0] = src4[0];
+    dst4[1] = src4[1];
+#endif
 }
 
 __global__ void CudaCopyKernel(const void** src, void** dst, size_t size, size_t num)
diff --git a/ucm/shared/trans/rocm/CMakeLists.txt b/ucm/shared/trans/rocm/CMakeLists.txt
new file mode 100644
index 000000000..36a934c76
--- /dev/null
+++ b/ucm/shared/trans/rocm/CMakeLists.txt
@@ -0,0 +1,30 @@
+# enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise auto-detects the
+# host GPU(s) via rocm_agent_enumerator and errors if none is found (a no-GPU
+# build host must then set the arch explicitly).
+enable_language(HIP)
+find_package(hip REQUIRED)
+
+set(HIP_COMPAT_DIR ${CMAKE_CURRENT_LIST_DIR}/../../vendor/hip_compat)
+
+set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_kernel.cu
+    PROPERTIES LANGUAGE HIP)
+add_library(kernel OBJECT ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_kernel.cu)
+target_include_directories(kernel PRIVATE ${HIP_COMPAT_DIR} ${CMAKE_CURRENT_LIST_DIR}/../cuda)
+if(WIN32)
+    target_compile_options(kernel PRIVATE -Wall)
+else()
+    target_compile_options(kernel PRIVATE -Wall -fPIC)
+endif()
+
+add_library(trans STATIC
+    ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_device.cc
+    ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_buffer.cc
+    ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_stream.cc
+    ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_stream.cc
+)
+target_include_directories(trans PUBLIC ${HIP_COMPAT_DIR} ${CMAKE_CURRENT_LIST_DIR}/../cuda)
+target_link_libraries(trans PUBLIC
+    fmt
+    hip::host
+    kernel
+)
diff --git a/ucm/shared/vendor/hip_compat/cuda.h b/ucm/shared/vendor/hip_compat/cuda.h
new file mode 100644
index 000000000..59b12e3e5
--- /dev/null
+++ b/ucm/shared/vendor/hip_compat/cuda.h
@@ -0,0 +1,40 @@
+/**
+ * MIT License
+ *
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * */
+
+/*
+ * ROCm/HIP compatibility shim for the CUDA driver-API header. The sparse
+ * Hamming-distance extension includes <cuda.h> only to pull in the runtime
+ * declarations it shares with <cuda_runtime.h>; it uses no driver-API entry
+ * points. On a ROCm build we map it onto the runtime shim so the include
+ * resolves without the (absent) NVIDIA driver header.
+ */
+#ifndef UNIFIEDCACHE_HIP_COMPAT_CUDA_H
+#define UNIFIEDCACHE_HIP_COMPAT_CUDA_H
+
+#include "cuda_runtime.h"
+
+#endif
diff --git a/ucm/shared/vendor/hip_compat/cuda_runtime.h b/ucm/shared/vendor/hip_compat/cuda_runtime.h
new file mode 100644
index 000000000..a84ff95b1
--- /dev/null
+++ b/ucm/shared/vendor/hip_compat/cuda_runtime.h
@@ -0,0 +1,73 @@
+/**
+ * MIT License
+ *
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * */
+
+/*
+ * ROCm/HIP compatibility shim. On a ROCm build the per-backend CMake puts this
+ * directory ahead of the toolchain includes, so every existing
+ * `#include <cuda_runtime.h>` resolves here instead of the (absent) NVIDIA
+ * header. We pull in the HIP runtime and alias the small set of cuda* runtime
+ * symbols the KV-transfer backend uses to their hip* equivalents, so the
+ * device-backend sources compile unchanged. The NVIDIA path never sees this
+ * file (its include dir points at the real CUDA toolkit).
+ */
+#ifndef UNIFIEDCACHE_HIP_COMPAT_CUDA_RUNTIME_H
+#define UNIFIEDCACHE_HIP_COMPAT_CUDA_RUNTIME_H
+
+#include <cstdlib>
+#include <cstring>
+
+#include <hip/hip_runtime.h>
+
+using cudaError_t = hipError_t;
+using cudaStream_t = hipStream_t;
+using cudaEvent_t = hipEvent_t;
+
+static constexpr hipError_t cudaSuccess = hipSuccess;
+static constexpr hipMemcpyKind cudaMemcpyHostToDevice = hipMemcpyHostToDevice;
+static constexpr hipMemcpyKind cudaMemcpyDeviceToHost = hipMemcpyDeviceToHost;
+static constexpr unsigned int cudaStreamNonBlocking = hipStreamNonBlocking;
+static constexpr unsigned int cudaHostRegisterDefault = hipHostRegisterDefault;
+
+#define cudaMalloc hipMalloc
+#define cudaFree hipFree
+#define cudaMallocHost hipHostMalloc
+#define cudaFreeHost hipHostFree
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamAddCallback hipStreamAddCallback
+#define cudaStreamWaitEvent hipStreamWaitEvent
+#define cudaGetLastError hipGetLastError
+#define cudaGetErrorString hipGetErrorString
+#define cudaHostRegister hipHostRegister
+#define cudaHostUnregister hipHostUnregister
+#define cudaHostGetDevicePointer hipHostGetDevicePointer
+
+#endif
diff --git a/ucm/sparse/gsa_on_device/CMakeLists.txt b/ucm/sparse/gsa_on_device/CMakeLists.txt
index d0d1f218e..7ade00967 100644
--- a/ucm/sparse/gsa_on_device/CMakeLists.txt
+++ b/ucm/sparse/gsa_on_device/CMakeLists.txt
@@ -52,4 +52,8 @@ if(RUNTIME_ENVIRONMENT STREQUAL "cuda")
     message(STATUS "Building GSAOnDevice for CUDA...")
     add_subdirectory(csrc/cuda/hash_retrieval)
     add_subdirectory(csrc/cuda/ham_dist)
+elseif(RUNTIME_ENVIRONMENT STREQUAL "rocm")
+    message(STATUS "Building GSAOnDevice for ROCm...")
+    add_subdirectory(csrc/cuda/hash_retrieval)
+    add_subdirectory(csrc/rocm/ham_dist)
 endif()
diff --git a/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/operator.h b/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/operator.h
index ef3fce15d..783d0b9fe 100644
--- a/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/operator.h
+++ b/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/operator.h
@@ -1,7 +1,14 @@
 #pragma once
 
+#ifdef USE_ROCM
+// torch keeps the cuda spelling for its public symbols on ROCm; the hipified
+// context header provides c10::cuda::getCurrentCUDAStream backed by HIP, while
+// the cuda-spelled header pulls in NVIDIA-only cuda_runtime_api.h/cusparse.h.
+#include <ATen/hip/HIPContext.h>
+#else
 #include <ATen/cuda/CUDAContext.h>
 #include <cuda.h>
+#endif
 #include <cuda_runtime.h>
 #include <torch/script.h>
 
diff --git a/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/paged_ham_dist_mla.cu b/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/paged_ham_dist_mla.cu
index eeef537b9..a882518f4 100644
--- a/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/paged_ham_dist_mla.cu
+++ b/ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/paged_ham_dist_mla.cu
@@ -1,4 +1,6 @@
+#ifndef USE_ROCM
 #include <cuda.h>
+#endif
 #include <cuda_runtime.h>
 #include <torch/script.h>
 
diff --git a/ucm/sparse/gsa_on_device/csrc/rocm/ham_dist/CMakeLists.txt b/ucm/sparse/gsa_on_device/csrc/rocm/ham_dist/CMakeLists.txt
new file mode 100644
index 000000000..182a45bd3
--- /dev/null
+++ b/ucm/sparse/gsa_on_device/csrc/rocm/ham_dist/CMakeLists.txt
@@ -0,0 +1,115 @@
+message(STATUS "Building ham_dist (hamming HIP/ROCm extension)...")
+
+# The project-global release flags strip all symbols (-s), which would remove
+# the CPython module init entry point from this extension and make it
+# unloadable. Drop -s for this subtree so the .so keeps PyInit_hamming.
+string(REPLACE "-s " "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+string(REGEX REPLACE "(^| )-s$" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+# enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise auto-detects the
+# host GPU(s) via rocm_agent_enumerator and errors if none is found (a no-GPU
+# build host must then set the arch explicitly).
+enable_language(HIP)
+find_package(hip REQUIRED)
+
+set(CUDA_SRC_DIR ${CMAKE_CURRENT_LIST_DIR}/../../cuda/ham_dist)
+set(HIP_COMPAT_DIR ${UCM_ROOT_DIR}/ucm/shared/vendor/hip_compat)
+
+# ---- Python ----
+find_package(Python COMPONENTS Interpreter Development.Module REQUIRED)
+
+execute_process(
+    COMMAND ${Python_EXECUTABLE} -c "import sysconfig; print(sysconfig.get_config_var('EXT_SUFFIX') or '')"
+    OUTPUT_VARIABLE PY_EXT_SUFFIX
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+execute_process(
+    COMMAND ${Python_EXECUTABLE} -c "import torch, os; print(os.path.dirname(os.path.abspath(torch.__file__)))"
+    OUTPUT_VARIABLE PYTORCH_PATH
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    RESULT_VARIABLE PYTORCH_RESULT
+)
+if(NOT PYTORCH_RESULT EQUAL 0)
+    message(FATAL_ERROR "Failed to find PyTorch installation via Python")
+endif()
+message(STATUS "Found PyTorch at: ${PYTORCH_PATH}")
+
+set(CXX11_ABI "1")
+execute_process(
+    COMMAND ${Python_EXECUTABLE} -c "import torch; print(int(getattr(torch._C, '_GLIBCXX_USE_CXX11_ABI', 1)))"
+    OUTPUT_VARIABLE TORCH_ABI
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    RESULT_VARIABLE TORCH_ABI_RESULT
+)
+if(TORCH_ABI_RESULT EQUAL 0)
+    set(CXX11_ABI "${TORCH_ABI}")
+endif()
+
+set(INCLUDE_DIRS
+    ${HIP_COMPAT_DIR}
+    ${CUDA_SRC_DIR}
+    ${PYTORCH_PATH}/include
+    ${PYTORCH_PATH}/include/torch/csrc/api/include
+)
+
+set(LIBRARY_DIRS
+    ${PYTORCH_PATH}/lib
+)
+
+set(LIBRARIES
+    torch
+    c10
+    c10_hip
+    torch_cpu
+    torch_hip
+    torch_python
+    pthread
+    hip::host
+)
+
+set_source_files_properties(${CUDA_SRC_DIR}/paged_ham_dist_mla.cu PROPERTIES LANGUAGE HIP)
+# NO_EXTRAS disables pybind11's default LTO + strip, which under
+# -fvisibility=hidden would drop the CPython module init symbol and make the
+# extension unloadable.
+pybind11_add_module(hamming NO_EXTRAS
+    ${CUDA_SRC_DIR}/cpy/hamming.cpp
+    ${CUDA_SRC_DIR}/paged_ham_dist_mla.cu
+)
+
+set_target_properties(hamming PROPERTIES
+    PREFIX ""
+    SUFFIX "${PY_EXT_SUFFIX}"
+    POSITION_INDEPENDENT_CODE ON
+    CXX_STANDARD 20
+    HIP_STANDARD 20
+)
+
+target_include_directories(hamming PRIVATE ${INCLUDE_DIRS})
+target_link_directories(hamming PRIVATE ${LIBRARY_DIRS})
+target_link_libraries(hamming PRIVATE ${LIBRARIES} Python::Module)
+
+# torch on ROCm keeps the cuda spelling and selects the HIP backend via these
+# macros; the half-operator unmacros are nvcc-only and unneeded under hipcc.
+target_compile_definitions(hamming PRIVATE
+    TORCH_EXTENSION_NAME=hamming
+    __HIP_PLATFORM_AMD__
+    USE_ROCM
+    _GLIBCXX_USE_CXX11_ABI=${CXX11_ABI}
+)
+
+target_compile_options(hamming PRIVATE
+    $<$<COMPILE_LANGUAGE:CXX>:-O3>
+    $<$<COMPILE_LANGUAGE:HIP>:-O3>
+    # The kernel uses INFINITY as a masking sentinel, so do not enable clang
+    # -ffast-math (it implies -ffinite-math-only and would discard the inf).
+    # torch 2.x headers use C++20 constructs; do not let the project-global
+    # -Werror promote warnings from third-party headers to errors here.
+    -Wno-error
+)
+
+file(RELATIVE_PATH INSTALL_REL_PATH
+     ${UCM_ROOT_DIR}
+     ${CUDA_SRC_DIR}
+)
+install(TARGETS hamming LIBRARY DESTINATION ${INSTALL_REL_PATH} COMPONENT ucm)
diff --git a/ucm/sparse/test/gsa/test_hamming_rocm_ref.py b/ucm/sparse/test/gsa/test_hamming_rocm_ref.py
new file mode 100644
index 000000000..526993954
--- /dev/null
+++ b/ucm/sparse/test/gsa/test_hamming_rocm_ref.py
@@ -0,0 +1,189 @@
+"""Cross-checked pass/fail test for the Hamming-distance scoring kernel.
+
+The original test_cuda_hamming_{mla,gqa}.py drivers only print/benchmark the
+kernel output. This test instead computes an independent CPU reference for the
+paged (block-mode) Hamming score and asserts the GPU kernel matches it, giving
+a deterministic correctness gate that is valid on both NVIDIA and ROCm. Run it
+with one GPU visible, e.g. HIP_VISIBLE_DEVICES=0 / CUDA_VISIBLE_DEVICES=0.
+"""
+
+import os
+import sys
+
+import torch
+
+try:
+    from ucm.sparse.gsa_on_device.csrc.cuda.ham_dist import hamming
+except ModuleNotFoundError:
+    # Allow running against the freshly built extension without an editable
+    # install: point HAMMING_DIR at the directory holding hamming*.so.
+    sys.path.insert(0, os.environ.get("HAMMING_DIR", os.getcwd()))
+    import hamming
+
+
+def num_chunk_for(hd: int) -> int:
+    return hd // 32
+
+
+def popcount32(x: torch.Tensor) -> torch.Tensor:
+    x = x.to(torch.int64) & 0xFFFFFFFF
+    count = torch.zeros_like(x)
+    for i in range(32):
+        count += (x >> i) & 1
+    return count
+
+
+def reference_block_score(
+    key_i32, query_i32, block_table, seqlen, max_seqlen, sink, recent, reduce_kvhead
+):
+    # key:   (num_blocks, block_size, num_kv_head, num_chunk) int32
+    # query: (b, 1, num_head, num_chunk) int32
+    #
+    # Exact integer reference (popcount of key XOR query, summed over the GQA
+    # group and over chunks). The kernel stores the score in fp16, so the
+    # comparison below allows for fp16 rounding of these large integer sums.
+    num_blocks, block_size, num_kv_head, num_chunk = key_i32.shape
+    b = query_i32.shape[0]
+    num_head = query_i32.shape[2]
+    kv_group = num_head // num_kv_head
+
+    key = key_i32.cpu()
+    query = query_i32.cpu()
+    bt = block_table.cpu()
+    sl = seqlen.cpu()
+
+    if reduce_kvhead:
+        out = torch.full((b, 1, max_seqlen), float("inf"), dtype=torch.float32)
+    else:
+        out = torch.full((b, num_kv_head, max_seqlen), float("inf"), dtype=torch.float32)
+
+    for bi in range(b):
+        actual = int(sl[bi].item())
+        for pos in range(max_seqlen):
+            is_inf = pos >= actual
+            is_sink_or_recent = (pos < sink) or (
+                (pos >= actual - recent) and (pos < actual)
+            )
+            if is_inf:
+                continue
+            block_slot = pos // block_size
+            offset = pos % block_size
+            phys = int(bt[bi, block_slot].item())
+            # The kernel reads the key block as (num_kv_head, block_size,
+            # num_chunk): base = phys*(num_kv_head*block_size*num_chunk),
+            # element (kv*block_size + offset)*num_chunk + chunk. Index a flat
+            # view of the key tensor the same way so the reference matches the
+            # kernel's layout regardless of the host tensor's nominal shape.
+            block = key[phys].reshape(-1)
+            per_kv = []
+            for kv in range(num_kv_head):
+                base = (kv * block_size + offset) * num_chunk
+                k_chunks = block[base : base + num_chunk]
+                s = 0
+                for g in range(kv_group):
+                    head = kv * kv_group + g
+                    q_chunks = query[bi, 0, head, :]
+                    s += int(popcount32(k_chunks ^ q_chunks).sum().item())
+                per_kv.append(s)
+            if reduce_kvhead:
+                val = 0.0 if is_sink_or_recent else float(min(per_kv))
+                out[bi, 0, pos] = val
+            else:
+                for kv in range(num_kv_head):
+                    val = 0.0 if is_sink_or_recent else float(per_kv[kv])
+                    out[bi, kv, pos] = val
+    return out
+
+
+def build_inputs(b, h, hk, hd, block_size, seqlen_list, seed=42):
+    torch.manual_seed(seed)
+    max_seqlen = max(seqlen_list)
+    seqlen = torch.tensor(seqlen_list, dtype=torch.int32).cuda()
+    num_blocks_per_seq = (seqlen + block_size - 1) // block_size
+    num_blocks = int(num_blocks_per_seq.sum().item()) + 1
+    max_num_block_per_seq = (max_seqlen + block_size - 1) // block_size
+    max_seqlen = int(max_num_block_per_seq * block_size)
+
+    block_table = torch.zeros((b, max_num_block_per_seq), dtype=torch.int32)
+    start = 1
+    for i, n in enumerate(num_blocks_per_seq):
+        block_table[i, :n] = torch.arange(start, start + n, dtype=torch.int32)
+        start += int(n)
+    block_table = block_table.cuda()
+
+    key = torch.randn(num_blocks, block_size, hk, hd // 32).to(torch.float32)
+    query = torch.randn(b, 1, h, hd // 32).to(torch.float32)
+    key = key.view(torch.int32).cuda()
+    query = query.view(torch.int32).cuda()
+    return key, query, block_table, seqlen, max_seqlen
+
+
+def run_case(name, b, h, hk, hd, block_size, seqlen_list, sink, recent, reduce_kvhead):
+    key, query, block_table, seqlen, max_seqlen = build_inputs(
+        b, h, hk, hd, block_size, seqlen_list
+    )
+    out = hamming.hamming_score(
+        key, query, block_table, seqlen, max_seqlen, sink, recent, reduce_kvhead
+    )
+    out_cpu = out.detach().float().cpu()
+    ref = reference_block_score(
+        key, query, block_table, seqlen, max_seqlen, sink, recent, reduce_kvhead
+    )
+
+    finite = torch.isfinite(ref)
+    mismatch_inf = torch.isinf(ref) ^ torch.isinf(out_cpu)
+    if mismatch_inf.any():
+        print(f"[{name}] FAIL: inf mask mismatch at {int(mismatch_inf.sum())} positions")
+        return False
+    # The kernel sums the per-chunk integer popcounts into an fp16 accumulator,
+    # so each finite score carries fp16 rounding (relative ~2^-10) accumulated
+    # over the chunks. Compare with a relative tolerance plus a small absolute
+    # floor; an order-of-magnitude error (a wrong popcount/index) is far outside
+    # this band and would still fail.
+    if finite.any():
+        diff = (out_cpu[finite] - ref[finite]).abs()
+        tol = (2 ** -10) * ref[finite].abs() * num_chunk_for(hd) + 1.0
+        max_abs = diff.max().item()
+        max_rel = (diff / (ref[finite].abs() + 1.0)).max().item()
+        ok = bool((diff <= tol).all())
+    else:
+        max_abs = max_rel = 0.0
+        ok = True
+    print(
+        f"[{name}] {'PASS' if ok else 'FAIL'}: max_abs_err={max_abs:.1f} "
+        f"max_rel_err={max_rel:.2e} shape={tuple(out_cpu.shape)} "
+        f"reduce_kvhead={reduce_kvhead}"
+    )
+    return ok
+
+
+def main():
+    torch.cuda.set_device(0)
+    results = []
+    # MLA-style: many heads, single kv head, no kv reduction.
+    results.append(
+        run_case("mla", b=2, h=128, hk=1, hd=576, block_size=64,
+                 seqlen_list=[513, 320], sink=1, recent=1, reduce_kvhead=False)
+    )
+    # GQA-style: grouped heads, multiple kv heads, kv reduction (min over kv).
+    results.append(
+        run_case("gqa", b=3, h=32, hk=8, hd=128, block_size=128,
+                 seqlen_list=[640, 512, 384], sink=1, recent=1, reduce_kvhead=True)
+    )
+    # Determinism: two runs must be bit-identical.
+    key, query, bt, sl, ms = build_inputs(2, 128, 1, 576, 64, [513, 320])
+    o1 = hamming.hamming_score(key, query, bt, sl, ms, 1, 1, False).detach().cpu()
+    o2 = hamming.hamming_score(key, query, bt, sl, ms, 1, 1, False).detach().cpu()
+    det = torch.equal(o1, o2)
+    print(f"[determinism] {'PASS' if det else 'FAIL'}: two-run bit-identical={det}")
+    results.append(det)
+
+    if all(results):
+        print("ALL HAMMING TESTS PASSED")
+        return 0
+    print("HAMMING TESTS FAILED")
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/ucm/store/nfsstore/device/CMakeLists.txt b/ucm/store/nfsstore/device/CMakeLists.txt
index a424119ee..a9de10b46 100644
--- a/ucm/store/nfsstore/device/CMakeLists.txt
+++ b/ucm/store/nfsstore/device/CMakeLists.txt
@@ -6,10 +6,12 @@ elseif(RUNTIME_ENVIRONMENT STREQUAL "maca")
     add_subdirectory(maca)
 elseif(RUNTIME_ENVIRONMENT STREQUAL "cuda")
     add_subdirectory(cuda)
+elseif(RUNTIME_ENVIRONMENT STREQUAL "rocm")
+    add_subdirectory(rocm)
 elseif(RUNTIME_ENVIRONMENT STREQUAL "simu")
     add_subdirectory(simu)
 else()
-    message(FATAL_ERROR "RUNTIME_ENVIRONMENT must be one of: ascend, musa, cuda, simu. Current value: ${RUNTIME_ENVIRONMENT}")
+    message(FATAL_ERROR "RUNTIME_ENVIRONMENT must be one of: ascend, musa, cuda, rocm, simu. Current value: ${RUNTIME_ENVIRONMENT}")
 endif()
 
 if(TARGET storedevice)
diff --git a/ucm/store/nfsstore/device/cuda/cuda_device.cu b/ucm/store/nfsstore/device/cuda/cuda_device.cu
index 235b860cb..32b3e7119 100644
--- a/ucm/store/nfsstore/device/cuda/cuda_device.cu
+++ b/ucm/store/nfsstore/device/cuda/cuda_device.cu
@@ -32,16 +32,32 @@
 
 inline __device__ void H2DUnit(uint8_t* __restrict__ dst, const volatile uint8_t* __restrict__ src)
 {
+#if defined(__CUDA_ARCH__)
     uint64_t a, b;
     asm volatile("ld.global.cs.v2.u64 {%0, %1}, [%2];" : "=l"(a), "=l"(b) : "l"(src));
     asm volatile("st.global.cg.v2.u64 [%0], {%1, %2};" ::"l"(dst), "l"(a), "l"(b));
+#else
+    // Plain 16-byte copy; see cuda_sm_kernel.cu for why dropping `volatile` is
+    // correct on AMD (coherent host registration + the per-transfer stream sync
+    // provide host visibility; AMD `volatile` is only an L1-bypass/glc hint at
+    // GPU-L2 scope, neither necessary nor sufficient here).
+    *reinterpret_cast<uint4*>(dst) =
+        *reinterpret_cast<const uint4*>(const_cast<const uint8_t*>(src));
+#endif
 }
 
 inline __device__ void D2HUnit(volatile uint8_t* __restrict__ dst, const uint8_t* __restrict__ src)
 {
+#if defined(__CUDA_ARCH__)
     uint64_t a, b;
     asm volatile("ld.global.cs.v2.u64 {%0, %1}, [%2];" : "=l"(a), "=l"(b) : "l"(src));
     asm volatile("st.volatile.global.v2.u64 [%0], {%1, %2};" ::"l"(dst), "l"(a), "l"(b));
+#else
+    // Plain 16-byte copy; volatile not needed on AMD (see H2DUnit and the
+    // cuda_sm_kernel.cu note).
+    *reinterpret_cast<uint4*>(const_cast<uint8_t*>(dst)) =
+        *reinterpret_cast<const uint4*>(src);
+#endif
 }
 
 __global__ void H2DKernel(uintptr_t* dst, const volatile uintptr_t* src, size_t num, size_t size)
diff --git a/ucm/store/nfsstore/device/rocm/CMakeLists.txt b/ucm/store/nfsstore/device/rocm/CMakeLists.txt
new file mode 100644
index 000000000..5fc1507ba
--- /dev/null
+++ b/ucm/store/nfsstore/device/rocm/CMakeLists.txt
@@ -0,0 +1,18 @@
+# enable_language(HIP) honors -DCMAKE_HIP_ARCHITECTURES, otherwise auto-detects the
+# host GPU(s) via rocm_agent_enumerator and errors if none is found (a no-GPU
+# build host must then set the arch explicitly).
+enable_language(HIP)
+find_package(hip REQUIRED)
+
+set(HIP_COMPAT_DIR ${CMAKE_CURRENT_LIST_DIR}/../../../../shared/vendor/hip_compat)
+
+set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_device.cu
+    PROPERTIES LANGUAGE HIP)
+add_library(storedevice STATIC ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_device.cu)
+target_include_directories(storedevice PUBLIC ${HIP_COMPAT_DIR} ${CMAKE_CURRENT_LIST_DIR}/..)
+target_link_libraries(storedevice PUBLIC infra_status infra_logger fmt hip::host)
+if(WIN32)
+    target_compile_options(storedevice PRIVATE -Wall)
+else()
+    target_compile_options(storedevice PRIVATE -Wall -fPIC)
+endif()