ModelEngine-Group · jeffdaily · Jun 11, 2026 · Jun 23, 2026 · Jun 23, 2026 · flesher0813
@@ -30,9 +30,15 @@ if(BUILD_UCM_MINDIE)
     add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${UCM_CXX11_ABI})
 endif()
 
-set(FLAGS_PUBLIC "-Wall -Werror -fPIC -Wl,-z,relro,-z,now")
-set(FLAGS_DEBUG "-O0 -g")
-set(FLAGS_RELEASE "-s -O2 -fstack-protector-strong -D_FORTIFY_SOURCE=2")
+if(MSVC)  # cl-style switches follow the compiler front end (cl, clang-cl), not the host OS
+    set(FLAGS_PUBLIC "/W3")
+    set(FLAGS_DEBUG "/Od /Zi")
+    set(FLAGS_RELEASE "/O2")
+else()
+    set(FLAGS_PUBLIC "-Wall -Werror -fPIC -Wl,-z,relro,-z,now")
+    set(FLAGS_DEBUG "-O0 -g")
+    set(FLAGS_RELEASE "-s -O2 -fstack-protector-strong -D_FORTIFY_SOURCE=2")
+endif()
 string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LOWER)
 if(CMAKE_BUILD_TYPE_LOWER STREQUAL "debug")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS_PUBLIC} ${FLAGS_DEBUG}")

@@ -85,6 +85,8 @@ docker build --build-arg INSTALL_MODE=package \
     pip install -v -e . --no-build-isolation
     ```
 
+    > **Note:** On AMD GPUs, set `export PLATFORM=rocm` instead of `cuda` to select the ROCm device backend, which builds the KV-transfer and Hamming-distance kernels with HIP. This backend requires a ROCm installation and a ROCm build of PyTorch; pass `-DCMAKE_HIP_ARCHITECTURES=<arch>` (e.g. `gfx90a`, `gfx1100`) if your GPU is not auto-detected. Validated on gfx90a and gfx1100.
+
 
 ### Option 3: Install by pip
 1. Prepare SGLang Environment

@@ -93,6 +93,8 @@ docker build --build-arg INSTALL_MODE=package \
     pip install -v -e . --no-build-isolation
     ```
 
+    > **Note:** On AMD GPUs set `export PLATFORM=rocm` instead of `cuda`. This selects the ROCm device backend, which builds the same KV-transfer and Hamming-distance kernels with HIP. It requires a ROCm installation and a ROCm build of PyTorch in the environment; pass `-DCMAKE_HIP_ARCHITECTURES=<arch>` (e.g. `gfx90a`, `gfx1100`) if CMake does not detect your GPU. Validated on gfx90a and gfx1100.
+
 3. Apply vLLM Integration Patches (Not required for versions > 0.11.0)
 
     To integrate UCM with vLLM 0.11.0, you can choose between a dynamic **monkey patch** (recommended) and a manual **git patch**.

@@ -58,6 +58,7 @@ This section presents the currently supported compute platforms and devices.
 |:----------------:|:------:|:------:|
 | CANN | Ascend | 910C, 910B |
 | CUDA | NVIDIA | H100, H20, L40, L20 |
+| ROCm | AMD | MI250X (gfx90a), Radeon Pro W7800 (gfx1100), Radeon PRO V710 (gfx1101), Radeon RX 9070 XT (gfx1201), Radeon 8060S (gfx1151) |
 | MUSA | Mthreads | S5000 |
 | MACA | MetaX | C500 |
 

@@ -68,7 +68,7 @@ def print_platform_warning():
 {RED}{'=' * 80}
 {BOLD}⚠️  WARNING: PLATFORM environment variable is not set! ⚠️{RESET}
 {RED}{'=' * 80}{RESET}
-{YELLOW}Please set PLATFORM to one of: cuda, ascend, ascend-a3, musa, maca{RESET}
+{YELLOW}Please set PLATFORM to one of: cuda, rocm, ascend, ascend-a3, musa, maca{RESET}
 Example:
   {BOLD}export PLATFORM=cuda{RESET}    # For CUDA platform
 {YELLOW}In CI scenarios only, you don't need to specify PLATFORM. If it's not a CI scenario, please uninstall and then reinstall with PLATFORM specified.{RESET}
@@ -174,6 +174,8 @@ def build_cmake(self, ext: CMakeExtension):
         match PLATFORM:
             case "cuda":
                 cmake_args += ["-DRUNTIME_ENVIRONMENT=cuda"]
+            case "rocm":
+                cmake_args += ["-DRUNTIME_ENVIRONMENT=rocm"]
             case "ascend" | "ascend-a3":
                 cmake_args += ["-DRUNTIME_ENVIRONMENT=ascend"]
             case "musa":

@@ -5,14 +5,11 @@ target_link_libraries(infra_status PUBLIC fmt)
 
 add_subdirectory(logger)
 
-file(GLOB_RECURSE UCMINFRA_TEMPLATE_SOURCE_FILES "template/*.*")
-add_library(infra_template OBJECT ${UCMINFRA_TEMPLATE_SOURCE_FILES})
-target_include_directories(infra_template PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+add_library(infra_template INTERFACE)
+target_include_directories(infra_template INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
 
-file(GLOB_RECURSE UCMINFRA_THREAD_SOURCE_FILES "thread/*.*")
-add_library(infra_thread OBJECT ${UCMINFRA_THREAD_SOURCE_FILES})
-target_include_directories(infra_thread PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+add_library(infra_thread INTERFACE)
+target_include_directories(infra_thread INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
 
-file(GLOB_RECURSE UCMINFRA_TIME_SOURCE_FILES "time/*.*")
-add_library(infra_time OBJECT ${UCMINFRA_TIME_SOURCE_FILES})
-target_include_directories(infra_time PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+add_library(infra_time INTERFACE)
+target_include_directories(infra_time INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
@@ -32,6 +32,12 @@
 #include <spdlog/spdlog.h>
 #include "compress_rotate_file_sink.h"
 #include "logger.h"
+#ifdef _WIN32
+#include <process.h>
+#define getpid _getpid
+#else
+#include <unistd.h>
+#endif
 namespace UC::Logger {
 constexpr uint32_t kRateLimitCountBits = 2;
 constexpr uint64_t kRateLimitCountMask = (1u << kRateLimitCountBits) - 1u;

@@ -24,7 +24,9 @@
 
 #include "logger.h"
 #include <iostream>
+#ifndef _WIN32
 #include <unistd.h>
+#endif
 namespace UC::Logger {
 
 void Log(Level lv, std::string file, std::string func, int line, std::string msg)

@@ -1,5 +1,8 @@
 file(GLOB_RECURSE UCMMETRICS_CC_SOURCE_FILES "./cc/*.cc")
 add_library(metrics SHARED ${UCMMETRICS_CC_SOURCE_FILES})
+if(WIN32)
+    set_target_properties(metrics PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
+endif()
 target_include_directories(metrics PUBLIC
     ${CMAKE_CURRENT_SOURCE_DIR}/cc/api
     ${CMAKE_CURRENT_SOURCE_DIR}/cc/domain

@@ -1,6 +1,9 @@
 if(BUILD_UNIT_TESTS)
     include(GoogleTest)
     file(GLOB_RECURSE UCMSHARED_TEST_SOURCE_FILES "./case/*.cc")
+    if(WIN32)
+        list(FILTER UCMSHARED_TEST_SOURCE_FILES EXCLUDE REGEX "thread_pool_test\\.cc$")
+    endif()
     add_executable(ucmshared.test ${UCMSHARED_TEST_SOURCE_FILES})
     target_include_directories(ucmshared.test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/case)
     target_link_libraries(ucmshared.test PRIVATE

@@ -27,7 +27,9 @@
 #include <iostream>
 #include <numeric>
 #include <thread>
+#ifndef _WIN32
 #include <unistd.h>
+#endif
 #include <vector>
 #include "metrics_api.h"
 

@@ -7,6 +7,9 @@ endif()
 if(RUNTIME_ENVIRONMENT STREQUAL "cuda")
     add_subdirectory(cuda)
 endif()
+if(RUNTIME_ENVIRONMENT STREQUAL "rocm")
+    add_subdirectory(rocm)
+endif()
 if(RUNTIME_ENVIRONMENT STREQUAL "simu")
     add_subdirectory(simu)
 endif()

@@ -34,6 +34,7 @@ namespace UC::Trans {
 inline __device__ void CudaCopyUnit(const uint8_t* __restrict__ src,
                                     volatile uint8_t* __restrict__ dst)
 {
+#if defined(__CUDA_ARCH__)
     uint4 lo, hi;
     asm volatile("ld.global.cs.v4.b32 {%0,%1,%2,%3}, [%4];"
                  : "=r"(lo.x), "=r"(lo.y), "=r"(lo.z), "=r"(lo.w)
@@ -47,6 +48,20 @@ inline __device__ void CudaCopyUnit(const uint8_t* __restrict__ src,
     asm volatile("st.volatile.global.v4.b32 [%0+16], {%1,%2,%3,%4};"
                  :
                  : "l"(dst), "r"(hi.x), "r"(hi.y), "r"(hi.z), "r"(hi.w));
+#else
+    // ROCm has no ld.global.cs/st.volatile.global PTX or __ldcs/__stcg builtins;
+    // this is a plain vectorized 32-byte copy. Dropping `volatile` is correct on
+    // AMD: host visibility comes from the fine-grained-coherent host registration
+    // plus the per-transfer hipStreamSynchronize (GPU caches flush at kernel
+    // completion), not from the qualifier. On AMD `volatile` only forces an L1
+    // bypass (glc, GPU-L2 scope), which is neither necessary nor sufficient for
+    // host visibility; system-scope ordering, if ever needed, is
+    // __threadfence_system.
+    const uint4* src4 = reinterpret_cast<const uint4*>(src);
+    uint4* dst4 = reinterpret_cast<uint4*>(const_cast<uint8_t*>(dst));
+    dst4[0] = src4[0];
+    dst4[1] = src4[1];
+#endif
 }
 
 __global__ void CudaCopyKernel(const void** src, void** dst, size_t size, size_t num)

@@ -0,0 +1,37 @@
+enable_language(HIP)
+# enable_language(HIP) auto-detects the host GPU arch (via rocm_agent_enumerator)
+# when CMAKE_HIP_ARCHITECTURES is unset; it may list one entry per agent, so
+# dedup it. Fall back to gfx90a only when nothing is detected (e.g. a CPU-only
+# build host). Pass -DCMAKE_HIP_ARCHITECTURES to override.
+if(CMAKE_HIP_ARCHITECTURES)
+    list(REMOVE_DUPLICATES CMAKE_HIP_ARCHITECTURES)
+else()
+    set(CMAKE_HIP_ARCHITECTURES "gfx90a")
+    message(STATUS "No AMD GPU detected; defaulting CMAKE_HIP_ARCHITECTURES to gfx90a")
+endif()
+find_package(hip REQUIRED)
+
+set(HIP_COMPAT_DIR ${CMAKE_CURRENT_LIST_DIR}/../../vendor/hip_compat)
+
+# The shared CUDA kernel source is built as HIP into the `kernel` object library.
+set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_kernel.cu
+    PROPERTIES LANGUAGE HIP)
+add_library(kernel OBJECT ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_kernel.cu)
+target_include_directories(kernel PRIVATE ${HIP_COMPAT_DIR} ${CMAKE_CURRENT_LIST_DIR}/../cuda)
+target_compile_options(kernel PRIVATE -Wall)
+# Request PIC through the target property so CMake adds -fPIC only where the
+# toolchain defines a PIC flag, instead of branching on WIN32 by hand.
+set_target_properties(kernel PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+add_library(trans STATIC
+    ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_device.cc
+    ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_buffer.cc
+    ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_stream.cc
+    ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_stream.cc
+)
+target_include_directories(trans PUBLIC ${HIP_COMPAT_DIR} ${CMAKE_CURRENT_LIST_DIR}/../cuda)
+target_link_libraries(trans PUBLIC
+    fmt
+    hip::host
+    kernel
+)
@@ -0,0 +1,40 @@
+/**
+ * MIT License
+ *
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * */
+
+/*
+ * ROCm/HIP compatibility shim for the CUDA driver-API header. The sparse
+ * Hamming-distance extension includes <cuda.h> only to pull in the runtime
+ * declarations it shares with <cuda_runtime.h>; it uses no driver-API entry
+ * points. On a ROCm build we map it onto the runtime shim so the include
+ * resolves without the (absent) NVIDIA driver header.
+ */
+#ifndef UNIFIEDCACHE_HIP_COMPAT_CUDA_H
+#define UNIFIEDCACHE_HIP_COMPAT_CUDA_H
+
+#include "cuda_runtime.h"
+
+#endif
@@ -0,0 +1,73 @@
+/**
+ * MIT License
+ *
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
+ * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Author: Jeff Daily <jeff.daily@amd.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * */
+
+/*
+ * ROCm/HIP compatibility shim. On a ROCm build the per-backend CMake puts this
+ * directory ahead of the toolchain includes, so every existing
+ * `#include <cuda_runtime.h>` resolves here instead of the (absent) NVIDIA
+ * header. We pull in the HIP runtime and alias the small set of cuda* runtime
+ * symbols the KV-transfer backend uses to their hip* equivalents, so the
+ * device-backend sources compile unchanged. The NVIDIA path never sees this
+ * file (its include dir points at the real CUDA toolkit).
+ */
+#ifndef UNIFIEDCACHE_HIP_COMPAT_CUDA_RUNTIME_H
+#define UNIFIEDCACHE_HIP_COMPAT_CUDA_RUNTIME_H
+
+#include <cstdlib>
+#include <cstring>
+
+#include <hip/hip_runtime.h>
+
+using cudaError_t = hipError_t;  // type names: each CUDA spelling is an alias of the HIP type
+using cudaStream_t = hipStream_t;
+using cudaEvent_t = hipEvent_t;
+
+static constexpr hipError_t cudaSuccess = hipSuccess;  // constants: HIP values re-exported under their CUDA names
+static constexpr hipMemcpyKind cudaMemcpyHostToDevice = hipMemcpyHostToDevice;
+static constexpr hipMemcpyKind cudaMemcpyDeviceToHost = hipMemcpyDeviceToHost;
+static constexpr unsigned int cudaStreamNonBlocking = hipStreamNonBlocking;
+static constexpr unsigned int cudaHostRegisterDefault = hipHostRegisterDefault;
+
+#define cudaMalloc hipMalloc  // functions: object-like macros rename the identifier; call arguments pass through as written
+#define cudaFree hipFree
+#define cudaMallocHost hipHostMalloc  // note: the HIP name differs in word order from the CUDA one
+#define cudaFreeHost hipHostFree
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamAddCallback hipStreamAddCallback
+#define cudaStreamWaitEvent hipStreamWaitEvent
+#define cudaGetLastError hipGetLastError
+#define cudaGetErrorString hipGetErrorString
+#define cudaHostRegister hipHostRegister
+#define cudaHostUnregister hipHostUnregister
+#define cudaHostGetDevicePointer hipHostGetDevicePointer
+
+#endif
@@ -52,4 +52,8 @@ if(RUNTIME_ENVIRONMENT STREQUAL "cuda")
     message(STATUS "Building GSAOnDevice for CUDA...")
     add_subdirectory(csrc/cuda/hash_retrieval)
     add_subdirectory(csrc/cuda/ham_dist)
+elseif(RUNTIME_ENVIRONMENT STREQUAL "rocm")
+    message(STATUS "Building GSAOnDevice for ROCm...")
+    add_subdirectory(csrc/cuda/hash_retrieval)
+    add_subdirectory(csrc/rocm/ham_dist)
 endif()
@@ -1,7 +1,14 @@
 #pragma once
 
+#ifdef USE_ROCM
+// torch keeps the cuda spelling for its public symbols on ROCm; the hipified
+// context header provides c10::cuda::getCurrentCUDAStream backed by HIP, while
+// the cuda-spelled header pulls in NVIDIA-only cuda_runtime_api.h/cusparse.h.
+#include <ATen/hip/HIPContext.h>
+#else
 #include <ATen/cuda/CUDAContext.h>
 #include <cuda.h>
+#endif
 #include <cuda_runtime.h>
 #include <torch/script.h>
 

@@ -1,4 +1,6 @@
+#ifndef USE_ROCM
 #include <cuda.h>
+#endif
 #include <cuda_runtime.h>
 #include <torch/script.h>