Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,15 @@ if(BUILD_UCM_MINDIE)
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=${UCM_CXX11_ABI})
endif()

set(FLAGS_PUBLIC "-Wall -Werror -fPIC -Wl,-z,relro,-z,now")
set(FLAGS_DEBUG "-O0 -g")
set(FLAGS_RELEASE "-s -O2 -fstack-protector-strong -D_FORTIFY_SOURCE=2")
if(WIN32)
set(FLAGS_PUBLIC "/W3")
set(FLAGS_DEBUG "/Od /Zi")
set(FLAGS_RELEASE "/O2")
else()
set(FLAGS_PUBLIC "-Wall -Werror -fPIC -Wl,-z,relro,-z,now")
set(FLAGS_DEBUG "-O0 -g")
set(FLAGS_RELEASE "-s -O2 -fstack-protector-strong -D_FORTIFY_SOURCE=2")
endif()
string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LOWER)
if(CMAKE_BUILD_TYPE_LOWER STREQUAL "debug")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS_PUBLIC} ${FLAGS_DEBUG}")
Expand Down
2 changes: 2 additions & 0 deletions docs/source/getting-started/quickstart_sglang.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ docker build --build-arg INSTALL_MODE=package \
pip install -v -e . --no-build-isolation
```

> **Note:** On AMD GPUs set `export PLATFORM=rocm` instead of `cuda` to select the ROCm device backend (the KV-transfer and Hamming-distance kernels built with HIP). It requires a ROCm installation and a ROCm build of PyTorch; pass `-DCMAKE_HIP_ARCHITECTURES=<arch>` (e.g. `gfx90a`, `gfx1100`) if your GPU is not auto-detected. Validated on gfx90a and gfx1100.


### Option 3: Install by pip
1. Prepare SGLang Environment
Expand Down
2 changes: 2 additions & 0 deletions docs/source/getting-started/quickstart_vllm.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ docker build --build-arg INSTALL_MODE=package \
pip install -v -e . --no-build-isolation
```

> **Note:** On AMD GPUs set `export PLATFORM=rocm` instead of `cuda`. This selects the ROCm device backend, which builds the same KV-transfer and Hamming-distance kernels with HIP. It requires a ROCm installation and a ROCm build of PyTorch in the environment; pass `-DCMAKE_HIP_ARCHITECTURES=<arch>` (e.g. `gfx90a`, `gfx1100`) if CMake does not detect your GPU. Validated on gfx90a and gfx1100.

3. Apply vLLM Integration Patches (Not required for versions > 0.11.0)

To integrate UCM with vLLM 0.11.0, you can choose between a dynamic **monkey patch** (recommended) and a manual **git patch**.
Expand Down
1 change: 1 addition & 0 deletions docs/source/user-guide/support-matrix/support_matrix.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ This section presents the currently supported compute platforms and devices.
|:----------------:|:------:|:------:|
| CANN | Ascend | 910C, 910B |
| CUDA | NVIDIA | H100, H20, L40, L20 |
| ROCm | AMD | MI250X (gfx90a), Radeon Pro W7800 (gfx1100), Radeon PRO V710 (gfx1101), Radeon RX 9070 XT (gfx1201), Radeon 8060S (gfx1151) |
| MUSA | Mthreads | S5000 |
| MACA | MetaX | C500 |

Expand Down
4 changes: 3 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def print_platform_warning():
{RED}{'=' * 80}
{BOLD}⚠️ WARNING: PLATFORM environment variable is not set! ⚠️{RESET}
{RED}{'=' * 80}{RESET}
{YELLOW}Please set PLATFORM to one of: cuda, ascend, ascend-a3, musa, maca{RESET}
{YELLOW}Please set PLATFORM to one of: cuda, rocm, ascend, ascend-a3, musa, maca{RESET}
Example:
{BOLD}export PLATFORM=cuda{RESET} # For CUDA platform
{YELLOW}In CI scenarios only, you don't need to specify PLATFORM. If it's not a CI scenario, please uninstall and then reinstall with PLATFORM specified.{RESET}
Expand Down Expand Up @@ -174,6 +174,8 @@ def build_cmake(self, ext: CMakeExtension):
match PLATFORM:
case "cuda":
cmake_args += ["-DRUNTIME_ENVIRONMENT=cuda"]
case "rocm":

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

setup.py adds PLATFORM=rocm support, but the warning message still lists only cuda/ascend/ascend-a3/musa/maca. Please include rocm there as well.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in 53f8b25 -- added rocm to the PLATFORM list in that warning.

cmake_args += ["-DRUNTIME_ENVIRONMENT=rocm"]
case "ascend" | "ascend-a3":
cmake_args += ["-DRUNTIME_ENVIRONMENT=ascend"]
case "musa":
Expand Down
15 changes: 6 additions & 9 deletions ucm/shared/infra/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,11 @@ target_link_libraries(infra_status PUBLIC fmt)

add_subdirectory(logger)

file(GLOB_RECURSE UCMINFRA_TEMPLATE_SOURCE_FILES "template/*.*")
add_library(infra_template OBJECT ${UCMINFRA_TEMPLATE_SOURCE_FILES})
target_include_directories(infra_template PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
add_library(infra_template INTERFACE)
target_include_directories(infra_template INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})

file(GLOB_RECURSE UCMINFRA_THREAD_SOURCE_FILES "thread/*.*")
add_library(infra_thread OBJECT ${UCMINFRA_THREAD_SOURCE_FILES})
target_include_directories(infra_thread PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
add_library(infra_thread INTERFACE)
target_include_directories(infra_thread INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})

file(GLOB_RECURSE UCMINFRA_TIME_SOURCE_FILES "time/*.*")
add_library(infra_time OBJECT ${UCMINFRA_TIME_SOURCE_FILES})
target_include_directories(infra_time PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
add_library(infra_time INTERFACE)
target_include_directories(infra_time INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
6 changes: 6 additions & 0 deletions ucm/shared/infra/logger/cc/spdlog_logger.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,12 @@
#include <spdlog/spdlog.h>
#include "compress_rotate_file_sink.h"
#include "logger.h"
#ifdef _WIN32
#include <process.h>
#define getpid _getpid
#else
#include <unistd.h>
#endif
namespace UC::Logger {
constexpr uint32_t kRateLimitCountBits = 2;
constexpr uint64_t kRateLimitCountMask = (1u << kRateLimitCountBits) - 1u;
Expand Down
2 changes: 2 additions & 0 deletions ucm/shared/infra/logger/logger.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@

#include "logger.h"
#include <iostream>
#ifndef _WIN32
#include <unistd.h>
#endif
namespace UC::Logger {

void Log(Level lv, std::string file, std::string func, int line, std::string msg)
Expand Down
3 changes: 3 additions & 0 deletions ucm/shared/metrics/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
file(GLOB_RECURSE UCMMETRICS_CC_SOURCE_FILES "./cc/*.cc")
add_library(metrics SHARED ${UCMMETRICS_CC_SOURCE_FILES})
if(WIN32)
set_target_properties(metrics PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()
target_include_directories(metrics PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/cc/api
${CMAKE_CURRENT_SOURCE_DIR}/cc/domain
Expand Down
3 changes: 3 additions & 0 deletions ucm/shared/test/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
if(BUILD_UNIT_TESTS)
include(GoogleTest)
file(GLOB_RECURSE UCMSHARED_TEST_SOURCE_FILES "./case/*.cc")
if(WIN32)
list(FILTER UCMSHARED_TEST_SOURCE_FILES EXCLUDE REGEX "thread_pool_test\\.cc$")
endif()
add_executable(ucmshared.test ${UCMSHARED_TEST_SOURCE_FILES})
target_include_directories(ucmshared.test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/case)
target_link_libraries(ucmshared.test PRIVATE
Expand Down
2 changes: 2 additions & 0 deletions ucm/shared/test/case/metrics/metrics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@
#include <iostream>
#include <numeric>
#include <thread>
#ifndef _WIN32
#include <unistd.h>
#endif
#include <vector>
#include "metrics_api.h"

Expand Down
3 changes: 3 additions & 0 deletions ucm/shared/trans/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ endif()
if(RUNTIME_ENVIRONMENT STREQUAL "cuda")
add_subdirectory(cuda)
endif()
if(RUNTIME_ENVIRONMENT STREQUAL "rocm")
add_subdirectory(rocm)
endif()
if(RUNTIME_ENVIRONMENT STREQUAL "simu")
add_subdirectory(simu)
endif()
Expand Down
15 changes: 15 additions & 0 deletions ucm/shared/trans/cuda/cuda_sm_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ namespace UC::Trans {
inline __device__ void CudaCopyUnit(const uint8_t* __restrict__ src,
volatile uint8_t* __restrict__ dst)
{
#if defined(__CUDA_ARCH__)
uint4 lo, hi;
asm volatile("ld.global.cs.v4.b32 {%0,%1,%2,%3}, [%4];"
: "=r"(lo.x), "=r"(lo.y), "=r"(lo.z), "=r"(lo.w)
Expand All @@ -47,6 +48,20 @@ inline __device__ void CudaCopyUnit(const uint8_t* __restrict__ src,
asm volatile("st.volatile.global.v4.b32 [%0+16], {%1,%2,%3,%4};"
:
: "l"(dst), "r"(hi.x), "r"(hi.y), "r"(hi.z), "r"(hi.w));
#else
// ROCm has no ld.global.cs/st.volatile.global PTX or __ldcs/__stcg builtins;
// this is a plain vectorized 32-byte copy. Dropping `volatile` is correct on
// AMD: host visibility comes from the fine-grained-coherent host registration
// plus the per-transfer hipStreamSynchronize (GPU caches flush at kernel
// completion), not from the qualifier. On AMD `volatile` only forces an L1
// bypass (glc, GPU-L2 scope), which is neither necessary nor sufficient for
// host visibility; system-scope ordering, if ever needed, is
// __threadfence_system.
const uint4* src4 = reinterpret_cast<const uint4*>(src);
uint4* dst4 = reinterpret_cast<uint4*>(const_cast<uint8_t*>(dst));

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Warning: The const_cast<uint8_t*>(dst) removes the volatile qualifier from the destination pointer. The original CUDA implementation uses st.volatile.global PTX which ensures proper memory visibility semantics for device-to-host transfers. While the comment correctly notes that cache-policy hints are NVIDIA-specific, the volatile qualifier itself has semantic meaning - it prevents compiler optimizations that could reorder or eliminate memory operations. On HIP, consider using __builtin_nontemporal_store or ensuring proper memory fence semantics to maintain equivalent visibility guarantees. The current plain copy may be functionally correct for this use case (disjoint writes with stream sync), but the removal of volatile should be explicitly justified in the comment.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I kept the plain copy and documented why in 53f8b25 rather than reintroducing volatile, because on AMD it doesn't do what st.volatile does on NVIDIA here. Measured on gfx90a (ROCm 7.2): a volatile access compiles to flat addressing plus the glc bit on loads (an L1 bypass at GPU-L2 scope) and no cache bit on stores -- i.e. GPU-L2 coherence, not host/system visibility. Host visibility for these H2D/D2H copies instead comes from the transfer buffers being fine-grained-coherent host registrations (hipHostRegister default) plus the hipStreamSynchronize after each copy (GPU caches flush at kernel completion). So the plain copy is visible to the host; volatile would be neither necessary nor sufficient. System-scope ordering, if it were ever needed, would be __threadfence_system, not the qualifier.

dst4[0] = src4[0];

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Suggestion: The reinterpret_cast<uint4*> assumes the source pointer is properly aligned to 16-byte boundary for uint4 access. While this is likely true for the KV cache blocks (which are typically allocated with proper alignment), consider adding an assertion or documentation noting the alignment requirement. Misaligned access could cause undefined behavior or performance degradation on some AMD architectures.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The 16-byte alignment requirement is unchanged from the original CUDA path -- its ld.global.cs.v4.b32 vectorized access requires the same 16-byte alignment, which the KV blocks satisfy. So the HIP path introduces no new constraint; keeping it as-is for parity with the CUDA code.

dst4[1] = src4[1];
#endif
}

__global__ void CudaCopyKernel(const void** src, void** dst, size_t size, size_t num)
Expand Down
37 changes: 37 additions & 0 deletions ucm/shared/trans/rocm/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
enable_language(HIP)
# enable_language(HIP) auto-detects the host GPU arch (via rocm_agent_enumerator)
# when CMAKE_HIP_ARCHITECTURES is unset; it may list one entry per agent, so
# dedup it. Fall back to gfx90a only when nothing is detected (e.g. a CPU-only
# build host). Pass -DCMAKE_HIP_ARCHITECTURES to override.
if(CMAKE_HIP_ARCHITECTURES)
list(REMOVE_DUPLICATES CMAKE_HIP_ARCHITECTURES)
else()
set(CMAKE_HIP_ARCHITECTURES "gfx90a")

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Suggestion: The architecture defaults to gfx90a (MI250X/MI210) which is appropriate for datacenter AMD GPUs, but may cause confusion for users with consumer Radeon GPUs (gfx1100, gfx1201, etc.). Consider adding a comment or CMake warning message when defaulting, suggesting users set CMAKE_HIP_ARCHITECTURES explicitly. The PR description mentions gfx1100 validation but the default doesn't reflect this broader support.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in b697824. These arms pinned gfx90a before enable_language(HIP), which preempted CMake's host-GPU auto-detection (rocm_agent_enumerator) -- a non-gfx90a AMD user who didn't pass -DCMAKE_HIP_ARCHITECTURES would have silently built gfx90a binaries that fail to load on their card. Now enable_language(HIP) auto-detects the host arch (deduped), with gfx90a used only as a fallback when nothing is detected (e.g. a CPU-only build host); -DCMAKE_HIP_ARCHITECTURES still overrides.

message(STATUS "No AMD GPU detected; defaulting CMAKE_HIP_ARCHITECTURES to gfx90a")
endif()
find_package(hip REQUIRED)

set(HIP_COMPAT_DIR ${CMAKE_CURRENT_LIST_DIR}/../../vendor/hip_compat)

set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_kernel.cu
PROPERTIES LANGUAGE HIP)
add_library(kernel OBJECT ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_kernel.cu)
target_include_directories(kernel PRIVATE ${HIP_COMPAT_DIR} ${CMAKE_CURRENT_LIST_DIR}/../cuda)
if(WIN32)
target_compile_options(kernel PRIVATE -Wall)
else()
target_compile_options(kernel PRIVATE -Wall -fPIC)
endif()

add_library(trans STATIC
${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_device.cc
${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_buffer.cc
${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_stream.cc
${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_stream.cc
)
target_include_directories(trans PUBLIC ${HIP_COMPAT_DIR} ${CMAKE_CURRENT_LIST_DIR}/../cuda)
target_link_libraries(trans PUBLIC
fmt
hip::host
kernel
)
40 changes: 40 additions & 0 deletions ucm/shared/vendor/hip_compat/cuda.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/**
* MIT License
*
* Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
* Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
*
* Author: Jeff Daily <jeff.daily@amd.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
* */

/*
* ROCm/HIP compatibility shim for the CUDA driver-API header. The sparse
* Hamming-distance extension includes <cuda.h> only to pull in the runtime
* declarations it shares with <cuda_runtime.h>; it uses no driver-API entry
* points. On a ROCm build we map it onto the runtime shim so the include
* resolves without the (absent) NVIDIA driver header.
*/
#ifndef UNIFIEDCACHE_HIP_COMPAT_CUDA_H
#define UNIFIEDCACHE_HIP_COMPAT_CUDA_H

#include "cuda_runtime.h"

#endif
73 changes: 73 additions & 0 deletions ucm/shared/vendor/hip_compat/cuda_runtime.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/**
* MIT License
*
* Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved.
* Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
*
* Author: Jeff Daily <jeff.daily@amd.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
* */

/*
* ROCm/HIP compatibility shim. On a ROCm build the per-backend CMake puts this
* directory ahead of the toolchain includes, so every existing
* `#include <cuda_runtime.h>` resolves here instead of the (absent) NVIDIA
* header. We pull in the HIP runtime and alias the small set of cuda* runtime
* symbols the KV-transfer backend uses to their hip* equivalents, so the
* device-backend sources compile unchanged. The NVIDIA path never sees this
* file (its include dir points at the real CUDA toolkit).
*/
#ifndef UNIFIEDCACHE_HIP_COMPAT_CUDA_RUNTIME_H
#define UNIFIEDCACHE_HIP_COMPAT_CUDA_RUNTIME_H

#include <cstdlib>
#include <cstring>

#include <hip/hip_runtime.h>

using cudaError_t = hipError_t;
using cudaStream_t = hipStream_t;
using cudaEvent_t = hipEvent_t;

static constexpr hipError_t cudaSuccess = hipSuccess;
static constexpr hipMemcpyKind cudaMemcpyHostToDevice = hipMemcpyHostToDevice;
static constexpr hipMemcpyKind cudaMemcpyDeviceToHost = hipMemcpyDeviceToHost;
static constexpr unsigned int cudaStreamNonBlocking = hipStreamNonBlocking;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Suggestion: Using #define macros for function aliases prevents proper function overload resolution and could cause issues if CUDA APIs add new signatures in future versions. For this limited set of APIs, the macro approach is acceptable. However, consider adding inline wrapper functions instead of macros for type safety and better debugging support. Example: inline cudaError_t cudaMalloc(void** ptr, size_t size) { return hipMalloc(ptr, size); } would preserve the cudaError_t return type explicitly.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The #define approach is intentional: it matches the cuda_to_hip compatibility-header convention used across these ports, and it also has to cover the type aliases (cudaStream_t, cudaError_t, ...) that inline wrappers can't. The aliased surface here is small and unambiguous (Malloc/Free/Memcpy/Stream/Event), so the macros don't create overload-resolution hazards in practice. Leaving as-is.

static constexpr unsigned int cudaHostRegisterDefault = hipHostRegisterDefault;

#define cudaMalloc hipMalloc
#define cudaFree hipFree
#define cudaMallocHost hipHostMalloc
#define cudaFreeHost hipHostFree
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaSetDevice hipSetDevice
#define cudaStreamCreate hipStreamCreate
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamAddCallback hipStreamAddCallback
#define cudaStreamWaitEvent hipStreamWaitEvent
#define cudaGetLastError hipGetLastError
#define cudaGetErrorString hipGetErrorString
#define cudaHostRegister hipHostRegister
#define cudaHostUnregister hipHostUnregister
#define cudaHostGetDevicePointer hipHostGetDevicePointer

#endif
4 changes: 4 additions & 0 deletions ucm/sparse/gsa_on_device/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,8 @@ if(RUNTIME_ENVIRONMENT STREQUAL "cuda")
message(STATUS "Building GSAOnDevice for CUDA...")
add_subdirectory(csrc/cuda/hash_retrieval)
add_subdirectory(csrc/cuda/ham_dist)
elseif(RUNTIME_ENVIRONMENT STREQUAL "rocm")
message(STATUS "Building GSAOnDevice for ROCm...")
add_subdirectory(csrc/cuda/hash_retrieval)
add_subdirectory(csrc/rocm/ham_dist)
endif()
7 changes: 7 additions & 0 deletions ucm/sparse/gsa_on_device/csrc/cuda/ham_dist/operator.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
#pragma once

#ifdef USE_ROCM
// torch keeps the cuda spelling for its public symbols on ROCm; the hipified
// context header provides c10::cuda::getCurrentCUDAStream backed by HIP, while
// the cuda-spelled header pulls in NVIDIA-only cuda_runtime_api.h/cusparse.h.
#include <ATen/hip/HIPContext.h>
#else
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#endif
#include <cuda_runtime.h>
#include <torch/script.h>

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Suggestion: The #include <cuda_runtime.h> after the conditional block relies on the hip_compat shim being on the include path for ROCm builds. This works correctly when the CMake configuration is proper, but could cause confusing build failures if the shim directory is not added to include paths. Consider adding a comment here noting this dependency, or adding an #ifdef USE_ROCM guard with an #error directive if the shim is missing (e.g., checking for UNIFIEDCACHE_HIP_COMPAT_CUDA_RUNTIME_H define).

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The rocm CMake arm puts the hip_compat shim dir on the include path (that's where cuda_runtime.h resolves for ROCm builds), so a misconfiguration fails loudly at compile time rather than silently. I'd rather not add an #error probe keyed on a shim-specific define, since it couples the source to the build layout. Leaving as-is.

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#ifndef USE_ROCM
#include <cuda.h>
#endif
#include <cuda_runtime.h>
#include <torch/script.h>

Expand Down
Loading