-
Notifications
You must be signed in to change notification settings - Fork 86
Add a HIP/ROCm device backend for KV-transfer and Hamming kernels #1021
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
ab1b5a5
53f8b25
b697824
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -34,6 +34,7 @@ namespace UC::Trans { | |
| inline __device__ void CudaCopyUnit(const uint8_t* __restrict__ src, | ||
| volatile uint8_t* __restrict__ dst) | ||
| { | ||
| #if defined(__CUDA_ARCH__) | ||
| uint4 lo, hi; | ||
| asm volatile("ld.global.cs.v4.b32 {%0,%1,%2,%3}, [%4];" | ||
| : "=r"(lo.x), "=r"(lo.y), "=r"(lo.z), "=r"(lo.w) | ||
|
|
@@ -47,6 +48,20 @@ inline __device__ void CudaCopyUnit(const uint8_t* __restrict__ src, | |
| asm volatile("st.volatile.global.v4.b32 [%0+16], {%1,%2,%3,%4};" | ||
| : | ||
| : "l"(dst), "r"(hi.x), "r"(hi.y), "r"(hi.z), "r"(hi.w)); | ||
| #else | ||
| // ROCm has no ld.global.cs/st.volatile.global PTX or __ldcs/__stcg builtins; | ||
| // this is a plain vectorized 32-byte copy. Dropping `volatile` is correct on | ||
| // AMD: host visibility comes from the fine-grained-coherent host registration | ||
| // plus the per-transfer hipStreamSynchronize (GPU caches flush at kernel | ||
| // completion), not from the qualifier. On AMD `volatile` only forces an L1 | ||
| // bypass (glc, GPU-L2 scope), which is neither necessary nor sufficient for | ||
| // host visibility; system-scope ordering, if ever needed, is | ||
| // __threadfence_system. | ||
| const uint4* src4 = reinterpret_cast<const uint4*>(src); | ||
| uint4* dst4 = reinterpret_cast<uint4*>(const_cast<uint8_t*>(dst)); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I kept the plain copy and documented why in 53f8b25 rather than reintroducing |
||
| dst4[0] = src4[0]; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Suggestion: The
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The 16-byte alignment requirement is unchanged from the original CUDA path -- its |
||
| dst4[1] = src4[1]; | ||
| #endif | ||
| } | ||
|
|
||
| __global__ void CudaCopyKernel(const void** src, void** dst, size_t size, size_t num) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| enable_language(HIP) | ||
| # enable_language(HIP) auto-detects the host GPU arch (via rocm_agent_enumerator) | ||
| # when CMAKE_HIP_ARCHITECTURES is unset; it may list one entry per agent, so | ||
| # dedup it. Fall back to gfx90a only when nothing is detected (e.g. a CPU-only | ||
| # build host). Pass -DCMAKE_HIP_ARCHITECTURES to override. | ||
| if(CMAKE_HIP_ARCHITECTURES) | ||
| list(REMOVE_DUPLICATES CMAKE_HIP_ARCHITECTURES) | ||
| else() | ||
| set(CMAKE_HIP_ARCHITECTURES "gfx90a") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Suggestion: The architecture defaults to gfx90a (MI250X/MI210) which is appropriate for datacenter AMD GPUs, but may cause confusion for users with consumer Radeon GPUs (gfx1100, gfx1201, etc.). Consider adding a comment or CMake warning message when defaulting, suggesting users set
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed in b697824. These arms pinned gfx90a before |
||
| message(STATUS "No AMD GPU detected; defaulting CMAKE_HIP_ARCHITECTURES to gfx90a") | ||
| endif() | ||
| find_package(hip REQUIRED) | ||
|
|
||
| set(HIP_COMPAT_DIR ${CMAKE_CURRENT_LIST_DIR}/../../vendor/hip_compat) | ||
|
|
||
| set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_kernel.cu | ||
| PROPERTIES LANGUAGE HIP) | ||
| add_library(kernel OBJECT ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_kernel.cu) | ||
| target_include_directories(kernel PRIVATE ${HIP_COMPAT_DIR} ${CMAKE_CURRENT_LIST_DIR}/../cuda) | ||
| if(WIN32) | ||
| target_compile_options(kernel PRIVATE -Wall) | ||
| else() | ||
| target_compile_options(kernel PRIVATE -Wall -fPIC) | ||
| endif() | ||
|
|
||
| add_library(trans STATIC | ||
| ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_device.cc | ||
| ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_buffer.cc | ||
| ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_stream.cc | ||
| ${CMAKE_CURRENT_LIST_DIR}/../cuda/cuda_sm_stream.cc | ||
| ) | ||
| target_include_directories(trans PUBLIC ${HIP_COMPAT_DIR} ${CMAKE_CURRENT_LIST_DIR}/../cuda) | ||
| target_link_libraries(trans PUBLIC | ||
| fmt | ||
| hip::host | ||
| kernel | ||
| ) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,40 @@ | ||
| /** | ||
| * MIT License | ||
| * | ||
| * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. | ||
| * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. | ||
| * | ||
| * Author: Jeff Daily <jeff.daily@amd.com> | ||
| * | ||
| * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| * of this software and associated documentation files (the "Software"), to deal | ||
| * in the Software without restriction, including without limitation the rights | ||
| * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| * copies of the Software, and to permit persons to whom the Software is | ||
| * furnished to do so, subject to the following conditions: | ||
| * | ||
| * The above copyright notice and this permission notice shall be included in all | ||
| * copies or substantial portions of the Software. | ||
| * | ||
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| * SOFTWARE. | ||
| * */ | ||
|
|
||
| /* | ||
| * ROCm/HIP compatibility shim for the CUDA driver-API header. The sparse | ||
| * Hamming-distance extension includes <cuda.h> only to pull in the runtime | ||
| * declarations it shares with <cuda_runtime.h>; it uses no driver-API entry | ||
| * points. On a ROCm build we map it onto the runtime shim so the include | ||
| * resolves without the (absent) NVIDIA driver header. | ||
| */ | ||
| #ifndef UNIFIEDCACHE_HIP_COMPAT_CUDA_H | ||
| #define UNIFIEDCACHE_HIP_COMPAT_CUDA_H | ||
|
|
||
| #include "cuda_runtime.h" | ||
|
|
||
| #endif |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,73 @@ | ||
| /** | ||
| * MIT License | ||
| * | ||
| * Copyright (c) 2025 Huawei Technologies Co., Ltd. All rights reserved. | ||
| * Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved. | ||
| * | ||
| * Author: Jeff Daily <jeff.daily@amd.com> | ||
| * | ||
| * Permission is hereby granted, free of charge, to any person obtaining a copy | ||
| * of this software and associated documentation files (the "Software"), to deal | ||
| * in the Software without restriction, including without limitation the rights | ||
| * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
| * copies of the Software, and to permit persons to whom the Software is | ||
| * furnished to do so, subject to the following conditions: | ||
| * | ||
| * The above copyright notice and this permission notice shall be included in all | ||
| * copies or substantial portions of the Software. | ||
| * | ||
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
| * SOFTWARE. | ||
| * */ | ||
|
|
||
| /* | ||
| * ROCm/HIP compatibility shim. On a ROCm build the per-backend CMake puts this | ||
| * directory ahead of the toolchain includes, so every existing | ||
| * `#include <cuda_runtime.h>` resolves here instead of the (absent) NVIDIA | ||
| * header. We pull in the HIP runtime and alias the small set of cuda* runtime | ||
| * symbols the KV-transfer backend uses to their hip* equivalents, so the | ||
| * device-backend sources compile unchanged. The NVIDIA path never sees this | ||
| * file (its include dir points at the real CUDA toolkit). | ||
| */ | ||
| #ifndef UNIFIEDCACHE_HIP_COMPAT_CUDA_RUNTIME_H | ||
| #define UNIFIEDCACHE_HIP_COMPAT_CUDA_RUNTIME_H | ||
|
|
||
| #include <cstdlib> | ||
| #include <cstring> | ||
|
|
||
| #include <hip/hip_runtime.h> | ||
|
|
||
| using cudaError_t = hipError_t; | ||
| using cudaStream_t = hipStream_t; | ||
| using cudaEvent_t = hipEvent_t; | ||
|
|
||
| static constexpr hipError_t cudaSuccess = hipSuccess; | ||
| static constexpr hipMemcpyKind cudaMemcpyHostToDevice = hipMemcpyHostToDevice; | ||
| static constexpr hipMemcpyKind cudaMemcpyDeviceToHost = hipMemcpyDeviceToHost; | ||
| static constexpr unsigned int cudaStreamNonBlocking = hipStreamNonBlocking; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Suggestion: Using
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
| static constexpr unsigned int cudaHostRegisterDefault = hipHostRegisterDefault; | ||
|
|
||
| #define cudaMalloc hipMalloc | ||
| #define cudaFree hipFree | ||
| #define cudaMallocHost hipHostMalloc | ||
| #define cudaFreeHost hipHostFree | ||
| #define cudaMemcpy hipMemcpy | ||
| #define cudaMemcpyAsync hipMemcpyAsync | ||
| #define cudaSetDevice hipSetDevice | ||
| #define cudaStreamCreate hipStreamCreate | ||
| #define cudaStreamCreateWithFlags hipStreamCreateWithFlags | ||
| #define cudaStreamSynchronize hipStreamSynchronize | ||
| #define cudaStreamAddCallback hipStreamAddCallback | ||
| #define cudaStreamWaitEvent hipStreamWaitEvent | ||
| #define cudaGetLastError hipGetLastError | ||
| #define cudaGetErrorString hipGetErrorString | ||
| #define cudaHostRegister hipHostRegister | ||
| #define cudaHostUnregister hipHostUnregister | ||
| #define cudaHostGetDevicePointer hipHostGetDevicePointer | ||
|
|
||
| #endif | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,14 @@ | ||
| #pragma once | ||
|
|
||
| #ifdef USE_ROCM | ||
| // torch keeps the cuda spelling for its public symbols on ROCm; the hipified | ||
| // context header provides c10::cuda::getCurrentCUDAStream backed by HIP, while | ||
| // the cuda-spelled header pulls in NVIDIA-only cuda_runtime_api.h/cusparse.h. | ||
| #include <ATen/hip/HIPContext.h> | ||
| #else | ||
| #include <ATen/cuda/CUDAContext.h> | ||
| #include <cuda.h> | ||
| #endif | ||
| #include <cuda_runtime.h> | ||
| #include <torch/script.h> | ||
|
|
||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Suggestion: The
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The rocm CMake arm puts the hip_compat shim dir on the include path (that's where |
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,6 @@ | ||
| #ifndef USE_ROCM | ||
| #include <cuda.h> | ||
| #endif | ||
| #include <cuda_runtime.h> | ||
| #include <torch/script.h> | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
setup.py adds PLATFORM=rocm support, but the warning message still lists only cuda/ascend/ascend-a3/musa/maca. Please include rocm there as well.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done in 53f8b25 -- added
rocmto the PLATFORM list in that warning.