Address reviewer feedback: add is_gpu_found(), export is_hip_found, detect DLPack GPU type lazily

tjtanaa · tjtanaa · commit f9c20349d575 · 2025-10-24T06:17:12.000Z
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
diff --git a/fastsafetensors/common.py b/fastsafetensors/common.py
@@ -13,10 +13,14 @@
 from .frameworks import FrameworkOpBase, TensorBase
 from .st_types import Device, DType
 
-# Add compatibility alias for is_cuda_found -> is_hip_found
-# This allows code written for CUDA to work transparently on both CUDA and ROCm
-if not hasattr(fstcpp, 'is_cuda_found'):
-    fstcpp.is_cuda_found = fstcpp.is_hip_found
+
+def is_gpu_found():
+    """Check if any GPU (CUDA or HIP) is available.
+
+    Returns True if either CUDA or ROCm/HIP GPUs are detected.
+    This allows code to work transparently across both platforms.
+    """
+    return fstcpp.is_cuda_found() or fstcpp.is_hip_found()
 
 
 def get_device_numa_node(device: Optional[int]) -> Optional[int]:
diff --git a/fastsafetensors/cpp/cuda_compat.h b/fastsafetensors/cpp/cuda_compat.h
@@ -1,21 +1,25 @@
+// SPDX-License-Identifier: Apache-2.0
 /*
  * Copyright 2024 IBM Inc. All rights reserved
- * SPDX-License-Identifier: Apache-2.0
  *
  * CUDA/HIP compatibility layer for fastsafetensors
  * Minimal compatibility header - only defines what hipify-perl doesn't handle
  */
 
-#pragma once
+#ifndef __CUDA_COMPAT_H__
+#define __CUDA_COMPAT_H__
 
 // Platform detection - this gets hipified to check __HIP_PLATFORM_AMD__
 #ifdef __HIP_PLATFORM_AMD__
   #ifndef USE_ROCM
     #define USE_ROCM
   #endif
-  #include <hip/hip_runtime.h>
+  // Note: We do NOT include <hip/hip_runtime.h> here to avoid compile-time dependencies.
+  // Instead, we dynamically load the ROCm runtime library (libamdhip64.so) at runtime
+  // using dlopen(), just like we do for CUDA (libcudart.so).
+  // Minimal types are defined in ext.hpp.
 #else
-  // For CUDA platform, or when CUDA headers aren't available, we define minimal types in ext.hpp
+  // For CUDA platform, we also avoid including headers and define minimal types in ext.hpp
 #endif
 
 // Runtime library name - hipify-perl doesn't change string literals
@@ -31,3 +35,5 @@
   #define cudaDeviceMalloc hipDeviceMalloc
   #define cudaDeviceFree hipDeviceFree
 #endif
+
+#endif // __CUDA_COMPAT_H__
diff --git a/fastsafetensors/cpp/ext.cpp b/fastsafetensors/cpp/ext.cpp
@@ -79,6 +79,7 @@ ext_funcs_t cpu_fns = ext_funcs_t {
 ext_funcs_t cuda_fns;
 
 static bool cuda_found = false;
+static bool is_hip_runtime = false;  // Track if we loaded HIP (not auto-hipified)
 static bool cufile_found = false;
 
 static int cufile_ver = 0;
@@ -123,8 +124,12 @@ static void load_nvidia_functions() {
                 count = 0; // why cudaGetDeviceCount returns non-zero for errors?
             }
             cuda_found = count > 0;
+            // Detect if we loaded HIP runtime (ROCm) vs CUDA runtime
+            if (cuda_found && std::string(cudartLib).find("hip") != std::string::npos) {
+                is_hip_runtime = true;
+            }
             if (init_log) {
-                fprintf(stderr, "[DEBUG] device count=%d, cuda_found=%d\n", count, cuda_found);
+                fprintf(stderr, "[DEBUG] device count=%d, cuda_found=%d, is_hip_runtime=%d\n", count, cuda_found, is_hip_runtime);
             }
         } else {
             cuda_found = false;
@@ -218,11 +223,28 @@ static void load_nvidia_functions() {
     }
 }
 
+// Note: is_cuda_found gets auto-hipified to is_hip_found on ROCm builds
+// So this function will be is_hip_found() after hipification on ROCm
 bool is_cuda_found()
 {
     return cuda_found;
 }
 
+// Separate function that always returns false on ROCm (CUDA not available on ROCm)
+// This will be used for the "is_cuda_found" Python export on ROCm builds
+bool cuda_not_available()
+{
+    return false;  // On ROCm, CUDA is never available
+}
+
+// Separate accessor for the is_hip_runtime flag (its name is not rewritten by hipify)
+// On CUDA builds: exported to Python as "is_hip_found"
+// On ROCm builds: not exported (is_cuda_found gets hipified to is_hip_found and is exported instead)
+bool check_hip_runtime()
+{
+    return is_hip_runtime;
+}
+
 bool is_cufile_found()
 {
     return cufile_found;
@@ -719,7 +741,21 @@ cpp_metrics_t get_cpp_metrics() {
 
 PYBIND11_MODULE(__MOD_NAME__, m)
 {
-    m.def("is_cuda_found", &is_cuda_found);
+    // Export both is_cuda_found and is_hip_found on all platforms
+    // Use string concatenation to prevent hipify from converting the export names
+#ifdef USE_ROCM
+    // On ROCm after hipify:
+    // - is_cuda_found() becomes is_hip_found(), so export it as "is_hip_found"
+    // - Export cuda_not_available() as "is_cuda_found" (CUDA not available on ROCm)
+    m.def(("is_" "cuda" "_found"), &cuda_not_available);  // Returns false on ROCm
+    m.def(("is_" "hip" "_found"), &is_cuda_found);  // hipified to is_hip_found, returns hip status
+#else
+    // On CUDA:
+    // - is_cuda_found() checks for CUDA
+    // - check_hip_runtime() checks if HIP runtime was loaded
+    m.def(("is_" "cuda" "_found"), &is_cuda_found);
+    m.def(("is_" "hip" "_found"), &check_hip_runtime);
+#endif
     m.def("is_cufile_found", &is_cufile_found);
     m.def("cufile_version", &cufile_version);
     m.def("set_debug_log", &set_debug_log);
diff --git a/fastsafetensors/cpp/ext.hpp b/fastsafetensors/cpp/ext.hpp
@@ -36,11 +36,10 @@ typedef struct CUfileDescr_t {
 } CUfileDescr_t;
 typedef struct CUfileError { CUfileOpError err; } CUfileError_t;
 
-// Only define minimal CUDA types if not using ROCm (where real headers are included)
-#ifndef USE_ROCM
+// Define minimal CUDA/HIP types for both platforms to avoid compile-time dependencies
+// We load all GPU functions dynamically at runtime via dlopen()
 typedef enum cudaError { cudaSuccess = 0, cudaErrorMemoryAllocation = 2 } cudaError_t;
 enum cudaMemcpyKind { cudaMemcpyHostToDevice=2, cudaMemcpyDefault = 4 };
-#endif
 
 
 typedef enum CUfileFeatureFlags {
diff --git a/fastsafetensors/dlpack.py b/fastsafetensors/dlpack.py
@@ -12,26 +12,43 @@
 _c_str_dltensor = b"dltensor"
 
 
-# Detect GPU type at module load time
+# Lazy GPU type detection - avoid calling framework-specific code at module load time
+_GPU_DEVICE_TYPE = None  # Will be detected lazily
+
+
 def _detect_gpu_type():
-    """Detect if we're running on ROCm or CUDA"""
-    try:
-        import torch
-        if torch.cuda.is_available():
-            # Check if this is ROCm build
-            if hasattr(torch.version, 'hip') and torch.version.hip is not None:
-                return 10  # kDLROCM
-    except:
-        pass
+    """Detect if we're running on ROCm or CUDA.
+
+    This detection is now done lazily to avoid framework-specific calls at module load time.
+    Uses the C++ extension's is_hip_found() to determine the platform.
+    """
+    # Import here to avoid circular dependency
+    from . import cpp as fstcpp
+
+    # Check if we loaded HIP runtime (ROCm)
+    if fstcpp.is_hip_found():
+        return 10  # kDLROCM
     return 2  # kDLCUDA
 
 
-_GPU_DEVICE_TYPE = _detect_gpu_type()
+def _get_gpu_device_type():
+    """Get the GPU device type, detecting it lazily if needed."""
+    global _GPU_DEVICE_TYPE
+    if _GPU_DEVICE_TYPE is None:
+        _GPU_DEVICE_TYPE = _detect_gpu_type()
+    return _GPU_DEVICE_TYPE
 
 
 class DLDevice(ctypes.Structure):
     def __init__(self, dev: Device):
-        self.device_type = self.DeviceToDL[dev.type]
+        # Use lazy detection to get the GPU device type
+        gpu_type = _get_gpu_device_type()
+        device_to_dl = {
+            DeviceType.CPU: self.kDLCPU,
+            DeviceType.CUDA: gpu_type,
+            DeviceType.GPU: gpu_type,
+        }
+        self.device_type = device_to_dl[dev.type]
         self.device_id = dev.index if dev.index is not None else 0
 
     kDLCPU = 1
@@ -42,12 +59,6 @@ def __init__(self, dev: Device):
         ("device_id", ctypes.c_int),
     ]
 
-    DeviceToDL = {
-        DeviceType.CPU: kDLCPU,
-        DeviceType.CUDA: _GPU_DEVICE_TYPE,
-        DeviceType.GPU: _GPU_DEVICE_TYPE,
-    }
-
 
 class c_DLDataType(ctypes.Structure):
     def __init__(self, dtype: DType):
diff --git a/fastsafetensors/frameworks/_torch.py b/fastsafetensors/frameworks/_torch.py
@@ -186,9 +186,18 @@ def copy_tensor(self, dst: TorchTensor, src: TorchTensor):
         dst.real_tensor.copy_(src.real_tensor)
 
     def get_cuda_ver(self) -> str:
+        """Get GPU runtime version with platform indicator.
+
+        Returns a string like 'hip-5.7.0' for ROCm or 'cuda-12.1' for CUDA,
+        or 'none' if no GPU is available. This allows code to distinguish
+        between different GPU platforms without using torch directly.
+        """
         if torch.cuda.is_available():
-            return str(torch.version.cuda)
-        return "0.0"
+            # Check if this is ROCm/HIP build
+            if hasattr(torch.version, "hip") and torch.version.hip is not None:
+                return f"hip-{torch.version.hip}"
+            return f"cuda-{torch.version.cuda}"
+        return "none"
 
     def get_device_ptr_align(self) -> int:
         CUDA_PTR_ALIGN: int = 16
diff --git a/fastsafetensors/loader.py b/fastsafetensors/loader.py
@@ -6,7 +6,7 @@
 from typing import Any, Dict, List, Optional, OrderedDict, Tuple, Union
 
 from . import cpp as fstcpp
-from .common import SafeTensorsMetadata, TensorFrame, get_device_numa_node
+from .common import SafeTensorsMetadata, TensorFrame, get_device_numa_node, is_gpu_found
 from .file_buffer import FilesBufferOnDevice
 from .frameworks import TensorBase, get_framework_op
 from .st_types import DeviceType, DType
@@ -69,8 +69,10 @@ def __init__(
             gl_set_numa = True
         fstcpp.set_debug_log(debug_log)
         device_is_not_cpu = self.device.type != DeviceType.CPU
-        if device_is_not_cpu and not fstcpp.is_cuda_found():
-            raise Exception("[FAIL] libcudart.so does not exist")
+        if device_is_not_cpu and not is_gpu_found():
+            raise Exception(
+                "[FAIL] GPU runtime library (libcudart.so or libamdhip64.so) does not exist"
+            )
         if not fstcpp.is_cufile_found() and not nogds:
             warnings.warn(
                 "libcufile.so does not exist but nogds is False. use nogds=True",
diff --git a/setup.py b/setup.py
@@ -27,6 +27,7 @@ def detect_platform():
         for path in ["/opt/rocm", "/opt/rocm-*"]:
             if "*" in path:
                 import glob
+
                 matches = sorted(glob.glob(path), reverse=True)
                 if matches:
                     rocm_path = matches[0]
@@ -45,14 +46,14 @@ def detect_platform():
                 rocm_version = f.read().strip()
         else:
             # Try to extract version from path
-            match = re.search(r'rocm[-/](\d+\.\d+(?:\.\d+)?)', rocm_path)
+            match = re.search(r"rocm[-/](\d+\.\d+(?:\.\d+)?)", rocm_path)
             if match:
                 rocm_version = match.group(1)
 
         print(f"Detected ROCm platform at {rocm_path}")
         if rocm_version:
             print(f"ROCm version: {rocm_version}")
-        return ('rocm', rocm_version, rocm_path)
+        return ("rocm", rocm_version, rocm_path)
 
     # Check for CUDA
     cuda_home = os.environ.get("CUDA_HOME") or os.environ.get("CUDA_PATH")
@@ -64,11 +65,11 @@ def detect_platform():
 
     if cuda_home and os.path.exists(cuda_home):
         print(f"Detected CUDA platform at {cuda_home}")
-        return ('cuda', None, None)
+        return ("cuda", None, None)
 
     # Default to CUDA if nothing detected
     print("No GPU platform detected, defaulting to CUDA")
-    return ('cuda', None, None)
+    return ("cuda", None, None)
 
 
 def hipify_source_files(rocm_path):
@@ -110,7 +111,7 @@ def hipify_source_files(rocm_path):
 
     hipified_files = []
     for source_path, result in hipify_result.items():
-        if hasattr(result, 'hipified_path') and result.hipified_path:
+        if hasattr(result, "hipified_path") and result.hipified_path:
             print(f"Successfully hipified: {source_path} -> {result.hipified_path}")
             hipified_files.append(result.hipified_path)
 
@@ -126,8 +127,9 @@ def hipify_source_files(rocm_path):
     return hipified_files
 
 
-
-def MyExtension(name, sources, mod_name, platform_type, rocm_path=None, *args, **kwargs):
+def MyExtension(
+    name, sources, mod_name, platform_type, rocm_path=None, *args, **kwargs
+):
     import pybind11
 
     pybind11_path = os.path.dirname(pybind11.__file__)
@@ -143,7 +145,7 @@ def MyExtension(name, sources, mod_name, platform_type, rocm_path=None, *args, *
     kwargs["extra_compile_args"] = ["-fvisibility=hidden", "-std=c++17"]
 
     # Platform-specific configuration
-    if platform_type == 'rocm' and rocm_path:
+    if platform_type == "rocm" and rocm_path:
         # ROCm/HIP configuration
         kwargs["define_macros"].append(("__HIP_PLATFORM_AMD__", "1"))
         kwargs["libraries"].append("amdhip64")
@@ -168,7 +170,7 @@ def run(self):
         self.rocm_path = rocm_path
 
         #  Configure build based on platform
-        if platform_type == 'rocm' and rocm_path:
+        if platform_type == "rocm" and rocm_path:
             print("=" * 60)
             print("Building for AMD ROCm platform")
             if rocm_version:
@@ -182,9 +184,14 @@ def run(self):
             for ext in self.extensions:
                 new_sources = []
                 for src in ext.sources:
-                    if 'fastsafetensors/cpp/ext.cpp' in src:
+                    if "fastsafetensors/cpp/ext.cpp" in src:
                         # torch.utils.hipify creates files in hip/ subdirectory
-                        new_sources.append(src.replace('fastsafetensors/cpp/ext.cpp', 'fastsafetensors/cpp/hip/ext.cpp'))
+                        new_sources.append(
+                            src.replace(
+                                "fastsafetensors/cpp/ext.cpp",
+                                "fastsafetensors/cpp/hip/ext.cpp",
+                            )
+                        )
                     else:
                         new_sources.append(src)
                 ext.sources = new_sources
@@ -234,6 +241,6 @@ def run(self):
         )
     ],
     cmdclass={
-        'build_ext': CustomBuildExt,
+        "build_ext": CustomBuildExt,
     },
 )
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -6,6 +6,7 @@
 
 from fastsafetensors import SingleGroup
 from fastsafetensors import cpp as fstcpp
+from fastsafetensors.common import is_gpu_found
 from fastsafetensors.cpp import load_nvidia_functions
 from fastsafetensors.frameworks import FrameworkOpBase, get_framework_op
 from fastsafetensors.st_types import Device
@@ -14,6 +15,7 @@
 TESTS_DIR = os.path.dirname(__file__)
 sys.path.insert(0, TESTS_DIR)
 from platform_utils import get_platform_info, is_rocm_platform
+
 REPO_ROOT = os.path.dirname(os.path.dirname(TESTS_DIR))
 DATA_DIR = os.path.join(REPO_ROOT, ".testdata")
 TF_DIR = os.path.join(DATA_DIR, "transformers_cache")
@@ -81,7 +83,7 @@ def pg():
 
 @pytest.fixture(scope="session", autouse=True)
 def dev_init() -> None:
-    if fstcpp.is_cuda_found():
+    if is_gpu_found():
         dev_str = "cuda:0" if FRAMEWORK.get_name() == "pytorch" else "gpu:0"
     else:
         dev_str = "cpu"
diff --git a/tests/platform_utils.py b/tests/platform_utils.py
diff --git a/tests/test_fastsafetensors.py b/tests/test_fastsafetensors.py
diff --git a/tests/test_multi.py b/tests/test_multi.py