Skip to content

Commit 21cf516

Browse files
authored
Merge pull request #33 from ashvardanian/hopper-mma
Nvidia Hopper & Blackwell Support
2 parents ed1435e + d74d430 commit 21cf516

File tree

8 files changed

+1215
-217
lines changed

8 files changed

+1215
-217
lines changed

.gitignore

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,9 @@ debug/
33
build/
44
build_debug/
55
build_release/
6-
.DS_Store
6+
.DS_Store
7+
8+
# Temporary binaries
9+
less_slow_from_ptx.cubin
10+
less_slow_from_cu.cubin
11+
less_slow_from_cu.ptx

CMakeLists.txt

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,11 @@ if(NOT CMAKE_BUILD_TYPE)
3232
set(CMAKE_BUILD_TYPE Release)
3333
endif()
3434

35+
# Set a default parallel build level if the user hasn't specified one.
36+
if(NOT DEFINED CMAKE_BUILD_PARALLEL_LEVEL)
37+
set(CMAKE_BUILD_PARALLEL_LEVEL 16 CACHE STRING "Default parallel build level" FORCE)
38+
endif()
39+
3540
# ------------------------------------------------------------------------------
3641
# Detect CUDA Support
3742
# ------------------------------------------------------------------------------
@@ -160,19 +165,30 @@ if(USE_INTEL_TBB)
160165
endif()
161166
endif()
162167

163-
# Nvidia's CUDA Core Compute Libraries for GPU acceleration
168+
# Nvidia's CUDA Core Compute Libraries for GPU-accelerated algorithms
164169
if(USE_NVIDIA_CCCL)
165170
# CUB, Thrust, and other libraries of interest are now included into the
166-
# CUDA Toolkit, so we don't need this anymore:
167-
#
168-
# FetchContent_Declare(NvidiaCCCL GIT_REPOSITORY https://github.com/nvidia/cccl.git)
169-
# FetchContent_MakeAvailable(NvidiaCCCL)
171+
# CUDA Toolkit:
170172
find_package(CUDAToolkit REQUIRED)
171173
message(STATUS "CUDA Toolkit Version: ${CUDAToolkit_VERSION}")
172174
message(STATUS "CUDA Toolkit Include Path: ${CUDAToolkit_INCLUDE_DIRS}")
173175
message(STATUS "CUDA Toolkit Libraries Path: ${CUDAToolkit_LIBRARY_DIR}")
174176
endif()
175177

178+
# Nvidia's CUTLASS for GPU-accelerated linear algebra
179+
# set(CUTLASS_ENABLE_HEADERS_ONLY ON)
180+
# set(CUTLASS_ENABLE_LIBRARY OFF)
181+
# set(CUTLASS_ENABLE_EXAMPLES OFF)
182+
# set(CUTLASS_ENABLE_TESTS OFF)
183+
# set(CUTLASS_ENABLE_TOOLS OFF)
184+
# set(CUTLASS_NVCC_ARCHS "90a")
185+
# FetchContent_Declare(
186+
# NvidiaCUTLASS
187+
# GIT_REPOSITORY https://github.com/nvidia/cutlass.git
188+
# GIT_TAG v3.7.0
189+
# )
190+
# FetchContent_MakeAvailable(NvidiaCUTLASS)
191+
176192
# FMT for logging, as `std::format` has limited functionality
177193
FetchContent_Declare(
178194
VictorZverovichFMT
@@ -317,7 +333,6 @@ endif()
317333
# List of all possible compiler IDs:
318334
# https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html
319335
if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" OR CMAKE_CUDA_COMPILER_ID STREQUAL "NVHPC")
320-
set_property(SOURCE less_slow.cpp PROPERTY LANGUAGE CUDA)
321336
set_target_properties(less_slow PROPERTIES POSITION_INDEPENDENT_CODE ON)
322337
set_target_properties(less_slow PROPERTIES CUDA_ARCHITECTURES "70;75;80;89;90")
323338
target_compile_options(less_slow PRIVATE
@@ -413,16 +428,25 @@ endif()
413428
if(USE_NVIDIA_CCCL)
414429
# For CUB/Thrust, rely on CUDA Toolkit's bundled versions
415430
# These are automatically included when you include the CUDA Toolkit directories.
431+
target_sources(less_slow PRIVATE less_slow.cu)
416432
target_include_directories(less_slow PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
417433
target_link_libraries(less_slow PRIVATE CUDA::cudart CUDA::cublas CUDA::cuda_driver)
418-
target_sources(less_slow PRIVATE less_slow.cu)
419-
420-
# Copy the PTX Intermediate Representation file to the runtime directory
421-
set_source_files_properties(less_slow.ptx PROPERTIES LANGUAGE "")
422-
add_custom_command(
423-
TARGET less_slow POST_BUILD
424-
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/less_slow.ptx ${CMAKE_CURRENT_BINARY_DIR}/less_slow.ptx
425-
)
434+
# target_link_libraries(less_slow PRIVATE nvidia::cutlass::cutlass)
435+
436+
# List the PTX files you want to copy
437+
set(PTX_FILES less_slow_sm70.ptx less_slow_sm90a.ptx)
438+
439+
# Loop over each PTX file and add a custom command to copy it
440+
foreach(PTX ${PTX_FILES})
441+
# Make sure CMake doesn’t try to compile this file as source code
442+
set_source_files_properties(${PTX} PROPERTIES LANGUAGE "")
443+
add_custom_command(
444+
TARGET less_slow POST_BUILD
445+
COMMAND ${CMAKE_COMMAND} -E copy
446+
${CMAKE_CURRENT_SOURCE_DIR}/${PTX}
447+
${CMAKE_CURRENT_BINARY_DIR}/${PTX}
448+
)
449+
endforeach()
426450
endif()
427451

428452
if(OpenMP_FOUND)

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Some of the highlights include:
3333
- __Intel's oneAPI vs Nvidia's CCCL?__ What's so special about `<thrust>` and `<cub>`?
3434
- __CUDA C++, [PTX](https://en.wikipedia.org/wiki/Parallel_Thread_Execution) Intermediate Representations, and SASS__, and how do they differ from CPU code?
3535
- __How to choose between intrinsics, inline `asm`, and separate `.S` files__ for your performance-critical code?
36+
- __Tensor Cores & Memory__ differences on CPUs, and Volta, Ampere, Hopper, and Blackwell GPUs!
3637
- __What are Encrypted Enclaves__ and what's the latency of Intel SGX, AMD SEV, and ARM Realm? 🔜
3738

3839
To read, jump to the [`less_slow.cpp` source file](https://github.com/ashvardanian/less_slow.cpp/blob/main/less_slow.cpp) and read the code snippets and comments.
@@ -77,7 +78,8 @@ The build will pull and compile several third-party dependencies from the source
7778
- Lewis Baker's [cppcoro](https://github.com/lewissbaker/cppcoro) implements C++20 coroutines.
7879
- Jens Axboe's [liburing](https://github.com/axboe/liburing) to simplify Linux kernel-bypass.
7980
- Chris Kohlhoff's [ASIO](https://github.com/chriskohlhoff/asio) as a [networking TS](https://en.cppreference.com/w/cpp/experimental/networking) extension.
80-
- Nvidia's [CCCL](https://github.com/nvidia/cccl) for GPU-accelerated computations.
81+
- Nvidia's [CCCL](https://github.com/nvidia/cccl) for GPU-accelerated algorithms.
82+
- Nvidia's [CUTLASS](https://github.com/nvidia/cutlass) for GPU-accelerated Linear Algebra.
8183

8284
To control the output or run specific benchmarks, use the following flags:
8385

0 commit comments

Comments
 (0)