ashvardanian
diff --git a/‎.gitignore‎
Lines changed: 6 additions & 1 deletion b/‎.gitignore‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎CMakeLists.txt‎
Lines changed: 38 additions & 14 deletions b/‎CMakeLists.txt‎
Lines changed: 38 additions & 14 deletions
diff --git a/‎README.md‎
Lines changed: 3 additions & 1 deletion b/‎README.md‎
Lines changed: 3 additions & 1 deletion
@@ -3,4 +3,9 @@ debug/
 build/
 build_debug/
 build_release/
-.DS_Store
+.DS_Store
+
+# Temporary binaries
+less_slow_from_ptx.cubin
+less_slow_from_cu.cubin
+less_slow_from_cu.ptx
@@ -32,6 +32,11 @@ if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Release)
 endif()
 
+# Default to 16 parallel jobs; NOTE(review): `cmake --build` reads this from the environment - confirm the cache entry has an effect.
+if(NOT DEFINED CMAKE_BUILD_PARALLEL_LEVEL)
+  set(CMAKE_BUILD_PARALLEL_LEVEL 16 CACHE STRING "Default parallel build level")
+endif()
+
 # ------------------------------------------------------------------------------
 # Detect CUDA Support
 # ------------------------------------------------------------------------------
@@ -160,19 +165,30 @@ if(USE_INTEL_TBB)
   endif()
 endif()
 
-# Nvidia's CUDA Core Compute Libraries for GPU acceleration
+# Nvidia's CUDA Core Compute Libraries for GPU-accelerated algorithms
 if(USE_NVIDIA_CCCL)
   # CUB, Thrust, and other libraries of interest are now included into the
-  # CUDA Toolkit, so we don't need this anymore:
-  #
-  # FetchContent_Declare(NvidiaCCCL GIT_REPOSITORY https://github.com/nvidia/cccl.git)
-  # FetchContent_MakeAvailable(NvidiaCCCL)
+  # CUDA Toolkit:
   find_package(CUDAToolkit REQUIRED)
   message(STATUS "CUDA Toolkit Version: ${CUDAToolkit_VERSION}")
   message(STATUS "CUDA Toolkit Include Path: ${CUDAToolkit_INCLUDE_DIRS}")
   message(STATUS "CUDA Toolkit Libraries Path: ${CUDAToolkit_LIBRARY_DIR}")
 endif()
 
+# Nvidia's CUTLASS for GPU-accelerated linear algebra (fetching is currently disabled, hence commented out)
+# set(CUTLASS_ENABLE_HEADERS_ONLY ON)
+# set(CUTLASS_ENABLE_LIBRARY OFF)
+# set(CUTLASS_ENABLE_EXAMPLES OFF)
+# set(CUTLASS_ENABLE_TESTS OFF)
+# set(CUTLASS_ENABLE_TOOLS OFF)
+# set(CUTLASS_NVCC_ARCHS "90a")
+# FetchContent_Declare(
+#   NvidiaCUTLASS 
+#   GIT_REPOSITORY https://github.com/nvidia/cutlass.git
+#   GIT_TAG v3.7.0
+# )
+# FetchContent_MakeAvailable(NvidiaCUTLASS)
+
 # FMT for logging, as `std::format` has limited functionality
 FetchContent_Declare(
   VictorZverovichFMT
@@ -317,7 +333,6 @@ endif()
 # List of all possible compiler IDs:
 # https://cmake.org/cmake/help/latest/variable/CMAKE_LANG_COMPILER_ID.html
 if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" OR CMAKE_CUDA_COMPILER_ID STREQUAL "NVHPC")
-  set_property(SOURCE less_slow.cpp PROPERTY LANGUAGE CUDA)
   set_target_properties(less_slow PROPERTIES POSITION_INDEPENDENT_CODE ON)
   set_target_properties(less_slow PROPERTIES CUDA_ARCHITECTURES "70;75;80;89;90")
   target_compile_options(less_slow PRIVATE
@@ -413,16 +428,25 @@ endif()
 if(USE_NVIDIA_CCCL)
   # For CUB/Thrust, rely on CUDA Toolkit's bundled versions
   # These are automatically included when you include the CUDA Toolkit directories.
+  target_sources(less_slow PRIVATE less_slow.cu)
   target_include_directories(less_slow PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
   target_link_libraries(less_slow PRIVATE CUDA::cudart CUDA::cublas CUDA::cuda_driver)
-  target_sources(less_slow PRIVATE less_slow.cu)
-
-  # Copy the PTX Intermediate Representation file to the runtime directory
-  set_source_files_properties(less_slow.ptx PROPERTIES LANGUAGE "")
-  add_custom_command(
-    TARGET less_slow POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/less_slow.ptx ${CMAKE_CURRENT_BINARY_DIR}/less_slow.ptx
-  )
+  # target_link_libraries(less_slow PRIVATE nvidia::cutlass::cutlass)
+
+  # PTX files copied as-is from the source directory into the build directory
+  set(PTX_FILES less_slow_sm70.ptx less_slow_sm90a.ptx)
+
+  # Register one post-build copy step on `less_slow` per PTX file
+  foreach(PTX IN LISTS PTX_FILES)
+    # Clear the language so CMake doesn't try to compile this file as source code
+    set_source_files_properties(${PTX} PROPERTIES LANGUAGE "")
+    add_custom_command(
+      TARGET less_slow POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy
+              "${CMAKE_CURRENT_SOURCE_DIR}/${PTX}"
+              "${CMAKE_CURRENT_BINARY_DIR}/${PTX}"
+      VERBATIM)
+  endforeach()
 endif()
 
 if(OpenMP_FOUND)
 
@@ -33,6 +33,7 @@ Some of the highlights include:
 - __Intel's oneAPI vs Nvidia's CCCL?__ What's so special about `<thrust>` and `<cub>`?
 - __CUDA C++, [PTX](https://en.wikipedia.org/wiki/Parallel_Thread_Execution) Intermediate Representations, and SASS__, and how do they differ from CPU code?
 - __How to choose between intrinsics, inline `asm`, and separate `.S` files__ for your performance-critical code?
+- __Tensor Cores & Memory__ differences across CPUs and Volta, Ampere, Hopper, and Blackwell GPUs!
 - __What are Encrypted Enclaves__ and what's the latency of Intel SGX, AMD SEV, and ARM Realm? 🔜
 
 To read, jump to the [`less_slow.cpp` source file](https://github.com/ashvardanian/less_slow.cpp/blob/main/less_slow.cpp) and read the code snippets and comments.
@@ -77,7 +78,8 @@ The build will pull and compile several third-party dependencies from the source
 - Lewis Baker's [cppcoro](https://github.com/lewissbaker/cppcoro) implements C++20 coroutines.
 - Jens Axboe's [liburing](https://github.com/axboe/liburing) to simplify Linux kernel-bypass.
 - Chris Kohlhoff's [ASIO](https://github.com/chriskohlhoff/asio) as a [networking TS](https://en.cppreference.com/w/cpp/experimental/networking) extension.
-- Nvidia's [CCCL](https://github.com/nvidia/cccl) for GPU-accelerated computations.
+- Nvidia's [CCCL](https://github.com/nvidia/cccl) for GPU-accelerated algorithms.
+- Nvidia's [CUTLASS](https://github.com/nvidia/cutlass) for GPU-accelerated linear algebra.
 
 To control the output or run specific benchmarks, use the following flags: