From 102642f32836eba721eb995bc0a8f8bfbbb0999d Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Mon, 12 Jan 2026 08:40:39 -0500 Subject: [PATCH 1/8] Refactor GPU verification kernel to gather error stats on GPU --- .../include/profiler/gpu_verification.hpp | 245 ++++++++++++++---- .../profile_grouped_conv_bwd_data_impl.hpp | 44 +--- .../profile_grouped_conv_bwd_weight_impl.hpp | 51 +--- .../profile_grouped_conv_fwd_impl.hpp | 20 +- 4 files changed, 225 insertions(+), 135 deletions(-) diff --git a/profiler/include/profiler/gpu_verification.hpp b/profiler/include/profiler/gpu_verification.hpp index 808dc58c2f7..082851327c7 100644 --- a/profiler/include/profiler/gpu_verification.hpp +++ b/profiler/include/profiler/gpu_verification.hpp @@ -3,6 +3,9 @@ #pragma once +#include <iomanip> +#include <iostream> + #include "ck/utility/data_type.hpp" #include "ck/utility/type_convert.hpp" #include "ck/utility/type.hpp" @@ -13,6 +16,40 @@ namespace ck { namespace profiler { +// Result struct for GPU verification with detailed error reporting +// Provides backward compatibility via operator bool() +struct GpuVerifyResult +{ + bool passed; // Overall pass/fail result + long long error_count; // Number of elements that exceeded tolerance + float max_error; // Maximum error value observed + std::size_t total; // Total number of elements compared + + // Implicit conversion to bool for backward compatibility + // Allows: if (gpu_verify(...)) { ... 
} + operator bool() const { return passed; } + + // Calculate error percentage + float error_percentage() const + { + if(total == 0) + return 0.0f; + return static_cast<float>(error_count) / static_cast<float>(total) * 100.0f; + } + + // Print error summary to stderr (matches check_err format) + void print_error_summary() const + { + if(!passed) + { + std::cerr << "max err: " << max_error; + std::cerr << ", number of errors: " << error_count; + std::cerr << ", " << std::setprecision(2) << std::fixed << error_percentage() + << "% wrong values" << std::endl; + } + } +}; + // Compute relative tolerance for GPU verification // Matches the logic of ck::utils::get_relative_threshold but handles all types template <typename T> @@ -63,16 +100,77 @@ inline float compute_relative_tolerance(const int number_of_accumulations = 1) } } +// Helper function for atomic float max (using compare-and-swap) +__device__ __forceinline__ float atomicMaxFloat(float* address, float val) +{ + int* address_as_int = reinterpret_cast<int*>(address); + int old = *address_as_int; + int assumed; + + do + { + assumed = old; + old = + atomicCAS(address_as_int, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed)))); + } while(assumed != old); + + return __int_as_float(old); +} + +// Helper function for atomic 64-bit add (using compare-and-swap) +// Needed because atomicAdd for long long isn't always available +__device__ __forceinline__ long long atomicAdd64(long long* address, long long val) +{ + unsigned long long* address_as_ull = reinterpret_cast<unsigned long long*>(address); + unsigned long long old = *address_as_ull; + unsigned long long assumed; + + do + { + assumed = old; + old = atomicCAS(address_as_ull, assumed, assumed + static_cast<unsigned long long>(val)); + } while(assumed != old); + + return static_cast<long long>(old); +} + +// Device-side result structure for kernel output +// Packed into a single struct to minimize device memory allocations +struct GpuVerifyDeviceResult +{ + int passed; // 1 = passed, 0 = failed + long long error_count; // Number of errors 
found + float max_error; // Maximum error value +}; + // GPU verification kernel - compares device result against reference using relative and absolute -// tolerance Returns 1 in passed if all elements match within tolerance, 0 otherwise +// tolerance. Tracks all errors (no early exit) to provide detailed error reporting. +// +// Uses LDS (shared memory) for block-level reduction to minimize atomic contention. +// This reduces atomic operations from O(errors) to O(blocks), providing massive speedup +// when there are many errors. +// +// Assumption: Block size is 256 template __global__ void gpu_verify_kernel(const T* __restrict__ device_result, const T* __restrict__ reference_result, float rtol, float atol, long long size, - int* passed) + GpuVerifyDeviceResult* result) { + constexpr int block_size = 256; + + // Shared memory for block-level reduction + __shared__ long long shared_error_count[block_size]; + __shared__ float shared_max_error[block_size]; + __shared__ int shared_has_error[block_size]; + + // Thread-local accumulators (in registers) + long long local_error_count = 0; + float local_max_error = 0.0f; + int local_has_error = 0; + // Grid-stride loop to handle any tensor size long long idx = blockIdx.x * blockDim.x + threadIdx.x; long long stride = blockDim.x * gridDim.x; @@ -89,29 +187,98 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, // Check tolerance (matches CPU check_err logic: err > atol + rtol * abs(ref)) if(abs_diff > atol + rtol * fabsf(ref_val)) { - atomicMin(passed, 0); // Mark as failed - return; // Early exit on first failure + local_has_error = 1; + local_error_count++; + local_max_error = fmaxf(local_max_error, abs_diff); + } + } + + // Store thread-local results to shared memory + shared_error_count[threadIdx.x] = local_error_count; + shared_max_error[threadIdx.x] = local_max_error; + shared_has_error[threadIdx.x] = local_has_error; + __syncthreads(); + + // Block-level reduction: 256 -> 128 -> 64 -> 32 + 
for(unsigned int s = block_size / 2; s > 32; s >>= 1) + { + if(threadIdx.x < s) + { + shared_error_count[threadIdx.x] += shared_error_count[threadIdx.x + s]; + shared_max_error[threadIdx.x] = + fmaxf(shared_max_error[threadIdx.x], shared_max_error[threadIdx.x + s]); + shared_has_error[threadIdx.x] |= shared_has_error[threadIdx.x + s]; + } + __syncthreads(); + } + + // Warp-level reduction: 32 -> 16 -> 8 -> 4 -> 2 -> 1 + // No sync needed within a warp (warp-synchronous programming) + if(threadIdx.x < 32) + { + // Use volatile to prevent compiler from caching shared memory reads + volatile long long* smem_count = shared_error_count; + volatile float* smem_max = shared_max_error; + volatile int* smem_has = shared_has_error; + + smem_count[threadIdx.x] += smem_count[threadIdx.x + 32]; + smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 32]); + smem_has[threadIdx.x] |= smem_has[threadIdx.x + 32]; + + smem_count[threadIdx.x] += smem_count[threadIdx.x + 16]; + smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 16]); + smem_has[threadIdx.x] |= smem_has[threadIdx.x + 16]; + + smem_count[threadIdx.x] += smem_count[threadIdx.x + 8]; + smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 8]); + smem_has[threadIdx.x] |= smem_has[threadIdx.x + 8]; + + smem_count[threadIdx.x] += smem_count[threadIdx.x + 4]; + smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 4]); + smem_has[threadIdx.x] |= smem_has[threadIdx.x + 4]; + + smem_count[threadIdx.x] += smem_count[threadIdx.x + 2]; + smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 2]); + smem_has[threadIdx.x] |= smem_has[threadIdx.x + 2]; + + smem_count[threadIdx.x] += smem_count[threadIdx.x + 1]; + smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 1]); + smem_has[threadIdx.x] |= smem_has[threadIdx.x + 1]; + } + + // Single atomic update per block (reduces contention from O(errors) to 
O(blocks)) + if(threadIdx.x == 0) + { + if(shared_has_error[0]) + { + atomicMin(&result->passed, 0); + atomicAdd64(&result->error_count, shared_error_count[0]); + atomicMaxFloat(&result->max_error, shared_max_error[0]); } } } // Host-side wrapper for GPU verification with explicit tolerances -// Returns true if verification passed, false otherwise +// Returns GpuVerifyResult with detailed error information template -bool gpu_verify(const void* device_result, - const void* reference_result, - float rtol, - float atol, - std::size_t size, - hipStream_t stream = nullptr) +GpuVerifyResult gpu_verify(const void* device_result, + const void* reference_result, + float rtol, + float atol, + std::size_t size, + hipStream_t stream = nullptr) { // Allocate result buffer on device - int* passed_dev; - hip_check_error(hipMalloc(&passed_dev, sizeof(int))); + GpuVerifyDeviceResult* result_dev; + hip_check_error(hipMalloc(&result_dev, sizeof(GpuVerifyDeviceResult))); - // Initialize to passed (1) - int passed_host = 1; - hip_check_error(hipMemcpy(passed_dev, &passed_host, sizeof(int), hipMemcpyHostToDevice)); + // Initialize result struct + GpuVerifyDeviceResult result_host; + result_host.passed = 1; // Start as passed + result_host.error_count = 0; // No errors yet + result_host.max_error = 0.0f; // No error observed + hip_check_error( + hipMemcpy(result_dev, &result_host, sizeof(GpuVerifyDeviceResult), hipMemcpyHostToDevice)); // Launch kernel with grid-stride loop // Use 65535 as max grid size (hardware limit for grid dimension in x) @@ -125,7 +292,7 @@ bool gpu_verify(const void* device_result, rtol, atol, static_cast(size), - passed_dev); + result_dev); hip_check_error(hipGetLastError()); @@ -133,12 +300,20 @@ bool gpu_verify(const void* device_result, hip_check_error(hipStreamSynchronize(stream)); // Get result - hip_check_error(hipMemcpy(&passed_host, passed_dev, sizeof(int), hipMemcpyDeviceToHost)); + hip_check_error( + hipMemcpy(&result_host, result_dev, 
sizeof(GpuVerifyDeviceResult), hipMemcpyDeviceToHost)); // Free device memory - hip_check_error(hipFree(passed_dev)); + hip_check_error(hipFree(result_dev)); - return passed_host == 1; + // Build and return result struct + GpuVerifyResult result; + result.passed = (result_host.passed == 1); + result.error_count = result_host.error_count; + result.max_error = result_host.max_error; + result.total = size; + + return result; } // Forward declaration of gpu_reduce_max @@ -147,15 +322,15 @@ float gpu_reduce_max(const void* device_buffer, std::size_t size, hipStream_t st // Host-side wrapper for GPU verification with automatic tolerance computation // Computes max value on GPU, then computes tolerances and verifies -// Returns true if verification passed, false otherwise +// Returns GpuVerifyResult with detailed error information template -bool gpu_verify(const void* device_result, - const void* reference_result, - int number_of_accumulations, - std::size_t size, - hipStream_t stream = nullptr) +GpuVerifyResult gpu_verify(const void* device_result, + const void* reference_result, + int number_of_accumulations, + std::size_t size, + hipStream_t stream = nullptr) { // Compute max absolute value on GPU (only 4 bytes transferred!) double max_abs_value = @@ -187,24 +362,6 @@ bool gpu_verify(const void* device_result, return gpu_verify(device_result, reference_result, rtol, atol, size, stream); } -// -// Helper function for atomic float max (using compare-and-swap) -__device__ __forceinline__ float atomicMaxFloat(float* address, float val) -{ - int* address_as_int = reinterpret_cast(address); - int old = *address_as_int; - int assumed; - - do - { - assumed = old; - old = - atomicCAS(address_as_int, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed)))); - } while(assumed != old); - - return __int_as_float(old); -} - // GPU reduction kernel for computing max(abs(data)) // This is an internal kernel called only by gpu_reduce_max() wrapper. 
// diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp index d74cf576499..1946ff768b7 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp @@ -237,52 +237,24 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, // Perform GPU verification (max value computed internally on GPU) const std::size_t tensor_size = in_device.mDesc.GetElementSpaceSize(); - bool gpu_passed = ck::profiler::gpu_verify( + auto gpu_result = ck::profiler::gpu_verify( in_device_buf.GetDeviceBuffer(), gpu_ref_in_buf.GetDeviceBuffer(), total_accums, tensor_size); - if(!gpu_passed) + if(!gpu_result) { - // GPU verification failed - fall back to CPU for detailed diagnostics - std::cout << "GPU verification failed, running CPU verification for details..." - << std::endl; - - // Copy both buffers to host - in_device_buf.FromDevice(in_device.mData.data()); - gpu_ref_in_buf.FromDevice(in_host.mData.data()); - - // Recalculate tolerances for CPU verification with original logic - auto rtol = - ck::utils::get_relative_threshold( - num_accums); - auto atol = - ck::utils::get_absolute_threshold( - max_accumulated_value / split_k_for_run, num_accums); - - if(split_k_for_run > 1) - { - auto rtol_split_k = - ck::utils::get_relative_threshold( - split_k_for_run); - auto atol_split_k = - ck::utils::get_absolute_threshold( - max_accumulated_value, split_k_for_run); - rtol = std::max(rtol, rtol_split_k); - atol = std::max(atol, atol_split_k); - } - - // Run CPU verification for detailed error messages - ck::utils::check_err( - in_device, in_host, "Error: Incorrect results!", rtol, atol); + // GPU verification failed - print detailed error summary + gpu_result.print_error_summary(); pass = false; - std::cout << "Relative error threshold: " << rtol - << " Absolute error threshold: " << atol << std::endl; - if(do_log) { + 
// Copy buffers to host for logging + in_device_buf.FromDevice(in_device.mData.data()); + gpu_ref_in_buf.FromDevice(in_host.mData.data()); + LogRangeAsType(std::cout << "output : ", out.mData, ",") << std::endl; LogRangeAsType(std::cout << "weight: ", wei.mData, ",") << std::endl; diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp index 67ad21c5728..ee1cf89aaf0 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp @@ -343,63 +343,28 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, // Perform GPU verification (max value computed internally on GPU) const std::size_t tensor_size = weight_device_result.mDesc.GetElementSpaceSize(); - bool gpu_passed = + auto gpu_result = ck::profiler::gpu_verify( wei_device_buf.GetDeviceBuffer(), gpu_ref_wei_buf.GetDeviceBuffer(), total_accums, tensor_size); - if(!gpu_passed) + if(!gpu_result) { - // GPU verification failed - fall back to CPU for detailed diagnostics - std::cout - << "GPU verification failed, running CPU verification for details..." 
- << std::endl; - - // Copy both buffers to host - wei_device_buf.FromDevice(weight_device_result.mData.data()); - gpu_ref_wei_buf.FromDevice(weight_host_result.mData.data()); - - // Recalculate tolerances for CPU verification with original logic - const index_t num_accums_full = output.GetElementSize() / conv_param.K_; - const index_t num_accums_split_k = split_k_value; - auto rtol = ck::utils:: - get_relative_threshold( - num_accums_full / num_accums_split_k); - auto atol = ck::utils:: - get_absolute_threshold( - max_accumulated_value / num_accums_split_k, - num_accums_full / num_accums_split_k); - - if(split_k_value > 1) - { - auto rtol_split_k = - ck::utils::get_relative_threshold(num_accums_split_k); - auto atol_split_k = ck::utils:: - get_absolute_threshold( - max_accumulated_value, num_accums_split_k); - rtol = std::max(rtol, rtol_split_k); - atol = std::max(atol, atol_split_k); - } - - // Run CPU verification for detailed error messages - ck::utils::check_err(weight_device_result, - weight_host_result, - "Error: Incorrect results!", - rtol, - atol); + // GPU verification failed - print detailed error summary + gpu_result.print_error_summary(); all_pass = false; - std::cout << "Relative error threshold: " << rtol - << " Absolute error threshold: " << atol << std::endl; std::cout << "Fail info: splitK: " << split_k_value << " " << op_ptr->GetTypeString() << std::endl; if(do_log) { + // Copy buffers to host for logging + wei_device_buf.FromDevice(weight_device_result.mData.data()); + gpu_ref_wei_buf.FromDevice(weight_host_result.mData.data()); + LogRangeAsType(std::cout << "output : ", output.mData, ",") << std::endl; LogRangeAsType( diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index 874d1e115c7..0ef55468764 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -243,28 +243,24 @@ bool 
profile_grouped_conv_fwd_impl(int do_verification, // Perform GPU verification (max value computed internally on GPU) const std::size_t tensor_size = device_output.mDesc.GetElementSpaceSize(); - bool gpu_passed = ck::profiler::gpu_verify( + auto gpu_result = ck::profiler::gpu_verify( out_device_buf.GetDeviceBuffer(), gpu_ref_out_buf.GetDeviceBuffer(), num_accums, tensor_size); - if(!gpu_passed) + if(!gpu_result) { - // GPU verification failed - fall back to CPU for detailed diagnostics - std::cout << "GPU verification failed, running CPU verification for details..." - << std::endl; - - // Copy both buffers to host - out_device_buf.FromDevice(device_output.mData.data()); - gpu_ref_out_buf.FromDevice(host_output.mData.data()); - - // Run CPU verification for detailed error messages - ck::utils::check_err(device_output, host_output); + // GPU verification failed - print detailed error summary + gpu_result.print_error_summary(); pass = false; if(do_log) { + // Copy buffers to host for logging + out_device_buf.FromDevice(device_output.mData.data()); + gpu_ref_out_buf.FromDevice(host_output.mData.data()); + LogRangeAsType(std::cout << "input : ", input.mData, ",") << std::endl; LogRangeAsType(std::cout << "weight: ", weight.mData, ",") From 20e7d7a569c87b5b3bbfc6671a447ecb8df6f561 Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Mon, 12 Jan 2026 09:05:21 -0500 Subject: [PATCH 2/8] Check if result is all zero --- .../include/profiler/gpu_verification.hpp | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/profiler/include/profiler/gpu_verification.hpp b/profiler/include/profiler/gpu_verification.hpp index 082851327c7..d90aff120e9 100644 --- a/profiler/include/profiler/gpu_verification.hpp +++ b/profiler/include/profiler/gpu_verification.hpp @@ -24,6 +24,7 @@ struct GpuVerifyResult long long error_count; // Number of elements that exceeded tolerance float max_error; // Maximum error value observed std::size_t total; // Total number of elements 
compared + bool all_zero; // True if device result is all zeros (likely kernel issue) // Implicit conversion to bool for backward compatibility // Allows: if (gpu_verify(...)) { ... } @@ -42,6 +43,12 @@ struct GpuVerifyResult { if(!passed) { + if(all_zero) + { + std::cerr << "WARNING: Device result is all zeros - kernel may not have executed " + "properly!" + << std::endl; + } std::cerr << "max err: " << max_error; std::cerr << ", number of errors: " << error_count; std::cerr << ", " << std::setprecision(2) << std::fixed << error_percentage() @@ -141,6 +148,7 @@ struct GpuVerifyDeviceResult int passed; // 1 = passed, 0 = failed long long error_count; // Number of errors found float max_error; // Maximum error value + int all_zero; // 1 = device result is all zeros, 0 = has non-zero values }; // GPU verification kernel - compares device result against reference using relative and absolute @@ -165,11 +173,13 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, __shared__ long long shared_error_count[block_size]; __shared__ float shared_max_error[block_size]; __shared__ int shared_has_error[block_size]; + __shared__ int shared_has_nonzero[block_size]; // Thread-local accumulators (in registers) long long local_error_count = 0; float local_max_error = 0.0f; int local_has_error = 0; + int local_has_nonzero = 0; // Grid-stride loop to handle any tensor size long long idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -181,6 +191,12 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, float dev_val = type_convert(device_result[i]); float ref_val = type_convert(reference_result[i]); + // Check if device value is non-zero + if(dev_val != 0.0f) + { + local_has_nonzero = 1; + } + // Compute absolute difference float abs_diff = fabsf(dev_val - ref_val); @@ -197,6 +213,7 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, shared_error_count[threadIdx.x] = local_error_count; shared_max_error[threadIdx.x] = 
local_max_error; shared_has_error[threadIdx.x] = local_has_error; + shared_has_nonzero[threadIdx.x] = local_has_nonzero; __syncthreads(); // Block-level reduction: 256 -> 128 -> 64 -> 32 @@ -208,6 +225,7 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, shared_max_error[threadIdx.x] = fmaxf(shared_max_error[threadIdx.x], shared_max_error[threadIdx.x + s]); shared_has_error[threadIdx.x] |= shared_has_error[threadIdx.x + s]; + shared_has_nonzero[threadIdx.x] |= shared_has_nonzero[threadIdx.x + s]; } __syncthreads(); } @@ -220,30 +238,37 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, volatile long long* smem_count = shared_error_count; volatile float* smem_max = shared_max_error; volatile int* smem_has = shared_has_error; + volatile int* smem_nonzero = shared_has_nonzero; smem_count[threadIdx.x] += smem_count[threadIdx.x + 32]; smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 32]); smem_has[threadIdx.x] |= smem_has[threadIdx.x + 32]; + smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 32]; smem_count[threadIdx.x] += smem_count[threadIdx.x + 16]; smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 16]); smem_has[threadIdx.x] |= smem_has[threadIdx.x + 16]; + smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 16]; smem_count[threadIdx.x] += smem_count[threadIdx.x + 8]; smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 8]); smem_has[threadIdx.x] |= smem_has[threadIdx.x + 8]; + smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 8]; smem_count[threadIdx.x] += smem_count[threadIdx.x + 4]; smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 4]); smem_has[threadIdx.x] |= smem_has[threadIdx.x + 4]; + smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 4]; smem_count[threadIdx.x] += smem_count[threadIdx.x + 2]; smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 2]); 
smem_has[threadIdx.x] |= smem_has[threadIdx.x + 2]; + smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 2]; smem_count[threadIdx.x] += smem_count[threadIdx.x + 1]; smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 1]); smem_has[threadIdx.x] |= smem_has[threadIdx.x + 1]; + smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 1]; } // Single atomic update per block (reduces contention from O(errors) to O(blocks)) @@ -255,6 +280,15 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, atomicAdd64(&result->error_count, shared_error_count[0]); atomicMaxFloat(&result->max_error, shared_max_error[0]); } + // Update all_zero flag: if no nonzero values found, mark as all zero + if(!shared_has_nonzero[0]) + { + atomicMin(&result->all_zero, 1); + } + else + { + atomicMin(&result->all_zero, 0); + } } } @@ -277,6 +311,7 @@ GpuVerifyResult gpu_verify(const void* device_result, result_host.passed = 1; // Start as passed result_host.error_count = 0; // No errors yet result_host.max_error = 0.0f; // No error observed + result_host.all_zero = 1; // Start assuming all zeros (will be cleared if nonzero found) hip_check_error( hipMemcpy(result_dev, &result_host, sizeof(GpuVerifyDeviceResult), hipMemcpyHostToDevice)); @@ -312,6 +347,7 @@ GpuVerifyResult gpu_verify(const void* device_result, result.error_count = result_host.error_count; result.max_error = result_host.max_error; result.total = size; + result.all_zero = (result_host.all_zero == 1); return result; } From 3563cd228a559fc4bd2579968f143a8540644f2e Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Mon, 12 Jan 2026 09:10:58 -0500 Subject: [PATCH 3/8] non-negative error count doesn't need custom Atomics --- .../include/profiler/gpu_verification.hpp | 55 +++++++------------ 1 file changed, 19 insertions(+), 36 deletions(-) diff --git a/profiler/include/profiler/gpu_verification.hpp b/profiler/include/profiler/gpu_verification.hpp index d90aff120e9..4d0ed635bda 100644 --- 
a/profiler/include/profiler/gpu_verification.hpp +++ b/profiler/include/profiler/gpu_verification.hpp @@ -20,11 +20,11 @@ namespace profiler { // Provides backward compatibility via operator bool() struct GpuVerifyResult { - bool passed; // Overall pass/fail result - long long error_count; // Number of elements that exceeded tolerance - float max_error; // Maximum error value observed - std::size_t total; // Total number of elements compared - bool all_zero; // True if device result is all zeros (likely kernel issue) + bool passed; // Overall pass/fail result + unsigned long long error_count; // Number of elements that exceeded tolerance + float max_error; // Maximum error value observed + std::size_t total; // Total number of elements compared + bool all_zero; // True if device result is all zeros (likely kernel issue) // Implicit conversion to bool for backward compatibility // Allows: if (gpu_verify(...)) { ... } @@ -124,31 +124,14 @@ __device__ __forceinline__ float atomicMaxFloat(float* address, float val) return __int_as_float(old); } -// Helper function for atomic 64-bit add (using compare-and-swap) -// Needed because atomicAdd for long long isn't always available -__device__ __forceinline__ long long atomicAdd64(long long* address, long long val) -{ - unsigned long long* address_as_ull = reinterpret_cast(address); - unsigned long long old = *address_as_ull; - unsigned long long assumed; - - do - { - assumed = old; - old = atomicCAS(address_as_ull, assumed, assumed + static_cast(val)); - } while(assumed != old); - - return static_cast(old); -} - // Device-side result structure for kernel output // Packed into a single struct to minimize device memory allocations struct GpuVerifyDeviceResult { - int passed; // 1 = passed, 0 = failed - long long error_count; // Number of errors found - float max_error; // Maximum error value - int all_zero; // 1 = device result is all zeros, 0 = has non-zero values + int passed; // 1 = passed, 0 = failed + unsigned long long 
error_count; // Number of errors found + float max_error; // Maximum error value + int all_zero; // 1 = device result is all zeros, 0 = has non-zero values }; // GPU verification kernel - compares device result against reference using relative and absolute @@ -170,16 +153,16 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, constexpr int block_size = 256; // Shared memory for block-level reduction - __shared__ long long shared_error_count[block_size]; + __shared__ unsigned long long shared_error_count[block_size]; __shared__ float shared_max_error[block_size]; __shared__ int shared_has_error[block_size]; __shared__ int shared_has_nonzero[block_size]; // Thread-local accumulators (in registers) - long long local_error_count = 0; - float local_max_error = 0.0f; - int local_has_error = 0; - int local_has_nonzero = 0; + unsigned long long local_error_count = 0; + float local_max_error = 0.0f; + int local_has_error = 0; + int local_has_nonzero = 0; // Grid-stride loop to handle any tensor size long long idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -235,10 +218,10 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, if(threadIdx.x < 32) { // Use volatile to prevent compiler from caching shared memory reads - volatile long long* smem_count = shared_error_count; - volatile float* smem_max = shared_max_error; - volatile int* smem_has = shared_has_error; - volatile int* smem_nonzero = shared_has_nonzero; + volatile unsigned long long* smem_count = shared_error_count; + volatile float* smem_max = shared_max_error; + volatile int* smem_has = shared_has_error; + volatile int* smem_nonzero = shared_has_nonzero; smem_count[threadIdx.x] += smem_count[threadIdx.x + 32]; smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 32]); @@ -277,7 +260,7 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, if(shared_has_error[0]) { atomicMin(&result->passed, 0); - atomicAdd64(&result->error_count, 
shared_error_count[0]); + atomicAdd(&result->error_count, shared_error_count[0]); atomicMaxFloat(&result->max_error, shared_max_error[0]); } // Update all_zero flag: if no nonzero values found, mark as all zero From 2f14048c633bc4e06d8dee9264046b8c5b322dba Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Mon, 12 Jan 2026 09:27:05 -0500 Subject: [PATCH 4/8] Remove unnecessary AtomicMaxFloat function --- .../include/profiler/gpu_verification.hpp | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/profiler/include/profiler/gpu_verification.hpp b/profiler/include/profiler/gpu_verification.hpp index 4d0ed635bda..da31771fb0a 100644 --- a/profiler/include/profiler/gpu_verification.hpp +++ b/profiler/include/profiler/gpu_verification.hpp @@ -107,23 +107,6 @@ inline float compute_relative_tolerance(const int number_of_accumulations = 1) } } -// Helper function for atomic float max (using compare-and-swap) -__device__ __forceinline__ float atomicMaxFloat(float* address, float val) -{ - int* address_as_int = reinterpret_cast(address); - int old = *address_as_int; - int assumed; - - do - { - assumed = old; - old = - atomicCAS(address_as_int, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed)))); - } while(assumed != old); - - return __int_as_float(old); -} - // Device-side result structure for kernel output // Packed into a single struct to minimize device memory allocations struct GpuVerifyDeviceResult @@ -261,7 +244,7 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, { atomicMin(&result->passed, 0); atomicAdd(&result->error_count, shared_error_count[0]); - atomicMaxFloat(&result->max_error, shared_max_error[0]); + atomicMax(&result->max_error, shared_max_error[0]); } // Update all_zero flag: if no nonzero values found, mark as all zero if(!shared_has_nonzero[0]) From 715eb8dd04f3d6eed172b96369008cb0d877fc19 Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Mon, 12 Jan 2026 09:44:24 -0500 Subject: 
[PATCH 5/8] Simpler warp reduction, remove passed flag --- .../include/profiler/gpu_verification.hpp | 87 +++++-------------- 1 file changed, 20 insertions(+), 67 deletions(-) diff --git a/profiler/include/profiler/gpu_verification.hpp b/profiler/include/profiler/gpu_verification.hpp index da31771fb0a..c93f4deb8fc 100644 --- a/profiler/include/profiler/gpu_verification.hpp +++ b/profiler/include/profiler/gpu_verification.hpp @@ -20,7 +20,6 @@ namespace profiler { // Provides backward compatibility via operator bool() struct GpuVerifyResult { - bool passed; // Overall pass/fail result unsigned long long error_count; // Number of elements that exceeded tolerance float max_error; // Maximum error value observed std::size_t total; // Total number of elements compared @@ -28,7 +27,7 @@ struct GpuVerifyResult // Implicit conversion to bool for backward compatibility // Allows: if (gpu_verify(...)) { ... } - operator bool() const { return passed; } + operator bool() const { return error_count == 0; } // Calculate error percentage float error_percentage() const @@ -41,7 +40,7 @@ struct GpuVerifyResult // Print error summary to stderr (matches check_err format) void print_error_summary() const { - if(!passed) + if(error_count > 0) { if(all_zero) { @@ -111,7 +110,6 @@ inline float compute_relative_tolerance(const int number_of_accumulations = 1) // Packed into a single struct to minimize device memory allocations struct GpuVerifyDeviceResult { - int passed; // 1 = passed, 0 = failed unsigned long long error_count; // Number of errors found float max_error; // Maximum error value int all_zero; // 1 = device result is all zeros, 0 = has non-zero values @@ -196,53 +194,20 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, __syncthreads(); } - // Warp-level reduction: 32 -> 16 -> 8 -> 4 -> 2 -> 1 - // No sync needed within a warp (warp-synchronous programming) - if(threadIdx.x < 32) - { - // Use volatile to prevent compiler from caching shared memory reads 
- volatile unsigned long long* smem_count = shared_error_count; - volatile float* smem_max = shared_max_error; - volatile int* smem_has = shared_has_error; - volatile int* smem_nonzero = shared_has_nonzero; - - smem_count[threadIdx.x] += smem_count[threadIdx.x + 32]; - smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 32]); - smem_has[threadIdx.x] |= smem_has[threadIdx.x + 32]; - smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 32]; - - smem_count[threadIdx.x] += smem_count[threadIdx.x + 16]; - smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 16]); - smem_has[threadIdx.x] |= smem_has[threadIdx.x + 16]; - smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 16]; - - smem_count[threadIdx.x] += smem_count[threadIdx.x + 8]; - smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 8]); - smem_has[threadIdx.x] |= smem_has[threadIdx.x + 8]; - smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 8]; - - smem_count[threadIdx.x] += smem_count[threadIdx.x + 4]; - smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 4]); - smem_has[threadIdx.x] |= smem_has[threadIdx.x + 4]; - smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 4]; - - smem_count[threadIdx.x] += smem_count[threadIdx.x + 2]; - smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 2]); - smem_has[threadIdx.x] |= smem_has[threadIdx.x + 2]; - smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 2]; - - smem_count[threadIdx.x] += smem_count[threadIdx.x + 1]; - smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 1]); - smem_has[threadIdx.x] |= smem_has[threadIdx.x + 1]; - smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 1]; - } - - // Single atomic update per block (reduces contention from O(errors) to O(blocks)) + // Final reduction of remaining 32 elements in thread 0 if(threadIdx.x == 0) { + for(int i = 1; i < 32; ++i) + { + shared_error_count[0] 
+= shared_error_count[i]; + shared_max_error[0] = fmaxf(shared_max_error[0], shared_max_error[i]); + shared_has_error[0] |= shared_has_error[i]; + shared_has_nonzero[0] |= shared_has_nonzero[i]; + } + + // Single atomic update per block (reduces contention from O(errors) to O(blocks)) if(shared_has_error[0]) { - atomicMin(&result->passed, 0); atomicAdd(&result->error_count, shared_error_count[0]); atomicMax(&result->max_error, shared_max_error[0]); } @@ -274,7 +239,6 @@ GpuVerifyResult gpu_verify(const void* device_result, // Initialize result struct GpuVerifyDeviceResult result_host; - result_host.passed = 1; // Start as passed result_host.error_count = 0; // No errors yet result_host.max_error = 0.0f; // No error observed result_host.all_zero = 1; // Start assuming all zeros (will be cleared if nonzero found) @@ -309,7 +273,6 @@ GpuVerifyResult gpu_verify(const void* device_result, // Build and return result struct GpuVerifyResult result; - result.passed = (result_host.passed == 1); result.error_count = result_host.error_count; result.max_error = result_host.max_error; result.total = size; @@ -399,26 +362,16 @@ gpu_reduce_max_kernel(const T* __restrict__ data, long long size, float* __restr __syncthreads(); } - // Warp-level reduction: 32 -> 16 -> 8 -> 4 -> 2 -> 1 - // No sync needed within a warp - if(threadIdx.x < 32) - { - volatile float* smem = shared_max; - smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 32]); - smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 16]); - smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 8]); - smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 4]); - smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 2]); - smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 1]); - } - - // Two-phase reduction pattern minimizes atomic contention: - // 1. Each block reduces to shared memory (above) - // 2. 
Single thread per block updates global max (below) - // This limits atomic operations to O(grid_size) rather than O(total_threads) + // Final reduction of remaining 32 elements in thread 0 if(threadIdx.x == 0) { - atomicMaxFloat(max_val, shared_max[0]); + for(int i = 1; i < 32; ++i) + { + shared_max[0] = fmaxf(shared_max[0], shared_max[i]); + } + + // Single atomic update per block + atomicMax(max_val, shared_max[0]); } } From a8354cc5acd4c305c4d301728448ff66e43ac035 Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Mon, 12 Jan 2026 10:15:02 -0500 Subject: [PATCH 6/8] Move verification header to include --- .../ck/library/utility}/gpu_verification.hpp | 0 .../include/profiler/profile_grouped_conv_bwd_data_impl.hpp | 2 +- .../include/profiler/profile_grouped_conv_bwd_weight_impl.hpp | 2 +- profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename {profiler/include/profiler => include/ck/library/utility}/gpu_verification.hpp (100%) diff --git a/profiler/include/profiler/gpu_verification.hpp b/include/ck/library/utility/gpu_verification.hpp similarity index 100% rename from profiler/include/profiler/gpu_verification.hpp rename to include/ck/library/utility/gpu_verification.hpp diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp index 1946ff768b7..907b3ed2423 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp @@ -20,7 +20,7 @@ #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp" #include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp" -#include "profiler/gpu_verification.hpp" +#include "ck/library/utility/gpu_verification.hpp" namespace ck { namespace profiler { diff --git 
a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp index ee1cf89aaf0..ab118fdebf5 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp @@ -24,7 +24,7 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp" #include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp" -#include "profiler/gpu_verification.hpp" +#include "ck/library/utility/gpu_verification.hpp" namespace ck { namespace profiler { diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index 0ef55468764..3af523723cd 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -23,7 +23,7 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" #include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp" -#include "profiler/gpu_verification.hpp" +#include "ck/library/utility/gpu_verification.hpp" namespace ck { namespace profiler { From c20a181ba9eb315bec2d96edede3052c94a987a6 Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Tue, 13 Jan 2026 01:53:13 -0500 Subject: [PATCH 7/8] Fix header path in test --- test/gpu_verification/test_gpu_verification.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/gpu_verification/test_gpu_verification.cpp b/test/gpu_verification/test_gpu_verification.cpp index 977475f0649..3be950c2b66 100644 --- a/test/gpu_verification/test_gpu_verification.cpp +++ b/test/gpu_verification/test_gpu_verification.cpp @@ -13,8 +13,8 @@ #include "ck/library/utility/host_tensor.hpp" 
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/gpu_verification.hpp" #include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp" -#include "profiler/gpu_verification.hpp" using namespace ck::profiler; using ck::ref::SimpleDeviceMem; From 218d6c49bb09e0ce5c440cf3c086afefc9c46a24 Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Tue, 13 Jan 2026 05:16:07 -0500 Subject: [PATCH 8/8] Fix block reduction loop --- include/ck/library/utility/gpu_verification.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/library/utility/gpu_verification.hpp b/include/ck/library/utility/gpu_verification.hpp index c93f4deb8fc..e4a444ecb9c 100644 --- a/include/ck/library/utility/gpu_verification.hpp +++ b/include/ck/library/utility/gpu_verification.hpp @@ -181,7 +181,7 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, __syncthreads(); // Block-level reduction: 256 -> 128 -> 64 -> 32 - for(unsigned int s = block_size / 2; s > 32; s >>= 1) + for(unsigned int s = block_size / 2; s >= 32; s >>= 1) { if(threadIdx.x < s) { @@ -353,7 +353,7 @@ gpu_reduce_max_kernel(const T* __restrict__ data, long long size, float* __restr __syncthreads(); // Block-level reduction: 256 -> 128 -> 64 -> 32 - for(unsigned int s = block_size / 2; s > 32; s >>= 1) + for(unsigned int s = block_size / 2; s >= 32; s >>= 1) { if(threadIdx.x < s) {