From 102642f32836eba721eb995bc0a8f8bfbbb0999d Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Mon, 12 Jan 2026 08:40:39 -0500 Subject: [PATCH 1/8] Refactor GPU verification kernel to gather error stats on GPU --- .../include/profiler/gpu_verification.hpp | 245 ++++++++++++++---- .../profile_grouped_conv_bwd_data_impl.hpp | 44 +--- .../profile_grouped_conv_bwd_weight_impl.hpp | 51 +--- .../profile_grouped_conv_fwd_impl.hpp | 20 +- 4 files changed, 225 insertions(+), 135 deletions(-) diff --git a/profiler/include/profiler/gpu_verification.hpp b/profiler/include/profiler/gpu_verification.hpp index 808dc58c2f7..082851327c7 100644 --- a/profiler/include/profiler/gpu_verification.hpp +++ b/profiler/include/profiler/gpu_verification.hpp @@ -3,6 +3,9 @@ #pragma once +#include <iomanip> +#include <iostream> + #include "ck/utility/data_type.hpp" #include "ck/utility/type_convert.hpp" #include "ck/utility/type.hpp" @@ -13,6 +16,40 @@ namespace ck { namespace profiler { +// Result struct for GPU verification with detailed error reporting +// Provides backward compatibility via operator bool() +struct GpuVerifyResult +{ + bool passed; // Overall pass/fail result + long long error_count; // Number of elements that exceeded tolerance + float max_error; // Maximum error value observed + std::size_t total; // Total number of elements compared + + // Implicit conversion to bool for backward compatibility + // Allows: if (gpu_verify(...)) { ... 
} + operator bool() const { return passed; } + + // Calculate error percentage + float error_percentage() const + { + if(total == 0) + return 0.0f; + return static_cast<float>(error_count) / static_cast<float>(total) * 100.0f; + } + + // Print error summary to stderr (matches check_err format) + void print_error_summary() const + { + if(!passed) + { + std::cerr << "max err: " << max_error; + std::cerr << ", number of errors: " << error_count; + std::cerr << ", " << std::setprecision(2) << std::fixed << error_percentage() + << "% wrong values" << std::endl; + } + } +}; + // Compute relative tolerance for GPU verification // Matches the logic of ck::utils::get_relative_threshold but handles all types template <typename T> @@ -63,16 +100,77 @@ inline float compute_relative_tolerance(const int number_of_accumulations = 1) } } +// Helper function for atomic float max (using compare-and-swap) +__device__ __forceinline__ float atomicMaxFloat(float* address, float val) +{ + int* address_as_int = reinterpret_cast<int*>(address); + int old = *address_as_int; + int assumed; + + do + { + assumed = old; + old = + atomicCAS(address_as_int, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed)))); + } while(assumed != old); + + return __int_as_float(old); +} + +// Helper function for atomic 64-bit add (using compare-and-swap) +// Needed because atomicAdd for long long isn't always available +__device__ __forceinline__ long long atomicAdd64(long long* address, long long val) +{ + unsigned long long* address_as_ull = reinterpret_cast<unsigned long long*>(address); + unsigned long long old = *address_as_ull; + unsigned long long assumed; + + do + { + assumed = old; + old = atomicCAS(address_as_ull, assumed, assumed + static_cast<unsigned long long>(val)); + } while(assumed != old); + + return static_cast<long long>(old); +} + +// Device-side result structure for kernel output +// Packed into a single struct to minimize device memory allocations +struct GpuVerifyDeviceResult +{ + int passed; // 1 = passed, 0 = failed + long long error_count; // Number of errors 
found + float max_error; // Maximum error value +}; + // GPU verification kernel - compares device result against reference using relative and absolute -// tolerance Returns 1 in passed if all elements match within tolerance, 0 otherwise +// tolerance. Tracks all errors (no early exit) to provide detailed error reporting. +// +// Uses LDS (shared memory) for block-level reduction to minimize atomic contention. +// This reduces atomic operations from O(errors) to O(blocks), providing massive speedup +// when there are many errors. +// +// Assumption: Block size is 256 template __global__ void gpu_verify_kernel(const T* __restrict__ device_result, const T* __restrict__ reference_result, float rtol, float atol, long long size, - int* passed) + GpuVerifyDeviceResult* result) { + constexpr int block_size = 256; + + // Shared memory for block-level reduction + __shared__ long long shared_error_count[block_size]; + __shared__ float shared_max_error[block_size]; + __shared__ int shared_has_error[block_size]; + + // Thread-local accumulators (in registers) + long long local_error_count = 0; + float local_max_error = 0.0f; + int local_has_error = 0; + // Grid-stride loop to handle any tensor size long long idx = blockIdx.x * blockDim.x + threadIdx.x; long long stride = blockDim.x * gridDim.x; @@ -89,29 +187,98 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, // Check tolerance (matches CPU check_err logic: err > atol + rtol * abs(ref)) if(abs_diff > atol + rtol * fabsf(ref_val)) { - atomicMin(passed, 0); // Mark as failed - return; // Early exit on first failure + local_has_error = 1; + local_error_count++; + local_max_error = fmaxf(local_max_error, abs_diff); + } + } + + // Store thread-local results to shared memory + shared_error_count[threadIdx.x] = local_error_count; + shared_max_error[threadIdx.x] = local_max_error; + shared_has_error[threadIdx.x] = local_has_error; + __syncthreads(); + + // Block-level reduction: 256 -> 128 -> 64 -> 32 + 
for(unsigned int s = block_size / 2; s > 32; s >>= 1) + { + if(threadIdx.x < s) + { + shared_error_count[threadIdx.x] += shared_error_count[threadIdx.x + s]; + shared_max_error[threadIdx.x] = + fmaxf(shared_max_error[threadIdx.x], shared_max_error[threadIdx.x + s]); + shared_has_error[threadIdx.x] |= shared_has_error[threadIdx.x + s]; + } + __syncthreads(); + } + + // Warp-level reduction: 32 -> 16 -> 8 -> 4 -> 2 -> 1 + // No sync needed within a warp (warp-synchronous programming) + if(threadIdx.x < 32) + { + // Use volatile to prevent compiler from caching shared memory reads + volatile long long* smem_count = shared_error_count; + volatile float* smem_max = shared_max_error; + volatile int* smem_has = shared_has_error; + + smem_count[threadIdx.x] += smem_count[threadIdx.x + 32]; + smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 32]); + smem_has[threadIdx.x] |= smem_has[threadIdx.x + 32]; + + smem_count[threadIdx.x] += smem_count[threadIdx.x + 16]; + smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 16]); + smem_has[threadIdx.x] |= smem_has[threadIdx.x + 16]; + + smem_count[threadIdx.x] += smem_count[threadIdx.x + 8]; + smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 8]); + smem_has[threadIdx.x] |= smem_has[threadIdx.x + 8]; + + smem_count[threadIdx.x] += smem_count[threadIdx.x + 4]; + smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 4]); + smem_has[threadIdx.x] |= smem_has[threadIdx.x + 4]; + + smem_count[threadIdx.x] += smem_count[threadIdx.x + 2]; + smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 2]); + smem_has[threadIdx.x] |= smem_has[threadIdx.x + 2]; + + smem_count[threadIdx.x] += smem_count[threadIdx.x + 1]; + smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 1]); + smem_has[threadIdx.x] |= smem_has[threadIdx.x + 1]; + } + + // Single atomic update per block (reduces contention from O(errors) to 
O(blocks)) + if(threadIdx.x == 0) + { + if(shared_has_error[0]) + { + atomicMin(&result->passed, 0); + atomicAdd64(&result->error_count, shared_error_count[0]); + atomicMaxFloat(&result->max_error, shared_max_error[0]); } } } // Host-side wrapper for GPU verification with explicit tolerances -// Returns true if verification passed, false otherwise +// Returns GpuVerifyResult with detailed error information template -bool gpu_verify(const void* device_result, - const void* reference_result, - float rtol, - float atol, - std::size_t size, - hipStream_t stream = nullptr) +GpuVerifyResult gpu_verify(const void* device_result, + const void* reference_result, + float rtol, + float atol, + std::size_t size, + hipStream_t stream = nullptr) { // Allocate result buffer on device - int* passed_dev; - hip_check_error(hipMalloc(&passed_dev, sizeof(int))); + GpuVerifyDeviceResult* result_dev; + hip_check_error(hipMalloc(&result_dev, sizeof(GpuVerifyDeviceResult))); - // Initialize to passed (1) - int passed_host = 1; - hip_check_error(hipMemcpy(passed_dev, &passed_host, sizeof(int), hipMemcpyHostToDevice)); + // Initialize result struct + GpuVerifyDeviceResult result_host; + result_host.passed = 1; // Start as passed + result_host.error_count = 0; // No errors yet + result_host.max_error = 0.0f; // No error observed + hip_check_error( + hipMemcpy(result_dev, &result_host, sizeof(GpuVerifyDeviceResult), hipMemcpyHostToDevice)); // Launch kernel with grid-stride loop // Use 65535 as max grid size (hardware limit for grid dimension in x) @@ -125,7 +292,7 @@ bool gpu_verify(const void* device_result, rtol, atol, static_cast(size), - passed_dev); + result_dev); hip_check_error(hipGetLastError()); @@ -133,12 +300,20 @@ bool gpu_verify(const void* device_result, hip_check_error(hipStreamSynchronize(stream)); // Get result - hip_check_error(hipMemcpy(&passed_host, passed_dev, sizeof(int), hipMemcpyDeviceToHost)); + hip_check_error( + hipMemcpy(&result_host, result_dev, 
sizeof(GpuVerifyDeviceResult), hipMemcpyDeviceToHost)); // Free device memory - hip_check_error(hipFree(passed_dev)); + hip_check_error(hipFree(result_dev)); - return passed_host == 1; + // Build and return result struct + GpuVerifyResult result; + result.passed = (result_host.passed == 1); + result.error_count = result_host.error_count; + result.max_error = result_host.max_error; + result.total = size; + + return result; } // Forward declaration of gpu_reduce_max @@ -147,15 +322,15 @@ float gpu_reduce_max(const void* device_buffer, std::size_t size, hipStream_t st // Host-side wrapper for GPU verification with automatic tolerance computation // Computes max value on GPU, then computes tolerances and verifies -// Returns true if verification passed, false otherwise +// Returns GpuVerifyResult with detailed error information template -bool gpu_verify(const void* device_result, - const void* reference_result, - int number_of_accumulations, - std::size_t size, - hipStream_t stream = nullptr) +GpuVerifyResult gpu_verify(const void* device_result, + const void* reference_result, + int number_of_accumulations, + std::size_t size, + hipStream_t stream = nullptr) { // Compute max absolute value on GPU (only 4 bytes transferred!) double max_abs_value = @@ -187,24 +362,6 @@ bool gpu_verify(const void* device_result, return gpu_verify(device_result, reference_result, rtol, atol, size, stream); } -// -// Helper function for atomic float max (using compare-and-swap) -__device__ __forceinline__ float atomicMaxFloat(float* address, float val) -{ - int* address_as_int = reinterpret_cast(address); - int old = *address_as_int; - int assumed; - - do - { - assumed = old; - old = - atomicCAS(address_as_int, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed)))); - } while(assumed != old); - - return __int_as_float(old); -} - // GPU reduction kernel for computing max(abs(data)) // This is an internal kernel called only by gpu_reduce_max() wrapper. 
// diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp index d74cf576499..1946ff768b7 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp @@ -237,52 +237,24 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, // Perform GPU verification (max value computed internally on GPU) const std::size_t tensor_size = in_device.mDesc.GetElementSpaceSize(); - bool gpu_passed = ck::profiler::gpu_verify( + auto gpu_result = ck::profiler::gpu_verify( in_device_buf.GetDeviceBuffer(), gpu_ref_in_buf.GetDeviceBuffer(), total_accums, tensor_size); - if(!gpu_passed) + if(!gpu_result) { - // GPU verification failed - fall back to CPU for detailed diagnostics - std::cout << "GPU verification failed, running CPU verification for details..." - << std::endl; - - // Copy both buffers to host - in_device_buf.FromDevice(in_device.mData.data()); - gpu_ref_in_buf.FromDevice(in_host.mData.data()); - - // Recalculate tolerances for CPU verification with original logic - auto rtol = - ck::utils::get_relative_threshold( - num_accums); - auto atol = - ck::utils::get_absolute_threshold( - max_accumulated_value / split_k_for_run, num_accums); - - if(split_k_for_run > 1) - { - auto rtol_split_k = - ck::utils::get_relative_threshold( - split_k_for_run); - auto atol_split_k = - ck::utils::get_absolute_threshold( - max_accumulated_value, split_k_for_run); - rtol = std::max(rtol, rtol_split_k); - atol = std::max(atol, atol_split_k); - } - - // Run CPU verification for detailed error messages - ck::utils::check_err( - in_device, in_host, "Error: Incorrect results!", rtol, atol); + // GPU verification failed - print detailed error summary + gpu_result.print_error_summary(); pass = false; - std::cout << "Relative error threshold: " << rtol - << " Absolute error threshold: " << atol << std::endl; - if(do_log) { + 
// Copy buffers to host for logging + in_device_buf.FromDevice(in_device.mData.data()); + gpu_ref_in_buf.FromDevice(in_host.mData.data()); + LogRangeAsType(std::cout << "output : ", out.mData, ",") << std::endl; LogRangeAsType(std::cout << "weight: ", wei.mData, ",") << std::endl; diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp index 67ad21c5728..ee1cf89aaf0 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp @@ -343,63 +343,28 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, // Perform GPU verification (max value computed internally on GPU) const std::size_t tensor_size = weight_device_result.mDesc.GetElementSpaceSize(); - bool gpu_passed = + auto gpu_result = ck::profiler::gpu_verify( wei_device_buf.GetDeviceBuffer(), gpu_ref_wei_buf.GetDeviceBuffer(), total_accums, tensor_size); - if(!gpu_passed) + if(!gpu_result) { - // GPU verification failed - fall back to CPU for detailed diagnostics - std::cout - << "GPU verification failed, running CPU verification for details..." 
- << std::endl; - - // Copy both buffers to host - wei_device_buf.FromDevice(weight_device_result.mData.data()); - gpu_ref_wei_buf.FromDevice(weight_host_result.mData.data()); - - // Recalculate tolerances for CPU verification with original logic - const index_t num_accums_full = output.GetElementSize() / conv_param.K_; - const index_t num_accums_split_k = split_k_value; - auto rtol = ck::utils:: - get_relative_threshold( - num_accums_full / num_accums_split_k); - auto atol = ck::utils:: - get_absolute_threshold( - max_accumulated_value / num_accums_split_k, - num_accums_full / num_accums_split_k); - - if(split_k_value > 1) - { - auto rtol_split_k = - ck::utils::get_relative_threshold(num_accums_split_k); - auto atol_split_k = ck::utils:: - get_absolute_threshold( - max_accumulated_value, num_accums_split_k); - rtol = std::max(rtol, rtol_split_k); - atol = std::max(atol, atol_split_k); - } - - // Run CPU verification for detailed error messages - ck::utils::check_err(weight_device_result, - weight_host_result, - "Error: Incorrect results!", - rtol, - atol); + // GPU verification failed - print detailed error summary + gpu_result.print_error_summary(); all_pass = false; - std::cout << "Relative error threshold: " << rtol - << " Absolute error threshold: " << atol << std::endl; std::cout << "Fail info: splitK: " << split_k_value << " " << op_ptr->GetTypeString() << std::endl; if(do_log) { + // Copy buffers to host for logging + wei_device_buf.FromDevice(weight_device_result.mData.data()); + gpu_ref_wei_buf.FromDevice(weight_host_result.mData.data()); + LogRangeAsType(std::cout << "output : ", output.mData, ",") << std::endl; LogRangeAsType( diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index 874d1e115c7..0ef55468764 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -243,28 +243,24 @@ bool 
profile_grouped_conv_fwd_impl(int do_verification, // Perform GPU verification (max value computed internally on GPU) const std::size_t tensor_size = device_output.mDesc.GetElementSpaceSize(); - bool gpu_passed = ck::profiler::gpu_verify( + auto gpu_result = ck::profiler::gpu_verify( out_device_buf.GetDeviceBuffer(), gpu_ref_out_buf.GetDeviceBuffer(), num_accums, tensor_size); - if(!gpu_passed) + if(!gpu_result) { - // GPU verification failed - fall back to CPU for detailed diagnostics - std::cout << "GPU verification failed, running CPU verification for details..." - << std::endl; - - // Copy both buffers to host - out_device_buf.FromDevice(device_output.mData.data()); - gpu_ref_out_buf.FromDevice(host_output.mData.data()); - - // Run CPU verification for detailed error messages - ck::utils::check_err(device_output, host_output); + // GPU verification failed - print detailed error summary + gpu_result.print_error_summary(); pass = false; if(do_log) { + // Copy buffers to host for logging + out_device_buf.FromDevice(device_output.mData.data()); + gpu_ref_out_buf.FromDevice(host_output.mData.data()); + LogRangeAsType(std::cout << "input : ", input.mData, ",") << std::endl; LogRangeAsType(std::cout << "weight: ", weight.mData, ",") From 20e7d7a569c87b5b3bbfc6671a447ecb8df6f561 Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Mon, 12 Jan 2026 09:05:21 -0500 Subject: [PATCH 2/8] Check if result is all zero --- .../include/profiler/gpu_verification.hpp | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/profiler/include/profiler/gpu_verification.hpp b/profiler/include/profiler/gpu_verification.hpp index 082851327c7..d90aff120e9 100644 --- a/profiler/include/profiler/gpu_verification.hpp +++ b/profiler/include/profiler/gpu_verification.hpp @@ -24,6 +24,7 @@ struct GpuVerifyResult long long error_count; // Number of elements that exceeded tolerance float max_error; // Maximum error value observed std::size_t total; // Total number of elements 
compared + bool all_zero; // True if device result is all zeros (likely kernel issue) // Implicit conversion to bool for backward compatibility // Allows: if (gpu_verify(...)) { ... } @@ -42,6 +43,12 @@ struct GpuVerifyResult { if(!passed) { + if(all_zero) + { + std::cerr << "WARNING: Device result is all zeros - kernel may not have executed " + "properly!" + << std::endl; + } std::cerr << "max err: " << max_error; std::cerr << ", number of errors: " << error_count; std::cerr << ", " << std::setprecision(2) << std::fixed << error_percentage() @@ -141,6 +148,7 @@ struct GpuVerifyDeviceResult int passed; // 1 = passed, 0 = failed long long error_count; // Number of errors found float max_error; // Maximum error value + int all_zero; // 1 = device result is all zeros, 0 = has non-zero values }; // GPU verification kernel - compares device result against reference using relative and absolute @@ -165,11 +173,13 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, __shared__ long long shared_error_count[block_size]; __shared__ float shared_max_error[block_size]; __shared__ int shared_has_error[block_size]; + __shared__ int shared_has_nonzero[block_size]; // Thread-local accumulators (in registers) long long local_error_count = 0; float local_max_error = 0.0f; int local_has_error = 0; + int local_has_nonzero = 0; // Grid-stride loop to handle any tensor size long long idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -181,6 +191,12 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, float dev_val = type_convert(device_result[i]); float ref_val = type_convert(reference_result[i]); + // Check if device value is non-zero + if(dev_val != 0.0f) + { + local_has_nonzero = 1; + } + // Compute absolute difference float abs_diff = fabsf(dev_val - ref_val); @@ -197,6 +213,7 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, shared_error_count[threadIdx.x] = local_error_count; shared_max_error[threadIdx.x] = 
local_max_error; shared_has_error[threadIdx.x] = local_has_error; + shared_has_nonzero[threadIdx.x] = local_has_nonzero; __syncthreads(); // Block-level reduction: 256 -> 128 -> 64 -> 32 @@ -208,6 +225,7 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, shared_max_error[threadIdx.x] = fmaxf(shared_max_error[threadIdx.x], shared_max_error[threadIdx.x + s]); shared_has_error[threadIdx.x] |= shared_has_error[threadIdx.x + s]; + shared_has_nonzero[threadIdx.x] |= shared_has_nonzero[threadIdx.x + s]; } __syncthreads(); } @@ -220,30 +238,37 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, volatile long long* smem_count = shared_error_count; volatile float* smem_max = shared_max_error; volatile int* smem_has = shared_has_error; + volatile int* smem_nonzero = shared_has_nonzero; smem_count[threadIdx.x] += smem_count[threadIdx.x + 32]; smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 32]); smem_has[threadIdx.x] |= smem_has[threadIdx.x + 32]; + smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 32]; smem_count[threadIdx.x] += smem_count[threadIdx.x + 16]; smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 16]); smem_has[threadIdx.x] |= smem_has[threadIdx.x + 16]; + smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 16]; smem_count[threadIdx.x] += smem_count[threadIdx.x + 8]; smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 8]); smem_has[threadIdx.x] |= smem_has[threadIdx.x + 8]; + smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 8]; smem_count[threadIdx.x] += smem_count[threadIdx.x + 4]; smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 4]); smem_has[threadIdx.x] |= smem_has[threadIdx.x + 4]; + smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 4]; smem_count[threadIdx.x] += smem_count[threadIdx.x + 2]; smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 2]); 
smem_has[threadIdx.x] |= smem_has[threadIdx.x + 2]; + smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 2]; smem_count[threadIdx.x] += smem_count[threadIdx.x + 1]; smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 1]); smem_has[threadIdx.x] |= smem_has[threadIdx.x + 1]; + smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 1]; } // Single atomic update per block (reduces contention from O(errors) to O(blocks)) @@ -255,6 +280,15 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, atomicAdd64(&result->error_count, shared_error_count[0]); atomicMaxFloat(&result->max_error, shared_max_error[0]); } + // Update all_zero flag: if no nonzero values found, mark as all zero + if(!shared_has_nonzero[0]) + { + atomicMin(&result->all_zero, 1); + } + else + { + atomicMin(&result->all_zero, 0); + } } } @@ -277,6 +311,7 @@ GpuVerifyResult gpu_verify(const void* device_result, result_host.passed = 1; // Start as passed result_host.error_count = 0; // No errors yet result_host.max_error = 0.0f; // No error observed + result_host.all_zero = 1; // Start assuming all zeros (will be cleared if nonzero found) hip_check_error( hipMemcpy(result_dev, &result_host, sizeof(GpuVerifyDeviceResult), hipMemcpyHostToDevice)); @@ -312,6 +347,7 @@ GpuVerifyResult gpu_verify(const void* device_result, result.error_count = result_host.error_count; result.max_error = result_host.max_error; result.total = size; + result.all_zero = (result_host.all_zero == 1); return result; } From 3563cd228a559fc4bd2579968f143a8540644f2e Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Mon, 12 Jan 2026 09:10:58 -0500 Subject: [PATCH 3/8] non-negative error count doesn't need custom Atomics --- .../include/profiler/gpu_verification.hpp | 55 +++++++------------ 1 file changed, 19 insertions(+), 36 deletions(-) diff --git a/profiler/include/profiler/gpu_verification.hpp b/profiler/include/profiler/gpu_verification.hpp index d90aff120e9..4d0ed635bda 100644 --- 
a/profiler/include/profiler/gpu_verification.hpp +++ b/profiler/include/profiler/gpu_verification.hpp @@ -20,11 +20,11 @@ namespace profiler { // Provides backward compatibility via operator bool() struct GpuVerifyResult { - bool passed; // Overall pass/fail result - long long error_count; // Number of elements that exceeded tolerance - float max_error; // Maximum error value observed - std::size_t total; // Total number of elements compared - bool all_zero; // True if device result is all zeros (likely kernel issue) + bool passed; // Overall pass/fail result + unsigned long long error_count; // Number of elements that exceeded tolerance + float max_error; // Maximum error value observed + std::size_t total; // Total number of elements compared + bool all_zero; // True if device result is all zeros (likely kernel issue) // Implicit conversion to bool for backward compatibility // Allows: if (gpu_verify(...)) { ... } @@ -124,31 +124,14 @@ __device__ __forceinline__ float atomicMaxFloat(float* address, float val) return __int_as_float(old); } -// Helper function for atomic 64-bit add (using compare-and-swap) -// Needed because atomicAdd for long long isn't always available -__device__ __forceinline__ long long atomicAdd64(long long* address, long long val) -{ - unsigned long long* address_as_ull = reinterpret_cast(address); - unsigned long long old = *address_as_ull; - unsigned long long assumed; - - do - { - assumed = old; - old = atomicCAS(address_as_ull, assumed, assumed + static_cast(val)); - } while(assumed != old); - - return static_cast(old); -} - // Device-side result structure for kernel output // Packed into a single struct to minimize device memory allocations struct GpuVerifyDeviceResult { - int passed; // 1 = passed, 0 = failed - long long error_count; // Number of errors found - float max_error; // Maximum error value - int all_zero; // 1 = device result is all zeros, 0 = has non-zero values + int passed; // 1 = passed, 0 = failed + unsigned long long 
error_count; // Number of errors found + float max_error; // Maximum error value + int all_zero; // 1 = device result is all zeros, 0 = has non-zero values }; // GPU verification kernel - compares device result against reference using relative and absolute @@ -170,16 +153,16 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, constexpr int block_size = 256; // Shared memory for block-level reduction - __shared__ long long shared_error_count[block_size]; + __shared__ unsigned long long shared_error_count[block_size]; __shared__ float shared_max_error[block_size]; __shared__ int shared_has_error[block_size]; __shared__ int shared_has_nonzero[block_size]; // Thread-local accumulators (in registers) - long long local_error_count = 0; - float local_max_error = 0.0f; - int local_has_error = 0; - int local_has_nonzero = 0; + unsigned long long local_error_count = 0; + float local_max_error = 0.0f; + int local_has_error = 0; + int local_has_nonzero = 0; // Grid-stride loop to handle any tensor size long long idx = blockIdx.x * blockDim.x + threadIdx.x; @@ -235,10 +218,10 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, if(threadIdx.x < 32) { // Use volatile to prevent compiler from caching shared memory reads - volatile long long* smem_count = shared_error_count; - volatile float* smem_max = shared_max_error; - volatile int* smem_has = shared_has_error; - volatile int* smem_nonzero = shared_has_nonzero; + volatile unsigned long long* smem_count = shared_error_count; + volatile float* smem_max = shared_max_error; + volatile int* smem_has = shared_has_error; + volatile int* smem_nonzero = shared_has_nonzero; smem_count[threadIdx.x] += smem_count[threadIdx.x + 32]; smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 32]); @@ -277,7 +260,7 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, if(shared_has_error[0]) { atomicMin(&result->passed, 0); - atomicAdd64(&result->error_count, 
shared_error_count[0]); + atomicAdd(&result->error_count, shared_error_count[0]); atomicMaxFloat(&result->max_error, shared_max_error[0]); } // Update all_zero flag: if no nonzero values found, mark as all zero From 2f14048c633bc4e06d8dee9264046b8c5b322dba Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Mon, 12 Jan 2026 09:27:05 -0500 Subject: [PATCH 4/8] Remove unnecessary AtomicMaxFloat function --- .../include/profiler/gpu_verification.hpp | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/profiler/include/profiler/gpu_verification.hpp b/profiler/include/profiler/gpu_verification.hpp index 4d0ed635bda..da31771fb0a 100644 --- a/profiler/include/profiler/gpu_verification.hpp +++ b/profiler/include/profiler/gpu_verification.hpp @@ -107,23 +107,6 @@ inline float compute_relative_tolerance(const int number_of_accumulations = 1) } } -// Helper function for atomic float max (using compare-and-swap) -__device__ __forceinline__ float atomicMaxFloat(float* address, float val) -{ - int* address_as_int = reinterpret_cast(address); - int old = *address_as_int; - int assumed; - - do - { - assumed = old; - old = - atomicCAS(address_as_int, assumed, __float_as_int(fmaxf(val, __int_as_float(assumed)))); - } while(assumed != old); - - return __int_as_float(old); -} - // Device-side result structure for kernel output // Packed into a single struct to minimize device memory allocations struct GpuVerifyDeviceResult @@ -261,7 +244,7 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, { atomicMin(&result->passed, 0); atomicAdd(&result->error_count, shared_error_count[0]); - atomicMaxFloat(&result->max_error, shared_max_error[0]); + atomicMax(&result->max_error, shared_max_error[0]); } // Update all_zero flag: if no nonzero values found, mark as all zero if(!shared_has_nonzero[0]) From 715eb8dd04f3d6eed172b96369008cb0d877fc19 Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Mon, 12 Jan 2026 09:44:24 -0500 Subject: 
[PATCH 5/8] Simpler warp reduction, remove passed flag --- .../include/profiler/gpu_verification.hpp | 87 +++++-------------- 1 file changed, 20 insertions(+), 67 deletions(-) diff --git a/profiler/include/profiler/gpu_verification.hpp b/profiler/include/profiler/gpu_verification.hpp index da31771fb0a..c93f4deb8fc 100644 --- a/profiler/include/profiler/gpu_verification.hpp +++ b/profiler/include/profiler/gpu_verification.hpp @@ -20,7 +20,6 @@ namespace profiler { // Provides backward compatibility via operator bool() struct GpuVerifyResult { - bool passed; // Overall pass/fail result unsigned long long error_count; // Number of elements that exceeded tolerance float max_error; // Maximum error value observed std::size_t total; // Total number of elements compared @@ -28,7 +27,7 @@ struct GpuVerifyResult // Implicit conversion to bool for backward compatibility // Allows: if (gpu_verify(...)) { ... } - operator bool() const { return passed; } + operator bool() const { return error_count == 0; } // Calculate error percentage float error_percentage() const @@ -41,7 +40,7 @@ struct GpuVerifyResult // Print error summary to stderr (matches check_err format) void print_error_summary() const { - if(!passed) + if(error_count > 0) { if(all_zero) { @@ -111,7 +110,6 @@ inline float compute_relative_tolerance(const int number_of_accumulations = 1) // Packed into a single struct to minimize device memory allocations struct GpuVerifyDeviceResult { - int passed; // 1 = passed, 0 = failed unsigned long long error_count; // Number of errors found float max_error; // Maximum error value int all_zero; // 1 = device result is all zeros, 0 = has non-zero values @@ -196,53 +194,20 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, __syncthreads(); } - // Warp-level reduction: 32 -> 16 -> 8 -> 4 -> 2 -> 1 - // No sync needed within a warp (warp-synchronous programming) - if(threadIdx.x < 32) - { - // Use volatile to prevent compiler from caching shared memory reads 
- volatile unsigned long long* smem_count = shared_error_count; - volatile float* smem_max = shared_max_error; - volatile int* smem_has = shared_has_error; - volatile int* smem_nonzero = shared_has_nonzero; - - smem_count[threadIdx.x] += smem_count[threadIdx.x + 32]; - smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 32]); - smem_has[threadIdx.x] |= smem_has[threadIdx.x + 32]; - smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 32]; - - smem_count[threadIdx.x] += smem_count[threadIdx.x + 16]; - smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 16]); - smem_has[threadIdx.x] |= smem_has[threadIdx.x + 16]; - smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 16]; - - smem_count[threadIdx.x] += smem_count[threadIdx.x + 8]; - smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 8]); - smem_has[threadIdx.x] |= smem_has[threadIdx.x + 8]; - smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 8]; - - smem_count[threadIdx.x] += smem_count[threadIdx.x + 4]; - smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 4]); - smem_has[threadIdx.x] |= smem_has[threadIdx.x + 4]; - smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 4]; - - smem_count[threadIdx.x] += smem_count[threadIdx.x + 2]; - smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 2]); - smem_has[threadIdx.x] |= smem_has[threadIdx.x + 2]; - smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 2]; - - smem_count[threadIdx.x] += smem_count[threadIdx.x + 1]; - smem_max[threadIdx.x] = fmaxf(smem_max[threadIdx.x], smem_max[threadIdx.x + 1]); - smem_has[threadIdx.x] |= smem_has[threadIdx.x + 1]; - smem_nonzero[threadIdx.x] |= smem_nonzero[threadIdx.x + 1]; - } - - // Single atomic update per block (reduces contention from O(errors) to O(blocks)) + // Final reduction of remaining 32 elements in thread 0 if(threadIdx.x == 0) { + for(int i = 1; i < 32; ++i) + { + shared_error_count[0] 
+= shared_error_count[i]; + shared_max_error[0] = fmaxf(shared_max_error[0], shared_max_error[i]); + shared_has_error[0] |= shared_has_error[i]; + shared_has_nonzero[0] |= shared_has_nonzero[i]; + } + + // Single atomic update per block (reduces contention from O(errors) to O(blocks)) if(shared_has_error[0]) { - atomicMin(&result->passed, 0); atomicAdd(&result->error_count, shared_error_count[0]); atomicMax(&result->max_error, shared_max_error[0]); } @@ -274,7 +239,6 @@ GpuVerifyResult gpu_verify(const void* device_result, // Initialize result struct GpuVerifyDeviceResult result_host; - result_host.passed = 1; // Start as passed result_host.error_count = 0; // No errors yet result_host.max_error = 0.0f; // No error observed result_host.all_zero = 1; // Start assuming all zeros (will be cleared if nonzero found) @@ -309,7 +273,6 @@ GpuVerifyResult gpu_verify(const void* device_result, // Build and return result struct GpuVerifyResult result; - result.passed = (result_host.passed == 1); result.error_count = result_host.error_count; result.max_error = result_host.max_error; result.total = size; @@ -399,26 +362,16 @@ gpu_reduce_max_kernel(const T* __restrict__ data, long long size, float* __restr __syncthreads(); } - // Warp-level reduction: 32 -> 16 -> 8 -> 4 -> 2 -> 1 - // No sync needed within a warp - if(threadIdx.x < 32) - { - volatile float* smem = shared_max; - smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 32]); - smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 16]); - smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 8]); - smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 4]); - smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 2]); - smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + 1]); - } - - // Two-phase reduction pattern minimizes atomic contention: - // 1. Each block reduces to shared memory (above) - // 2. 
Single thread per block updates global max (below) - // This limits atomic operations to O(grid_size) rather than O(total_threads) + // Final reduction of remaining 32 elements in thread 0 if(threadIdx.x == 0) { - atomicMaxFloat(max_val, shared_max[0]); + for(int i = 1; i < 32; ++i) + { + shared_max[0] = fmaxf(shared_max[0], shared_max[i]); + } + + // Single atomic update per block + atomicMax(max_val, shared_max[0]); } } From a8354cc5acd4c305c4d301728448ff66e43ac035 Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Mon, 12 Jan 2026 10:15:02 -0500 Subject: [PATCH 6/8] Move verification header to include --- .../ck/library/utility}/gpu_verification.hpp | 0 .../include/profiler/profile_grouped_conv_bwd_data_impl.hpp | 2 +- .../include/profiler/profile_grouped_conv_bwd_weight_impl.hpp | 2 +- profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename {profiler/include/profiler => include/ck/library/utility}/gpu_verification.hpp (100%) diff --git a/profiler/include/profiler/gpu_verification.hpp b/include/ck/library/utility/gpu_verification.hpp similarity index 100% rename from profiler/include/profiler/gpu_verification.hpp rename to include/ck/library/utility/gpu_verification.hpp diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp index 1946ff768b7..907b3ed2423 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp @@ -20,7 +20,7 @@ #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_data.hpp" #include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp" -#include "profiler/gpu_verification.hpp" +#include "ck/library/utility/gpu_verification.hpp" namespace ck { namespace profiler { diff --git 
a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp index ee1cf89aaf0..ab118fdebf5 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp @@ -24,7 +24,7 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_bwd_weight.hpp" #include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp" -#include "profiler/gpu_verification.hpp" +#include "ck/library/utility/gpu_verification.hpp" namespace ck { namespace profiler { diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index 0ef55468764..3af523723cd 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -23,7 +23,7 @@ #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" #include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp" -#include "profiler/gpu_verification.hpp" +#include "ck/library/utility/gpu_verification.hpp" namespace ck { namespace profiler { From c20a181ba9eb315bec2d96edede3052c94a987a6 Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Tue, 13 Jan 2026 01:53:13 -0500 Subject: [PATCH 7/8] Fix header path in test --- test/gpu_verification/test_gpu_verification.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/gpu_verification/test_gpu_verification.cpp b/test/gpu_verification/test_gpu_verification.cpp index 977475f0649..3be950c2b66 100644 --- a/test/gpu_verification/test_gpu_verification.cpp +++ b/test/gpu_verification/test_gpu_verification.cpp @@ -13,8 +13,8 @@ #include "ck/library/utility/host_tensor.hpp" 
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/gpu_verification.hpp" #include "ck/library/reference_tensor_operation/gpu/naive_conv_utils.hpp" -#include "profiler/gpu_verification.hpp" using namespace ck::profiler; using ck::ref::SimpleDeviceMem; From 218d6c49bb09e0ce5c440cf3c086afefc9c46a24 Mon Sep 17 00:00:00 2001 From: "Graner, Johannes" Date: Tue, 13 Jan 2026 05:16:07 -0500 Subject: [PATCH 8/8] Fix block reduction loop --- include/ck/library/utility/gpu_verification.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/ck/library/utility/gpu_verification.hpp b/include/ck/library/utility/gpu_verification.hpp index c93f4deb8fc..e4a444ecb9c 100644 --- a/include/ck/library/utility/gpu_verification.hpp +++ b/include/ck/library/utility/gpu_verification.hpp @@ -181,7 +181,7 @@ __global__ void gpu_verify_kernel(const T* __restrict__ device_result, __syncthreads(); // Block-level reduction: 256 -> 128 -> 64 -> 32 - for(unsigned int s = block_size / 2; s > 32; s >>= 1) + for(unsigned int s = block_size / 2; s >= 32; s >>= 1) { if(threadIdx.x < s) { @@ -353,7 +353,7 @@ gpu_reduce_max_kernel(const T* __restrict__ data, long long size, float* __restr __syncthreads(); // Block-level reduction: 256 -> 128 -> 64 -> 32 - for(unsigned int s = block_size / 2; s > 32; s >>= 1) + for(unsigned int s = block_size / 2; s >= 32; s >>= 1) { if(threadIdx.x < s) {