docs/user_guide/errata.dox (10 changes: 9 additions & 1 deletion)
@@ -1,5 +1,5 @@
 ///
-/// Copyright (c) 2019-2024 Arm Limited.
+/// Copyright (c) 2019-2025 Arm Limited.
 ///
 /// SPDX-License-Identifier: MIT
 ///
@@ -30,6 +30,14 @@ namespace arm_compute

 @section S7_1_errata Errata
 
+- (COMPMID-8727) An issue has been identified with the FP16 MMUL Reshaped RHS kernel for small N.
+  - Versions Affected: >= v52.5.0 && <= v52.7.0
+  - Conditions:
+    - One way to trigger the issue is to select the kernel through its heuristic function (e.g. via the CLGemm operator's configure() call stack) when the N dimension at the kernel level is less than or equal to 32.
+      - In those cases the heuristic generates an invalid configuration in which the product, for the N dimension, of the MMUL block size (MMUL_N0) and the conventional block size (N0) is larger than the N dimension itself.
+    - Another way is to use the kernel directly with N less than or equal to MMUL_N0 * N0, since the validation in the kernel-level configure() function does not catch this case.
+  - Result: The FP16 MMUL Reshaped RHS kernel is configured with block sizes that cause its operations to extend beyond the tensor dimensions.
+
 - (COMPMID-7536) NEDepthwiseConvolutionLayer and NEPoolingLayer may produce wrong results with OpenMP® scheduler
   - Versions: >= v24.05 && < v24.08
   - OSes: All
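The erratum above hinges on a single inequality: a Reshaped RHS MMUL configuration is only usable when N is strictly greater than N0 * MMUL_N0. Below is a minimal standalone sketch of that condition; the helper name and the block-size values (N0 = 8, MMUL_N0 = 4) are assumptions for illustration, not the library's API:

```cpp
#include <iostream>

// Hypothetical helper mirroring the erratum's condition: the configuration is
// invalid whenever a single N-dimension block (N0 * MMUL_N0 columns) would
// extend beyond the tensor's N dimension.
bool mmul_block_config_is_valid(unsigned int n, unsigned int n0, unsigned int mmul_n0)
{
    return n > n0 * mmul_n0;
}

int main()
{
    const unsigned int n0 = 8, mmul_n0 = 4; // assumed block sizes
    std::cout << mmul_block_config_is_valid(32, n0, mmul_n0) << '\n'; // 0: 32 <= 8 * 4, invalid
    std::cout << mmul_block_config_is_valid(79, n0, mmul_n0) << '\n'; // 1: 79 > 8 * 4, valid
    return 0;
}
```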
@@ -100,7 +100,7 @@ Status validate_arguments(const ITensorInfo *src0,
 if (arm_matrix_multiply_fp16_supported(CLKernelLibrary::get().get_device()))
 {
     // These error messages are for FP16 acc.
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((n > rhs_info.n0 * mmul_n0),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((n <= rhs_info.n0 * mmul_n0),
                                     "N must be greater than N0 * MMUL_N0 in the FP16 MMUL Kernel");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k % 4) != 0), "K must be multiple of 4 in the FP16 MMUL Kernel");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((m < 4), "M must be greater than or equal to 4 in the FP16 MMUL Kernel");
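The one-line change in this hunk is subtle: ARM_COMPUTE_RETURN_ERROR_ON_MSG returns an error Status when its condition is true, so the old condition (n > rhs_info.n0 * mmul_n0) fired on exactly the valid sizes while letting the undersized ones through, contradicting its own message. A plain-C++ restatement of the corrected check, with assumed block sizes:

```cpp
#include <cassert>

// Plain-C++ restatement of the corrected validation. The old condition
// rejected the valid cases; the fixed condition (n <= n0 * mmul_n0) rejects
// the undersized N the errata entry describes.
bool n_dimension_is_valid(unsigned int n, unsigned int n0, unsigned int mmul_n0)
{
    // Error condition: N must be strictly greater than N0 * MMUL_N0.
    return !(n <= n0 * mmul_n0);
}

int main()
{
    const unsigned int n0 = 8, mmul_n0 = 4; // assumed block sizes for illustration
    assert(!n_dimension_is_valid(32, n0, mmul_n0)); // rejected: 32 <= 32
    assert(n_dimension_is_valid(79, n0, mmul_n0));  // accepted: 79 > 32
    return 0;
}
```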
src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp (2 changes: 1 addition & 1 deletion)
@@ -198,7 +198,7 @@ bool is_mmul_kernel_preferred_fp16_acc(const unsigned int m,
     const unsigned int m_div_m0 = ceil_to_multiple_m_m0 / best_m0;
     const unsigned int ceil_to_multiple_m_div_m0_mmul_k0 = ceil_to_multiple(m_div_m0, mmul_k0);
     const unsigned int gws_y = ceil_to_multiple_m_div_m0_mmul_k0 / mmul_k0;
-    return ((k % mmul_k0) == 0) && (gws_y >= 4);
+    return ((k % mmul_k0) == 0) && (gws_y >= 4) && (n > best_n0 * mmul_n0);
 }
 
 return false;
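Read in isolation, the amended return statement combines three gates: K must divide evenly into the MMUL block, the M dimension must yield enough work groups, and, newly, an N block must fit inside the tensor. A compilable sketch under simplifying assumptions (the block sizes, derived internally in the library, are parameters here, and ceil_to_multiple is re-implemented so the snippet stands alone):

```cpp
// Standalone re-implementation of the library helper of the same name:
// rounds value up to the nearest multiple of divisor.
unsigned int ceil_to_multiple(unsigned int value, unsigned int divisor)
{
    return ((value + divisor - 1) / divisor) * divisor;
}

// Sketch of the amended heuristic; signature and parameter passing are
// simplified relative to the library function.
bool is_mmul_kernel_preferred_fp16_acc_sketch(unsigned int m, unsigned int n, unsigned int k,
                                              unsigned int best_m0, unsigned int best_n0,
                                              unsigned int mmul_k0, unsigned int mmul_n0)
{
    const unsigned int m_div_m0 = ceil_to_multiple(m, best_m0) / best_m0;
    const unsigned int gws_y    = ceil_to_multiple(m_div_m0, mmul_k0) / mmul_k0;
    // The final clause is the new guard: never prefer the MMUL kernel when an
    // N block of best_n0 * mmul_n0 columns would not fit inside N.
    return ((k % mmul_k0) == 0) && (gws_y >= 4) && (n > best_n0 * mmul_n0);
}
```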
@@ -75,7 +75,7 @@ const auto m_values = make("M", {49});

 /** N values to test */
 const auto n_values = make("N", {257, 64, 48});
-const auto n_values_fp16 = make("N", {79, 32, 80});
+const auto n_values_fp16 = make("N", {79, 80});
 const auto n_values_texture_fp16 = make("N", {128, 96, 48});
 
 /** K values to test */
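The value 32 disappears from n_values_fp16 as a consequence of the corrected validation: if the selected FP16 MMUL block sizes satisfy N0 * MMUL_N0 == 32 (for example N0 = 8 and MMUL_N0 = 4, an assumption here), then N = 32 fails the strict N > N0 * MMUL_N0 requirement and is no longer a valid FP16 MMUL test shape, while 79 and 80 both remain valid.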