
Commit fbef36b

cangtianhuang authored and LittleHeroZZZX committed
Add MixedPrecision AddGrad (PaddlePaddle#76178)
* add MixedPrecisionAddGrad
* revert
* refine
* refine
* refine
* refine
* fix kernel
* add IndexT, add test
* refine
1 parent 51c70b8 commit fbef36b

File tree

6 files changed: +316 -17 lines


paddle/phi/common/type_promotion.h

Lines changed: 2 additions & 10 deletions

@@ -205,18 +205,10 @@ inline bool NeedTypePromotion(
   // floating-point numbers and between complex and real numbers.
   if (x_dtype != y_dtype) {
     // TODO(Xi Zhao): we got special case for add now, should remove it in future.
-#ifdef PADDLE_WITH_CUDA
-    if ((op_name == "add" || op_name == "add_") &&
-        x_dtype == DataType::FLOAT32 &&
-        (y_dtype == phi::DataType::BFLOAT16 ||
-         y_dtype == phi::DataType::FLOAT16)) {
-      return false;
-    }
-#elif defined(PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU)
     if ((op_name == "add" || op_name == "add_") &&
         x_dtype == DataType::FLOAT32 &&
-        (y_dtype == phi::DataType::BFLOAT16 ||
-         y_dtype == phi::DataType::FLOAT16)) {
+        (y_dtype == DataType::FLOAT16 || y_dtype == DataType::BFLOAT16)) {
       return false;
     }
 #endif

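For orientation, the merged guard can be read as a small standalone predicate. The sketch below is illustrative only — the DType enum and SkipPromotionForAdd name are hypothetical stand-ins, not Paddle's API — and simply restates the dtype pairs for which NeedTypePromotion now returns false on both CUDA and XPU builds, leaving the fp32 + fp16/bf16 pair to the add kernel itself.

#include <string>

// Hypothetical stand-in for phi::DataType; for illustration only.
enum class DType { FLOAT32, FLOAT16, BFLOAT16 };

// Mirrors the condition above: an "add"/"add_" of fp32 with fp16 or bf16
// skips automatic type promotion, so no up-cast of y is inserted.
bool SkipPromotionForAdd(const std::string& op_name, DType x, DType y) {
  return (op_name == "add" || op_name == "add_") && x == DType::FLOAT32 &&
         (y == DType::FLOAT16 || y == DType::BFLOAT16);
}
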
paddle/phi/kernels/gpu/elementwise_grad.h

Lines changed: 139 additions & 0 deletions

@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/elementwise_grad_base.h"
 #include "paddle/phi/kernels/funcs/reduce_function.h"
 #include "paddle/phi/kernels/reduce_sum_kernel.h"
+
 namespace phi {
 
 template <typename T>
@@ -112,6 +113,144 @@ void GetGradXOrYOut(const GPUContext &dev_ctx,
 ******************************
 */
 
+template <typename T>
+struct alignas(sizeof(T) * 4) Pack4 {
+  T val[4];
+};
+
+template <typename T_dy, typename IndexT = int>
+static __global__ void MixedPrecisionElemwiseAddGradCUDAKernel(
+    const float *__restrict__ dout,
+    IndexT size,
+    float *__restrict__ dx,
+    T_dy *__restrict__ dy) {
+  IndexT tid = static_cast<IndexT>(blockIdx.x) * blockDim.x + threadIdx.x;
+  IndexT stride = static_cast<IndexT>(gridDim.x) * blockDim.x;
+
+  constexpr int vec_size = 4;
+  IndexT loop = size / vec_size;
+  IndexT remainder = size % vec_size;
+
+  const float4 *__restrict__ dout_vec = reinterpret_cast<const float4 *>(dout);
+  float4 *__restrict__ dx_vec = reinterpret_cast<float4 *>(dx);
+  Pack4<T_dy> *__restrict__ dy_vec = reinterpret_cast<Pack4<T_dy> *>(dy);
+
+  for (IndexT i = tid; i < loop; i += stride) {
+    float4 val = __ldg(dout_vec + i);
+    dx_vec[i] = val;
+
+    Pack4<T_dy> dy_pack;
+    dy_pack.val[0] = static_cast<T_dy>(val.x);
+    dy_pack.val[1] = static_cast<T_dy>(val.y);
+    dy_pack.val[2] = static_cast<T_dy>(val.z);
+    dy_pack.val[3] = static_cast<T_dy>(val.w);
+    dy_vec[i] = dy_pack;
+  }
+
+  if (remainder != 0) {
+    IndexT tail_start = loop * vec_size;
+    for (IndexT i = tail_start + tid; i < size; i += stride) {
+      float val = __ldg(dout + i);
+      dx[i] = val;
+      dy[i] = static_cast<T_dy>(val);
+    }
+  }
+}
+
+template <typename T_dy>
+void ElementwiseMixedPrecisionAddGrad(const GPUContext &dev_ctx,
+                                      const DenseTensor &dout,
+                                      DenseTensor *dx,
+                                      DenseTensor *dy) {
+  using T_dout = float;
+  using T_dx = float;
+
+  auto *dx_data = dev_ctx.template Alloc<T_dx>(dx);
+  T_dy *dy_data = dev_ctx.template Alloc<T_dy>(dy);
+  auto *dout_data = dout.data<T_dout>();
+
+  if (dx_data == dout_data) {
+    VLOG(7) << "Special case when dx_data is the same as dout_data, "
+               "need cast dout to dy.";
+    phi::CastKernel<T_dout>(dev_ctx, dout, dy->dtype(), dy);
+    return;
+  }
+
+  auto size = dout.numel();
+  if (size == 0) return;
+
+  constexpr int vec_size = 4;
+  const int64_t main_size = (size / vec_size) * vec_size;
+  const int block_size = PREDEFINED_BLOCK_SIZE;
+  const int grid_size =
+      std::min(static_cast<int>((main_size + block_size - 1) / block_size),
+               (dev_ctx.GetMaxPhysicalThreadCount() / block_size));
+
+  dim3 grid_dim(grid_size, 1, 1);
+  dim3 block_dim(block_size, 1, 1);
+
+  if (size < std::numeric_limits<int>::max()) {
+    MixedPrecisionElemwiseAddGradCUDAKernel<T_dy, int>
+        <<<grid_dim, block_dim, 0, dev_ctx.stream()>>>(
+            dout_data, static_cast<int>(size), dx_data, dy_data);
+  } else {
+    MixedPrecisionElemwiseAddGradCUDAKernel<T_dy, int64_t>
+        <<<grid_dim, block_dim, 0, dev_ctx.stream()>>>(
+            dout_data, static_cast<int64_t>(size), dx_data, dy_data);
+  }
+}
+
+template <typename T_dy>
+void DefaultMixedPrecisionAddGrad(const GPUContext &dev_ctx,
+                                  const DenseTensor &x,
+                                  const DenseTensor &y,
+                                  const DenseTensor &dout,
+                                  DenseTensor *dx,
+                                  DenseTensor *dy,
+                                  int axis = -1) {
+  using T_dout = float;
+  using T_dx = float;
+
+  auto *dout_data = dout.data<T_dout>();
+
+  // dx
+  if (dx != nullptr) {
+    auto *dx_data = dev_ctx.template Alloc<T_dx>(dx);
+    if (dx->dims() == dout.dims()) {
+      if (dx_data != dout_data) {
+        phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
+      }
+    } else {
+      if (dx->IsSharedBufferWith(dout)) {
+        dx->clear();
+        dx->Resize(x.dims());
+        dev_ctx.template Alloc<T_dx>(dx);
+      }
+      std::vector<int> reduce_dims =
+          funcs::GetReduceDim(x.dims(), dout.dims(), axis);
+      phi::SumKernel<T_dout, GPUContext>(
+          dev_ctx, dout, reduce_dims, dout.dtype(), false, dx);
+    }
+  }
+
+  // dy
+  if (dy != nullptr) {
+    auto *dy_data = dev_ctx.template Alloc<T_dy>(dy);
+    if (dy->dims() == dout.dims()) {
+      phi::CastKernel<T_dout>(dev_ctx, dout, dy->dtype(), dy);
+    } else {
+      DenseTensor dy_fp32;
+      dy_fp32.Resize(dout.dims());
+      dev_ctx.template Alloc<float>(&dy_fp32);
+      std::vector<int> reduce_dims =
+          funcs::GetReduceDim(y.dims(), dout.dims(), axis);
+      phi::SumKernel<float, GPUContext>(
+          dev_ctx, dout, reduce_dims, dout.dtype(), false, &dy_fp32);
+      phi::CastKernel<float>(dev_ctx, dy_fp32, dy->dtype(), dy);
+    }
+  }
+}
+
 template <typename T, typename IndexT = int>
 static __global__ void SimpleElemwiseAddGradCUDAKernel(
     const T *__restrict__ dout, IndexT size, int vec_size, T *dx, T *dy) {

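The kernel above relies on Pack4 giving the down-cast gradients the same vector-friendly layout that float4 gives the fp32 side: four 2-byte elements packed into one 8-byte, 8-byte-aligned object, so each main-loop iteration issues one 16-byte load of dout, one 16-byte store of dx, and one 8-byte store of dy. Below is a minimal host-side check of that layout property, using a hypothetical 2-byte Half16 stand-in for phi::float16 / phi::bfloat16 (illustration only, not Paddle code).

#include <cstdint>

// Hypothetical 2-byte stand-in for phi::float16 / phi::bfloat16.
struct Half16 { uint16_t bits; };

template <typename T>
struct alignas(sizeof(T) * 4) Pack4 {
  T val[4];
};

// One Pack4<Half16> store writes four casted gradients at once (8 bytes),
// matching the float4 (16-byte) accesses on the dout/dx side.
static_assert(sizeof(Pack4<Half16>) == 8 && alignof(Pack4<Half16>) == 8, "");
static_assert(sizeof(Pack4<float>) == 16 && alignof(Pack4<float>) == 16, "");

int main() { return 0; }

The grid-stride loops are also why the launch can clamp grid_size to GetMaxPhysicalThreadCount() / block_size: even when the clamped grid is smaller than ceil(main_size / block_size), each thread keeps striding until every element is covered.
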
paddle/phi/kernels/gpu/elementwise_grad_kernel.cu

Lines changed: 56 additions & 0 deletions

@@ -115,6 +115,54 @@ void DivideGradKernel(const Context& dev_ctx,
   }
 }
 
+template <typename T>
+void MixedPrecisionAddGradFunc(const GPUContext& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& y,
+                               const DenseTensor& out,
+                               const DenseTensor& dout,
+                               DenseTensor* dx,
+                               DenseTensor* dy,
+                               int axis = -1) {
+  const auto& x_dtype = x.dtype();
+  const auto& y_dtype = y.dtype();
+  bool no_broadcast =
+      (dx && dy && dx->dims() == dy->dims() && dx->dims() == dout.dims());
+  if (no_broadcast) {
+    // Dispatch to non-broadcast (elementwise) kernels
+    if (x_dtype == phi::DataType::FLOAT32 &&
+        y_dtype == phi::DataType::FLOAT16) {
+      ElementwiseMixedPrecisionAddGrad<phi::float16>(dev_ctx, dout, dx, dy);
+    } else if (x_dtype == phi::DataType::FLOAT32 &&
+               y_dtype == phi::DataType::BFLOAT16) {
+      ElementwiseMixedPrecisionAddGrad<phi::bfloat16>(dev_ctx, dout, dx, dy);
+    } else {
+      PADDLE_THROW(common::errors::Unimplemented(
+          "Unsupported mixed precision combination for AddGrad non-broadcast "
+          "path: x_dtype=%s, y_dtype=%s",
+          phi::DataTypeToString(x_dtype),
+          phi::DataTypeToString(y_dtype)));
+    }
+  } else {
+    // Dispatch to broadcast-aware kernels
+    if (x_dtype == phi::DataType::FLOAT32 &&
+        y_dtype == phi::DataType::FLOAT16) {
+      DefaultMixedPrecisionAddGrad<phi::float16>(
+          dev_ctx, x, y, dout, dx, dy, axis);
+    } else if (x_dtype == phi::DataType::FLOAT32 &&
+               y_dtype == phi::DataType::BFLOAT16) {
+      DefaultMixedPrecisionAddGrad<phi::bfloat16>(
+          dev_ctx, x, y, dout, dx, dy, axis);
+    } else {
+      PADDLE_THROW(common::errors::Unimplemented(
+          "Unsupported mixed precision combination for AddGrad broadcast path: "
+          "x_dtype=%s, y_dtype=%s",
+          phi::DataTypeToString(x_dtype),
+          phi::DataTypeToString(y_dtype)));
+    }
+  }
+}
+
 template <typename T>
 void AddGradFunc(const GPUContext& dev_ctx,
                  const DenseTensor& x,
@@ -139,6 +187,14 @@ void AddGradKernel(const Context& dev_ctx,
                    int axis,
                    DenseTensor* dx,
                    DenseTensor* dy) {
+#ifdef PADDLE_WITH_CUDA
+  if (x.dtype() == DataType::FLOAT32 &&
+      (y.dtype() == DataType::FLOAT16 || y.dtype() == DataType::BFLOAT16)) {
+    phi::MixedPrecisionAddGradImpl<float>(
+        dev_ctx, x, y, dout, axis, dx, dy, MixedPrecisionAddGradFunc<float>);
+    return;
+  }
+#endif
   phi::AddGradImpl<T>(dev_ctx, x, y, dout, axis, dx, dy, AddGradFunc<T>);
 }

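MixedPrecisionAddGradFunc only splits on whether both gradients share dout's shape; when they do not, it hands the work to DefaultMixedPrecisionAddGrad. The snippet below is a hypothetical worked example of that broadcast branch (shapes are illustrative, not from the commit): with x fp32 of shape [2, 3] and y fp16/bf16 broadcast from shape [3], dx is simply dout, while dy is the column-wise sum of dout accumulated in fp32 and only then cast down to y's dtype — the same order of operations as the SumKernel-into-dy_fp32-then-CastKernel sequence above, which keeps the reduction accurate before precision is dropped.

#include <cstdio>

// Hypothetical shapes: dout is fp32 [2][3]; y was broadcast from shape [3].
int main() {
  const float dout[2][3] = {{0.1f, 0.2f, 0.3f}, {0.4f, 0.5f, 0.6f}};
  float dy_fp32[3] = {0.0f, 0.0f, 0.0f};
  for (int i = 0; i < 2; ++i) {
    for (int j = 0; j < 3; ++j) {
      dy_fp32[j] += dout[i][j];  // reduce over the broadcast axis in fp32
    }
  }
  for (int j = 0; j < 3; ++j) {
    std::printf("dy[%d] = %.2f (cast to fp16/bf16 afterwards)\n", j, dy_fp32[j]);
  }
  return 0;
}
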
paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h

Lines changed: 27 additions & 0 deletions

@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/tensor_utils.h"
+#include "paddle/phi/kernels/cast_kernel.h"
 #include "paddle/phi/kernels/expand_kernel.h"
 #include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
@@ -28,6 +29,31 @@ limitations under the License. */
 
 namespace phi {
 
+template <typename T, typename Context, typename GradFunc>
+void MixedPrecisionAddGradImpl(const Context& dev_ctx,
+                               const DenseTensor& x,
+                               const DenseTensor& y,
+                               const DenseTensor& out_grad,
+                               int axis,
+                               DenseTensor* x_grad,
+                               DenseTensor* y_grad,
+                               GradFunc grad_func) {
+  phi::funcs::ElementwiseGradPreProcess(out_grad, x_grad);
+  phi::funcs::ElementwiseGradPreProcess(out_grad, y_grad);
+  auto* out = &out_grad;
+  if (x_grad != nullptr && y_grad == nullptr &&
+      x_grad->dims() == out_grad.dims()) {
+    VLOG(4) << "Mixed precision: only x_grad needed, no reduce";
+    phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
+  } else if (x_grad == nullptr && y_grad != nullptr &&
+             y_grad->dims() == out_grad.dims()) {
+    VLOG(4) << "Mixed precision: only y_grad needed, no reduce";
+    phi::CastKernel<T>(dev_ctx, out_grad, y.dtype(), y_grad);
+  } else {
+    grad_func(dev_ctx, x, y, *out, out_grad, x_grad, y_grad, axis);
+  }
+}
+
 template <typename T, typename Context, typename GradFunc>
 void AddGradImpl(const Context& dev_ctx,
                  const DenseTensor& x,
@@ -38,6 +64,7 @@ void AddGradImpl(const Context& dev_ctx,
                  DenseTensor* y_grad,
                  GradFunc grad_func) {
   phi::funcs::ElementwiseGradPreProcess(out_grad, x_grad);
+  phi::funcs::ElementwiseGradPreProcess(out_grad, y_grad);
   auto* out = &out_grad;
   // Special case when y_grad is not needed and x_grad doesn't reduce
   if (x_grad != nullptr && y_grad == nullptr &&

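MixedPrecisionAddGradImpl follows the same shape as the existing AddGradImpl: handle the two cheap single-output cases inline (copy dout into x_grad, or cast dout into y_grad) and defer everything else to the grad_func callback. A minimal sketch of that control flow, assuming a hypothetical Tensor type and print statements standing in for the real copy/cast/reduce work:

#include <cstdint>
#include <iostream>

struct Tensor { int64_t numel = 0; };  // hypothetical stand-in for DenseTensor

template <typename GradFunc>
void AddGradImplSketch(const Tensor& dout, Tensor* dx, Tensor* dy,
                       GradFunc grad_func) {
  if (dx != nullptr && dy == nullptr && dx->numel == dout.numel) {
    std::cout << "fast path: copy dout into dx (no reduce, no cast)\n";
  } else if (dx == nullptr && dy != nullptr && dy->numel == dout.numel) {
    std::cout << "fast path: cast dout into dy (mixed-precision case)\n";
  } else {
    grad_func(dout, dx, dy);  // general path: broadcast reduce and/or cast
  }
}

int main() {
  Tensor dout{8}, dx{8}, dy{8};
  AddGradImplSketch(dout, &dx, nullptr, [](const Tensor&, Tensor*, Tensor*) {});
  AddGradImplSketch(dout, &dx, &dy, [](const Tensor&, Tensor*, Tensor*) {
    std::cout << "general path: both gradients requested\n";
  });
  return 0;
}
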
paddle/phi/kernels/kps/elementwise_kernel.cu

Lines changed: 4 additions & 7 deletions

@@ -100,16 +100,13 @@ void AddKernel(const Context& dev_ctx,
     return;
   }
 #ifdef PADDLE_WITH_CUDA
-  if (x.dtype() == phi::DataType::FLOAT32 &&
-      (y.dtype() == phi::DataType::BFLOAT16 ||
-       y.dtype() == phi::DataType::FLOAT16)) {
+  if (x.dtype() == DataType::FLOAT32 &&
+      (y.dtype() == DataType::FLOAT16 || y.dtype() == DataType::BFLOAT16)) {
     MultiPrecisionAddKernelImpl<float, Context>(dev_ctx, x, y, out);
-  } else {
-#endif
-    phi::AddRawKernel<T, Context>(dev_ctx, x, y, -1, out);
-#ifdef PADDLE_WITH_CUDA
+    return;
   }
 #endif
+  phi::AddRawKernel<T, Context>(dev_ctx, x, y, -1, out);
 }
 
 template <typename T, typename Context>

0 commit comments
