
Commit 14509d0

Enhance the robustness of slice kernels when processing big Tensors. (#76278) (#76302)
* slice big tensor
* enhance the robustness of the slice kernels for big tensors

---------

Co-authored-by: Wang Huan <[email protected]>
1 parent d6ea543 commit 14509d0

11 files changed: +54, -28 lines
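The common thread in the hunks below is that CUDA's built-in index variables (threadIdx, blockIdx, blockDim, gridDim) are 32-bit unsigned values, so a product such as blockIdx.x * blockDim.x is evaluated in 32-bit arithmetic and wraps around before it is widened to the 64-bit variable it is assigned to. A minimal host-side sketch of that failure mode, with hypothetical values and plain C++ standing in for the device built-ins:

#include <cstdint>
#include <cstdio>

int main() {
  // Stand-ins for the CUDA built-ins, which are 32-bit unsigned on device.
  unsigned int block_idx = 70000000u;  // plausible for a multi-billion-element launch
  unsigned int block_dim = 128u;

  // Evaluated entirely in 32-bit unsigned arithmetic: the product
  // 8,960,000,000 wraps to 370,065,408 before the assignment widens it.
  int64_t wrapped = block_idx * block_dim;

  // Casting one operand first forces 64-bit arithmetic, as the hunks below do.
  int64_t correct = static_cast<int64_t>(block_idx) * block_dim;

  std::printf("wrapped = %lld, correct = %lld\n",
              static_cast<long long>(wrapped),
              static_cast<long long>(correct));
  return 0;
}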

paddle/fluid/pybind/slice_utils.h

Lines changed: 1 addition & 1 deletion
@@ -740,7 +740,7 @@ static std::vector<paddle::Tensor> PrepareIndices(
     const paddle::Tensor& bool_2_idx,
     const paddle::Tensor& bool_index) {
   std::vector<paddle::Tensor> indices;
-  for (int j = 0; j < bool_2_idx.shape()[1]; ++j) {
+  for (int64_t j = 0; j < bool_2_idx.shape()[1]; ++j) {
     paddle::Tensor sliced_tensor =
         slice_ad_func(bool_2_idx, {1}, {j}, {j + 1}, {1}, {});
     paddle::Tensor sliced_tensor_c = sliced_tensor.contiguous();

paddle/phi/kernels/funcs/gather.cu.h

Lines changed: 4 additions & 2 deletions
@@ -155,9 +155,11 @@ __global__ void GatherGPUKernel(const T* input,
                                 int64_t input_index_dim_size,
                                 int64_t size) {
   int64_t block_size = blockDim.x;
-  int64_t idx = (blockIdx.x * block_size + threadIdx.x) * VecSize;
+  int64_t idx =
+      (static_cast<int64_t>(blockIdx.x) * block_size + threadIdx.x) * VecSize;
   int64_t outer_size = outer_dim_size * out_index_dim_size;
-  for (; idx < size; idx += gridDim.x * block_size * VecSize) {
+  for (; idx < size;
+       idx += static_cast<int64_t>(gridDim.x) * block_size * VecSize) {
     int64_t inner_dim_index = idx / outer_size;
     int64_t next_idx = idx % outer_size;
     int64_t index_dim_index = next_idx / outer_dim_size;
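The gather hunk above widens both the starting offset and the grid stride. A standalone sketch of the same grid-stride idiom with 64-bit indexing (an illustration of the pattern, not the Paddle kernel itself):

#include <cstdint>

template <typename T>
__global__ void ScaleByTwoKernel(const T* in, T* out, int64_t n) {
  // Widen blockIdx.x before the multiply so the starting offset is computed
  // in 64-bit arithmetic even when blockIdx.x * blockDim.x exceeds 2^32.
  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  // The stride needs the same treatment: gridDim.x * blockDim.x can also
  // exceed 32 bits for very large launches.
  int64_t stride = static_cast<int64_t>(gridDim.x) * blockDim.x;
  for (; idx < n; idx += stride) {
    out[idx] = in[idx] * static_cast<T>(2);
  }
}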

paddle/phi/kernels/funcs/index_elementwise.cu.h

Lines changed: 9 additions & 3 deletions
@@ -38,7 +38,7 @@ __global__ void index_elementwise_with_tensor_kernel(const int64_t N,
                                                      const func_t f) {
   const auto tid = threadIdx.x;
   const auto nv = nt * vt;
-  auto idx = nv * blockIdx.x + tid;
+  int64_t idx = static_cast<int64_t>(nv) * blockIdx.x + tid;
 #pragma unroll
   for (int i = 0; i < vt; i++) {
     if (idx < N) {
@@ -54,7 +54,7 @@ __global__ void index_elementwise_kernel(const int64_t N,
                                          const func_t f) {
   const auto tid = threadIdx.x;
   const auto nv = nt * vt;
-  auto idx = nv * blockIdx.x + tid;
+  int64_t idx = static_cast<int64_t>(nv) * blockIdx.x + tid;
 #pragma unroll
   for (int i = 0; i < vt; i++) {
     if (idx < N) {
@@ -70,7 +70,7 @@ __global__ void index_put_kernel(const int64_t N,
                                  const func_t f) {
   const auto tid = threadIdx.x;
   const auto nv = nt * vt;
-  auto idx = nv * blockIdx.x + tid;
+  int64_t idx = static_cast<int64_t>(nv) * blockIdx.x + tid;
 #pragma unroll
   for (int i = 0; i < vt; i++) {
     if (idx < N) {
@@ -227,6 +227,12 @@ static OffsetCalculator<N, uint32_t, signed_strides> make_offset_calculator(
   return OffsetCalculator<N, uint32_t, signed_strides>(
       iter.ndim(), iter.shape().data(), strides.data());
 }
+constexpr bool IsInUint32Range(int64_t value) {
+  return value >= 0 && value <= std::numeric_limits<int32_t>::max();
+}
+constexpr bool IsInUint32Range(int64_t v1, int64_t v2) {
+  return IsInUint32Range(v1) && IsInUint32Range(v2);
+}
 
 }  // namespace funcs
 }  // namespace phi
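The new IsInUint32Range helper (which, despite its name, bounds values to [0, std::numeric_limits<int32_t>::max()]) is what the kernels further down use as a launch-time precondition before doing offset math through the uint32_t-based OffsetCalculator. A self-contained sketch of the helper and a hypothetical caller, outside of Paddle's PADDLE_ENFORCE machinery:

#include <cstdint>
#include <limits>
#include <stdexcept>

// Mirrors the helper added above: note the bound is int32_t::max despite
// the "Uint32" in the name.
constexpr bool IsInUint32Range(int64_t value) {
  return value >= 0 && value <= std::numeric_limits<int32_t>::max();
}
constexpr bool IsInUint32Range(int64_t v1, int64_t v2) {
  return IsInUint32Range(v1) && IsInUint32Range(v2);
}

// Hypothetical caller: refuse to launch when either element count is too
// large for 32-bit offset computation.
void CheckNumelFitsUint32(int64_t out_numel, int64_t value_numel) {
  if (!IsInUint32Range(out_numel, value_numel)) {
    throw std::runtime_error(
        "the numel of input or output should be in [0, int32_t::max]");
  }
}

static_assert(IsInUint32Range(0, 2147483647), "boundary values are accepted");
static_assert(!IsInUint32Range(-1) && !IsInUint32Range(2147483648LL),
              "negative and too-large values are rejected");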

paddle/phi/kernels/funcs/index_impl.cu.h

Lines changed: 4 additions & 3 deletions
@@ -31,8 +31,8 @@ __global__ void VectorizedIndexKernel(T *out,
                                       size_t numel,
                                       size_t main_offset,
                                       Functor func) {
-  size_t data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize;
-  size_t stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
+  size_t data_offset = static_cast<size_t>(BLOCK_ID_X) * BLOCK_NUM_X * VecSize;
+  size_t stride = static_cast<size_t>(BLOCK_NUM_X) * GRID_NUM_X * VecSize;
   size_t args[VecSize];
   T result[VecSize];
   for (; data_offset < main_offset; data_offset += stride) {
@@ -69,7 +69,8 @@ void IndexKernel(const KPDevice &dev_ctx, DenseTensor *out, Functor func) {
   int block = config.thread_per_block.x;
   auto stream = dev_ctx.stream();
 #endif
-  size_t main_offset = (numel / (vec_size * block)) * vec_size * block;
+  size_t main_offset =
+      (numel / (vec_size * static_cast<size_t>(block))) * vec_size * block;
   switch (vec_size) {
     case 4:
       VectorizedIndexKernel<T, Functor, 4>
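In IndexKernel, main_offset is the largest multiple of vec_size * block that does not exceed numel: everything below it is processed with full-width vector accesses and the remainder is handled as a scalar tail. A tiny host-side example of that split, with hypothetical sizes:

#include <cstddef>
#include <cstdio>

int main() {
  // Hypothetical launch parameters mirroring IndexKernel's computation.
  const size_t numel = 1000003;  // total number of elements
  const size_t vec_size = 4;     // elements per vectorized access
  const size_t block = 256;      // threads per block

  // Largest multiple of vec_size * block that fits in numel; the kernel
  // vectorizes [0, main_offset) and handles the remainder as a tail.
  const size_t main_offset = (numel / (vec_size * block)) * vec_size * block;
  std::printf("main_offset = %zu, tail = %zu\n", main_offset,
              numel - main_offset);  // prints 999424 and 579
  return 0;
}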

paddle/phi/kernels/funcs/index_put_utils.h

Lines changed: 1 addition & 1 deletion
@@ -310,7 +310,7 @@ static void CalCompressedDimsWith1AndWithout1(
 #if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T>
 __global__ void range_cuda_kernel(int64_t N, T* out) {
-  int64_t idx = threadIdx.x + blockDim.x * blockIdx.x;
+  int64_t idx = threadIdx.x + static_cast<int64_t>(blockDim.x) * blockIdx.x;
 
   if (idx >= N) {
     return;

paddle/phi/kernels/funcs/select_impl.cu.h

Lines changed: 2 additions & 2 deletions
@@ -156,15 +156,15 @@ __global__ void CumsumOneBlock(const InT *in,
                                int64_t numel,
                                int64_t main_offset,
                                Functor func) {
-  int64_t stride = BLOCK_NUM_X * VecSize;
+  int64_t stride = static_cast<int64_t>(BLOCK_NUM_X) * VecSize;
   int64_t offset = 0;
   OutT pre_cumsum = static_cast<OutT>(0);
   for (; offset < main_offset; offset += stride) {
     CumsumImpl<InT, OutT, Functor, VecSize, false>(
         in + offset, out + offset, &pre_cumsum, stride, func);
   }
 
-  int num = numel - offset;
+  int64_t num = numel - offset;
   if (num > 0) {
     CumsumImpl<InT, OutT, Functor, VecSize, true>(
         in + offset, out + offset, &pre_cumsum, num, func);

paddle/phi/kernels/gpu/index_elementwise_get_grad_kernel.cu

Lines changed: 13 additions & 5 deletions
@@ -41,7 +41,7 @@ __global__ void IndexEleGetGradAccKernel(
     offset_calc_t offset_calc) {
   const int tid = threadIdx.x;
   const int nv = nt * vt;
-  int idx = nv * blockIdx.x + tid;
+  int64_t idx = nv * static_cast<int64_t>(blockIdx.x) + tid;
 #pragma unroll
   for (int i = 0; i < vt; i++) {
     if (idx < N) {
@@ -112,10 +112,17 @@ void GPUIndexElementwiseGetGrad(const phi::GPUContext& dev_ctx,
       funcs::make_offset_calculator_put<3>(desired_shape, strides_array);
 
   const int64_t N = numel;
+
+  PADDLE_ENFORCE_EQ(true,
+                    funcs::IsInUint32Range(N, value.numel()),
+                    common::errors::PreconditionNotMet(
+                        "the numel of input or output should be in [0, "
+                        "std::numeric_limits<int32_t>::max()]"));
   constexpr int nt = 128;
   constexpr int vt = 4;
   const dim3 block(nt);
-  const dim3 grid((N + block.x * vt - 1) / (block.x * vt));
+  const dim3 grid((N + static_cast<int64_t>(block.x) * vt - 1) /
+                  (static_cast<int64_t>(block.x) * vt));
   auto stream = dev_ctx.stream();
 
   using dtype = funcs::OpaqueType<sizeof(T)>;
@@ -172,11 +179,12 @@ __global__ void IndexingBackwardKernel(const int64_t* sorted_indices,
   using opmath_t = typename phi::dtype::MPTypeTrait<scalar_t>::Type;
 
   for (int64_t z = blockIdx.z; z < outer_dim; z += gridDim.z) {
-    int64_t idx = blockIdx.x * blockDim.y + threadIdx.y;
+    int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.y + threadIdx.y;
     if (idx < numel &&
         (idx == 0 || sorted_indices[idx] != sorted_indices[idx - 1])) {
       do {
-        int64_t start_feature = threadIdx.x + blockIdx.y * blockDim.x * SZ;
+        int64_t start_feature =
+            threadIdx.x + static_cast<int64_t>(blockIdx.y) * blockDim.x * SZ;
         if (!accumulate && (idx < numel - 1) &&
             sorted_indices[idx] == sorted_indices[idx + 1]) {
           idx++;
@@ -222,7 +230,7 @@ __global__ void IndexingBackwardKernel(const int64_t* sorted_indices,
                  static_cast<scalar_t>(weight[ii]);
            }
          }
-         start_feature += gridDim.y * blockDim.x * SZ;
+         start_feature += static_cast<int64_t>(gridDim.y) * blockDim.x * SZ;
        }
        idx++;
      } while (idx < numel && sorted_indices[idx] == sorted_indices[idx - 1]);
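The forward and backward get kernels now gate the launch on IsInUint32Range (the offset calculator does its index math in uint32_t) and compute the grid dimension in 64-bit arithmetic. A small worked example of that grid-size computation, with hypothetical values standing in for the launch configuration:

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical values matching the hunk's launch configuration:
  // nt = 128 threads per block, vt = 4 elements per thread, and N bounded
  // by the IsInUint32Range precondition.
  const int64_t N = 2147483647;             // largest admissible numel
  const int64_t elems_per_block = 128 * 4;  // nt * vt

  // Ceil-division in 64-bit arithmetic to get the number of blocks.
  const int64_t grid_x = (N + elems_per_block - 1) / elems_per_block;
  std::printf("grid.x = %lld\n", static_cast<long long>(grid_x));  // 4194304
  return 0;
}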

paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu

Lines changed: 5 additions & 6 deletions
@@ -68,12 +68,11 @@ void GPUIndexElementwiseGetKernel(const phi::GPUContext& dev_ctx,
       funcs::make_offset_calculator_put<3>(desired_shape, strides_array);
 
   const int64_t N = output->numel();
-  PADDLE_ENFORCE_GE(
-      N, 0, common::errors::InvalidArgument("Output numel must >= 0"));
-  PADDLE_ENFORCE_LE(
-      N,
-      std::numeric_limits<int32_t>::max(),
-      common::errors::InvalidArgument("Output numel must <= INT32_MAX"));
+  PADDLE_ENFORCE_EQ(true,
+                    funcs::IsInUint32Range(N, input.numel()),
+                    common::errors::PreconditionNotMet(
+                        "the numel of input or output should be in [0, "
+                        "std::numeric_limits<int32_t>::max()]"));
   constexpr int nt = 128;
   constexpr int vt = 4;
   const dim3 block(nt);

paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu

Lines changed: 10 additions & 0 deletions
@@ -129,6 +129,11 @@ void GPUIndexElementwisePutGradKernel(
     auto index_ptrs = funcs::GetIndexDataPtrs<IndexT>(index);
     const char* out_ptr = reinterpret_cast<const char*>(out_grad.data<T>());
     char* value_ptr = reinterpret_cast<char*>(value_grad->data<T>());
+    PADDLE_ENFORCE_EQ(true,
+                      funcs::IsInUint32Range(value_grad->numel()),
+                      common::errors::PreconditionNotMet(
+                          "the numel of input or output should be in [0, "
+                          "std::numeric_limits<int32_t>::max()]"));
     funcs::index_elementwise_with_tensor_kernel<nt, vt>
         <<<grid, block, 0, stream>>>(N, [=] __device__(int idx) {
           const auto offsets = offset_calc.get(idx);
@@ -151,6 +156,11 @@ void GPUIndexElementwisePutGradKernel(
   } else {
     auto index_ptrs = funcs::GetIndexDataPtrs<IndexT>(index);
     char* out_ptr = reinterpret_cast<char*>(x_grad->data<T>());
+    PADDLE_ENFORCE_EQ(true,
+                      funcs::IsInUint32Range(value_grad->numel()),
+                      common::errors::PreconditionNotMet(
+                          "the numel of input or output should be in [0, "
+                          "std::numeric_limits<int32_t>::max()]"));
     char* value_ptr = reinterpret_cast<char*>(value_grad->data<T>());
     funcs::index_elementwise_with_tensor_kernel<nt, vt>
         <<<grid, block, 0, stream>>>(N, [=] __device__(int idx) {

paddle/phi/kernels/gpu/masked_fill_grad_kernel.cu

Lines changed: 3 additions & 3 deletions
@@ -42,7 +42,7 @@ __global__ void GPUMaskedFillXGradKernel(const T* out_grad,
                                          const int64_t input_len,
                                          const int64_t batch_size,
                                          T* x_grad) {
-  int64_t idx = (blockIdx.x * blockDim.x + threadIdx.x);
+  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
 
   if (idx >= (input_len / VecSize)) {
     return;
@@ -73,7 +73,7 @@ __global__ void GPUMaskedFillValueGradKernel(const T* out_grad,
                                              const int64_t input_len,
                                              const int64_t batch_size,
                                              T* value_grad) {
-  int64_t idx = (blockIdx.x * blockDim.x + threadIdx.x);
+  int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
 
   if (idx >= (input_len / VecSize)) {
     return;
@@ -243,7 +243,7 @@ void GPUMaskedFillGrad(const phi::GPUContext& dev_ctx,
 
   int64_t input_len = out_grad.numel();
   int64_t mask_len = mask.numel();
-  int batch_size = input_len / mask_len;
+  int64_t batch_size = input_len / mask_len;
 
   int vec_size = 8;
   vec_size = std::min(phi::GetVectorizedSize(out_grad_data), vec_size);
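The last hunk widens batch_size because input_len / mask_len can itself exceed INT_MAX when the gradient is much larger than the mask it is broadcast against. A quick worked check with hypothetical shapes:

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical shapes: a ~4.3-billion-element out_grad broadcast against
  // a 2-element mask.
  const int64_t input_len = 4294967296LL;  // out_grad.numel()
  const int64_t mask_len = 2;              // mask.numel()

  // The quotient 2147483648 does not fit in a 32-bit int (max 2147483647),
  // which is why the hunk changes batch_size from int to int64_t.
  const int64_t batch_size = input_len / mask_len;
  std::printf("batch_size = %lld\n", static_cast<long long>(batch_size));
  return 0;
}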
