diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp index d74cf576499..3dc679860e7 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp @@ -58,37 +58,63 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, const auto in_g_n_c_wis_desc = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); - Tensor out(out_g_n_k_wos_desc); - Tensor wei(wei_g_k_c_xs_desc); - Tensor in_host(in_g_n_c_wis_desc); - Tensor in_device(in_g_n_c_wis_desc); + std::cout << "out: " << out_g_n_k_wos_desc << std::endl; + std::cout << "wei: " << wei_g_k_c_xs_desc << std::endl; + std::cout << "in: " << in_g_n_c_wis_desc << std::endl; - std::cout << "out: " << out.mDesc << std::endl; - std::cout << "wei: " << wei.mDesc << std::endl; - std::cout << "in: " << in_host.mDesc << std::endl; + // Get element space sizes + const auto out_element_space_size = out_g_n_k_wos_desc.GetElementSpaceSize(); + const auto wei_element_space_size = wei_g_k_c_xs_desc.GetElementSpaceSize(); + const auto in_element_space_size = in_g_n_c_wis_desc.GetElementSpaceSize(); + // Allocate GPU buffers + DeviceMem out_device_buf(sizeof(OutDataType) * out_element_space_size); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_element_space_size); + DeviceMem in_device_buf(sizeof(InDataType) * in_element_space_size); + + // Generate data directly on GPU using DeviceMem methods switch(init_method) { - case 0: break; + case 0: + // Zero initialization + out_device_buf.SetZero(); + wei_device_buf.SetZero(); + break; case 1: - out.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + // Discrete integer values in range [-5, 5] + out_device_buf.FillUniformRandInteger(-5, 5); + wei_device_buf.FillUniformRandInteger(-5, 5); break; case 2: - 
out.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + // Continuous float values + out_device_buf.FillUniformRandFp(0.0f, 1.0f); + wei_device_buf.FillUniformRandFp(-0.5f, 0.5f); break; default: - out.GenerateTensorValue(GeneratorTensor_1{1}); - wei.GenerateTensorValue(GeneratorTensor_1{1}); + // Constant value 1 + out_device_buf.SetValue(ck::type_convert(1)); + wei_device_buf.SetValue(ck::type_convert(1)); } - DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); - DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize()); + // Create host tensors (needed only for verification) + Tensor out(out_g_n_k_wos_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor in_host(in_g_n_c_wis_desc); + Tensor in_device(in_g_n_c_wis_desc); - out_device_buf.ToDevice(out.mData.data()); - wei_device_buf.ToDevice(wei.mData.data()); + // Copy GPU→CPU only if verification is enabled + if(do_verification == 1 || do_verification == 2) + { + out_device_buf.FromDevice(out.mData.data()); + wei_device_buf.FromDevice(wei.mData.data()); + } // Allocate GPU reference buffer (used only if do_verification == 2) DeviceMem gpu_ref_in_buf( diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp index 67ad21c5728..f2698537a37 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp @@ -63,34 +63,51 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); - Tensor input(in_g_n_c_wis_desc); - Tensor weight_host_result(wei_g_k_c_xs_desc); - Tensor weight_device_result(wei_g_k_c_xs_desc); - Tensor output(out_g_n_k_wos_desc); + std::cout << "input: " << in_g_n_c_wis_desc << std::endl; + std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl; + std::cout << "output: " << out_g_n_k_wos_desc << std::endl; + + // Get element space sizes + const auto input_element_space_size = in_g_n_c_wis_desc.GetElementSpaceSize(); + const auto weight_element_space_size = wei_g_k_c_xs_desc.GetElementSpaceSize(); + const auto output_element_space_size = out_g_n_k_wos_desc.GetElementSpaceSize(); - std::cout << "input: " << input.mDesc << std::endl; - std::cout << "weight: " << weight_host_result.mDesc << std::endl; - std::cout << "output: " << output.mDesc << std::endl; + // Allocate GPU buffers + DeviceMem in_device_buf(sizeof(InDataType) * input_element_space_size); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_element_space_size); + DeviceMem out_device_buf(sizeof(OutDataType) * output_element_space_size); + // Generate data directly on GPU using DeviceMem methods switch(init_method) { - case 0: break; + case 0: + // Zero initialization + in_device_buf.SetZero(); + out_device_buf.SetZero(); + break; case 1: - input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - output.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + // Discrete integer values in range [-5, 5] + in_device_buf.FillUniformRandInteger(-5, 5); + out_device_buf.FillUniformRandInteger(-5, 5); break; default: - input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - output.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + // Continuous float values + in_device_buf.FillUniformRandFp(0.0f, 1.0f); + out_device_buf.FillUniformRandFp(-0.5f, 0.5f); } - DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * - 
weight_device_result.mDesc.GetElementSpaceSize()); - DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize()); + // Create host tensors (needed only for verification) + Tensor input(in_g_n_c_wis_desc); + Tensor weight_host_result(wei_g_k_c_xs_desc); + Tensor weight_device_result(wei_g_k_c_xs_desc); + Tensor output(out_g_n_k_wos_desc); - in_device_buf.ToDevice(input.mData.data()); - out_device_buf.ToDevice(output.mData.data()); + // Copy to host only if CPU verification is needed + if(do_verification == 1) + { + in_device_buf.FromDevice(input.mData.data()); + out_device_buf.FromDevice(output.mData.data()); + } // Allocate GPU reference buffer (used only if do_verification == 2) DeviceMem gpu_ref_wei_buf( diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index 874d1e115c7..95b75ecff2f 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -86,33 +86,51 @@ bool profile_grouped_conv_fwd_impl(int do_verification, copy(conv_param.input_left_pads_, input_left_pads); copy(conv_param.input_right_pads_, input_right_pads); - Tensor input(in_g_n_c_wis_desc); - Tensor weight(wei_g_k_c_xs_desc); - Tensor host_output(out_g_n_k_wos_desc); - Tensor device_output(out_g_n_k_wos_desc); + // Get element space sizes for GPU allocation + const auto input_size = in_g_n_c_wis_desc.GetElementSpaceSize(); + const auto weight_size = wei_g_k_c_xs_desc.GetElementSpaceSize(); + const auto output_size = out_g_n_k_wos_desc.GetElementSpaceSize(); + + std::cout << "input: " << in_g_n_c_wis_desc << std::endl; + std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl; + std::cout << "output: " << out_g_n_k_wos_desc << std::endl; - std::cout << "input: " << input.mDesc << std::endl; - std::cout << "weight: " << weight.mDesc << std::endl; - std::cout << "output: " << host_output.mDesc << std::endl; + // 
Allocate GPU memory first (GPU-first workflow) + DeviceMem in_device_buf(sizeof(InDataType) * input_size); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_size); + DeviceMem out_device_buf(sizeof(OutDataType) * output_size); + // Generate data directly on GPU using DeviceMem methods switch(init_method) { - case 0: break; + case 0: + // Zero initialization + in_device_buf.SetZero(); + wei_device_buf.SetZero(); + break; case 1: - input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - weight.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + // Discrete integer generation: {-5, -4, -3, ..., 3, 4} + in_device_buf.FillUniformRandInteger(-5, 5); + wei_device_buf.FillUniformRandInteger(-5, 5); break; default: - input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - weight.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + // Continuous float generation + in_device_buf.FillUniformRandFp(0.0f, 1.0f); + wei_device_buf.FillUniformRandFp(-0.5f, 0.5f); } - DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize()); - DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize()); + // Create host tensors (for verification if needed) + Tensor input(in_g_n_c_wis_desc); + Tensor weight(wei_g_k_c_xs_desc); + Tensor host_output(out_g_n_k_wos_desc); + Tensor device_output(out_g_n_k_wos_desc); - in_device_buf.ToDevice(input.mData.data()); - wei_device_buf.ToDevice(weight.mData.data()); + // Copy to host only if CPU verification is needed + if(do_verification == 1) + { + in_device_buf.FromDevice(input.mData.data()); + wei_device_buf.FromDevice(weight.mData.data()); + } // Allocate GPU reference buffer (used only if do_verification == 2) DeviceMem gpu_ref_out_buf(