Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 45 additions & 19 deletions profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,37 +58,63 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
const auto in_g_n_c_wis_desc =
ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);

Tensor<OutDataType> out(out_g_n_k_wos_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
Tensor<InDataType> in_host(in_g_n_c_wis_desc);
Tensor<InDataType> in_device(in_g_n_c_wis_desc);
std::cout << "out: " << out_g_n_k_wos_desc << std::endl;
std::cout << "wei: " << wei_g_k_c_xs_desc << std::endl;
std::cout << "in: " << in_g_n_c_wis_desc << std::endl;

std::cout << "out: " << out.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "in: " << in_host.mDesc << std::endl;
// Get element space sizes
const auto out_element_space_size = out_g_n_k_wos_desc.GetElementSpaceSize();
const auto wei_element_space_size = wei_g_k_c_xs_desc.GetElementSpaceSize();
const auto in_element_space_size = in_g_n_c_wis_desc.GetElementSpaceSize();

// Allocate GPU buffers
DeviceMem out_device_buf(sizeof(OutDataType) * out_element_space_size);
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_element_space_size);
DeviceMem in_device_buf(sizeof(InDataType) * in_element_space_size);

// Generate data directly on GPU using DeviceMem methods
switch(init_method)
{
case 0: break;
case 0:
// Zero initialization
out_device_buf.SetZero();
wei_device_buf.SetZero();
break;
case 1:
out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
// Discrete integer values in range [-5, 5]
out_device_buf.FillUniformRandInteger<OutDataType>(-5, 5);
wei_device_buf.FillUniformRandInteger<WeiDataType>(-5, 5);
break;
case 2:
out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
// Continuous float values
out_device_buf.FillUniformRandFp<OutDataType>(0.0f, 1.0f);
wei_device_buf.FillUniformRandFp<WeiDataType>(-0.5f, 0.5f);
break;
default:
out.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1});
wei.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1});
// Constant value 1
out_device_buf.SetValue<OutDataType>(ck::type_convert<OutDataType>(1));
wei_device_buf.SetValue<WeiDataType>(ck::type_convert<WeiDataType>(1));
}

DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize());
// Create host tensors (needed only for verification)
Tensor<OutDataType> out(out_g_n_k_wos_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
Tensor<InDataType> in_host(in_g_n_c_wis_desc);
Tensor<InDataType> in_device(in_g_n_c_wis_desc);

out_device_buf.ToDevice(out.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
// Copy generated data from GPU to host tensors when verification is enabled
// (mode 1 = CPU reference verification, mode 2 = GPU reference verification).
// Note: the previous code performed this FromDevice copy twice for mode 1 —
// once under (do_verification == 1 || do_verification == 2) and again under
// (do_verification == 1). A single guarded copy is sufficient.
if(do_verification == 1 || do_verification == 2)
{
out_device_buf.FromDevice(out.mData.data());
wei_device_buf.FromDevice(wei.mData.data());
}

// Allocate GPU reference buffer (used only if do_verification == 2)
DeviceMem gpu_ref_in_buf(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,34 +63,51 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
const auto out_g_n_k_wos_desc =
ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);

Tensor<InDataType> input(in_g_n_c_wis_desc);
Tensor<WeiDataType> weight_host_result(wei_g_k_c_xs_desc);
Tensor<WeiDataType> weight_device_result(wei_g_k_c_xs_desc);
Tensor<OutDataType> output(out_g_n_k_wos_desc);
std::cout << "input: " << in_g_n_c_wis_desc << std::endl;
std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl;
std::cout << "output: " << out_g_n_k_wos_desc << std::endl;

// Get element space sizes
const auto input_element_space_size = in_g_n_c_wis_desc.GetElementSpaceSize();
const auto weight_element_space_size = wei_g_k_c_xs_desc.GetElementSpaceSize();
const auto output_element_space_size = out_g_n_k_wos_desc.GetElementSpaceSize();

std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weight: " << weight_host_result.mDesc << std::endl;
std::cout << "output: " << output.mDesc << std::endl;
// Allocate GPU buffers
DeviceMem in_device_buf(sizeof(InDataType) * input_element_space_size);
DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_element_space_size);
DeviceMem out_device_buf(sizeof(OutDataType) * output_element_space_size);

// Generate data directly on GPU using DeviceMem methods
switch(init_method)
{
case 0: break;
case 0:
// Zero initialization
in_device_buf.SetZero();
out_device_buf.SetZero();
break;
case 1:
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
output.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
// Discrete integer values in range [-5, 5]
in_device_buf.FillUniformRandInteger<InDataType>(-5, 5);
out_device_buf.FillUniformRandInteger<OutDataType>(-5, 5);
break;
default:
input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
output.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
// Continuous float values
in_device_buf.FillUniformRandFp<InDataType>(0.0f, 1.0f);
out_device_buf.FillUniformRandFp<OutDataType>(-0.5f, 0.5f);
}

DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) *
weight_device_result.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize());
// Create host tensors (needed only for verification)
Tensor<InDataType> input(in_g_n_c_wis_desc);
Tensor<WeiDataType> weight_host_result(wei_g_k_c_xs_desc);
Tensor<WeiDataType> weight_device_result(wei_g_k_c_xs_desc);
Tensor<OutDataType> output(out_g_n_k_wos_desc);

in_device_buf.ToDevice(input.mData.data());
out_device_buf.ToDevice(output.mData.data());
// Copy to host only if CPU verification is needed
if(do_verification == 1)
{
in_device_buf.FromDevice(input.mData.data());
out_device_buf.FromDevice(output.mData.data());
}

// Allocate GPU reference buffer (used only if do_verification == 2)
DeviceMem gpu_ref_wei_buf(
Expand Down
52 changes: 35 additions & 17 deletions profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,33 +86,51 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);

Tensor<InDataType> input(in_g_n_c_wis_desc);
Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
Tensor<OutDataType> device_output(out_g_n_k_wos_desc);
// Get element space sizes for GPU allocation
const auto input_size = in_g_n_c_wis_desc.GetElementSpaceSize();
const auto weight_size = wei_g_k_c_xs_desc.GetElementSpaceSize();
const auto output_size = out_g_n_k_wos_desc.GetElementSpaceSize();

std::cout << "input: " << in_g_n_c_wis_desc << std::endl;
std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl;
std::cout << "output: " << out_g_n_k_wos_desc << std::endl;

std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weight: " << weight.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl;
// Allocate GPU memory first (GPU-first workflow)
DeviceMem in_device_buf(sizeof(InDataType) * input_size);
DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_size);
DeviceMem out_device_buf(sizeof(OutDataType) * output_size);

// Generate data directly on GPU using DeviceMem methods
switch(init_method)
{
case 0: break;
case 0:
// Zero initialization
in_device_buf.SetZero();
wei_device_buf.SetZero();
break;
case 1:
input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
// Discrete integer generation: {-5, -4, -3, ..., 3, 4}
in_device_buf.FillUniformRandInteger<InDataType>(-5, 5);
wei_device_buf.FillUniformRandInteger<WeiDataType>(-5, 5);
break;
default:
input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
// Continuous float generation
in_device_buf.FillUniformRandFp<InDataType>(0.0f, 1.0f);
wei_device_buf.FillUniformRandFp<WeiDataType>(-0.5f, 0.5f);
}

DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize());
// Create host tensors (for verification if needed)
Tensor<InDataType> input(in_g_n_c_wis_desc);
Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
Tensor<OutDataType> host_output(out_g_n_k_wos_desc);
Tensor<OutDataType> device_output(out_g_n_k_wos_desc);

in_device_buf.ToDevice(input.mData.data());
wei_device_buf.ToDevice(weight.mData.data());
// Copy to host only if CPU verification is needed
if(do_verification == 1)
{
in_device_buf.FromDevice(input.mData.data());
wei_device_buf.FromDevice(weight.mData.data());
}

// Allocate GPU reference buffer (used only if do_verification == 2)
DeviceMem gpu_ref_out_buf(
Expand Down