From ac6bd87599a1faed9ede0e04ffdc61e84c1b7ec5 Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Tue, 6 Jan 2026 17:05:37 +0000 Subject: [PATCH 01/24] [BULDER] Add grouped conv fwd ck tile profiler --- .gitignore | 3 + .../ck_tile/builder/testing/conv_fwd.hpp | 178 +++++++++++++ .../ck_tile/builder/testing/conv_fwd_ck.hpp | 40 +++ .../ck_tile/builder/testing/tensor_buffer.hpp | 6 + experimental/builder/test/CMakeLists.txt | 6 +- .../conv/ck_tile/test_ckb_conv_fwd_e2e.cpp | 74 ++++++ .../builder/test/profiling/CMakeLists.txt | 23 ++ .../test/profiling/configs/ndhwgc_bf16.conf | 237 ++++++++++++++++++ .../test/profiling/configs/ndhwgc_fp16.conf | 228 +++++++++++++++++ .../test/profiling/configs/ndhwgc_fp32.conf | 176 +++++++++++++ .../test/profiling/configs/nhwgc_bf16.conf | 237 ++++++++++++++++++ .../test/profiling/configs/nhwgc_fp16.conf | 228 +++++++++++++++++ .../test/profiling/configs/nhwgc_fp32.conf | 176 +++++++++++++ .../test/profiling/generate_instances.py | 235 +++++++++++++++++ .../grouped_convolution_forward_tile_algs.hpp | 201 +++++++++++++++ .../grouped_convolution_signatures.hpp | 72 ++++++ .../profiling/profile_ckb_tile_conv_fwd.cpp | 229 +++++++++++++++++ ...ion_forward_tile_ndhwgc_bf16_signature.inc | 12 + ...ion_forward_tile_ndhwgc_fp16_signature.inc | 12 + ...ion_forward_tile_ndhwgc_fp32_signature.inc | 12 + ...tion_forward_tile_nhwgc_bf16_signature.inc | 12 + ...tion_forward_tile_nhwgc_fp16_signature.inc | 12 + ...tion_forward_tile_nhwgc_fp32_signature.inc | 12 + .../test/profiling/src/instance_includes.inc | 8 + .../test/profiling/src/instance_run.inc | 7 + .../test/utils/ckb_conv_tile_test_configs.hpp | 4 +- include/ck/library/utility/host_tensor.hpp | 20 +- .../gemm_pipeline_agmem_bgmem_creg_v1.hpp | 2 + include/ck_tile/ops/gemm/warp/warp_gemm.hpp | 12 + .../ops/gemm/warp/warp_gemm_dispatcher.hpp | 2 + .../grouped_convolution_forward_kernel.hpp | 43 +++- 31 files changed, 2501 insertions(+), 18 deletions(-) create mode 100644 experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_e2e.cpp create mode 100644 experimental/builder/test/profiling/CMakeLists.txt create mode 100644 experimental/builder/test/profiling/configs/ndhwgc_bf16.conf create mode 100644 experimental/builder/test/profiling/configs/ndhwgc_fp16.conf create mode 100644 experimental/builder/test/profiling/configs/ndhwgc_fp32.conf create mode 100644 experimental/builder/test/profiling/configs/nhwgc_bf16.conf create mode 100644 experimental/builder/test/profiling/configs/nhwgc_fp16.conf create mode 100644 experimental/builder/test/profiling/configs/nhwgc_fp32.conf create mode 100644 experimental/builder/test/profiling/generate_instances.py create mode 100644 experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp create mode 100644 experimental/builder/test/profiling/grouped_convolution_signatures.hpp create mode 100644 experimental/builder/test/profiling/profile_ckb_tile_conv_fwd.cpp create mode 100644 experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_bf16_signature.inc create mode 100644 experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_fp16_signature.inc create mode 100644 experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_fp32_signature.inc create mode 100644 experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_bf16_signature.inc create mode 100644 experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_fp16_signature.inc create mode 100644 experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_fp32_signature.inc create mode 100644 experimental/builder/test/profiling/src/instance_includes.inc create mode 100644 experimental/builder/test/profiling/src/instance_run.inc diff --git a/.gitignore b/.gitignore index 98234268c10..a048429b821 100644 --- a/.gitignore +++ b/.gitignore @@ -92,3 +92,6 @@ test_data/* # The experimental/builder directory should be tracked despite matching build* !experimental/builder !experimental/builder/** +experimental/builder/test/profiling/src/* +!experimental/builder/test/profiling/src/*.inc +experimental/builder/test/profiling/*.inc diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp index f329a8a4d3d..458175b75c0 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp @@ -9,6 +9,7 @@ #include "ck_tile/builder/testing/testing.hpp" #include "ck_tile/builder/testing/extent.hpp" #include "ck_tile/builder/testing/tensor_buffer.hpp" +#include "ck_tile/host/convolution_parameter.hpp" #include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" /// This file implements common functionality for invoking/testing grouped @@ -151,8 +152,185 @@ struct Args to_vector(this->input_left_pad), to_vector(this->input_right_pad)); } + + /// Convert the Args structure into a CK Tile conv_param structure. + /// This function is mainly used to be able to use the existing + /// CK Tile functionality to obtain tensor descriptors. + ck_tile::conv::ConvParam to_ck_tile_conv_param() const + { + const auto to_vector = [](const auto& extent) { + if constexpr(SPATIAL_DIM == 1) + return std::vector{ck::index_t(extent.width)}; + else if constexpr(SPATIAL_DIM == 2) + return std::vector{ck::index_t(extent.height), + ck::index_t(extent.width)}; + else + return std::vector{ck::index_t(extent.depth), + ck::index_t(extent.height), + ck::index_t(extent.width)}; + }; + + return ck_tile::conv::ConvParam(SPATIAL_DIM, + this->lengths.groups, + this->lengths.batch_size, + this->lengths.output_channels, + this->lengths.input_channels, + to_vector(this->lengths.filter), + to_vector(this->lengths.image), + to_vector(this->filter_strides), + to_vector(this->filter_dilation), + to_vector(this->input_left_pad), + to_vector(this->input_right_pad)); + } }; +template +CK_TILE_HOST auto parse_conv_args(int arg_idx, char* const argv[]) +{ + const std::size_t G = static_cast(std::stol(argv[arg_idx++])); + const std::size_t N = static_cast(std::stol(argv[arg_idx++])); + const std::size_t K = static_cast(std::stol(argv[arg_idx++])); + const std::size_t C = static_cast(std::stol(argv[arg_idx++])); + + constexpr auto num_dim_spatial = SIGNATURE.spatial_dim; + + std::vector filter_spatial_lengths(num_dim_spatial); + std::vector input_spatial_lengths(num_dim_spatial); + std::vector conv_filter_strides(num_dim_spatial); + std::vector conv_filter_dilations(num_dim_spatial); + std::vector input_left_pads(num_dim_spatial); + std::vector input_right_pads(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + filter_spatial_lengths[i] = static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + input_spatial_lengths[i] = static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + conv_filter_strides[i] = static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + conv_filter_dilations[i] = static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + input_left_pads[i] = static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + input_right_pads[i] = static_cast(std::stol(argv[arg_idx++])); + } + + if constexpr(num_dim_spatial == 1) + { + Args args = { + .lengths = + { + .batch_size = N, + .groups = G, + .input_channels = C, + .output_channels = K, + .image = + { + .width = input_spatial_lengths[0], + }, + .filter = + { + .width = filter_spatial_lengths[0], + }, + }, + .filter_strides = {.width = conv_filter_strides[0]}, + .filter_dilation = {.width = conv_filter_dilations[0]}, + .input_left_pad = {.width = input_left_pads[0]}, + .input_right_pad = {.width = input_right_pads[0]}, + .a_elementwise_op = {}, + .b_elementwise_op = {}, + .cde_elementwise_op = {}, + }; + return args; + } + else if constexpr(num_dim_spatial == 2) + { + Args args = { + .lengths = + { + .batch_size = N, + .groups = G, + .input_channels = C, + .output_channels = K, + .image = + { + .width = input_spatial_lengths[1], + .height = input_spatial_lengths[0], + }, + .filter = + { + .width = filter_spatial_lengths[1], + .height = filter_spatial_lengths[0], + }, + }, + .filter_strides = {.width = conv_filter_strides[1], .height = conv_filter_strides[0]}, + .filter_dilation = {.width = conv_filter_dilations[1], + .height = conv_filter_dilations[0]}, + .input_left_pad = {.width = input_left_pads[1], .height = input_left_pads[0]}, + .input_right_pad = {.width = input_right_pads[1], .height = input_right_pads[0]}, + .a_elementwise_op = {}, + .b_elementwise_op = {}, + .cde_elementwise_op = {}, + }; + return args; + } + else + { + Args args = { + .lengths = + { + .batch_size = N, + .groups = G, + .input_channels = C, + .output_channels = K, + .image = + { + .width = input_spatial_lengths[2], + .height = input_spatial_lengths[1], + .depth = input_spatial_lengths[0], + }, + .filter = + { + .width = filter_spatial_lengths[2], + .height = filter_spatial_lengths[1], + .depth = filter_spatial_lengths[0], + }, + }, + .filter_strides = {.width = conv_filter_strides[2], + .height = conv_filter_strides[1], + .depth = conv_filter_strides[0]}, + .filter_dilation = {.width = conv_filter_dilations[2], + .height = conv_filter_dilations[1], + .depth = conv_filter_dilations[0]}, + .input_left_pad = {.width = input_left_pads[2], + .height = input_left_pads[1], + .depth = input_left_pads[0]}, + .input_right_pad = {.width = input_right_pads[2], + .height = input_right_pads[1], + .depth = input_right_pads[0]}, + .a_elementwise_op = {}, + .b_elementwise_op = {}, + .cde_elementwise_op = {}, + }; + return args; + } +} + /// @brief `Inputs` specialization for forward convolution. /// /// @tparam SIGNATURE Forward convolution signature. diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp index cc5c613d95c..3f00e3409a9 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp @@ -7,6 +7,7 @@ #include #include "ck_tile/builder/testing/conv_fwd.hpp" +#include "ck_tile/host/kernel_launch.hpp" /// This file contains the implementation details for invoking/testing /// grouped convolution operations in old CK. The main item is the @@ -99,4 +100,43 @@ void run(Conv& conv, conv.MakeInvoker().Run(ck_args, {}); } +/// @brief `run()` specialization for forward convolution and CK Tile. +/// +/// @tparam SIGNATURE Forward convolution signature. +/// @throws std::runtime_error if the arguments werent actually valid for the +/// operation. This should be caught and reported by the testing framework. +/// +/// @see run() +template + requires ValidConvSignature && ConvDirectionIsForward && + IsCkConvInstance +float run_tile(Conv& conv, + const Args& args, + const Inputs& inputs, + const Outputs& outputs, + const ck_tile::stream_config& s_conf) +{ + const auto param = args.to_ck_tile_conv_param(); + + ck_tile::GroupedConvFwdHostArgs<> host_args( + param, inputs.input, inputs.weight, {}, outputs.output, 1 /*kbatch*/); + + auto kargs = Conv::MakeKernelArgs(host_args); + + const dim3 grids = Conv::GridSize(kargs); + const dim3 blocks = Conv::BlockSize(); + + if(!Conv::IsSupportedArgument(kargs)) + { + std::cout << "Not supported!"; + return 0.f; + } + + constexpr index_t minimum_occupancy = + Conv::GemmPipeline::Scheduler == ck_tile::GemmPipelineScheduler::Intrawave ? 1 : 2; + + return ck_tile::launch_kernel( + s_conf, ck_tile::make_kernel(conv, grids, blocks, 0, kargs)); +} + } // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp b/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp index 42f85f80177..ce9002c754e 100644 --- a/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/tensor_buffer.hpp @@ -184,6 +184,12 @@ struct TensorDescriptor return get_element_space_size() * data_type_sizeof(DT); } + friend std::ostream& operator<<(std::ostream& os, const TensorDescriptor
& tensor_desc) + { + os << tensor_desc.inner_descriptor_; + return os; + } + private: ck_tile::HostTensorDescriptor inner_descriptor_; }; diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index 424bfd8781e..3651a4690fd 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -145,7 +145,9 @@ add_ck_builder_test(test_ckb_build_fwd_instances conv/ck/test_ckb_conv_fwd_3d_fp32.cpp conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp conv/ck_tile/test_ckb_conv_bwd_weight_2d_fp16_v3.cpp - conv/ck_tile/test_ckb_conv_bwd_data_2d_fp16_v3.cpp) + conv/ck_tile/test_ckb_conv_bwd_data_2d_fp16_v3.cpp + conv/ck_tile/test_ckb_conv_fwd_e2e.cpp + ) target_link_libraries(test_ckb_build_fwd_instances PRIVATE utility) @@ -254,6 +256,8 @@ add_custom_target(check-builder COMMENT "Running all experimental builder tests..." ) +add_subdirectory(profiling) + ################################################################################ # Build Summary ################################################################################ diff --git a/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_e2e.cpp b/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_e2e.cpp new file mode 100644 index 00000000000..b96c617744b --- /dev/null +++ b/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_e2e.cpp @@ -0,0 +1,74 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "utils/ckb_conv_tile_test_configs.hpp" +#include "utils/ckb_conv_test_utils.hpp" +#include "utils/conv_algorithm_type_utils.hpp" +#include "ck_tile/builder/testing/conv_fwd_ck.hpp" +#include "ck_tile/host/device_prop.hpp" + +namespace ckb = ck_tile::builder; +namespace ckt = ck_tile::builder::test; +namespace cku = ck_tile::builder::test_utils; + +constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + +constexpr auto ALGORITHM = + cku::ConvAlgorithm_Tile_GroupedConvolutionKernel{} + .with_tile_specializations(ckb::TileConvSpecialization::DEFAULT) + .with_tile_thread_block(cku::FwdTileThreadBlock_64x64x64) + .with_tile_block_gemm(cku::TileBlockGemmDesc_16x16_v3_intrawave) + .with_tile_transfer(cku::FwdTileTransfer_4x4x4) + .with_tile_optimizations(ckt::TileOptimizations{ + .num_groups_to_merge = 1, .split_image = false, .explicit_gemm = false}); + +using Builder = ckb::ConvBuilder; +using Instance = Builder::Instance; + +TEST(Fwd2DFp16_CShufV3_NHWGC, EndToEnd) +{ + if(!ck_tile::get_device_name().starts_with("gfx9")) + { + GTEST_SKIP() << "unsupported architecture"; + } + + ckt::Args args = { + .lengths = + { + .batch_size = 16, + .groups = 1, + .input_channels = 32, + .output_channels = 48, + .image = + { + .width = 56, + .height = 64, + }, + .filter = + { + .width = 3, + .height = 5, + }, + }, + .filter_strides = {.width = 1, .height = 1}, + .filter_dilation = {.width = 1, .height = 1}, + .input_left_pad = {.width = 0, .height = 0}, + .input_right_pad = {.width = 0, .height = 0}, + .a_elementwise_op = {}, + .b_elementwise_op = {}, + .cde_elementwise_op = {}, + }; + + auto inputs = alloc_inputs(args); + auto outputs = alloc_outputs(args); + + auto conv = Instance{}; + ckt::run_tile(conv, args, inputs.get(), outputs.get()); +} diff --git a/experimental/builder/test/profiling/CMakeLists.txt b/experimental/builder/test/profiling/CMakeLists.txt new file mode 100644 index 00000000000..f1267fbdc1a --- /dev/null +++ b/experimental/builder/test/profiling/CMakeLists.txt @@ -0,0 +1,23 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +if(GPU_TARGETS MATCHES "gfx94|gfx95") + + file(GLOB_RECURSE GROUPED_CONV_FWD_TILE CONFIGURE_DEPENDS "src/*.cpp") + add_instance_library(device_grouped_conv_fwd_tile_instances ${GROUPED_CONV_FWD_TILE}) + + + set(EXAMPLE_CONV_COMPILE_OPTIONS) + list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0 -Wno-global-constructors -Wno-c++20-compat) + + add_executable(profile_ckb_tile_conv_fwd profile_ckb_tile_conv_fwd.cpp) + target_compile_options(profile_ckb_tile_conv_fwd PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS}) + target_compile_features(profile_ckb_tile_conv_fwd PRIVATE cxx_std_20) + target_include_directories(profile_ckb_tile_conv_fwd PRIVATE + "${PROJECT_SOURCE_DIR}/experimental/builder/include" + "${PROJECT_SOURCE_DIR}/include" + "${CMAKE_CURRENT_SOURCE_DIR}/../" + ) + target_link_libraries(profile_ckb_tile_conv_fwd PRIVATE utility device_grouped_conv_fwd_tile_instances) + +endif() diff --git a/experimental/builder/test/profiling/configs/ndhwgc_bf16.conf b/experimental/builder/test/profiling/configs/ndhwgc_bf16.conf new file mode 100644 index 00000000000..ee62db40ba0 --- /dev/null +++ b/experimental/builder/test/profiling/configs/ndhwgc_bf16.conf @@ -0,0 +1,237 @@ +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Default, 32, 32, 2, 1, 8, 8, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Default, 32, 32, 2, 2, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 32, Default, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 32, Default, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Default, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 32, Filter1x1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 32, Filter1x1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 32, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 2, 2, 2, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 224, 256, 64, Default, 16, 16, 7, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 224, 256, 64, Filter1x1Pad0, 16, 16, 7, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 224, 256, 64, Filter1x1Stride1Pad0, 16, 16, 7, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 224, 64, Default, 16, 16, 8, 7, 8, 8, 8, 2, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 224, 64, Filter1x1Pad0, 16, 16, 8, 7, 8, 8, 8, 2, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 224, 64, Filter1x1Stride1Pad0, 16, 16, 8, 7, 8, 8, 8, 2, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 128, 32, Filter1x1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 128, 32, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 64, 64, Default, 32, 32, 2, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 64, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 64, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 64, 128, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 64, 128, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 64, 128, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 64, 64, 64, Default, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 64, 64, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 64, 64, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Default, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Default, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Default, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Default, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Stride1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Stride1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Default, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Default, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Default, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Default, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Stride1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Stride1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> \ No newline at end of file diff --git a/experimental/builder/test/profiling/configs/ndhwgc_fp16.conf b/experimental/builder/test/profiling/configs/ndhwgc_fp16.conf new file mode 100644 index 00000000000..466b246787b --- /dev/null +++ b/experimental/builder/test/profiling/configs/ndhwgc_fp16.conf @@ -0,0 +1,228 @@ +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Default, 32, 32, 2, 1, 8, 8, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Default, 32, 32, 2, 2, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 32, Default, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 32, Default, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Default, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 32, Filter1x1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 32, Filter1x1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 32, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 2, 2, 2, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 224, 256, 64, Default, 16, 16, 7, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 224, 256, 64, Filter1x1Pad0, 16, 16, 7, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 224, 256, 64, Filter1x1Stride1Pad0, 16, 16, 7, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 224, 64, Default, 16, 16, 8, 7, 8, 8, 8, 2, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 224, 64, Filter1x1Pad0, 16, 16, 8, 7, 8, 8, 8, 2, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 224, 64, Filter1x1Stride1Pad0, 16, 16, 8, 7, 8, 8, 8, 2, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 128, 32, Filter1x1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 128, 32, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Default, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Default, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Default, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Default, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Stride1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Stride1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Default, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Default, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Default, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Default, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Stride1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Stride1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> \ No newline at end of file diff --git a/experimental/builder/test/profiling/configs/ndhwgc_fp32.conf b/experimental/builder/test/profiling/configs/ndhwgc_fp32.conf new file mode 100644 index 00000000000..7dc982b6f7a --- /dev/null +++ b/experimental/builder/test/profiling/configs/ndhwgc_fp32.conf @@ -0,0 +1,176 @@ +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 16, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 16, Default, 32, 32, 2, 1, 4, 4, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 16, Default, 32, 32, 2, 2, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 16, Default, 32, 32, 4, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 16, Default, 32, 32, 2, 4, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 16, Default, 32, 32, 4, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 16, Default, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 16, Default, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 16, Default, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 16, Default, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 16, Default, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 16, Default, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 16, Default, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 16, Default, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 16, Default, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 16, Default, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 192, 16, Default, 32, 32, 2, 3, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 16, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 16, Filter1x1Pad0, 32, 32, 2, 1, 4, 4, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 16, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 16, Filter1x1Pad0, 32, 32, 4, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 16, Filter1x1Pad0, 32, 32, 2, 4, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 16, Filter1x1Pad0, 32, 32, 4, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 16, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 16, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 16, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 16, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 16, Filter1x1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 16, Filter1x1Pad0, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 16, Filter1x1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 16, Filter1x1Pad0, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 16, Filter1x1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 16, Filter1x1Pad0, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 192, 16, Filter1x1Pad0, 32, 32, 2, 3, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 16, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 16, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 16, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 16, Filter1x1Stride1Pad0, 32, 32, 4, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 16, Filter1x1Stride1Pad0, 32, 32, 2, 4, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 16, Filter1x1Stride1Pad0, 32, 32, 4, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 16, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 16, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 16, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 16, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 16, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 16, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 16, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 16, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 16, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 16, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 192, 16, Filter1x1Stride1Pad0, 32, 32, 2, 3, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 16, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 16, Default, 32, 32, 4, 2, 4, 4, 4, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Default, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Default, 32, 32, 2, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Default, 16, 16, 4, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Default, 32, 32, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Default, 16, 16, 2, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Default, 16, 16, 1, 2, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Default, 32, 32, 1, 1, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Default, 16, 16, 1, 4, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Default, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Pad0, 32, 32, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Pad0, 16, 16, 2, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Pad0, 16, 16, 1, 2, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Pad0, 32, 32, 1, 1, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Pad0, 16, 16, 1, 4, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Pad0, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Stride1Pad0, 16, 16, 2, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Stride1Pad0, 16, 16, 1, 2, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Default, 32, 32, 2, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Default, 16, 16, 4, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Default, 32, 32, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Default, 16, 16, 2, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Default, 16, 16, 1, 2, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Default, 32, 32, 1, 1, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Default, 16, 16, 1, 4, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Default, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Pad0, 32, 32, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Pad0, 16, 16, 2, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Pad0, 16, 16, 1, 2, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Pad0, 32, 32, 1, 1, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Pad0, 16, 16, 1, 4, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Pad0, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Stride1Pad0, 16, 16, 2, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Stride1Pad0, 16, 16, 1, 2, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> \ No newline at end of file diff --git a/experimental/builder/test/profiling/configs/nhwgc_bf16.conf b/experimental/builder/test/profiling/configs/nhwgc_bf16.conf new file mode 100644 index 00000000000..c7a6ba489e3 --- /dev/null +++ b/experimental/builder/test/profiling/configs/nhwgc_bf16.conf @@ -0,0 +1,237 @@ +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Default, 32, 32, 2, 1, 8, 8, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Default, 32, 32, 2, 2, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 32, Default, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 32, Default, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Default, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 32, Filter1x1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 32, Filter1x1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 32, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 2, 2, 2, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 64, 64, Default, 32, 32, 2, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 64, 128, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 64, 64, 64, Default, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 128, 32, Filter1x1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 64, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 64, 128, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 64, 64, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 128, 32, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 64, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 64, 128, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 64, 64, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 224, 256, 64, Default, 16, 16, 7, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 224, 64, Default, 16, 16, 8, 7, 8, 8, 8, 2, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 224, 256, 64, Filter1x1Pad0, 16, 16, 7, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 224, 64, Filter1x1Pad0, 16, 16, 8, 7, 8, 8, 8, 2, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 224, 256, 64, Filter1x1Stride1Pad0, 16, 16, 7, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 224, 64, Filter1x1Stride1Pad0, 16, 16, 8, 7, 8, 8, 8, 2, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Default, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Default, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Default, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Default, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Stride1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Stride1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Default, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Default, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Default, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Default, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Stride1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Stride1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> \ No newline at end of file diff --git a/experimental/builder/test/profiling/configs/nhwgc_fp16.conf b/experimental/builder/test/profiling/configs/nhwgc_fp16.conf new file mode 100644 index 00000000000..4e31ba2b06c --- /dev/null +++ b/experimental/builder/test/profiling/configs/nhwgc_fp16.conf @@ -0,0 +1,228 @@ +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Default, 32, 32, 2, 1, 8, 8, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Default, 32, 32, 2, 2, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 32, Default, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 32, Default, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Default, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 32, Filter1x1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 32, Filter1x1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 32, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 32, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 2, 2, 2, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 224, 256, 64, Default, 16, 16, 7, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 224, 64, Default, 16, 16, 8, 7, 8, 8, 8, 2, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 128, 32, Default, 32, 32, 4, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 224, 256, 64, Filter1x1Pad0, 16, 16, 7, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 224, 64, Filter1x1Pad0, 16, 16, 8, 7, 8, 8, 8, 2, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 128, 32, Filter1x1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +# DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 224, 256, 64, Filter1x1Stride1Pad0, 16, 16, 7, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 224, 64, Filter1x1Stride1Pad0, 16, 16, 8, 7, 8, 8, 8, 2, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 128, 32, Filter1x1Stride1Pad0, 32, 32, 4, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Default, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Default, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Default, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Default, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Stride1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Stride1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Default, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Default, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Default, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Default, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Default, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Default, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Stride1Pad0, 16, 16, 2, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Stride1Pad0, 16, 16, 1, 2, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 16, 256, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 32, 256, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> \ No newline at end of file diff --git a/experimental/builder/test/profiling/configs/nhwgc_fp32.conf b/experimental/builder/test/profiling/configs/nhwgc_fp32.conf new file mode 100644 index 00000000000..7dc982b6f7a --- /dev/null +++ b/experimental/builder/test/profiling/configs/nhwgc_fp32.conf @@ -0,0 +1,176 @@ +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 16, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 16, Default, 32, 32, 2, 1, 4, 4, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 16, Default, 32, 32, 2, 2, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 16, Default, 32, 32, 4, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 16, Default, 32, 32, 2, 4, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 16, Default, 32, 32, 4, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 16, Default, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 16, Default, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 16, Default, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 16, Default, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 16, Default, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 16, Default, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 16, Default, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 16, Default, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 16, Default, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 16, Default, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 192, 16, Default, 32, 32, 2, 3, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 16, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 16, Filter1x1Pad0, 32, 32, 2, 1, 4, 4, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 16, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 16, Filter1x1Pad0, 32, 32, 4, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 16, Filter1x1Pad0, 32, 32, 2, 4, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 16, Filter1x1Pad0, 32, 32, 4, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 16, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 16, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 16, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 16, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 16, Filter1x1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 16, Filter1x1Pad0, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 16, Filter1x1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 16, Filter1x1Pad0, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 16, Filter1x1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 16, Filter1x1Pad0, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 192, 16, Filter1x1Pad0, 32, 32, 2, 3, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 16, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 16, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 16, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 256, 128, 16, Filter1x1Stride1Pad0, 32, 32, 4, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 16, Filter1x1Stride1Pad0, 32, 32, 2, 4, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 128, 16, Filter1x1Stride1Pad0, 32, 32, 4, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 128, 16, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 64, 16, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 64, 128, 16, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 16, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 64, 16, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 128, 16, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 128, 32, 16, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<128, 32, 128, 16, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 32, 16, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 16, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 192, 16, Filter1x1Stride1Pad0, 32, 32, 2, 3, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 2, 1, 2, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 4, 4, 4, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 16, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<256, 256, 128, 16, Default, 32, 32, 4, 2, 4, 4, 4, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Default, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Default, 32, 32, 2, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Default, 16, 16, 4, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Default, 32, 32, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Default, 16, 16, 2, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Default, 16, 16, 1, 2, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Default, 32, 32, 1, 1, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Default, 16, 16, 1, 4, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Default, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Pad0, 32, 32, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Pad0, 16, 16, 2, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Pad0, 16, 16, 1, 2, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Pad0, 32, 32, 1, 1, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Pad0, 16, 16, 1, 4, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Pad0, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Stride1Pad0, 16, 16, 2, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Stride1Pad0, 16, 16, 1, 2, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Default, 32, 32, 2, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Default, 16, 16, 4, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Default, 32, 32, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Default, 16, 16, 2, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Default, 16, 16, 1, 2, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Default, 32, 32, 1, 1, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Default, 16, 16, 1, 4, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Default, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Pad0, 16, 16, 4, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Pad0, 32, 32, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Pad0, 16, 16, 2, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Pad0, 16, 16, 1, 2, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Pad0, 32, 32, 1, 1, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Pad0, 16, 16, 1, 4, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Pad0, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 128, 16, 64, Filter1x1Stride1Pad0, 16, 16, 4, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 32, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 64, 16, 64, Filter1x1Stride1Pad0, 16, 16, 2, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 64, 64, Filter1x1Stride1Pad0, 16, 16, 1, 2, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 64, 64, Filter1x1Stride1Pad0, 32, 32, 1, 1, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 128, 64, Filter1x1Stride1Pad0, 16, 16, 1, 4, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 64, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> \ No newline at end of file diff --git a/experimental/builder/test/profiling/generate_instances.py b/experimental/builder/test/profiling/generate_instances.py new file mode 100644 index 00000000000..d780d45c190 --- /dev/null +++ b/experimental/builder/test/profiling/generate_instances.py @@ -0,0 +1,235 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +from pathlib import Path + + +class ConvInstanceTemplateParams: + def __init__( + self, + specialization, + tile_size, + warps, + warp_tile, + double_smem_buffer, + num_wave_groups, + pipeline_version, + scheduler, + scalar_per_vector, + num_groups_to_merge, + split_image, + explicit_gemm, + id, + ): + self.specialization = specialization + self.tile_size = tile_size + self.warps = warps + self.warp_tile = warp_tile + self.double_smem_buffer = double_smem_buffer + self.num_wave_groups = num_wave_groups + self.pipeline_version = pipeline_version + self.scheduler = scheduler + self.scalar_per_vector = scalar_per_vector + self.num_groups_to_merge = num_groups_to_merge + self.split_image = split_image + self.explicit_gemm = explicit_gemm + self.id = id + + def get_optimizations(self): + explicit_gemm = "true" if self.explicit_gemm else "false" + split_image = "true" if self.split_image else "false" + num_groups_to_merge = str(self.num_groups_to_merge) + return f"ckt::TileOptimizations{{.num_groups_to_merge = {num_groups_to_merge}, .split_image = {split_image}, .explicit_gemm = {explicit_gemm}}}" + + def get_specialization(self): + namespace = "ckb::TileConvSpecialization::" + if self.specialization == "Default" or self.specialization == "OddC": + return namespace + "DEFAULT" + if self.specialization == "Filter1x1Pad0": + return namespace + "FILTER_1X1_PAD0" + if self.specialization == "Filter1x1Stride1Pad0": + return namespace + "FILTER_1X1_STRIDE1_PAD0" + if self.specialization == "Filter3x3": + return namespace + "FILTER_3x3" + else: + raise RuntimeError("not supported specialization") + + def get_thread_block(self): + return f"ckt::TileThreadBlock{{.tile_size = {{.m = {self.tile_size[0]}, .n = {self.tile_size[1]}, .k = {self.tile_size[2]}}}}}" + + def get_block_gemm_desc(self): + double_smem_buffer = "true" if self.double_smem_buffer else "false" + pipeline_version = self.pipeline_version[-1:] + scheduler = ( + "INTRAWAVE" if self.scheduler.find("Intrawave") != -1 else "INTERWAVE" + ) + return f"""ckt::TileBlockGemm{{ + .warps = {{.m = {self.warps[0]}, .n = {self.warps[1]}, .k = {self.warps[2]}}}, + .warp_tile = {{.m = {self.warp_tile[0]}, .n = {self.warp_tile[1]}, .k = {self.warp_tile[2]}}}, + .double_smem_buffer = {double_smem_buffer}, + .num_wave_groups = {self.num_wave_groups}, + .pipeline_version = ckb::PipelineVersion::V{pipeline_version}, + .scheduler = ckb::PipelineScheduler::{scheduler}}}""" + + def get_block_transfer(self): + return f"""ckt::TileTransfer{{.a_scalar_per_vector = {self.scalar_per_vector[0]}, + .b_scalar_per_vector = {self.scalar_per_vector[1]}, .c_scalar_per_vector = {self.scalar_per_vector[2]}}}""" + + +def get_dtype(problem_name): + if problem_name.find("fp32") != -1: + return "float" + if problem_name.find("fp16") != -1: + return "ck_tile::half_t" + if problem_name.find("bf16") != -1: + return "ck_tile::bf16_t" + else: + raise RuntimeError("wrong dtype") + + +def generate_calls_inc(instances, problem_name, direction): + with open(problem_name + "_calls.inc", "w") as f: + for instance in instances: + instance_name = problem_name + "_" + str(instance.id) + f.write(f"run_alg(run_{instance_name});\n") + + +def generate_defs_inc(instances, problem_name, signature, direction): + with open(problem_name + ".inc", "w") as f: + for instance in instances: + instance_name = problem_name + "_" + str(instance.id) + f.write( + f"std::tuple run_{instance_name}(\n" + f" const ckt::Args<{signature}>& args,\n" + f" const ckt::Inputs<{signature}>& inputs,\n" + f" const ckt::Outputs<{signature}>& outputs,\n" + f" const ck_tile::stream_config& s_conf);\n" + ) + + +def generate_fwd_cpp(instances, problem_name, config, direction): + for instance in instances: + instance_name = problem_name + "_" + str(instance.id) + directory_path = Path(f"src/{config}") + directory_path.mkdir(parents=True, exist_ok=True) + with open(f"src/{config}/{instance_name}.cpp", "w") as f: + f.write( + f"// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.\n" + f"// SPDX-License-Identifier: MIT\n" + f'#include "../instance_includes.inc"\n' + f'#include "../{problem_name}_signature.inc"\n' + f"namespace ck_tile::builder::profiling {{\n" + f"std::tuple run_{instance_name}(\n" + f" const ckt::Args& args,\n" + f" const ckt::Inputs& inputs,\n" + f" const ckt::Outputs& outputs,\n" + f" const ck_tile::stream_config& s_conf) {{\n" + ) + + f.write( + f"constexpr auto ALGORITHM = cku::ConvAlgorithm_Tile_GroupedConvolutionKernel{{}}\n" + f" .with_tile_specializations({instance.get_specialization()})\n" + f" .with_tile_thread_block({instance.get_thread_block()})\n" + f" .with_tile_block_gemm({instance.get_block_gemm_desc()})\n" + f" .with_tile_transfer({instance.get_block_transfer()})\n" + f" .with_tile_optimizations(\n" + f" {instance.get_optimizations()});\n" + ) + + f.write( + '#include "../instance_run.inc"\n' + "}\n" + "} // namespace ck_tile::builder::profiling\n" + ) + + +def parse_fwd_instances(instances, problem_name): + convs = [] + for instance_id, instance in enumerate(instances): + if instance.find("#") != -1 or instance.find(";") != -1: + continue + instance_args_list = instance[instance.find("<") + 1 : instance.find(">")] + args = instance_args_list.split(", ") + + block_size = int(args[0]) + m_per_block = int(args[1]) + n_per_block = int(args[2]) + k_per_block = int(args[3]) + spec = args[4] + m_per_xdl = int(args[5]) + n_per_xdl = int(args[6]) + m_xdl_per_wave = int(args[7]) + n_xdl_per_wave = int(args[8]) + a_scalar_per_vector = int(args[9]) + b_scalar_per_vector = int(args[10]) + c_scalar_per_vector = int(args[11]) + if len(args) == 15: + num_groups_to_merge = int(args[14]) + elif len(args) != 16 and len(args) != 14: + raise RuntimeError("wrong number of parameters") + else: + num_groups_to_merge = 1 + split_image = instance.find("Large") != -1 + double_smem_buffer = instance.find("BlkGemmPipelineVersion: v4") != -1 + num_wave_groups = 2 if instance.find("BlkGemmPipelineVersion: v5") != -1 else 1 + scheduler = ( + "Intrawave" if instance.find("BlkGemmPipelineScheduler") == -1 else args[14] + ) + pipeline_version = ( + "v1" if instance.find("BlkGemmPipelineVersion") == -1 else args[15] + ) + + m_warp = int(m_per_block / (m_per_xdl * m_xdl_per_wave)) + n_warp = int(n_per_block / (n_per_xdl * n_xdl_per_wave)) + warp_size = 64 + k_warp = int(block_size / (warp_size * m_warp * n_warp)) + dtype = get_dtype(problem_name) + # TODO: Make it more flexible + # k_per_xdl = f"ck_tile::get_k_warp_tile<{dtype}, {m_per_xdl}>()" + k_per_xdl = 8 if dtype == "float" else 16 + + conv = ConvInstanceTemplateParams( + spec, + [m_per_block, n_per_block, k_per_block], + [m_warp, n_warp, k_warp], + [m_per_xdl, n_per_xdl, k_per_xdl], + double_smem_buffer, + num_wave_groups, + pipeline_version, + scheduler, + [a_scalar_per_vector, b_scalar_per_vector, c_scalar_per_vector], + num_groups_to_merge, + split_image, + False, + instance_id, + ) + convs.append(conv) + return convs + + +def generate_instances_fwd(instances, problem_name, config): + direction = "forward" + instances = parse_fwd_instances(instances, problem_name) + generate_calls_inc(instances, problem_name, direction) + generate_defs_inc( + instances, problem_name, f"SIGNATURE_{config.upper()}_FWD", direction + ) + generate_fwd_cpp(instances, problem_name, config, direction) + + +if __name__ == "__main__": + fwd_configs = [ + "nhwgc_fp32", + "nhwgc_fp16", + "nhwgc_bf16", + "ndhwgc_fp32", + "ndhwgc_fp16", + "ndhwgc_bf16", + ] + for config in fwd_configs: + instances = [] + config_path = f"configs/{config}.conf" + with open(config_path, "r") as file: + instances = file.readlines() + problem_name = f"grouped_convolution_forward_tile_{config}" + generate_instances_fwd(instances, problem_name, config) diff --git a/experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp b/experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp new file mode 100644 index 00000000000..948470e5166 --- /dev/null +++ b/experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp @@ -0,0 +1,201 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include "../utils/ckb_conv_tile_test_configs.hpp" +#include "../utils/ckb_conv_test_utils.hpp" +#include "../utils/conv_algorithm_type_utils.hpp" +#include "ck_tile/builder/testing/conv_fwd_ck.hpp" + +#include "grouped_convolution_signatures.hpp" + +namespace ck_tile::builder::profiling { + +namespace ckb = ck_tile::builder; +namespace ckt = ck_tile::builder::test; + +/// @brief `run_grouped_conv_forward_tile_algs()` run all grouped conv fwd instances. +/// +/// @tparam SIGNATURE Forward convolution signature. +/// +/// @see run_grouped_conv_forward_tile_algs() +template +std::tuple +run_grouped_conv_forward_tile_algs(const ckt::Args& args, + const ckt::Inputs& inputs, + const ckt::Outputs& outputs, + const ck_tile::stream_config& s_conf); + +#include "grouped_convolution_forward_tile_nhwgc_fp32.inc" +#include "grouped_convolution_forward_tile_nhwgc_bf16.inc" +#include "grouped_convolution_forward_tile_nhwgc_fp16.inc" +#include "grouped_convolution_forward_tile_ndhwgc_fp32.inc" +#include "grouped_convolution_forward_tile_ndhwgc_bf16.inc" +#include "grouped_convolution_forward_tile_ndhwgc_fp16.inc" + +template <> +std::tuple run_grouped_conv_forward_tile_algs( + const ckt::Args& args, + const ckt::Inputs& inputs, + const ckt::Outputs& outputs, + const ck_tile::stream_config& s_conf) +{ + float best_avg_time = std::numeric_limits::max(); + std::string best_op_name, op_name; + float avg_time; + + auto run_alg = [&](auto&& run_alg_func) { + std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); + if(avg_time > 0.f) + { + best_avg_time = std::min(best_avg_time, avg_time); + best_op_name = best_avg_time < avg_time ? best_op_name : op_name; + std::cout << "Perf: " << std::setw(10) << avg_time << " ms,"; + } + std::cout << " " << op_name << std::endl; + }; + +#include "grouped_convolution_forward_tile_nhwgc_fp32_calls.inc" + + return std::make_tuple(best_avg_time, best_op_name); +} + +template <> +std::tuple run_grouped_conv_forward_tile_algs( + const ckt::Args& args, + const ckt::Inputs& inputs, + const ckt::Outputs& outputs, + const ck_tile::stream_config& s_conf) +{ + float best_avg_time = std::numeric_limits::max(); + std::string best_op_name, op_name; + float avg_time; + + auto run_alg = [&](auto&& run_alg_func) { + std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); + if(avg_time > 0.f) + { + best_avg_time = std::min(best_avg_time, avg_time); + best_op_name = best_avg_time < avg_time ? best_op_name : op_name; + std::cout << "Perf: " << std::setw(10) << avg_time << " ms,"; + } + std::cout << " " << op_name << std::endl; + }; + +#include "grouped_convolution_forward_tile_nhwgc_bf16_calls.inc" + + return std::make_tuple(best_avg_time, best_op_name); +} + +template <> +std::tuple run_grouped_conv_forward_tile_algs( + const ckt::Args& args, + const ckt::Inputs& inputs, + const ckt::Outputs& outputs, + const ck_tile::stream_config& s_conf) +{ + float best_avg_time = std::numeric_limits::max(); + std::string best_op_name, op_name; + float avg_time; + + auto run_alg = [&](auto&& run_alg_func) { + std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); + if(avg_time > 0.f) + { + best_avg_time = std::min(best_avg_time, avg_time); + best_op_name = best_avg_time < avg_time ? best_op_name : op_name; + std::cout << "Perf: " << std::setw(10) << avg_time << " ms,"; + } + std::cout << " " << op_name << std::endl; + }; + +#include "grouped_convolution_forward_tile_nhwgc_fp16_calls.inc" + + return std::make_tuple(best_avg_time, best_op_name); +} + +template <> +std::tuple run_grouped_conv_forward_tile_algs( + const ckt::Args& args, + const ckt::Inputs& inputs, + const ckt::Outputs& outputs, + const ck_tile::stream_config& s_conf) +{ + float best_avg_time = std::numeric_limits::max(); + std::string best_op_name, op_name; + float avg_time; + + auto run_alg = [&](auto&& run_alg_func) { + std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); + if(avg_time > 0.f) + { + best_avg_time = std::min(best_avg_time, avg_time); + best_op_name = best_avg_time < avg_time ? best_op_name : op_name; + std::cout << "Perf: " << std::setw(10) << avg_time << " ms,"; + } + std::cout << " " << op_name << std::endl; + }; + +#include "grouped_convolution_forward_tile_ndhwgc_fp32_calls.inc" + + return std::make_tuple(best_avg_time, best_op_name); +} + +template <> +std::tuple run_grouped_conv_forward_tile_algs( + const ckt::Args& args, + const ckt::Inputs& inputs, + const ckt::Outputs& outputs, + const ck_tile::stream_config& s_conf) +{ + float best_avg_time = std::numeric_limits::max(); + std::string best_op_name, op_name; + float avg_time; + + auto run_alg = [&](auto&& run_alg_func) { + std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); + if(avg_time > 0.f) + { + best_avg_time = std::min(best_avg_time, avg_time); + best_op_name = best_avg_time < avg_time ? best_op_name : op_name; + std::cout << "Perf: " << std::setw(10) << avg_time << " ms,"; + } + std::cout << " " << op_name << std::endl; + }; + +#include "grouped_convolution_forward_tile_ndhwgc_bf16_calls.inc" + + return std::make_tuple(best_avg_time, best_op_name); +} + +template <> +std::tuple run_grouped_conv_forward_tile_algs( + const ckt::Args& args, + const ckt::Inputs& inputs, + const ckt::Outputs& outputs, + const ck_tile::stream_config& s_conf) +{ + float best_avg_time = std::numeric_limits::max(); + std::string best_op_name, op_name; + float avg_time; + + auto run_alg = [&](auto&& run_alg_func) { + std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); + if(avg_time > 0.f) + { + best_avg_time = std::min(best_avg_time, avg_time); + best_op_name = best_avg_time < avg_time ? best_op_name : op_name; + std::cout << "Perf: " << std::setw(10) << avg_time << " ms,"; + } + std::cout << " " << op_name << std::endl; + }; + +#include "grouped_convolution_forward_tile_ndhwgc_fp16_calls.inc" + + return std::make_tuple(best_avg_time, best_op_name); +} + +} // namespace ck_tile::builder::profiling diff --git a/experimental/builder/test/profiling/grouped_convolution_signatures.hpp b/experimental/builder/test/profiling/grouped_convolution_signatures.hpp new file mode 100644 index 00000000000..b6c3177ded9 --- /dev/null +++ b/experimental/builder/test/profiling/grouped_convolution_signatures.hpp @@ -0,0 +1,72 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include "../utils/ckb_conv_tile_test_configs.hpp" +#include "../utils/ckb_conv_test_utils.hpp" +#include "../utils/conv_algorithm_type_utils.hpp" +#include "ck_tile/builder/testing/conv_fwd_ck.hpp" + +namespace ck_tile::builder::profiling { + +namespace ckb = ck_tile::builder; +namespace ckt = ck_tile::builder::test; + +constexpr auto SIGNATURE_NHWGC_FP32_FWD = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP32, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + +constexpr auto SIGNATURE_NHWGC_BF16_FWD = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::BF16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + +constexpr auto SIGNATURE_NHWGC_FP16_FWD = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + +constexpr auto SIGNATURE_NDHWGC_FP32_FWD = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP32, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + +constexpr auto SIGNATURE_NDHWGC_BF16_FWD = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::BF16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + +constexpr auto SIGNATURE_NDHWGC_FP16_FWD = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + +} // namespace ck_tile::builder::profiling diff --git a/experimental/builder/test/profiling/profile_ckb_tile_conv_fwd.cpp b/experimental/builder/test/profiling/profile_ckb_tile_conv_fwd.cpp new file mode 100644 index 00000000000..902a45dc09d --- /dev/null +++ b/experimental/builder/test/profiling/profile_ckb_tile_conv_fwd.cpp @@ -0,0 +1,229 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include + +#include "../utils/ckb_conv_tile_test_configs.hpp" +#include "../utils/ckb_conv_test_utils.hpp" +#include "../utils/conv_algorithm_type_utils.hpp" +#include "ck_tile/builder/testing/conv_fwd_ck.hpp" +#include "ck_tile/host/device_prop.hpp" +#include "grouped_convolution_forward_tile_algs.hpp" + +namespace { + +enum struct ConvLayout +{ + GNHWC_GKYXC_GNHWK, // 0 + NHWGC_GKYXC_NHWGK, // 1 + NGCHW_GKYXC_NGKHW, // 2 + NGCHW_GKCYX_NGKHW, // 3 +}; + +enum struct ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + INT8_INT8_INT8, // 3 + F8_F8_F8, // 4 + BF8_BF8_F8, // 5 + F8_BF8_F8, // 6 + BF8_F8_F8, // 7 + F32_F32_F32_TF32, // 8 +}; + +enum struct IndexType +{ + INDEX_T, // 0 + LONG_INDEX_T, // 1 +}; + +static void print_helper_msg() +{ + std::cout + // clang-format off + << "arg1: tensor operation (grouped_conv_fwd : Grouped Convolution Forward)\n" + << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" + << " 1: Input fp16, Weight fp16, Output fp16\n" + << " 2: Input bf16, Weight bf16, Output bf16\n" + << " 3: Input int8, Weight int8, Output int8\n" + << " 4: Input fp8, Weight fp8, Output fp8\n" + << " 5: Input bf8, Weight bf8, Output fp8\n" + << " 6: Input fp8, Weight bf8, Output fp8\n" + << " 7: Input bf8, Weight fp8, Output fp8\n" + << " 8: Input fp32, Weight fp32, Output fp32, Compute tf32)\n" + << "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n" + << " 1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K]\n" + << " 2: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, " + "G, K, Ho, Wo]\n" + << " 3: Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, " + "G, K, Ho, Wo])\n" + << "arg4: indexing data type (0: 32-bit, 1: 64-bit)\n" + << "arg5: verification (0: no, 1: yes)\n" + << "arg6: initialization (0: no init, 1: integer value, 2: decimal value)\n" + << "arg7: print tensor value (0: no; 1: yes)\n" + << "arg8: time kernel (0: no, 1: yes)\n" + << "Following arguments (depending on number of spatial dims):\n" + << " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n" + << " G, N, K, C, \n" + << " , (ie Y, X for 2D)\n" + << " , (ie Hi, Wi for 2D)\n" + << " , (ie Sy, Sx for 2D)\n" + << " , (ie Dy, Dx for 2D)\n" + << " , (ie LeftPy, LeftPx for 2D)\n" + << " , (ie RightPy, RightPx for 2D)\n" << std::endl; + // clang-format on +} + +namespace ckb = ck_tile::builder; +namespace ckt = ck_tile::builder::test; +namespace cku = ck_tile::builder::test_utils; +namespace ckp = ck_tile::builder::profiling; + +template +int profile(const ckt::Args& args, bool time_kernel) +{ + auto inputs = alloc_inputs(args); + auto outputs = alloc_outputs(args); + + std::cout << args.make_input_descriptor() << std::endl; + std::cout << args.make_weight_descriptor() << std::endl; + std::cout << args.make_output_descriptor() << std::endl; + float avg_time; + std::string op_name; + std::tie(avg_time, op_name) = ckp::run_grouped_conv_forward_tile_algs( + args, inputs.get(), outputs.get(), ck_tile::stream_config{nullptr, time_kernel}); + if(time_kernel) + { + std::cout << "Best configuration parameters:" << "\nname: " << op_name + << "\navg_time: " << avg_time << std::endl; + } + return 0; +} + +} // namespace + +int main(int argc, char* argv[]) +{ + // 8 for control, 1 for num_dim_spatial + if(argc < 10) + { + print_helper_msg(); + return 1; + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const auto index_type = static_cast(std::stoi(argv[4])); + // TODO: Add support + [[maybe_unused]] const bool do_verification = std::stoi(argv[5]); + // TODO: Add support + [[maybe_unused]] const int init_method = std::stoi(argv[6]); + // TODO: Add support + [[maybe_unused]] const bool do_log = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[8]); + const int num_dim_spatial = std::stoi(argv[9]); + + // 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial + if(argc != 9 + 1 + 4 + 6 * num_dim_spatial) + { + print_helper_msg(); + return 1; + } + + if(index_type == IndexType::LONG_INDEX_T) + { + std::cout << "this indexing data type is not implemented" << std::endl; + return 1; + } + + if(layout == ConvLayout::NHWGC_GKYXC_NHWGK) + { + if(num_dim_spatial == 2) + { + if(data_type == ConvDataType::F32_F32_F32) + { + constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP32, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + return profile(ckt::parse_conv_args(10, argv), time_kernel); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + return profile(ckt::parse_conv_args(10, argv), time_kernel); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::BF16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + return profile(ckt::parse_conv_args(10, argv), time_kernel); + } + } + else if(num_dim_spatial == 3) + { + if(data_type == ConvDataType::F32_F32_F32) + { + constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP32, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + return profile(ckt::parse_conv_args(10, argv), time_kernel); + } + else if(data_type == ConvDataType::F16_F16_F16) + { + constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + return profile(ckt::parse_conv_args(10, argv), time_kernel); + } + else if(data_type == ConvDataType::BF16_BF16_BF16) + { + constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::BF16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + return profile(ckt::parse_conv_args(10, argv), time_kernel); + } + } + } + + std::cout << "this data_type & layout is not implemented" << std::endl; + + return 1; +} diff --git a/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_bf16_signature.inc b/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_bf16_signature.inc new file mode 100644 index 00000000000..e92c5cfb217 --- /dev/null +++ b/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_bf16_signature.inc @@ -0,0 +1,12 @@ +namespace { + +constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::BF16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + +} // namespace diff --git a/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_fp16_signature.inc b/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_fp16_signature.inc new file mode 100644 index 00000000000..03cd81e8683 --- /dev/null +++ b/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_fp16_signature.inc @@ -0,0 +1,12 @@ +namespace { + +constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + +} // namespace diff --git a/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_fp32_signature.inc b/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_fp32_signature.inc new file mode 100644 index 00000000000..08ca9275e29 --- /dev/null +++ b/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_fp32_signature.inc @@ -0,0 +1,12 @@ +namespace { + +constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP32, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + +} // namespace diff --git a/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_bf16_signature.inc b/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_bf16_signature.inc new file mode 100644 index 00000000000..908886e9b05 --- /dev/null +++ b/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_bf16_signature.inc @@ -0,0 +1,12 @@ +namespace { + +constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::BF16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + +} // namespace diff --git a/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_fp16_signature.inc b/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_fp16_signature.inc new file mode 100644 index 00000000000..b2698bceef5 --- /dev/null +++ b/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_fp16_signature.inc @@ -0,0 +1,12 @@ +namespace { + +constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + +} // namespace diff --git a/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_fp32_signature.inc b/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_fp32_signature.inc new file mode 100644 index 00000000000..56654b042c8 --- /dev/null +++ b/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_fp32_signature.inc @@ -0,0 +1,12 @@ +namespace { + +constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP32, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + +} // namespace diff --git a/experimental/builder/test/profiling/src/instance_includes.inc b/experimental/builder/test/profiling/src/instance_includes.inc new file mode 100644 index 00000000000..81a900de3e9 --- /dev/null +++ b/experimental/builder/test/profiling/src/instance_includes.inc @@ -0,0 +1,8 @@ +#include "../../utils/ckb_conv_tile_test_configs.hpp" +#include "../../utils/ckb_conv_test_utils.hpp" +#include "../../utils/conv_algorithm_type_utils.hpp" +#include "ck_tile/builder/testing/conv_fwd_ck.hpp" + +namespace ckb = ck_tile::builder; +namespace ckt = ck_tile::builder::test; +namespace cku = ck_tile::builder::test_utils; diff --git a/experimental/builder/test/profiling/src/instance_run.inc b/experimental/builder/test/profiling/src/instance_run.inc new file mode 100644 index 00000000000..68be987845b --- /dev/null +++ b/experimental/builder/test/profiling/src/instance_run.inc @@ -0,0 +1,7 @@ + +using Builder = ckb::ConvBuilder; +using Instance = Builder::Instance; + +auto conv = Instance{}; +return std::make_tuple(ckt::run_tile(conv, args, inputs, outputs, s_conf), + conv.GetInstanceString()); diff --git a/experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp b/experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp index 377234dd19a..e7942c4b10f 100644 --- a/experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp +++ b/experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp @@ -3,8 +3,8 @@ #pragma once -#include "impl/conv_algorithm_types.hpp" -#include "impl/conv_signature_types.hpp" +#include "../impl/conv_algorithm_types.hpp" +#include "../impl/conv_signature_types.hpp" #include "ck_tile/builder/conv_builder.hpp" namespace ck_tile::builder::test_utils { diff --git a/include/ck/library/utility/host_tensor.hpp b/include/ck/library/utility/host_tensor.hpp index 05bc4ded124..1dda0a48639 100644 --- a/include/ck/library/utility/host_tensor.hpp +++ b/include/ck/library/utility/host_tensor.hpp @@ -298,9 +298,12 @@ struct HostTensorDescriptor if constexpr(!(std::is_same_v || std::is_same_v)) { - std::cerr << "Only RowMajor and ColumnMajor layouts are supported for empty " - "strides, got " - << layout << ". Will calculate strides as RowMajor." << std::endl; + if(dbg) + { + std::cerr << "Only RowMajor and ColumnMajor layouts are supported for empty " + "strides, got " + << layout << ". Will calculate strides as RowMajor." << std::endl; + } } mStrides.clear(); @@ -443,9 +446,14 @@ struct HostTensorDescriptor { // TBD: implement verification for Conv layouts // For now, just print warning and return - std::cerr << "Warning: Tensor layout verification for ck::tensor_layout::convolution " - "layouts is not supported yet. Skipping..." - << std::endl; + if(dbg) + { + + std::cerr + << "Warning: Tensor layout verification for ck::tensor_layout::convolution " + "layouts is not supported yet. Skipping..." + << std::endl; + } return; } else diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp index 936c38ddf33..9b7213837a1 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp @@ -90,6 +90,8 @@ struct GemmPipelineAGmemBGmemCRegV1 : public BaseGemmPipelineAGmemBGmemCRegV1>; +template +using WarpGemmMfmaF32F32F32M16N16K8 = WarpGemmImpl, + 2, + AttrNumAccess>>; + +template +using WarpGemmMfmaF32F32F32M32N32K8 = WarpGemmImpl, + 4, + AttrNumAccess>>; + template using WarpGemmMfmaF32F32F32M16N16K16TransposedCDistribution = WarpGemmImpl struct Dispatcher { using Type = WarpGemmMfmaF32F32F32M16N16K4; }; template<> struct Dispatcher { using Type = WarpGemmMfmaF32F32F32M16N16K16<>; }; +template<> struct Dispatcher { using Type = WarpGemmMfmaF32F32F32M16N16K8<>; }; +template<> struct Dispatcher { using Type = WarpGemmMfmaF32F32F32M32N32K8<>; }; template<> struct Dispatcher { using Type = WarpGemmMfmaF32F32F32M16N16K16TransposedCDistribution<>; }; // fp16 // ADataType, BDataType, AccDataType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp index 0f143d7ff7f..ea5e543362e 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp @@ -723,8 +723,11 @@ struct GroupedConvolutionForwardKernel if constexpr(GroupedConvTraitsType_::ExplicitGemm && ConvSpecialization != ConvolutionSpecialization::Filter1x1Stride1Pad0) { - CK_TILE_ERROR( - "Explicit Gemm is supported only for Filter1x1Stride1Pad0 specialization!"); + if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) + { + CK_TILE_ERROR( + "Explicit Gemm is supported only for Filter1x1Stride1Pad0 specialization!"); + } return false; } @@ -736,13 +739,19 @@ struct GroupedConvolutionForwardKernel // Check access per C if(ConvC % GroupedConvTraitsType_::VectorSizeA != 0) { - CK_TILE_ERROR("Conv C is not a multiple of vector load size for input image!"); + if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) + { + CK_TILE_ERROR("Conv C is not a multiple of vector load size for input image!"); + } return false; } } else { - CK_TILE_ERROR("Not supported input layout!"); + if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) + { + CK_TILE_ERROR("Not supported input layout!"); + } return false; } @@ -754,13 +763,19 @@ struct GroupedConvolutionForwardKernel { if(ConvC % GroupedConvTraitsType_::VectorSizeB != 0) { - CK_TILE_ERROR("Conv C is not a multiple of vector load size for weight!"); + if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) + { + CK_TILE_ERROR("Conv C is not a multiple of vector load size for weight!"); + } return false; } } else { - CK_TILE_ERROR("Not supported weight layout!"); + if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) + { + CK_TILE_ERROR("Not supported weight layout!"); + } return false; } @@ -771,13 +786,20 @@ struct GroupedConvolutionForwardKernel { if(ConvK % GroupedConvTraitsType_::VectorSizeC != 0) { - CK_TILE_ERROR("Conv K is not a multiple of vector store size for output image!"); + if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) + { + CK_TILE_ERROR( + "Conv K is not a multiple of vector store size for output image!"); + } return false; } } else { - CK_TILE_ERROR("Not supported output layout!"); + if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) + { + CK_TILE_ERROR("Not supported output layout!"); + } return false; } @@ -786,7 +808,10 @@ struct GroupedConvolutionForwardKernel const index_t ConvG = kargs.wei_g_k_c_xs_lengths[number<0>{}]; if(ConvG % GroupedConvTraitsType_::NumGroupsToMerge != 0) { - CK_TILE_ERROR("ConvG must be a multiple of NumGroupsToMerge!"); + if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) + { + CK_TILE_ERROR("ConvG must be a multiple of NumGroupsToMerge!"); + } return false; } } From 02151f048855542fd8617f543b4b5d3bfcc1a748 Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Wed, 7 Jan 2026 13:32:49 +0000 Subject: [PATCH 02/24] [CK TILE] Fix grouped conv kernels splitk and double lds --- ...ouped_convolution_backward_data_kernel.hpp | 138 +++-------------- ...ped_convolution_backward_weight_kernel.hpp | 105 ++----------- .../grouped_convolution_forward_kernel.hpp | 141 +++--------------- 3 files changed, 54 insertions(+), 330 deletions(-) diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp index 2e5f536ab77..a0ade4d3182 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_data_kernel.hpp @@ -1036,84 +1036,16 @@ struct GroupedConvolutionBackwardDataKernel } else { - auto c_block_window = MakeCBlockWindow( - c_ptr, kargs, group_id, block_idx_m, block_idx_n); - - EpiloguePipeline{} - .template operator()( - c_block_window, c_block_tile, d_block_window, smem_ptr_0); - } - } - - /** - * @brief Runs single GEMM problem cooperatively by whole workgroup. - * - * @note RunGemm2LDS in with two shared memory buffers using the ping pong buffer mechanism. - * - * @param a_ptr input A pointer - * @param b_ptr input B pointer - * @param c_ptr output C pointer - * @param smem_ptr_0 The starting pointer of 1st shared memory block. - * @param smem_ptr_1 The starting pointer of 2nd shared memory block. - * @param kargs Grouped Convolution Backward Data kernel arguments - * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup. - * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup. - * - */ - CK_TILE_DEVICE static void RunGemm2LDS(const OutDataType* a_ptr, - const InDataType* b_ptr, - const std::array& ds_ptr, - WeiDataType* c_ptr, - void* __restrict__ smem_ptr_0, - void* __restrict__ smem_ptr_1, - const GroupedConvBwdDataKernelArgsSpecialized& kargs, - const index_t splitted_k, - const index_t block_idx_m, - const index_t block_idx_n, - const index_t block_idx_k, - const index_t group_id) - { - // Create block windows using specialized methods - const auto& a_block_window = - MakeABlockWindow(a_ptr, kargs, group_id, block_idx_m, block_idx_k); - const auto& b_block_window = - MakeBBlockWindow(b_ptr, kargs, group_id, block_idx_n, block_idx_k); - const auto& d_block_window = - MakeDBlockWindows(ds_ptr, kargs, group_id, block_idx_m, block_idx_n); - - const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitted_k)); - const bool has_hot_loop = GemmPipeline::BlockHasHotloop(num_loop); - const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop); - - // Run GEMM cooperatively by whole workgroup. - const auto& c_block_tile = GemmPipeline{}.template operator()(a_block_window, - b_block_window, - num_loop, - has_hot_loop, - tail_num, - smem_ptr_0, - smem_ptr_1); - - const index_t k_batch = amd_wave_read_first_lane(kargs.k_batch); - - // Run Epilogue Pipeline with k_batch dispatch - if(k_batch == 1) - { - auto c_block_window = MakeCBlockWindow( - c_ptr, kargs, group_id, block_idx_m, block_idx_n); - - EpiloguePipeline{} - .template operator()( - c_block_window, c_block_tile, d_block_window, smem_ptr_0); - } - else - { - auto c_block_window = MakeCBlockWindow( - c_ptr, kargs, group_id, block_idx_m, block_idx_n); + if constexpr(!(GroupedConvTraitsType_::VectorSizeC % 2 != 0 && + is_any_of::value)) + { + auto c_block_window = MakeCBlockWindow( + c_ptr, kargs, group_id, block_idx_m, block_idx_n); - EpiloguePipeline{} - .template operator()( - c_block_window, c_block_tile, d_block_window, smem_ptr_0); + EpiloguePipeline{} + .template operator()( + c_block_window, c_block_tile, d_block_window, smem_ptr_0); + } } } @@ -1195,46 +1127,18 @@ struct GroupedConvolutionBackwardDataKernel static_cast(kargs.in_ptr) + group_offset_c + input_batch_offset; // allocate LDS - __shared__ char smem_ptr_0[GetSmemSize()]; - - if constexpr(GemmPipeline::DoubleSmemBuffer == true) - { - __shared__ char smem_ptr_1[GemmPipeline::GetSmemSize()]; - if constexpr(!(GroupedConvTraitsType_::VectorSizeC % 2 != 0 && - is_any_of::value)) - { - RunGemm2LDS(a_ptr, - b_ptr, - kargs.ds_ptr, - c_ptr, - smem_ptr_0, - smem_ptr_1, - kargs, - splitted_k, - i_m, - i_n, - i_k, - group_id); - } - } - else - { - if constexpr(!(GroupedConvTraitsType_::VectorSizeC % 2 != 0 && - is_any_of::value)) - { - RunGemm(a_ptr, - b_ptr, - kargs.ds_ptr, - c_ptr, - smem_ptr_0, - kargs, - splitted_k, - i_m, - i_n, - i_k, - group_id); - } - } + __shared__ char smem_ptr[GetSmemSize()]; + RunGemm(a_ptr, + b_ptr, + kargs.ds_ptr, + c_ptr, + smem_ptr, + kargs, + splitted_k, + i_m, + i_n, + i_k, + group_id); } }; diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp index 6bcd05e9ba2..916f7a96ab8 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_backward_weight_kernel.hpp @@ -829,66 +829,14 @@ struct GroupedConvolutionBackwardWeightKernel } else { - auto c_block_window = MakeCBlockWindow( - c_ptr, kargs, block_idx_m, block_idx_n); - - EpiloguePipeline{}(c_block_window, c_block_tile, d_block_window, smem_ptr_0); - } - } - - /** - * @brief Runs single GEMM problem cooperatively by whole workgroup. - * - * @note RunGEMM2LDS in with two shared memory buffers using the ping pong buffer mechanism. - * - * @param a_ptr input A pointer - * @param b_ptr input B pointer - * @param c_ptr output C pointer - * @param smem_ptr_0 The starting pointer of 1st shared memory block. - * @param smem_ptr_1 The starting pointer of 2nd shared memory block. - * @param kargs Grouped Convolution Backward Weight kernel arguments - * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup. - * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup. - * - */ - CK_TILE_DEVICE static void RunGemm2LDS(const OutDataType* a_ptr, - const InDataType* b_ptr, - const std::array& ds_ptr, - WeiDataType* c_ptr, - void* __restrict__ smem_ptr_0, - void* __restrict__ smem_ptr_1, - const GroupedConvBwdWeightKernelArgsSpecialized& kargs, - const index_t num_loop, - const index_t block_idx_m, - const index_t block_idx_n, - const index_t block_idx_k) - { - // Create block windows using helper methods - const auto& a_block_window = MakeABlockWindow(a_ptr, kargs, block_idx_m, block_idx_k); - const auto& b_block_window = MakeBBlockWindow(b_ptr, kargs, block_idx_n, block_idx_k); - const auto& d_block_window = MakeDBlockWindows(ds_ptr, kargs, block_idx_m, block_idx_n); - - // Run GEMM cooperatively by whole workgroup. - const auto& c_block_tile = GemmPipeline{}.template operator()( - a_block_window, b_block_window, num_loop, smem_ptr_0, smem_ptr_1); - - // Run Epilogue Pipeline with k_batch dispatching - if(kargs.k_batch == 1) - { - auto c_block_window = MakeCBlockWindow( - c_ptr, kargs, block_idx_m, block_idx_n); - - EpiloguePipeline{}(c_block_window, c_block_tile, d_block_window, smem_ptr_0); - } - else - { -#if defined(__gfx11__) - return; -#endif - auto c_block_window = MakeCBlockWindow( - c_ptr, kargs, block_idx_m, block_idx_n); + if constexpr(!(GroupedConvTraitsType_::VectorSizeC % 2 != 0 && + is_any_of::value)) + { + auto c_block_window = MakeCBlockWindow( + c_ptr, kargs, block_idx_m, block_idx_n); - EpiloguePipeline{}(c_block_window, c_block_tile, d_block_window, smem_ptr_0); + EpiloguePipeline{}(c_block_window, c_block_tile, d_block_window, smem_ptr_0); + } } } @@ -949,44 +897,9 @@ struct GroupedConvolutionBackwardWeightKernel const InDataType* b_ptr = static_cast(kargs.in_ptr) + group_offset_b; WeiDataType* c_ptr = static_cast(kargs.wei_ptr) + group_offset_c; - __shared__ char smem_ptr_0[GetSmemSize()]; + __shared__ char smem_ptr[GetSmemSize()]; - if constexpr(GemmPipeline::DoubleSmemBuffer == true) - { - __shared__ char smem_ptr_1[GemmPipeline::GetSmemSize()]; - if constexpr(!(GroupedConvTraitsType_::VectorSizeC % 2 != 0 && - is_any_of::value)) - { - RunGemm2LDS(a_ptr, - b_ptr, - kargs.ds_ptr, - c_ptr, - smem_ptr_0, - smem_ptr_1, - kargs, - num_loop, - i_m, - i_n, - i_k); - } - } - else - { - if constexpr(!(GroupedConvTraitsType_::VectorSizeC % 2 != 0 && - is_any_of::value)) - { - RunGemm(a_ptr, - b_ptr, - kargs.ds_ptr, - c_ptr, - smem_ptr_0, - kargs, - num_loop, - i_m, - i_n, - i_k); - } - } + RunGemm(a_ptr, b_ptr, kargs.ds_ptr, c_ptr, smem_ptr, kargs, num_loop, i_m, i_n, i_k); } } }; diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp index 2751df43ab3..6f0ee2216f7 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp @@ -979,80 +979,16 @@ struct GroupedConvolutionForwardKernel } else { - auto c_block_window = MakeCBlockWindow( - c_ptr, c_desc, block_idx_m, block_idx_n); - - EpiloguePipeline{elfunc} - .template operator()( - c_block_window, c_block_tile, ds_block_window, smem_ptr_0); - } - } - - /** - * @brief Runs single GEMM problem cooperatively by whole workgroup. - * - * @note RunGEMM2LDS in with two shared memory buffers using the ping pong buffer mechanism. - * - * @param a_ptr input A pointer - * @param b_ptr input B pointer - * @param ds_ptr input D tensors pointer array - * @param c_ptr output C pointer - * @param smem_ptr_0 The starting pointer of 1st shared memory block. - * @param smem_ptr_1 The starting pointer of 2nd shared memory block. - * @param a_desc Input tensor A descriptor - * @param b_desc Weight tensor B descriptor - * @param c_desc Output tensor C descriptor - * @param gemm_k The GEMM K dimension - * @param k_batch The K batch parameter for split-K - * @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup. - * @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup. - * - */ - template - CK_TILE_DEVICE static void RunGemm2LDS(const InDataType* a_ptr, - const WeiDataType* b_ptr, - const std::array& ds_ptr, - OutDataType* c_ptr, - void* __restrict__ smem_ptr_0, - void* __restrict__ smem_ptr_1, - const ADescType& a_desc, - const BDescType& b_desc, - const CDescType& c_desc, - const index_t gemm_k, - const index_t k_batch, - const index_t block_idx_m, - const index_t block_idx_n, - const CDElementwise& elfunc) - { - // Create block windows using specialized methods - const auto& a_block_window = MakeABlockWindow(a_ptr, a_desc, block_idx_m); - const auto& b_block_window = MakeBBlockWindow(b_ptr, b_desc, block_idx_n); - const auto& ds_block_window = MakeDBlockWindows(ds_ptr, c_desc, block_idx_m, block_idx_n); - - const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(gemm_k)); - - // Run GEMM cooperatively by whole workgroup. - const auto& c_block_tile = GemmPipeline{}.template operator()( - a_block_window, b_block_window, num_loop, smem_ptr_0, smem_ptr_1); - - // Run Epilogue Pipeline with k_batch dispatching - if(k_batch == 1) - { - auto c_block_window = MakeCBlockWindow( - c_ptr, c_desc, block_idx_m, block_idx_n); - - EpiloguePipeline{elfunc} - .template operator()( - c_block_window, c_block_tile, ds_block_window, smem_ptr_0); - } - else - { - auto c_block_window = MakeCBlockWindow( - c_ptr, c_desc, block_idx_m, block_idx_n); + if constexpr(!(GroupedConvTraitsType_::VectorSizeC % 2 != 0 && + is_any_of::value)) + { + auto c_block_window = MakeCBlockWindow( + c_ptr, c_desc, block_idx_m, block_idx_n); - EpiloguePipeline{elfunc} - .template operator()( - c_block_window, c_block_tile, ds_block_window, smem_ptr_0); + EpiloguePipeline{elfunc} + .template operator()( + c_block_window, c_block_tile, ds_block_window, smem_ptr_0); + } } } @@ -1202,50 +1138,21 @@ struct GroupedConvolutionForwardKernel const auto& c_desc = kargs.c_grid_desc_m_n; // allocate LDS - __shared__ char smem_ptr_0[GetSmemSize()]; - - if constexpr(GemmPipeline::DoubleSmemBuffer == true) - { - __shared__ char smem_ptr_1[GemmPipeline::GetSmemSize()]; - if constexpr(!(GroupedConvTraitsType_::VectorSizeC % 2 != 0 && - is_any_of::value)) - { - RunGemm2LDS(a_ptr, - b_ptr, - ds_ptr_with_offsets, - c_ptr, - smem_ptr_0, - smem_ptr_1, - a_desc, - b_desc, - c_desc, - kargs.GemmK, - kargs.k_batch, - i_m, - i_n, - kargs.elfunc); - } - } - else - { - if constexpr(!(GroupedConvTraitsType_::VectorSizeC % 2 != 0 && - is_any_of::value)) - { - RunGemm(a_ptr, - b_ptr, - ds_ptr_with_offsets, - c_ptr, - smem_ptr_0, - a_desc, - b_desc, - c_desc, - kargs.GemmK, - kargs.k_batch, - i_m, - i_n, - kargs.elfunc); - } - } + __shared__ char smem_ptr[GetSmemSize()]; + + RunGemm(a_ptr, + b_ptr, + ds_ptr_with_offsets, + c_ptr, + smem_ptr, + a_desc, + b_desc, + c_desc, + kargs.GemmK, + kargs.k_batch, + i_m, + i_n, + kargs.elfunc); } } }; From 04ee697049d90c894242fc3caa3f903e2841cc10 Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Wed, 7 Jan 2026 14:42:40 +0000 Subject: [PATCH 03/24] Updates --- .../builder/factory/conv_tile_factory.hpp | 41 +++++---- .../ck_tile/builder/testing/conv_fwd_ck.hpp | 51 ++--------- .../builder/testing/conv_fwd_ck_tile.hpp | 86 +++++++++++++++++++ .../builder/testing/conv_fwd_reference.hpp | 9 +- .../conv/ck_tile/test_ckb_conv_fwd_e2e.cpp | 14 ++- .../builder/test/profiling/CMakeLists.txt | 12 ++- experimental/builder/test/profiling/README.md | 42 +++++++++ .../test/profiling/generate_instances.py | 16 ++-- .../grouped_convolution_forward_tile_algs.hpp | 32 ++++++- .../grouped_convolution_signatures.hpp | 2 +- .../profiling/profile_ckb_tile_conv_fwd.cpp | 2 +- .../test/profiling/src/instance_includes.inc | 2 +- .../test/profiling/src/instance_run.inc | 2 +- .../grouped_convolution_forward_kernel.hpp | 3 +- 14 files changed, 226 insertions(+), 88 deletions(-) create mode 100644 experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp create mode 100644 experimental/builder/test/profiling/README.md diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_tile_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_tile_factory.hpp index 6ce508b47d6..35c87b61cef 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_tile_factory.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_tile_factory.hpp @@ -98,27 +98,26 @@ struct ConvTileFactory using GemmPipeline = typename internal::TilePipelineType< BLOCK_GEMM.pipeline_version>::template GemmPipeline; - using ConvEpilogue = ck_tile::CShuffleEpilogue>; + using ConvEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; using Instance = typename internal::GroupedConvolutionTileKernel; /// @see run() template requires ValidConvSignature && ConvDirectionIsForward -void run(CkConvInstance auto& conv, - const Args& args, - const Inputs& inputs, - const Outputs& outputs) +float run(CkConvInstance auto& conv, + const Args& args, + const Inputs& inputs, + const Outputs& outputs, + [[maybe_unused]] const ck_tile::index_t k_batch = 1, + const ck_tile::stream_config s_conf = {}) { constexpr auto spatial_dim = SIGNATURE.spatial_dim; @@ -148,46 +150,7 @@ void run(CkConvInstance auto& conv, throw std::runtime_error("invalid argument"); } - conv.MakeInvoker().Run(ck_args, {}); -} - -/// @brief `run()` specialization for forward convolution and CK Tile. -/// -/// @tparam SIGNATURE Forward convolution signature. -/// @throws std::runtime_error if the arguments werent actually valid for the -/// operation. This should be caught and reported by the testing framework. -/// -/// @see run() -template - requires ValidConvSignature && ConvDirectionIsForward && - IsCkConvInstance -float run_tile(Conv& conv, - const Args& args, - const Inputs& inputs, - const Outputs& outputs, - const ck_tile::stream_config& s_conf) -{ - const auto param = args.to_ck_tile_conv_param(); - - ck_tile::GroupedConvFwdHostArgs<> host_args( - param, inputs.input, inputs.weight, {}, outputs.output, 1 /*kbatch*/); - - auto kargs = Conv::MakeKernelArgs(host_args); - - const dim3 grids = Conv::GridSize(kargs); - const dim3 blocks = Conv::BlockSize(); - - if(!Conv::IsSupportedArgument(kargs)) - { - std::cout << "Not supported!"; - return 0.f; - } - - constexpr index_t minimum_occupancy = - Conv::GemmPipeline::Scheduler == ck_tile::GemmPipelineScheduler::Intrawave ? 1 : 2; - - return ck_tile::launch_kernel( - s_conf, ck_tile::make_kernel(conv, grids, blocks, 0, kargs)); + return conv.MakeInvoker().Run(ck_args, s_conf); } } // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp new file mode 100644 index 00000000000..36d95180790 --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp @@ -0,0 +1,86 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck_tile/builder/testing/conv_fwd.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp" +#include +#include + +/// This file contains the implementation details for invoking/testing +/// grouped convolution operations in CK Tile. The main item is the +/// `run()` function, which is the main implementation used to invoke +/// CK grouped forward convolution kernels. + +namespace ck_tile::builder::test { + +namespace detail { + +/// @brief Concept for checking whether this is the CK Tile convolution +/// implementation. +/// +/// This is the same as `::ck_tile::builder::test::CkConvInstance`, except +/// with some utility aliases. For that reason, its moved to this detail +/// namespace. +template +concept CkTileConvInstance = requires(Conv&) { + { Conv::BlockSize() }; +}; + +} // namespace detail + +/// @brief Concept for checking whether a convolution is invoked like CK Tile. +/// +/// This concept is used to tell whether a convolution implementation is +/// likely to be an "CK Tile" implementation - that is, whether we should +/// invoke it as an CK Tile kernel. This is mainly used with `run()` to +/// differentiate which implementation that should be invoked. +/// +/// - SIGNATURE is the operation signature. +/// - Conv is a convolution instance created by the CK Builder API. +template +concept CkTileConvInstance = detail::CkTileConvInstance; + +/// @brief `run()` specialization for forward convolution and CK Tile. +/// +/// @tparam SIGNATURE Forward convolution signature. +/// @throws std::runtime_error if the arguments werent actually valid for the +/// operation. This should be caught and reported by the testing framework. +/// +/// @see run() +template + requires ValidConvSignature && ConvDirectionIsForward +float run(CkTileConvInstance auto& conv, + const Args& args, + const Inputs& inputs, + const Outputs& outputs, + const ck_tile::index_t k_batch = 1, + const ck_tile::stream_config s_conf = {}) +{ + using Conv = std::remove_reference_t; + const auto param = args.to_ck_tile_conv_param(); + + ck_tile::GroupedConvFwdHostArgs<> host_args( + param, inputs.input, inputs.weight, {}, outputs.output, k_batch); + + auto kargs = Conv::MakeKernelArgs(host_args); + + const dim3 grids = Conv::GridSize(kargs); + const dim3 blocks = Conv::BlockSize(); + + if(!Conv::IsSupportedArgument(kargs)) + { + std::cout << "Not supported!"; + return 0.f; + } + + constexpr index_t minimum_occupancy = + Conv::GemmPipeline::Scheduler == ck_tile::GemmPipelineScheduler::Intrawave ? 1 : 2; + + return ck_tile::launch_kernel( + s_conf, ck_tile::make_kernel(conv, grids, blocks, 0, kargs)); +} + +} // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp index 85493e32eb1..cf3b256c802 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp @@ -69,10 +69,10 @@ template // for now, just concern outselves with reference and see when the // rest of the bwd/weight plumbing is there. ConvDirectionIsForward -void run(RefConvInstance auto& conv, - const Args& args, - const Inputs& inputs, - const Outputs& outputs) +float run(RefConvInstance auto& conv, + const Args& args, + const Inputs& inputs, + const Outputs& outputs) { // We don't want to compute the output dims manually, just get // them via the existing infrastructure @@ -109,6 +109,7 @@ void run(RefConvInstance auto& conv, param.conv_filter_strides_, param.conv_filter_dilations_, param.input_left_pads_); + return 0.f; } } // namespace ck_tile::builder::test diff --git a/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_e2e.cpp b/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_e2e.cpp index b96c617744b..502c2e82bfa 100644 --- a/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_e2e.cpp +++ b/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_e2e.cpp @@ -4,7 +4,7 @@ #include "utils/ckb_conv_tile_test_configs.hpp" #include "utils/ckb_conv_test_utils.hpp" #include "utils/conv_algorithm_type_utils.hpp" -#include "ck_tile/builder/testing/conv_fwd_ck.hpp" +#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" #include "ck_tile/host/device_prop.hpp" namespace ckb = ck_tile::builder; @@ -29,8 +29,9 @@ constexpr auto ALGORITHM = .with_tile_optimizations(ckt::TileOptimizations{ .num_groups_to_merge = 1, .split_image = false, .explicit_gemm = false}); -using Builder = ckb::ConvBuilder; -using Instance = Builder::Instance; +using Builder = ckb::ConvBuilder; +using Instance = Builder::Instance; +using Reference = ckb::ConvBuilder::Instance; TEST(Fwd2DFp16_CShufV3_NHWGC, EndToEnd) { @@ -70,5 +71,10 @@ TEST(Fwd2DFp16_CShufV3_NHWGC, EndToEnd) auto outputs = alloc_outputs(args); auto conv = Instance{}; - ckt::run_tile(conv, args, inputs.get(), outputs.get()); + ckt::run(conv, args, inputs.get(), outputs.get()); + + auto ref_conv = Reference{}; + ckt::run(ref_conv, args, inputs.get(), reference.get()); + + EXPECT_THAT(outputs.get(), MatchesReference(args, reference.get())); } diff --git a/experimental/builder/test/profiling/CMakeLists.txt b/experimental/builder/test/profiling/CMakeLists.txt index f1267fbdc1a..70e9e016201 100644 --- a/experimental/builder/test/profiling/CMakeLists.txt +++ b/experimental/builder/test/profiling/CMakeLists.txt @@ -2,14 +2,19 @@ # SPDX-License-Identifier: MIT if(GPU_TARGETS MATCHES "gfx94|gfx95") - + # Generate instances using python script + find_package(Python3 COMPONENTS Interpreter Development) + execute_process( + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_instances.py + RESULT_VARIABLE ret + ) + # Find cpp files and create lib for instances file(GLOB_RECURSE GROUPED_CONV_FWD_TILE CONFIGURE_DEPENDS "src/*.cpp") add_instance_library(device_grouped_conv_fwd_tile_instances ${GROUPED_CONV_FWD_TILE}) - + # Add profiler executable set(EXAMPLE_CONV_COMPILE_OPTIONS) list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0 -Wno-global-constructors -Wno-c++20-compat) - add_executable(profile_ckb_tile_conv_fwd profile_ckb_tile_conv_fwd.cpp) target_compile_options(profile_ckb_tile_conv_fwd PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS}) target_compile_features(profile_ckb_tile_conv_fwd PRIVATE cxx_std_20) @@ -18,6 +23,7 @@ if(GPU_TARGETS MATCHES "gfx94|gfx95") "${PROJECT_SOURCE_DIR}/include" "${CMAKE_CURRENT_SOURCE_DIR}/../" ) + # Link with instances target_link_libraries(profile_ckb_tile_conv_fwd PRIVATE utility device_grouped_conv_fwd_tile_instances) endif() diff --git a/experimental/builder/test/profiling/README.md b/experimental/builder/test/profiling/README.md new file mode 100644 index 00000000000..4b6ec887ef4 --- /dev/null +++ b/experimental/builder/test/profiling/README.md @@ -0,0 +1,42 @@ +# Builder profiler for Convolution + +This directory contains the profiler created with builder for CK Tile. + + +## Overview + +Instances are generated using `generate_instances.py`. This script is called with cmake files generation. Interface is the same as for ckProfiler. Example of the usage: +```bash +# arg1: tensor operation (grouped_conv_fwd : Grouped Convolution Forward) +# arg2: data type (0: Input fp32, Weight fp32, Output fp32 +# 1: Input fp16, Weight fp16, Output fp16 +# 2: Input bf16, Weight bf16, Output bf16 +# 3: Input int8, Weight int8, Output int8 +# 4: Input fp8, Weight fp8, Output fp8 +# 5: Input bf8, Weight bf8, Output fp8 +# 6: Input fp8, Weight bf8, Output fp8 +# 7: Input bf8, Weight fp8, Output fp8 +# 8: Input fp32, Weight fp32, Output fp32, Compute tf32) +# arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K] +# 1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K] +# 2: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, G, K, Ho, Wo] +# 3: Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, G, K, Ho, Wo]) +# arg4: indexing data type (0: 32-bit, 1: 64-bit) +# arg5: verification (0: no, 1: yes) +# arg6: initialization (0: no init, 1: integer value, 2: decimal value) +# arg7: print tensor value (0: no; 1: yes) +# arg8: time kernel (0: no, 1: yes) +# Following arguments (depending on number of spatial dims): +# Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d) +# G, N, K, C, +# , (ie Y, X for 2D) +# , (ie Hi, Wi for 2D) +# , (ie Sy, Sx for 2D) +# , (ie Dy, Dx for 2D) +# , (ie LeftPy, LeftPx for 2D) +# , (ie RightPy, RightPx for 2D) + + ################ op datatype layout indexing verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx +./bin/profile_ckb_tile_conv_fwd grouped_conv_fwd 1 0 0 1 1 0 1 2 32 4 192 192 3 3 28 28 1 1 1 1 1 1 1 1 + +``` diff --git a/experimental/builder/test/profiling/generate_instances.py b/experimental/builder/test/profiling/generate_instances.py index d780d45c190..1cace163893 100644 --- a/experimental/builder/test/profiling/generate_instances.py +++ b/experimental/builder/test/profiling/generate_instances.py @@ -88,14 +88,16 @@ def get_dtype(problem_name): def generate_calls_inc(instances, problem_name, direction): - with open(problem_name + "_calls.inc", "w") as f: + with open( + f"../experimental/builder/test/profiling/{problem_name}_calls.inc", "w" + ) as f: for instance in instances: instance_name = problem_name + "_" + str(instance.id) f.write(f"run_alg(run_{instance_name});\n") def generate_defs_inc(instances, problem_name, signature, direction): - with open(problem_name + ".inc", "w") as f: + with open(f"../experimental/builder/test/profiling/{problem_name}.inc", "w") as f: for instance in instances: instance_name = problem_name + "_" + str(instance.id) f.write( @@ -110,9 +112,12 @@ def generate_defs_inc(instances, problem_name, signature, direction): def generate_fwd_cpp(instances, problem_name, config, direction): for instance in instances: instance_name = problem_name + "_" + str(instance.id) - directory_path = Path(f"src/{config}") + directory_path = Path(f"../experimental/builder/test/profiling/src/{config}") directory_path.mkdir(parents=True, exist_ok=True) - with open(f"src/{config}/{instance_name}.cpp", "w") as f: + with open( + f"../experimental/builder/test/profiling/src/{config}/{instance_name}.cpp", + "w", + ) as f: f.write( f"// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.\n" f"// SPDX-License-Identifier: MIT\n" @@ -226,9 +231,10 @@ def generate_instances_fwd(instances, problem_name, config): "ndhwgc_fp16", "ndhwgc_bf16", ] + for config in fwd_configs: instances = [] - config_path = f"configs/{config}.conf" + config_path = f"../experimental/builder/test/profiling/configs/{config}.conf" with open(config_path, "r") as file: instances = file.readlines() problem_name = f"grouped_convolution_forward_tile_{config}" diff --git a/experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp b/experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp index 948470e5166..b7ff42f670a 100644 --- a/experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp +++ b/experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp @@ -8,10 +8,12 @@ #include "../utils/ckb_conv_tile_test_configs.hpp" #include "../utils/ckb_conv_test_utils.hpp" #include "../utils/conv_algorithm_type_utils.hpp" -#include "ck_tile/builder/testing/conv_fwd_ck.hpp" - #include "grouped_convolution_signatures.hpp" +#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" + +#include "ck_tile/host.hpp" + namespace ck_tile::builder::profiling { namespace ckb = ck_tile::builder; @@ -47,8 +49,14 @@ std::tuple run_grouped_conv_forward_tile_algs::Instance{}; + ckt::run(ref_conv, args, inputs.get(), reference.get()); + auto run_alg = [&](auto&& run_alg_func) { std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); + ck_tile::check_err(outputs.get(), reference.get()); + if(avg_time > 0.f) { best_avg_time = std::min(best_avg_time, avg_time); @@ -74,6 +82,10 @@ std::tuple run_grouped_conv_forward_tile_algs::Instance{}; + ckt::run(ref_conv, args, inputs.get(), reference.get()); + auto run_alg = [&](auto&& run_alg_func) { std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); if(avg_time > 0.f) @@ -101,6 +113,10 @@ std::tuple run_grouped_conv_forward_tile_algs::Instance{}; + ckt::run(ref_conv, args, inputs.get(), reference.get()); + auto run_alg = [&](auto&& run_alg_func) { std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); if(avg_time > 0.f) @@ -128,6 +144,10 @@ std::tuple run_grouped_conv_forward_tile_algs::Instance{}; + ckt::run(ref_conv, args, inputs.get(), reference.get()); + auto run_alg = [&](auto&& run_alg_func) { std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); if(avg_time > 0.f) @@ -155,6 +175,10 @@ std::tuple run_grouped_conv_forward_tile_algs::Instance{}; + ckt::run(ref_conv, args, inputs.get(), reference.get()); + auto run_alg = [&](auto&& run_alg_func) { std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); if(avg_time > 0.f) @@ -182,6 +206,10 @@ std::tuple run_grouped_conv_forward_tile_algs::Instance{}; + ckt::run(ref_conv, args, inputs.get(), reference.get()); + auto run_alg = [&](auto&& run_alg_func) { std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); if(avg_time > 0.f) diff --git a/experimental/builder/test/profiling/grouped_convolution_signatures.hpp b/experimental/builder/test/profiling/grouped_convolution_signatures.hpp index b6c3177ded9..d104bd72e18 100644 --- a/experimental/builder/test/profiling/grouped_convolution_signatures.hpp +++ b/experimental/builder/test/profiling/grouped_convolution_signatures.hpp @@ -8,7 +8,7 @@ #include "../utils/ckb_conv_tile_test_configs.hpp" #include "../utils/ckb_conv_test_utils.hpp" #include "../utils/conv_algorithm_type_utils.hpp" -#include "ck_tile/builder/testing/conv_fwd_ck.hpp" +#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" namespace ck_tile::builder::profiling { diff --git a/experimental/builder/test/profiling/profile_ckb_tile_conv_fwd.cpp b/experimental/builder/test/profiling/profile_ckb_tile_conv_fwd.cpp index 902a45dc09d..31caa032a36 100644 --- a/experimental/builder/test/profiling/profile_ckb_tile_conv_fwd.cpp +++ b/experimental/builder/test/profiling/profile_ckb_tile_conv_fwd.cpp @@ -9,7 +9,7 @@ #include "../utils/ckb_conv_tile_test_configs.hpp" #include "../utils/ckb_conv_test_utils.hpp" #include "../utils/conv_algorithm_type_utils.hpp" -#include "ck_tile/builder/testing/conv_fwd_ck.hpp" +#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" #include "ck_tile/host/device_prop.hpp" #include "grouped_convolution_forward_tile_algs.hpp" diff --git a/experimental/builder/test/profiling/src/instance_includes.inc b/experimental/builder/test/profiling/src/instance_includes.inc index 81a900de3e9..c2543e95b45 100644 --- a/experimental/builder/test/profiling/src/instance_includes.inc +++ b/experimental/builder/test/profiling/src/instance_includes.inc @@ -1,7 +1,7 @@ #include "../../utils/ckb_conv_tile_test_configs.hpp" #include "../../utils/ckb_conv_test_utils.hpp" #include "../../utils/conv_algorithm_type_utils.hpp" -#include "ck_tile/builder/testing/conv_fwd_ck.hpp" +#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; diff --git a/experimental/builder/test/profiling/src/instance_run.inc b/experimental/builder/test/profiling/src/instance_run.inc index 68be987845b..7b74cd328da 100644 --- a/experimental/builder/test/profiling/src/instance_run.inc +++ b/experimental/builder/test/profiling/src/instance_run.inc @@ -3,5 +3,5 @@ using Builder = ckb::ConvBuilder; using Instance = Builder::Instance; auto conv = Instance{}; -return std::make_tuple(ckt::run_tile(conv, args, inputs, outputs, s_conf), +return std::make_tuple(ckt::run(conv, args, inputs, outputs, 1 /*k_batch*/, s_conf), conv.GetInstanceString()); diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp index 6f0ee2216f7..555264eee8a 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp @@ -980,7 +980,8 @@ struct GroupedConvolutionForwardKernel else { if constexpr(!(GroupedConvTraitsType_::VectorSizeC % 2 != 0 && - is_any_of::value)) + is_any_of::value) && + IsSplitKSupported) { auto c_block_window = MakeCBlockWindow( c_ptr, c_desc, block_idx_m, block_idx_n); From 48d9113c252f35ec309f029bc2325c0969d61e5b Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Wed, 7 Jan 2026 15:26:10 +0000 Subject: [PATCH 04/24] Fixes --- .../ck_tile/builder/testing/tensor_descriptor.hpp | 9 +++++++++ .../profiling/grouped_convolution_forward_tile_algs.hpp | 2 -- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp b/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp index 15fe4d89dbb..69b4ec63a66 100644 --- a/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp @@ -415,6 +415,15 @@ struct TensorDescriptor return TensorDescriptor(lengths, strides); } + /// @brief Print tensor descriptor details. + /// + /// Print tensor descriptor details - lengths and strides. + friend std::ostream& operator<<(std::ostream& os, const TensorDescriptor& tensor_desc) + { + os << tensor_desc.inner_descriptor_; + return os; + } + private: ck_tile::HostTensorDescriptor inner_descriptor_; }; diff --git a/experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp b/experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp index b7ff42f670a..8e11e067c82 100644 --- a/experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp +++ b/experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp @@ -12,8 +12,6 @@ #include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" -#include "ck_tile/host.hpp" - namespace ck_tile::builder::profiling { namespace ckb = ck_tile::builder; From c7709eaaf23bd35af034c1770079ab1f1132722c Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Sun, 11 Jan 2026 23:29:05 +0000 Subject: [PATCH 05/24] Move to ckProfiler --- .gitignore | 6 +- CMakeLists.txt | 8 +- experimental/builder/CMakeLists.txt | 2 + .../ck_tile/builder/testing/conv_fwd_ck.hpp | 2 +- experimental/builder/src/CMakeLists.txt | 8 + .../configs/ndhwgc_bf16.conf | 0 .../configs/ndhwgc_fp16.conf | 0 .../configs/ndhwgc_fp32.conf | 0 .../profiling => src}/configs/nhwgc_bf16.conf | 0 .../profiling => src}/configs/nhwgc_fp16.conf | 0 .../profiling => src}/configs/nhwgc_fp32.conf | 0 .../profiling => src}/generate_instances.py | 54 +++-- ...ion_forward_tile_ndhwgc_bf16_signature.inc | 0 ...ion_forward_tile_ndhwgc_fp16_signature.inc | 0 ...ion_forward_tile_ndhwgc_fp32_signature.inc | 0 ...tion_forward_tile_nhwgc_bf16_signature.inc | 0 ...tion_forward_tile_nhwgc_fp16_signature.inc | 0 ...tion_forward_tile_nhwgc_fp32_signature.inc | 0 .../instances}/instance_includes.inc | 5 +- .../src => src/instances}/instance_run.inc | 0 experimental/builder/test/CMakeLists.txt | 2 - .../conv/ck_tile/test_ckb_conv_fwd_e2e.cpp | 10 +- .../test/impl/conv_signature_types.hpp | 25 ++ .../builder/test/profiling/CMakeLists.txt | 29 --- experimental/builder/test/profiling/README.md | 42 ---- .../grouped_convolution_forward_tile_algs.hpp | 227 ------------------ .../grouped_convolution_forward_tile_algs.hpp | 101 ++++++++ .../grouped_convolution_signatures.hpp | 6 +- profiler/src/CMakeLists.txt | 9 + .../src/profile_grouped_conv_fwd_tile.cpp | 64 +++-- 30 files changed, 243 insertions(+), 357 deletions(-) create mode 100644 experimental/builder/src/CMakeLists.txt rename experimental/builder/{test/profiling => src}/configs/ndhwgc_bf16.conf (100%) rename experimental/builder/{test/profiling => src}/configs/ndhwgc_fp16.conf (100%) rename experimental/builder/{test/profiling => src}/configs/ndhwgc_fp32.conf (100%) rename experimental/builder/{test/profiling => src}/configs/nhwgc_bf16.conf (100%) rename experimental/builder/{test/profiling => src}/configs/nhwgc_fp16.conf (100%) rename experimental/builder/{test/profiling => src}/configs/nhwgc_fp32.conf (100%) rename experimental/builder/{test/profiling => src}/generate_instances.py (83%) rename experimental/builder/{test/profiling/src => src/instances}/grouped_convolution_forward_tile_ndhwgc_bf16_signature.inc (100%) rename experimental/builder/{test/profiling/src => src/instances}/grouped_convolution_forward_tile_ndhwgc_fp16_signature.inc (100%) rename experimental/builder/{test/profiling/src => src/instances}/grouped_convolution_forward_tile_ndhwgc_fp32_signature.inc (100%) rename experimental/builder/{test/profiling/src => src/instances}/grouped_convolution_forward_tile_nhwgc_bf16_signature.inc (100%) rename experimental/builder/{test/profiling/src => src/instances}/grouped_convolution_forward_tile_nhwgc_fp16_signature.inc (100%) rename experimental/builder/{test/profiling/src => src/instances}/grouped_convolution_forward_tile_nhwgc_fp32_signature.inc (100%) rename experimental/builder/{test/profiling/src => src/instances}/instance_includes.inc (53%) rename experimental/builder/{test/profiling/src => src/instances}/instance_run.inc (100%) delete mode 100644 experimental/builder/test/profiling/CMakeLists.txt delete mode 100644 experimental/builder/test/profiling/README.md delete mode 100644 experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp create mode 100644 profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp rename {experimental/builder/test/profiling => profiler/include/profiler}/grouped_convolution_signatures.hpp (94%) rename experimental/builder/test/profiling/profile_ckb_tile_conv_fwd.cpp => profiler/src/profile_grouped_conv_fwd_tile.cpp (78%) diff --git a/.gitignore b/.gitignore index a048429b821..3cccad4a144 100644 --- a/.gitignore +++ b/.gitignore @@ -92,6 +92,6 @@ test_data/* # The experimental/builder directory should be tracked despite matching build* !experimental/builder !experimental/builder/** -experimental/builder/test/profiling/src/* -!experimental/builder/test/profiling/src/*.inc -experimental/builder/test/profiling/*.inc +experimental/builder/src/instances/* +!experimental/builder/src/instances/*.inc +experimental/builder/src/*.inc diff --git a/CMakeLists.txt b/CMakeLists.txt index eaed7d35097..4af66404c38 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -694,6 +694,10 @@ option(BUILD_MHA_LIB "Build the static library for flash attention" OFF) add_subdirectory(library) +if (CK_EXPERIMENTAL_BUILDER) + add_subdirectory(experimental/builder) +endif() + if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY) rocm_package_setup_component(tests LIBRARY_NAME composablekernel @@ -725,10 +729,6 @@ if (NOT MIOPEN_REQ_LIBS_ONLY) add_subdirectory(profiler) endif() -if (CK_EXPERIMENTAL_BUILDER) - add_subdirectory(experimental/builder) -endif() - if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS)) add_subdirectory(codegen) endif() diff --git a/experimental/builder/CMakeLists.txt b/experimental/builder/CMakeLists.txt index 95b41da40b2..3fd713e9316 100644 --- a/experimental/builder/CMakeLists.txt +++ b/experimental/builder/CMakeLists.txt @@ -4,3 +4,5 @@ if(BUILD_TESTING) add_subdirectory(test) endif() + +add_subdirectory(src) diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp index ceaa26d8140..1f00eff1216 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp @@ -99,7 +99,7 @@ float run(CkConvInstance auto& conv, const Inputs& inputs, const Outputs& outputs, [[maybe_unused]] const ck_tile::index_t k_batch = 1, - const ck_tile::stream_config s_conf = {}) + const StreamConfig s_conf = {}) { constexpr auto spatial_dim = SIGNATURE.spatial_dim; diff --git a/experimental/builder/src/CMakeLists.txt b/experimental/builder/src/CMakeLists.txt new file mode 100644 index 00000000000..0a0cfea98a3 --- /dev/null +++ b/experimental/builder/src/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +# SPDX-License-Identifier: MIT + +if(GPU_TARGETS MATCHES "gfx9") + # Find cpp files and create lib for instances + file(GLOB_RECURSE GROUPED_CONV_FWD_TILE CONFIGURE_DEPENDS "instances/*.cpp") + add_instance_library(device_grouped_conv_fwd_tile_instances ${GROUPED_CONV_FWD_TILE}) +endif() diff --git a/experimental/builder/test/profiling/configs/ndhwgc_bf16.conf b/experimental/builder/src/configs/ndhwgc_bf16.conf similarity index 100% rename from experimental/builder/test/profiling/configs/ndhwgc_bf16.conf rename to experimental/builder/src/configs/ndhwgc_bf16.conf diff --git a/experimental/builder/test/profiling/configs/ndhwgc_fp16.conf b/experimental/builder/src/configs/ndhwgc_fp16.conf similarity index 100% rename from experimental/builder/test/profiling/configs/ndhwgc_fp16.conf rename to experimental/builder/src/configs/ndhwgc_fp16.conf diff --git a/experimental/builder/test/profiling/configs/ndhwgc_fp32.conf b/experimental/builder/src/configs/ndhwgc_fp32.conf similarity index 100% rename from experimental/builder/test/profiling/configs/ndhwgc_fp32.conf rename to experimental/builder/src/configs/ndhwgc_fp32.conf diff --git a/experimental/builder/test/profiling/configs/nhwgc_bf16.conf b/experimental/builder/src/configs/nhwgc_bf16.conf similarity index 100% rename from experimental/builder/test/profiling/configs/nhwgc_bf16.conf rename to experimental/builder/src/configs/nhwgc_bf16.conf diff --git a/experimental/builder/test/profiling/configs/nhwgc_fp16.conf b/experimental/builder/src/configs/nhwgc_fp16.conf similarity index 100% rename from experimental/builder/test/profiling/configs/nhwgc_fp16.conf rename to experimental/builder/src/configs/nhwgc_fp16.conf diff --git a/experimental/builder/test/profiling/configs/nhwgc_fp32.conf b/experimental/builder/src/configs/nhwgc_fp32.conf similarity index 100% rename from experimental/builder/test/profiling/configs/nhwgc_fp32.conf rename to experimental/builder/src/configs/nhwgc_fp32.conf diff --git a/experimental/builder/test/profiling/generate_instances.py b/experimental/builder/src/generate_instances.py similarity index 83% rename from experimental/builder/test/profiling/generate_instances.py rename to experimental/builder/src/generate_instances.py index 1cace163893..6082e952cb9 100644 --- a/experimental/builder/test/profiling/generate_instances.py +++ b/experimental/builder/src/generate_instances.py @@ -1,6 +1,7 @@ # Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT +import argparse from pathlib import Path @@ -87,17 +88,21 @@ def get_dtype(problem_name): raise RuntimeError("wrong dtype") -def generate_calls_inc(instances, problem_name, direction): - with open( - f"../experimental/builder/test/profiling/{problem_name}_calls.inc", "w" - ) as f: +def generate_calls_inc(instances, problem_name, direction, filter_pattern): + generate_dir = Path(__file__).resolve().parent + with open(f"{generate_dir}/{problem_name}_calls.inc", "w") as f: + if problem_name.find(filter_pattern) == -1: + return for instance in instances: instance_name = problem_name + "_" + str(instance.id) f.write(f"run_alg(run_{instance_name});\n") -def generate_defs_inc(instances, problem_name, signature, direction): - with open(f"../experimental/builder/test/profiling/{problem_name}.inc", "w") as f: +def generate_defs_inc(instances, problem_name, signature, direction, filter_pattern): + generate_dir = Path(__file__).resolve().parent + with open(f"{generate_dir}/{problem_name}.inc", "w") as f: + if problem_name.find(filter_pattern) == -1: + return for instance in instances: instance_name = problem_name + "_" + str(instance.id) f.write( @@ -109,13 +114,16 @@ def generate_defs_inc(instances, problem_name, signature, direction): ) -def generate_fwd_cpp(instances, problem_name, config, direction): +def generate_fwd_cpp(instances, problem_name, config, direction, filter_pattern): for instance in instances: + if problem_name.find(filter_pattern) == -1: + break instance_name = problem_name + "_" + str(instance.id) - directory_path = Path(f"../experimental/builder/test/profiling/src/{config}") + generate_dir = Path(__file__).resolve().parent + directory_path = Path(f"{generate_dir}/instances/{config}") directory_path.mkdir(parents=True, exist_ok=True) with open( - f"../experimental/builder/test/profiling/src/{config}/{instance_name}.cpp", + f"{generate_dir}/instances/{config}/{instance_name}.cpp", "w", ) as f: f.write( @@ -212,14 +220,18 @@ def parse_fwd_instances(instances, problem_name): return convs -def generate_instances_fwd(instances, problem_name, config): +def generate_instances_fwd(instances, problem_name, config, filter_pattern): direction = "forward" instances = parse_fwd_instances(instances, problem_name) - generate_calls_inc(instances, problem_name, direction) + generate_calls_inc(instances, problem_name, direction, filter_pattern) generate_defs_inc( - instances, problem_name, f"SIGNATURE_{config.upper()}_FWD", direction + instances, + problem_name, + f"SIGNATURE_{config.upper()}_FWD", + direction, + filter_pattern, ) - generate_fwd_cpp(instances, problem_name, config, direction) + generate_fwd_cpp(instances, problem_name, config, direction, filter_pattern) if __name__ == "__main__": @@ -232,10 +244,22 @@ def generate_instances_fwd(instances, problem_name, config): "ndhwgc_bf16", ] + parser = argparse.ArgumentParser( + description="Generate grouped conv CK Tile instances." + ) + parser.add_argument( + "--filter_pattern", + type=str, + default="convolution", + help="Filter pattern for configs.", + ) + args = parser.parse_args() + for config in fwd_configs: instances = [] - config_path = f"../experimental/builder/test/profiling/configs/{config}.conf" + generate_dir = Path(__file__).resolve().parent + config_path = f"{generate_dir}/configs/{config}.conf" with open(config_path, "r") as file: instances = file.readlines() problem_name = f"grouped_convolution_forward_tile_{config}" - generate_instances_fwd(instances, problem_name, config) + generate_instances_fwd(instances, problem_name, config, args.filter_pattern) diff --git a/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_bf16_signature.inc b/experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_bf16_signature.inc similarity index 100% rename from experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_bf16_signature.inc rename to experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_bf16_signature.inc diff --git a/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_fp16_signature.inc b/experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_fp16_signature.inc similarity index 100% rename from experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_fp16_signature.inc rename to experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_fp16_signature.inc diff --git a/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_fp32_signature.inc b/experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_fp32_signature.inc similarity index 100% rename from experimental/builder/test/profiling/src/grouped_convolution_forward_tile_ndhwgc_fp32_signature.inc rename to experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_fp32_signature.inc diff --git a/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_bf16_signature.inc b/experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_bf16_signature.inc similarity index 100% rename from experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_bf16_signature.inc rename to experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_bf16_signature.inc diff --git a/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_fp16_signature.inc b/experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_fp16_signature.inc similarity index 100% rename from experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_fp16_signature.inc rename to experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_fp16_signature.inc diff --git a/experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_fp32_signature.inc b/experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_fp32_signature.inc similarity index 100% rename from experimental/builder/test/profiling/src/grouped_convolution_forward_tile_nhwgc_fp32_signature.inc rename to experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_fp32_signature.inc diff --git a/experimental/builder/test/profiling/src/instance_includes.inc b/experimental/builder/src/instances/instance_includes.inc similarity index 53% rename from experimental/builder/test/profiling/src/instance_includes.inc rename to experimental/builder/src/instances/instance_includes.inc index c2543e95b45..669b1ec4d9f 100644 --- a/experimental/builder/test/profiling/src/instance_includes.inc +++ b/experimental/builder/src/instances/instance_includes.inc @@ -1,6 +1,5 @@ -#include "../../utils/ckb_conv_tile_test_configs.hpp" -#include "../../utils/ckb_conv_test_utils.hpp" -#include "../../utils/conv_algorithm_type_utils.hpp" +#include "../../test/utils/ckb_conv_tile_test_configs.hpp" +#include "../../test/utils/conv_algorithm_type_utils.hpp" #include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" namespace ckb = ck_tile::builder; diff --git a/experimental/builder/test/profiling/src/instance_run.inc b/experimental/builder/src/instances/instance_run.inc similarity index 100% rename from experimental/builder/test/profiling/src/instance_run.inc rename to experimental/builder/src/instances/instance_run.inc diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index 8fe816c79b6..ef6d362d073 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -275,8 +275,6 @@ add_custom_target(check-builder COMMENT "Running all experimental builder tests..." ) -add_subdirectory(profiling) - ################################################################################ # Build Summary ################################################################################ diff --git a/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_e2e.cpp b/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_e2e.cpp index 502c2e82bfa..128744dcc68 100644 --- a/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_e2e.cpp +++ b/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_e2e.cpp @@ -5,7 +5,9 @@ #include "utils/ckb_conv_test_utils.hpp" #include "utils/conv_algorithm_type_utils.hpp" #include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" +#include "ck_tile/builder/testing/conv_fwd_reference.hpp" #include "ck_tile/host/device_prop.hpp" +#include "testing_utils.hpp" namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; @@ -67,8 +69,10 @@ TEST(Fwd2DFp16_CShufV3_NHWGC, EndToEnd) .cde_elementwise_op = {}, }; - auto inputs = alloc_inputs(args); - auto outputs = alloc_outputs(args); + auto inputs = alloc_inputs(args); + auto outputs = alloc_outputs(args); + auto reference = alloc_outputs(args); + ckt::init_inputs(args, inputs.get()); auto conv = Instance{}; ckt::run(conv, args, inputs.get(), outputs.get()); @@ -76,5 +80,5 @@ TEST(Fwd2DFp16_CShufV3_NHWGC, EndToEnd) auto ref_conv = Reference{}; ckt::run(ref_conv, args, inputs.get(), reference.get()); - EXPECT_THAT(outputs.get(), MatchesReference(args, reference.get())); + EXPECT_THAT(outputs.get(), ck_tile::test::MatchesReference(args, reference.get())); } diff --git a/experimental/builder/test/impl/conv_signature_types.hpp b/experimental/builder/test/impl/conv_signature_types.hpp index f046289057c..ad146ddc06e 100644 --- a/experimental/builder/test/impl/conv_signature_types.hpp +++ b/experimental/builder/test/impl/conv_signature_types.hpp @@ -16,6 +16,12 @@ struct TensorConfig // Optional data types, override the type defined in the signature if provided. DataType data_type{DataType::UNDEFINED_DATA_TYPE}; DataType compute_type{DataType::UNDEFINED_DATA_TYPE}; + + constexpr bool operator==(const TensorConfig& other) const + { + return layout == other.layout && data_type == other.data_type && + compute_type == other.compute_type; + } }; template @@ -31,6 +37,12 @@ struct TensorOperation return TensorOperation{ .elementwise_operation = this->elementwise_operation}; } + + constexpr bool operator==(const TensorOperation& other) const + { + return elementwise_operation == other.elementwise_operation && + auxiliary_operand_configs == other.auxiliary_operand_configs; + } }; template > @@ -38,6 +50,11 @@ struct ConvolutionTensor { TensorConfig config; Op operation{}; + + constexpr bool operator==(const ConvolutionTensor& other) const + { + return config == other.config && operation == other.operation; + } }; template , @@ -52,6 +69,14 @@ struct ConvSignature InputTensor input; WeightTensor weight; OutputTensor output; + + constexpr bool operator==(const ConvSignature& other) const + { + return spatial_dim == other.spatial_dim && direction == other.direction && + data_type == other.data_type && + accumulation_data_type == other.accumulation_data_type && input == other.input && + weight == other.weight && output == other.output; + } }; } // namespace ck_tile::builder::test diff --git a/experimental/builder/test/profiling/CMakeLists.txt b/experimental/builder/test/profiling/CMakeLists.txt deleted file mode 100644 index 70e9e016201..00000000000 --- a/experimental/builder/test/profiling/CMakeLists.txt +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -# SPDX-License-Identifier: MIT - -if(GPU_TARGETS MATCHES "gfx94|gfx95") - # Generate instances using python script - find_package(Python3 COMPONENTS Interpreter Development) - execute_process( - COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_instances.py - RESULT_VARIABLE ret - ) - # Find cpp files and create lib for instances - file(GLOB_RECURSE GROUPED_CONV_FWD_TILE CONFIGURE_DEPENDS "src/*.cpp") - add_instance_library(device_grouped_conv_fwd_tile_instances ${GROUPED_CONV_FWD_TILE}) - - # Add profiler executable - set(EXAMPLE_CONV_COMPILE_OPTIONS) - list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0 -Wno-global-constructors -Wno-c++20-compat) - add_executable(profile_ckb_tile_conv_fwd profile_ckb_tile_conv_fwd.cpp) - target_compile_options(profile_ckb_tile_conv_fwd PRIVATE ${EXAMPLE_CONV_COMPILE_OPTIONS}) - target_compile_features(profile_ckb_tile_conv_fwd PRIVATE cxx_std_20) - target_include_directories(profile_ckb_tile_conv_fwd PRIVATE - "${PROJECT_SOURCE_DIR}/experimental/builder/include" - "${PROJECT_SOURCE_DIR}/include" - "${CMAKE_CURRENT_SOURCE_DIR}/../" - ) - # Link with instances - target_link_libraries(profile_ckb_tile_conv_fwd PRIVATE utility device_grouped_conv_fwd_tile_instances) - -endif() diff --git a/experimental/builder/test/profiling/README.md b/experimental/builder/test/profiling/README.md deleted file mode 100644 index 4b6ec887ef4..00000000000 --- a/experimental/builder/test/profiling/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# Builder profiler for Convolution - -This directory contains the profiler created with builder for CK Tile. - - -## Overview - -Instances are generated using `generate_instances.py`. This script is called with cmake files generation. Interface is the same as for ckProfiler. Example of the usage: -```bash -# arg1: tensor operation (grouped_conv_fwd : Grouped Convolution Forward) -# arg2: data type (0: Input fp32, Weight fp32, Output fp32 -# 1: Input fp16, Weight fp16, Output fp16 -# 2: Input bf16, Weight bf16, Output bf16 -# 3: Input int8, Weight int8, Output int8 -# 4: Input fp8, Weight fp8, Output fp8 -# 5: Input bf8, Weight bf8, Output fp8 -# 6: Input fp8, Weight bf8, Output fp8 -# 7: Input bf8, Weight fp8, Output fp8 -# 8: Input fp32, Weight fp32, Output fp32, Compute tf32) -# arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K] -# 1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K] -# 2: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, G, K, Ho, Wo] -# 3: Input[N, G, C, Hi, Wi], Weight[G, K, C, Y, X], Output[N, G, K, Ho, Wo]) -# arg4: indexing data type (0: 32-bit, 1: 64-bit) -# arg5: verification (0: no, 1: yes) -# arg6: initialization (0: no init, 1: integer value, 2: decimal value) -# arg7: print tensor value (0: no; 1: yes) -# arg8: time kernel (0: no, 1: yes) -# Following arguments (depending on number of spatial dims): -# Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d) -# G, N, K, C, -# , (ie Y, X for 2D) -# , (ie Hi, Wi for 2D) -# , (ie Sy, Sx for 2D) -# , (ie Dy, Dx for 2D) -# , (ie LeftPy, LeftPx for 2D) -# , (ie RightPy, RightPx for 2D) - - ################ op datatype layout indexing verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx -./bin/profile_ckb_tile_conv_fwd grouped_conv_fwd 1 0 0 1 1 0 1 2 32 4 192 192 3 3 28 28 1 1 1 1 1 1 1 1 - -``` diff --git a/experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp b/experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp deleted file mode 100644 index 8e11e067c82..00000000000 --- a/experimental/builder/test/profiling/grouped_convolution_forward_tile_algs.hpp +++ /dev/null @@ -1,227 +0,0 @@ -// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. -// SPDX-License-Identifier: MIT - -#pragma once - -#include - -#include "../utils/ckb_conv_tile_test_configs.hpp" -#include "../utils/ckb_conv_test_utils.hpp" -#include "../utils/conv_algorithm_type_utils.hpp" -#include "grouped_convolution_signatures.hpp" - -#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" - -namespace ck_tile::builder::profiling { - -namespace ckb = ck_tile::builder; -namespace ckt = ck_tile::builder::test; - -/// @brief `run_grouped_conv_forward_tile_algs()` run all grouped conv fwd instances. -/// -/// @tparam SIGNATURE Forward convolution signature. -/// -/// @see run_grouped_conv_forward_tile_algs() -template -std::tuple -run_grouped_conv_forward_tile_algs(const ckt::Args& args, - const ckt::Inputs& inputs, - const ckt::Outputs& outputs, - const ck_tile::stream_config& s_conf); - -#include "grouped_convolution_forward_tile_nhwgc_fp32.inc" -#include "grouped_convolution_forward_tile_nhwgc_bf16.inc" -#include "grouped_convolution_forward_tile_nhwgc_fp16.inc" -#include "grouped_convolution_forward_tile_ndhwgc_fp32.inc" -#include "grouped_convolution_forward_tile_ndhwgc_bf16.inc" -#include "grouped_convolution_forward_tile_ndhwgc_fp16.inc" - -template <> -std::tuple run_grouped_conv_forward_tile_algs( - const ckt::Args& args, - const ckt::Inputs& inputs, - const ckt::Outputs& outputs, - const ck_tile::stream_config& s_conf) -{ - float best_avg_time = std::numeric_limits::max(); - std::string best_op_name, op_name; - float avg_time; - - auto ref_conv = - ckb::ConvBuilder::Instance{}; - ckt::run(ref_conv, args, inputs.get(), reference.get()); - - auto run_alg = [&](auto&& run_alg_func) { - std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); - ck_tile::check_err(outputs.get(), reference.get()); - - if(avg_time > 0.f) - { - best_avg_time = std::min(best_avg_time, avg_time); - best_op_name = best_avg_time < avg_time ? best_op_name : op_name; - std::cout << "Perf: " << std::setw(10) << avg_time << " ms,"; - } - std::cout << " " << op_name << std::endl; - }; - -#include "grouped_convolution_forward_tile_nhwgc_fp32_calls.inc" - - return std::make_tuple(best_avg_time, best_op_name); -} - -template <> -std::tuple run_grouped_conv_forward_tile_algs( - const ckt::Args& args, - const ckt::Inputs& inputs, - const ckt::Outputs& outputs, - const ck_tile::stream_config& s_conf) -{ - float best_avg_time = std::numeric_limits::max(); - std::string best_op_name, op_name; - float avg_time; - - auto ref_conv = - ckb::ConvBuilder::Instance{}; - ckt::run(ref_conv, args, inputs.get(), reference.get()); - - auto run_alg = [&](auto&& run_alg_func) { - std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); - if(avg_time > 0.f) - { - best_avg_time = std::min(best_avg_time, avg_time); - best_op_name = best_avg_time < avg_time ? best_op_name : op_name; - std::cout << "Perf: " << std::setw(10) << avg_time << " ms,"; - } - std::cout << " " << op_name << std::endl; - }; - -#include "grouped_convolution_forward_tile_nhwgc_bf16_calls.inc" - - return std::make_tuple(best_avg_time, best_op_name); -} - -template <> -std::tuple run_grouped_conv_forward_tile_algs( - const ckt::Args& args, - const ckt::Inputs& inputs, - const ckt::Outputs& outputs, - const ck_tile::stream_config& s_conf) -{ - float best_avg_time = std::numeric_limits::max(); - std::string best_op_name, op_name; - float avg_time; - - auto ref_conv = - ckb::ConvBuilder::Instance{}; - ckt::run(ref_conv, args, inputs.get(), reference.get()); - - auto run_alg = [&](auto&& run_alg_func) { - std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); - if(avg_time > 0.f) - { - best_avg_time = std::min(best_avg_time, avg_time); - best_op_name = best_avg_time < avg_time ? best_op_name : op_name; - std::cout << "Perf: " << std::setw(10) << avg_time << " ms,"; - } - std::cout << " " << op_name << std::endl; - }; - -#include "grouped_convolution_forward_tile_nhwgc_fp16_calls.inc" - - return std::make_tuple(best_avg_time, best_op_name); -} - -template <> -std::tuple run_grouped_conv_forward_tile_algs( - const ckt::Args& args, - const ckt::Inputs& inputs, - const ckt::Outputs& outputs, - const ck_tile::stream_config& s_conf) -{ - float best_avg_time = std::numeric_limits::max(); - std::string best_op_name, op_name; - float avg_time; - - auto ref_conv = - ckb::ConvBuilder::Instance{}; - ckt::run(ref_conv, args, inputs.get(), reference.get()); - - auto run_alg = [&](auto&& run_alg_func) { - std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); - if(avg_time > 0.f) - { - best_avg_time = std::min(best_avg_time, avg_time); - best_op_name = best_avg_time < avg_time ? best_op_name : op_name; - std::cout << "Perf: " << std::setw(10) << avg_time << " ms,"; - } - std::cout << " " << op_name << std::endl; - }; - -#include "grouped_convolution_forward_tile_ndhwgc_fp32_calls.inc" - - return std::make_tuple(best_avg_time, best_op_name); -} - -template <> -std::tuple run_grouped_conv_forward_tile_algs( - const ckt::Args& args, - const ckt::Inputs& inputs, - const ckt::Outputs& outputs, - const ck_tile::stream_config& s_conf) -{ - float best_avg_time = std::numeric_limits::max(); - std::string best_op_name, op_name; - float avg_time; - - auto ref_conv = - ckb::ConvBuilder::Instance{}; - ckt::run(ref_conv, args, inputs.get(), reference.get()); - - auto run_alg = [&](auto&& run_alg_func) { - std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); - if(avg_time > 0.f) - { - best_avg_time = std::min(best_avg_time, avg_time); - best_op_name = best_avg_time < avg_time ? best_op_name : op_name; - std::cout << "Perf: " << std::setw(10) << avg_time << " ms,"; - } - std::cout << " " << op_name << std::endl; - }; - -#include "grouped_convolution_forward_tile_ndhwgc_bf16_calls.inc" - - return std::make_tuple(best_avg_time, best_op_name); -} - -template <> -std::tuple run_grouped_conv_forward_tile_algs( - const ckt::Args& args, - const ckt::Inputs& inputs, - const ckt::Outputs& outputs, - const ck_tile::stream_config& s_conf) -{ - float best_avg_time = std::numeric_limits::max(); - std::string best_op_name, op_name; - float avg_time; - - auto ref_conv = - ckb::ConvBuilder::Instance{}; - ckt::run(ref_conv, args, inputs.get(), reference.get()); - - auto run_alg = [&](auto&& run_alg_func) { - std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); - if(avg_time > 0.f) - { - best_avg_time = std::min(best_avg_time, avg_time); - best_op_name = best_avg_time < avg_time ? best_op_name : op_name; - std::cout << "Perf: " << std::setw(10) << avg_time << " ms,"; - } - std::cout << " " << op_name << std::endl; - }; - -#include "grouped_convolution_forward_tile_ndhwgc_fp16_calls.inc" - - return std::make_tuple(best_avg_time, best_op_name); -} - -} // namespace ck_tile::builder::profiling diff --git a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp new file mode 100644 index 00000000000..21a253e2f89 --- /dev/null +++ b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp @@ -0,0 +1,101 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include "../../experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" +#include "../../experimental/builder/test/utils/ckb_conv_test_utils.hpp" +#include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp" +#include "grouped_convolution_signatures.hpp" + +#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" +#include "ck_tile/builder/testing/conv_fwd_reference.hpp" + +namespace ck_tile::builder::profiling { + +namespace ckb = ck_tile::builder; +namespace ckt = ck_tile::builder::test; + +#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp32.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_bf16.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp16.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp32.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_bf16.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp16.inc" + +/// @brief `run_grouped_conv_forward_tile_algs()` run all grouped conv fwd instances. +/// +/// @tparam SIGNATURE Forward convolution signature. +/// +/// @see run_grouped_conv_forward_tile_algs() +template +std::tuple +run_grouped_conv_forward_tile_algs(const ckt::Args& args, + const ckt::Inputs& inputs, + const ckt::Outputs& outputs, + const ck_tile::stream_config& s_conf) +{ + float best_avg_time = std::numeric_limits::max(); + std::string best_op_name, op_name; + float avg_time; + bool valid = true; + + auto reference = ckt::alloc_outputs(args); + using ReferenceInstance = + typename ckb::ConvBuilder::Instance; + auto ref_conv = ReferenceInstance{}; + ckt::run(ref_conv, args, inputs, reference.get()); + + auto run_alg = [&](auto&& run_alg_func) { + std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); + if(avg_time > 0.f) + { + const auto errors = ckt::validate(args, outputs, reference.get()).get_errors(); + for(const auto& error : errors) + { + valid = false; + std::cout << "Number of incorrect values: " << error.wrong_elements + << " Is all zero:" << error.is_all_zero() << std::endl; + } + best_avg_time = std::min(best_avg_time, avg_time); + best_op_name = best_avg_time < avg_time ? best_op_name : op_name; + std::cout << "Perf: " << std::setw(10) << avg_time << " ms,"; + } + std::cout << " " << op_name << std::endl; + }; + + if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP16_FWD) + { +#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp16_calls.inc" + } + else if constexpr(SIGNATURE == SIGNATURE_NHWGC_BF16_FWD) + { +#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_bf16_calls.inc" + } + else if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP32_FWD) + { +#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp32_calls.inc" + } + else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_FP16_FWD) + { +#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp16_calls.inc" + } + else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_BF16_FWD) + { +#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_bf16_calls.inc" + } + else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_FP32_FWD) + { +#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp32_calls.inc" + } + else + { + std::cout << "Signature not supported" << std::endl; + return std::make_tuple(false, best_avg_time, best_op_name); + } + return std::make_tuple(valid, best_avg_time, best_op_name); +} + +} // namespace ck_tile::builder::profiling diff --git a/experimental/builder/test/profiling/grouped_convolution_signatures.hpp b/profiler/include/profiler/grouped_convolution_signatures.hpp similarity index 94% rename from experimental/builder/test/profiling/grouped_convolution_signatures.hpp rename to profiler/include/profiler/grouped_convolution_signatures.hpp index d104bd72e18..b4da583b434 100644 --- a/experimental/builder/test/profiling/grouped_convolution_signatures.hpp +++ b/profiler/include/profiler/grouped_convolution_signatures.hpp @@ -5,9 +5,9 @@ #include -#include "../utils/ckb_conv_tile_test_configs.hpp" -#include "../utils/ckb_conv_test_utils.hpp" -#include "../utils/conv_algorithm_type_utils.hpp" +#include "../../experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" +#include "../../experimental/builder/test/utils/ckb_conv_test_utils.hpp" +#include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp" #include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" namespace ck_tile::builder::profiling { diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index e484ff9ef7a..3379fd15d1f 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -43,6 +43,9 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") list(APPEND PROFILER_OPS profile_contraction_bilinear.cpp) list(APPEND PROFILER_OPS profile_contraction_scale.cpp) endif() + if(CK_EXPERIMENTAL_BUILDER) + list(APPEND PROFILER_OPS profile_grouped_conv_fwd_tile.cpp) + endif() endif() if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") @@ -256,6 +259,12 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9|gfx1[12]") list(APPEND DEVICE_INSTANCES device_grouped_conv3d_bwd_weight_instance) endif() +if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") + if(CK_EXPERIMENTAL_BUILDER) + list(APPEND DEVICE_INSTANCES device_grouped_conv_fwd_tile_instances) + endif() +endif() + if(DL_KERNELS) list(APPEND DEVICE_INSTANCES device_batched_gemm_multi_d_instance) list(APPEND DEVICE_INSTANCES device_grouped_conv1d_bwd_weight_instance) diff --git a/experimental/builder/test/profiling/profile_ckb_tile_conv_fwd.cpp b/profiler/src/profile_grouped_conv_fwd_tile.cpp similarity index 78% rename from experimental/builder/test/profiling/profile_ckb_tile_conv_fwd.cpp rename to profiler/src/profile_grouped_conv_fwd_tile.cpp index 31caa032a36..185a1841824 100644 --- a/experimental/builder/test/profiling/profile_ckb_tile_conv_fwd.cpp +++ b/profiler/src/profile_grouped_conv_fwd_tile.cpp @@ -6,12 +6,14 @@ #include #include -#include "../utils/ckb_conv_tile_test_configs.hpp" -#include "../utils/ckb_conv_test_utils.hpp" -#include "../utils/conv_algorithm_type_utils.hpp" -#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" +#include "../../experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" +#include "../../experimental/builder/test/utils/ckb_conv_test_utils.hpp" +#include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp" +#include "../../experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp" #include "ck_tile/host/device_prop.hpp" -#include "grouped_convolution_forward_tile_algs.hpp" +#include "profiler/grouped_convolution_forward_tile_algs.hpp" + +#include "profiler_operation_registry.hpp" namespace { @@ -85,29 +87,34 @@ namespace cku = ck_tile::builder::test_utils; namespace ckp = ck_tile::builder::profiling; template -int profile(const ckt::Args& args, bool time_kernel) +int call_profiler(const ckt::Args& args, bool time_kernel) { auto inputs = alloc_inputs(args); auto outputs = alloc_outputs(args); + ckt::init_inputs(args, inputs.get()); std::cout << args.make_input_descriptor() << std::endl; std::cout << args.make_weight_descriptor() << std::endl; std::cout << args.make_output_descriptor() << std::endl; float avg_time; std::string op_name; - std::tie(avg_time, op_name) = ckp::run_grouped_conv_forward_tile_algs( + bool valid; + std::tie(valid, avg_time, op_name) = ckp::run_grouped_conv_forward_tile_algs( args, inputs.get(), outputs.get(), ck_tile::stream_config{nullptr, time_kernel}); if(time_kernel) { std::cout << "Best configuration parameters:" << "\nname: " << op_name << "\navg_time: " << avg_time << std::endl; } - return 0; + return !valid; } +#define OP_NAME "grouped_conv_fwd_tile" +#define OP_DESC "Grouped Convolution Forward (CK Tile)" + } // namespace -int main(int argc, char* argv[]) +int profile_grouped_conv_fwd_tile(int argc, char* argv[]) { // 8 for control, 1 for num_dim_spatial if(argc < 10) @@ -116,17 +123,14 @@ int main(int argc, char* argv[]) return 1; } - const auto data_type = static_cast(std::stoi(argv[2])); - const auto layout = static_cast(std::stoi(argv[3])); - const auto index_type = static_cast(std::stoi(argv[4])); - // TODO: Add support + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const auto index_type = static_cast(std::stoi(argv[4])); [[maybe_unused]] const bool do_verification = std::stoi(argv[5]); - // TODO: Add support - [[maybe_unused]] const int init_method = std::stoi(argv[6]); - // TODO: Add support - [[maybe_unused]] const bool do_log = std::stoi(argv[7]); - const bool time_kernel = std::stoi(argv[8]); - const int num_dim_spatial = std::stoi(argv[9]); + [[maybe_unused]] const int init_method = std::stoi(argv[6]); + [[maybe_unused]] const bool do_log = std::stoi(argv[7]); + const bool time_kernel = std::stoi(argv[8]); + const int num_dim_spatial = std::stoi(argv[9]); // 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial if(argc != 9 + 1 + 4 + 6 * num_dim_spatial) @@ -135,6 +139,8 @@ int main(int argc, char* argv[]) return 1; } + const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 10, argv); + if(index_type == IndexType::LONG_INDEX_T) { std::cout << "this indexing data type is not implemented" << std::endl; @@ -155,7 +161,8 @@ int main(int argc, char* argv[]) .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; - return profile(ckt::parse_conv_args(10, argv), time_kernel); + return call_profiler(ckt::parse_conv_args(10, argv), + time_kernel); } else if(data_type == ConvDataType::F16_F16_F16) { @@ -167,7 +174,8 @@ int main(int argc, char* argv[]) .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; - return profile(ckt::parse_conv_args(10, argv), time_kernel); + return call_profiler(ckt::parse_conv_args(10, argv), + time_kernel); } else if(data_type == ConvDataType::BF16_BF16_BF16) { @@ -179,7 +187,8 @@ int main(int argc, char* argv[]) .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; - return profile(ckt::parse_conv_args(10, argv), time_kernel); + return call_profiler(ckt::parse_conv_args(10, argv), + time_kernel); } } else if(num_dim_spatial == 3) @@ -194,7 +203,8 @@ int main(int argc, char* argv[]) .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; - return profile(ckt::parse_conv_args(10, argv), time_kernel); + return call_profiler(ckt::parse_conv_args(10, argv), + time_kernel); } else if(data_type == ConvDataType::F16_F16_F16) { @@ -206,7 +216,8 @@ int main(int argc, char* argv[]) .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; - return profile(ckt::parse_conv_args(10, argv), time_kernel); + return call_profiler(ckt::parse_conv_args(10, argv), + time_kernel); } else if(data_type == ConvDataType::BF16_BF16_BF16) { @@ -218,7 +229,8 @@ int main(int argc, char* argv[]) .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; - return profile(ckt::parse_conv_args(10, argv), time_kernel); + return call_profiler(ckt::parse_conv_args(10, argv), + time_kernel); } } } @@ -227,3 +239,5 @@ int main(int argc, char* argv[]) return 1; } + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_grouped_conv_fwd_tile); From 2b82f2116cbfa6562f7230495828aed31e1c1f4c Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Mon, 12 Jan 2026 16:11:20 +0000 Subject: [PATCH 06/24] Fixes --- .../ck_tile/builder/testing/conv_fwd.hpp | 120 +++--------------- .../ck_tile/builder/testing/conv_fwd_ck.hpp | 3 +- .../builder/testing/conv_fwd_ck_tile.hpp | 3 +- .../ck_tile/builder/testing/filter_extent.hpp | 21 +++ .../ck_tile/builder/testing/testing.hpp | 6 +- .../builder/src/instances/instance_run.inc | 3 +- .../src/profile_grouped_conv_fwd_tile.cpp | 1 - 7 files changed, 49 insertions(+), 108 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp index bf13217aa13..45fbea5d872 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp @@ -94,6 +94,8 @@ struct Args Ops::BElementwiseOp b_elementwise_op; Ops::CDEElementwiseOp cde_elementwise_op; + int k_batch = 1; + /// This function returns the `TensorDescriptor` corresponding to /// the input-tensor of the convolution problem. This can then /// be used to, for example, allocate memory. @@ -248,105 +250,25 @@ CK_TILE_HOST auto parse_conv_args(int arg_idx, char* const argv[]) input_right_pads[i] = static_cast(std::stol(argv[arg_idx++])); } - if constexpr(num_dim_spatial == 1) - { - Args args = { - .lengths = - { - .batch_size = N, - .groups = G, - .input_channels = C, - .output_channels = K, - .image = - { - .width = input_spatial_lengths[0], - }, - .filter = - { - .width = filter_spatial_lengths[0], - }, - }, - .filter_strides = {.width = conv_filter_strides[0]}, - .filter_dilation = {.width = conv_filter_dilations[0]}, - .input_left_pad = {.width = input_left_pads[0]}, - .input_right_pad = {.width = input_right_pads[0]}, - .a_elementwise_op = {}, - .b_elementwise_op = {}, - .cde_elementwise_op = {}, - }; - return args; - } - else if constexpr(num_dim_spatial == 2) - { - Args args = { - .lengths = - { - .batch_size = N, - .groups = G, - .input_channels = C, - .output_channels = K, - .image = - { - .width = input_spatial_lengths[1], - .height = input_spatial_lengths[0], - }, - .filter = - { - .width = filter_spatial_lengths[1], - .height = filter_spatial_lengths[0], - }, - }, - .filter_strides = {.width = conv_filter_strides[1], .height = conv_filter_strides[0]}, - .filter_dilation = {.width = conv_filter_dilations[1], - .height = conv_filter_dilations[0]}, - .input_left_pad = {.width = input_left_pads[1], .height = input_left_pads[0]}, - .input_right_pad = {.width = input_right_pads[1], .height = input_right_pads[0]}, - .a_elementwise_op = {}, - .b_elementwise_op = {}, - .cde_elementwise_op = {}, - }; - return args; - } - else - { - Args args = { - .lengths = - { - .batch_size = N, - .groups = G, - .input_channels = C, - .output_channels = K, - .image = - { - .width = input_spatial_lengths[2], - .height = input_spatial_lengths[1], - .depth = input_spatial_lengths[0], - }, - .filter = - { - .width = filter_spatial_lengths[2], - .height = filter_spatial_lengths[1], - .depth = filter_spatial_lengths[0], - }, - }, - .filter_strides = {.width = conv_filter_strides[2], - .height = conv_filter_strides[1], - .depth = conv_filter_strides[0]}, - .filter_dilation = {.width = conv_filter_dilations[2], - .height = conv_filter_dilations[1], - .depth = conv_filter_dilations[0]}, - .input_left_pad = {.width = input_left_pads[2], - .height = input_left_pads[1], - .depth = input_left_pads[0]}, - .input_right_pad = {.width = input_right_pads[2], - .height = input_right_pads[1], - .depth = input_right_pads[0]}, - .a_elementwise_op = {}, - .b_elementwise_op = {}, - .cde_elementwise_op = {}, - }; - return args; - } + Args args = { + .lengths = + { + .batch_size = N, + .groups = G, + .input_channels = C, + .output_channels = K, + .image = from_vector(input_spatial_lengths), + .filter = from_vector(filter_spatial_lengths), + }, + .filter_strides = from_vector(conv_filter_strides), + .filter_dilation = from_vector(conv_filter_dilations), + .input_left_pad = from_vector(input_left_pads), + .input_right_pad = from_vector(input_right_pads), + .a_elementwise_op = {}, + .b_elementwise_op = {}, + .cde_elementwise_op = {}, + }; + return args; } /// @brief `Inputs` specialization for forward convolution. diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp index 1f00eff1216..7ac2abd3845 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp @@ -98,8 +98,7 @@ float run(CkConvInstance auto& conv, const Args& args, const Inputs& inputs, const Outputs& outputs, - [[maybe_unused]] const ck_tile::index_t k_batch = 1, - const StreamConfig s_conf = {}) + const StreamConfig s_conf = {}) { constexpr auto spatial_dim = SIGNATURE.spatial_dim; diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp index 36d95180790..428e439dc48 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp @@ -56,14 +56,13 @@ float run(CkTileConvInstance auto& conv, const Args& args, const Inputs& inputs, const Outputs& outputs, - const ck_tile::index_t k_batch = 1, const ck_tile::stream_config s_conf = {}) { using Conv = std::remove_reference_t; const auto param = args.to_ck_tile_conv_param(); ck_tile::GroupedConvFwdHostArgs<> host_args( - param, inputs.input, inputs.weight, {}, outputs.output, k_batch); + param, inputs.input, inputs.weight, {}, outputs.output, args.k_batch); auto kargs = Conv::MakeKernelArgs(host_args); diff --git a/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp b/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp index 3587ac406f1..e2f0abce1d6 100644 --- a/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp @@ -34,4 +34,25 @@ struct FilterExtent<3> size_t depth = 1; }; +template +FilterExtent from_vector(const std::vector& vec); + +template <> +FilterExtent<1> from_vector<1>(const std::vector& vec) +{ + return FilterExtent<1>{.width = vec[0]}; +} + +template <> +FilterExtent<2> from_vector<2>(const std::vector& vec) +{ + return FilterExtent<2>{.width = vec[1], .height = vec[0]}; +} + +template <> +FilterExtent<3> from_vector<3>(const std::vector& vec) +{ + return FilterExtent<3>{.width = vec[2], .height = vec[1], .depth = vec[0]}; +} + } // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/testing.hpp b/experimental/builder/include/ck_tile/builder/testing/testing.hpp index 609c93cacf2..f622152055b 100644 --- a/experimental/builder/include/ck_tile/builder/testing/testing.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/testing.hpp @@ -306,13 +306,15 @@ ValidationReport validate(const Args& args, /// @param inputs The input tensor data. Will not be modified by this function. /// @param outputs The output tensor data. The contents will be overwritten by /// this function. +/// @param s_conf Stream config used to launch kernel. /// /// @note This function is explicitly deleted to generate compile errors /// for missing implementations. -template +template void run(Operation& operation, const Args& args, const Inputs& inputs, - const Outputs& outputs) = delete; + const Outputs& outputs, + const StreamConf s_conf = {}) = delete; } // namespace ck_tile::builder::test diff --git a/experimental/builder/src/instances/instance_run.inc b/experimental/builder/src/instances/instance_run.inc index 7b74cd328da..6f51db2d17d 100644 --- a/experimental/builder/src/instances/instance_run.inc +++ b/experimental/builder/src/instances/instance_run.inc @@ -3,5 +3,4 @@ using Builder = ckb::ConvBuilder; using Instance = Builder::Instance; auto conv = Instance{}; -return std::make_tuple(ckt::run(conv, args, inputs, outputs, 1 /*k_batch*/, s_conf), - conv.GetInstanceString()); +return std::make_tuple(ckt::run(conv, args, inputs, outputs, s_conf), conv.GetInstanceString()); diff --git a/profiler/src/profile_grouped_conv_fwd_tile.cpp b/profiler/src/profile_grouped_conv_fwd_tile.cpp index 185a1841824..71dde5714a7 100644 --- a/profiler/src/profile_grouped_conv_fwd_tile.cpp +++ b/profiler/src/profile_grouped_conv_fwd_tile.cpp @@ -7,7 +7,6 @@ #include #include "../../experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" -#include "../../experimental/builder/test/utils/ckb_conv_test_utils.hpp" #include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp" #include "../../experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp" #include "ck_tile/host/device_prop.hpp" From c0dcba091b5d7def496969e9a13bccb39f06f694 Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Mon, 12 Jan 2026 16:56:44 +0000 Subject: [PATCH 07/24] fix --- experimental/builder/src/CMakeLists.txt | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/experimental/builder/src/CMakeLists.txt b/experimental/builder/src/CMakeLists.txt index 0a0cfea98a3..fba33b49399 100644 --- a/experimental/builder/src/CMakeLists.txt +++ b/experimental/builder/src/CMakeLists.txt @@ -2,7 +2,14 @@ # SPDX-License-Identifier: MIT if(GPU_TARGETS MATCHES "gfx9") + # Generate instances using python script if they are not exist + find_package(Python3 COMPONENTS Interpreter Development) + execute_process( + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_instances.py + RESULT_VARIABLE ret + ) + # Find cpp files and create lib for instances - file(GLOB_RECURSE GROUPED_CONV_FWD_TILE CONFIGURE_DEPENDS "instances/*.cpp") + file(GLOB_RECURSE GROUPED_CONV_FWD_TILE "instances/*.cpp") add_instance_library(device_grouped_conv_fwd_tile_instances ${GROUPED_CONV_FWD_TILE}) endif() From dcf8a5027869c84cedbfdd7e299ff464f4b63eac Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Mon, 12 Jan 2026 21:43:58 +0000 Subject: [PATCH 08/24] fix --- .../kernel/grouped_convolution_forward_kernel.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp index 6f0ee2216f7..555264eee8a 100644 --- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp +++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp @@ -980,7 +980,8 @@ struct GroupedConvolutionForwardKernel else { if constexpr(!(GroupedConvTraitsType_::VectorSizeC % 2 != 0 && - is_any_of::value)) + is_any_of::value) && + IsSplitKSupported) { auto c_block_window = MakeCBlockWindow( c_ptr, c_desc, block_idx_m, block_idx_n); From 527d98ee9e1df98302fa30c05d4705fb82bdc38c Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Mon, 12 Jan 2026 22:14:08 +0000 Subject: [PATCH 09/24] Change instances to empty list by default --- experimental/builder/src/CMakeLists.txt | 4 ++-- .../profiler/grouped_convolution_forward_tile_algs.hpp | 2 +- profiler/src/profile_grouped_conv_fwd_tile.cpp | 4 ++++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/experimental/builder/src/CMakeLists.txt b/experimental/builder/src/CMakeLists.txt index fba33b49399..ca4b2617be4 100644 --- a/experimental/builder/src/CMakeLists.txt +++ b/experimental/builder/src/CMakeLists.txt @@ -2,10 +2,10 @@ # SPDX-License-Identifier: MIT if(GPU_TARGETS MATCHES "gfx9") - # Generate instances using python script if they are not exist + # Generate instances using python script (empty to just generate empty instance list) find_package(Python3 COMPONENTS Interpreter Development) execute_process( - COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_instances.py + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_instances.py --filter_pattern=empty RESULT_VARIABLE ret ) diff --git a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp index 21a253e2f89..e26db108346 100644 --- a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp +++ b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp @@ -48,7 +48,7 @@ run_grouped_conv_forward_tile_algs(const ckt::Args& args, auto ref_conv = ReferenceInstance{}; ckt::run(ref_conv, args, inputs, reference.get()); - auto run_alg = [&](auto&& run_alg_func) { + [[maybe_unused]] auto run_alg = [&](auto&& run_alg_func) { std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); if(avg_time > 0.f) { diff --git a/profiler/src/profile_grouped_conv_fwd_tile.cpp b/profiler/src/profile_grouped_conv_fwd_tile.cpp index 71dde5714a7..2cdc9836374 100644 --- a/profiler/src/profile_grouped_conv_fwd_tile.cpp +++ b/profiler/src/profile_grouped_conv_fwd_tile.cpp @@ -138,6 +138,10 @@ int profile_grouped_conv_fwd_tile(int argc, char* argv[]) return 1; } + std::cout << "IMPORTANT: Generate instances using: python " + "experimental/builder/src/generate_instances.py" + << std::endl; + const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 10, argv); if(index_type == IndexType::LONG_INDEX_T) From 3291468441498df01a3d82ba6c614f86f353e84b Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Mon, 12 Jan 2026 22:44:08 +0000 Subject: [PATCH 10/24] fix --- .../include/ck_tile/builder/testing/conv_fwd.hpp | 12 ++++++------ .../ck_tile/builder/testing/filter_extent.hpp | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp index 6c6231f54d7..43640ccf3dc 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp @@ -258,13 +258,13 @@ CK_TILE_HOST auto parse_conv_args(int arg_idx, char* const argv[]) .groups = G, .input_channels = C, .output_channels = K, - .image = from_vector(input_spatial_lengths), - .filter = from_vector(filter_spatial_lengths), + .image = filter_extent_from_vector(input_spatial_lengths), + .filter = filter_extent_from_vector(filter_spatial_lengths), }, - .filter_strides = from_vector(conv_filter_strides), - .filter_dilation = from_vector(conv_filter_dilations), - .input_left_pad = from_vector(input_left_pads), - .input_right_pad = from_vector(input_right_pads), + .filter_strides = filter_extent_from_vector(conv_filter_strides), + .filter_dilation = filter_extent_from_vector(conv_filter_dilations), + .input_left_pad = filter_extent_from_vector(input_left_pads), + .input_right_pad = filter_extent_from_vector(input_right_pads), .a_elementwise_op = {}, .b_elementwise_op = {}, .cde_elementwise_op = {}, diff --git a/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp b/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp index e2f0abce1d6..e89057a38b0 100644 --- a/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp @@ -35,22 +35,22 @@ struct FilterExtent<3> }; template -FilterExtent from_vector(const std::vector& vec); +FilterExtent filter_extent_from_vector(const std::vector& vec); template <> -FilterExtent<1> from_vector<1>(const std::vector& vec) +FilterExtent<1> filter_extent_from_vector<1>(const std::vector& vec) { return FilterExtent<1>{.width = vec[0]}; } template <> -FilterExtent<2> from_vector<2>(const std::vector& vec) +FilterExtent<2> filter_extent_from_vector<2>(const std::vector& vec) { return FilterExtent<2>{.width = vec[1], .height = vec[0]}; } template <> -FilterExtent<3> from_vector<3>(const std::vector& vec) +FilterExtent<3> filter_extent_from_vector<3>(const std::vector& vec) { return FilterExtent<3>{.width = vec[2], .height = vec[1], .depth = vec[0]}; } From f754aa133dc98f07ba52bc00d6f2d3b8d7f628ad Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Tue, 13 Jan 2026 10:00:42 +0000 Subject: [PATCH 11/24] fix --- .../include/ck_tile/builder/testing/filter_extent.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp b/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp index e89057a38b0..2fc1f390127 100644 --- a/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/filter_extent.hpp @@ -35,22 +35,22 @@ struct FilterExtent<3> }; template -FilterExtent filter_extent_from_vector(const std::vector& vec); +inline FilterExtent filter_extent_from_vector(const std::vector& vec); template <> -FilterExtent<1> filter_extent_from_vector<1>(const std::vector& vec) +inline FilterExtent<1> filter_extent_from_vector<1>(const std::vector& vec) { return FilterExtent<1>{.width = vec[0]}; } template <> -FilterExtent<2> filter_extent_from_vector<2>(const std::vector& vec) +inline FilterExtent<2> filter_extent_from_vector<2>(const std::vector& vec) { return FilterExtent<2>{.width = vec[1], .height = vec[0]}; } template <> -FilterExtent<3> filter_extent_from_vector<3>(const std::vector& vec) +inline FilterExtent<3> filter_extent_from_vector<3>(const std::vector& vec) { return FilterExtent<3>{.width = vec[2], .height = vec[1], .depth = vec[0]}; } From 18d08a3925cd421ffcc173e4e672727308a73a92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Tue, 13 Jan 2026 15:48:39 +0100 Subject: [PATCH 12/24] Update grouped_convolution_signatures.hpp --- profiler/include/profiler/grouped_convolution_signatures.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/profiler/include/profiler/grouped_convolution_signatures.hpp b/profiler/include/profiler/grouped_convolution_signatures.hpp index b4da583b434..df71b50e70e 100644 --- a/profiler/include/profiler/grouped_convolution_signatures.hpp +++ b/profiler/include/profiler/grouped_convolution_signatures.hpp @@ -6,7 +6,6 @@ #include #include "../../experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" -#include "../../experimental/builder/test/utils/ckb_conv_test_utils.hpp" #include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp" #include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" From ca8d5af43e996cb2ad81d66cac3b7bdcd28b1500 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Tue, 13 Jan 2026 15:49:02 +0100 Subject: [PATCH 13/24] Update grouped_convolution_forward_tile_algs.hpp --- .../include/profiler/grouped_convolution_forward_tile_algs.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp index e26db108346..179d67f8fcf 100644 --- a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp +++ b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp @@ -6,7 +6,6 @@ #include #include "../../experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" -#include "../../experimental/builder/test/utils/ckb_conv_test_utils.hpp" #include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp" #include "grouped_convolution_signatures.hpp" From 0725777255f1bb2078e28c4682cb2957374bbf52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Thu, 15 Jan 2026 13:33:17 +0100 Subject: [PATCH 14/24] [CK TILE] Add grouped convolution forward tests (#3556) * [CK TILE] Add grouped convolution forward tests * fix jenkins --- Jenkinsfile | 37 ++- .../builder/testing/tensor_descriptor.hpp | 4 + experimental/builder/src/CMakeLists.txt | 12 +- .../configs/{ => profiler}/ndhwgc_bf16.conf | 0 .../configs/{ => profiler}/ndhwgc_fp16.conf | 0 .../configs/{ => profiler}/ndhwgc_fp32.conf | 0 .../configs/{ => profiler}/nhwgc_bf16.conf | 0 .../configs/{ => profiler}/nhwgc_fp16.conf | 0 .../configs/{ => profiler}/nhwgc_fp32.conf | 0 .../src/configs/tests/ndhwgc_bf16.conf | 41 +++ .../src/configs/tests/ndhwgc_fp16.conf | 41 +++ .../src/configs/tests/ndhwgc_fp32.conf | 41 +++ .../builder/src/configs/tests/nhwgc_bf16.conf | 41 +++ .../builder/src/configs/tests/nhwgc_fp16.conf | 41 +++ .../builder/src/configs/tests/nhwgc_fp32.conf | 41 +++ .../builder/src/generate_instances.py | 20 +- .../src/profile_grouped_conv_fwd_tile.cpp | 2 +- test/grouped_convnd_fwd/CMakeLists.txt | 9 + .../test_grouped_convnd_fwd_tile.cpp | 276 ++++++++++++++++++ 19 files changed, 597 insertions(+), 9 deletions(-) rename experimental/builder/src/configs/{ => profiler}/ndhwgc_bf16.conf (100%) rename experimental/builder/src/configs/{ => profiler}/ndhwgc_fp16.conf (100%) rename experimental/builder/src/configs/{ => profiler}/ndhwgc_fp32.conf (100%) rename experimental/builder/src/configs/{ => profiler}/nhwgc_bf16.conf (100%) rename experimental/builder/src/configs/{ => profiler}/nhwgc_fp16.conf (100%) rename experimental/builder/src/configs/{ => profiler}/nhwgc_fp32.conf (100%) create mode 100644 experimental/builder/src/configs/tests/ndhwgc_bf16.conf create mode 100644 experimental/builder/src/configs/tests/ndhwgc_fp16.conf create mode 100644 experimental/builder/src/configs/tests/ndhwgc_fp32.conf create mode 100644 experimental/builder/src/configs/tests/nhwgc_bf16.conf create mode 100644 experimental/builder/src/configs/tests/nhwgc_fp16.conf create mode 100644 experimental/builder/src/configs/tests/nhwgc_fp32.conf create mode 100644 test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp diff --git a/Jenkinsfile b/Jenkinsfile index 7292d9b70ca..2222c180402 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -578,7 +578,7 @@ def cmake_build(Map conf=[:]){ if (params.NINJA_BUILD_TRACE) { echo "running ninja build trace" } - if (params.RUN_BUILDER_TESTS && !setup_args.contains("-DCK_CXX_STANDARD=") && !setup_args.contains("gfx10") && !setup_args.contains("gfx11")) { + if ((params.RUN_BUILDER_TESTS || params.RUN_FULL_CONV_TILE_TESTS) && !setup_args.contains("-DCK_CXX_STANDARD=") && !setup_args.contains("gfx10") && !setup_args.contains("gfx11")) { setup_args = " -D CK_EXPERIMENTAL_BUILDER=ON " + setup_args } setup_cmd = conf.get( @@ -1119,7 +1119,7 @@ CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_ 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true 0 15 * * * % BUILD_INSTANCES_ONLY=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;FORCE_CI=true - 0 13 * * * % RUN_AITER_TESTS=true;BUILD_LEGACY_OS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;FORCE_CI=true + 0 13 * * * % RUN_FULL_CONV_TILE_TESTS=true;RUN_AITER_TESTS=true;BUILD_LEGACY_OS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;FORCE_CI=true 0 11 * * * % RUN_PYTORCH_TESTS=true;RUN_CODEGEN_TESTS=false;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;BUILD_GFX101=false;BUILD_GFX103=false;BUILD_GFX11=false;BUILD_GFX12=false;BUILD_GFX90A=false;FORCE_CI=true''' : "" pipeline { @@ -1283,6 +1283,10 @@ pipeline { name: "RUN_AITER_TESTS", defaultValue: false, description: "Run AITER tests with latest CK develop branch (default: OFF)") + booleanParam( + name: "RUN_FULL_CONV_TILE_TESTS", + defaultValue: false, + description: "Run AITER tests with latest CK develop branch (default: OFF)") string( name: 'aiter_branch', defaultValue: 'main', @@ -1451,6 +1455,35 @@ pipeline { } } } + stage("Run Full Grouped Conv Tile Tests") + { + when { + beforeAgent true + expression { env.SHOULD_RUN_CI.toBoolean() } + } + parallel + { + stage("Run Full Grouped Conv Tile Tests on gfx90a") + { + when { + beforeAgent true + expression { params.RUN_FULL_CONV_TILE_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx90a")} + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ python3 ../experimental/builder/src/generate_instances.py --mode=profiler && \ + ../script/cmake-ck-dev.sh ../ gfx90a && \ + make -j64 test_grouped_convnd_fwd_tile && \ + ./bin/test_grouped_convnd_fwd_tile""" + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } + } + } stage("Run Grouped Conv Large Case Tests") { when { diff --git a/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp b/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp index 69eb2b08930..6a150a02333 100644 --- a/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/tensor_descriptor.hpp @@ -418,6 +418,10 @@ struct TensorDescriptor size_t x = 1; for(size_t i = 0; i < RANK; ++i) { + if(lengths[indices[i]] == 1) + { + continue; + } if(strides[indices[i]] != x) return false; diff --git a/experimental/builder/src/CMakeLists.txt b/experimental/builder/src/CMakeLists.txt index ca4b2617be4..cdd682aeadc 100644 --- a/experimental/builder/src/CMakeLists.txt +++ b/experimental/builder/src/CMakeLists.txt @@ -3,11 +3,13 @@ if(GPU_TARGETS MATCHES "gfx9") # Generate instances using python script (empty to just generate empty instance list) - find_package(Python3 COMPONENTS Interpreter Development) - execute_process( - COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_instances.py --filter_pattern=empty - RESULT_VARIABLE ret - ) + if(NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/grouped_convolution_forward_tile_ndhwgc_fp32.inc) + find_package(Python3 COMPONENTS Interpreter Development) + execute_process( + COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/generate_instances.py --mode=tests + RESULT_VARIABLE ret + ) + endif() # Find cpp files and create lib for instances file(GLOB_RECURSE GROUPED_CONV_FWD_TILE "instances/*.cpp") diff --git a/experimental/builder/src/configs/ndhwgc_bf16.conf b/experimental/builder/src/configs/profiler/ndhwgc_bf16.conf similarity index 100% rename from experimental/builder/src/configs/ndhwgc_bf16.conf rename to experimental/builder/src/configs/profiler/ndhwgc_bf16.conf diff --git a/experimental/builder/src/configs/ndhwgc_fp16.conf b/experimental/builder/src/configs/profiler/ndhwgc_fp16.conf similarity index 100% rename from experimental/builder/src/configs/ndhwgc_fp16.conf rename to experimental/builder/src/configs/profiler/ndhwgc_fp16.conf diff --git a/experimental/builder/src/configs/ndhwgc_fp32.conf b/experimental/builder/src/configs/profiler/ndhwgc_fp32.conf similarity index 100% rename from experimental/builder/src/configs/ndhwgc_fp32.conf rename to experimental/builder/src/configs/profiler/ndhwgc_fp32.conf diff --git a/experimental/builder/src/configs/nhwgc_bf16.conf b/experimental/builder/src/configs/profiler/nhwgc_bf16.conf similarity index 100% rename from experimental/builder/src/configs/nhwgc_bf16.conf rename to experimental/builder/src/configs/profiler/nhwgc_bf16.conf diff --git a/experimental/builder/src/configs/nhwgc_fp16.conf b/experimental/builder/src/configs/profiler/nhwgc_fp16.conf similarity index 100% rename from experimental/builder/src/configs/nhwgc_fp16.conf rename to experimental/builder/src/configs/profiler/nhwgc_fp16.conf diff --git a/experimental/builder/src/configs/nhwgc_fp32.conf b/experimental/builder/src/configs/profiler/nhwgc_fp32.conf similarity index 100% rename from experimental/builder/src/configs/nhwgc_fp32.conf rename to experimental/builder/src/configs/profiler/nhwgc_fp32.conf diff --git a/experimental/builder/src/configs/tests/ndhwgc_bf16.conf b/experimental/builder/src/configs/tests/ndhwgc_bf16.conf new file mode 100644 index 00000000000..d6f856dc05c --- /dev/null +++ b/experimental/builder/src/configs/tests/ndhwgc_bf16.conf @@ -0,0 +1,41 @@ +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> \ No newline at end of file diff --git a/experimental/builder/src/configs/tests/ndhwgc_fp16.conf b/experimental/builder/src/configs/tests/ndhwgc_fp16.conf new file mode 100644 index 00000000000..d6f856dc05c --- /dev/null +++ b/experimental/builder/src/configs/tests/ndhwgc_fp16.conf @@ -0,0 +1,41 @@ +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> \ No newline at end of file diff --git a/experimental/builder/src/configs/tests/ndhwgc_fp32.conf b/experimental/builder/src/configs/tests/ndhwgc_fp32.conf new file mode 100644 index 00000000000..d6f856dc05c --- /dev/null +++ b/experimental/builder/src/configs/tests/ndhwgc_fp32.conf @@ -0,0 +1,41 @@ +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> \ No newline at end of file diff --git a/experimental/builder/src/configs/tests/nhwgc_bf16.conf b/experimental/builder/src/configs/tests/nhwgc_bf16.conf new file mode 100644 index 00000000000..d6f856dc05c --- /dev/null +++ b/experimental/builder/src/configs/tests/nhwgc_bf16.conf @@ -0,0 +1,41 @@ +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> \ No newline at end of file diff --git a/experimental/builder/src/configs/tests/nhwgc_fp16.conf b/experimental/builder/src/configs/tests/nhwgc_fp16.conf new file mode 100644 index 00000000000..d6f856dc05c --- /dev/null +++ b/experimental/builder/src/configs/tests/nhwgc_fp16.conf @@ -0,0 +1,41 @@ +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> \ No newline at end of file diff --git a/experimental/builder/src/configs/tests/nhwgc_fp32.conf b/experimental/builder/src/configs/tests/nhwgc_fp32.conf new file mode 100644 index 00000000000..d6f856dc05c --- /dev/null +++ b/experimental/builder/src/configs/tests/nhwgc_fp32.conf @@ -0,0 +1,41 @@ +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Default, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 64, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 1, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 32, 64, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 8, 8, 8, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Default, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 64, 64, 32, Filter1x1Stride1Pad0, 16, 16, 2, 2, 1, 2, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<64, 64, 64, 32, Default, 32, 32, 2, 2, 1, 1, 1, 1, 1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v5> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> \ No newline at end of file diff --git a/experimental/builder/src/generate_instances.py b/experimental/builder/src/generate_instances.py index 6082e952cb9..4064bcba9de 100644 --- a/experimental/builder/src/generate_instances.py +++ b/experimental/builder/src/generate_instances.py @@ -253,12 +253,30 @@ def generate_instances_fwd(instances, problem_name, config, filter_pattern): default="convolution", help="Filter pattern for configs.", ) + parser.add_argument( + "--mode", + choices=["compilation", "tests", "profiler"], + type=str, + default="profiler", + help="Generator modes. compilation - empty instance list, tests - limited instance list, profiler - generate all instances", + ) args = parser.parse_args() + # apply empty filter + if args.mode == "compilation": + args.filter_pattern = "empty" + configs_prefix = "profiler" + elif args.mode == "tests": + configs_prefix = "tests" + elif args.mode == "profiler": + configs_prefix = "profiler" + else: + raise RuntimeError("wrong mode") + for config in fwd_configs: instances = [] generate_dir = Path(__file__).resolve().parent - config_path = f"{generate_dir}/configs/{config}.conf" + config_path = f"{generate_dir}/configs/{configs_prefix}/{config}.conf" with open(config_path, "r") as file: instances = file.readlines() problem_name = f"grouped_convolution_forward_tile_{config}" diff --git a/profiler/src/profile_grouped_conv_fwd_tile.cpp b/profiler/src/profile_grouped_conv_fwd_tile.cpp index 2cdc9836374..326d3a2db5b 100644 --- a/profiler/src/profile_grouped_conv_fwd_tile.cpp +++ b/profiler/src/profile_grouped_conv_fwd_tile.cpp @@ -139,7 +139,7 @@ int profile_grouped_conv_fwd_tile(int argc, char* argv[]) } std::cout << "IMPORTANT: Generate instances using: python " - "experimental/builder/src/generate_instances.py" + "experimental/builder/src/generate_instances.py --mode=profiler and rerun cmake" << std::endl; const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 10, argv); diff --git a/test/grouped_convnd_fwd/CMakeLists.txt b/test/grouped_convnd_fwd/CMakeLists.txt index 5e2db1184c8..d01971226bc 100644 --- a/test/grouped_convnd_fwd/CMakeLists.txt +++ b/test/grouped_convnd_fwd/CMakeLists.txt @@ -19,6 +19,15 @@ if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12") target_link_libraries(test_grouped_convnd_fwd_large_cases PRIVATE gtest_main getopt::getopt utility device_grouped_conv1d_fwd_instance device_grouped_conv2d_fwd_instance device_grouped_conv3d_fwd_instance) endif() +if(GPU_TARGETS MATCHES "gfx9") + add_executable(test_grouped_convnd_fwd_tile test_grouped_convnd_fwd_tile.cpp) + target_compile_options(test_grouped_convnd_fwd_tile PRIVATE -Wno-global-constructors -Wno-undef -Wno-c++20-compat) + target_link_libraries(test_grouped_convnd_fwd_tile PRIVATE gtest_main getopt::getopt utility) + if(TARGET device_grouped_conv_fwd_tile_instances) + target_link_libraries(test_grouped_convnd_fwd_tile PRIVATE device_grouped_conv_fwd_tile_instances) + endif() +endif() + add_gtest_executable(test_grouped_convnd_fwd_multi_ab_interface test_grouped_convnd_fwd_multi_ab_interface.cpp) if(result EQUAL 0) target_link_libraries(test_grouped_convnd_fwd_multi_ab_interface PRIVATE utility) diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp new file mode 100644 index 00000000000..11f7a1f5fdd --- /dev/null +++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp @@ -0,0 +1,276 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include +#include + +#include "../../experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" +#include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp" +#include "../../experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp" +#include "ck_tile/host/device_prop.hpp" +#include "profiler/grouped_convolution_forward_tile_algs.hpp" + +// TODO: Remove limitation of conv fwd gpu reference which does not support right pad +#define CK_CONV_FWD_REF_SKIP_RIGHT_PAD_CASES 1 + +static ck::index_t args_mask = 0xffff; +static ck::index_t instance_index = -1; + +namespace ckb = ck_tile::builder; +namespace ckt = ck_tile::builder::test; +namespace cku = ck_tile::builder::test_utils; +namespace ckp = ck_tile::builder::profiling; + +template +struct SignatureDetails +{ + static constexpr ck_tile::index_t num_spatial_dim = num_spatial_dim_; + static constexpr ckb::DataType data_type = data_type_; + static constexpr ckb::DataType acc_data_type = acc_data_type_; + static constexpr ckb::TensorLayout in_layout = in_layout_; + static constexpr ckb::TensorLayout wei_layout = wei_layout_; + static constexpr ckb::TensorLayout out_layout = out_layout_; +}; + +template +class TestGroupedConvndFwdTile : public ::testing::Test +{ + protected: + static constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = SignatureDetailsType::num_spatial_dim, + .direction = ckb::ConvDirection::FORWARD, + .data_type = SignatureDetailsType::data_type, + .accumulation_data_type = SignatureDetailsType::acc_data_type, + .input = {.config = {.layout = SignatureDetailsType::in_layout}}, + .weight = {.config = {.layout = SignatureDetailsType::wei_layout}}, + .output = {.config = {.layout = SignatureDetailsType::out_layout}}}; + + std::vector> conv_args; + + template + void Run() + { + EXPECT_FALSE(conv_args.empty()); + bool pass = true; + for(size_t i = 0; i < conv_args.size(); i++) + { + if((args_mask & (1 << i)) == 0) + { + continue; + } + auto& args = conv_args[i]; + + auto inputs = alloc_inputs(args); + auto outputs = alloc_outputs(args); + ckt::init_inputs(args, inputs.get()); + + std::cout << args.make_input_descriptor() << std::endl; + std::cout << args.make_weight_descriptor() << std::endl; + std::cout << args.make_output_descriptor() << std::endl; + float avg_time; + std::string op_name; + bool case_passed; + std::tie(case_passed, avg_time, op_name) = ckp::run_grouped_conv_forward_tile_algs( + args, + inputs.get(), + outputs.get(), + ck_tile::stream_config{nullptr, false /*time_kernel*/}); + + pass = pass && case_passed; + } + EXPECT_TRUE(pass); + } + + void conv_args_append(std::size_t, + std::size_t G, + std::size_t N, + std::size_t K, + std::size_t C, + const std::vector& filter_spatial_lengths, + const std::vector& input_spatial_lengths, + const std::vector& conv_filter_strides, + const std::vector& conv_filter_dilations, + const std::vector& input_left_pads, + const std::vector& input_right_pads) + { +#if CK_CONV_FWD_REF_SKIP_RIGHT_PAD_CASES + bool without_right_pad = true; + for(const std::size_t& right_pad : input_right_pads) + { + without_right_pad &= right_pad == 0; + } + if(!without_right_pad) + { + return; + } +#endif + ckt::Args args = { + .lengths = + { + .batch_size = N, + .groups = G, + .input_channels = C, + .output_channels = K, + .image = ckt::filter_extent_from_vector( + input_spatial_lengths), + .filter = ckt::filter_extent_from_vector( + filter_spatial_lengths), + }, + .filter_strides = ckt::filter_extent_from_vector( + conv_filter_strides), + .filter_dilation = + ckt::filter_extent_from_vector( + conv_filter_dilations), + .input_left_pad = ckt::filter_extent_from_vector( + input_left_pads), + .input_right_pad = + ckt::filter_extent_from_vector( + input_right_pads), + .a_elementwise_op = {}, + .b_elementwise_op = {}, + .cde_elementwise_op = {}, + }; + conv_args.push_back(args); + } +}; + +using KernelTypes2d = ::testing::Types, + SignatureDetails<2, + ckb::DataType::FP16, + ckb::DataType::FP32, + ckb::TensorLayout::NHWGC, + ckb::TensorLayout::GKYXC, + ckb::TensorLayout::NHWGK>, + SignatureDetails<2, + ckb::DataType::BF16, + ckb::DataType::FP32, + ckb::TensorLayout::NHWGC, + ckb::TensorLayout::GKYXC, + ckb::TensorLayout::NHWGK>>; + +using KernelTypes3d = ::testing::Types, + SignatureDetails<3, + ckb::DataType::FP16, + ckb::DataType::FP32, + ckb::TensorLayout::NDHWGC, + ckb::TensorLayout::GKZYXC, + ckb::TensorLayout::NDHWGK>, + SignatureDetails<3, + ckb::DataType::BF16, + ckb::DataType::FP32, + ckb::TensorLayout::NDHWGC, + ckb::TensorLayout::GKZYXC, + ckb::TensorLayout::NDHWGK>>; + +template +class TestGroupedConvndFwdTile2d : public TestGroupedConvndFwdTile +{ +}; + +template +class TestGroupedConvndFwdTile3d : public TestGroupedConvndFwdTile +{ +}; + +TYPED_TEST_SUITE(TestGroupedConvndFwdTile2d, KernelTypes2d); +TYPED_TEST_SUITE(TestGroupedConvndFwdTile3d, KernelTypes3d); + +TYPED_TEST(TestGroupedConvndFwdTile2d, Test2D) +{ + this->conv_args.clear(); + this->conv_args_append(2, 3, 5, 96, 200, {1, 1}, {73, 128}, {1, 1}, {1, 1}, {0, 0}, {0, 0}); + this->conv_args_append(2, 1, 1, 32, 32, {1, 1}, {128, 128}, {1, 1}, {1, 1}, {0, 0}, {0, 0}); + this->conv_args_append(2, 1, 1, 32, 32, {2, 2}, {128, 128}, {1, 1}, {1, 1}, {0, 0}, {0, 0}); + this->conv_args_append(2, 1, 1, 32, 32, {3, 3}, {128, 128}, {1, 1}, {1, 1}, {0, 0}, {0, 0}); + this->conv_args_append(2, 1, 1, 32, 32, {5, 5}, {128, 128}, {1, 1}, {1, 1}, {0, 0}, {0, 0}); + this->conv_args_append(2, 1, 1, 32, 32, {9, 9}, {128, 128}, {1, 1}, {1, 1}, {0, 0}, {0, 0}); + + this->conv_args_append(2, 2, 32, 128, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}); + + this->conv_args_append(2, 2, 32, 128, 256, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}); + + this->conv_args_append(2, 2, 32, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}); + this->conv_args_append(2, 1, 1, 1, 32, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}); + this->conv_args_append(2, 1, 1, 64, 3, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}); + this->conv_args_append(2, 1, 1, 1, 1, {3, 3}, {32, 32}, {1, 1}, {1, 1}, {1, 1}, {1, 1}); + + this->conv_args_append(2, 96, 1, 1, 1, {1, 1}, {120, 160}, {1, 1}, {1, 1}, {1, 1}, {1, 1}); + this->conv_args_append(2, 96, 1, 1, 1, {3, 3}, {120, 160}, {1, 1}, {1, 1}, {1, 1}, {1, 1}); + this->template Run<2>(); +} + +TYPED_TEST(TestGroupedConvndFwdTile3d, Test3D) +{ + this->conv_args.clear(); + + this->conv_args_append( + 3, 3, 5, 96, 200, {1, 1, 1}, {37, 37, 16}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}); + this->conv_args_append( + 3, 1, 1, 32, 32, {1, 1, 1}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}); + this->conv_args_append( + 3, 1, 1, 32, 32, {2, 2, 2}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}); + this->conv_args_append( + 3, 1, 1, 32, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}); + this->conv_args_append( + 3, 1, 1, 32, 32, {5, 5, 5}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}); + this->conv_args_append( + 3, 1, 1, 32, 32, {9, 9, 9}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}); + + this->conv_args_append( + 3, 2, 32, 128, 256, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}); + this->conv_args_append( + 3, 2, 32, 128, 256, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}); + + this->conv_args_append( + 3, 2, 32, 128, 256, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}); + this->conv_args_append( + 3, 1, 1, 32, 32, {1, 1, 1}, {16, 16, 16}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}); + + this->conv_args_append( + 3, 1, 1, 1, 32, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}); + this->conv_args_append( + 3, 1, 1, 64, 3, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}); + this->conv_args_append( + 3, 1, 1, 1, 1, {3, 3, 3}, {32, 32, 32}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}); + + this->conv_args_append( + 3, 96, 1, 1, 1, {1, 1, 1}, {120, 40, 20}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}); + this->conv_args_append( + 3, 96, 1, 1, 1, {3, 3, 3}, {120, 40, 20}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}); + this->template Run<3>(); +} + +int main(int argc, char** argv) +{ + testing::InitGoogleTest(&argc, argv); + if(argc == 1) {} + else if(argc == 3) + { + args_mask = strtol(argv[1], nullptr, 0); + instance_index = atoi(argv[2]); + } + else + { + std::cout << "Usage of " << argv[0] << std::endl; + std::cout << "Arg1,2: args_mask instance_index(-1 means all)" << std::endl; + } + return RUN_ALL_TESTS(); +} From e81f6cf8940113ec4277f6346f623644b4f9c8ee Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Thu, 15 Jan 2026 18:41:52 -0500 Subject: [PATCH 15/24] fixes --- .gitignore | 1 + .../ck_tile/builder/testing/conv_fwd.hpp | 67 ------------- .../ck_tile/builder/testing/testing.hpp | 10 +- .../src/configs/tests/ndhwgc_bf16.conf | 6 +- .../src/configs/tests/ndhwgc_fp16.conf | 6 +- .../src/configs/tests/ndhwgc_fp32.conf | 17 ++-- .../builder/src/configs/tests/nhwgc_bf16.conf | 6 +- .../builder/src/configs/tests/nhwgc_fp16.conf | 6 +- .../builder/src/configs/tests/nhwgc_fp32.conf | 17 ++-- .../builder/src/generate_instances.py | 54 +++++------ .../grouped_convolution_forward_tile.cpp.in | 19 ++++ ...ion_forward_tile_ndhwgc_bf16_signature.inc | 12 --- ...ion_forward_tile_ndhwgc_fp16_signature.inc | 12 --- ...ion_forward_tile_ndhwgc_fp32_signature.inc | 12 --- ...tion_forward_tile_nhwgc_bf16_signature.inc | 12 --- ...tion_forward_tile_nhwgc_fp16_signature.inc | 12 --- ...tion_forward_tile_nhwgc_fp32_signature.inc | 12 --- .../src/instances/instance_includes.inc | 58 +++++++++++ .../test/impl/conv_signature_types.hpp | 25 +---- .../grouped_convolution_forward_tile_algs.hpp | 95 ++++++++++++++++--- .../grouped_convolution_signatures.hpp | 4 +- .../src/profile_grouped_conv_fwd_tile.cpp | 60 ++---------- .../test_grouped_convnd_fwd_tile.cpp | 6 +- 23 files changed, 235 insertions(+), 294 deletions(-) create mode 100644 experimental/builder/src/instances/grouped_convolution_forward_tile.cpp.in delete mode 100644 experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_bf16_signature.inc delete mode 100644 experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_fp16_signature.inc delete mode 100644 experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_fp32_signature.inc delete mode 100644 experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_bf16_signature.inc delete mode 100644 experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_fp16_signature.inc delete mode 100644 experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_fp32_signature.inc diff --git a/.gitignore b/.gitignore index 3cccad4a144..5c2b939ec16 100644 --- a/.gitignore +++ b/.gitignore @@ -93,5 +93,6 @@ test_data/* !experimental/builder !experimental/builder/** experimental/builder/src/instances/* +!experimental/builder/src/instances/*.in !experimental/builder/src/instances/*.inc experimental/builder/src/*.inc diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp index 43640ccf3dc..24b287a01ce 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp @@ -205,73 +205,6 @@ struct Args } }; -template -CK_TILE_HOST auto parse_conv_args(int arg_idx, char* const argv[]) -{ - const std::size_t G = static_cast(std::stol(argv[arg_idx++])); - const std::size_t N = static_cast(std::stol(argv[arg_idx++])); - const std::size_t K = static_cast(std::stol(argv[arg_idx++])); - const std::size_t C = static_cast(std::stol(argv[arg_idx++])); - - constexpr auto num_dim_spatial = SIGNATURE.spatial_dim; - - std::vector filter_spatial_lengths(num_dim_spatial); - std::vector input_spatial_lengths(num_dim_spatial); - std::vector conv_filter_strides(num_dim_spatial); - std::vector conv_filter_dilations(num_dim_spatial); - std::vector input_left_pads(num_dim_spatial); - std::vector input_right_pads(num_dim_spatial); - for(int i = 0; i < num_dim_spatial; ++i) - { - filter_spatial_lengths[i] = static_cast(std::stol(argv[arg_idx++])); - } - - for(int i = 0; i < num_dim_spatial; ++i) - { - input_spatial_lengths[i] = static_cast(std::stol(argv[arg_idx++])); - } - - for(int i = 0; i < num_dim_spatial; ++i) - { - conv_filter_strides[i] = static_cast(std::stol(argv[arg_idx++])); - } - - for(int i = 0; i < num_dim_spatial; ++i) - { - conv_filter_dilations[i] = static_cast(std::stol(argv[arg_idx++])); - } - - for(int i = 0; i < num_dim_spatial; ++i) - { - input_left_pads[i] = static_cast(std::stol(argv[arg_idx++])); - } - - for(int i = 0; i < num_dim_spatial; ++i) - { - input_right_pads[i] = static_cast(std::stol(argv[arg_idx++])); - } - - Args args = { - .lengths = - { - .batch_size = N, - .groups = G, - .input_channels = C, - .output_channels = K, - .image = filter_extent_from_vector(input_spatial_lengths), - .filter = filter_extent_from_vector(filter_spatial_lengths), - }, - .filter_strides = filter_extent_from_vector(conv_filter_strides), - .filter_dilation = filter_extent_from_vector(conv_filter_dilations), - .input_left_pad = filter_extent_from_vector(input_left_pads), - .input_right_pad = filter_extent_from_vector(input_right_pads), - .a_elementwise_op = {}, - .b_elementwise_op = {}, - .cde_elementwise_op = {}, - }; - return args; -} - /// @brief `Inputs` specialization for forward convolution. /// /// @tparam SIGNATURE Forward convolution signature. diff --git a/experimental/builder/include/ck_tile/builder/testing/testing.hpp b/experimental/builder/include/ck_tile/builder/testing/testing.hpp index 0f00a4dcd8d..2a9bc746622 100644 --- a/experimental/builder/include/ck_tile/builder/testing/testing.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/testing.hpp @@ -322,10 +322,10 @@ ValidationReport validate(const Args& args, /// @note This function is explicitly deleted to generate compile errors /// for missing implementations. template -void run(Operation& operation, - const Args& args, - const Inputs& inputs, - const Outputs& outputs, - const StreamConf s_conf = {}) = delete; +float run(Operation& operation, + const Args& args, + const Inputs& inputs, + const Outputs& outputs, + const StreamConf s_conf = {}) = delete; } // namespace ck_tile::builder::test diff --git a/experimental/builder/src/configs/tests/ndhwgc_bf16.conf b/experimental/builder/src/configs/tests/ndhwgc_bf16.conf index d6f856dc05c..9222a0858fd 100644 --- a/experimental/builder/src/configs/tests/ndhwgc_bf16.conf +++ b/experimental/builder/src/configs/tests/ndhwgc_bf16.conf @@ -14,9 +14,9 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> diff --git a/experimental/builder/src/configs/tests/ndhwgc_fp16.conf b/experimental/builder/src/configs/tests/ndhwgc_fp16.conf index d6f856dc05c..9222a0858fd 100644 --- a/experimental/builder/src/configs/tests/ndhwgc_fp16.conf +++ b/experimental/builder/src/configs/tests/ndhwgc_fp16.conf @@ -14,9 +14,9 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> diff --git a/experimental/builder/src/configs/tests/ndhwgc_fp32.conf b/experimental/builder/src/configs/tests/ndhwgc_fp32.conf index d6f856dc05c..b9704c81009 100644 --- a/experimental/builder/src/configs/tests/ndhwgc_fp32.conf +++ b/experimental/builder/src/configs/tests/ndhwgc_fp32.conf @@ -14,9 +14,9 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> @@ -26,16 +26,17 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stri DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> \ No newline at end of file +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 32, Default, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 32, Filter1x1Pad0, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> \ No newline at end of file diff --git a/experimental/builder/src/configs/tests/nhwgc_bf16.conf b/experimental/builder/src/configs/tests/nhwgc_bf16.conf index d6f856dc05c..9222a0858fd 100644 --- a/experimental/builder/src/configs/tests/nhwgc_bf16.conf +++ b/experimental/builder/src/configs/tests/nhwgc_bf16.conf @@ -14,9 +14,9 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> diff --git a/experimental/builder/src/configs/tests/nhwgc_fp16.conf b/experimental/builder/src/configs/tests/nhwgc_fp16.conf index d6f856dc05c..9222a0858fd 100644 --- a/experimental/builder/src/configs/tests/nhwgc_fp16.conf +++ b/experimental/builder/src/configs/tests/nhwgc_fp16.conf @@ -14,9 +14,9 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> diff --git a/experimental/builder/src/configs/tests/nhwgc_fp32.conf b/experimental/builder/src/configs/tests/nhwgc_fp32.conf index d6f856dc05c..b9704c81009 100644 --- a/experimental/builder/src/configs/tests/nhwgc_fp32.conf +++ b/experimental/builder/src/configs/tests/nhwgc_fp32.conf @@ -14,9 +14,9 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Default, 16, 16, 4, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 8> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 16> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<64, 64, 16, 16, Filter3x3, 16, 16, 4, 1, 4, 1, 1, 1, 1, 32> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 32, 32, 4, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Default, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 32, Filter1x1Stride1Pad0, 32, 32, 2, 2, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v4> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Default, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 256, 32, Filter1x1Stride1Pad0, 16, 16, 8, 8, 8, 8, 8, 1, 2, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v3> @@ -26,16 +26,17 @@ DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 128, 64, Filter1x1Stri DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Default, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 128, 256, 32, Filter1x1Stride1Pad0, 32, 32, 2, 4, 8, 8, 8, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v1> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Default, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 16, 32, 64, Default, 16, 16, 1, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Default, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 16, 64, Filter1x1Stride1Pad0, 16, 16, 1, 1, 8, 8, 2, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v1> -DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<256, 256, 32, 64, Filter1x1Stride1Pad0, 32, 32, 2, 1, 8, 8, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> \ No newline at end of file +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Default, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<64, 16, 16, 128, Filter1x1Stride1Pad0, 16, 16, 1, 1, 4, 4, 4, 1, 1, BlkGemmPipelineScheduler: Interwave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 32, Default, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 32, Filter1x1Pad0, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> +DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<128, 32, 128, 32, Filter1x1Stride1Pad0, 32, 32, 1, 2, 4, 4, 8, 1, 1, BlkGemmPipelineScheduler: Intrawave, BlkGemmPipelineVersion: v2> \ No newline at end of file diff --git a/experimental/builder/src/generate_instances.py b/experimental/builder/src/generate_instances.py index 4064bcba9de..476fa4a9ac9 100644 --- a/experimental/builder/src/generate_instances.py +++ b/experimental/builder/src/generate_instances.py @@ -114,7 +114,9 @@ def generate_defs_inc(instances, problem_name, signature, direction, filter_patt ) -def generate_fwd_cpp(instances, problem_name, config, direction, filter_pattern): +def generate_fwd_cpp( + instances, problem_name, config, direction, signature_name, filter_pattern +): for instance in instances: if problem_name.find(filter_pattern) == -1: break @@ -123,37 +125,24 @@ def generate_fwd_cpp(instances, problem_name, config, direction, filter_pattern) directory_path = Path(f"{generate_dir}/instances/{config}") directory_path.mkdir(parents=True, exist_ok=True) with open( - f"{generate_dir}/instances/{config}/{instance_name}.cpp", - "w", + f"{generate_dir}/instances/grouped_convolution_forward_tile.cpp.in", + "r", ) as f: - f.write( - f"// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.\n" - f"// SPDX-License-Identifier: MIT\n" - f'#include "../instance_includes.inc"\n' - f'#include "../{problem_name}_signature.inc"\n' - f"namespace ck_tile::builder::profiling {{\n" - f"std::tuple run_{instance_name}(\n" - f" const ckt::Args& args,\n" - f" const ckt::Inputs& inputs,\n" - f" const ckt::Outputs& outputs,\n" - f" const ck_tile::stream_config& s_conf) {{\n" - ) + content = f.read() - f.write( - f"constexpr auto ALGORITHM = cku::ConvAlgorithm_Tile_GroupedConvolutionKernel{{}}\n" - f" .with_tile_specializations({instance.get_specialization()})\n" - f" .with_tile_thread_block({instance.get_thread_block()})\n" - f" .with_tile_block_gemm({instance.get_block_gemm_desc()})\n" - f" .with_tile_transfer({instance.get_block_transfer()})\n" - f" .with_tile_optimizations(\n" - f" {instance.get_optimizations()});\n" - ) + content = content.replace("gen_signature", signature_name) + content = content.replace("gen_instance_name", instance_name) + content = content.replace("gen_specialization", instance.get_specialization()) + content = content.replace("gen_thread_block", instance.get_thread_block()) + content = content.replace("gen_block_gemm_desc", instance.get_block_gemm_desc()) + content = content.replace("gen_block_transfer", instance.get_block_transfer()) + content = content.replace("gen_optimizations", instance.get_optimizations()) - f.write( - '#include "../instance_run.inc"\n' - "}\n" - "} // namespace ck_tile::builder::profiling\n" - ) + with open( + f"{generate_dir}/instances/{config}/{instance_name}.cpp", + "w", + ) as f: + f.write(content) def parse_fwd_instances(instances, problem_name): @@ -222,16 +211,19 @@ def parse_fwd_instances(instances, problem_name): def generate_instances_fwd(instances, problem_name, config, filter_pattern): direction = "forward" + signature_name = f"SIGNATURE_{config.upper()}_FWD" instances = parse_fwd_instances(instances, problem_name) generate_calls_inc(instances, problem_name, direction, filter_pattern) generate_defs_inc( instances, problem_name, - f"SIGNATURE_{config.upper()}_FWD", + signature_name, direction, filter_pattern, ) - generate_fwd_cpp(instances, problem_name, config, direction, filter_pattern) + generate_fwd_cpp( + instances, problem_name, config, direction, signature_name, filter_pattern + ) if __name__ == "__main__": diff --git a/experimental/builder/src/instances/grouped_convolution_forward_tile.cpp.in b/experimental/builder/src/instances/grouped_convolution_forward_tile.cpp.in new file mode 100644 index 00000000000..e4cb1b7ad99 --- /dev/null +++ b/experimental/builder/src/instances/grouped_convolution_forward_tile.cpp.in @@ -0,0 +1,19 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT +#include "../instance_includes.inc" +namespace ck_tile::builder::profiling { +constexpr auto SIGNATURE = gen_signature; +std::tuple run_gen_instance_name( + const ckt::Args& args, + const ckt::Inputs& inputs, + const ckt::Outputs& outputs, + const ck_tile::stream_config& s_conf) { +constexpr auto ALGORITHM = cku::ConvAlgorithm_Tile_GroupedConvolutionKernel{} + .with_tile_specializations(gen_specialization) + .with_tile_thread_block(gen_thread_block) + .with_tile_block_gemm(gen_block_gemm_desc) + .with_tile_transfer(gen_block_transfer) + .with_tile_optimizations(gen_optimizations); +#include "../instance_run.inc" +} +} // namespace ck_tile::builder::profiling diff --git a/experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_bf16_signature.inc b/experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_bf16_signature.inc deleted file mode 100644 index e92c5cfb217..00000000000 --- a/experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_bf16_signature.inc +++ /dev/null @@ -1,12 +0,0 @@ -namespace { - -constexpr auto SIGNATURE = - ckt::ConvSignature{.spatial_dim = 3, - .direction = ckb::ConvDirection::FORWARD, - .data_type = ckb::DataType::BF16, - .accumulation_data_type = ckb::DataType::FP32, - .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, - .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, - .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; - -} // namespace diff --git a/experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_fp16_signature.inc b/experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_fp16_signature.inc deleted file mode 100644 index 03cd81e8683..00000000000 --- a/experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_fp16_signature.inc +++ /dev/null @@ -1,12 +0,0 @@ -namespace { - -constexpr auto SIGNATURE = - ckt::ConvSignature{.spatial_dim = 3, - .direction = ckb::ConvDirection::FORWARD, - .data_type = ckb::DataType::FP16, - .accumulation_data_type = ckb::DataType::FP32, - .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, - .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, - .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; - -} // namespace diff --git a/experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_fp32_signature.inc b/experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_fp32_signature.inc deleted file mode 100644 index 08ca9275e29..00000000000 --- a/experimental/builder/src/instances/grouped_convolution_forward_tile_ndhwgc_fp32_signature.inc +++ /dev/null @@ -1,12 +0,0 @@ -namespace { - -constexpr auto SIGNATURE = - ckt::ConvSignature{.spatial_dim = 3, - .direction = ckb::ConvDirection::FORWARD, - .data_type = ckb::DataType::FP32, - .accumulation_data_type = ckb::DataType::FP32, - .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, - .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, - .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; - -} // namespace diff --git a/experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_bf16_signature.inc b/experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_bf16_signature.inc deleted file mode 100644 index 908886e9b05..00000000000 --- a/experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_bf16_signature.inc +++ /dev/null @@ -1,12 +0,0 @@ -namespace { - -constexpr auto SIGNATURE = - ckt::ConvSignature{.spatial_dim = 2, - .direction = ckb::ConvDirection::FORWARD, - .data_type = ckb::DataType::BF16, - .accumulation_data_type = ckb::DataType::FP32, - .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, - .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, - .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; - -} // namespace diff --git a/experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_fp16_signature.inc b/experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_fp16_signature.inc deleted file mode 100644 index b2698bceef5..00000000000 --- a/experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_fp16_signature.inc +++ /dev/null @@ -1,12 +0,0 @@ -namespace { - -constexpr auto SIGNATURE = - ckt::ConvSignature{.spatial_dim = 2, - .direction = ckb::ConvDirection::FORWARD, - .data_type = ckb::DataType::FP16, - .accumulation_data_type = ckb::DataType::FP32, - .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, - .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, - .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; - -} // namespace diff --git a/experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_fp32_signature.inc b/experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_fp32_signature.inc deleted file mode 100644 index 56654b042c8..00000000000 --- a/experimental/builder/src/instances/grouped_convolution_forward_tile_nhwgc_fp32_signature.inc +++ /dev/null @@ -1,12 +0,0 @@ -namespace { - -constexpr auto SIGNATURE = - ckt::ConvSignature{.spatial_dim = 2, - .direction = ckb::ConvDirection::FORWARD, - .data_type = ckb::DataType::FP32, - .accumulation_data_type = ckb::DataType::FP32, - .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, - .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, - .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; - -} // namespace diff --git a/experimental/builder/src/instances/instance_includes.inc b/experimental/builder/src/instances/instance_includes.inc index 669b1ec4d9f..1d6baace44c 100644 --- a/experimental/builder/src/instances/instance_includes.inc +++ b/experimental/builder/src/instances/instance_includes.inc @@ -5,3 +5,61 @@ namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; namespace cku = ck_tile::builder::test_utils; + +namespace ck_tile::builder::profiling { + +constexpr auto SIGNATURE_NHWGC_FP32_FWD = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP32, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + +constexpr auto SIGNATURE_NHWGC_BF16_FWD = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::BF16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + +constexpr auto SIGNATURE_NHWGC_FP16_FWD = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + +constexpr auto SIGNATURE_NDHWGC_FP32_FWD = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP32, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + +constexpr auto SIGNATURE_NDHWGC_BF16_FWD = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::BF16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + +constexpr auto SIGNATURE_NDHWGC_FP16_FWD = + ckt::ConvSignature{.spatial_dim = 3, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + +} // namespace ck_tile::builder::profiling diff --git a/experimental/builder/test/impl/conv_signature_types.hpp b/experimental/builder/test/impl/conv_signature_types.hpp index ad146ddc06e..e90e10141df 100644 --- a/experimental/builder/test/impl/conv_signature_types.hpp +++ b/experimental/builder/test/impl/conv_signature_types.hpp @@ -17,11 +17,7 @@ struct TensorConfig DataType data_type{DataType::UNDEFINED_DATA_TYPE}; DataType compute_type{DataType::UNDEFINED_DATA_TYPE}; - constexpr bool operator==(const TensorConfig& other) const - { - return layout == other.layout && data_type == other.data_type && - compute_type == other.compute_type; - } + constexpr bool operator==(const TensorConfig& other) const = default; }; template @@ -38,11 +34,7 @@ struct TensorOperation .elementwise_operation = this->elementwise_operation}; } - constexpr bool operator==(const TensorOperation& other) const - { - return elementwise_operation == other.elementwise_operation && - auxiliary_operand_configs == other.auxiliary_operand_configs; - } + constexpr bool operator==(const TensorOperation& other) const = default; }; template > @@ -51,10 +43,7 @@ struct ConvolutionTensor TensorConfig config; Op operation{}; - constexpr bool operator==(const ConvolutionTensor& other) const - { - return config == other.config && operation == other.operation; - } + constexpr bool operator==(const ConvolutionTensor& other) const = default; }; template , @@ -70,13 +59,7 @@ struct ConvSignature WeightTensor weight; OutputTensor output; - constexpr bool operator==(const ConvSignature& other) const - { - return spatial_dim == other.spatial_dim && direction == other.direction && - data_type == other.data_type && - accumulation_data_type == other.accumulation_data_type && input == other.input && - weight == other.weight && output == other.output; - } + constexpr bool operator==(const ConvSignature& other) const = default; }; } // namespace ck_tile::builder::test diff --git a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp index 179d67f8fcf..d2c14f2f710 100644 --- a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp +++ b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp @@ -5,8 +5,8 @@ #include -#include "../../experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" -#include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp" +#include "experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" +#include "experimental/builder/test/utils/conv_algorithm_type_utils.hpp" #include "grouped_convolution_signatures.hpp" #include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" @@ -17,12 +17,79 @@ namespace ck_tile::builder::profiling { namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; -#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp32.inc" -#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_bf16.inc" -#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp16.inc" -#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp32.inc" -#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_bf16.inc" -#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp16.inc" +#include "experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp32.inc" +#include "experimental/builder/src/grouped_convolution_forward_tile_nhwgc_bf16.inc" +#include "experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp16.inc" +#include "experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp32.inc" +#include "experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_bf16.inc" +#include "experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp16.inc" + +template +auto parse_conv_args(int arg_idx, char* const argv[]) +{ + const std::size_t G = static_cast(std::stol(argv[arg_idx++])); + const std::size_t N = static_cast(std::stol(argv[arg_idx++])); + const std::size_t K = static_cast(std::stol(argv[arg_idx++])); + const std::size_t C = static_cast(std::stol(argv[arg_idx++])); + + constexpr auto num_dim_spatial = SIGNATURE.spatial_dim; + + std::vector filter_spatial_lengths(num_dim_spatial); + std::vector input_spatial_lengths(num_dim_spatial); + std::vector conv_filter_strides(num_dim_spatial); + std::vector conv_filter_dilations(num_dim_spatial); + std::vector input_left_pads(num_dim_spatial); + std::vector input_right_pads(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + filter_spatial_lengths[i] = static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + input_spatial_lengths[i] = static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + conv_filter_strides[i] = static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + conv_filter_dilations[i] = static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + input_left_pads[i] = static_cast(std::stol(argv[arg_idx++])); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + input_right_pads[i] = static_cast(std::stol(argv[arg_idx++])); + } + + ckt::Args args = { + .lengths = + { + .batch_size = N, + .groups = G, + .input_channels = C, + .output_channels = K, + .image = filter_extent_from_vector(input_spatial_lengths), + .filter = filter_extent_from_vector(filter_spatial_lengths), + }, + .filter_strides = filter_extent_from_vector(conv_filter_strides), + .filter_dilation = filter_extent_from_vector(conv_filter_dilations), + .input_left_pad = filter_extent_from_vector(input_left_pads), + .input_right_pad = filter_extent_from_vector(input_right_pads), + .a_elementwise_op = {}, + .b_elementwise_op = {}, + .cde_elementwise_op = {}, + }; + return args; +} /// @brief `run_grouped_conv_forward_tile_algs()` run all grouped conv fwd instances. /// @@ -67,27 +134,27 @@ run_grouped_conv_forward_tile_algs(const ckt::Args& args, if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP16_FWD) { -#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp16_calls.inc" +#include "experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NHWGC_BF16_FWD) { -#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_bf16_calls.inc" +#include "experimental/builder/src/grouped_convolution_forward_tile_nhwgc_bf16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP32_FWD) { -#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp32_calls.inc" +#include "experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp32_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_FP16_FWD) { -#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp16_calls.inc" +#include "experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_BF16_FWD) { -#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_bf16_calls.inc" +#include "experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_bf16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_FP32_FWD) { -#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp32_calls.inc" +#include "experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp32_calls.inc" } else { diff --git a/profiler/include/profiler/grouped_convolution_signatures.hpp b/profiler/include/profiler/grouped_convolution_signatures.hpp index df71b50e70e..2ee3c50b6e5 100644 --- a/profiler/include/profiler/grouped_convolution_signatures.hpp +++ b/profiler/include/profiler/grouped_convolution_signatures.hpp @@ -5,8 +5,8 @@ #include -#include "../../experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" -#include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp" +#include "experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" +#include "experimental/builder/test/utils/conv_algorithm_type_utils.hpp" #include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" namespace ck_tile::builder::profiling { diff --git a/profiler/src/profile_grouped_conv_fwd_tile.cpp b/profiler/src/profile_grouped_conv_fwd_tile.cpp index 326d3a2db5b..07ea3afd7b0 100644 --- a/profiler/src/profile_grouped_conv_fwd_tile.cpp +++ b/profiler/src/profile_grouped_conv_fwd_tile.cpp @@ -6,9 +6,9 @@ #include #include -#include "../../experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" -#include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp" -#include "../../experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp" +#include "experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" +#include "experimental/builder/test/utils/conv_algorithm_type_utils.hpp" +#include "experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp" #include "ck_tile/host/device_prop.hpp" #include "profiler/grouped_convolution_forward_tile_algs.hpp" @@ -156,40 +156,19 @@ int profile_grouped_conv_fwd_tile(int argc, char* argv[]) { if(data_type == ConvDataType::F32_F32_F32) { - constexpr auto SIGNATURE = - ckt::ConvSignature{.spatial_dim = 2, - .direction = ckb::ConvDirection::FORWARD, - .data_type = ckb::DataType::FP32, - .accumulation_data_type = ckb::DataType::FP32, - .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, - .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, - .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + constexpr auto SIGNATURE = ckp::SIGNATURE_NHWGC_FP32_FWD; return call_profiler(ckt::parse_conv_args(10, argv), time_kernel); } else if(data_type == ConvDataType::F16_F16_F16) { - constexpr auto SIGNATURE = - ckt::ConvSignature{.spatial_dim = 2, - .direction = ckb::ConvDirection::FORWARD, - .data_type = ckb::DataType::FP16, - .accumulation_data_type = ckb::DataType::FP32, - .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, - .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, - .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + constexpr auto SIGNATURE = ckp::SIGNATURE_NHWGC_FP16_FWD; return call_profiler(ckt::parse_conv_args(10, argv), time_kernel); } else if(data_type == ConvDataType::BF16_BF16_BF16) { - constexpr auto SIGNATURE = - ckt::ConvSignature{.spatial_dim = 2, - .direction = ckb::ConvDirection::FORWARD, - .data_type = ckb::DataType::BF16, - .accumulation_data_type = ckb::DataType::FP32, - .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, - .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, - .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + constexpr auto SIGNATURE = ckp::SIGNATURE_NHWGC_BF16_FWD; return call_profiler(ckt::parse_conv_args(10, argv), time_kernel); } @@ -198,40 +177,19 @@ int profile_grouped_conv_fwd_tile(int argc, char* argv[]) { if(data_type == ConvDataType::F32_F32_F32) { - constexpr auto SIGNATURE = - ckt::ConvSignature{.spatial_dim = 3, - .direction = ckb::ConvDirection::FORWARD, - .data_type = ckb::DataType::FP32, - .accumulation_data_type = ckb::DataType::FP32, - .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, - .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, - .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + constexpr auto SIGNATURE = ckp::SIGNATURE_NDHWGC_FP32_FWD; return call_profiler(ckt::parse_conv_args(10, argv), time_kernel); } else if(data_type == ConvDataType::F16_F16_F16) { - constexpr auto SIGNATURE = - ckt::ConvSignature{.spatial_dim = 3, - .direction = ckb::ConvDirection::FORWARD, - .data_type = ckb::DataType::FP16, - .accumulation_data_type = ckb::DataType::FP32, - .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, - .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, - .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + constexpr auto SIGNATURE = ckp::SIGNATURE_NDHWGC_FP16_FWD; return call_profiler(ckt::parse_conv_args(10, argv), time_kernel); } else if(data_type == ConvDataType::BF16_BF16_BF16) { - constexpr auto SIGNATURE = - ckt::ConvSignature{.spatial_dim = 3, - .direction = ckb::ConvDirection::FORWARD, - .data_type = ckb::DataType::BF16, - .accumulation_data_type = ckb::DataType::FP32, - .input = {.config = {.layout = ckb::TensorLayout::NDHWGC}}, - .weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}}, - .output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}}; + constexpr auto SIGNATURE = ckp::SIGNATURE_NDHWGC_BF16_FWD; return call_profiler(ckt::parse_conv_args(10, argv), time_kernel); } diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp index 11f7a1f5fdd..a9a3fd66d12 100644 --- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp +++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp @@ -7,9 +7,9 @@ #include #include -#include "../../experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" -#include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp" -#include "../../experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp" +#include "experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" +#include "experimental/builder/test/utils/conv_algorithm_type_utils.hpp" +#include "experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp" #include "ck_tile/host/device_prop.hpp" #include "profiler/grouped_convolution_forward_tile_algs.hpp" From 48444fc39a51d44f4ed18b806b620dd179a63705 Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Fri, 16 Jan 2026 05:49:59 -0500 Subject: [PATCH 16/24] comments fixes --- .../ck_tile/builder/testing/conv_fwd.hpp | 12 ++--- .../ck_tile/builder/testing/conv_fwd_ck.hpp | 18 ++++--- .../builder/testing/conv_fwd_ck_tile.hpp | 24 +++++---- .../builder/testing/conv_fwd_reference.hpp | 32 ++++++++---- .../ck_tile/builder/testing/testing.hpp | 12 +++-- .../builder/src/generate_instances.py | 2 +- .../grouped_convolution_forward_tile.cpp.in | 2 +- .../builder/src/instances/instance_run.inc | 5 +- .../grouped_convolution_forward_tile_algs.hpp | 50 ++++++++++--------- .../grouped_convolution_signatures.hpp | 3 +- .../src/profile_grouped_conv_fwd_tile.cpp | 17 +++---- .../test_grouped_convnd_fwd_tile.cpp | 5 +- 12 files changed, 102 insertions(+), 80 deletions(-) diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp index 24b287a01ce..e2624b41ca7 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp @@ -181,14 +181,14 @@ struct Args { const auto to_vector = [](const auto& extent) { if constexpr(SPATIAL_DIM == 1) - return std::vector{ck::index_t(extent.width)}; + return std::vector{ck::index_t(extent.width)}; else if constexpr(SPATIAL_DIM == 2) - return std::vector{ck::index_t(extent.height), - ck::index_t(extent.width)}; + return std::vector{ck::index_t(extent.height), + ck::index_t(extent.width)}; else - return std::vector{ck::index_t(extent.depth), - ck::index_t(extent.height), - ck::index_t(extent.width)}; + return std::vector{ck::index_t(extent.depth), + ck::index_t(extent.height), + ck::index_t(extent.width)}; }; return ck_tile::conv::ConvParam(SPATIAL_DIM, diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp index 7ac2abd3845..7d93512fb45 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp @@ -88,17 +88,19 @@ concept CkConvInstance = detail::CkConvInstance; /// @brief `run()` specialization for forward convolution and old CK. /// /// @tparam SIGNATURE Forward convolution signature. -/// @throws std::runtime_error if the arguments werent actually valid for the +/// @throws std::runtime_error if the arguments weren't actually valid for the /// operation. This should be caught and reported by the testing framework. +/// @return std::tuple - whether the problem is supported and +/// kernel execution time (0.0f if s_conf time_kernel is false). /// /// @see run() template requires ValidConvSignature && ConvDirectionIsForward -float run(CkConvInstance auto& conv, - const Args& args, - const Inputs& inputs, - const Outputs& outputs, - const StreamConfig s_conf = {}) +std::tuple run(CkConvInstance auto& conv, + const Args& args, + const Inputs& inputs, + const Outputs& outputs, + const StreamConfig s_conf = {}) { constexpr auto spatial_dim = SIGNATURE.spatial_dim; @@ -146,10 +148,10 @@ float run(CkConvInstance auto& conv, if(!conv.IsSupportedArgument(ck_args)) { - throw std::runtime_error("invalid argument"); + std::cout << "invalid argument" << std::endl; } - return conv.MakeInvoker().Run(ck_args, s_conf); + return std::make_tuple(true, conv.MakeInvoker().Run(ck_args, s_conf)); } } // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp index 428e439dc48..c4c39ba221a 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp @@ -6,6 +6,8 @@ #include "ck_tile/builder/testing/conv_fwd.hpp" #include "ck_tile/host/kernel_launch.hpp" #include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp" +#include "ck_tile/ops/gemm.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" #include #include @@ -46,17 +48,19 @@ concept CkTileConvInstance = detail::CkTileConvInstance; /// @brief `run()` specialization for forward convolution and CK Tile. /// /// @tparam SIGNATURE Forward convolution signature. -/// @throws std::runtime_error if the arguments werent actually valid for the +/// @throws std::runtime_error if the arguments weren't actually valid for the /// operation. This should be caught and reported by the testing framework. +/// @return std::tuple - whether the problem is supported and +/// kernel execution time (0.0f if s_conf time_kernel is false). /// /// @see run() template requires ValidConvSignature && ConvDirectionIsForward -float run(CkTileConvInstance auto& conv, - const Args& args, - const Inputs& inputs, - const Outputs& outputs, - const ck_tile::stream_config s_conf = {}) +std::tuple run(CkTileConvInstance auto& conv, + const Args& args, + const Inputs& inputs, + const Outputs& outputs, + const ck_tile::stream_config s_conf = {}) { using Conv = std::remove_reference_t; const auto param = args.to_ck_tile_conv_param(); @@ -72,14 +76,16 @@ float run(CkTileConvInstance auto& conv, if(!Conv::IsSupportedArgument(kargs)) { std::cout << "Not supported!"; - return 0.f; + return std::make_tuple(false, 0.f); } constexpr index_t minimum_occupancy = Conv::GemmPipeline::Scheduler == ck_tile::GemmPipelineScheduler::Intrawave ? 1 : 2; - return ck_tile::launch_kernel( - s_conf, ck_tile::make_kernel(conv, grids, blocks, 0, kargs)); + return std::make_tuple( + true, + ck_tile::launch_kernel( + s_conf, ck_tile::make_kernel(conv, grids, blocks, 0, kargs))); } } // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp index cf3b256c802..6401c6a5d57 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp @@ -62,6 +62,8 @@ concept RefConvInstance = requires(Conv& conv, /// @throws std::runtime_error if the arguments weren't actually valid for the /// operation. This should be caught and reported by the testing framework. /// +/// @return std::tuple - whether the problem is supported and +/// kernel execution time (0.0f for reference). /// @see run() template requires ValidConvSignature && @@ -69,10 +71,10 @@ template // for now, just concern outselves with reference and see when the // rest of the bwd/weight plumbing is there. ConvDirectionIsForward -float run(RefConvInstance auto& conv, - const Args& args, - const Inputs& inputs, - const Outputs& outputs) +std::tuple run(RefConvInstance auto& conv, + const Args& args, + const Inputs& inputs, + const Outputs& outputs) { // We don't want to compute the output dims manually, just get // them via the existing infrastructure @@ -86,15 +88,27 @@ float run(RefConvInstance auto& conv, for(auto right_pad : param.input_right_pads_) { if(right_pad != 0) - throw std::runtime_error("TODO: Support right pad in reference conv"); + { + std::cout << "TODO: Support right pad in reference conv" << std::endl; + return std::make_tuple(false, 0.0f); + } } if(!args.make_input_descriptor().is_packed()) - throw std::runtime_error("TODO: Support non-packed input tensor in reference conv"); + { + std::cout << "TODO: Support non-packed input tensor in reference conv" << std::endl; + return std::make_tuple(false, 0.0f); + } if(!args.make_weight_descriptor().is_packed()) - throw std::runtime_error("TODO: Support non-packed weight tensor in reference conv"); + { + std::cout << "TODO: Support non-packed weight tensor in reference conv" << std::endl; + return std::make_tuple(false, 0.0f); + } if(!args.make_output_descriptor().is_packed()) - throw std::runtime_error("TODO: Support non-packed output tensor in reference conv"); + { + std::cout << "TODO: Support non-packed output tensor in reference conv" << std::endl; + return std::make_tuple(false, 0.0f); + } conv.Run(inputs.input, inputs.weight, @@ -109,7 +123,7 @@ float run(RefConvInstance auto& conv, param.conv_filter_strides_, param.conv_filter_dilations_, param.input_left_pads_); - return 0.f; + return std::make_tuple(true, 0.0f); } } // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/testing.hpp b/experimental/builder/include/ck_tile/builder/testing/testing.hpp index 2a9bc746622..e61d7c4da5f 100644 --- a/experimental/builder/include/ck_tile/builder/testing/testing.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/testing.hpp @@ -318,14 +318,16 @@ ValidationReport validate(const Args& args, /// @param outputs The output tensor data. The contents will be overwritten by /// this function. /// @param s_conf Stream config used to launch kernel. +/// @return std::tuple - whether the problem is supported and +/// kernel execution time (0.0f if s_conf time_kernel is false). /// /// @note This function is explicitly deleted to generate compile errors /// for missing implementations. template -float run(Operation& operation, - const Args& args, - const Inputs& inputs, - const Outputs& outputs, - const StreamConf s_conf = {}) = delete; +std::tuple run(Operation& operation, + const Args& args, + const Inputs& inputs, + const Outputs& outputs, + const StreamConf s_conf = {}) = delete; } // namespace ck_tile::builder::test diff --git a/experimental/builder/src/generate_instances.py b/experimental/builder/src/generate_instances.py index 476fa4a9ac9..91424987f37 100644 --- a/experimental/builder/src/generate_instances.py +++ b/experimental/builder/src/generate_instances.py @@ -106,7 +106,7 @@ def generate_defs_inc(instances, problem_name, signature, direction, filter_patt for instance in instances: instance_name = problem_name + "_" + str(instance.id) f.write( - f"std::tuple run_{instance_name}(\n" + f"std::tuple run_{instance_name}(\n" f" const ckt::Args<{signature}>& args,\n" f" const ckt::Inputs<{signature}>& inputs,\n" f" const ckt::Outputs<{signature}>& outputs,\n" diff --git a/experimental/builder/src/instances/grouped_convolution_forward_tile.cpp.in b/experimental/builder/src/instances/grouped_convolution_forward_tile.cpp.in index e4cb1b7ad99..a9c5b7a7cba 100644 --- a/experimental/builder/src/instances/grouped_convolution_forward_tile.cpp.in +++ b/experimental/builder/src/instances/grouped_convolution_forward_tile.cpp.in @@ -3,7 +3,7 @@ #include "../instance_includes.inc" namespace ck_tile::builder::profiling { constexpr auto SIGNATURE = gen_signature; -std::tuple run_gen_instance_name( +std::tuple run_gen_instance_name( const ckt::Args& args, const ckt::Inputs& inputs, const ckt::Outputs& outputs, diff --git a/experimental/builder/src/instances/instance_run.inc b/experimental/builder/src/instances/instance_run.inc index 6f51db2d17d..6b8024fa93a 100644 --- a/experimental/builder/src/instances/instance_run.inc +++ b/experimental/builder/src/instances/instance_run.inc @@ -3,4 +3,7 @@ using Builder = ckb::ConvBuilder; using Instance = Builder::Instance; auto conv = Instance{}; -return std::make_tuple(ckt::run(conv, args, inputs, outputs, s_conf), conv.GetInstanceString()); +bool is_supported; +float avg_time; +std::tie(is_supported, avg_time) = ckt::run(conv, args, inputs, outputs, s_conf); +return std::make_tuple(is_supported, avg_time, conv.GetInstanceString()); diff --git a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp index d2c14f2f710..18d7bef5213 100644 --- a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp +++ b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp @@ -5,24 +5,25 @@ #include -#include "experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" -#include "experimental/builder/test/utils/conv_algorithm_type_utils.hpp" +#include "../../experimental/builder/test/utils/conv_algorithm_type_utils.hpp" #include "grouped_convolution_signatures.hpp" +#include "ck_tile/builder/testing/filter_extent.hpp" #include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" #include "ck_tile/builder/testing/conv_fwd_reference.hpp" +#include "ck_tile/builder/conv_builder.hpp" namespace ck_tile::builder::profiling { namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; -#include "experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp32.inc" -#include "experimental/builder/src/grouped_convolution_forward_tile_nhwgc_bf16.inc" -#include "experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp16.inc" -#include "experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp32.inc" -#include "experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_bf16.inc" -#include "experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp16.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp32.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_bf16.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp16.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp32.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_bf16.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp16.inc" template auto parse_conv_args(int arg_idx, char* const argv[]) @@ -77,15 +78,15 @@ auto parse_conv_args(int arg_idx, char* const argv[]) .groups = G, .input_channels = C, .output_channels = K, - .image = filter_extent_from_vector(input_spatial_lengths), - .filter = filter_extent_from_vector(filter_spatial_lengths), + .image = ckt::filter_extent_from_vector(input_spatial_lengths), + .filter = ckt::filter_extent_from_vector(filter_spatial_lengths), }, - .filter_strides = filter_extent_from_vector(conv_filter_strides), - .filter_dilation = filter_extent_from_vector(conv_filter_dilations), - .input_left_pad = filter_extent_from_vector(input_left_pads), - .input_right_pad = filter_extent_from_vector(input_right_pads), - .a_elementwise_op = {}, - .b_elementwise_op = {}, + .filter_strides = ckt::filter_extent_from_vector(conv_filter_strides), + .filter_dilation = ckt::filter_extent_from_vector(conv_filter_dilations), + .input_left_pad = ckt::filter_extent_from_vector(input_left_pads), + .input_right_pad = ckt::filter_extent_from_vector(input_right_pads), + .a_elementwise_op = {}, + .b_elementwise_op = {}, .cde_elementwise_op = {}, }; return args; @@ -105,6 +106,7 @@ run_grouped_conv_forward_tile_algs(const ckt::Args& args, { float best_avg_time = std::numeric_limits::max(); std::string best_op_name, op_name; + bool is_supported; float avg_time; bool valid = true; @@ -115,8 +117,8 @@ run_grouped_conv_forward_tile_algs(const ckt::Args& args, ckt::run(ref_conv, args, inputs, reference.get()); [[maybe_unused]] auto run_alg = [&](auto&& run_alg_func) { - std::tie(avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); - if(avg_time > 0.f) + std::tie(is_supported, avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); + if(is_supported) { const auto errors = ckt::validate(args, outputs, reference.get()).get_errors(); for(const auto& error : errors) @@ -134,27 +136,27 @@ run_grouped_conv_forward_tile_algs(const ckt::Args& args, if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP16_FWD) { -#include "experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp16_calls.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NHWGC_BF16_FWD) { -#include "experimental/builder/src/grouped_convolution_forward_tile_nhwgc_bf16_calls.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_bf16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP32_FWD) { -#include "experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp32_calls.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp32_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_FP16_FWD) { -#include "experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp16_calls.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_BF16_FWD) { -#include "experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_bf16_calls.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_bf16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_FP32_FWD) { -#include "experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp32_calls.inc" +#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp32_calls.inc" } else { diff --git a/profiler/include/profiler/grouped_convolution_signatures.hpp b/profiler/include/profiler/grouped_convolution_signatures.hpp index 2ee3c50b6e5..5103b0f2350 100644 --- a/profiler/include/profiler/grouped_convolution_signatures.hpp +++ b/profiler/include/profiler/grouped_convolution_signatures.hpp @@ -5,8 +5,7 @@ #include -#include "experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" -#include "experimental/builder/test/utils/conv_algorithm_type_utils.hpp" +#include "../../experimental/builder/test/impl/conv_signature_types.hpp" #include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" namespace ck_tile::builder::profiling { diff --git a/profiler/src/profile_grouped_conv_fwd_tile.cpp b/profiler/src/profile_grouped_conv_fwd_tile.cpp index 07ea3afd7b0..0053846e808 100644 --- a/profiler/src/profile_grouped_conv_fwd_tile.cpp +++ b/profiler/src/profile_grouped_conv_fwd_tile.cpp @@ -6,9 +6,7 @@ #include #include -#include "experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" -#include "experimental/builder/test/utils/conv_algorithm_type_utils.hpp" -#include "experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp" +#include "../../experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp" #include "ck_tile/host/device_prop.hpp" #include "profiler/grouped_convolution_forward_tile_algs.hpp" @@ -82,7 +80,6 @@ static void print_helper_msg() namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; -namespace cku = ck_tile::builder::test_utils; namespace ckp = ck_tile::builder::profiling; template @@ -157,19 +154,19 @@ int profile_grouped_conv_fwd_tile(int argc, char* argv[]) if(data_type == ConvDataType::F32_F32_F32) { constexpr auto SIGNATURE = ckp::SIGNATURE_NHWGC_FP32_FWD; - return call_profiler(ckt::parse_conv_args(10, argv), + return call_profiler(ckp::parse_conv_args(10, argv), time_kernel); } else if(data_type == ConvDataType::F16_F16_F16) { constexpr auto SIGNATURE = ckp::SIGNATURE_NHWGC_FP16_FWD; - return call_profiler(ckt::parse_conv_args(10, argv), + return call_profiler(ckp::parse_conv_args(10, argv), time_kernel); } else if(data_type == ConvDataType::BF16_BF16_BF16) { constexpr auto SIGNATURE = ckp::SIGNATURE_NHWGC_BF16_FWD; - return call_profiler(ckt::parse_conv_args(10, argv), + return call_profiler(ckp::parse_conv_args(10, argv), time_kernel); } } @@ -178,19 +175,19 @@ int profile_grouped_conv_fwd_tile(int argc, char* argv[]) if(data_type == ConvDataType::F32_F32_F32) { constexpr auto SIGNATURE = ckp::SIGNATURE_NDHWGC_FP32_FWD; - return call_profiler(ckt::parse_conv_args(10, argv), + return call_profiler(ckp::parse_conv_args(10, argv), time_kernel); } else if(data_type == ConvDataType::F16_F16_F16) { constexpr auto SIGNATURE = ckp::SIGNATURE_NDHWGC_FP16_FWD; - return call_profiler(ckt::parse_conv_args(10, argv), + return call_profiler(ckp::parse_conv_args(10, argv), time_kernel); } else if(data_type == ConvDataType::BF16_BF16_BF16) { constexpr auto SIGNATURE = ckp::SIGNATURE_NDHWGC_BF16_FWD; - return call_profiler(ckt::parse_conv_args(10, argv), + return call_profiler(ckp::parse_conv_args(10, argv), time_kernel); } } diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp index a9a3fd66d12..a7a55ba8ffa 100644 --- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp +++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp @@ -7,9 +7,7 @@ #include #include -#include "experimental/builder/test/utils/ckb_conv_tile_test_configs.hpp" -#include "experimental/builder/test/utils/conv_algorithm_type_utils.hpp" -#include "experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp" +#include "../../experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp" #include "ck_tile/host/device_prop.hpp" #include "profiler/grouped_convolution_forward_tile_algs.hpp" @@ -21,7 +19,6 @@ static ck::index_t instance_index = -1; namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; -namespace cku = ck_tile::builder::test_utils; namespace ckp = ck_tile::builder::profiling; template Date: Fri, 16 Jan 2026 05:54:33 -0500 Subject: [PATCH 17/24] unit test --- experimental/builder/test/unit_tensor_descriptor.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/experimental/builder/test/unit_tensor_descriptor.cpp b/experimental/builder/test/unit_tensor_descriptor.cpp index ce6209795a1..f14b7d5535b 100644 --- a/experimental/builder/test/unit_tensor_descriptor.cpp +++ b/experimental/builder/test/unit_tensor_descriptor.cpp @@ -190,6 +190,8 @@ TEST(TensorDescriptor, IsPacked) ckt::make_descriptor
(ckt::Extent{10, 11, 12}, ckt::Extent{1, 100, 1100}).is_packed()); EXPECT_FALSE( ckt::make_descriptor
(ckt::Extent{30, 20, 10}, ckt::Extent{1, 1, 1}).is_packed()); + EXPECT_TRUE( + ckt::make_descriptor
(ckt::Extent{30, 20, 1}, ckt::Extent{1, 600, 30}).is_packed()); } TEST(TensorDescriptor, PrintExtent) From b0d2562425540842c1cee8a52b02c7ed1396e4ca Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Fri, 16 Jan 2026 06:23:53 -0500 Subject: [PATCH 18/24] unit test fix --- experimental/builder/test/unit_tensor_descriptor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/experimental/builder/test/unit_tensor_descriptor.cpp b/experimental/builder/test/unit_tensor_descriptor.cpp index f14b7d5535b..8e6e2696103 100644 --- a/experimental/builder/test/unit_tensor_descriptor.cpp +++ b/experimental/builder/test/unit_tensor_descriptor.cpp @@ -191,7 +191,7 @@ TEST(TensorDescriptor, IsPacked) EXPECT_FALSE( ckt::make_descriptor
(ckt::Extent{30, 20, 10}, ckt::Extent{1, 1, 1}).is_packed()); EXPECT_TRUE( - ckt::make_descriptor
(ckt::Extent{30, 20, 1}, ckt::Extent{1, 600, 30}).is_packed()); + ckt::make_descriptor
(ckt::Extent{30, 20, 1}, ckt::Extent{1, 30, 30}).is_packed()); } TEST(TensorDescriptor, PrintExtent) From 888cafba77716dfb4efaef4aad844dd7d1d07bc8 Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Fri, 16 Jan 2026 09:24:08 -0500 Subject: [PATCH 19/24] Move instances outside builder --- .gitignore | 8 +++---- CMakeLists.txt | 1 + Jenkinsfile | 3 ++- experimental/builder/CMakeLists.txt | 2 -- .../CMakeLists.txt | 0 .../README.md | 2 ++ .../configs/profiler/ndhwgc_bf16.conf | 0 .../configs/profiler/ndhwgc_fp16.conf | 0 .../configs/profiler/ndhwgc_fp32.conf | 0 .../configs/profiler/nhwgc_bf16.conf | 0 .../configs/profiler/nhwgc_fp16.conf | 0 .../configs/profiler/nhwgc_fp32.conf | 0 .../configs/tests/ndhwgc_bf16.conf | 0 .../configs/tests/ndhwgc_fp16.conf | 0 .../configs/tests/ndhwgc_fp32.conf | 0 .../configs/tests/nhwgc_bf16.conf | 0 .../configs/tests/nhwgc_fp16.conf | 0 .../configs/tests/nhwgc_fp32.conf | 0 .../generate_instances.py | 0 .../grouped_convolution_forward_tile.cpp.in | 0 .../instances/instance_includes.inc | 4 ++-- .../instances/instance_run.inc | 0 .../grouped_convolution_forward_tile_algs.hpp | 24 +++++++++---------- test/grouped_convnd_fwd/CMakeLists.txt | 15 +++++++----- 24 files changed, 32 insertions(+), 27 deletions(-) rename experimental/{builder/src => grouped_convolution_tile_instances}/CMakeLists.txt (100%) create mode 100644 experimental/grouped_convolution_tile_instances/README.md rename experimental/{builder/src => grouped_convolution_tile_instances}/configs/profiler/ndhwgc_bf16.conf (100%) rename experimental/{builder/src => grouped_convolution_tile_instances}/configs/profiler/ndhwgc_fp16.conf (100%) rename experimental/{builder/src => grouped_convolution_tile_instances}/configs/profiler/ndhwgc_fp32.conf (100%) rename experimental/{builder/src => grouped_convolution_tile_instances}/configs/profiler/nhwgc_bf16.conf (100%) rename experimental/{builder/src => grouped_convolution_tile_instances}/configs/profiler/nhwgc_fp16.conf (100%) rename experimental/{builder/src => grouped_convolution_tile_instances}/configs/profiler/nhwgc_fp32.conf (100%) rename experimental/{builder/src => grouped_convolution_tile_instances}/configs/tests/ndhwgc_bf16.conf (100%) rename experimental/{builder/src => grouped_convolution_tile_instances}/configs/tests/ndhwgc_fp16.conf (100%) rename experimental/{builder/src => grouped_convolution_tile_instances}/configs/tests/ndhwgc_fp32.conf (100%) rename experimental/{builder/src => grouped_convolution_tile_instances}/configs/tests/nhwgc_bf16.conf (100%) rename experimental/{builder/src => grouped_convolution_tile_instances}/configs/tests/nhwgc_fp16.conf (100%) rename experimental/{builder/src => grouped_convolution_tile_instances}/configs/tests/nhwgc_fp32.conf (100%) rename experimental/{builder/src => grouped_convolution_tile_instances}/generate_instances.py (100%) rename experimental/{builder/src => grouped_convolution_tile_instances}/instances/grouped_convolution_forward_tile.cpp.in (100%) rename experimental/{builder/src => grouped_convolution_tile_instances}/instances/instance_includes.inc (96%) rename experimental/{builder/src => grouped_convolution_tile_instances}/instances/instance_run.inc (100%) diff --git a/.gitignore b/.gitignore index 5c2b939ec16..740d5464fb9 100644 --- a/.gitignore +++ b/.gitignore @@ -92,7 +92,7 @@ test_data/* # The experimental/builder directory should be tracked despite matching build* !experimental/builder !experimental/builder/** -experimental/builder/src/instances/* -!experimental/builder/src/instances/*.in -!experimental/builder/src/instances/*.inc -experimental/builder/src/*.inc +experimental/grouped_convolution_tile_instances/instances/* +!experimental/grouped_convolution_tile_instances/instances/*.in +!experimental/grouped_convolution_tile_instances/instances/*.inc +experimental/grouped_convolution_tile_instances/*.inc diff --git a/CMakeLists.txt b/CMakeLists.txt index 78c63ba7e58..cd7121b39db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -706,6 +706,7 @@ add_subdirectory(library) if (CK_EXPERIMENTAL_BUILDER) add_subdirectory(experimental/builder) + add_subdirectory(experimental/grouped_convolution_tile_instances) endif() if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY) diff --git a/Jenkinsfile b/Jenkinsfile index 039185b97fe..562a8fab797 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1450,7 +1450,8 @@ pipeline { ./bin/test_grouped_convnd_fwd_tile""" } steps{ - buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) + // TODO: Reenable after the instance fixes + // buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args) cleanWs() } } diff --git a/experimental/builder/CMakeLists.txt b/experimental/builder/CMakeLists.txt index 3fd713e9316..95b41da40b2 100644 --- a/experimental/builder/CMakeLists.txt +++ b/experimental/builder/CMakeLists.txt @@ -4,5 +4,3 @@ if(BUILD_TESTING) add_subdirectory(test) endif() - -add_subdirectory(src) diff --git a/experimental/builder/src/CMakeLists.txt b/experimental/grouped_convolution_tile_instances/CMakeLists.txt similarity index 100% rename from experimental/builder/src/CMakeLists.txt rename to experimental/grouped_convolution_tile_instances/CMakeLists.txt diff --git a/experimental/grouped_convolution_tile_instances/README.md b/experimental/grouped_convolution_tile_instances/README.md new file mode 100644 index 00000000000..b608298b45b --- /dev/null +++ b/experimental/grouped_convolution_tile_instances/README.md @@ -0,0 +1,2 @@ +# Grouped Convolution Tile Instances Generator +This functionality will be refactored and moved under the Tile Engine \ No newline at end of file diff --git a/experimental/builder/src/configs/profiler/ndhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_bf16.conf similarity index 100% rename from experimental/builder/src/configs/profiler/ndhwgc_bf16.conf rename to experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_bf16.conf diff --git a/experimental/builder/src/configs/profiler/ndhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp16.conf similarity index 100% rename from experimental/builder/src/configs/profiler/ndhwgc_fp16.conf rename to experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp16.conf diff --git a/experimental/builder/src/configs/profiler/ndhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp32.conf similarity index 100% rename from experimental/builder/src/configs/profiler/ndhwgc_fp32.conf rename to experimental/grouped_convolution_tile_instances/configs/profiler/ndhwgc_fp32.conf diff --git a/experimental/builder/src/configs/profiler/nhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_bf16.conf similarity index 100% rename from experimental/builder/src/configs/profiler/nhwgc_bf16.conf rename to experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_bf16.conf diff --git a/experimental/builder/src/configs/profiler/nhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp16.conf similarity index 100% rename from experimental/builder/src/configs/profiler/nhwgc_fp16.conf rename to experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp16.conf diff --git a/experimental/builder/src/configs/profiler/nhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp32.conf similarity index 100% rename from experimental/builder/src/configs/profiler/nhwgc_fp32.conf rename to experimental/grouped_convolution_tile_instances/configs/profiler/nhwgc_fp32.conf diff --git a/experimental/builder/src/configs/tests/ndhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_bf16.conf similarity index 100% rename from experimental/builder/src/configs/tests/ndhwgc_bf16.conf rename to experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_bf16.conf diff --git a/experimental/builder/src/configs/tests/ndhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp16.conf similarity index 100% rename from experimental/builder/src/configs/tests/ndhwgc_fp16.conf rename to experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp16.conf diff --git a/experimental/builder/src/configs/tests/ndhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp32.conf similarity index 100% rename from experimental/builder/src/configs/tests/ndhwgc_fp32.conf rename to experimental/grouped_convolution_tile_instances/configs/tests/ndhwgc_fp32.conf diff --git a/experimental/builder/src/configs/tests/nhwgc_bf16.conf b/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_bf16.conf similarity index 100% rename from experimental/builder/src/configs/tests/nhwgc_bf16.conf rename to experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_bf16.conf diff --git a/experimental/builder/src/configs/tests/nhwgc_fp16.conf b/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp16.conf similarity index 100% rename from experimental/builder/src/configs/tests/nhwgc_fp16.conf rename to experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp16.conf diff --git a/experimental/builder/src/configs/tests/nhwgc_fp32.conf b/experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp32.conf similarity index 100% rename from experimental/builder/src/configs/tests/nhwgc_fp32.conf rename to experimental/grouped_convolution_tile_instances/configs/tests/nhwgc_fp32.conf diff --git a/experimental/builder/src/generate_instances.py b/experimental/grouped_convolution_tile_instances/generate_instances.py similarity index 100% rename from experimental/builder/src/generate_instances.py rename to experimental/grouped_convolution_tile_instances/generate_instances.py diff --git a/experimental/builder/src/instances/grouped_convolution_forward_tile.cpp.in b/experimental/grouped_convolution_tile_instances/instances/grouped_convolution_forward_tile.cpp.in similarity index 100% rename from experimental/builder/src/instances/grouped_convolution_forward_tile.cpp.in rename to experimental/grouped_convolution_tile_instances/instances/grouped_convolution_forward_tile.cpp.in diff --git a/experimental/builder/src/instances/instance_includes.inc b/experimental/grouped_convolution_tile_instances/instances/instance_includes.inc similarity index 96% rename from experimental/builder/src/instances/instance_includes.inc rename to experimental/grouped_convolution_tile_instances/instances/instance_includes.inc index 1d6baace44c..cd389e3dce3 100644 --- a/experimental/builder/src/instances/instance_includes.inc +++ b/experimental/grouped_convolution_tile_instances/instances/instance_includes.inc @@ -1,5 +1,5 @@ -#include "../../test/utils/ckb_conv_tile_test_configs.hpp" -#include "../../test/utils/conv_algorithm_type_utils.hpp" +#include "../../builder/test/utils/ckb_conv_tile_test_configs.hpp" +#include "../../builder/test/utils/conv_algorithm_type_utils.hpp" #include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" namespace ckb = ck_tile::builder; diff --git a/experimental/builder/src/instances/instance_run.inc b/experimental/grouped_convolution_tile_instances/instances/instance_run.inc similarity index 100% rename from experimental/builder/src/instances/instance_run.inc rename to experimental/grouped_convolution_tile_instances/instances/instance_run.inc diff --git a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp index 18d7bef5213..e58c884729e 100644 --- a/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp +++ b/profiler/include/profiler/grouped_convolution_forward_tile_algs.hpp @@ -18,12 +18,12 @@ namespace ck_tile::builder::profiling { namespace ckb = ck_tile::builder; namespace ckt = ck_tile::builder::test; -#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp32.inc" -#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_bf16.inc" -#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp16.inc" -#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp32.inc" -#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_bf16.inc" -#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp16.inc" +#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_fp32.inc" +#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_bf16.inc" +#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_fp16.inc" +#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_ndhwgc_fp32.inc" +#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_ndhwgc_bf16.inc" +#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_ndhwgc_fp16.inc" template auto parse_conv_args(int arg_idx, char* const argv[]) @@ -136,27 +136,27 @@ run_grouped_conv_forward_tile_algs(const ckt::Args& args, if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP16_FWD) { -#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp16_calls.inc" +#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_fp16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NHWGC_BF16_FWD) { -#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_bf16_calls.inc" +#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_bf16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NHWGC_FP32_FWD) { -#include "../../experimental/builder/src/grouped_convolution_forward_tile_nhwgc_fp32_calls.inc" +#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_nhwgc_fp32_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_FP16_FWD) { -#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp16_calls.inc" +#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_ndhwgc_fp16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_BF16_FWD) { -#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_bf16_calls.inc" +#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_ndhwgc_bf16_calls.inc" } else if constexpr(SIGNATURE == SIGNATURE_NDHWGC_FP32_FWD) { -#include "../../experimental/builder/src/grouped_convolution_forward_tile_ndhwgc_fp32_calls.inc" +#include "../../experimental/grouped_convolution_tile_instances/grouped_convolution_forward_tile_ndhwgc_fp32_calls.inc" } else { diff --git a/test/grouped_convnd_fwd/CMakeLists.txt b/test/grouped_convnd_fwd/CMakeLists.txt index d01971226bc..6f8b71679c0 100644 --- a/test/grouped_convnd_fwd/CMakeLists.txt +++ b/test/grouped_convnd_fwd/CMakeLists.txt @@ -20,12 +20,15 @@ if(GPU_TARGETS MATCHES "gfx9|gfx11|gfx12") endif() if(GPU_TARGETS MATCHES "gfx9") - add_executable(test_grouped_convnd_fwd_tile test_grouped_convnd_fwd_tile.cpp) - target_compile_options(test_grouped_convnd_fwd_tile PRIVATE -Wno-global-constructors -Wno-undef -Wno-c++20-compat) - target_link_libraries(test_grouped_convnd_fwd_tile PRIVATE gtest_main getopt::getopt utility) - if(TARGET device_grouped_conv_fwd_tile_instances) - target_link_libraries(test_grouped_convnd_fwd_tile PRIVATE device_grouped_conv_fwd_tile_instances) - endif() + if(CK_EXPERIMENTAL_BUILDER) + # TODO: Reenable after the instance fixes + # add_executable(test_grouped_convnd_fwd_tile test_grouped_convnd_fwd_tile.cpp) + # target_compile_options(test_grouped_convnd_fwd_tile PRIVATE -Wno-global-constructors -Wno-undef -Wno-c++20-compat) + # target_link_libraries(test_grouped_convnd_fwd_tile PRIVATE gtest_main getopt::getopt utility) + # if(TARGET device_grouped_conv_fwd_tile_instances) + # target_link_libraries(test_grouped_convnd_fwd_tile PRIVATE device_grouped_conv_fwd_tile_instances) + # endif() + endif() endif() add_gtest_executable(test_grouped_convnd_fwd_multi_ab_interface test_grouped_convnd_fwd_multi_ab_interface.cpp) From bc1bf350634e759c5cde94934f3b722e8ace4dfc Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Fri, 16 Jan 2026 10:04:30 -0500 Subject: [PATCH 20/24] fix includes --- experimental/grouped_convolution_tile_instances/CMakeLists.txt | 2 ++ profiler/src/profile_grouped_conv_fwd_tile.cpp | 2 +- test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/experimental/grouped_convolution_tile_instances/CMakeLists.txt b/experimental/grouped_convolution_tile_instances/CMakeLists.txt index cdd682aeadc..1264a689064 100644 --- a/experimental/grouped_convolution_tile_instances/CMakeLists.txt +++ b/experimental/grouped_convolution_tile_instances/CMakeLists.txt @@ -14,4 +14,6 @@ if(GPU_TARGETS MATCHES "gfx9") # Find cpp files and create lib for instances file(GLOB_RECURSE GROUPED_CONV_FWD_TILE "instances/*.cpp") add_instance_library(device_grouped_conv_fwd_tile_instances ${GROUPED_CONV_FWD_TILE}) + target_include_directories(device_grouped_conv_fwd_tile_instances PRIVATE + "${PROJECT_SOURCE_DIR}/experimental/builder/test/utils") endif() diff --git a/profiler/src/profile_grouped_conv_fwd_tile.cpp b/profiler/src/profile_grouped_conv_fwd_tile.cpp index 0053846e808..8023dcf2f66 100644 --- a/profiler/src/profile_grouped_conv_fwd_tile.cpp +++ b/profiler/src/profile_grouped_conv_fwd_tile.cpp @@ -6,7 +6,7 @@ #include #include -#include "../../experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp" +#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" #include "ck_tile/host/device_prop.hpp" #include "profiler/grouped_convolution_forward_tile_algs.hpp" diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp index a7a55ba8ffa..c04a15ec982 100644 --- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp +++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_tile.cpp @@ -7,7 +7,7 @@ #include #include -#include "../../experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp" +#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" #include "ck_tile/host/device_prop.hpp" #include "profiler/grouped_convolution_forward_tile_algs.hpp" From 45b4c4597b39624689f1e8644cc9bf042a30009f Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Fri, 16 Jan 2026 11:57:59 -0500 Subject: [PATCH 21/24] clang format fix --- .../grouped_convolution_forward_tile.cpp.in | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/experimental/grouped_convolution_tile_instances/instances/grouped_convolution_forward_tile.cpp.in b/experimental/grouped_convolution_tile_instances/instances/grouped_convolution_forward_tile.cpp.in index a9c5b7a7cba..7e86576f7b1 100644 --- a/experimental/grouped_convolution_tile_instances/instances/grouped_convolution_forward_tile.cpp.in +++ b/experimental/grouped_convolution_tile_instances/instances/grouped_convolution_forward_tile.cpp.in @@ -3,17 +3,17 @@ #include "../instance_includes.inc" namespace ck_tile::builder::profiling { constexpr auto SIGNATURE = gen_signature; -std::tuple run_gen_instance_name( - const ckt::Args& args, - const ckt::Inputs& inputs, - const ckt::Outputs& outputs, - const ck_tile::stream_config& s_conf) { -constexpr auto ALGORITHM = cku::ConvAlgorithm_Tile_GroupedConvolutionKernel{} - .with_tile_specializations(gen_specialization) - .with_tile_thread_block(gen_thread_block) - .with_tile_block_gemm(gen_block_gemm_desc) - .with_tile_transfer(gen_block_transfer) - .with_tile_optimizations(gen_optimizations); +std::tuple run_gen_instance_name(const ckt::Args& args, + const ckt::Inputs& inputs, + const ckt::Outputs& outputs, + const ck_tile::stream_config& s_conf) +{ + constexpr auto ALGORITHM = cku::ConvAlgorithm_Tile_GroupedConvolutionKernel{} + .with_tile_specializations(gen_specialization) + .with_tile_thread_block(gen_thread_block) + .with_tile_block_gemm(gen_block_gemm_desc) + .with_tile_transfer(gen_block_transfer) + .with_tile_optimizations(gen_optimizations); #include "../instance_run.inc" } } // namespace ck_tile::builder::profiling From 9308292c8e3b0cd7e17bc622e6a1d419d4b3b2eb Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Fri, 16 Jan 2026 12:06:13 -0500 Subject: [PATCH 22/24] readme fix --- experimental/grouped_convolution_tile_instances/README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/experimental/grouped_convolution_tile_instances/README.md b/experimental/grouped_convolution_tile_instances/README.md index b608298b45b..1ba51896952 100644 --- a/experimental/grouped_convolution_tile_instances/README.md +++ b/experimental/grouped_convolution_tile_instances/README.md @@ -1,2 +1,5 @@ # Grouped Convolution Tile Instances Generator -This functionality will be refactored and moved under the Tile Engine \ No newline at end of file +CK Tile Convolution instances implemented via builder and generated via python script. +It is integrated with tests and ckProfiler +This functionality will be refactored and moved under the Tile Engine. +At now to speed up development and provide tests for CK Tile Convolution it has been implemented under experimental directory. From b1de110c2d7169a9a351610e77694371fabc77dc Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Mon, 19 Jan 2026 06:31:25 -0500 Subject: [PATCH 23/24] fix includes --- .../instances/instance_includes.inc | 1 - 1 file changed, 1 deletion(-) diff --git a/experimental/grouped_convolution_tile_instances/instances/instance_includes.inc b/experimental/grouped_convolution_tile_instances/instances/instance_includes.inc index cd389e3dce3..4b4c1444281 100644 --- a/experimental/grouped_convolution_tile_instances/instances/instance_includes.inc +++ b/experimental/grouped_convolution_tile_instances/instances/instance_includes.inc @@ -1,5 +1,4 @@ #include "../../builder/test/utils/ckb_conv_tile_test_configs.hpp" -#include "../../builder/test/utils/conv_algorithm_type_utils.hpp" #include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" namespace ckb = ck_tile::builder; From a7488f56b602363608710105a7ec2487196e3b5d Mon Sep 17 00:00:00 2001 From: Bartlomiej Kocot Date: Mon, 19 Jan 2026 15:11:15 +0000 Subject: [PATCH 24/24] fixes --- Jenkinsfile | 2 +- .../include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 562a8fab797..b4cc4c0f48b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1258,7 +1258,7 @@ pipeline { booleanParam( name: "RUN_FULL_CONV_TILE_TESTS", defaultValue: false, - description: "Run AITER tests with latest CK develop branch (default: OFF)") + description: "Run CK Tile grouped convolution tests with latest CK develop branch (default: OFF)") string( name: 'aiter_branch', defaultValue: 'main', diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp index c4c39ba221a..a8f68255249 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp @@ -14,7 +14,7 @@ /// This file contains the implementation details for invoking/testing /// grouped convolution operations in CK Tile. The main item is the /// `run()` function, which is the main implementation used to invoke -/// CK grouped forward convolution kernels. +/// CK Tile grouped forward convolution kernels. namespace ck_tile::builder::test {