Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
ac6bd87
[BULDER] Add grouped conv fwd ck tile profiler
bartekxk Jan 6, 2026
7798fec
Merge branch 'develop' of github.com:ROCm/composable_kernel into bark…
bartekxk Jan 6, 2026
02151f0
[CK TILE] Fix grouped conv kernels splitk and double lds
bartekxk Jan 7, 2026
04ee697
Updates
bartekxk Jan 7, 2026
48d9113
Fixes
bartekxk Jan 7, 2026
c7709ea
Move to ckProfiler
bartekxk Jan 11, 2026
2b82f21
Fixes
bartekxk Jan 12, 2026
bed4e7e
Merge branch 'develop' of github.com:ROCm/composable_kernel into bark…
bartekxk Jan 12, 2026
c0dcba0
fix
bartekxk Jan 12, 2026
dcf8a50
fix
bartekxk Jan 12, 2026
527d98e
Change instances to empty list by default
bartekxk Jan 12, 2026
3291468
fix
bartekxk Jan 12, 2026
f754aa1
fix
bartekxk Jan 13, 2026
18d08a3
Update grouped_convolution_signatures.hpp
bartekxk Jan 13, 2026
ca8d5af
Update grouped_convolution_forward_tile_algs.hpp
bartekxk Jan 13, 2026
97f5953
Merge branch 'develop' into barkocot/tile-builder-testing
bartekxk Jan 13, 2026
0725777
[CK TILE] Add grouped convolution forward tests (#3556)
bartekxk Jan 15, 2026
e81f6cf
fixes
bartekxk Jan 15, 2026
48444fc
comments fixes
bartekxk Jan 16, 2026
0922ea0
unit test
bartekxk Jan 16, 2026
afd634c
Merge branch 'develop' of github.com:ROCm/composable_kernel into bark…
bartekxk Jan 16, 2026
b0d2562
unit test fix
bartekxk Jan 16, 2026
888cafb
Move instances outside builder
bartekxk Jan 16, 2026
bc1bf35
fix includes
bartekxk Jan 16, 2026
45b4c45
clang format fix
bartekxk Jan 16, 2026
9308292
readme fix
bartekxk Jan 16, 2026
b1de110
fix includes
bartekxk Jan 19, 2026
a7488f5
fixes
bartekxk Jan 19, 2026
7a8393d
Merge branch 'develop' into barkocot/tile-builder-testing
bartekxk Jan 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,7 @@ test_data/*
# The experimental/builder directory should be tracked despite matching build*
!experimental/builder
!experimental/builder/**
experimental/grouped_convolution_tile_instances/instances/*
!experimental/grouped_convolution_tile_instances/instances/*.in
!experimental/grouped_convolution_tile_instances/instances/*.inc
experimental/grouped_convolution_tile_instances/*.inc
9 changes: 5 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -704,6 +704,11 @@ option(BUILD_MHA_LIB "Build the static library for flash attention" OFF)

add_subdirectory(library)

if (CK_EXPERIMENTAL_BUILDER)
# Experimental tile-builder targets. The grouped-conv tile instances build on
# top of the builder, so both subdirectories are gated by the same option and
# added together (before the test/profiler subdirectories that may use them).
add_subdirectory(experimental/builder)
add_subdirectory(experimental/grouped_convolution_tile_instances)
endif()

if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY)
rocm_package_setup_component(tests
LIBRARY_NAME composablekernel
Expand Down Expand Up @@ -735,10 +740,6 @@ if (NOT MIOPEN_REQ_LIBS_ONLY)
add_subdirectory(profiler)
endif()

if (CK_EXPERIMENTAL_BUILDER)
add_subdirectory(experimental/builder)
endif()

if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
add_subdirectory(codegen)
endif()
Expand Down
38 changes: 36 additions & 2 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,7 @@ def cmake_build(Map conf=[:]){
if (params.NINJA_BUILD_TRACE) {
echo "running ninja build trace"
}
if (params.RUN_BUILDER_TESTS && !setup_args.contains("-DCK_CXX_STANDARD=") && !setup_args.contains("gfx10") && !setup_args.contains("gfx11")) {
if ((params.RUN_BUILDER_TESTS || params.RUN_FULL_CONV_TILE_TESTS) && !setup_args.contains("-DCK_CXX_STANDARD=") && !setup_args.contains("gfx10") && !setup_args.contains("gfx11")) {
setup_args = " -D CK_EXPERIMENTAL_BUILDER=ON " + setup_args
}
setup_cmd = conf.get(
Expand Down Expand Up @@ -1091,7 +1091,7 @@ CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;RUN_
0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true
0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true;FORCE_CI=true
0 15 * * * % BUILD_INSTANCES_ONLY=true;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;FORCE_CI=true
0 13 * * * % RUN_AITER_TESTS=true;BUILD_LEGACY_OS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;FORCE_CI=true
0 13 * * * % RUN_FULL_CONV_TILE_TESTS=true;RUN_AITER_TESTS=true;BUILD_LEGACY_OS=true;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;FORCE_CI=true
0 11 * * * % RUN_PYTORCH_TESTS=true;RUN_CODEGEN_TESTS=false;USE_SCCACHE=false;RUN_PERFORMANCE_TESTS=false;BUILD_GFX101=false;BUILD_GFX103=false;BUILD_GFX11=false;BUILD_GFX12=false;BUILD_GFX90A=false;FORCE_CI=true''' : ""

pipeline {
Expand Down Expand Up @@ -1255,6 +1255,10 @@ pipeline {
name: "RUN_AITER_TESTS",
defaultValue: false,
description: "Run AITER tests with latest CK develop branch (default: OFF)")
booleanParam(
name: "RUN_FULL_CONV_TILE_TESTS",
defaultValue: false,
description: "Run CK Tile grouped convolution tests with latest CK develop branch (default: OFF)")
string(
name: 'aiter_branch',
defaultValue: 'main',
Expand Down Expand Up @@ -1423,6 +1427,36 @@ pipeline {
}
}
}
// Nightly/cron stage: builds the experimental CK Tile grouped convolution
// instances (generated via the builder's generate_instances.py) and runs the
// full grouped conv forward test binary on a gfx90a node.
stage("Run Full Grouped Conv Tile Tests")
{
when {
beforeAgent true
expression { env.SHOULD_RUN_CI.toBoolean() }
}
parallel
{
stage("Run Full Grouped Conv Tile Tests on gfx90a")
{
when {
beforeAgent true
// Only runs when explicitly requested (RUN_FULL_CONV_TILE_TESTS param,
// set by the 13:00 cron entry or manually).
expression { params.RUN_FULL_CONV_TILE_TESTS.toBoolean() }
}
agent{ label rocmnode("gfx90a")}
environment{
// NO_CK_BUILD: skip the default CK build; the execute step below
// generates the instances and configures/builds only the test target.
setup_args = "NO_CK_BUILD"
execute_args = """ python3 ../experimental/builder/src/generate_instances.py --mode=profiler && \
../script/cmake-ck-dev.sh ../ gfx90a && \
make -j64 test_grouped_convnd_fwd_tile && \
./bin/test_grouped_convnd_fwd_tile"""
}
steps{
// TODO: Reenable after the instance fixes
// buildHipClangJobAndReboot(setup_args:setup_args, build_type: 'Release', execute_cmd: execute_args)
cleanWs()
}
}
}
}
stage("Run Grouped Conv Large Case Tests")
{
when {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,27 +98,26 @@ struct ConvTileFactory
using GemmPipeline = typename internal::TilePipelineType<
BLOCK_GEMM.pipeline_version>::template GemmPipeline<UniversalGemmProblem>;

using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
typename Types::ADataType,
typename Types::BDataType,
typename Types::DsDataTypes,
typename Types::AccDataType,
typename Types::EDataType,
typename GroupedConvTraitsType::ImplicitGemmDsLayout,
typename GroupedConvTraitsType::FixedGemmParams::ELayout,
typename Ops::CDEElementwiseOp,
BLOCK.per_block.m,
BLOCK.per_block.n,
BLOCK_GEMM.warps.m,
BLOCK_GEMM.warps.n,
BLOCK_GEMM.warp_tile.m,
BLOCK_GEMM.warp_tile.n,
BLOCK_GEMM.warp_tile.k,
GroupedConvTraitsType::FixedGemmParams::TransposeC,
// TODO:: This template parameter will be moved inside the kernel
BLOCK_GEMM.num_wave_groups,
GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
SCALAR_PER_VECTOR.c>>;
using ConvEpilogue = ck_tile::CShuffleEpilogue<
ck_tile::CShuffleEpilogueProblem<typename Types::ADataType,
typename Types::BDataType,
typename Types::DsDataTypes,
typename Types::AccDataType,
typename Types::EDataType,
typename GroupedConvTraitsType::ImplicitGemmDsLayout,
typename GroupedConvTraitsType::FixedGemmParams::ELayout,
typename Ops::CDEElementwiseOp,
BLOCK.per_block.m,
BLOCK.per_block.n,
BLOCK_GEMM.warps.m,
BLOCK_GEMM.warps.n,
BLOCK_GEMM.warp_tile.m,
BLOCK_GEMM.warp_tile.n,
BLOCK_GEMM.warp_tile.k,
GroupedConvTraitsType::FixedGemmParams::TransposeC,
BLOCK_GEMM.num_wave_groups,
GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
SCALAR_PER_VECTOR.c>>;

using Instance = typename internal::GroupedConvolutionTileKernel<SIGNATURE,
GroupedConvTraitsType,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "ck_tile/builder/testing/testing_reflect.hpp"
#include "ck_tile/builder/testing/filter_extent.hpp"
#include "ck_tile/builder/testing/tensor_buffer.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
#include "ck_tile/builder/testing/tensor_initialization.hpp"
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
#include "ck_tile/builder/testing/validation.hpp"
Expand Down Expand Up @@ -93,6 +94,8 @@ struct Args<SIGNATURE>
Ops::WeiElementwiseOp b_elementwise_op;
Ops::OutElementwiseOp cde_elementwise_op;

int k_batch = 1;

/// This function returns the `TensorDescriptor` corresponding to
/// the input-tensor of the convolution problem. This can then
/// be used to, for example, allocate memory.
Expand Down Expand Up @@ -169,6 +172,36 @@ struct Args<SIGNATURE>
to_vector(this->input_left_pad),
to_vector(this->input_right_pad));
}

/// Convert the Args structure into a CK Tile conv_param structure.
/// This function is mainly used to be able to use the existing
/// CK Tile functionality to obtain tensor descriptors.
ck_tile::conv::ConvParam to_ck_tile_conv_param() const
{
    // Collect the spatial components of an extent (depth/height/width,
    // innermost dimension last) into a vector of ck_tile::index_t.
    // NOTE: the old-CK path has a near-identical helper returning
    // std::vector<ck::index_t>; the differing element type is why the
    // lambda is duplicated here rather than shared.
    const auto to_vector = [](const auto& extent) {
        if constexpr(SPATIAL_DIM == 1)
            return std::vector<ck_tile::index_t>{ck_tile::index_t(extent.width)};
        else if constexpr(SPATIAL_DIM == 2)
            return std::vector<ck_tile::index_t>{ck_tile::index_t(extent.height),
                                                 ck_tile::index_t(extent.width)};
        else
            return std::vector<ck_tile::index_t>{ck_tile::index_t(extent.depth),
                                                 ck_tile::index_t(extent.height),
                                                 ck_tile::index_t(extent.width)};
    };

    return ck_tile::conv::ConvParam(SPATIAL_DIM,
                                    this->lengths.groups,
                                    this->lengths.batch_size,
                                    this->lengths.output_channels,
                                    this->lengths.input_channels,
                                    to_vector(this->lengths.filter),
                                    to_vector(this->lengths.image),
                                    to_vector(this->filter_strides),
                                    to_vector(this->filter_dilation),
                                    to_vector(this->input_left_pad),
                                    to_vector(this->input_right_pad));
}
};

/// @brief `Inputs` specialization for forward convolution.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#pragma once

#include "ck_tile/builder/testing/conv_fwd.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp"
#include <type_traits>
#include <array>
Expand Down Expand Up @@ -87,16 +88,19 @@ concept CkConvInstance = detail::CkConvInstance<Conv, SIGNATURE>;
/// @brief `run()` specialization for forward convolution and old CK.
///
/// @tparam SIGNATURE Forward convolution signature.
/// @throws std::runtime_error if the arguments werent actually valid for the
/// @throws std::runtime_error if the arguments weren't actually valid for the
/// operation. This should be caught and reported by the testing framework.
/// @return std::tuple<bool, float> - whether the problem is supported and
/// kernel execution time (0.0f if s_conf time_kernel is false).
///
/// @see run()
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
void run(CkConvInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs)
std::tuple<bool, float> run(CkConvInstance<SIGNATURE> auto& conv,
const Args<SIGNATURE>& args,
const Inputs<SIGNATURE>& inputs,
const Outputs<SIGNATURE>& outputs,
const StreamConfig s_conf = {})
{
constexpr auto spatial_dim = SIGNATURE.spatial_dim;

Expand Down Expand Up @@ -144,10 +148,10 @@ void run(CkConvInstance<SIGNATURE> auto& conv,

if(!conv.IsSupportedArgument(ck_args))
{
throw std::runtime_error("invalid argument");
std::cout << "invalid argument" << std::endl;
}

conv.MakeInvoker().Run(ck_args, {});
return std::make_tuple(true, conv.MakeInvoker().Run(ck_args, s_conf));
}

} // namespace ck_tile::builder::test
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#pragma once

#include "ck_tile/builder/testing/conv_fwd.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp"
#include "ck_tile/ops/gemm.hpp"
#include "ck_tile/ops/grouped_convolution.hpp"
#include <type_traits>
#include <array>

/// This file contains the implementation details for invoking/testing
/// grouped convolution operations in CK Tile. The main item is the
/// `run()` function, which is the main implementation used to invoke
/// CK Tile grouped forward convolution kernels.

namespace ck_tile::builder::test {

namespace detail {

/// @brief Concept for checking whether this is the CK Tile convolution
/// implementation.
///
/// This is the same as `::ck_tile::builder::test::CkConvInstance`, except
/// with some utility aliases. For that reason, it's moved to this detail
/// namespace.
// A type is considered a CK Tile convolution instance if it exposes a
// static BlockSize() function (the CK Tile kernel launch interface).
template <typename Conv, auto SIGNATURE>
concept CkTileConvInstance =
    requires { Conv::BlockSize(); };

} // namespace detail

/// @brief Concept for checking whether a convolution is invoked like CK Tile.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note: we no longer need the doxygen comments. AMD is not running this code through doxygen for the CK / CK Tile API. Keep the file comments, but you don't need the triple slashes or the doxygen tags.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even if we don't use doxygen, I think it looks nice and it is commonly used in other headers. Let me know if you prefer to change it; if not, I would like to keep it.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Eventually we'll probably strip out the doxygen tags, but I'm fine with it either way now. I'm just letting you know you don't have to use them.

///
/// This concept is used to tell whether a convolution implementation is
/// likely to be a "CK Tile" implementation - that is, whether we should
/// invoke it as a CK Tile kernel. This is mainly used with `run()` to
/// differentiate which implementation that should be invoked.
///
/// - SIGNATURE is the operation signature.
/// - Conv is a convolution instance created by the CK Builder API.
// Public-facing alias of the detail concept; used by run() overload resolution.
template <typename ConvKernel, auto SIG>
concept CkTileConvInstance = detail::CkTileConvInstance<ConvKernel, SIG>;

/// @brief `run()` specialization for forward convolution and CK Tile.
///
/// @tparam SIGNATURE Forward convolution signature.
/// @throws std::runtime_error if the arguments weren't actually valid for the
/// operation. This should be caught and reported by the testing framework.
/// @return std::tuple<bool, float> - whether the problem is supported and
/// kernel execution time (0.0f if s_conf time_kernel is false).
///
/// @see run()
template <auto SIGNATURE>
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
std::tuple<bool, float> run(CkTileConvInstance<SIGNATURE> auto& conv,
                            const Args<SIGNATURE>& args,
                            const Inputs<SIGNATURE>& inputs,
                            const Outputs<SIGNATURE>& outputs,
                            const ck_tile::stream_config s_conf = {})
{
    using Conv = std::remove_reference_t<decltype(conv)>;

    // Translate the builder-style Args into the CK Tile host-side
    // convolution description, then into the kernel's argument struct.
    const auto param = args.to_ck_tile_conv_param();

    ck_tile::GroupedConvFwdHostArgs<> host_args(
        param, inputs.input, inputs.weight, {}, outputs.output, args.k_batch);

    auto kargs = Conv::MakeKernelArgs(host_args);

    const dim3 grids  = Conv::GridSize(kargs);
    const dim3 blocks = Conv::BlockSize();

    // Report "unsupported" instead of throwing so the caller can record the
    // instance as not applicable to this problem. Use std::endl for
    // consistency with the old-CK run() diagnostics (newline + flush).
    if(!Conv::IsSupportedArgument(kargs))
    {
        std::cout << "Not supported!" << std::endl;
        return std::make_tuple(false, 0.f);
    }

    // Occupancy hint for the launch: presumably Intrawave scheduling needs
    // more resources per block, hence the lower minimum occupancy — confirm
    // against the pipeline implementation.
    constexpr index_t minimum_occupancy =
        Conv::GemmPipeline::Scheduler == ck_tile::GemmPipelineScheduler::Intrawave ? 1 : 2;

    // Launch with zero dynamic LDS; returns the measured kernel time
    // (0.0f when s_conf does not request timing).
    return std::make_tuple(
        true,
        ck_tile::launch_kernel(
            s_conf, ck_tile::make_kernel<minimum_occupancy>(conv, grids, blocks, 0, kargs)));
}

} // namespace ck_tile::builder::test
Loading