diff --git a/script/tools/ck-rocprof b/script/tools/ck-rocprof
new file mode 100755
index 00000000000..cf72e3cf9a0
--- /dev/null
+++ b/script/tools/ck-rocprof
@@ -0,0 +1,441 @@
+#!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# CK ROCProf Tool - Profile CK applications with rocprof-compute
+
+set -e
+set -o pipefail
+
+# Find script directory and load common utilities
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/common.sh"
+
+# Initialize configuration
+PROJECT_ROOT=$(get_project_root "${SCRIPT_DIR}")
+CONTAINER_NAME=$(get_container_name "${PROJECT_ROOT}")
+
+# Profiling configuration
+VENV_PATH="${CK_PROFILE_VENV:-/opt/rocprof_venv}"
+ROCPROF_BIN="${CK_ROCPROF_BIN:-/opt/rocm-7.0.1/bin/rocprof-compute}"
+WORKLOAD_DIR="${CK_WORKLOAD_DIR:-/workspace/workloads}"
+ROCM_REQUIREMENTS="/opt/rocm-7.0.1/libexec/rocprofiler-compute/requirements.txt"
+
+# Help message
+# Prints the usage text to stdout. The defaults quoted here mirror the
+# fallback values in the "Profiling configuration" section of this script.
+show_help() {
+    cat << EOF
+CK ROCProf Tool - Profile CK applications with rocprof-compute
+
+Usage: ck-rocprof <command> [options]
+
+Commands:
+  setup                           One-time setup: create Python venv and install dependencies
+  run <name> <executable> [args]  Profile executable and save results as <name>
+  analyze <name> [block]          Analyze profiling results (default: block 12 - LDS metrics)
+  compare <name1> <name2>         Compare two profiling runs
+  list                            List available profiling runs
+  help                            Show this help message
+
+Examples:
+  ck-rocprof setup
+  ck-rocprof run baseline ./bin/tile_example_gemm_universal
+  ck-rocprof analyze baseline
+  ck-rocprof analyze baseline 12
+  ck-rocprof compare baseline optimized
+  ck-rocprof list
+
+Environment Variables:
+  CK_PROFILE_VENV    - Python venv path (default: /opt/rocprof_venv)
+  CK_ROCPROF_BIN     - rocprof-compute binary path (default: /opt/rocm-7.0.1/bin/rocprof-compute)
+  CK_WORKLOAD_DIR    - Workload storage directory (default: /workspace/workloads)
+  CK_CONTAINER_NAME  - Docker container name (for remote profiling)
+  GPU_TARGET         - Override GPU architecture detection (e.g. gfx950)
+
+Profiling Metrics:
+  Block 12 (LDS - Local Data Share):
+    - 12.1.3: Bank Conflict Rate (% of peak)
+    - 12.2.9: Bank Conflicts/Access (conflicts/access)
+    - 12.2.12: Bank Conflict (cycles per kernel)
+    - 12.2.17: LDS Data FIFO Full Rate (cycles)
+
+Notes:
+  - All commands run inside Docker container (auto-started if needed)
+  - Results stored in /workspace/workloads/<name>/<gpu_arch>/
+  - Use 'analyze' to view detailed metrics for specific blocks
+  - Use 'compare' to see side-by-side metrics from two runs
+EOF
+}
+
+# Print the location of the rocprof-compute wrapper script.
+# The wrapper lives in the bin/ directory of the venv at ${VENV_PATH}.
+get_rocprof_wrapper() {
+    local wrapper_path="${VENV_PATH}/bin/rocprof-compute"
+    echo "${wrapper_path}"
+}
+
+# Check if setup is complete (inside container)
+is_setup_complete() {
+    local wrapper=$(get_rocprof_wrapper)
+    # Check if container is running and setup is complete inside it
+    container_is_running "${CONTAINER_NAME}" && \
+        docker exec "${CONTAINER_NAME}" test -d "${VENV_PATH}" 2>/dev/null && \
+        docker exec "${CONTAINER_NAME}" test -f "${wrapper}" 2>/dev/null
+}
+
+# Setup: Create Python venv and install rocprof-compute dependencies in Docker container
+cmd_setup() {
+    echo "Setting up rocprof-compute profiling environment..."
+    echo "==========================================="
+
+    # Ensure Docker container is running
+    echo "Ensuring Docker container '${CONTAINER_NAME}' is running..."
+    ensure_container_running "${CONTAINER_NAME}" "${SCRIPT_DIR}"
+
+    # Check if rocprofiler-compute is installed, install if not
+    echo "Checking for rocprofiler-compute package..."
+    if ! docker exec "${CONTAINER_NAME}" test -f "${ROCPROF_BIN}" 2>/dev/null; then
+        echo "Installing rocprofiler-compute package..."
+        docker exec "${CONTAINER_NAME}" bash -c "apt update && apt install -y rocprofiler-compute" 2>&1 | tail -5
+        echo "* rocprofiler-compute installed"
+    else
+        echo "* rocprofiler-compute already installed"
+    fi
+
+    # Verify rocprof-compute exists after installation
+    if ! docker exec "${CONTAINER_NAME}" test -f "${ROCPROF_BIN}" 2>/dev/null; then
+        echo "Error: rocprof-compute not found at ${ROCPROF_BIN} after installation"
+        echo "Please check ROCm installation in container"
+        return 1
+    fi
+
+    # Check if requirements file exists
+    if ! docker exec "${CONTAINER_NAME}" test -f "${ROCM_REQUIREMENTS}" 2>/dev/null; then
+        echo "Error: rocprofiler-compute requirements.txt not found at ${ROCM_REQUIREMENTS}"
+        echo "Please ensure ROCm is properly installed in container"
+        return 1
+    fi
+
+    # Install uv if not present
+    echo "Checking for uv package manager..."
+    if ! docker exec "${CONTAINER_NAME}" command -v uv &>/dev/null; then
+        echo "Installing uv package manager..."
+        docker exec "${CONTAINER_NAME}" bash -c "curl -LsSf https://astral.sh/uv/install.sh | sh" 2>&1 | tail -3
+        echo "* uv installed"
+    else
+        echo "* uv already installed"
+    fi
+
+    # Create Python venv in container using uv
+    if docker exec "${CONTAINER_NAME}" test -d "${VENV_PATH}" 2>/dev/null; then
+        echo "Python venv already exists at ${VENV_PATH}"
+    else
+        echo "Creating Python virtual environment at ${VENV_PATH}..."
+        docker exec "${CONTAINER_NAME}" bash -c "source ~/.local/bin/env && uv venv '${VENV_PATH}'"
+        echo "* Virtual environment created"
+    fi
+
+    # Install dependencies in container using uv
+    echo "Installing rocprofiler-compute dependencies..."
+    docker exec "${CONTAINER_NAME}" bash -c "source ~/.local/bin/env && uv pip install --python '${VENV_PATH}/bin/python' -r '${ROCM_REQUIREMENTS}'"
+    # Pin pandas to <3.0 to avoid CSV conversion bug in rocprof-compute
+    docker exec "${CONTAINER_NAME}" bash -c "source ~/.local/bin/env && uv pip install --python '${VENV_PATH}/bin/python' 'pandas<3.0'"
+    echo "* Dependencies installed"
+
+    # Create wrapper script in container
+    local wrapper=$(get_rocprof_wrapper)
+    docker exec "${CONTAINER_NAME}" bash -c "cat > '${wrapper}' << 'WRAPPER_EOF'
+#!/bin/bash
+# rocprof-compute wrapper using venv Python
+VENV_DIR=\"\$(cd \"\$(dirname \"\$0\")/..\" && pwd)\"
+exec \"\${VENV_DIR}/bin/python\" /opt/rocm-7.0.1/bin/rocprof-compute \"\$@\"
+WRAPPER_EOF"
+    docker exec "${CONTAINER_NAME}" chmod +x "${wrapper}"
+    echo "* Wrapper script created at ${wrapper}"
+
+    echo ""
+    echo "Setup complete! You can now use:"
+    echo "  ck-rocprof run <name> <executable>"
+}
+
+# Guess the GPU architecture label used for result directories.
+# rocprof-compute names those directories after marketing names (MI350,
+# MI300X, ...), so the probes below try to reproduce that convention.
+# Resolution order: $GPU_TARGET, marketing name from rocminfo, gfx name from
+# rocminfo, an existing gfx*/MI* directory under the workload dir, "MI350".
+# Outputs: the chosen label on stdout.
+detect_gpu_arch() {
+    # An explicit override from the environment always wins.
+    if [ -n "${GPU_TARGET:-}" ]; then
+        echo "${GPU_TARGET}"
+        return 0
+    fi
+
+    # rocminfo-based probes, run inside the container in priority order.
+    local -a probes=(
+        "rocminfo 2>/dev/null | grep 'Marketing Name:' | grep -oP 'MI\d+[A-Z]*' | head -1"
+        "rocminfo 2>/dev/null | grep -oP 'gfx[0-9a-z]+' | head -1"
+    )
+    local probe found
+    for probe in "${probes[@]}"; do
+        found=$(docker exec "${CONTAINER_NAME}" bash -c "${probe}" 2>/dev/null || echo "")
+        if [ -n "$found" ]; then
+            echo "$found"
+            return 0
+        fi
+    done
+
+    # No usable rocminfo output: reuse the name of an existing result directory.
+    found=$(docker exec "${CONTAINER_NAME}" bash -c "find '${WORKLOAD_DIR}' -maxdepth 2 -type d \( -name 'gfx*' -o -name 'MI*' \) 2>/dev/null | head -1" 2>/dev/null || echo "")
+    if [ -n "$found" ]; then
+        basename "$found"
+        return 0
+    fi
+
+    # Nothing detected at all; fall back to a fixed label.
+    echo "MI350"
+}
+
+# Run profiling (inside container)
+# Arguments: $1  - run name under which rocprof-compute stores the results
+#            $2  - executable to profile (path as seen inside the container)
+#            $3+ - arguments forwarded unchanged to the executable
+# Returns:   1 on missing arguments, incomplete setup or missing executable;
+#            a failing profiler run aborts the script through 'set -e'.
+cmd_run() {
+    # Validate before shifting: 'shift 2' fails when fewer than two arguments
+    # are present, and under 'set -e' that would end the script without
+    # printing any usage hint.
+    if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
+        echo "Error: name and executable required"
+        echo "Usage: ck-rocprof run <name> <executable> [args]"
+        return 1
+    fi
+
+    local name="$1"
+    local executable="$2"
+    shift 2
+    local -a exe_args=("$@")
+
+    # Check setup
+    if ! is_setup_complete; then
+        echo "Error: Profiling environment not set up"
+        echo "Run: ck-rocprof setup"
+        return 1
+    fi
+
+    # Check if executable exists in container
+    if ! docker exec "${CONTAINER_NAME}" test -f "$executable" 2>/dev/null; then
+        echo "Error: Executable not found in container: $executable"
+        return 1
+    fi
+
+    local wrapper gpu_arch
+    wrapper=$(get_rocprof_wrapper)
+    gpu_arch=$(detect_gpu_arch)
+
+    echo "Profiling: $executable ${exe_args[*]}"
+    echo "Run name: $name"
+    echo "GPU arch: $gpu_arch"
+    echo "Container: $CONTAINER_NAME"
+    echo "==========================================="
+
+    # Run profiling in container
+    # --no-roof skips roofline analysis to speed up profiling
+    # The argument vector is handed to 'docker exec' directly instead of being
+    # flattened into a 'bash -c' string, so names, paths and arguments that
+    # contain spaces or shell metacharacters arrive intact.
+    docker exec "${CONTAINER_NAME}" "${wrapper}" profile --no-roof --name "${name}" -- "${executable}" "${exe_args[@]}"
+
+    echo ""
+    echo "* Profiling complete"
+    echo "Results saved to: ${WORKLOAD_DIR}/${name}/${gpu_arch}/"
+    echo ""
+    echo "Analyze with: ck-rocprof analyze ${name}"
+}
+
+# Locate the result directory of a profiling run (inside container).
+# rocprof-compute names the per-GPU directory in different ways (MI350,
+# gfx950, ...), so whatever subdirectory actually exists is used instead of
+# a computed name.
+# Arguments: $1 - run name
+# Outputs:   path of the first subdirectory of ${WORKLOAD_DIR}/<name>
+# Returns:   1 when the run directory is missing or has no subdirectory
+find_workload_path() {
+    local run_name="$1"
+    local base_dir="${WORKLOAD_DIR}/${run_name}"
+
+    docker exec "${CONTAINER_NAME}" test -d "$base_dir" 2>/dev/null || return 1
+
+    # First entry reported by find; expected to be the GPU architecture directory.
+    local arch_dir
+    arch_dir=$(docker exec "${CONTAINER_NAME}" bash -c "find '$base_dir' -maxdepth 1 -mindepth 1 -type d 2>/dev/null | head -1" 2>/dev/null) || true
+
+    [ -n "$arch_dir" ] || return 1
+    echo "$arch_dir"
+}
+
+# Show the metrics of one stored profiling run (inside container).
+# Arguments: $1 - run name
+#            $2 - metric block to display (optional, 12 = LDS when omitted)
+# Returns:   1 on missing name, incomplete setup or unknown run; otherwise
+#            the exit status of 'rocprof-compute analyze'.
+cmd_analyze() {
+    local run_name="$1"
+    local block_id="${2:-12}"
+
+    if [ -z "$run_name" ]; then
+        echo "Error: name required"
+        echo "Usage: ck-rocprof analyze <name> [block]"
+        return 1
+    fi
+
+    # Refuse to continue until 'setup' has been run.
+    if ! is_setup_complete; then
+        echo "Error: Profiling environment not set up"
+        echo "Run: ck-rocprof setup"
+        return 1
+    fi
+
+    # An unknown run leaves results_dir empty; the lookup status itself is ignored.
+    local results_dir
+    results_dir=$(find_workload_path "${run_name}") || true
+    if [ -z "$results_dir" ]; then
+        echo "Error: Profiling results not found for '${run_name}'"
+        echo ""
+        echo "Available runs:"
+        cmd_list
+        return 1
+    fi
+
+    # The directory name doubles as the GPU label in the banner.
+    echo "Analyzing: ${run_name} (GPU: $(basename "$results_dir"), Block ${block_id})"
+    echo "==========================================="
+    echo ""
+
+    docker exec "${CONTAINER_NAME}" "$(get_rocprof_wrapper)" analyze --path "${results_dir}" --block "${block_id}"
+}
+
+# Extract key LDS metrics from profiling results (inside container)
+# Currently a placeholder: it only reports where the raw LDS CSV lives.
+# Arguments: $1 - run name
+# Outputs:   "Raw data available at: <csv>" or "N/A (CSV not found)" on stdout
+# Returns:   1 when the CSV file does not exist in the container
+# NOTE(review): no caller is visible in this file.
+# NOTE(review): the directory is built from detect_gpu_arch, which may fall
+# back to a guessed label, while the other commands use find_workload_path;
+# the two can disagree - confirm before relying on this helper.
+extract_lds_metrics() {
+    local name="$1"
+    local gpu_arch=$(detect_gpu_arch)
+    local workload_path="${WORKLOAD_DIR}/${name}/${gpu_arch}"
+    local csv_file="${workload_path}/SQ_INST_LEVEL_LDS.csv"
+
+    # The file lives inside the container, so it is probed through docker exec.
+    if ! docker exec "${CONTAINER_NAME}" test -f "$csv_file" 2>/dev/null; then
+        echo "N/A (CSV not found)"
+        return 1
+    fi
+
+    # Extract key metrics (simplified - would need proper CSV parsing)
+    # For now, just indicate that raw data is available
+    echo "Raw data available at: ${csv_file}"
+}
+
+# Compare two profiling runs
+cmd_compare() {
+    local name1="$1"
+    local name2="$2"
+
+    if [ -z "$name1" ] || [ -z "$name2" ]; then
+        echo "Error: two run names required"
+        echo "Usage: ck-rocprof compare <name1> <name2>"
+        return 1
+    fi
+
+    # Check setup
+    if ! is_setup_complete; then
+        echo "Error: Profiling environment not set up"
+        echo "Run: ck-rocprof setup"
+        return 1
+    fi
+
+    # Verify both runs exist
+    local path1=$(find_workload_path "${name1}")
+    local path2=$(find_workload_path "${name2}")
+
+    if [ -z "$path1" ]; then
+        echo "Error: Profiling results not found for '${name1}'"
+        return 1
+    fi
+
+    if [ -z "$path2" ]; then
+        echo "Error: Profiling results not found for '${name2}'"
+        return 1
+    fi
+
+    local gpu1=$(basename "$path1")
+    local gpu2=$(basename "$path2")
+
+    echo "Comparing profiling runs:"
+    echo "  Baseline:  ${name1} (${gpu1})"
+    echo "  Optimized: ${name2} (${gpu2})"
+    echo "==========================================="
+    echo ""
+
+    echo "=== ${name1} - Block 12 (LDS) ==="
+    cmd_analyze "${name1}" 12 2>/dev/null | head -40
+
+    echo ""
+    echo "=== ${name2} - Block 12 (LDS) ==="
+    cmd_analyze "${name2}" 12 2>/dev/null | head -40
+
+    echo ""
+    echo "==========================================="
+    echo "For detailed analysis, run:"
+    echo "  ck-rocprof analyze ${name1} 12"
+    echo "  ck-rocprof analyze ${name2} 12"
+}
+
+# Print a table of the stored profiling runs (inside container).
+# Each run is shown with its GPU directory name, disk usage and modification
+# date; runs without a result subdirectory are shown as "[no data]".
+# Returns: 0, also when nothing has been profiled yet.
+cmd_list() {
+    docker exec "${CONTAINER_NAME}" test -d "${WORKLOAD_DIR}" 2>/dev/null || {
+        echo "No profiling runs found (workload directory doesn't exist)"
+        return 0
+    }
+
+    # One run per line, sorted by name.
+    local run_names
+    run_names=$(docker exec "${CONTAINER_NAME}" bash -c "find '${WORKLOAD_DIR}' -maxdepth 1 -mindepth 1 -type d -exec basename {} \; 2>/dev/null | sort" 2>/dev/null) || true
+
+    if [ -z "$run_names" ]; then
+        echo "No profiling runs found in ${WORKLOAD_DIR}"
+        return 0
+    fi
+
+    echo "Available profiling runs:"
+    echo "==========================================="
+
+    local entry arch_dir arch_name disk_usage modified
+    while IFS= read -r entry; do
+        arch_dir=$(find_workload_path "$entry") || true
+
+        # A run directory without a GPU subdirectory holds no results.
+        if [ -z "$arch_dir" ]; then
+            printf "  %-25s [no data]\n" "$entry"
+            continue
+        fi
+
+        arch_name=$(basename "$arch_dir")
+        disk_usage=$(docker exec "${CONTAINER_NAME}" bash -c "du -sh '$arch_dir' 2>/dev/null | cut -f1" 2>/dev/null) || true
+        modified=$(docker exec "${CONTAINER_NAME}" bash -c "stat -c %y '$arch_dir' 2>/dev/null | cut -d' ' -f1" 2>/dev/null) || true
+        printf "  %-25s [%-8s, %s, %s]\n" "$entry" "$arch_name" "$disk_usage" "$modified"
+    done <<< "$run_names"
+
+    echo ""
+    echo "Analyze with: ck-rocprof analyze <name>"
+}
+
+# Entry point: route the first command-line word to its handler.
+# An empty command line behaves like 'help'; an unrecognised word prints the
+# help text and exits with status 1.
+subcommand="${1:-}"
+case "${subcommand}" in
+    setup)
+        cmd_setup
+        ;;
+    list)
+        cmd_list
+        ;;
+    run | analyze | compare)
+        # Drop the subcommand itself and hand the remaining words to its handler.
+        shift
+        "cmd_${subcommand}" "$@"
+        ;;
+    help | --help | -h | "")
+        show_help
+        ;;
+    *)
+        echo "Unknown command: ${subcommand}"
+        echo ""
+        show_help
+        exit 1
+        ;;
+esac
diff --git a/script/tools/ck-rocprof.md b/script/tools/ck-rocprof.md
new file mode 100644
index 00000000000..b9dc858dfaf
--- /dev/null
+++ b/script/tools/ck-rocprof.md
@@ -0,0 +1,157 @@
+# CK ROCProf Tool
+
+GPU performance profiling for Composable Kernel applications using AMD rocprof-compute.
+
+## Quick Start
+
+```bash
+# One-time setup
+./script/tools/ck-rocprof setup
+
+# Profile executable
+cd build
+../script/tools/ck-rocprof run baseline ./bin/tile_example_gemm_universal
+
+# Analyze LDS metrics
+../script/tools/ck-rocprof analyze baseline
+
+# Compare optimizations
+../script/tools/ck-rocprof run optimized ./bin/tile_example_gemm_universal
+../script/tools/ck-rocprof compare baseline optimized
+```
+
+## Commands
+
+### `setup`
+One-time setup: creates Python venv, installs dependencies, configures rocprof-compute.
+
+### `run <name> <executable> [args]`
+Profile executable and save results.
+
+```bash
+# Basic profiling
+ck-rocprof run baseline ./bin/gemm_example
+
+# With arguments
+ck-rocprof run large_matrix ./bin/gemm_example -m 8192 -n 8192 -k 4096
+
+# Test filtering
+ck-rocprof run unit_test ./bin/test_gemm --gtest_filter="*Fp16*"
+```
+
+### `analyze <name> [block]`
+Display profiling metrics (default: Block 12 - LDS).
+
+```bash
+ck-rocprof analyze baseline        # LDS metrics
+ck-rocprof analyze baseline 2      # L2 Cache
+ck-rocprof analyze baseline 7      # Instruction Mix
+```
+
+### `compare <name1> <name2>`
+Side-by-side comparison of two runs.
+
+### `list`
+List all profiling runs with size and date.
+
+## Key LDS Metrics (Block 12)
+
+**Target Values:**
+- Bank Conflicts/Access: <0.01 (1% conflict rate)
+- Bank Conflict Rate: >90% of peak bandwidth
+
+**Critical Metrics:**
+- **12.2.9 Bank Conflicts/Access**: Direct conflict measure
+  - Baseline (naive): ~0.04 (4% conflicts)
+  - Optimized: <0.005 (<0.5% conflicts)
+- **12.2.12 Bank Conflict Cycles**: Wasted cycles per kernel
+- **12.2.17 LDS Data FIFO Full**: Memory system pressure
+
+## Optimization Workflow
+
+```bash
+# 1. Baseline
+ck-rocprof run baseline ./bin/my_kernel
+
+# 2. Check conflicts
+ck-rocprof analyze baseline
+# Look for Bank Conflicts/Access > 0.02
+
+# 3. Optimize code (XOR transforms, padding, etc.)
+# ... edit source ...
+
+# 4. Test optimization
+ninja my_kernel
+ck-rocprof run optimized ./bin/my_kernel
+
+# 5. Verify improvement
+ck-rocprof compare baseline optimized
+# Target: 8-10x reduction in conflicts
+```
+
+## Environment Variables
+
+- `CK_PROFILE_VENV`: Python venv path (default: `/opt/rocprof_venv`)
+- `CK_ROCPROF_BIN`: rocprof-compute binary path
+- `CK_WORKLOAD_DIR`: Results directory (default: `/workspace/workloads`)
+- `GPU_TARGET`: Override GPU detection (e.g., `gfx950`, `gfx942`)
+
+## Interpreting Results
+
+**Good Performance:**
+```
+Bank Conflicts/Access: <0.01
+Bank Conflict Rate: >90% of peak
+LDS Data FIFO Full: Minimal cycles
+```
+
+**Needs Optimization:**
+```
+Bank Conflicts/Access: >0.02
+Bank Conflict Cycles: High MAX values
+LDS Data FIFO Full: High memory pressure
+```
+
+## Troubleshooting
+
+**"Profiling environment not set up"**
+```bash
+ck-rocprof setup
+```
+
+**"rocprof-compute not found"**
+```bash
+export CK_ROCPROF_BIN=/custom/path/rocprof-compute
+ck-rocprof setup
+```
+
+**"Profiling results not found"**
+```bash
+ck-rocprof list                    # Check available runs
+rocminfo | grep gfx               # Verify GPU arch
+export GPU_TARGET=gfx950          # Override if needed
+```
+
+## Storage Layout
+
+Results stored in `/workspace/workloads/<name>/<gpu_arch>/` (or under `CK_WORKLOAD_DIR` if set):
+- `SQ_INST_LEVEL_LDS.csv`: LDS metrics
+- `pmc_perf.csv`: Performance counters
+- `counter_collection.csv`: All metrics
+
+## Technical Details
+
+- **Setup**: Creates isolated Python venv, installs dependencies
+- **Profiling**: Runs `rocprof-compute profile --no-roof --name <name> -- <executable>` (roofline analysis is skipped to speed up profiling)
+- **Analysis**: Runs `rocprof-compute analyze --path <path> --block <block>`
+- **GPU Support**: MI300/MI350 series, auto-detects architecture
+
+## Related Tools
+
+- `ck-docker`: Container management
+- `rocprof-compute`: AMD GPU profiler v2
+- `rocm-smi`: System monitoring
+
+## License
+
+Copyright (c) Advanced Micro Devices, Inc. SPDX-License-Identifier: MIT