diff --git a/script/tools/ck-rocprof b/script/tools/ck-rocprof
new file mode 100755
index 00000000000..cf72e3cf9a0
--- /dev/null
+++ b/script/tools/ck-rocprof
@@ -0,0 +1,503 @@
+#!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
+# CK ROCProf Tool - Profile CK applications with rocprof-compute
+
+set -e
+set -o pipefail
+
+# Find script directory and load common utilities
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "${SCRIPT_DIR}/common.sh"
+
+# Initialize configuration
+PROJECT_ROOT=$(get_project_root "${SCRIPT_DIR}")
+CONTAINER_NAME=$(get_container_name "${PROJECT_ROOT}")
+
+# Profiling configuration
+VENV_PATH="${CK_PROFILE_VENV:-/opt/rocprof_venv}"
+ROCPROF_BIN="${CK_ROCPROF_BIN:-/opt/rocm-7.0.1/bin/rocprof-compute}"
+WORKLOAD_DIR="${CK_WORKLOAD_DIR:-/workspace/workloads}"
+ROCM_REQUIREMENTS="/opt/rocm-7.0.1/libexec/rocprofiler-compute/requirements.txt"
+
+# Help message
+show_help() {
+    cat << EOF
+CK ROCProf Tool - Profile CK applications with rocprof-compute
+
+Usage: ck-rocprof <command> [options]
+
+Commands:
+    setup                            One-time setup: create Python venv and install dependencies
+    run <name> <executable> [args]   Profile executable and save results as <name>
+    analyze <name> [block]           Analyze profiling results (default: block 12 - LDS metrics)
+    compare <name1> <name2>          Compare two profiling runs
+    list                             List available profiling runs
+    help                             Show this help message
+
+Examples:
+    ck-rocprof setup
+    ck-rocprof run baseline ./bin/tile_example_gemm_universal
+    ck-rocprof analyze baseline
+    ck-rocprof analyze baseline 12
+    ck-rocprof compare baseline optimized
+    ck-rocprof list
+
+Environment Variables:
+    CK_PROFILE_VENV    - Python venv path (default: /opt/rocprof_venv)
+    CK_ROCPROF_BIN     - rocprof-compute binary path (default: /opt/rocm-7.0.1/bin/rocprof-compute)
+    CK_WORKLOAD_DIR    - Workload storage directory (default: /workspace/workloads)
+    CK_CONTAINER_NAME  - Docker container name (for remote profiling)
+    GPU_TARGET         - Override GPU architecture detection (e.g. gfx950, gfx942)
+
+Profiling Metrics:
+    Block 12 (LDS - Local Data Share):
+        - 12.1.3:  Bank Conflict Rate (% of peak)
+        - 12.2.9:  Bank Conflicts/Access (conflicts/access)
+        - 12.2.12: Bank Conflict (cycles per kernel)
+        - 12.2.17: LDS Data FIFO Full Rate (cycles)
+
+Notes:
+    - All commands run inside Docker container (auto-started if needed)
+    - Results stored in /workspace/workloads/<name>/<gpu_arch>/
+    - Use 'analyze' to view detailed metrics for specific blocks
+    - Use 'compare' to see side-by-side metrics from two runs
+EOF
+}
+
+# Print a message to stderr (used for errors so they never pollute captured stdout)
+ck_rocprof_error() {
+    printf '%s\n' "$*" >&2
+}
+
+# Get rocprof-compute wrapper path
+get_rocprof_wrapper() {
+    echo "${VENV_PATH}/bin/rocprof-compute"
+}
+
+# Check if setup is complete (inside container)
+is_setup_complete() {
+    local wrapper
+    wrapper=$(get_rocprof_wrapper)
+    # Check if container is running and setup is complete inside it
+    container_is_running "${CONTAINER_NAME}" \
+        && docker exec "${CONTAINER_NAME}" test -d "${VENV_PATH}" 2>/dev/null \
+        && docker exec "${CONTAINER_NAME}" test -f "${wrapper}" 2>/dev/null
+}
+
+# Start the container if needed and verify that 'setup' has been run.
+# Prints guidance to stderr and returns 1 when the environment is not usable.
+require_setup() {
+    ensure_container_running "${CONTAINER_NAME}" "${SCRIPT_DIR}" >&2 || return 1
+    if ! is_setup_complete; then
+        ck_rocprof_error "Error: Profiling environment not set up"
+        ck_rocprof_error "Run: ck-rocprof setup"
+        return 1
+    fi
+}
+
+# Run uv inside the container with the given arguments.
+# ~/.local/bin/env (written by the uv installer) is sourced when present so uv
+# is found even though 'docker exec' does not start a login shell.
+container_uv() {
+    docker exec "${CONTAINER_NAME}" bash -c \
+        'if [ -f "$HOME/.local/bin/env" ]; then . "$HOME/.local/bin/env"; fi; exec uv "$@"' \
+        uv "$@"
+}
+
+# Setup: Create Python venv and install rocprof-compute dependencies in Docker container
+cmd_setup() {
+    echo "Setting up rocprof-compute profiling environment..."
+    echo "==========================================="
+
+    # Ensure Docker container is running
+    echo "Ensuring Docker container '${CONTAINER_NAME}' is running..."
+    ensure_container_running "${CONTAINER_NAME}" "${SCRIPT_DIR}"
+
+    # Check if rocprofiler-compute is installed, install if not
+    echo "Checking for rocprofiler-compute package..."
+    if docker exec "${CONTAINER_NAME}" test -f "${ROCPROF_BIN}" 2>/dev/null; then
+        echo "* rocprofiler-compute already installed"
+    else
+        echo "Installing rocprofiler-compute package..."
+        # apt-get (not apt) is the interface meant for scripts
+        if ! docker exec "${CONTAINER_NAME}" bash -c \
+            "apt-get update && apt-get install -y rocprofiler-compute" 2>&1 | tail -5; then
+            ck_rocprof_error "Error: failed to install rocprofiler-compute"
+            return 1
+        fi
+        # Verify rocprof-compute exists after installation
+        if ! docker exec "${CONTAINER_NAME}" test -f "${ROCPROF_BIN}" 2>/dev/null; then
+            ck_rocprof_error "Error: rocprof-compute not found at ${ROCPROF_BIN} after installation"
+            ck_rocprof_error "Please check ROCm installation in container"
+            return 1
+        fi
+        echo "* rocprofiler-compute installed"
+    fi
+
+    # Check if requirements file exists
+    if ! docker exec "${CONTAINER_NAME}" test -f "${ROCM_REQUIREMENTS}" 2>/dev/null; then
+        ck_rocprof_error "Error: rocprofiler-compute requirements.txt not found at ${ROCM_REQUIREMENTS}"
+        ck_rocprof_error "Please ensure ROCm is properly installed in container"
+        return 1
+    fi
+
+    # Install uv if not present
+    echo "Checking for uv package manager..."
+    if container_uv --version >/dev/null 2>&1; then
+        echo "* uv already installed"
+    else
+        echo "Installing uv package manager..."
+        # pipefail inside the container shell so a failed download is not masked by 'sh'
+        if ! docker exec "${CONTAINER_NAME}" bash -c \
+            "set -o pipefail; curl -LsSf https://astral.sh/uv/install.sh | sh" 2>&1 | tail -3; then
+            ck_rocprof_error "Error: failed to install uv"
+            return 1
+        fi
+        echo "* uv installed"
+    fi
+
+    # Create Python venv in container using uv
+    if docker exec "${CONTAINER_NAME}" test -d "${VENV_PATH}" 2>/dev/null; then
+        echo "Python venv already exists at ${VENV_PATH}"
+    else
+        echo "Creating Python virtual environment at ${VENV_PATH}..."
+        container_uv venv "${VENV_PATH}"
+        echo "* Virtual environment created"
+    fi
+
+    # Install dependencies in container using uv
+    echo "Installing rocprofiler-compute dependencies..."
+    container_uv pip install --python "${VENV_PATH}/bin/python" -r "${ROCM_REQUIREMENTS}"
+    # Pin pandas to <3.0 to avoid CSV conversion bug in rocprof-compute
+    container_uv pip install --python "${VENV_PATH}/bin/python" 'pandas<3.0'
+    echo "* Dependencies installed"
+
+    # Create wrapper script in container. The path is passed as a positional
+    # argument and the script body on stdin, so no host-side value is re-parsed
+    # by the container shell. The wrapper honours CK_ROCPROF_BIN via ROCPROF_BIN.
+    local wrapper
+    wrapper=$(get_rocprof_wrapper)
+    docker exec -i "${CONTAINER_NAME}" bash -c 'cat > "$1" && chmod +x "$1"' _ "${wrapper}" << WRAPPER_EOF
+#!/bin/bash
+# rocprof-compute wrapper using venv Python
+VENV_DIR="\$(cd "\$(dirname "\$0")/.." && pwd)"
+exec "\${VENV_DIR}/bin/python" $(printf '%q' "${ROCPROF_BIN}") "\$@"
+WRAPPER_EOF
+    echo "* Wrapper script created at ${wrapper}"
+
+    echo ""
+    echo "Setup complete! You can now use:"
+    echo "  ck-rocprof run <name> <executable>"
+}
+
+# Detect GPU architecture (inside container)
+# rocprof-compute uses marketing names (MI350, MI300X, etc.) for directory naming
+# This function attempts to match that convention
+detect_gpu_arch() {
+    if [ -n "${GPU_TARGET:-}" ]; then
+        echo "${GPU_TARGET}"
+        return 0
+    fi
+
+    local detected
+
+    # Try to get marketing name from rocminfo in container
+    detected=$(docker exec "${CONTAINER_NAME}" bash -c \
+        'rocminfo 2>/dev/null | grep "Marketing Name:" | grep -oP "MI\d+[A-Z]*" | head -1' \
+        2>/dev/null || true)
+    if [ -n "$detected" ]; then
+        echo "$detected"
+        return 0
+    fi
+
+    # Fallback: Try gfx name from rocminfo in container
+    detected=$(docker exec "${CONTAINER_NAME}" bash -c \
+        'rocminfo 2>/dev/null | grep -oP "gfx[0-9a-z]+" | head -1' \
+        2>/dev/null || true)
+    if [ -n "$detected" ]; then
+        echo "$detected"
+        return 0
+    fi
+
+    # Fallback: try to detect from existing workload dirs in container
+    detected=$(docker exec "${CONTAINER_NAME}" find "${WORKLOAD_DIR}" -maxdepth 2 -type d \
+        \( -name 'gfx*' -o -name 'MI*' \) -print -quit 2>/dev/null || true)
+    if [ -n "$detected" ]; then
+        basename "$detected"
+        return 0
+    fi
+
+    # Final fallback
+    echo "MI350"
+}
+
+# Run profiling (inside container)
+cmd_run() {
+    # Validate before 'shift 2': shifting past the end fails and, under set -e,
+    # would abort the script before the usage message is printed.
+    if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
+        ck_rocprof_error "Error: name and executable required"
+        ck_rocprof_error "Usage: ck-rocprof run <name> <executable> [args]"
+        return 1
+    fi
+
+    local name="$1"
+    local executable="$2"
+    shift 2
+    local -a exe_args=("$@")
+
+    # Check setup
+    require_setup || return 1
+
+    # Check if executable exists in container
+    if ! docker exec "${CONTAINER_NAME}" test -f "$executable" 2>/dev/null; then
+        ck_rocprof_error "Error: Executable not found in container: $executable"
+        return 1
+    fi
+
+    local wrapper gpu_arch
+    wrapper=$(get_rocprof_wrapper)
+    gpu_arch=$(detect_gpu_arch)
+
+    echo "Profiling: $executable ${exe_args[*]}"
+    echo "Run name: $name"
+    echo "GPU arch: $gpu_arch"
+    echo "Container: $CONTAINER_NAME"
+    echo "==========================================="
+
+    # Run profiling in container. Arguments are handed to docker exec as an
+    # argv list (no intermediate 'bash -c' string), so names, paths and program
+    # arguments containing spaces or shell metacharacters are passed verbatim.
+    # --no-roof skips roofline analysis to speed up profiling
+    docker exec "${CONTAINER_NAME}" "${wrapper}" profile --no-roof --name "${name}" \
+        -- "${executable}" "${exe_args[@]}"
+
+    # Report the directory that actually exists; fall back to the expected layout
+    local result_path
+    result_path=$(find_workload_path "${name}") || result_path="${WORKLOAD_DIR}/${name}/${gpu_arch}"
+
+    echo ""
+    echo "* Profiling complete"
+    echo "Results saved to: ${result_path}/"
+    echo ""
+    echo "Analyze with: ck-rocprof analyze ${name}"
+}
+
+# Find actual GPU architecture directory for a given run name (inside container)
+# rocprof-compute creates directories with various naming schemes (MI350, gfx950, etc.)
+# This function finds whatever directory actually exists
+# Prints the directory and returns 0, or prints nothing and returns 1
+find_workload_path() {
+    local name="$1"
+    [ -n "$name" ] || return 1
+
+    local run_dir="${WORKLOAD_DIR}/${name}"
+    local gpu_dir
+
+    # First subdirectory (should be GPU architecture). find fails when run_dir
+    # is missing, and -quit stops at the first match without needing a pipe.
+    gpu_dir=$(docker exec "${CONTAINER_NAME}" find "${run_dir}" \
+        -maxdepth 1 -mindepth 1 -type d -print -quit 2>/dev/null) || return 1
+    [ -n "$gpu_dir" ] || return 1
+
+    echo "$gpu_dir"
+}
+
+# Analyze profiling results (inside container)
+cmd_analyze() {
+    local name="${1:-}"
+    local block="${2:-12}" # Default to block 12 (LDS metrics)
+
+    if [ -z "$name" ]; then
+        ck_rocprof_error "Error: name required"
+        ck_rocprof_error "Usage: ck-rocprof analyze <name> [block]"
+        return 1
+    fi
+
+    # Check setup
+    require_setup || return 1
+
+    local wrapper workload_path
+    wrapper=$(get_rocprof_wrapper)
+    # Lookup failure is reported below, so do not let set -e abort here
+    workload_path=$(find_workload_path "${name}") || workload_path=""
+
+    if [ -z "$workload_path" ]; then
+        ck_rocprof_error "Error: Profiling results not found for '${name}'"
+        ck_rocprof_error ""
+        ck_rocprof_error "Available runs:"
+        cmd_list >&2
+        return 1
+    fi
+
+    local gpu_arch
+    gpu_arch=$(basename "$workload_path")
+    echo "Analyzing: ${name} (GPU: ${gpu_arch}, Block ${block})"
+    echo "==========================================="
+    echo ""
+
+    docker exec "${CONTAINER_NAME}" "${wrapper}" analyze --path "${workload_path}" --block "${block}"
+}
+
+# Extract key LDS metrics from profiling results (inside container)
+# Not called by any command in this script yet; kept as a hook for future use
+extract_lds_metrics() {
+    local name="$1"
+    local workload_path
+    # Use the directory that actually exists rather than a guessed architecture name
+    workload_path=$(find_workload_path "${name}") || workload_path=""
+    local csv_file="${workload_path}/SQ_INST_LEVEL_LDS.csv"
+
+    if [ -z "$workload_path" ] || ! docker exec "${CONTAINER_NAME}" test -f "$csv_file" 2>/dev/null; then
+        echo "N/A (CSV not found)"
+        return 1
+    fi
+
+    # Extract key metrics (simplified - would need proper CSV parsing)
+    # For now, just indicate that raw data is available
+    echo "Raw data available at: ${csv_file}"
+}
+
+# Print the first lines of the block 12 report for one run.
+# The report is captured and then truncated with a here-string: piping
+# 'cmd_analyze | head' would end the producer with SIGPIPE once head exits,
+# and pipefail + set -e would turn that into an abort of the whole script.
+print_lds_summary() {
+    local name="$1"
+    local report
+
+    echo "=== ${name} - Block 12 (LDS) ==="
+    report=$(cmd_analyze "${name}" 12 2>/dev/null) || true
+    if [ -n "$report" ]; then
+        head -n 40 <<< "$report"
+    else
+        echo "(no output - run 'ck-rocprof analyze ${name} 12' for details)"
+    fi
+}
+
+# Compare two profiling runs
+cmd_compare() {
+    local name1="${1:-}"
+    local name2="${2:-}"
+
+    if [ -z "$name1" ] || [ -z "$name2" ]; then
+        ck_rocprof_error "Error: two run names required"
+        ck_rocprof_error "Usage: ck-rocprof compare <name1> <name2>"
+        return 1
+    fi
+
+    # Check setup
+    require_setup || return 1
+
+    # Verify both runs exist
+    local path1 path2
+    path1=$(find_workload_path "${name1}") || path1=""
+    path2=$(find_workload_path "${name2}") || path2=""
+
+    if [ -z "$path1" ]; then
+        ck_rocprof_error "Error: Profiling results not found for '${name1}'"
+        return 1
+    fi
+
+    if [ -z "$path2" ]; then
+        ck_rocprof_error "Error: Profiling results not found for '${name2}'"
+        return 1
+    fi
+
+    local gpu1 gpu2
+    gpu1=$(basename "$path1")
+    gpu2=$(basename "$path2")
+
+    echo "Comparing profiling runs:"
+    echo "  Baseline:  ${name1} (${gpu1})"
+    echo "  Optimized: ${name2} (${gpu2})"
+    echo "==========================================="
+    echo ""
+
+    print_lds_summary "${name1}"
+
+    echo ""
+    print_lds_summary "${name2}"
+
+    echo ""
+    echo "==========================================="
+    echo "For detailed analysis, run:"
+    echo "  ck-rocprof analyze ${name1} 12"
+    echo "  ck-rocprof analyze ${name2} 12"
+}
+
+# List available profiling runs (inside container)
+cmd_list() {
+    ensure_container_running "${CONTAINER_NAME}" "${SCRIPT_DIR}" >&2 || return 1
+
+    if ! docker exec "${CONTAINER_NAME}" test -d "${WORKLOAD_DIR}" 2>/dev/null; then
+        echo "No profiling runs found (workload directory doesn't exist)"
+        return 0
+    fi
+
+    local runs
+    runs=$(docker exec "${CONTAINER_NAME}" find "${WORKLOAD_DIR}" \
+        -maxdepth 1 -mindepth 1 -type d -printf '%f\n' 2>/dev/null | sort) || true
+
+    if [ -z "$runs" ]; then
+        echo "No profiling runs found in ${WORKLOAD_DIR}"
+        return 0
+    fi
+
+    echo "Available profiling runs:"
+    echo "==========================================="
+
+    local run path gpu_arch size modified
+    while IFS= read -r run; do
+        path=$(find_workload_path "$run") || path=""
+
+        if [ -n "$path" ]; then
+            gpu_arch=$(basename "$path")
+            size=$(docker exec "${CONTAINER_NAME}" du -sh "$path" 2>/dev/null | cut -f1) || size="?"
+            modified=$(docker exec "${CONTAINER_NAME}" stat -c %y "$path" 2>/dev/null | cut -d' ' -f1) || modified="?"
+            printf "  %-25s [%-8s, %s, %s]\n" "$run" "$gpu_arch" "$size" "$modified"
+        else
+            printf "  %-25s [no data]\n" "$run"
+        fi
+    done <<< "$runs"
+
+    echo ""
+    echo "Analyze with: ck-rocprof analyze <name>"
+}
+
+# Main command dispatcher
+case "${1:-}" in
+    setup)
+        cmd_setup
+        ;;
+    run)
+        shift
+        cmd_run "$@"
+        ;;
+    analyze)
+        shift
+        cmd_analyze "$@"
+        ;;
+    compare)
+        shift
+        cmd_compare "$@"
+        ;;
+    list)
+        cmd_list
+        ;;
+    help|--help|-h)
+        show_help
+        ;;
+    *)
+        if [ -z "${1:-}" ]; then
+            show_help
+        else
+            ck_rocprof_error "Unknown command: ${1}"
+            ck_rocprof_error ""
+            show_help >&2
+            exit 1
+        fi
+        ;;
+esac
diff --git a/script/tools/ck-rocprof.md b/script/tools/ck-rocprof.md
new file mode 100644
index 00000000000..b9dc858dfaf
--- /dev/null
+++ b/script/tools/ck-rocprof.md
@@ -0,0 +1,157 @@
+# CK ROCProf Tool
+
+GPU performance profiling for Composable Kernel applications using AMD rocprof-compute.
+
+## Quick Start
+
+```bash
+# One-time setup
+./script/tools/ck-rocprof setup
+
+# Profile executable
+cd build
+../script/tools/ck-rocprof run baseline ./bin/tile_example_gemm_universal
+
+# Analyze LDS metrics
+../script/tools/ck-rocprof analyze baseline
+
+# Compare optimizations
+../script/tools/ck-rocprof run optimized ./bin/tile_example_gemm_universal
+../script/tools/ck-rocprof compare baseline optimized
+```
+
+## Commands
+
+### `setup`
+One-time setup: creates Python venv, installs dependencies, configures rocprof-compute.
+
+### `run <name> <executable> [args]`
+Profile executable and save results.
+
+```bash
+# Basic profiling
+ck-rocprof run baseline ./bin/gemm_example
+
+# With arguments
+ck-rocprof run large_matrix ./bin/gemm_example -m 8192 -n 8192 -k 4096
+
+# Test filtering
+ck-rocprof run unit_test ./bin/test_gemm --gtest_filter="*Fp16*"
+```
+
+### `analyze <name> [block]`
+Display profiling metrics (default: Block 12 - LDS).
+
+```bash
+ck-rocprof analyze baseline # LDS metrics
+ck-rocprof analyze baseline 2 # L2 Cache
+ck-rocprof analyze baseline 7 # Instruction Mix
+```
+
+### `compare <name1> <name2>`
+Side-by-side comparison of two runs.
+
+### `list`
+List all profiling runs with size and date.
+
+## Key LDS Metrics (Block 12)
+
+**Target Values:**
+- Bank Conflicts/Access: <0.01 (1% conflict rate)
+- Bank Conflict Rate: >90% of peak bandwidth
+
+**Critical Metrics:**
+- **12.2.9 Bank Conflicts/Access**: Direct conflict measure
+  - Baseline (naive): ~0.04 (4% conflicts)
+  - Optimized: <0.005 (<0.5% conflicts)
+- **12.2.12 Bank Conflict Cycles**: Wasted cycles per kernel
+- **12.2.17 LDS Data FIFO Full**: Memory system pressure
+
+## Optimization Workflow
+
+```bash
+# 1. Baseline
+ck-rocprof run baseline ./bin/my_kernel
+
+# 2. Check conflicts
+ck-rocprof analyze baseline
+# Look for Bank Conflicts/Access > 0.02
+
+# 3. Optimize code (XOR transforms, padding, etc.)
+# ... edit source ...
+
+# 4. Test optimization
+ninja my_kernel
+ck-rocprof run optimized ./bin/my_kernel
+
+# 5. Verify improvement
+ck-rocprof compare baseline optimized
+# Target: 8-10x reduction in conflicts
+```
+
+## Environment Variables
+
+- `CK_PROFILE_VENV`: Python venv path (default: `/opt/rocprof_venv`)
+- `CK_ROCPROF_BIN`: rocprof-compute binary path (default: `/opt/rocm-7.0.1/bin/rocprof-compute`)
+- `CK_WORKLOAD_DIR`: Results directory (default: `/workspace/workloads`)
+- `GPU_TARGET`: Override GPU detection (e.g., `gfx950`, `gfx942`)
+
+## Interpreting Results
+
+**Good Performance:**
+```
+Bank Conflicts/Access: <0.01
+Bank Conflict Rate: >90% of peak
+LDS Data FIFO Full: Minimal cycles
+```
+
+**Needs Optimization:**
+```
+Bank Conflicts/Access: >0.02
+Bank Conflict Cycles: High MAX values
+LDS Data FIFO Full: High memory pressure
+```
+
+## Troubleshooting
+
+**"Profiling environment not set up"**
+```bash
+ck-rocprof setup
+```
+
+**"rocprof-compute not found"**
+```bash
+export CK_ROCPROF_BIN=/custom/path/rocprof-compute
+ck-rocprof setup
+```
+
+**"Profiling results not found"**
+```bash
+ck-rocprof list # Check available runs
+rocminfo | grep gfx # Verify GPU arch
+export GPU_TARGET=gfx950 # Override if needed
+```
+
+## Storage Layout
+
+Results stored in `/workspace/workloads/<name>/<gpu_arch>/`:
+- `SQ_INST_LEVEL_LDS.csv`: LDS metrics
+- `pmc_perf.csv`: Performance counters
+- `counter_collection.csv`: All metrics
+
+## Technical Details
+
+- **Setup**: Creates isolated Python venv, installs dependencies
+- **Profiling**: Runs `rocprof-compute profile --no-roof --name <name> -- <executable> [args]`
+- **Analysis**: Runs `rocprof-compute analyze --path <workload_path> --block <block>`
+- **GPU Support**: MI300/MI350 series, auto-detects architecture
+
+## Related Tools
+
+- `ck-docker`: Container management
+- `rocprof-compute`: AMD GPU profiler v2
+- `rocm-smi`: System monitoring
+
+## License
+
+Copyright (c) Advanced Micro Devices, Inc. SPDX-License-Identifier: MIT