diff --git a/.github/workflows/ci-paddle.yml b/.github/workflows/ci-paddle.yml
new file mode 100644
index 000000000..a9fee0476
--- /dev/null
+++ b/.github/workflows/ci-paddle.yml
@@ -0,0 +1,105 @@
+name: CI Paddle
+on:
+  push:
+    branches: [paddle]
+    tags: ["v*"]
+  pull_request:
+  merge_group:
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: "${{ github.workflow }}-${{ github.ref }}"
+  cancel-in-progress: true
+
+jobs:
+  test:
+    name: Test
+    runs-on:
+      group: H20
+    timeout-minutes: 30
+    env:
+      container_name: tilelang-paddle-test-${{ github.run_id }}
+    steps:
+      - name: Check docker image and run container
+        env:
+          FLAGS_fraction_of_gpu_memory_to_use: 0.15
+          CTEST_PARALLEL_LEVEL: 2
+          WITH_GPU: "ON"
+          CUDA_ARCH_NAME: Hopper
+          WITH_AVX: "ON"
+          PY_VERSION: "3.10"
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          no_proxy: "bcebos.com,apiin.im.baidu.com,gitee.com,aliyun.com,.baidu.com,.tuna.tsinghua.edu.cn"
+        run: |
+          docker_image=ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda129-coverage-test
+          docker run -d -t --gpus all --name ${{ env.container_name }} \
+            -v "/dev/shm:/dev/shm" \
+            -v ${{ github.workspace }}/../../..:${{ github.workspace }}/../../.. \
+            -v ${{ github.workspace }}:/workspace \
+            -e FLAGS_fraction_of_gpu_memory_to_use \
+            -e CTEST_PARALLEL_LEVEL \
+            -e WITH_GPU \
+            -e CUDA_ARCH_NAME \
+            -e WITH_AVX \
+            -e PY_VERSION \
+            -e GITHUB_TOKEN \
+            -e no_proxy \
+            -w /workspace \
+            --network host \
+            ${docker_image}
+
+      - name: Checkout repository
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -c '
+            set -e
+            source ${{ github.workspace }}/../../../proxy
+            git config --global --add safe.directory "*"
+            # Clean workspace
+            find . -maxdepth 1 ! -name "." -exec rm -rf {} +
+            # Checkout
+            git init
+            git remote add origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}
+            git fetch origin ${{ github.ref }} --depth=1
+            git checkout FETCH_HEAD
+            git submodule update --init --recursive
+          '
+
+      - name: Install dependencies
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -c '
+            set -e
+            source ${{ github.workspace }}/../../../proxy
+
+            # Install uv
+            curl -LsSf https://astral.sh/uv/install.sh | sh
+            source $HOME/.local/bin/env
+
+            # Create and activate virtual environment
+            uv venv .venv --seed
+            source .venv/bin/activate
+
+            # Install paddle
+            uv pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu129/
+
+            # Install project and minimal test runner
+            uv pip install pytest
+            uv pip install -e .
+          '
+
+      - name: Run tests
+        run: |
+          docker exec -t ${{ env.container_name }} /bin/bash -c '
+            set -e
+            source .venv/bin/activate
+            pytest tests_paddle/
+          '
+
+      - name: Terminate and delete the container
+        if: always()
+        run: |
+          set +e
+          docker stop ${{ env.container_name }}
+          docker rm ${{ env.container_name }}
diff --git a/.github/workflows/dist-paddle.yml b/.github/workflows/dist-paddle.yml
index 67a2d9372..8f3896633 100644
--- a/.github/workflows/dist-paddle.yml
+++ b/.github/workflows/dist-paddle.yml
@@ -1,7 +1,10 @@
-name: Dist
+name: Dist Paddle
 on:
   push:
+    branches: [paddle]
     tags: ["v*"]
+  pull_request:
+  merge_group:
   workflow_dispatch:
 
 permissions:
@@ -28,7 +31,7 @@ jobs:
       # Otherwise, the version of the SDist has a git hash suffix (e.g., 0.1.0+gitabcdef12),
       # but the package built from the SDist has no way to get the git hash (it is not a git repo),
       # leading to inconsistent versions between SDist and built packages (+gitabcdef12 vs. +gitunknown).
-      NO_VERSION_LABEL: 'ON'
+      NO_VERSION_LABEL: "ON"
 
     steps:
       - name: Checkout repository
diff --git a/tests_paddle/test_quick_start.py b/tests_paddle/test_quick_start.py
new file mode 100644
index 000000000..5b1b9cf66
--- /dev/null
+++ b/tests_paddle/test_quick_start.py
@@ -0,0 +1,81 @@
+import numpy as np
+import paddle
+
+paddle.compat.enable_torch_proxy(scope={"tilelang"})
+
+import tilelang
+import tilelang.language as T
+
+
+# @tilelang.jit(target="cuda")
+# target can currently be "cuda", "hip", or "cpu";
+# if not specified, it is inferred from the input tensors at compile time.
+@tilelang.jit
+def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):
+    @T.prim_func
+    def matmul_relu_kernel(
+        A: T.Tensor((M, K), dtype),
+        B: T.Tensor((K, N), dtype),
+        C: T.Tensor((M, N), dtype),
+    ):
+        # Initialize kernel context
+        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
+            A_shared = T.alloc_shared((block_M, block_K), dtype)
+            B_shared = T.alloc_shared((block_K, block_N), dtype)
+            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
+
+            # Enable rasterization for better L2 cache locality (optional)
+            # T.use_swizzle(panel_size=10, enable=True)
+
+            # Clear local accumulation
+            T.clear(C_local)
+
+            for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):
+                # Copy tile of A
+                # This is sugar syntax for a parallelized copy
+                T.copy(A[by * block_M, ko * block_K], A_shared)
+
+                # Copy tile of B
+                T.copy(B[ko * block_K, bx * block_N], B_shared)
+
+                # Perform a tile-level GEMM on the shared buffers
+                # Currently dispatched to cute/hip on NVIDIA/AMD GPUs
+                T.gemm(A_shared, B_shared, C_local)
+
+            # ReLU
+            for i, j in T.Parallel(block_M, block_N):
+                C_local[i, j] = T.max(C_local[i, j], 0)
+
+            # Copy result back to global memory
+            T.copy(C_local, C[by * block_M, bx * block_N])
+
+    return matmul_relu_kernel
+
+
+def test_quick_start():
+    M = 1024  # M = T.dynamic("m") if you want to use a dynamic shape
+    N = 1024
+    K = 1024
+    block_M = 128
+    block_N = 128
+    block_K = 32
+
+    # Define the kernel (matmul) and compile/lower it into an executable module
+    matmul_relu_kernel = matmul(M, N, K, block_M, block_N, block_K)
+    # Test the kernel in Python with Paddle data
+    import paddle
+
+    # Create random input tensors on the GPU
+    a = paddle.randn(M, K, device="cuda", dtype=paddle.float16)
+    b = paddle.randn(K, N, device="cuda", dtype=paddle.float16)
+    c = paddle.empty(M, N, device="cuda", dtype=paddle.float16)
+
+    # Run the kernel
+    matmul_relu_kernel(a, b, c)
+
+    print(c)
+    # Reference matmul + ReLU computed with Paddle
+    ref_c = paddle.nn.functional.relu(a @ b)
+
+    # Validate correctness
+    np.testing.assert_allclose(c.numpy(), ref_c.numpy(), rtol=1e-2, atol=1e-2)