Skip to content

Commit acd9588

Browse files
committed
Merge remote-tracking branch 'origin/main' into autoparallel
2 parents d54a6d4 + 58fa181 commit acd9588

31 files changed

+3323
-46
lines changed

.ci/docker/common/install_conda.sh

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@ install_pip_dependencies() {
4343
pip_install -r /opt/conda/requirements.txt
4444
pip_install -r /opt/conda/requirements-flux.txt
4545
pip_install -r /opt/conda/requirements-vlm.txt
46+
pip_install -r /opt/conda/requirements-transformers-backend.txt
4647
popd
4748
}
4849

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
transformers==4.57.1

.ci/docker/ubuntu/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ COPY requirements-dev.txt /opt/conda/
3333
COPY requirements.txt /opt/conda/
3434
COPY requirements-flux.txt /opt/conda/
3535
COPY requirements-vlm.txt /opt/conda/
36+
COPY requirements-transformers-backend.txt /opt/conda/
3637
COPY conda-env-ci.txt /opt/conda/
3738
COPY ./common/install_conda.sh install_conda.sh
3839
COPY ./common/utils.sh utils.sh

.github/workflows/integration_test_8gpu_compiler_toolkit.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -50,4 +50,4 @@ jobs:
5050
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
5151
5252
mkdir artifacts-to-be-uploaded
53-
python -m torchtitan.experiments.compiler_toolkit.tests.integration_tests artifacts-to-be-uploaded --ngpu 4
53+
TRAIN_FILE=torchtitan.experiments.compiler_toolkit.train python -m torchtitan.experiments.compiler_toolkit.tests.integration_tests artifacts-to-be-uploaded --ngpu 4

.github/workflows/integration_test_8gpu_features.yaml

Lines changed: 41 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -25,26 +25,43 @@ permissions:
2525
contents: read
2626

2727
jobs:
28+
# Step 1: Dynamically compute the matrix based on conditions
29+
set-matrix:
30+
runs-on: ubuntu-latest
31+
outputs:
32+
matrix: ${{ steps.set.outputs.matrix }}
33+
steps:
34+
- id: set
35+
run: |
36+
# Decide which matrix entries to include based on event type
37+
if [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]]; then
38+
# Include both CUDA and ROCm
39+
echo '{"include":[
40+
{"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"},
41+
{"name":"rocm","runner":"linux.rocm.gpu.gfx942.8","gpu-arch-type":"rocm","gpu-arch-version":"7.0","docker-image":"torchtitan-rocm-ubuntu-22.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/rocm7.0"}
42+
]}' > matrix.json
43+
else
44+
# Include only CUDA
45+
echo '{"include":[
46+
{"name":"cuda","runner":"linux.g5.48xlarge.nvidia.gpu","gpu-arch-type":"cuda","gpu-arch-version":"12.6","docker-image":"torchtitan-ubuntu-20.04-clang12","index-url":"https://download.pytorch.org/whl/nightly/cu126"}
47+
]}' > matrix.json
48+
fi
49+
50+
# Export matrix to job outputs
51+
{
52+
echo 'matrix<<EOF'
53+
cat matrix.json
54+
echo 'EOF'
55+
} >> $GITHUB_OUTPUT
56+
57+
58+
# Step 2: Use the dynamic matrix in the build-test job
2859
build-test:
60+
needs: set-matrix
2961
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
3062
strategy:
3163
fail-fast: false
32-
matrix:
33-
include:
34-
- name: cuda
35-
runner: linux.g5.48xlarge.nvidia.gpu
36-
gpu-arch-type: cuda
37-
gpu-arch-version: "12.6"
38-
# This image is faster to clone than the default, but it lacks CC needed by triton
39-
# (1m25s vs 2m37s).
40-
docker-image: torchtitan-ubuntu-20.04-clang12
41-
index-url: https://download.pytorch.org/whl/nightly/cu126
42-
- name: rocm
43-
runner: linux.rocm.gpu.gfx942.8
44-
gpu-arch-type: rocm
45-
gpu-arch-version: "7.0"
46-
docker-image: torchtitan-rocm-ubuntu-22.04-clang12
47-
index-url: https://download.pytorch.org/whl/nightly/rocm7.0
64+
matrix: ${{ fromJSON(needs.set-matrix.outputs.matrix) }}
4865
with:
4966
runner: ${{ matrix.runner }}
5067
gpu-arch-type: ${{ matrix.gpu-arch-type }}
@@ -73,8 +90,14 @@ jobs:
7390
sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded"
7491
sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded"
7592
76-
export TEST_WITH_ROCM=$([[ "${{ matrix.gpu-arch-type }}" == "rocm" ]] && echo 1 || echo 0)
77-
python -m tests.integration_tests.run_tests --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
93+
python -m tests.integration_tests.run_tests --gpu_arch_type ${{ matrix.gpu-arch-type }} --test_suite features $RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 8
94+
95+
# Verify the accuracy.
96+
echo "Checking FSDP4 v.s. HSDP2FSDP2TP2 accuracy parity"
97+
export baseline_options="--parallelism.data_parallel_replicate_degree=1"
98+
export test_options="--parallelism.data_parallel_replicate_degree=2 --parallelism.tensor_parallel_degree=2"
99+
python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --baseline-ngpus=4 --test-ngpus=8 --steps=1
78100
101+
# Cleanup the checkpoints so that we don't waste network bandwidth and time.
79102
rm -rf $RUNNER_TEMP/artifacts-to-be-uploaded/*/checkpoint
80103
rm -rf artifacts-to-be-uploaded/*/checkpoint
Lines changed: 53 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,53 @@
1+
name: Transformers Backend 8 GPU Integration Tests
2+
3+
on:
4+
push:
5+
branches: [ main ]
6+
paths:
7+
- 'torchtitan/experiments/transformers_backend/**'
8+
pull_request:
9+
paths:
10+
- 'torchtitan/experiments/transformers_backend/**'
11+
schedule:
12+
# Runs every 12 hours
13+
- cron: '0 */12 * * *'
14+
15+
concurrency:
16+
group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
17+
cancel-in-progress: true
18+
19+
defaults:
20+
run:
21+
shell: bash -l -eo pipefail {0}
22+
23+
jobs:
24+
build-test:
25+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
26+
with:
27+
runner: linux.g5.48xlarge.nvidia.gpu
28+
gpu-arch-type: cuda
29+
gpu-arch-version: "12.6"
30+
# This image is faster to clone than the default, but it lacks CC needed by triton
31+
# (1m25s vs 2m37s).
32+
docker-image: torchtitan-ubuntu-20.04-clang12
33+
repository: pytorch/torchtitan
34+
upload-artifact: outputs
35+
script: |
36+
set -eux
37+
38+
# The generic Linux job chooses to use base env, not the one setup by the image
39+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
40+
conda activate "${CONDA_ENV}"
41+
42+
# Log CUDA driver version for debugging.
43+
DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
44+
echo "CUDA driver version: ${DRIVER_VERSION}"
45+
46+
pip config --user set global.progress_bar off
47+
48+
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
49+
50+
USE_CPP=0 python -m pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
51+
52+
mkdir artifacts-to-be-uploaded
53+
python -m torchtitan.experiments.transformers_backend.tests.integration_tests artifacts-to-be-uploaded --ngpu 8

0 commit comments

Comments
 (0)