# GitHub Actions workflow — TorchTitan integration test for AutoParallel.
# Ref: "Enabling ZeroBubbleV schedule in GraphPP" (#250 / PR #685).
name: Test TorchTitan Integration

# Run on every PR, and on pushes to main / release branches.
on:
  pull_request:
  push:
    branches:
      - main
      - "release/*"

# Collapse redundant runs: per-ref for branches/PRs, per-run-number on main
# so every main push still gets its own run.
concurrency:
  group: test-torchtitan-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

jobs:
  test-torchtitan:
    name: Test TorchTitan Integration (cuda12.6-py3.12)
    # Reusable GPU runner workflow from pytorch/test-infra.
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    strategy:
      fail-fast: true
      matrix:
        include:
          - name: 12xlargegpu
            runs-on: linux.g5.12xlarge.nvidia.gpu
            # NOTE(review): torch-spec appears unused by the script below,
            # which installs the nightly wheel explicitly — confirm intent.
            torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
            gpu-arch-type: "cuda"
            gpu-arch-version: "12.6"
    with:
      timeout: 60
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      submodules: recursive
      # Literal block scalar: each line below runs as shell on the GPU runner.
      script: |
        conda create --yes --quiet --name py312 python=3.12
        source $(conda info --base)/etc/profile.d/conda.sh
        conda activate py312
        pip install --quiet -r requirements-test.txt
        # For some reason the spec above isnt working
        pip uninstall -y torch
        pip install --no-input --quiet --pre torch --index-url https://download.pytorch.org/whl/nightly/cu126
        pip install --quiet .
        # Clone TorchTitan on autoparallel branch
        git clone --branch autoparallel https://github.com/pytorch/torchtitan.git
        cd torchtitan
        pip install --quiet -r requirements.txt
        # Run TorchTitan training with AutoParallel
        NGPU=4 CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh \
          --model.name auto_parallel.llama3 \
          --parallelism.tensor_parallel_degree 4 \
          --training.dataset c4 \
          --compile.enable