7 changes: 4 additions & 3 deletions .meta/mast/qwen3_1_7b_mast.yaml
@@ -9,6 +9,7 @@ max_res_tokens: 512
model: "/mnt/wsfuse/teamforge/hf/qwen3_1.7b"
off_by_n: 1 # Off by one by default
launcher: mast
compile: true # Enable torch.compile for trainer/ref_model, and CUDA graphs for vLLM

# Main loop configuration
rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas
@@ -37,7 +38,7 @@ policy:
model: /mnt/wsfuse/teamforge/hf/qwen3_1.7b
tensor_parallel_size: 1
pipeline_parallel_size: 1
enforce_eager: false
enforce_eager: ${not:${compile}}
# TODO: Had to disable this becasue vLLm wouldn't like
# needs to revisited.
disable_custom_all_reduce: true
@@ -68,7 +69,7 @@ trainer:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1
@@ -112,7 +113,7 @@ ref_model:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1
7 changes: 4 additions & 3 deletions .meta/mast/qwen3_32b_mast.yaml
@@ -9,6 +9,7 @@ max_res_tokens: 512
model: "/mnt/wsfuse/teamforge/hf/qwen3_32b"
off_by_n: 1 # Off by one by default
launcher: mast
compile: true # Enable torch.compile for trainer/ref_model, and CUDA graphs for vLLM

# Main loop configuration
rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas
@@ -37,7 +38,7 @@ policy:
model: /mnt/wsfuse/teamforge/hf/qwen3_32b
tensor_parallel_size: 2
pipeline_parallel_size: 1
enforce_eager: false
enforce_eager: ${not:${compile}}
# TODO: Had to disable this becasue vLLm wouldn't like
# needs to revisited.
disable_custom_all_reduce: true
@@ -67,7 +68,7 @@ trainer:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 8
@@ -110,7 +111,7 @@ ref_model:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1
7 changes: 4 additions & 3 deletions .meta/mast/qwen3_4b_mast.yaml
@@ -9,6 +9,7 @@ max_res_tokens: 512
model: "Qwen/Qwen3-4B"
off_by_n: 1 # Off by one by default
launcher: mast
compile: true # Enable torch.compile for trainer/ref_model, and CUDA graphs for vLLM
Contributor:
This file should have been removed?

Contributor (author):
It's there: https://github.com/meta-pytorch/torchforge/blob/main/.meta/mast/qwen3_4b_mast.yaml
But I can delete it in this PR if you want.

Contributor:
Oof, why are there so many configs.
Yes, I missed it in https://github.com/meta-pytorch/torchforge/pull/632/files. Please just remove it.


# Main loop configuration
rollout_threads: ${services.policy.num_replicas} # Recommended to set equal to policy.num_replicas
@@ -37,7 +38,7 @@ policy:
model: /mnt/wsfuse/teamforge/hf/qwen3_4b
tensor_parallel_size: 2
pipeline_parallel_size: 1
enforce_eager: false
enforce_eager: ${not:${compile}}
# TODO: Had to disable this becasue vLLm wouldn't like
# needs to revisited.
disable_custom_all_reduce: true
@@ -68,7 +69,7 @@ trainer:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 4
@@ -112,7 +113,7 @@ ref_model:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1
7 changes: 4 additions & 3 deletions apps/grpo/llama3_8b.yaml
@@ -8,6 +8,7 @@ max_req_tokens: 1024
max_res_tokens: 2048
model: "meta-llama/Meta-Llama-3.1-8B-Instruct"
off_by_n: 1 # Off by one by default
compile: true # Enable torch.compile for trainer/ref_model, and CUDA graphs for vLLM

# Observability configuration
metric_logging:
@@ -32,7 +33,7 @@ policy:
model: ${model}
tensor_parallel_size: 2
pipeline_parallel_size: 1
enforce_eager: false
enforce_eager: ${not:${compile}}
sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
n: ${group_size}
max_tokens: ${max_res_tokens}
@@ -59,7 +60,7 @@ trainer:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: -1
@@ -100,7 +101,7 @@ ref_model:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1
7 changes: 4 additions & 3 deletions apps/grpo/qwen3_1_7b.yaml
@@ -8,6 +8,7 @@ max_req_tokens: 1024
max_res_tokens: 2048
model: "Qwen/Qwen3-1.7B"
off_by_n: 1 # Off by one by default
compile: true # Enable torch.compile for trainer/ref_model, and CUDA graphs for vLLM

# Main loop configuration
rollout_threads: 1 # Recommended to set equal to policy.num_replicas
@@ -36,7 +37,7 @@ policy:
model: ${model}
tensor_parallel_size: 1
pipeline_parallel_size: 1
enforce_eager: false
enforce_eager: ${not:${compile}}
sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
n: ${group_size}
max_tokens: ${max_res_tokens}
@@ -63,7 +64,7 @@ trainer:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1
@@ -101,7 +102,7 @@ ref_model:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1
7 changes: 4 additions & 3 deletions apps/grpo/qwen3_32b.yaml
@@ -9,6 +9,7 @@ max_req_tokens: 1024
max_res_tokens: 1024
model: "Qwen/Qwen3-32B"
off_by_n: 1 # Off by one by default
compile: true # Enable torch.compile for trainer/ref_model, and CUDA graphs for vLLM

provisioner:
launcher: slurm
@@ -39,7 +40,7 @@ policy:
model: ${model}
tensor_parallel_size: 4
pipeline_parallel_size: 1
enforce_eager: false
enforce_eager: ${not:${compile}}
sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
n: ${group_size}
max_tokens: ${max_res_tokens}
@@ -66,7 +67,7 @@ trainer:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1
@@ -104,7 +105,7 @@ ref_model:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1
7 changes: 4 additions & 3 deletions apps/grpo/qwen3_8b.yaml
@@ -8,6 +8,7 @@ max_req_tokens: 1024
max_res_tokens: 2048
model: "Qwen/Qwen3-8B"
off_by_n: 1 # Off by one by default
compile: true # Enable torch.compile for trainer/ref_model, and CUDA graphs for vLLM
Contributor:
Why not enable it by default if you're updating all of the configs?

Contributor (author):
What do you mean by "enabling it by default"? We still need to expose the flag because compile can be tricky in some setups. It also adds a bit of warmup time, so if someone is just quickly testing something, they may want to set it to false.

Contributor (@JenniferWang, Dec 10, 2025):
I see. I was suggesting that to reduce the number of hyperparameters in the YAML config, because:

  1. We seem to want it enabled for production runs.
  2. This is a niche config (many things can slow down warmup time) that I don't expect people to remember to toggle in practice. We can set the default to false when launching the job in local mode, or only set it to true for large models.

Not a big deal.
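The suggestion in this thread (deriving the default for `compile` from how the job is launched rather than hard-coding it in every YAML) could look roughly like the sketch below. This is hypothetical and not part of this PR; it assumes a top-level `launcher` key as in the MAST configs (the Slurm config nests it under `provisioner`), and the helper name is made up.

```python
from omegaconf import OmegaConf

def default_compile_from_launcher(cfg):
    """Hypothetical helper: enable compile for cluster launches (mast/slurm),
    keep quick local runs eager unless the config sets the flag explicitly."""
    if "compile" not in cfg:
        cfg.compile = cfg.get("launcher", "local") in ("mast", "slurm")
    return cfg

# A local smoke test would then resolve enforce_eager: ${not:${compile}} to true.
cfg = default_compile_from_launcher(OmegaConf.create({"launcher": "local"}))
assert cfg.compile is False
```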


# Observability configuration
metric_logging:
@@ -32,7 +33,7 @@ policy:
model: ${model}
tensor_parallel_size: 2
pipeline_parallel_size: 1
enforce_eager: false
enforce_eager: ${not:${compile}}
sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
n: ${group_size}
max_tokens: ${max_res_tokens}
@@ -59,7 +60,7 @@ trainer:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: -1
@@ -100,7 +101,7 @@ ref_model:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1
3 changes: 3 additions & 0 deletions src/forge/util/config.py
@@ -18,6 +18,9 @@
# Add support for summing lists of numbers, e.g. ${sum:${max_req_tokens},${max_res_tokens}}
OmegaConf.register_new_resolver("sum", lambda *args: sum(args), replace=True)

# Add support for boolean negation, e.g. ${not:${compile}}
OmegaConf.register_new_resolver("not", lambda x: not x, replace=True)


def _has_component(node: Any) -> bool:
"""Check if a node has a _component_ field."""
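The new `not` resolver mirrors the existing `sum` helper and simply negates its argument, which is what lets a single top-level `compile` flag drive `trainer.compile.enable` / `ref_model.compile.enable` and flip the policy's `enforce_eager` in the opposite direction. A minimal standalone sketch of how the two resolvers behave once registered (the key names below are illustrative, not copied from the real configs):

```python
from omegaconf import OmegaConf

# Same registrations as in forge.util.config above.
OmegaConf.register_new_resolver("sum", lambda *args: sum(args), replace=True)
OmegaConf.register_new_resolver("not", lambda x: not x, replace=True)

cfg = OmegaConf.create("""
compile: true
max_req_tokens: 512
max_res_tokens: 512
policy:
  enforce_eager: ${not:${compile}}
  total_tokens: ${sum:${max_req_tokens},${max_res_tokens}}
trainer:
  compile:
    enable: ${compile}
""")

assert cfg.trainer.compile.enable is True
assert cfg.policy.enforce_eager is False    # compile on -> CUDA graphs allowed
assert cfg.policy.total_tokens == 1024

cfg.compile = False                         # flip the one flag...
assert cfg.policy.enforce_eager is True     # ...and the policy falls back to eager mode
```

Interpolations are resolved lazily, so flipping `compile` is reflected the next time the dependent values are read; resolver results are not cached unless registered with use_cache=True.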
5 changes: 3 additions & 2 deletions tests/integration_tests/fixtures/qwen3_1_7b_no_tp.yaml
@@ -5,6 +5,7 @@ max_req_tokens: 512
max_res_tokens: 512
model: "Qwen/Qwen3-1.7B"
off_by_n: 1 # Off by one by default
compile: true # Enable torch.compile for trainer, and CUDA graphs for vLLM


# Policy configuration
@@ -13,7 +14,7 @@ policy:
model: ${model}
tensor_parallel_size: 1
pipeline_parallel_size: 1
enforce_eager: false
enforce_eager: ${not:${compile}}
sampling_params:
n: ${group_size}
max_tokens: ${max_res_tokens}
@@ -40,7 +41,7 @@ trainer:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1
5 changes: 3 additions & 2 deletions tests/integration_tests/fixtures/qwen3_1_7b_tp.yaml
@@ -7,6 +7,7 @@ max_req_tokens: 512
max_res_tokens: 512
model: "Qwen/Qwen3-1.7B"
off_by_n: 1 # Off by one by default
compile: true # Enable torch.compile for trainer, and CUDA graphs for vLLM


# Policy configuration
@@ -15,7 +16,7 @@ policy:
model: ${model}
tensor_parallel_size: 4
pipeline_parallel_size: 1
enforce_eager: false
enforce_eager: ${not:${compile}}
sampling_params:
n: ${group_size}
max_tokens: ${max_res_tokens}
@@ -42,7 +43,7 @@ trainer:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1
5 changes: 3 additions & 2 deletions tests/sandbox/weight_sync/qwen3_1_7b.yaml
@@ -5,6 +5,7 @@ model: "Qwen/Qwen3-1.7B"
local_batch_size: 4
max_req_tokens: 64
max_res_tokens: 64
compile: true # Enable torch.compile for trainer, and CUDA graphs for vLLM

metric_logging:
console:
@@ -16,7 +17,7 @@ policy:
model: ${model}
tensor_parallel_size: 1
pipeline_parallel_size: 1
enforce_eager: true
enforce_eager: ${not:${compile}}
sampling_params:
n: 1
max_tokens: 32 # Just for verification forward pass
@@ -42,7 +43,7 @@ trainer:
dtype: bfloat16
gc_freq: 1
compile:
enable: false
enable: ${compile}
parallelism:
data_parallel_replicate_degree: 1
data_parallel_shard_degree: 1 # Single GPU, no FSDP