Commits (34)
36db136
automodel on latest main
adil-a Oct 28, 2025
8e562e8
new automodel checkpointing
adil-a Oct 29, 2025
5a1cff1
adding automodel sharding
adil-a Oct 29, 2025
acff747
adding moe init
adil-a Oct 29, 2025
0cbc3ac
fix
adil-a Oct 29, 2025
3336fe9
removing legacy checkpointing utils
adil-a Oct 29, 2025
19d29aa
linting
adil-a Oct 29, 2025
dcb4cb2
adding moe check
adil-a Oct 29, 2025
738338f
automodel
adil-a Oct 29, 2025
62acdfc
latest automodel bump
adil-a Oct 29, 2025
b6a3fdd
changes
adil-a Oct 30, 2025
2b86310
cfg
adil-a Oct 30, 2025
dd634b6
eof fix
adil-a Oct 31, 2025
2f74d79
feat: automodel moe integration
hemildesai Nov 4, 2025
1163407
bump
Nov 5, 2025
d270a5b
adding torch arch list for grouped gemm install
adil-a Nov 5, 2025
d038aca
linting
adil-a Nov 5, 2025
e936ebf
main merge
adil-a Nov 5, 2025
7df0cc5
uv lock
adil-a Nov 5, 2025
b4139f1
fix
adil-a Nov 5, 2025
a55a2f1
wandb yaml fix
adil-a Nov 5, 2025
39bd74c
minimizing yaml
adil-a Nov 5, 2025
4e151cb
clean up
adil-a Nov 5, 2025
4b6ce6d
dtype map
adil-a Nov 5, 2025
ef2f92c
lint
adil-a Nov 5, 2025
1eef903
removing unit test
adil-a Nov 5, 2025
24214e9
adding fixes from unit tests
Nov 6, 2025
2ed872a
merging main
adil-a Nov 25, 2025
5489b21
bumping automodel + v2 fixes
adil-a Nov 25, 2025
ed69abd
pre-commit
adil-a Nov 25, 2025
b754c7c
ckpt fix
adil-a Nov 26, 2025
3877e79
pre commit
adil-a Nov 26, 2025
661b596
Sync Automodel submodule to origin/main
adil-a Nov 26, 2025
d89180c
removing RL specific changes for future PR
adil-a Nov 26, 2025
2 changes: 1 addition & 1 deletion 3rdparty/Automodel-workspace/Automodel
Submodule Automodel updated 314 files
217 changes: 217 additions & 0 deletions examples/configs/sft_automodel.yaml
@@ -0,0 +1,217 @@
# SFT Algorithm Configuration
sft:
## the total number of training steps equals
## min(max_num_epochs * len(train_dataloader), max_num_steps)
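## e.g. with the values below (max_num_epochs=1, max_num_steps=60), training runs for
## min(1 * len(train_dataloader), 60) optimizer steps, i.e. at most 60 steps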
max_num_epochs: 1
max_num_steps: 60

val_period: 10
val_batches: 8
val_global_batch_size: 32
val_micro_batch_size: 1
val_at_start: true
seed: 42

checkpointing:
enabled: false
checkpoint_dir: "results/sft"
metric_name: "val_loss" ## set to null to save most recent k checkpoints
higher_is_better: false
keep_top_k: 3
save_period: 10
checkpoint_must_save_by: null

policy:
model_name: "openai/gpt-oss-20b"
tokenizer:
name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default
# chat_template can be a Jinja template string or path to a .jinja file
chat_template: "{% for message in messages %}{%- if message['role'] == 'system' %}{{'Context: ' + message['content'].strip()}}{%- elif message['role'] == 'user' %}{{' Question: ' + message['content'].strip() + ' Answer:'}}{%- elif message['role'] == 'assistant' %}{{' ' + message['content'].strip()}}{%- endif %}{% endfor %}"
chat_template_kwargs: null # can be used to pass kwargs to the chat template, e.g., enable_thinking=true
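# illustrative alternative (hypothetical path and kwargs, not part of this PR):
#   chat_template: "examples/templates/qa_chat_template.jinja"
#   chat_template_kwargs:
#     enable_thinking: true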
train_global_batch_size: 128
train_micro_batch_size: 8
max_total_sequence_length: 512
precision: "bfloat16"
dequantize_base_checkpoint: true

automodel_model_kwargs:
use_liger_kernel: false
backend:
_target_: nemo_automodel.components.moe.utils.BackendConfig
attn: te
linear: te
rms_norm: te
enable_deepep: true
fake_balanced_gate: false
enable_hf_state_dict_adapter: true

dtensor_cfg:
enabled: true
_v2: true
cpu_offload: false
sequence_parallel: false
activation_checkpointing: false
tensor_parallel_size: 1
context_parallel_size: 1
expert_parallel_size: 8
data_parallel_size: 8
custom_parallel_plan: null
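## note (a reading of the sizes above, not verified against the Automodel sharding code):
## data_parallel_size * tensor_parallel_size * context_parallel_size = 8 * 1 * 1 matches the
## 8 GPUs in the cluster section, and expert_parallel_size=8 shards the MoE expert weights
## across those same 8 ranks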

dynamic_batching:
enabled: false
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
sequence_length_round: 64

sequence_packing:
enabled: false
train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
algorithm: "modified_first_fit_decreasing"
sequence_length_round: 64
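## with the values above, ${mul:...} resolves train_mb_tokens to 512 * 8 = 4096 tokens per
## micro-batch in both dynamic_batching and sequence_packing (both disabled here)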

# makes the training sequence length divisible by the tensor parallel size
# this is useful for sequence parallel training
make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
max_grad_norm: 1.0

optimizer:
#name: "torch.optim.AdamW"
name: "transformer_engine.pytorch.optimizers.fused_adam.FusedAdam"
kwargs:
lr: 5.0e-6
weight_decay: 0.1
betas: [0.9, 0.98]
eps: 1e-5
store_param_remainders: true
master_weights: true
# when using DTensor, foreach and fused must be set to False
#foreach: False
#fused: False

# ignored since enabled=false, but needed for testing purposes
megatron_cfg:
enabled: false
empty_unused_memory_level: 1
activation_checkpointing: false
tensor_model_parallel_size: 1
expert_tensor_parallel_size: 1
expert_model_parallel_size: 1
pipeline_model_parallel_size: 1
context_parallel_size: 1
pipeline_dtype: ${policy.precision}
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
sequence_parallel: false
freeze_moe_router: false
moe_router_dtype: null
moe_router_load_balancing_type: "aux_loss"
moe_router_bias_update_rate: 1e-3
moe_permute_fusion: false
# gives ~20% training perf speedup with sequence packing
apply_rope_fusion: True
# gives ~25% training perf speedup with sequence packing and apply_rope_fusion
bias_activation_fusion: True
defer_fp32_logits: null

optimizer:
optimizer: "adam"
lr: 5.0e-6
min_lr: 4.9999e-6
weight_decay: 0.1
bf16: false
fp16: false
params_dtype: "float32"

#adam
adam_beta1: 0.9
adam_beta2: 0.98
adam_eps: 1e-5

#sgd
sgd_momentum: 0.9

#distributed optimizer
use_distributed_optimizer: true
use_precision_aware_optimizer: true

clip_grad: ${policy.max_grad_norm}

# optimizer cpu offload
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0

scheduler:
start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
weight_decay_incr_style: "constant"
lr_decay_style: "constant"
lr_decay_iters: 1000
lr_warmup_iters: 50
lr_warmup_init: 4.9999e-6

distributed_data_parallel_config:
grad_reduce_in_fp32: false
overlap_grad_reduce: true
overlap_param_gather: true
average_in_collective: true
data_parallel_sharding_strategy: "optim_grads_params"
use_custom_fsdp: false

data:
max_input_seq_length: ${policy.max_total_sequence_length}
add_bos: true
add_eos: true
add_generation_prompt: false
shuffle: true
num_workers: 1

dataset_name: "squad"
# You can use custom response datasets for training and validation. For example:
# data:
# dataset_name: ResponseDataset
# train_data_path: <PathToTrainingDataset> # e.g., /path/to/local/dataset.jsonl or hf_org/hf_dataset_name (HuggingFace)
# val_data_path: <PathToValidationDataset>
# input_key: <QuestionKey>, default is "input"
# output_key: <AnswerKey>, default is "output"
# train_split: <TrainSplit>, default is None # used for HuggingFace datasets
# val_split: <ValSplit>, default is None # used for HuggingFace datasets
# See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/sft.md#datasets for more details.
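# With the default keys, a record in a local .jsonl file would look like (illustrative example):
#   {"input": "Who wrote Hamlet?", "output": "William Shakespeare"}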

## unused with squad dataset
prompt_file: null
split: null
output_key: null
seed: null


## OpenAI format specific configs
# train_data_path: "/path/to/train.jsonl" # Path to training data
# val_data_path: "/path/to/val.jsonl" # Path to validation data
# chat_key: "messages" # Key for messages in the data
# system_key: null # Key for system message (optional)
# system_prompt: null # Default system prompt (optional)
# tool_key: "tools" # Key for tools in the data
# use_preserving_dataset: false # If true, uses PreservingDataset to preserve heterogeneous schemas (e.g., tool calls with varying argument structures)
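# With the defaults above, each .jsonl record is an OpenAI-style chat (illustrative example):
#   {"messages": [{"role": "user", "content": "2+2?"}, {"role": "assistant", "content": "4"}]}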

logger:
log_dir: "logs" # Base directory for all logs
wandb_enabled: true # Make sure you run "wandb login [Your API key]" before launching training
tensorboard_enabled: true
mlflow_enabled: false
swanlab_enabled: false # Disable SwanLab logging
monitor_gpus: true # If true, will monitor GPU usage and log to wandb and/or tensorboard
wandb:
project: "hemild-rl-automodel"
name: "${policy.model_name}-${data.dataset_name}-attn-${policy.automodel_model_kwargs.backend.attn}-ep${policy.dtensor_cfg.expert_parallel_size}"
tensorboard:
log_dir: "tb_logs-sft-dev-${data.dataset_name}"
mlflow:
experiment_name: "sft-dev"
run_name: "sft-dev-${data.dataset_name}"
gpu_monitoring:
collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)

cluster:
gpus_per_node: 8
num_nodes: 1
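
To smoke-test this config, the repo's usual SFT entry point should work (a minimal sketch, assuming examples/run_sft.py takes --config plus dotted overrides like the other SFT configs):

uv run examples/run_sft.py --config examples/configs/sft_automodel.yaml logger.wandb_enabled=false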