diff --git a/examples/config/dpo/full_tp_pp_ep.yaml b/examples/config/dpo/full_tp_pp_ep.yaml
new file mode 100644
index 00000000000..81d8bb863c6
--- /dev/null
+++ b/examples/config/dpo/full_tp_pp_ep.yaml
@@ -0,0 +1,59 @@
+### data
+train_dataset_type: erniekit
+eval_dataset_type: erniekit
+train_dataset_path: ./data/dpo/train.jsonl
+train_dataset_prob: "1.0"
+eval_dataset_path: ./data/dpo/dev.jsonl
+eval_dataset_prob: "1.0"
+max_seq_len: 8192
+num_samples_each_epoch: 6000000
+packing: true
+mix_strategy: concat
+
+### model
+model_name_or_path: Qwen/Qwen3-30B-A3B
+attn_impl: flashmask
+
+### finetuning
+# base
+stage: DPO
+fine_tuning: full
+seed: 23
+do_train: true
+do_eval: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: -1
+eval_steps: 100
+evaluation_strategy: steps
+save_steps: 100
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 4
+logging_dir: ./vdl_log
+output_dir: ./checkpoints/qwen3_hf_30b_dpo_ckpts_parallel
+disable_tqdm: true
+eval_accumulation_steps: 16
+
+# train
+warmup_steps: 20
+learning_rate: 1.0e-6
+
+# performance
+tensor_parallel_degree: 2
+pipeline_parallel_degree: 2
+expert_parallel_degree: 4
+use_expert_parallel: true
+pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv
+sequence_parallel: true
+sharding: stage1
+recompute: true
+bf16: true
+fp16_opt_level: O2
+unified_checkpoint: true
+
+sharding_parallel_config: "split_param"
+amp_master_grad: true
+
+ep_communication_type: "alltoall" # choices: ["deepep", "alltoall"]; "deepep" is only supported on Hopper GPUs
diff --git a/examples/config/dpo/lora_tp_pp_ep.yaml b/examples/config/dpo/lora_tp_pp_ep.yaml
new file mode 100644
index 00000000000..85426a4eeb9
--- /dev/null
+++ b/examples/config/dpo/lora_tp_pp_ep.yaml
@@ -0,0 +1,57 @@
+### data
+train_dataset_type: erniekit
+eval_dataset_type: erniekit
+train_dataset_path: ./data/dpo/train.jsonl
+train_dataset_prob: "1.0"
+eval_dataset_path: ./data/dpo/dev.jsonl
+eval_dataset_prob: "1.0"
+max_seq_len: 8192
+packing: true
+mix_strategy: concat
+
+### model
+model_name_or_path: Qwen/Qwen3-30B-A3B
+attn_impl: flashmask
+lora: true
+lora_rank: 8
+
+### finetuning
+# base
+stage: DPO
+fine_tuning: lora
+seed: 23
+do_train: true
+do_eval: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: -1
+eval_steps: 100
+evaluation_strategy: steps
+save_steps: 100
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 4
+logging_dir: ./vdl_log
+output_dir: ./checkpoints/qwen3_hf_30b_dpo_lora_ckpts_parallel
+disable_tqdm: true
+eval_accumulation_steps: 16
+
+# train
+warmup_steps: 20
+learning_rate: 1.0e-5
+
+# performance
+tensor_parallel_degree: 4
+pipeline_parallel_degree: 2
+expert_parallel_degree: 4
+use_expert_parallel: true
+pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv
+sequence_parallel: true
+sharding: stage1
+recompute: true
+bf16: true
+fp16_opt_level: O2
+unified_checkpoint: true
+
+ep_communication_type: "alltoall" # choices: ["deepep", "alltoall"]; "deepep" is only supported on Hopper GPUs
diff --git a/examples/config/sft/full_tp_pp_ep.yaml b/examples/config/sft/full_tp_pp_ep.yaml
new file mode 100644
index 00000000000..5189df9c281
--- /dev/null
+++ b/examples/config/sft/full_tp_pp_ep.yaml
@@ -0,0 +1,61 @@
+### data
+train_dataset_type: erniekit
+eval_dataset_type: erniekit
+train_dataset_path: ./data/sft/train.json
+train_dataset_prob: "1.0"
+eval_dataset_path: ./data/sft/dev.json
+eval_dataset_prob: "1.0"
+max_seq_len: 8192
+packing: true
+mix_strategy: concat
+
+### model
+model_name_or_path: Qwen/Qwen3-30B-A3B
+attn_impl: flashmask
+
+### finetuning
+# base
+stage: SFT
+fine_tuning: full
+seed: 23
+do_train: true
+do_eval: false
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: -1
+eval_steps: 100
+evaluation_strategy: steps
+save_steps: 100
+save_total_limit: 1
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 4
+logging_dir: ./vdl_log
+output_dir: ./checkpoints/qwen3_hf_30b_sft_ckpts_parallel
+disable_tqdm: true
+eval_accumulation_steps: 16
+
+# train
+warmup_steps: 20
+learning_rate: 1.0e-5
+
+# performance
+tensor_parallel_degree: 2
+expert_parallel_degree: 4
+pipeline_parallel_degree: 2
+sequence_parallel: true
+use_expert_parallel: true
+sharding: stage1
+tensorwise_offload_optimizer: true
+recompute: true
+# recompute_use_reentrant: true
+# unified_checkpoint_config: ignore_merge_optimizer
+bf16: true
+fp16_opt_level: O2
+unified_checkpoint: true
+
+sharding_parallel_config: "split_param"
+amp_master_grad: true
+
+ep_communication_type: "alltoall" # choices: ["deepep", "alltoall"]; "deepep" is only supported on Hopper GPUs
diff --git a/examples/config/sft/lora_tp_pp_ep.yaml b/examples/config/sft/lora_tp_pp_ep.yaml
new file mode 100644
index 00000000000..dce5d38662c
--- /dev/null
+++ b/examples/config/sft/lora_tp_pp_ep.yaml
@@ -0,0 +1,63 @@
+### data
+train_dataset_type: erniekit
+eval_dataset_type: erniekit
+train_dataset_path: ./data/sft/train.json
+train_dataset_prob: "1.0"
+eval_dataset_path: ./data/sft/dev.json
+eval_dataset_prob: "1.0"
+max_seq_len: 8192
+packing: true
+mix_strategy: concat
+
+### model
+model_name_or_path: Qwen/Qwen3-30B-A3B
+attn_impl: flashmask
+lora: true
+lora_rank: 8
+
+### finetuning
+# base
+stage: SFT
+fine_tuning: lora
+seed: 23
+do_train: true
+do_eval: false
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: -1
+eval_steps: 100
+evaluation_strategy: steps
+save_steps: 100
+save_total_limit: 1
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 4
+logging_dir: ./vdl_log
+output_dir: ./checkpoints/qwen3_hf_30b_sft_lora_ckpts_parallel
+disable_tqdm: true
+eval_accumulation_steps: 16
+
+# train
+warmup_steps: 20
+learning_rate: 1.0e-4
+
+# performance
+tensor_parallel_degree: 2
+expert_parallel_degree: 4
+pipeline_parallel_degree: 2
+sequence_parallel: true
+use_expert_parallel: true
+sharding: stage1
+tensorwise_offload_optimizer: true
+recompute: true
+# recompute_use_reentrant: true
+# unified_checkpoint_config: ignore_merge_optimizer
+bf16: true
+fp16_opt_level: O2
+unified_checkpoint: true
+
+sharding_parallel_config: "split_param"
+amp_master_grad: true
+
+ep_communication_type: "alltoall" # choices: ["deepep", "alltoall"]; "deepep" is only supported on Hopper GPUs
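All four configs share the same hybrid-parallel layout: tensor, pipeline, and expert parallelism stacked on stage-1 sharding. The snippet below is a minimal sketch for sanity-checking an intended GPU count against those degrees; it assumes the common convention that world_size = tp * pp * dp and that expert-parallel groups are carved out of the dp/sharding ranks, which may differ from the framework's actual placement rule and is not part of this change.

"""Hedged consistency check for the parallelism degrees used in these configs.

Assumption (not taken from the framework): world_size = tp * pp * dp, and the
expert-parallel groups are formed across the dp/sharding ranks.
"""


def check_parallel_layout(world_size: int, tp: int, pp: int, ep: int) -> int:
    """Return the implied data-parallel (sharding) degree, or raise if the
    degrees cannot fit into ``world_size`` under the assumed layout."""
    if world_size % (tp * pp) != 0:
        raise ValueError(
            f"world_size={world_size} is not divisible by tp*pp={tp * pp}"
        )
    dp = world_size // (tp * pp)
    # Assumption: expert_parallel_degree must divide the remaining dp degree.
    if dp % ep != 0:
        raise ValueError(
            f"expert_parallel_degree={ep} does not divide the data-parallel "
            f"degree {dp} left over after tensor and pipeline parallelism"
        )
    return dp


if __name__ == "__main__":
    # Degrees from examples/config/dpo/full_tp_pp_ep.yaml: tp=2, pp=2, ep=4.
    # A 16-GPU job (e.g. 2 nodes x 8 GPUs) leaves dp=4, which ep=4 divides.
    print(check_parallel_layout(world_size=16, tp=2, pp=2, ep=4))  # -> 4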