From 108bb7ead945645cbecc5bec91038b11d0a75d84 Mon Sep 17 00:00:00 2001 From: Shenwei Hu Date: Thu, 6 Nov 2025 22:40:08 +0800 Subject: [PATCH 1/3] Add Qwen3MoE CI Config --- examples/config/dpo/qwen3moe.yaml | 57 +++++++++++++++ .../config/sft/lora-qwen3moe-alltoall.yaml | 72 +++++++++++++++++++ examples/config/sft/lora-qwen3moe.yaml | 71 ++++++++++++++++++ examples/config/sft/qwen3moe-alltoall.yaml | 69 ++++++++++++++++++ examples/config/sft/qwen3moe.yaml | 66 +++++++++++++++++ 5 files changed, 335 insertions(+) create mode 100644 examples/config/dpo/qwen3moe.yaml create mode 100644 examples/config/sft/lora-qwen3moe-alltoall.yaml create mode 100644 examples/config/sft/lora-qwen3moe.yaml create mode 100644 examples/config/sft/qwen3moe-alltoall.yaml create mode 100644 examples/config/sft/qwen3moe.yaml diff --git a/examples/config/dpo/qwen3moe.yaml b/examples/config/dpo/qwen3moe.yaml new file mode 100644 index 00000000000..f708aa4f1c6 --- /dev/null +++ b/examples/config/dpo/qwen3moe.yaml @@ -0,0 +1,57 @@ +### data +train_dataset_type: erniekit +eval_dataset_type: erniekit +train_dataset_path: data-dpo/train.jsonl +train_dataset_prob: "1.0" +eval_dataset_path: data-dpo/dev.jsonl +eval_dataset_prob: "1.0" +max_seq_len: 8192 +num_samples_each_epoch: 6000000 +packing: true +mix_strategy: concat + +### model +model_name_or_path: Qwen/Qwen3-30B-A3B +attn_impl: flashmask + +### finetuning +# base +stage: DPO +fine_tuning: full +seed: 23 +do_train: true +do_eval: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +num_train_epochs: 1 +max_steps: 100 +eval_steps: 100 +evaluation_strategy: steps +save_steps: 100 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 4 +logging_dir: ./vdl_log +output_dir: ./checkpoints/qwen3_hf_0p6b_dpo_ckpts_parallel +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +warmup_steps: 20 +learning_rate: 1.0e-6 + +# performance +tensor_parallel_degree: 2 +pipeline_parallel_degree: 2 +expert_parallel_degree: 4 +use_expert_parallel: true +pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv +sequence_parallel: true +sharding: stage1 +recompute: true +bf16: true +fp16_opt_level: O2 +unified_checkpoint: true + +sharding_parallel_config: "split_param" +amp_master_grad: true \ No newline at end of file diff --git a/examples/config/sft/lora-qwen3moe-alltoall.yaml b/examples/config/sft/lora-qwen3moe-alltoall.yaml new file mode 100644 index 00000000000..612748dcb35 --- /dev/null +++ b/examples/config/sft/lora-qwen3moe-alltoall.yaml @@ -0,0 +1,72 @@ +### data +train_dataset_type: erniekit +eval_dataset_type: erniekit +train_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/train.json +train_dataset_prob: "1.0" +eval_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/dev.json +eval_dataset_prob: "1.0" +max_seq_len: 4096 +num_samples_each_epoch: 6000000 +packing: true +mix_strategy: concat + + +### model +model_name_or_path: /root/paddlejob/gpfs/hushenwei/huggingface-model/huggingface/Qwen/Qwen3-30B-A3B +attn_impl: flashmask +lora: true +lora_rank: 8 + +### finetuning +# base +stage: SFT +fine_tuning: lora +seed: 23 +do_train: true +do_eval: false +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +num_train_epochs: 5 +max_steps: 50 +eval_steps: 100 +evaluation_strategy: steps +save_steps: 10000 +save_total_limit: 1 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 4 +logging_dir: ./vdl_log 
+output_dir: ./checkpoints/qwen3_hf_30b_sft_lora_ckpts_parallel +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +warmup_steps: 20 +learning_rate: 1.0e-4 + +# performance +tensor_parallel_degree: 2 +expert_parallel_degree: 4 +pipeline_parallel_degree: 2 +sequence_parallel: true +use_expert_parallel: true +sharding: stage1 +#3 offload +# offload: true, +offload_optim: false +use_fused_head_and_loss_fn: true +# use_filtered_label_loss: true +optim: adamw_custom +tensorwise_offload_optimizer: true +recompute: true +# recompute_use_reentrant: true +# unified_checkpoint_config: ignore_merge_optimizer +bf16: true +fp16_opt_level: O2 +unified_checkpoint: true + +sharding_parallel_config: "split_param" +amp_master_grad: true + +ep_communication_type: "alltoall" +use_unified_moe: true diff --git a/examples/config/sft/lora-qwen3moe.yaml b/examples/config/sft/lora-qwen3moe.yaml new file mode 100644 index 00000000000..804190be740 --- /dev/null +++ b/examples/config/sft/lora-qwen3moe.yaml @@ -0,0 +1,71 @@ +### data +train_dataset_type: erniekit +eval_dataset_type: erniekit +train_dataset_path: data-sft/train_gsm8k.json +train_dataset_prob: "1.0" +eval_dataset_path: data-sft/test_gsm8k.json +eval_dataset_prob: "1.0" +max_seq_len: 4096 +num_samples_each_epoch: 6000000 +packing: true +mix_strategy: concat + + +### model +model_name_or_path: Qwen/Qwen3-30B-A3B +attn_impl: flashmask +lora: true +lora_rank: 8 + +### finetuning +# base +stage: SFT +fine_tuning: lora +seed: 23 +do_train: true +do_eval: false +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +num_train_epochs: 5 +max_steps: 100 +eval_steps: 100 +evaluation_strategy: steps +save_steps: 10000 +save_total_limit: 1 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 4 +logging_dir: ./vdl_log +output_dir: ./checkpoints/qwen3_hf_30b_sft_lora_ckpts_parallel +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +warmup_steps: 20 +learning_rate: 1.0e-4 + +# performance +tensor_parallel_degree: 2 +expert_parallel_degree: 4 +pipeline_parallel_degree: 2 +sequence_parallel: true +use_expert_parallel: true +sharding: stage1 +#3 offload +# offload: true, +offload_optim: false +use_fused_head_and_loss_fn: true +# use_filtered_label_loss: true +optim: adamw_custom +tensorwise_offload_optimizer: true +recompute: true +# recompute_use_reentrant: true +# unified_checkpoint_config: ignore_merge_optimizer +bf16: true +fp16_opt_level: O2 +unified_checkpoint: true + +sharding_parallel_config: "split_param" +amp_master_grad: true + +ep_communication_type: "alltoall" diff --git a/examples/config/sft/qwen3moe-alltoall.yaml b/examples/config/sft/qwen3moe-alltoall.yaml new file mode 100644 index 00000000000..bf81fd462f9 --- /dev/null +++ b/examples/config/sft/qwen3moe-alltoall.yaml @@ -0,0 +1,69 @@ +### data +train_dataset_type: erniekit +eval_dataset_type: erniekit +train_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/train.json +train_dataset_prob: "1.0" +eval_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/dev.json +eval_dataset_prob: "1.0" +max_seq_len: 4096 +packing: true +mix_strategy: concat + + +### model +model_name_or_path: /root/paddlejob/gpfs/hushenwei/huggingface-model/huggingface/Qwen/Qwen3-30B-A3B +attn_impl: flashmask + +### finetuning +# base +stage: SFT +fine_tuning: full +seed: 23 +do_train: true +do_eval: false +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +num_train_epochs: 5 +max_steps: 50 
+eval_steps: 100 +evaluation_strategy: steps +save_steps: 10000 +save_total_limit: 1 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 4 +logging_dir: ./vdl_log +output_dir: ./checkpoints/qwen3_hf_30b_sft_ckpts_parallel +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +warmup_steps: 20 +learning_rate: 1.0e-5 + +# performance +tensor_parallel_degree: 2 +expert_parallel_degree: 4 +pipeline_parallel_degree: 2 +sequence_parallel: true +use_expert_parallel: true +sharding: stage1 +#3 offload +# offload: true, +offload_optim: false +use_fused_head_and_loss_fn: true +# use_filtered_label_loss: true +optim: adamw_custom +tensorwise_offload_optimizer: true +recompute: true +# recompute_use_reentrant: true +# unified_checkpoint_config: ignore_merge_optimizer +bf16: true +fp16_opt_level: O2 +unified_checkpoint: true + +sharding_parallel_config: "split_param" +amp_master_grad: true + +use_unified_moe: true +ep_communication_type: "alltoall" \ No newline at end of file diff --git a/examples/config/sft/qwen3moe.yaml b/examples/config/sft/qwen3moe.yaml new file mode 100644 index 00000000000..a68b02bba64 --- /dev/null +++ b/examples/config/sft/qwen3moe.yaml @@ -0,0 +1,66 @@ +### data +train_dataset_type: erniekit +eval_dataset_type: erniekit +train_dataset_path: data-sft/train_gsm8k.json +train_dataset_prob: "1.0" +eval_dataset_path: data-sft/test_gsm8k.json +eval_dataset_prob: "1.0" +max_seq_len: 4096 +packing: true +mix_strategy: concat + + +### model +model_name_or_path: Qwen/Qwen3-30B-A3B +attn_impl: flashmask + +### finetuning +# base +stage: SFT +fine_tuning: full +seed: 23 +do_train: true +do_eval: false +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +num_train_epochs: 5 +max_steps: 100 +eval_steps: 100 +evaluation_strategy: steps +save_steps: 100 +save_total_limit: 1 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 4 +logging_dir: ./vdl_log +output_dir: ./checkpoints/qwen3_hf_30b_sft_ckpts_parallel +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +warmup_steps: 20 +learning_rate: 1.0e-5 + +# performance +tensor_parallel_degree: 2 +expert_parallel_degree: 4 +pipeline_parallel_degree: 2 +sequence_parallel: true +use_expert_parallel: true +sharding: stage1 +#3 offload +# offload: true, +offload_optim: false +use_fused_head_and_loss_fn: true +# use_filtered_label_loss: true +optim: adamw_custom +tensorwise_offload_optimizer: true +recompute: true +# recompute_use_reentrant: true +# unified_checkpoint_config: ignore_merge_optimizer +bf16: true +fp16_opt_level: O2 +unified_checkpoint: true + +sharding_parallel_config: "split_param" +amp_master_grad: true \ No newline at end of file From 8c726f6268a1823d9d7ff6edceb2014376717309 Mon Sep 17 00:00:00 2001 From: Shenwei Hu Date: Fri, 7 Nov 2025 18:12:51 +0800 Subject: [PATCH 2/3] fix config --- .../dpo/{qwen3moe.yaml => full_tp_pp_ep.yaml} | 6 +- .../sft/{qwen3moe.yaml => full_tp_pp_ep.yaml} | 21 +++--- .../config/sft/lora-qwen3moe-alltoall.yaml | 72 ------------------- ...{lora-qwen3moe.yaml => lora_tp_pp_ep.yaml} | 22 ++---- examples/config/sft/qwen3moe-alltoall.yaml | 69 ------------------ 5 files changed, 18 insertions(+), 172 deletions(-) rename examples/config/dpo/{qwen3moe.yaml => full_tp_pp_ep.yaml} (92%) rename examples/config/sft/{qwen3moe.yaml => full_tp_pp_ep.yaml} (78%) delete mode 100644 examples/config/sft/lora-qwen3moe-alltoall.yaml rename examples/config/sft/{lora-qwen3moe.yaml => lora_tp_pp_ep.yaml} (75%) delete mode 100644 
examples/config/sft/qwen3moe-alltoall.yaml diff --git a/examples/config/dpo/qwen3moe.yaml b/examples/config/dpo/full_tp_pp_ep.yaml similarity index 92% rename from examples/config/dpo/qwen3moe.yaml rename to examples/config/dpo/full_tp_pp_ep.yaml index f708aa4f1c6..b1abf05eda4 100644 --- a/examples/config/dpo/qwen3moe.yaml +++ b/examples/config/dpo/full_tp_pp_ep.yaml @@ -1,9 +1,9 @@ ### data train_dataset_type: erniekit eval_dataset_type: erniekit -train_dataset_path: data-dpo/train.jsonl +train_dataset_path: ./data/dpo/train.jsonl train_dataset_prob: "1.0" -eval_dataset_path: data-dpo/dev.jsonl +eval_dataset_path: ./data/dpo/dev.jsonl eval_dataset_prob: "1.0" max_seq_len: 8192 num_samples_each_epoch: 6000000 @@ -24,7 +24,7 @@ do_eval: true per_device_eval_batch_size: 1 per_device_train_batch_size: 1 num_train_epochs: 1 -max_steps: 100 +max_steps: -1 eval_steps: 100 evaluation_strategy: steps save_steps: 100 diff --git a/examples/config/sft/qwen3moe.yaml b/examples/config/sft/full_tp_pp_ep.yaml similarity index 78% rename from examples/config/sft/qwen3moe.yaml rename to examples/config/sft/full_tp_pp_ep.yaml index a68b02bba64..5189df9c281 100644 --- a/examples/config/sft/qwen3moe.yaml +++ b/examples/config/sft/full_tp_pp_ep.yaml @@ -1,15 +1,14 @@ ### data train_dataset_type: erniekit eval_dataset_type: erniekit -train_dataset_path: data-sft/train_gsm8k.json +train_dataset_path: ./data/sft/train.json train_dataset_prob: "1.0" -eval_dataset_path: data-sft/test_gsm8k.json +eval_dataset_path: ./data/sft/dev.json eval_dataset_prob: "1.0" -max_seq_len: 4096 +max_seq_len: 8192 packing: true mix_strategy: concat - ### model model_name_or_path: Qwen/Qwen3-30B-A3B attn_impl: flashmask @@ -23,8 +22,8 @@ do_train: true do_eval: false per_device_eval_batch_size: 1 per_device_train_batch_size: 1 -num_train_epochs: 5 -max_steps: 100 +num_train_epochs: 1 +max_steps: -1 eval_steps: 100 evaluation_strategy: steps save_steps: 100 @@ -48,12 +47,6 @@ pipeline_parallel_degree: 2 sequence_parallel: true use_expert_parallel: true sharding: stage1 -#3 offload -# offload: true, -offload_optim: false -use_fused_head_and_loss_fn: true -# use_filtered_label_loss: true -optim: adamw_custom tensorwise_offload_optimizer: true recompute: true # recompute_use_reentrant: true @@ -63,4 +56,6 @@ fp16_opt_level: O2 unified_checkpoint: true sharding_parallel_config: "split_param" -amp_master_grad: true \ No newline at end of file +amp_master_grad: true + +ep_communication_type: "alltoall" # choices: ["deepep", "alltoall"], "deepep" only for Hooper GPU diff --git a/examples/config/sft/lora-qwen3moe-alltoall.yaml b/examples/config/sft/lora-qwen3moe-alltoall.yaml deleted file mode 100644 index 612748dcb35..00000000000 --- a/examples/config/sft/lora-qwen3moe-alltoall.yaml +++ /dev/null @@ -1,72 +0,0 @@ -### data -train_dataset_type: erniekit -eval_dataset_type: erniekit -train_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/train.json -train_dataset_prob: "1.0" -eval_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/dev.json -eval_dataset_prob: "1.0" -max_seq_len: 4096 -num_samples_each_epoch: 6000000 -packing: true -mix_strategy: concat - - -### model -model_name_or_path: /root/paddlejob/gpfs/hushenwei/huggingface-model/huggingface/Qwen/Qwen3-30B-A3B -attn_impl: flashmask -lora: true -lora_rank: 8 - -### finetuning -# base -stage: SFT -fine_tuning: lora -seed: 23 -do_train: true -do_eval: false -per_device_eval_batch_size: 1 
-per_device_train_batch_size: 1 -num_train_epochs: 5 -max_steps: 50 -eval_steps: 100 -evaluation_strategy: steps -save_steps: 10000 -save_total_limit: 1 -save_strategy: steps -logging_steps: 1 -gradient_accumulation_steps: 4 -logging_dir: ./vdl_log -output_dir: ./checkpoints/qwen3_hf_30b_sft_lora_ckpts_parallel -disable_tqdm: true -eval_accumulation_steps: 16 - -# train -warmup_steps: 20 -learning_rate: 1.0e-4 - -# performance -tensor_parallel_degree: 2 -expert_parallel_degree: 4 -pipeline_parallel_degree: 2 -sequence_parallel: true -use_expert_parallel: true -sharding: stage1 -#3 offload -# offload: true, -offload_optim: false -use_fused_head_and_loss_fn: true -# use_filtered_label_loss: true -optim: adamw_custom -tensorwise_offload_optimizer: true -recompute: true -# recompute_use_reentrant: true -# unified_checkpoint_config: ignore_merge_optimizer -bf16: true -fp16_opt_level: O2 -unified_checkpoint: true - -sharding_parallel_config: "split_param" -amp_master_grad: true - -ep_communication_type: "alltoall" -use_unified_moe: true diff --git a/examples/config/sft/lora-qwen3moe.yaml b/examples/config/sft/lora_tp_pp_ep.yaml similarity index 75% rename from examples/config/sft/lora-qwen3moe.yaml rename to examples/config/sft/lora_tp_pp_ep.yaml index 804190be740..dce5d38662c 100644 --- a/examples/config/sft/lora-qwen3moe.yaml +++ b/examples/config/sft/lora_tp_pp_ep.yaml @@ -1,16 +1,14 @@ ### data train_dataset_type: erniekit eval_dataset_type: erniekit -train_dataset_path: data-sft/train_gsm8k.json +train_dataset_path: ./data/sft/train.json train_dataset_prob: "1.0" -eval_dataset_path: data-sft/test_gsm8k.json +eval_dataset_path: ./data/sft/dev.json eval_dataset_prob: "1.0" -max_seq_len: 4096 -num_samples_each_epoch: 6000000 +max_seq_len: 8192 packing: true mix_strategy: concat - ### model model_name_or_path: Qwen/Qwen3-30B-A3B attn_impl: flashmask @@ -26,11 +24,11 @@ do_train: true do_eval: false per_device_eval_batch_size: 1 per_device_train_batch_size: 1 -num_train_epochs: 5 -max_steps: 100 +num_train_epochs: 1 +max_steps: -1 eval_steps: 100 evaluation_strategy: steps -save_steps: 10000 +save_steps: 100 save_total_limit: 1 save_strategy: steps logging_steps: 1 @@ -51,12 +49,6 @@ pipeline_parallel_degree: 2 sequence_parallel: true use_expert_parallel: true sharding: stage1 -#3 offload -# offload: true, -offload_optim: false -use_fused_head_and_loss_fn: true -# use_filtered_label_loss: true -optim: adamw_custom tensorwise_offload_optimizer: true recompute: true # recompute_use_reentrant: true @@ -68,4 +60,4 @@ unified_checkpoint: true sharding_parallel_config: "split_param" amp_master_grad: true -ep_communication_type: "alltoall" +ep_communication_type: "alltoall" # choices: ["deepep", "alltoall"], "deepep" only for Hooper GPU diff --git a/examples/config/sft/qwen3moe-alltoall.yaml b/examples/config/sft/qwen3moe-alltoall.yaml deleted file mode 100644 index bf81fd462f9..00000000000 --- a/examples/config/sft/qwen3moe-alltoall.yaml +++ /dev/null @@ -1,69 +0,0 @@ -### data -train_dataset_type: erniekit -eval_dataset_type: erniekit -train_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/train.json -train_dataset_prob: "1.0" -eval_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/dev.json -eval_dataset_prob: "1.0" -max_seq_len: 4096 -packing: true -mix_strategy: concat - - -### model -model_name_or_path: /root/paddlejob/gpfs/hushenwei/huggingface-model/huggingface/Qwen/Qwen3-30B-A3B -attn_impl: flashmask - 
-### finetuning -# base -stage: SFT -fine_tuning: full -seed: 23 -do_train: true -do_eval: false -per_device_eval_batch_size: 1 -per_device_train_batch_size: 1 -num_train_epochs: 5 -max_steps: 50 -eval_steps: 100 -evaluation_strategy: steps -save_steps: 10000 -save_total_limit: 1 -save_strategy: steps -logging_steps: 1 -gradient_accumulation_steps: 4 -logging_dir: ./vdl_log -output_dir: ./checkpoints/qwen3_hf_30b_sft_ckpts_parallel -disable_tqdm: true -eval_accumulation_steps: 16 - -# train -warmup_steps: 20 -learning_rate: 1.0e-5 - -# performance -tensor_parallel_degree: 2 -expert_parallel_degree: 4 -pipeline_parallel_degree: 2 -sequence_parallel: true -use_expert_parallel: true -sharding: stage1 -#3 offload -# offload: true, -offload_optim: false -use_fused_head_and_loss_fn: true -# use_filtered_label_loss: true -optim: adamw_custom -tensorwise_offload_optimizer: true -recompute: true -# recompute_use_reentrant: true -# unified_checkpoint_config: ignore_merge_optimizer -bf16: true -fp16_opt_level: O2 -unified_checkpoint: true - -sharding_parallel_config: "split_param" -amp_master_grad: true - -use_unified_moe: true -ep_communication_type: "alltoall" \ No newline at end of file From 5e5f70e2744f0ca643eadb5b03a9beae6e0cd16f Mon Sep 17 00:00:00 2001 From: Shenwei Hu Date: Fri, 7 Nov 2025 19:52:45 +0800 Subject: [PATCH 3/3] fix config --- examples/config/dpo/full_tp_pp_ep.yaml | 4 +- examples/config/dpo/lora_tp_pp_ep.yaml | 57 ++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 examples/config/dpo/lora_tp_pp_ep.yaml diff --git a/examples/config/dpo/full_tp_pp_ep.yaml b/examples/config/dpo/full_tp_pp_ep.yaml index b1abf05eda4..81d8bb863c6 100644 --- a/examples/config/dpo/full_tp_pp_ep.yaml +++ b/examples/config/dpo/full_tp_pp_ep.yaml @@ -54,4 +54,6 @@ fp16_opt_level: O2 unified_checkpoint: true sharding_parallel_config: "split_param" -amp_master_grad: true \ No newline at end of file +amp_master_grad: true + +ep_communication_type: "alltoall" # choices: ["deepep", "alltoall"], "deepep" only for Hooper GPU diff --git a/examples/config/dpo/lora_tp_pp_ep.yaml b/examples/config/dpo/lora_tp_pp_ep.yaml new file mode 100644 index 00000000000..85426a4eeb9 --- /dev/null +++ b/examples/config/dpo/lora_tp_pp_ep.yaml @@ -0,0 +1,57 @@ +### data +train_dataset_type: erniekit +eval_dataset_type: erniekit +train_dataset_path: ./data/dpo/train.jsonl +train_dataset_prob: "1.0" +eval_dataset_path: ./data/dpo/dev.jsonl +eval_dataset_prob: "1.0" +max_seq_len: 8192 +packing: true +mix_strategy: concat + +### model +model_name_or_path: Qwen/Qwen3-30B-A3B +attn_impl: flashmask +lora: true +lora_rank: 8 + +### finetuning +# base +stage: DPO +fine_tuning: lora +seed: 23 +do_train: true +do_eval: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +num_train_epochs: 1 +max_steps: -1 +eval_steps: 100 +evaluation_strategy: steps +save_steps: 100 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 4 +logging_dir: ./vdl_log +output_dir: ./checkpoints/qwen3_hf_0p6b_dpo_lora_ckpts_parallel +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +warmup_steps: 20 +learning_rate: 1.0e-5 + +# performance +tensor_parallel_degree: 4 +pipeline_parallel_degree: 2 +expert_parallel_degree: 4 +use_expert_parallel: true +pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv +sequence_parallel: true +sharding: stage1 +recompute: true +bf16: true +fp16_opt_level: O2 +unified_checkpoint: true + +ep_communication_type: "alltoall" # 
choices: ["deepep", "alltoall"], "deepep" only for Hooper GPU