From 108bb7ead945645cbecc5bec91038b11d0a75d84 Mon Sep 17 00:00:00 2001 From: Shenwei Hu Date: Thu, 6 Nov 2025 22:40:08 +0800 Subject: [PATCH 1/3] Add Qwen3MoE CI Config --- examples/config/dpo/qwen3moe.yaml | 57 +++++++++++++++ .../config/sft/lora-qwen3moe-alltoall.yaml | 72 +++++++++++++++++++ examples/config/sft/lora-qwen3moe.yaml | 71 ++++++++++++++++++ examples/config/sft/qwen3moe-alltoall.yaml | 69 ++++++++++++++++++ examples/config/sft/qwen3moe.yaml | 66 +++++++++++++++++ 5 files changed, 335 insertions(+) create mode 100644 examples/config/dpo/qwen3moe.yaml create mode 100644 examples/config/sft/lora-qwen3moe-alltoall.yaml create mode 100644 examples/config/sft/lora-qwen3moe.yaml create mode 100644 examples/config/sft/qwen3moe-alltoall.yaml create mode 100644 examples/config/sft/qwen3moe.yaml diff --git a/examples/config/dpo/qwen3moe.yaml b/examples/config/dpo/qwen3moe.yaml new file mode 100644 index 00000000000..f708aa4f1c6 --- /dev/null +++ b/examples/config/dpo/qwen3moe.yaml @@ -0,0 +1,57 @@ +### data +train_dataset_type: erniekit +eval_dataset_type: erniekit +train_dataset_path: data-dpo/train.jsonl +train_dataset_prob: "1.0" +eval_dataset_path: data-dpo/dev.jsonl +eval_dataset_prob: "1.0" +max_seq_len: 8192 +num_samples_each_epoch: 6000000 +packing: true +mix_strategy: concat + +### model +model_name_or_path: Qwen/Qwen3-30B-A3B +attn_impl: flashmask + +### finetuning +# base +stage: DPO +fine_tuning: full +seed: 23 +do_train: true +do_eval: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +num_train_epochs: 1 +max_steps: 100 +eval_steps: 100 +evaluation_strategy: steps +save_steps: 100 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 4 +logging_dir: ./vdl_log +output_dir: ./checkpoints/qwen3_hf_0p6b_dpo_ckpts_parallel +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +warmup_steps: 20 +learning_rate: 1.0e-6 + +# performance +tensor_parallel_degree: 2 +pipeline_parallel_degree: 2 +expert_parallel_degree: 4 +use_expert_parallel: true +pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv +sequence_parallel: true +sharding: stage1 +recompute: true +bf16: true +fp16_opt_level: O2 +unified_checkpoint: true + +sharding_parallel_config: "split_param" +amp_master_grad: true \ No newline at end of file diff --git a/examples/config/sft/lora-qwen3moe-alltoall.yaml b/examples/config/sft/lora-qwen3moe-alltoall.yaml new file mode 100644 index 00000000000..612748dcb35 --- /dev/null +++ b/examples/config/sft/lora-qwen3moe-alltoall.yaml @@ -0,0 +1,72 @@ +### data +train_dataset_type: erniekit +eval_dataset_type: erniekit +train_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/train.json +train_dataset_prob: "1.0" +eval_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/dev.json +eval_dataset_prob: "1.0" +max_seq_len: 4096 +num_samples_each_epoch: 6000000 +packing: true +mix_strategy: concat + + +### model +model_name_or_path: /root/paddlejob/gpfs/hushenwei/huggingface-model/huggingface/Qwen/Qwen3-30B-A3B +attn_impl: flashmask +lora: true +lora_rank: 8 + +### finetuning +# base +stage: SFT +fine_tuning: lora +seed: 23 +do_train: true +do_eval: false +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +num_train_epochs: 5 +max_steps: 50 +eval_steps: 100 +evaluation_strategy: steps +save_steps: 10000 +save_total_limit: 1 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 4 +logging_dir: ./vdl_log 
+output_dir: ./checkpoints/qwen3_hf_30b_sft_lora_ckpts_parallel +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +warmup_steps: 20 +learning_rate: 1.0e-4 + +# performance +tensor_parallel_degree: 2 +expert_parallel_degree: 4 +pipeline_parallel_degree: 2 +sequence_parallel: true +use_expert_parallel: true +sharding: stage1 +#3 offload +# offload: true, +offload_optim: false +use_fused_head_and_loss_fn: true +# use_filtered_label_loss: true +optim: adamw_custom +tensorwise_offload_optimizer: true +recompute: true +# recompute_use_reentrant: true +# unified_checkpoint_config: ignore_merge_optimizer +bf16: true +fp16_opt_level: O2 +unified_checkpoint: true + +sharding_parallel_config: "split_param" +amp_master_grad: true + +ep_communication_type: "alltoall" +use_unified_moe: true diff --git a/examples/config/sft/lora-qwen3moe.yaml b/examples/config/sft/lora-qwen3moe.yaml new file mode 100644 index 00000000000..804190be740 --- /dev/null +++ b/examples/config/sft/lora-qwen3moe.yaml @@ -0,0 +1,71 @@ +### data +train_dataset_type: erniekit +eval_dataset_type: erniekit +train_dataset_path: data-sft/train_gsm8k.json +train_dataset_prob: "1.0" +eval_dataset_path: data-sft/test_gsm8k.json +eval_dataset_prob: "1.0" +max_seq_len: 4096 +num_samples_each_epoch: 6000000 +packing: true +mix_strategy: concat + + +### model +model_name_or_path: Qwen/Qwen3-30B-A3B +attn_impl: flashmask +lora: true +lora_rank: 8 + +### finetuning +# base +stage: SFT +fine_tuning: lora +seed: 23 +do_train: true +do_eval: false +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +num_train_epochs: 5 +max_steps: 100 +eval_steps: 100 +evaluation_strategy: steps +save_steps: 10000 +save_total_limit: 1 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 4 +logging_dir: ./vdl_log +output_dir: ./checkpoints/qwen3_hf_30b_sft_lora_ckpts_parallel +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +warmup_steps: 20 +learning_rate: 1.0e-4 + +# performance +tensor_parallel_degree: 2 +expert_parallel_degree: 4 +pipeline_parallel_degree: 2 +sequence_parallel: true +use_expert_parallel: true +sharding: stage1 +#3 offload +# offload: true, +offload_optim: false +use_fused_head_and_loss_fn: true +# use_filtered_label_loss: true +optim: adamw_custom +tensorwise_offload_optimizer: true +recompute: true +# recompute_use_reentrant: true +# unified_checkpoint_config: ignore_merge_optimizer +bf16: true +fp16_opt_level: O2 +unified_checkpoint: true + +sharding_parallel_config: "split_param" +amp_master_grad: true + +ep_communication_type: "alltoall" diff --git a/examples/config/sft/qwen3moe-alltoall.yaml b/examples/config/sft/qwen3moe-alltoall.yaml new file mode 100644 index 00000000000..bf81fd462f9 --- /dev/null +++ b/examples/config/sft/qwen3moe-alltoall.yaml @@ -0,0 +1,69 @@ +### data +train_dataset_type: erniekit +eval_dataset_type: erniekit +train_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/train.json +train_dataset_prob: "1.0" +eval_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/dev.json +eval_dataset_prob: "1.0" +max_seq_len: 4096 +packing: true +mix_strategy: concat + + +### model +model_name_or_path: /root/paddlejob/gpfs/hushenwei/huggingface-model/huggingface/Qwen/Qwen3-30B-A3B +attn_impl: flashmask + +### finetuning +# base +stage: SFT +fine_tuning: full +seed: 23 +do_train: true +do_eval: false +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +num_train_epochs: 5 +max_steps: 50 
+eval_steps: 100 +evaluation_strategy: steps +save_steps: 10000 +save_total_limit: 1 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 4 +logging_dir: ./vdl_log +output_dir: ./checkpoints/qwen3_hf_30b_sft_ckpts_parallel +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +warmup_steps: 20 +learning_rate: 1.0e-5 + +# performance +tensor_parallel_degree: 2 +expert_parallel_degree: 4 +pipeline_parallel_degree: 2 +sequence_parallel: true +use_expert_parallel: true +sharding: stage1 +#3 offload +# offload: true, +offload_optim: false +use_fused_head_and_loss_fn: true +# use_filtered_label_loss: true +optim: adamw_custom +tensorwise_offload_optimizer: true +recompute: true +# recompute_use_reentrant: true +# unified_checkpoint_config: ignore_merge_optimizer +bf16: true +fp16_opt_level: O2 +unified_checkpoint: true + +sharding_parallel_config: "split_param" +amp_master_grad: true + +use_unified_moe: true +ep_communication_type: "alltoall" \ No newline at end of file diff --git a/examples/config/sft/qwen3moe.yaml b/examples/config/sft/qwen3moe.yaml new file mode 100644 index 00000000000..a68b02bba64 --- /dev/null +++ b/examples/config/sft/qwen3moe.yaml @@ -0,0 +1,66 @@ +### data +train_dataset_type: erniekit +eval_dataset_type: erniekit +train_dataset_path: data-sft/train_gsm8k.json +train_dataset_prob: "1.0" +eval_dataset_path: data-sft/test_gsm8k.json +eval_dataset_prob: "1.0" +max_seq_len: 4096 +packing: true +mix_strategy: concat + + +### model +model_name_or_path: Qwen/Qwen3-30B-A3B +attn_impl: flashmask + +### finetuning +# base +stage: SFT +fine_tuning: full +seed: 23 +do_train: true +do_eval: false +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +num_train_epochs: 5 +max_steps: 100 +eval_steps: 100 +evaluation_strategy: steps +save_steps: 100 +save_total_limit: 1 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 4 +logging_dir: ./vdl_log +output_dir: ./checkpoints/qwen3_hf_30b_sft_ckpts_parallel +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +warmup_steps: 20 +learning_rate: 1.0e-5 + +# performance +tensor_parallel_degree: 2 +expert_parallel_degree: 4 +pipeline_parallel_degree: 2 +sequence_parallel: true +use_expert_parallel: true +sharding: stage1 +#3 offload +# offload: true, +offload_optim: false +use_fused_head_and_loss_fn: true +# use_filtered_label_loss: true +optim: adamw_custom +tensorwise_offload_optimizer: true +recompute: true +# recompute_use_reentrant: true +# unified_checkpoint_config: ignore_merge_optimizer +bf16: true +fp16_opt_level: O2 +unified_checkpoint: true + +sharding_parallel_config: "split_param" +amp_master_grad: true \ No newline at end of file From 8c726f6268a1823d9d7ff6edceb2014376717309 Mon Sep 17 00:00:00 2001 From: Shenwei Hu Date: Fri, 7 Nov 2025 18:12:51 +0800 Subject: [PATCH 2/3] fix config --- .../dpo/{qwen3moe.yaml => full_tp_pp_ep.yaml} | 6 +- .../sft/{qwen3moe.yaml => full_tp_pp_ep.yaml} | 21 +++--- .../config/sft/lora-qwen3moe-alltoall.yaml | 72 ------------------- ...{lora-qwen3moe.yaml => lora_tp_pp_ep.yaml} | 22 ++---- examples/config/sft/qwen3moe-alltoall.yaml | 69 ------------------ 5 files changed, 18 insertions(+), 172 deletions(-) rename examples/config/dpo/{qwen3moe.yaml => full_tp_pp_ep.yaml} (92%) rename examples/config/sft/{qwen3moe.yaml => full_tp_pp_ep.yaml} (78%) delete mode 100644 examples/config/sft/lora-qwen3moe-alltoall.yaml rename examples/config/sft/{lora-qwen3moe.yaml => lora_tp_pp_ep.yaml} (75%) delete mode 100644 
examples/config/sft/qwen3moe-alltoall.yaml diff --git a/examples/config/dpo/qwen3moe.yaml b/examples/config/dpo/full_tp_pp_ep.yaml similarity index 92% rename from examples/config/dpo/qwen3moe.yaml rename to examples/config/dpo/full_tp_pp_ep.yaml index f708aa4f1c6..b1abf05eda4 100644 --- a/examples/config/dpo/qwen3moe.yaml +++ b/examples/config/dpo/full_tp_pp_ep.yaml @@ -1,9 +1,9 @@ ### data train_dataset_type: erniekit eval_dataset_type: erniekit -train_dataset_path: data-dpo/train.jsonl +train_dataset_path: ./data/dpo/train.jsonl train_dataset_prob: "1.0" -eval_dataset_path: data-dpo/dev.jsonl +eval_dataset_path: ./data/dpo/dev.jsonl eval_dataset_prob: "1.0" max_seq_len: 8192 num_samples_each_epoch: 6000000 @@ -24,7 +24,7 @@ do_eval: true per_device_eval_batch_size: 1 per_device_train_batch_size: 1 num_train_epochs: 1 -max_steps: 100 +max_steps: -1 eval_steps: 100 evaluation_strategy: steps save_steps: 100 diff --git a/examples/config/sft/qwen3moe.yaml b/examples/config/sft/full_tp_pp_ep.yaml similarity index 78% rename from examples/config/sft/qwen3moe.yaml rename to examples/config/sft/full_tp_pp_ep.yaml index a68b02bba64..5189df9c281 100644 --- a/examples/config/sft/qwen3moe.yaml +++ b/examples/config/sft/full_tp_pp_ep.yaml @@ -1,15 +1,14 @@ ### data train_dataset_type: erniekit eval_dataset_type: erniekit -train_dataset_path: data-sft/train_gsm8k.json +train_dataset_path: ./data/sft/train.json train_dataset_prob: "1.0" -eval_dataset_path: data-sft/test_gsm8k.json +eval_dataset_path: ./data/sft/dev.json eval_dataset_prob: "1.0" -max_seq_len: 4096 +max_seq_len: 8192 packing: true mix_strategy: concat - ### model model_name_or_path: Qwen/Qwen3-30B-A3B attn_impl: flashmask @@ -23,8 +22,8 @@ do_train: true do_eval: false per_device_eval_batch_size: 1 per_device_train_batch_size: 1 -num_train_epochs: 5 -max_steps: 100 +num_train_epochs: 1 +max_steps: -1 eval_steps: 100 evaluation_strategy: steps save_steps: 100 @@ -48,12 +47,6 @@ pipeline_parallel_degree: 2 sequence_parallel: true use_expert_parallel: true sharding: stage1 -#3 offload -# offload: true, -offload_optim: false -use_fused_head_and_loss_fn: true -# use_filtered_label_loss: true -optim: adamw_custom tensorwise_offload_optimizer: true recompute: true # recompute_use_reentrant: true @@ -63,4 +56,6 @@ fp16_opt_level: O2 unified_checkpoint: true sharding_parallel_config: "split_param" -amp_master_grad: true \ No newline at end of file +amp_master_grad: true + +ep_communication_type: "alltoall" # choices: ["deepep", "alltoall"], "deepep" only for Hooper GPU diff --git a/examples/config/sft/lora-qwen3moe-alltoall.yaml b/examples/config/sft/lora-qwen3moe-alltoall.yaml deleted file mode 100644 index 612748dcb35..00000000000 --- a/examples/config/sft/lora-qwen3moe-alltoall.yaml +++ /dev/null @@ -1,72 +0,0 @@ -### data -train_dataset_type: erniekit -eval_dataset_type: erniekit -train_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/train.json -train_dataset_prob: "1.0" -eval_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/dev.json -eval_dataset_prob: "1.0" -max_seq_len: 4096 -num_samples_each_epoch: 6000000 -packing: true -mix_strategy: concat - - -### model -model_name_or_path: /root/paddlejob/gpfs/hushenwei/huggingface-model/huggingface/Qwen/Qwen3-30B-A3B -attn_impl: flashmask -lora: true -lora_rank: 8 - -### finetuning -# base -stage: SFT -fine_tuning: lora -seed: 23 -do_train: true -do_eval: false -per_device_eval_batch_size: 1 
-per_device_train_batch_size: 1 -num_train_epochs: 5 -max_steps: 50 -eval_steps: 100 -evaluation_strategy: steps -save_steps: 10000 -save_total_limit: 1 -save_strategy: steps -logging_steps: 1 -gradient_accumulation_steps: 4 -logging_dir: ./vdl_log -output_dir: ./checkpoints/qwen3_hf_30b_sft_lora_ckpts_parallel -disable_tqdm: true -eval_accumulation_steps: 16 - -# train -warmup_steps: 20 -learning_rate: 1.0e-4 - -# performance -tensor_parallel_degree: 2 -expert_parallel_degree: 4 -pipeline_parallel_degree: 2 -sequence_parallel: true -use_expert_parallel: true -sharding: stage1 -#3 offload -# offload: true, -offload_optim: false -use_fused_head_and_loss_fn: true -# use_filtered_label_loss: true -optim: adamw_custom -tensorwise_offload_optimizer: true -recompute: true -# recompute_use_reentrant: true -# unified_checkpoint_config: ignore_merge_optimizer -bf16: true -fp16_opt_level: O2 -unified_checkpoint: true - -sharding_parallel_config: "split_param" -amp_master_grad: true - -ep_communication_type: "alltoall" -use_unified_moe: true diff --git a/examples/config/sft/lora-qwen3moe.yaml b/examples/config/sft/lora_tp_pp_ep.yaml similarity index 75% rename from examples/config/sft/lora-qwen3moe.yaml rename to examples/config/sft/lora_tp_pp_ep.yaml index 804190be740..dce5d38662c 100644 --- a/examples/config/sft/lora-qwen3moe.yaml +++ b/examples/config/sft/lora_tp_pp_ep.yaml @@ -1,16 +1,14 @@ ### data train_dataset_type: erniekit eval_dataset_type: erniekit -train_dataset_path: data-sft/train_gsm8k.json +train_dataset_path: ./data/sft/train.json train_dataset_prob: "1.0" -eval_dataset_path: data-sft/test_gsm8k.json +eval_dataset_path: ./data/sft/dev.json eval_dataset_prob: "1.0" -max_seq_len: 4096 -num_samples_each_epoch: 6000000 +max_seq_len: 8192 packing: true mix_strategy: concat - ### model model_name_or_path: Qwen/Qwen3-30B-A3B attn_impl: flashmask @@ -26,11 +24,11 @@ do_train: true do_eval: false per_device_eval_batch_size: 1 per_device_train_batch_size: 1 -num_train_epochs: 5 -max_steps: 100 +num_train_epochs: 1 +max_steps: -1 eval_steps: 100 evaluation_strategy: steps -save_steps: 10000 +save_steps: 100 save_total_limit: 1 save_strategy: steps logging_steps: 1 @@ -51,12 +49,6 @@ pipeline_parallel_degree: 2 sequence_parallel: true use_expert_parallel: true sharding: stage1 -#3 offload -# offload: true, -offload_optim: false -use_fused_head_and_loss_fn: true -# use_filtered_label_loss: true -optim: adamw_custom tensorwise_offload_optimizer: true recompute: true # recompute_use_reentrant: true @@ -68,4 +60,4 @@ unified_checkpoint: true sharding_parallel_config: "split_param" amp_master_grad: true -ep_communication_type: "alltoall" +ep_communication_type: "alltoall" # choices: ["deepep", "alltoall"], "deepep" only for Hooper GPU diff --git a/examples/config/sft/qwen3moe-alltoall.yaml b/examples/config/sft/qwen3moe-alltoall.yaml deleted file mode 100644 index bf81fd462f9..00000000000 --- a/examples/config/sft/qwen3moe-alltoall.yaml +++ /dev/null @@ -1,69 +0,0 @@ -### data -train_dataset_type: erniekit -eval_dataset_type: erniekit -train_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/train.json -train_dataset_prob: "1.0" -eval_dataset_path: /root/paddlejob/gpfs/hushenwei/PaddleFormers/examples/experiments/glm_ci/data/sft/dev.json -eval_dataset_prob: "1.0" -max_seq_len: 4096 -packing: true -mix_strategy: concat - - -### model -model_name_or_path: /root/paddlejob/gpfs/hushenwei/huggingface-model/huggingface/Qwen/Qwen3-30B-A3B -attn_impl: flashmask - 
-### finetuning -# base -stage: SFT -fine_tuning: full -seed: 23 -do_train: true -do_eval: false -per_device_eval_batch_size: 1 -per_device_train_batch_size: 1 -num_train_epochs: 5 -max_steps: 50 -eval_steps: 100 -evaluation_strategy: steps -save_steps: 10000 -save_total_limit: 1 -save_strategy: steps -logging_steps: 1 -gradient_accumulation_steps: 4 -logging_dir: ./vdl_log -output_dir: ./checkpoints/qwen3_hf_30b_sft_ckpts_parallel -disable_tqdm: true -eval_accumulation_steps: 16 - -# train -warmup_steps: 20 -learning_rate: 1.0e-5 - -# performance -tensor_parallel_degree: 2 -expert_parallel_degree: 4 -pipeline_parallel_degree: 2 -sequence_parallel: true -use_expert_parallel: true -sharding: stage1 -#3 offload -# offload: true, -offload_optim: false -use_fused_head_and_loss_fn: true -# use_filtered_label_loss: true -optim: adamw_custom -tensorwise_offload_optimizer: true -recompute: true -# recompute_use_reentrant: true -# unified_checkpoint_config: ignore_merge_optimizer -bf16: true -fp16_opt_level: O2 -unified_checkpoint: true - -sharding_parallel_config: "split_param" -amp_master_grad: true - -use_unified_moe: true -ep_communication_type: "alltoall" \ No newline at end of file From 5e5f70e2744f0ca643eadb5b03a9beae6e0cd16f Mon Sep 17 00:00:00 2001 From: Shenwei Hu Date: Fri, 7 Nov 2025 19:52:45 +0800 Subject: [PATCH 3/3] fix config --- examples/config/dpo/full_tp_pp_ep.yaml | 4 +- examples/config/dpo/lora_tp_pp_ep.yaml | 57 ++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 examples/config/dpo/lora_tp_pp_ep.yaml diff --git a/examples/config/dpo/full_tp_pp_ep.yaml b/examples/config/dpo/full_tp_pp_ep.yaml index b1abf05eda4..81d8bb863c6 100644 --- a/examples/config/dpo/full_tp_pp_ep.yaml +++ b/examples/config/dpo/full_tp_pp_ep.yaml @@ -54,4 +54,6 @@ fp16_opt_level: O2 unified_checkpoint: true sharding_parallel_config: "split_param" -amp_master_grad: true \ No newline at end of file +amp_master_grad: true + +ep_communication_type: "alltoall" # choices: ["deepep", "alltoall"], "deepep" only for Hooper GPU diff --git a/examples/config/dpo/lora_tp_pp_ep.yaml b/examples/config/dpo/lora_tp_pp_ep.yaml new file mode 100644 index 00000000000..85426a4eeb9 --- /dev/null +++ b/examples/config/dpo/lora_tp_pp_ep.yaml @@ -0,0 +1,57 @@ +### data +train_dataset_type: erniekit +eval_dataset_type: erniekit +train_dataset_path: ./data/dpo/train.jsonl +train_dataset_prob: "1.0" +eval_dataset_path: ./data/dpo/dev.jsonl +eval_dataset_prob: "1.0" +max_seq_len: 8192 +packing: true +mix_strategy: concat + +### model +model_name_or_path: Qwen/Qwen3-30B-A3B +attn_impl: flashmask +lora: true +lora_rank: 8 + +### finetuning +# base +stage: DPO +fine_tuning: lora +seed: 23 +do_train: true +do_eval: true +per_device_eval_batch_size: 1 +per_device_train_batch_size: 1 +num_train_epochs: 1 +max_steps: -1 +eval_steps: 100 +evaluation_strategy: steps +save_steps: 100 +save_strategy: steps +logging_steps: 1 +gradient_accumulation_steps: 4 +logging_dir: ./vdl_log +output_dir: ./checkpoints/qwen3_hf_0p6b_dpo_lora_ckpts_parallel +disable_tqdm: true +eval_accumulation_steps: 16 + +# train +warmup_steps: 20 +learning_rate: 1.0e-5 + +# performance +tensor_parallel_degree: 4 +pipeline_parallel_degree: 2 +expert_parallel_degree: 4 +use_expert_parallel: true +pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv +sequence_parallel: true +sharding: stage1 +recompute: true +bf16: true +fp16_opt_level: O2 +unified_checkpoint: true + +ep_communication_type: "alltoall" # 
choices: ["deepep", "alltoall"], "deepep" only for Hooper GPU