diff --git a/examples/config/dpo/full_tp_pp_ep.yaml b/examples/config/dpo/full_tp_pp_ep.yaml
new file mode 100644
index 00000000000..81d8bb863c6
--- /dev/null
+++ b/examples/config/dpo/full_tp_pp_ep.yaml
@@ -0,0 +1,59 @@
+### data
+train_dataset_type: erniekit
+eval_dataset_type: erniekit
+train_dataset_path: ./data/dpo/train.jsonl
+train_dataset_prob: "1.0"
+eval_dataset_path: ./data/dpo/dev.jsonl
+eval_dataset_prob: "1.0"
+max_seq_len: 8192
+num_samples_each_epoch: 6000000
+packing: true
+mix_strategy: concat
+
+### model
+model_name_or_path: Qwen/Qwen3-30B-A3B
+attn_impl: flashmask
+
+### finetuning
+# base
+stage: DPO
+fine_tuning: full
+seed: 23
+do_train: true
+do_eval: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: -1
+eval_steps: 100
+evaluation_strategy: steps
+save_steps: 100
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 4
+logging_dir: ./vdl_log
+output_dir: ./checkpoints/qwen3_hf_30b_dpo_ckpts_parallel
+disable_tqdm: true
+eval_accumulation_steps: 16
+
+# train
+warmup_steps: 20
+learning_rate: 1.0e-6
+
+# performance
+tensor_parallel_degree: 2
+pipeline_parallel_degree: 2
+expert_parallel_degree: 4
+use_expert_parallel: true
+pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv
+sequence_parallel: true
+sharding: stage1
+recompute: true
+bf16: true
+fp16_opt_level: O2
+unified_checkpoint: true
+
+sharding_parallel_config: "split_param"
+amp_master_grad: true
+
+ep_communication_type: "alltoall" # choices: ["deepep", "alltoall"]; "deepep" is only supported on Hopper GPUs
diff --git a/examples/config/dpo/lora_tp_pp_ep.yaml b/examples/config/dpo/lora_tp_pp_ep.yaml
new file mode 100644
index 00000000000..85426a4eeb9
--- /dev/null
+++ b/examples/config/dpo/lora_tp_pp_ep.yaml
@@ -0,0 +1,57 @@
+### data
+train_dataset_type: erniekit
+eval_dataset_type: erniekit
+train_dataset_path: ./data/dpo/train.jsonl
+train_dataset_prob: "1.0"
+eval_dataset_path: ./data/dpo/dev.jsonl
+eval_dataset_prob: "1.0"
+max_seq_len: 8192
+packing: true
+mix_strategy: concat
+
+### model
+model_name_or_path: Qwen/Qwen3-30B-A3B
+attn_impl: flashmask
+lora: true
+lora_rank: 8
+
+### finetuning
+# base
+stage: DPO
+fine_tuning: lora
+seed: 23
+do_train: true
+do_eval: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: -1
+eval_steps: 100
+evaluation_strategy: steps
+save_steps: 100
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 4
+logging_dir: ./vdl_log
+output_dir: ./checkpoints/qwen3_hf_30b_dpo_lora_ckpts_parallel
+disable_tqdm: true
+eval_accumulation_steps: 16
+
+# train
+warmup_steps: 20
+learning_rate: 1.0e-5
+
+# performance
+tensor_parallel_degree: 4
+pipeline_parallel_degree: 2
+expert_parallel_degree: 4
+use_expert_parallel: true
+pipeline_parallel_config: enable_clear_every_step_cache disable_partial_send_recv
+sequence_parallel: true
+sharding: stage1
+recompute: true
+bf16: true
+fp16_opt_level: O2
+unified_checkpoint: true
+
+ep_communication_type: "alltoall" # choices: ["deepep", "alltoall"]; "deepep" is only supported on Hopper GPUs
diff --git a/examples/config/sft/full_tp_pp_ep.yaml b/examples/config/sft/full_tp_pp_ep.yaml
new file mode 100644
index 00000000000..5189df9c281
--- /dev/null
+++ b/examples/config/sft/full_tp_pp_ep.yaml
@@ -0,0 +1,61 @@
+### data
+train_dataset_type: erniekit
+eval_dataset_type: erniekit
+train_dataset_path: ./data/sft/train.json
+train_dataset_prob: "1.0"
+eval_dataset_path: ./data/sft/dev.json
+eval_dataset_prob: "1.0"
+max_seq_len: 8192
+packing: true
+mix_strategy: concat
+
+### model
+model_name_or_path: Qwen/Qwen3-30B-A3B
+attn_impl: flashmask
+
+### finetuning
+# base
+stage: SFT
+fine_tuning: full
+seed: 23
+do_train: true
+do_eval: false
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: -1
+eval_steps: 100
+evaluation_strategy: steps
+save_steps: 100
+save_total_limit: 1
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 4
+logging_dir: ./vdl_log
+output_dir: ./checkpoints/qwen3_hf_30b_sft_ckpts_parallel
+disable_tqdm: true
+eval_accumulation_steps: 16
+
+# train
+warmup_steps: 20
+learning_rate: 1.0e-5
+
+# performance
+tensor_parallel_degree: 2
+expert_parallel_degree: 4
+pipeline_parallel_degree: 2
+sequence_parallel: true
+use_expert_parallel: true
+sharding: stage1
+tensorwise_offload_optimizer: true
+recompute: true
+# recompute_use_reentrant: true
+# unified_checkpoint_config: ignore_merge_optimizer
+bf16: true
+fp16_opt_level: O2
+unified_checkpoint: true
+
+sharding_parallel_config: "split_param"
+amp_master_grad: true
+
+ep_communication_type: "alltoall" # choices: ["deepep", "alltoall"]; "deepep" is only supported on Hopper GPUs
diff --git a/examples/config/sft/lora_tp_pp_ep.yaml b/examples/config/sft/lora_tp_pp_ep.yaml
new file mode 100644
index 00000000000..dce5d38662c
--- /dev/null
+++ b/examples/config/sft/lora_tp_pp_ep.yaml
@@ -0,0 +1,63 @@
+### data
+train_dataset_type: erniekit
+eval_dataset_type: erniekit
+train_dataset_path: ./data/sft/train.json
+train_dataset_prob: "1.0"
+eval_dataset_path: ./data/sft/dev.json
+eval_dataset_prob: "1.0"
+max_seq_len: 8192
+packing: true
+mix_strategy: concat
+
+### model
+model_name_or_path: Qwen/Qwen3-30B-A3B
+attn_impl: flashmask
+lora: true
+lora_rank: 8
+
+### finetuning
+# base
+stage: SFT
+fine_tuning: lora
+seed: 23
+do_train: true
+do_eval: false
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 1
+num_train_epochs: 1
+max_steps: -1
+eval_steps: 100
+evaluation_strategy: steps
+save_steps: 100
+save_total_limit: 1
+save_strategy: steps
+logging_steps: 1
+gradient_accumulation_steps: 4
+logging_dir: ./vdl_log
+output_dir: ./checkpoints/qwen3_hf_30b_sft_lora_ckpts_parallel
+disable_tqdm: true
+eval_accumulation_steps: 16
+
+# train
+warmup_steps: 20
+learning_rate: 1.0e-4
+
+# performance
+tensor_parallel_degree: 2
+expert_parallel_degree: 4
+pipeline_parallel_degree: 2
+sequence_parallel: true
+use_expert_parallel: true
+sharding: stage1
+tensorwise_offload_optimizer: true
+recompute: true
+# recompute_use_reentrant: true
+# unified_checkpoint_config: ignore_merge_optimizer
+bf16: true
+fp16_opt_level: O2
+unified_checkpoint: true
+
+sharding_parallel_config: "split_param"
+amp_master_grad: true
+
+ep_communication_type: "alltoall" # choices: ["deepep", "alltoall"]; "deepep" is only supported on Hopper GPUs
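All four configs share the same hybrid-parallel layout: tensor, pipeline, and expert parallelism stacked on stage-1 sharding. The snippet below is a minimal sketch for sanity-checking an intended GPU count against those degrees; it assumes the common convention that world_size = tp * pp * dp and that expert-parallel groups are carved out of the dp/sharding ranks, which may differ from the framework's actual placement rule and is not part of this change.

"""Hedged consistency check for the parallelism degrees used in these configs.

Assumption (not taken from the framework): world_size = tp * pp * dp, and the
expert-parallel groups are formed across the dp/sharding ranks.
"""


def check_parallel_layout(world_size: int, tp: int, pp: int, ep: int) -> int:
    """Return the implied data-parallel (sharding) degree, or raise if the
    degrees cannot fit into ``world_size`` under the assumed layout."""
    if world_size % (tp * pp) != 0:
        raise ValueError(
            f"world_size={world_size} is not divisible by tp*pp={tp * pp}"
        )
    dp = world_size // (tp * pp)
    # Assumption: expert_parallel_degree must divide the remaining dp degree.
    if dp % ep != 0:
        raise ValueError(
            f"expert_parallel_degree={ep} does not divide the data-parallel "
            f"degree {dp} left over after tensor and pipeline parallelism"
        )
    return dp


if __name__ == "__main__":
    # Degrees from examples/config/dpo/full_tp_pp_ep.yaml: tp=2, pp=2, ep=4.
    # A 16-GPU job (e.g. 2 nodes x 8 GPUs) leaves dp=4, which ep=4 divides.
    print(check_parallel_layout(world_size=16, tp=2, pp=2, ep=4))  # -> 4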