
Commit 322c2ad

talk-llama : sync llama.cpp
1 parent 35ea5ce

File tree

13 files changed: +431 −45 lines

examples/talk-llama/llama-arch.cpp

Lines changed: 40 additions & 0 deletions
@@ -5,6 +5,7 @@
 #include <map>
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_CLIP,             "clip"             }, // dummy, only used by llama-quantize
     { LLM_ARCH_LLAMA,            "llama"            },
     { LLM_ARCH_LLAMA4,           "llama4"           },
     { LLM_ARCH_DECI,             "deci"             },
@@ -84,6 +85,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
     { LLM_ARCH_PLM,              "plm"              },
     { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
+    { LLM_ARCH_BAILINGMOE2,      "bailingmoe2"      },
     { LLM_ARCH_DOTS1,            "dots1"            },
     { LLM_ARCH_ARCEE,            "arcee"            },
     { LLM_ARCH_ERNIE4_5,         "ernie4_5"         },
@@ -134,6 +136,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_COUNT,            "%s.expert_count"            },
     { LLM_KV_EXPERT_USED_COUNT,       "%s.expert_used_count"       },
     { LLM_KV_EXPERT_SHARED_COUNT,     "%s.expert_shared_count"     },
+    { LLM_KV_EXPERT_GROUP_COUNT,      "%s.expert_group_count"      },
+    { LLM_KV_EXPERT_GROUP_USED_COUNT, "%s.expert_group_used_count" },
     { LLM_KV_EXPERT_WEIGHTS_SCALE,    "%s.expert_weights_scale"    },
     { LLM_KV_EXPERT_WEIGHTS_NORM,     "%s.expert_weights_norm"     },
     { LLM_KV_EXPERT_GATING_FUNC,      "%s.expert_gating_func"      },
@@ -275,6 +279,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_CLIP,
+        {},
+    },
     {
         LLM_ARCH_LLAMA,
         {
@@ -1941,6 +1949,38 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP_SHEXP,           "blk.%d.ffn_up_shexp" },
         },
     },
+    {
+        LLM_ARCH_BAILINGMOE2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,             "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,            "output_norm" },
+            { LLM_TENSOR_OUTPUT,                 "output" },
+            { LLM_TENSOR_ATTN_NORM,              "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q_NORM,            "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K_NORM,            "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_QKV,               "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,               "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP,           "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B,        "blk.%d.exp_probs_b" },
+            { LLM_TENSOR_FFN_NORM,               "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,               "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,               "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,                 "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_EXPS,          "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,          "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,            "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,         "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,         "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,           "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_NEXTN_EH_PROJ,          "blk.%d.nextn.eh_proj" },
+            { LLM_TENSOR_NEXTN_EMBED_TOKENS,     "blk.%d.nextn.embed_tokens" },
+            { LLM_TENSOR_NEXTN_ENORM,            "blk.%d.nextn.enorm" },
+            { LLM_TENSOR_NEXTN_HNORM,            "blk.%d.nextn.hnorm" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" },
+            { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" },
+            { LLM_TENSOR_LAYER_OUT_NORM,         "blk.%d.layer_output_norm" },
+        },
+    },
     {
         LLM_ARCH_DOTS1,
         {
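
For orientation: the "%s" in the LLM_KV_NAMES entries is expanded with the architecture name and the "%d" in the tensor names with the block index, so the new entries yield GGUF keys such as bailingmoe2.expert_group_count and tensor names such as blk.0.nextn.eh_proj. A minimal sketch of that expansion (hypothetical helpers, not the llama.cpp internals):

#include <cstdio>
#include <string>

// Hypothetical helpers; llama.cpp performs equivalent formatting internally.
static std::string kv_name(const char * fmt, const char * arch) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, arch); // e.g. "%s.expert_group_count"
    return buf;
}

static std::string tensor_name(const char * fmt, int il) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), fmt, il);   // e.g. "blk.%d.nextn.eh_proj"
    return buf;
}

int main() {
    std::printf("%s\n", kv_name("%s.expert_group_count", "bailingmoe2").c_str()); // bailingmoe2.expert_group_count
    std::printf("%s\n", tensor_name("blk.%d.nextn.eh_proj", 0).c_str());          // blk.0.nextn.eh_proj
}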

examples/talk-llama/llama-arch.h

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,7 @@
 //
 
 enum llm_arch {
+    LLM_ARCH_CLIP,
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
     LLM_ARCH_DECI,
@@ -88,6 +89,7 @@ enum llm_arch {
     LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_PLM,
     LLM_ARCH_BAILINGMOE,
+    LLM_ARCH_BAILINGMOE2,
     LLM_ARCH_DOTS1,
     LLM_ARCH_ARCEE,
     LLM_ARCH_ERNIE4_5,
@@ -138,6 +140,8 @@ enum llm_kv {
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_EXPERT_SHARED_COUNT,
+    LLM_KV_EXPERT_GROUP_COUNT,
+    LLM_KV_EXPERT_GROUP_USED_COUNT,
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,

examples/talk-llama/llama-batch.h

Lines changed: 1 addition & 1 deletion
@@ -123,7 +123,7 @@ class llama_batch_allocr {
     uint32_t n_seq_max;
     uint32_t n_outputs;
 
-    std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
+    std::array<llama_seq_id, 1> seq_id_0 = {{ 0 }}; // default sequence id
 
     std::vector<llama_pos> pos;
     std::vector<int32_t> n_seq_id;
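
For context on the one-character fix: std::array is an aggregate wrapping a built-in array, so {{ 0 }} initializes the outer struct and its inner array explicitly, while the single-brace form relies on brace elision, which is legal but draws -Wmissing-braces on some compilers. A standalone sketch of the two spellings, using a plain int32_t as a stand-in for llama_seq_id:

#include <array>
#include <cstdint>
#include <cstdio>

using seq_id = std::int32_t; // stand-in for llama_seq_id

int main() {
    std::array<seq_id, 1> a = { 0 };   // brace elision: valid, but may warn
    std::array<seq_id, 1> b = {{ 0 }}; // explicit outer struct + inner array
    std::printf("%d %d\n", a[0], b[0]);
}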

examples/talk-llama/llama-chat.cpp

Lines changed: 35 additions & 2 deletions
@@ -63,6 +63,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "megrez",        LLM_CHAT_TEMPLATE_MEGREZ },
     { "yandex",        LLM_CHAT_TEMPLATE_YANDEX },
     { "bailing",       LLM_CHAT_TEMPLATE_BAILING },
+    { "bailing-think", LLM_CHAT_TEMPLATE_BAILING_THINK },
+    { "bailing2",      LLM_CHAT_TEMPLATE_BAILING2 },
     { "llama4",        LLM_CHAT_TEMPLATE_LLAMA4 },
     { "smolvlm",       LLM_CHAT_TEMPLATE_SMOLVLM },
     { "hunyuan-moe",   LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
@@ -191,6 +193,10 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_YANDEX;
     } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("'HUMAN'")) {
         return LLM_CHAT_TEMPLATE_BAILING;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("\"HUMAN\"") && tmpl_contains("<think>")) {
+        return LLM_CHAT_TEMPLATE_BAILING_THINK;
+    } else if (tmpl_contains("<role>ASSISTANT</role>") && tmpl_contains("<role>HUMAN</role>") && tmpl_contains("<|role_end|>")) {
+        return LLM_CHAT_TEMPLATE_BAILING2;
     } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
         return LLM_CHAT_TEMPLATE_LLAMA4;
     } else if (tmpl_contains("<|endofuserprompt|>")) {
@@ -644,8 +650,8 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << " Ассистент:[SEP]";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING) {
-        // Bailing (Ling) template
+    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING || tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
+        // Bailing (Ling/Ring) template
         for (auto message : chat) {
             std::string role(message->role);
 
@@ -658,6 +664,33 @@ int32_t llm_chat_apply_template(
             ss << "<role>" << role << "</role>" << message->content;
         }
 
+        if (add_ass) {
+            ss << "<role>ASSISTANT</role>";
+
+            if (tmpl == LLM_CHAT_TEMPLATE_BAILING_THINK) {
+                ss << "<think>";
+            }
+        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_BAILING2) {
+        // Bailing2 (Ling 2.0) template
+        bool has_system = !chat.empty() && std::string(chat[0]->role) == "system";
+
+        if (!has_system) {
+            ss << "<role>SYSTEM</role>detailed thinking off<|role_end|>";
+        }
+
+        for (auto message : chat) {
+            std::string role(message->role);
+
+            if (role == "user") {
+                role = "HUMAN";
+            } else {
+                std::transform(role.begin(), role.end(), role.begin(), ::toupper);
+            }
+
+            ss << "<role>" << role << "</role>" << message->content << "<|role_end|>";
+        }
+
         if (add_ass) {
             ss << "<role>ASSISTANT</role>";
         }
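
To make the new Bailing2 branch concrete, here is a standalone sketch that mirrors the loop above (the msg struct and bailing2_prompt helper are hypothetical, not the llama.cpp chat API) and prints the rendered prompt:

#include <algorithm>
#include <cctype>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct msg { std::string role, content; }; // stand-in for llama_chat_message

std::string bailing2_prompt(const std::vector<msg> & chat, bool add_ass) {
    std::ostringstream ss;
    // no explicit system turn -> inject the default "detailed thinking off"
    if (chat.empty() || chat[0].role != "system") {
        ss << "<role>SYSTEM</role>detailed thinking off<|role_end|>";
    }
    for (const auto & m : chat) {
        std::string role = m.role;
        if (role == "user") {
            role = "HUMAN";
        } else {
            std::transform(role.begin(), role.end(), role.begin(), ::toupper);
        }
        ss << "<role>" << role << "</role>" << m.content << "<|role_end|>";
    }
    if (add_ass) {
        ss << "<role>ASSISTANT</role>"; // generation starts after this tag
    }
    return ss.str();
}

int main() {
    std::cout << bailing2_prompt({ { "user", "Hi" }, { "assistant", "Hello!" }, { "user", "2+2?" } }, true) << "\n";
    // prints (one line, wrapped here for readability):
    //   <role>SYSTEM</role>detailed thinking off<|role_end|><role>HUMAN</role>Hi<|role_end|>
    //   <role>ASSISTANT</role>Hello!<|role_end|><role>HUMAN</role>2+2?<|role_end|><role>ASSISTANT</role>
}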

examples/talk-llama/llama-chat.h

Lines changed: 2 additions & 0 deletions
@@ -42,6 +42,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_MEGREZ,
     LLM_CHAT_TEMPLATE_YANDEX,
     LLM_CHAT_TEMPLATE_BAILING,
+    LLM_CHAT_TEMPLATE_BAILING_THINK,
+    LLM_CHAT_TEMPLATE_BAILING2,
     LLM_CHAT_TEMPLATE_LLAMA4,
     LLM_CHAT_TEMPLATE_SMOLVLM,
     LLM_CHAT_TEMPLATE_DOTS1,

examples/talk-llama/llama-context.cpp

Lines changed: 2 additions & 1 deletion
@@ -2346,7 +2346,8 @@ llama_context * llama_init_from_model(
         return nullptr;
     }
 
-    if (params.pooling_type != model->hparams.pooling_type) {
+    if (params.pooling_type != LLAMA_POOLING_TYPE_UNSPECIFIED &&
+        params.pooling_type != model->hparams.pooling_type) {
         //user-specified pooling-type is different from the model default
         LLAMA_LOG_WARN("%s: model default pooling_type is [%d], but [%d] was specified\n", __func__,
                 model->hparams.pooling_type, params.pooling_type);

examples/talk-llama/llama-graph.cpp

Lines changed: 30 additions & 0 deletions
@@ -950,6 +950,31 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(selection_probs, "ffn_moe_probs_biased", il);
     }
 
+    // select top n_group_used expert groups
+    // https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/e815299b0bcbac849fa540c768ef21845365c9eb/modeling_deepseek.py#L440-L457
+    if (hparams.n_expert_groups > 1 && n_tokens > 0) {
+        const int64_t n_exp_per_group = n_expert / hparams.n_expert_groups;
+
+        // organize experts into n_expert_groups
+        ggml_tensor * selection_groups = ggml_reshape_3d(ctx0, selection_probs, n_exp_per_group, hparams.n_expert_groups, n_tokens); // [n_exp_per_group, n_expert_groups, n_tokens]
+
+        ggml_tensor * group_scores = ggml_top_k(ctx0, selection_groups, 2); // [2, n_expert_groups, n_tokens]
+        group_scores = ggml_get_rows(ctx0, ggml_reshape_4d(ctx0, selection_groups, 1, selection_groups->ne[0], selection_groups->ne[1], selection_groups->ne[2]), group_scores); // [1, 2, n_expert_groups, n_tokens]
+
+        // get top n_group_used expert groups
+        group_scores = ggml_sum_rows(ctx0, ggml_reshape_3d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2], group_scores->ne[3])); // [1, n_expert_groups, n_tokens]
+        group_scores = ggml_reshape_2d(ctx0, group_scores, group_scores->ne[1], group_scores->ne[2]); // [n_expert_groups, n_tokens]
+
+        ggml_tensor * expert_groups = ggml_top_k(ctx0, group_scores, hparams.n_group_used); // [n_group_used, n_tokens]
+        cb(expert_groups, "ffn_moe_group_topk", il);
+
+        // mask out the other groups
+        selection_probs = ggml_get_rows(ctx0, selection_groups, expert_groups); // [n_exp_per_group, n_group_used, n_tokens]
+        selection_probs = ggml_set_rows(ctx0, ggml_scale_bias(ctx0, selection_groups, 0.0f, -INFINITY), selection_probs, expert_groups); // [n_exp_per_group, n_expert_groups, n_tokens]
+        selection_probs = ggml_reshape_2d(ctx0, selection_probs, n_expert, n_tokens); // [n_expert, n_tokens]
+        cb(selection_probs, "ffn_moe_probs_masked", il);
+    }
+
     // select experts
     ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
@@ -981,6 +1006,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); // [1, n_tokens]
     cb(weights_sum, "ffn_moe_weights_sum", il);
 
+    if (arch == LLM_ARCH_BAILINGMOE2) {
+        weights_sum = ggml_scale_bias(ctx0, weights_sum, 1.0, 1e-20);
+        cb(weights_sum, "ffn_moe_weights_sum_biased", il);
+    }
+
     weights = ggml_div(ctx0, weights, weights_sum); // [n_expert_used, n_tokens]
     cb(weights, "ffn_moe_weights_norm", il);
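
The first hunk adds DeepSeek-V3-style group-limited routing: the experts are split into hparams.n_expert_groups groups, each group is scored by the sum of its two largest expert probabilities, only the top hparams.n_group_used groups are kept, and the usual top-k expert selection then runs on the masked scores. The second hunk adds 1e-20 to weights_sum for LLM_ARCH_BAILINGMOE2 purely to guard the following division against a zero sum. A minimal per-token sketch of the same selection on plain vectors (hypothetical route() helper; the real code runs batched ggml ops):

#include <algorithm>
#include <cstdio>
#include <functional>
#include <limits>
#include <numeric>
#include <utility>
#include <vector>

// Pick n_expert_used experts, restricted to the n_group_used best groups.
// Assumes n_expert is divisible by n_groups and each group has >= 2 experts.
std::vector<int> route(const std::vector<float> & probs,
                       int n_groups, int n_group_used, int n_expert_used) {
    const int n_expert  = (int) probs.size();
    const int per_group = n_expert / n_groups;

    // score each group by the sum of its top-2 expert probabilities
    std::vector<std::pair<float, int>> group_scores(n_groups);
    for (int g = 0; g < n_groups; g++) {
        std::vector<float> grp(probs.begin() + g*per_group, probs.begin() + (g + 1)*per_group);
        std::partial_sort(grp.begin(), grp.begin() + 2, grp.end(), std::greater<float>());
        group_scores[g] = { grp[0] + grp[1], g };
    }

    // keep the n_group_used best groups, mask every other expert to -inf
    std::partial_sort(group_scores.begin(), group_scores.begin() + n_group_used,
                      group_scores.end(), std::greater<std::pair<float, int>>());
    std::vector<float> masked(n_expert, -std::numeric_limits<float>::infinity());
    for (int i = 0; i < n_group_used; i++) {
        const int g = group_scores[i].second;
        std::copy(probs.begin() + g*per_group, probs.begin() + (g + 1)*per_group,
                  masked.begin() + g*per_group);
    }

    // ordinary top-k expert selection on the masked scores
    std::vector<int> idx(n_expert);
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return masked[a] > masked[b]; });
    idx.resize(n_expert_used);
    return idx;
}

int main() {
    // 8 experts in 4 groups of 2; keep 2 groups, pick 3 experts
    const std::vector<float> probs = { 0.9f, 0.1f, 0.2f, 0.3f, 0.8f, 0.7f, 0.1f, 0.2f };
    for (int e : route(probs, 4, 2, 3)) std::printf("expert %d\n", e); // experts 0, 4, 5
}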

examples/talk-llama/llama-hparams.h

Lines changed: 2 additions & 0 deletions
@@ -72,6 +72,8 @@ struct llama_hparams {
     uint32_t n_ff_chexp      = 0;
     uint32_t n_expert_shared = 0;
     uint32_t n_norm_groups   = 0;
+    uint32_t n_expert_groups = 0;
+    uint32_t n_group_used    = 0;
     uint32_t n_group_experts = 0;
 
     float expert_group_scale = 0.05f;
