@@ -1173,8 +1173,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
             res = "deepseek-v3"
-        if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1":
-            res = "utu-vl"
         if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
             res = "deepseek-r1-qwen"
@@ -1232,6 +1230,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
             # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
             res = "kormo"
+        if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1":
+            # ref: ./Youtu-VL
+            res = "utu-vl"
 
         if res is None:
             logger.warning("\n")
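For context on what these two hunks change: `get_vocab_base_pre` identifies a model's BPE pre-tokenizer by hashing how its tokenizer encodes a fixed probe string, and the `utu-vl` entry is moved next to the other recent additions with a `# ref` comment. Below is a minimal sketch of that dispatch, assuming the sha256-over-encoded-probe scheme the converter uses; the probe string argument and the two-entry table are stand-ins for illustration:

```python
from hashlib import sha256

def identify_pre_tokenizer(tokenizer, chktxt: str) -> str | None:
    # different pre-tokenizers split the probe string differently, so the
    # resulting token-id sequence (and thus its hash) acts as a fingerprint
    chktok = tokenizer.encode(chktxt)
    chkhsh = sha256(str(chktok).encode()).hexdigest()
    known = {
        "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1": "utu-vl",
        "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5": "deepseek-v3",
    }
    return known.get(chkhsh)  # None means the pre-tokenizer is unrecognized
```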
@@ -3808,15 +3809,10 @@ def set_gguf_parameters(self):
         else:
             self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
             self.gguf_writer.add_vision_use_silu(True)
-            # find n_wa_pattern (window attention pattern)
+            # save window attention layers (full attention block indexes)
             fullatt_block_indexes = hparams.get("fullatt_block_indexes")
             assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
-            n_wa_pattern = fullatt_block_indexes[0] + 1
-            # validate n_wa_pattern
-            for i in range(1, len(fullatt_block_indexes)):
-                if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
-                    raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
-            self.gguf_writer.add_vision_wa_layers(fullatt_block_indexes)
+            self.gguf_writer.add_vision_wa_layers(fullatt_block_indexes)
         else:
             raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
         # default values below are taken from HF transformers code
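The replaced logic derived a single stride, `n_wa_pattern`, from the first full-attention block index and rejected any layout that was not evenly spaced; storing the block indexes directly via `add_vision_wa_layers` lifts that restriction. A small sketch of the old derivation (reconstructed from the removed lines above) makes the limitation concrete:

```python
def old_n_wa_pattern(fullatt_block_indexes: list[int]) -> int:
    # old scheme: full-attention blocks must recur every n_wa_pattern layers
    n_wa_pattern = fullatt_block_indexes[0] + 1
    for i in range(1, len(fullatt_block_indexes)):
        if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
            raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
    return n_wa_pattern

print(old_n_wa_pattern([7, 15, 23, 31]))  # 8: evenly spaced, representable
try:
    old_n_wa_pattern([2, 5, 20])
except ValueError as e:
    print(e)  # irregular layouts could not be represented at all
```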
@@ -7214,26 +7210,26 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
         self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
 
-        if hparams.get("moe_intermediate_size") is not None:
-            self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        if (moe_intermediate_size := hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
         else:
             self.gguf_writer.add_expert_feed_forward_length(hparams.get("intermediate_size", 0))
 
-        if hparams.get("n_routed_experts") is not None:
-            self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        if (n_routed_experts := hparams.get("n_routed_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_routed_experts)
 
-        if hparams.get("n_shared_experts") is not None:
-            self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        if (n_shared_experts := hparams.get("n_shared_experts")) is not None:
+            self.gguf_writer.add_expert_shared_count(n_shared_experts)
         else:
             self.gguf_writer.add_expert_shared_count(0)
 
-        if hparams.get("routed_scaling_factor") is not None:
-            self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None:
+            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
         else:
             self.gguf_writer.add_expert_weights_scale(1.0)
 
-        if hparams.get("norm_topk_prob") is not None and hparams["norm_topk_prob"]:
-            self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+        if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob:
+            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
 
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
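This hunk is a pure refactor: each `hparams.get(...)` result is bound with the walrus operator so the key is looked up once instead of twice. The pattern in isolation, with a made-up `hparams` dict:

```python
hparams = {"n_routed_experts": 64}

# before: two lookups of the same key
if hparams.get("n_routed_experts") is not None:
    expert_count = hparams["n_routed_experts"]

# after: one lookup, bound inline (Python 3.8+)
if (n_routed_experts := hparams.get("n_routed_experts")) is not None:
    expert_count = n_routed_experts
```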
@@ -7244,7 +7240,6 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all)
 
     _experts: list[dict[str, Tensor]] | None = None
-    _token_embd: Tensor | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # skip vision tensors and remove "language_model." for Kimi-VL
@@ -7257,11 +7252,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
 
         # skip lm_head.weight if tie_word_embeddings is True
         if self.hparams.get("tie_word_embeddings", False):
-            # Save token_embd for potential duplication as output if tie_word_embeddings is True
-            if name == "model.embed_tokens.weight":
-                self._token_embd = data_torch
             if name == "lm_head.weight" or name == "model.lm_head.weight":
-                logger.info("Skipping tied output layer 'lm_head.weight' - will duplicate from token_embd.weight")
+                logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)")
                 return []
 
         # rename e_score_correction_bias tensors
@@ -7337,10 +7329,6 @@ def prepare_tensors(self):
             experts = [k for d in self._experts for k in d.keys()]
             if len(experts) > 0:
                 raise ValueError(f"Unprocessed experts: {experts}")
-        if self._token_embd is not None:
-            logger.info("Model has tie_word_embeddings=True but no lm_head.weight found - adding output.weight from token_embd.weight")
-            output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
-            self.gguf_writer.add_tensor(output_name, self._token_embd.numpy())
 
 @ModelBase.register("MiniMaxM2ForCausalLM")
 class MiniMaxM2Model(TextModel):
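These three hunks drop the manual duplication of `token_embd.weight` into `output.weight` for tied models; the updated log line suggests the fallback now happens in shared conversion code, though the diff itself does not show where, so that is an assumption. For reference, `tie_word_embeddings` means the output projection shares the input embedding matrix, so `lm_head.weight` carries no extra information. A self-contained sketch of what tying means:

```python
import torch

vocab_size, n_embd = 8, 4
embed = torch.nn.Embedding(vocab_size, n_embd)
lm_head = torch.nn.Linear(n_embd, vocab_size, bias=False)
lm_head.weight = embed.weight  # tied: both modules share one tensor

hidden = embed(torch.tensor([3]))  # hidden state for token id 3
logits = lm_head(hidden)           # logits computed from the same matrix
assert lm_head.weight.data_ptr() == embed.weight.data_ptr()
```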
@@ -10521,7 +10509,14 @@ def set_gguf_parameters(self):
             raise ValueError(f"Unsupported activation function for UTUVL: {hidden_act}")
 
         self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2))
-
+
+        window_size = self.hparams.get("window_size")
+        if window_size is not None:
+            self.gguf_writer.add_vision_window_size(window_size)
+        fullatt_block_indexes = self.hparams.get("fullatt_block_indexes")
+        assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for utuvl"
+        self.gguf_writer.add_vision_wa_layers(layers=fullatt_block_indexes)
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
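Putting the UTUVL additions together: the converter now reads an optional `window_size` and a required `fullatt_block_indexes` from the vision hparams, alongside the existing `spatial_merge_size`. A sketch of the config fragment this code expects; the key names come from the diff, while the values here are made up for illustration:

```python
hparams = {
    "spatial_merge_size": 2,             # defaulted to 2 when absent
    "window_size": 112,                  # optional: only written when present
    "fullatt_block_indexes": [2, 5, 8],  # required: blocks using full attention
}

window_size = hparams.get("window_size")
if window_size is not None:
    print("add_vision_window_size:", window_size)
fullatt_block_indexes = hparams.get("fullatt_block_indexes")
assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for utuvl"
print("add_vision_wa_layers:", fullatt_block_indexes)
```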