Commit 867709c

fix bug
1 parent 1600974 commit 867709c

10 files changed: +97 -68 lines

convert_hf_to_gguf.py

Lines changed: 24 additions & 29 deletions
@@ -1173,8 +1173,6 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "877081d19cf6996e2c4ff0e1236341e9b7bde288f5311a56a937f0afbbb3aeb5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-V3
             res = "deepseek-v3"
-        if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1":
-            res = "utu-vl"
         if chkhsh == "b3f499bb4255f8ca19fccd664443283318f2fd2414d5e0b040fbdd0cc195d6c5":
             # ref: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
             res = "deepseek-r1-qwen"
@@ -1232,6 +1230,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
             # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
             res = "kormo"
+        if chkhsh == "9d70134b369a70e5735009b6de918f7581b5211f7c074d1f89f753aea8248af1":
+            # ref: ./Youtu-VL
+            res = "utu-vl"
 
         if res is None:
             logger.warning("\n")
@@ -3808,15 +3809,10 @@ def set_gguf_parameters(self):
             else:
                 self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
                 self.gguf_writer.add_vision_use_silu(True)
-                # find n_wa_pattern (window attention pattern)
+                # save window attention layers (full attention block indexes)
                 fullatt_block_indexes = hparams.get("fullatt_block_indexes")
                 assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
-                n_wa_pattern = fullatt_block_indexes[0] + 1
-                # validate n_wa_pattern
-                for i in range(1, len(fullatt_block_indexes)):
-                    if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
-                        raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
-                self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
+                self.gguf_writer.add_vision_wa_layers(fullatt_block_indexes)
         else:
             raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
         # default values below are taken from HF tranformers code
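
The old code collapsed fullatt_block_indexes into a single stride (n_wa_pattern) and rejected irregular spacings; the new code writes the indexes verbatim. A minimal sketch, using hypothetical block indexes, contrasting the two encodings:

fullatt_block_indexes = [7, 15, 23, 31]  # hypothetical: blocks that use full attention

# old scheme: derive one stride, valid only for evenly spaced indexes
n_wa_pattern = fullatt_block_indexes[0] + 1  # -> 8
for i in range(1, len(fullatt_block_indexes)):
    if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
        raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")

# new scheme: store the list as-is, so irregular spacing is also representable
print(n_wa_pattern)           # 8
print(fullatt_block_indexes)  # [7, 15, 23, 31]
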
@@ -7214,26 +7210,26 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
         self.gguf_writer.add_value_length_mla(hparams["v_head_dim"])
 
-        if hparams.get("moe_intermediate_size") is not None:
-            self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+        if (moe_intermediate_size := hparams.get("moe_intermediate_size")) is not None:
+            self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size)
         else:
             self.gguf_writer.add_expert_feed_forward_length(hparams.get("intermediate_size", 0))
 
-        if hparams.get("n_routed_experts") is not None:
-            self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+        if (n_routed_experts := hparams.get("n_routed_experts")) is not None:
+            self.gguf_writer.add_expert_count(n_routed_experts)
 
-        if hparams.get("n_shared_experts") is not None:
-            self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+        if (n_shared_experts := hparams.get("n_shared_experts")) is not None:
+            self.gguf_writer.add_expert_shared_count(n_shared_experts)
         else:
             self.gguf_writer.add_expert_shared_count(0)
 
-        if hparams.get("routed_scaling_factor") is not None:
-            self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+        if (routed_scaling_factor := hparams.get("routed_scaling_factor")) is not None:
+            self.gguf_writer.add_expert_weights_scale(routed_scaling_factor)
         else:
             self.gguf_writer.add_expert_weights_scale(1.0)
 
-        if hparams.get("norm_topk_prob") is not None and hparams["norm_topk_prob"]:
-            self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"])
+        if (norm_topk_prob := hparams.get("norm_topk_prob")) is not None and norm_topk_prob:
+            self.gguf_writer.add_expert_weights_norm(norm_topk_prob)
 
         self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
 
@@ -7244,7 +7240,6 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * rope_mscale_all)
 
     _experts: list[dict[str, Tensor]] | None = None
-    _token_embd: Tensor | None = None
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # skip vision tensors and remove "language_model." for Kimi-VL
@@ -7257,11 +7252,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
         # skip lm_head.weight if tie_word_embeddings is True
         if self.hparams.get("tie_word_embeddings", False):
-            # Save token_embd for potential duplication as output if tie_word_embeddings is True
-            if name == "model.embed_tokens.weight":
-                self._token_embd = data_torch
             if name == "lm_head.weight" or name == "model.lm_head.weight":
-                logger.info("Skipping tied output layer 'lm_head.weight' - will duplicate from token_embd.weight")
+                logger.info("Skipping tied output layer 'lm_head.weight' (will use token_embd.weight)")
                 return []
 
         # rename e_score_correction_bias tensors
@@ -7337,10 +7329,6 @@ def prepare_tensors(self):
             experts = [k for d in self._experts for k in d.keys()]
             if len(experts) > 0:
                 raise ValueError(f"Unprocessed experts: {experts}")
-        if self._token_embd is not None:
-            logger.info("Model has tie_word_embeddings=True but no lm_head.weight found - adding output.weight from token_embd.weight")
-            output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
-            self.gguf_writer.add_tensor(output_name, self._token_embd.numpy())
 
 @ModelBase.register("MiniMaxM2ForCausalLM")
 class MiniMaxM2Model(TextModel):
@@ -10521,7 +10509,14 @@ def set_gguf_parameters(self):
             raise ValueError(f"Unsupported activation function for UTUVL: {hidden_act}")
 
         self.gguf_writer.add_vision_spatial_merge_size(self.hparams.get("spatial_merge_size", 2))
-
+
+        window_size = self.hparams.get("window_size")
+        if window_size is not None:
+            self.gguf_writer.add_vision_window_size(window_size)
+        fullatt_block_indexes = self.hparams.get("fullatt_block_indexes")
+        assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for utuvl"
+        self.gguf_writer.add_vision_wa_layers(layers=fullatt_block_indexes)
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
 

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -145,6 +145,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
     {"name": "minimax-m2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
     {"name": "kormo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", },
+    {"name": "utu-vl", "tokt": TOKENIZER_TYPE.BPE, "repo": "./Youtu-VL", },
 ]
 
 # some models are known to be broken upstream, so we will skip them as exceptions

gguf-py/gguf/constants.py

Lines changed: 2 additions & 1 deletion
@@ -293,8 +293,9 @@ class ClipVision:
         SPATIAL_MERGE_SIZE = "clip.vision.spatial_merge_size"
         USE_GELU = "clip.use_gelu"
         USE_SILU = "clip.use_silu"
-        N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl
+        WA_LAYERS = "clip.vision.wa_layers" # used by qwen2.5vl and utuvl
         IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers"
+        WINDOW_SIZE = "clip.vision.window_size"
 
         class Attention:
             HEAD_COUNT = "clip.vision.attention.head_count"

gguf-py/gguf/gguf_writer.py

Lines changed: 5 additions & 2 deletions
@@ -1128,12 +1128,15 @@ def add_vision_use_silu(self, value: bool) -> None:
     def add_vision_projector_scale_factor(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
 
-    def add_vision_n_wa_pattern(self, value: int) -> None:
-        self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)
+    def add_vision_wa_layers(self, layers: Sequence[int]) -> None:
+        self.add_array(Keys.ClipVision.WA_LAYERS, layers)
 
     def add_vision_is_deepstack_layers(self, layers: Sequence[bool]) -> None:
        self.add_array(Keys.ClipVision.IS_DEEPSTACK_LAYERS, layers)
 
+    def add_vision_window_size(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.WINDOW_SIZE, value)
+
     # audio models
 
     def add_audio_projection_dim(self, value: int) -> None:
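
A minimal gguf-py sketch of how a converter could call the two new writer methods; the output path, architecture string, and values here are hypothetical:

import gguf

# hypothetical output file and values; only the two new keys are written
writer = gguf.GGUFWriter("vision-encoder.gguf", "clip")
writer.add_vision_wa_layers([7, 15, 23, 31])  # -> clip.vision.wa_layers (array)
writer.add_vision_window_size(112)            # -> clip.vision.window_size (uint32)

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
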

src/llama-model.cpp

Lines changed: 10 additions & 2 deletions
@@ -4699,7 +4699,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                    // try to load output.weight, if not found, use token_embd (tied embeddings)
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    if (!output) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -4762,7 +4766,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                    // try to load output.weight, if not found, use token_embd (tied embeddings)
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    if (!output) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
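
With the conversion-time duplication removed, a tied-embedding GGUF simply omits output.weight and the loader falls back to token_embd.weight (TENSOR_DUPLICATED). A gguf-py sketch for checking a converted file; the path is hypothetical:

from gguf import GGUFReader

reader = GGUFReader("kimi-vl-a3b.gguf")  # hypothetical path to a converted model
names = {t.name for t in reader.tensors}

print("token_embd.weight" in names)  # expected: True
print("output.weight" in names)      # expected: False when tie_word_embeddings is set
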

tools/mtmd/clip-impl.h

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@
 #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
 #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
 #define KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution"
-#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern"
+#define KEY_WIN_ATTN_LAYERS "clip.vision.wa_layers"
 #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size"
 #define KEY_MINICPMV_VERSION "clip.minicpmv_version"
 #define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"

tools/mtmd/clip-model.h

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@ struct clip_hparams {
     int32_t image_crop_resolution;
     std::unordered_set<int32_t> vision_feature_layer;
     int32_t attn_window_size = 0;
-    int32_t n_wa_pattern = 0;
+    std::unordered_set<int32_t> wa_layers; // window attention full layers
 
     // audio
     int32_t n_mel_bins = 0; // whisper preprocessor

tools/mtmd/clip.cpp

Lines changed: 23 additions & 4 deletions
@@ -1151,7 +1151,14 @@ struct clip_model_loader {
                 {
                     hparams.n_merge = 2; // default value for Qwen 2 and 2.5
                     get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
-                    get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
+                    // load window attention layers (only 2.5 requires it)
+                    if (model.proj_type == PROJECTOR_TYPE_QWEN25VL) {
+                        std::vector<int> wa_layers_vec;
+                        get_arr_int(KEY_WIN_ATTN_LAYERS, wa_layers_vec, true);
+                        for (auto & layer : wa_layers_vec) {
+                            hparams.wa_layers.insert(layer);
+                        }
+                    }
                     // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
                     hparams.set_limit_image_tokens(8, 4096);
                     hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
@@ -1166,6 +1173,12 @@ struct clip_model_loader {
                 {
                     hparams.n_merge = 2;
                     get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+                    get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
+                    std::vector<int> wa_layers_vec;
+                    get_arr_int(KEY_WIN_ATTN_LAYERS, wa_layers_vec, true);
+                    for (auto & layer : wa_layers_vec) {
+                        hparams.wa_layers.insert(layer);
+                    }
                     hparams.set_limit_image_tokens(8, 4096);
                     hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
                     const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
@@ -1240,7 +1253,13 @@ struct clip_model_loader {
         LOG_INF("%s: has_llava_proj: %d\n", __func__, hparams.has_llava_projector);
         LOG_INF("%s: minicpmv_version: %d\n", __func__, hparams.minicpmv_version);
         LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
-        LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
+        if (!hparams.wa_layers.empty()) {
+            LOG_INF("%s: wa_layers: ", __func__);
+            for (auto & layer : hparams.wa_layers) {
+                LOG_INF("%d ", layer);
+            }
+            LOG_INF("\n");
+        }
         if (hparams.image_min_pixels > 0) {
             LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
         }
@@ -3346,7 +3365,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             {
                 // pw * ph = number of tokens output by ViT after apply patch merger
                 // ipw * ipw = number of vision token been processed inside ViT
-                const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
+                const bool use_window_attn = !hparams.wa_layers.empty(); // for qwen2.5vl
                 const int merge_ratio = 2;
                 const int pw = image_size_width / patch_size / merge_ratio;
                 const int ph = image_size_height / patch_size / merge_ratio;
@@ -3357,7 +3376,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             std::vector<int> inv_idx(ph * pw);
 
             if (use_window_attn) {
-                const int attn_window_size = ctx->model.proj_type == PROJECTOR_TYPE_QWEN25VL ? 112 : patch_size * 2 * 8;
+                const int attn_window_size = hparams.attn_window_size > 0 ? hparams.attn_window_size : 112;
                 const int grid_window = attn_window_size / patch_size / merge_ratio;
                 int dst = 0;
                 // [num_vision_tokens, num_vision_tokens] attention mask tensor
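
The window grid is derived from the stored window size; a small worked example with values typical for Qwen2.5-VL (112-pixel windows, 14-pixel patches), used here purely as an illustration:

attn_window_size = 112  # clip.vision.window_size, falling back to 112 when absent
patch_size = 14
merge_ratio = 2

grid_window = attn_window_size // patch_size // merge_ratio
print(grid_window)  # 4 -> each attention window covers a 4x4 grid of merged patches
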

tools/mtmd/models/qwen2vl.cpp

Lines changed: 2 additions & 3 deletions
@@ -5,8 +5,7 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
     GGML_ASSERT(model.class_embedding == nullptr);
 
     const int batch_size = 1;
-    const bool use_window_attn = hparams.n_wa_pattern > 0;
-    const int n_wa_pattern = hparams.n_wa_pattern;
+    const bool use_window_attn = !hparams.wa_layers.empty();
     const int n_pos = n_patches;
     const int num_position_ids = n_pos * 4; // m-rope requires 4 dim per position
 
@@ -79,7 +78,7 @@ ggml_cgraph * clip_graph_qwen2vl::build() {
     // loop over layers
     for (int il = 0; il < n_layer; il++) {
         const auto & layer = model.layers[il];
-        const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true;
+        const bool full_attn = use_window_attn ? hparams.wa_layers.count(il) > 0 : true;
 
         ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
 
tools/mtmd/models/utuvl.cpp

Lines changed: 28 additions & 25 deletions
@@ -3,7 +3,7 @@
 ggml_cgraph * clip_graph_utuvl::build() {
     GGML_ASSERT(model.class_embedding == nullptr);
     const int batch_size = 1;
-    const bool use_window_attn = true;
+    const bool use_window_attn = !hparams.wa_layers.empty();
     const int n_pos = n_patches;
     const int num_position_ids = n_pos * 4;
     const int m = 2;
@@ -17,29 +17,32 @@ ggml_cgraph * clip_graph_utuvl::build() {
 
     ggml_tensor * inp = build_inp_raw();
 
-    inp = ggml_reshape_4d(
-        ctx0, inp,
-        Wm * m * patch_size, m * patch_size, Hm, 3);
-    inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
-    inp = ggml_cont_4d(
-        ctx0, inp,
-        m * patch_size * 3, Wm, m * patch_size, Hm);
-
-    inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
-    inp = ggml_cont_4d(
-        ctx0, inp,
-        m * patch_size * 3, patch_size, m, Hm * Wm);
-
-    inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
-    inp = ggml_cont_4d(
-        ctx0, inp,
-        patch_size, 3, patch_size, Hm * Wm * m * m);
-
-    inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
-    inp = ggml_cont_3d(
-        ctx0, inp,
-        3*patch_size* patch_size, Hm * Wm * m * m, 1);
-
+    // change conv3d to linear
+    // reshape and permute to get patches, permute from (patch_size, m, Wm, patch_size, m, Hm, C) to (C, patch_size, patch_size, m, m, Wm, Hm)
+    {
+        inp = ggml_reshape_4d(
+            ctx0, inp,
+            Wm * m * patch_size, m * patch_size, Hm, 3);
+        inp = ggml_permute(ctx0, inp, 1, 2, 3, 0);
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            m * patch_size * 3, Wm, m * patch_size, Hm);
+
+        inp = ggml_permute(ctx0, inp, 0, 2, 1, 3);
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            m * patch_size * 3, patch_size, m, Hm * Wm);
+
+        inp = ggml_permute(ctx0, inp, 1, 0, 2, 3);
+        inp = ggml_cont_4d(
+            ctx0, inp,
+            patch_size, 3, patch_size, Hm * Wm * m * m);
+
+        inp = ggml_permute(ctx0, inp, 2, 0, 1, 3);
+        inp = ggml_cont_3d(
+            ctx0, inp,
+            3*patch_size* patch_size, Hm * Wm * m * m, 1);
+    }
     inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
 
     if (model.patch_bias) {
@@ -85,7 +88,7 @@ ggml_cgraph * clip_graph_utuvl::build() {
     // loop over layers
     for (int il = 0; il < n_layer; il++) {
         const auto & layer = model.layers[il];
-        const bool full_attn = (il + 1) % 8 == 0 || il == n_layer - 1;
+        const bool full_attn = use_window_attn ? hparams.wa_layers.count(il) > 0 : true;
 
         ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
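
The hard-coded rule ((il + 1) % 8 == 0 or last layer) is replaced by a lookup into the metadata-driven set; a sketch with illustrative values showing when the two selections agree:

n_layer = 24
wa_layers = {7, 15, 23}  # hypothetical contents of clip.vision.wa_layers

old_full_attn = [il for il in range(n_layer) if (il + 1) % 8 == 0 or il == n_layer - 1]
new_full_attn = [il for il in range(n_layer) if il in wa_layers]

print(old_full_attn)  # [7, 15, 23]
print(new_full_attn)  # [7, 15, 23] -- identical only when the metadata matches the old pattern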
