From 531a712115b9cc288ebcefe6e452726f854fbd57 Mon Sep 17 00:00:00 2001
From: "Peng, Bo"
Date: Mon, 31 Mar 2025 15:58:42 +0800
Subject: [PATCH 1/2] add extra_options use_channel_wised_quantization to builder.py, quantize the model with block size = K

---
 src/python/py/models/builder.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 65e01797f7..426c67f718 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -302,6 +302,7 @@ def __init__(
                 "algo_config": int4_algo_config,
             },
             "use_qdq": extra_options.get("use_qdq", False),
+            "use_channel_wised_quantization": extra_options.get("use_channel_wised_quantization", False),
         }
         if self.quant_type is not None:
             # Create quantized attributes from quantization config
@@ -482,6 +483,7 @@ def to_int4(self) -> ir.Model:
             quant_format=QuantFormat.QDQ if self.quant_attrs["use_qdq"] else QuantFormat.QOperator,
             op_types_to_quantize=self.quant_attrs["int4"]["op_types_to_quantize"],
             algo_config=self.quant_attrs["int4"]["algo_config"],
+            channel_wised_quantize = self.quant_attrs["use_channel_wised_quantization"],
         )
         quant.process()
         return ir.from_proto(quant.model.model)
@@ -3637,7 +3639,7 @@ def check_extra_options(kv_pairs):
     """
     Check key-value pairs and set values correctly
     """
-    bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32"]
+    bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32", "use_channel_wised_quantization"]
     for key in bools:
         if key in kv_pairs:
             if kv_pairs[key] in {"false", "False", "0"}:
@@ -3947,6 +3949,8 @@ def get_args():
                Use this option to enable GPUs that do not support FP16 on WebGPU (e.g. GTX 10xx).
            adapter_path = Path to folder on disk containing the adapter files (adapter_config.json and adapter model weights).
                Use this option for LoRA models.
+           use_channel_wised_quantization = Use channel-wise quantization, in which the block size equals the number of rows (K).
+               Use this option when you want to use K as the block size. Default is False.
         """),
     )

From 260e02f7e884bc9e12446d1ce80970802ee13411 Mon Sep 17 00:00:00 2001
From: "Peng, Bo"
Date: Tue, 29 Jul 2025 10:28:35 +0800
Subject: [PATCH 2/2] add int4_ prefix to use_channel_wised_quantization and fix typo

---
 src/python/py/models/builder.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 426c67f718..26c160dbc6 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -300,9 +300,9 @@ def __init__(
                 "op_types_to_quantize": extra_options.get("int4_op_types_to_quantize", ("MatMul", )),
                 "nodes_to_exclude": extra_options.get("int4_nodes_to_exclude", []),
                 "algo_config": int4_algo_config,
+                "use_channel_wised_quantization": extra_options.get("int4_use_channel_wised_quantization", False),
             },
             "use_qdq": extra_options.get("use_qdq", False),
-            "use_channel_wised_quantization": extra_options.get("use_channel_wised_quantization", False),
         }
         if self.quant_type is not None:
             # Create quantized attributes from quantization config
@@ -483,7 +483,7 @@ def to_int4(self) -> ir.Model:
             quant_format=QuantFormat.QDQ if self.quant_attrs["use_qdq"] else QuantFormat.QOperator,
             op_types_to_quantize=self.quant_attrs["int4"]["op_types_to_quantize"],
             algo_config=self.quant_attrs["int4"]["algo_config"],
-            channel_wised_quantize = self.quant_attrs["use_channel_wised_quantization"],
+            channel_wised_quantize = self.quant_attrs["int4"]["use_channel_wised_quantization"],
         )
         quant.process()
         return ir.from_proto(quant.model.model)
@@ -3639,7 +3639,7 @@ def check_extra_options(kv_pairs):
     """
     Check key-value pairs and set values correctly
     """
-    bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32", "use_channel_wised_quantization"]
+    bools = ["int4_is_symmetric", "int4_use_channel_wised_quantization", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32"]
     for key in bools:
         if key in kv_pairs:
             if kv_pairs[key] in {"false", "False", "0"}:
@@ -3918,6 +3918,8 @@ def get_args():
                Currently supported options are: 'default', 'rtn', 'k_quant_mixed', 'k_quant_last'.
                k_quant_mixed = k_quant algorithm with mixed precision (int4 + int8).
                k_quant_last = k_quant algorithm where only the last MatMul (/lm_head/MatMul) is quantized as int8. Other MatMuls are quantized as int4.
+           int4_use_channel_wised_quantization = Use channel-wise quantization, in which the block size equals the number of rows (K).
+               Use this option when you want to use K as the block size. Default is False.
            num_hidden_layers = Manually specify the number of layers in your ONNX model (for unit testing purposes).
            filename = Filename for ONNX model (default is 'model.onnx').
                For models with multiple components, each component is exported to its own ONNX model.
@@ -3949,8 +3951,6 @@ def get_args():
                Use this option to enable GPUs that do not support FP16 on WebGPU (e.g. GTX 10xx).
            adapter_path = Path to folder on disk containing the adapter files (adapter_config.json and adapter model weights).
                Use this option for LoRA models.
-           use_channel_wised_quantization = Use channel-wise quantization, in which the block size equals the number of rows (K).
-               Use this option when you want to use K as the block size. Default is False.
         """),
     )
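With both patches applied, channel-wise int4 quantization (block size = K) is requested through the builder's --extra_options flag, since check_extra_options() now parses int4_use_channel_wised_quantization as a boolean and to_int4() forwards it to the ONNX Runtime quantizer as channel_wised_quantize. A minimal usage sketch, assuming the standard builder.py command line; the model name, output folder, and execution provider below are placeholders:

    python3 builder.py -m <model_name> -o <output_folder> -p int4 -e cpu --extra_options int4_use_channel_wised_quantization=true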