6 changes: 5 additions & 1 deletion src/python/py/models/builder.py
@@ -300,6 +300,7 @@ def __init__(
"op_types_to_quantize": extra_options.get("int4_op_types_to_quantize", ("MatMul", )),
"nodes_to_exclude": extra_options.get("int4_nodes_to_exclude", []),
"algo_config": int4_algo_config,
"use_channel_wised_quantization": extra_options.get("int4_use_channel_wised_quantization", False),
},
"use_qdq": extra_options.get("use_qdq", False),
}
@@ -482,6 +483,7 @@ def to_int4(self) -> ir.Model:
quant_format=QuantFormat.QDQ if self.quant_attrs["use_qdq"] else QuantFormat.QOperator,
op_types_to_quantize=self.quant_attrs["int4"]["op_types_to_quantize"],
algo_config=self.quant_attrs["int4"]["algo_config"],
channel_wised_quantize=self.quant_attrs["int4"]["use_channel_wised_quantization"],
)
quant.process()
return ir.from_proto(quant.model.model)
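The channel_wised_quantize flag forwarded to the quantizer above switches from fixed-size block quantization to per-output-channel quantization, i.e. the quantization block spans the full K dimension of each MatMul weight. The NumPy sketch below only illustrates that idea; it is not the quantizer's implementation, and the symmetric int4 scheme is an assumption made for the example.

```python
# Illustration only (not the quantizer's code): "channel-wise" here means the
# quantization block covers all K rows, so each output column of a MatMul
# weight W (K x N) gets a single int4 scale.
import numpy as np

def quantize_int4_per_channel(W: np.ndarray):
    """Symmetric int4 quantization with block size = K (one scale per column)."""
    scales = np.abs(W).max(axis=0) / 7.0            # one scale per output channel
    scales = np.where(scales == 0.0, 1.0, scales)   # guard against all-zero columns
    q = np.clip(np.round(W / scales), -8, 7).astype(np.int8)
    return q, scales

W = np.random.randn(256, 64).astype(np.float32)     # K = 256 rows, N = 64 columns
q, scales = quantize_int4_per_channel(W)
W_hat = q.astype(np.float32) * scales                # dequantize for an error check
print(q.shape, scales.shape, float(np.abs(W - W_hat).max()))
```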
@@ -3637,7 +3639,7 @@ def check_extra_options(kv_pairs):
"""
Check key-value pairs and set values correctly
"""
bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32"]
bools = ["int4_is_symmetric", "int4_use_channel_wised_quantization", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32"]
for key in bools:
if key in kv_pairs:
if kv_pairs[key] in {"false", "False", "0"}:
@@ -3916,6 +3918,8 @@ def get_args():
Currently supported options are: 'default', 'rtn', 'k_quant_mixed', 'k_quant_last'.
k_quant_mixed = k_quant algorithm with mixed precision (int4 + int8).
k_quant_last = k_quant algorithm where only the last MatMul (/lm_head/MatMul) is quantized as int8. Other MatMuls are quantized as int4.
int4_use_channel_wised_quantization = Use channel-wise quantization, in which the block size equals the number of rows (K).
Use this option when you want to use K as the block size. Default is false.
num_hidden_layers = Manually specify the number of layers in your ONNX model (for unit testing purposes).
filename = Filename for ONNX model (default is 'model.onnx').
For models with multiple components, each component is exported to its own ONNX model.
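Assuming the builder's usual key=value syntax for --extra_options, a user would enable the new behavior with something like --extra_options int4_use_channel_wised_quantization=true when building an int4 model; the value then flows through check_extra_options into quant_attrs["int4"] and finally into the quantizer call in to_int4 shown above.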