From 531a712115b9cc288ebcefe6e452726f854fbd57 Mon Sep 17 00:00:00 2001
From: "Peng, Bo"
Date: Mon, 31 Mar 2025 15:58:42 +0800
Subject: [PATCH 1/2] add extra_options use_channel_wised_quantization to builder.py, quantize the model with block size = K

---
 src/python/py/models/builder.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 65e01797f7..426c67f718 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -302,6 +302,7 @@ def __init__(
                 "algo_config": int4_algo_config,
             },
             "use_qdq": extra_options.get("use_qdq", False),
+            "use_channel_wised_quantization": extra_options.get("use_channel_wised_quantization", False),
         }
         if self.quant_type is not None:
             # Create quantized attributes from quantization config
@@ -482,6 +483,7 @@ def to_int4(self) -> ir.Model:
             quant_format=QuantFormat.QDQ if self.quant_attrs["use_qdq"] else QuantFormat.QOperator,
             op_types_to_quantize=self.quant_attrs["int4"]["op_types_to_quantize"],
             algo_config=self.quant_attrs["int4"]["algo_config"],
+            channel_wised_quantize = self.quant_attrs["use_channel_wised_quantization"],
         )
         quant.process()
         return ir.from_proto(quant.model.model)
@@ -3637,7 +3639,7 @@ def check_extra_options(kv_pairs):
     """
     Check key-value pairs and set values correctly
     """
-    bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32"]
+    bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32", "use_channel_wised_quantization"]
     for key in bools:
         if key in kv_pairs:
             if kv_pairs[key] in {"false", "False", "0"}:
@@ -3947,6 +3949,8 @@ def get_args():
                Use this option to enable GPUs that do not support FP16 on WebGPU (e.g. GTX 10xx).
            adapter_path = Path to folder on disk containing the adapter files (adapter_config.json and adapter model weights).
                Use this option for LoRA models.
+           use_channel_wised_quantization = Use channel-wise quantization, in which the block size equals the number of rows (K).
+               Use this option when you want to use K as the block size. Default is False.
         """),
     )

From 260e02f7e884bc9e12446d1ce80970802ee13411 Mon Sep 17 00:00:00 2001
From: "Peng, Bo"
Date: Tue, 29 Jul 2025 10:28:35 +0800
Subject: [PATCH 2/2] add int4_ prefix to use_channel_wised_quantization and fix typo

---
 src/python/py/models/builder.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/python/py/models/builder.py b/src/python/py/models/builder.py
index 426c67f718..26c160dbc6 100644
--- a/src/python/py/models/builder.py
+++ b/src/python/py/models/builder.py
@@ -300,9 +300,9 @@ def __init__(
                 "op_types_to_quantize": extra_options.get("int4_op_types_to_quantize", ("MatMul", )),
                 "nodes_to_exclude": extra_options.get("int4_nodes_to_exclude", []),
                 "algo_config": int4_algo_config,
+                "use_channel_wised_quantization": extra_options.get("int4_use_channel_wised_quantization", False),
             },
             "use_qdq": extra_options.get("use_qdq", False),
-            "use_channel_wised_quantization": extra_options.get("use_channel_wised_quantization", False),
         }
         if self.quant_type is not None:
             # Create quantized attributes from quantization config
@@ -483,7 +483,7 @@ def to_int4(self) -> ir.Model:
             quant_format=QuantFormat.QDQ if self.quant_attrs["use_qdq"] else QuantFormat.QOperator,
             op_types_to_quantize=self.quant_attrs["int4"]["op_types_to_quantize"],
             algo_config=self.quant_attrs["int4"]["algo_config"],
-            channel_wised_quantize = self.quant_attrs["use_channel_wised_quantization"],
+            channel_wised_quantize = self.quant_attrs["int4"]["use_channel_wised_quantization"],
         )
         quant.process()
         return ir.from_proto(quant.model.model)
@@ -3639,7 +3639,7 @@ def check_extra_options(kv_pairs):
     """
     Check key-value pairs and set values correctly
     """
-    bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32", "use_channel_wised_quantization"]
+    bools = ["int4_is_symmetric", "int4_use_channel_wised_quantization", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32"]
     for key in bools:
         if key in kv_pairs:
             if kv_pairs[key] in {"false", "False", "0"}:
@@ -3918,6 +3918,8 @@ def get_args():
                Currently supported options are: 'default', 'rtn', 'k_quant_mixed', 'k_quant_last'.
                k_quant_mixed = k_quant algorithm with mixed precision (int4 + int8).
                k_quant_last = k_quant algorithm where only the last MatMul (/lm_head/MatMul) is quantized as int8. Other MatMuls are quantized as int4.
+           int4_use_channel_wised_quantization = Use channel-wise quantization, in which the block size equals the number of rows (K).
+               Use this option when you want to use K as the block size. Default is False.
            num_hidden_layers = Manually specify the number of layers in your ONNX model (for unit testing purposes).
            filename = Filename for ONNX model (default is 'model.onnx').
                For models with multiple components, each component is exported to its own ONNX model.
@@ -3949,8 +3951,6 @@ def get_args():
                Use this option to enable GPUs that do not support FP16 on WebGPU (e.g. GTX 10xx).
            adapter_path = Path to folder on disk containing the adapter files (adapter_config.json and adapter model weights).
                Use this option for LoRA models.
-           use_channel_wised_quantization = Use channel-wise quantization, in which the block size equals the number of rows (K).
-               Use this option when you want to use K as the block size. Default is False.
         """),
     )
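With both patches applied, channel-wise int4 quantization (block size = K) is requested through the builder's --extra_options flag, since check_extra_options() now parses int4_use_channel_wised_quantization as a boolean and to_int4() forwards it to the ONNX Runtime quantizer as channel_wised_quantize. A minimal usage sketch, assuming the standard builder.py command line; the model name, output folder, and execution provider below are placeholders:

    python3 builder.py -m <model_name> -o <output_folder> -p int4 -e cpu --extra_options int4_use_channel_wised_quantization=true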