6 changes: 5 additions & 1 deletion src/python/py/models/builder.py
@@ -300,6 +300,7 @@ def __init__(
"op_types_to_quantize": extra_options.get("int4_op_types_to_quantize", ("MatMul", )),
"nodes_to_exclude": extra_options.get("int4_nodes_to_exclude", []),
"algo_config": int4_algo_config,
"use_channel_wised_quantization": extra_options.get("int4_use_channel_wised_quantization", False),
},
"use_qdq": extra_options.get("use_qdq", False),
}
@@ -482,6 +483,7 @@ def to_int4(self) -> ir.Model:
quant_format=QuantFormat.QDQ if self.quant_attrs["use_qdq"] else QuantFormat.QOperator,
op_types_to_quantize=self.quant_attrs["int4"]["op_types_to_quantize"],
algo_config=self.quant_attrs["int4"]["algo_config"],
channel_wised_quantize=self.quant_attrs["int4"]["use_channel_wised_quantization"],
)
quant.process()
return ir.from_proto(quant.model.model)
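The channel_wised_quantize flag forwarded to the quantizer above switches from fixed-size block quantization to per-output-channel quantization, i.e. the quantization block spans the full K dimension of each MatMul weight. The NumPy sketch below only illustrates that idea; it is not the quantizer's implementation, and the symmetric int4 scheme is an assumption made for the example.

```python
# Illustration only (not the quantizer's code): "channel-wise" here means the
# quantization block covers all K rows, so each output column of a MatMul
# weight W (K x N) gets a single int4 scale.
import numpy as np

def quantize_int4_per_channel(W: np.ndarray):
    """Symmetric int4 quantization with block size = K (one scale per column)."""
    scales = np.abs(W).max(axis=0) / 7.0            # one scale per output channel
    scales = np.where(scales == 0.0, 1.0, scales)   # guard against all-zero columns
    q = np.clip(np.round(W / scales), -8, 7).astype(np.int8)
    return q, scales

W = np.random.randn(256, 64).astype(np.float32)     # K = 256 rows, N = 64 columns
q, scales = quantize_int4_per_channel(W)
W_hat = q.astype(np.float32) * scales                # dequantize for an error check
print(q.shape, scales.shape, float(np.abs(W - W_hat).max()))
```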
@@ -3637,7 +3639,7 @@ def check_extra_options(kv_pairs):
"""
Check key-value pairs and set values correctly
"""
bools = ["int4_is_symmetric", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32"]
bools = ["int4_is_symmetric", "int4_use_channel_wised_quantization", "exclude_embeds", "exclude_lm_head", "include_hidden_states", "enable_cuda_graph", "use_8bits_moe", "use_qdq", "use_webgpu_fp32"]
for key in bools:
if key in kv_pairs:
if kv_pairs[key] in {"false", "False", "0"}:
@@ -3916,6 +3918,8 @@ def get_args():
Currently supported options are: 'default', 'rtn', 'k_quant_mixed', 'k_quant_last'.
k_quant_mixed = k_quant algorithm with mixed precision (int4 + int8).
k_quant_last = k_quant algorithm where only the last MatMul (/lm_head/MatMul) is quantized as int8. Other MatMuls are quantized as int4.
int4_use_channel_wised_quantization = Use channel-wise quantization, in which the block size equals the number of rows (K).
Use this option when you want to use K as the block size. Default is false.
num_hidden_layers = Manually specify the number of layers in your ONNX model (for unit testing purposes).
filename = Filename for ONNX model (default is 'model.onnx').
For models with multiple components, each component is exported to its own ONNX model.
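Assuming the builder's usual key=value syntax for --extra_options, a user would enable the new behavior with something like --extra_options int4_use_channel_wised_quantization=true when building an int4 model; the value then flows through check_extra_options into quant_attrs["int4"] and finally into the quantizer call in to_int4 shown above.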