8. 示例代码
8.1. ResNet50模型量化示例
下面示例介绍如何量化一个ResNet50 ONNX模型,并导出量化后HMONNX模型。示例展示关键步骤代码,仅供参考,不可以直接拷贝运行。
import argparse
from pathlib import Path
import onnx
import torch
from xhquant.api import (
DeviceType,
HMONNXGoldenInference,
QuantScheme,
convert_onnx_to_hmonnx,
create_quant_config,
xhquant_init,
)
def main(args):
    """Quantize a ResNet50 ONNX model, export it as HMONNX and run one golden pass.

    Args:
        args: Parsed command-line namespace. This function reads ``args.onnx``,
            ``args.batch_size``, ``args.quant_type`` and ``args.debug``.
    """
    # The first argument is None here (the LLM examples in this document pass
    # a log-file path in that position).
    xhquant_init(None, debug=args.debug)
    onnx_file = args.onnx

    # 1. Load the ONNX model ======================
    onnx_model = onnx.load(onnx_file)

    # 2. Path configuration ======================
    # Read the declared shape of the first graph input, then overwrite
    # dimension 0 with the batch size given by the --batch-size option.
    shape_proto = onnx_model.graph.input[0].type.tensor_type.shape
    input_shape = [int(dim.dim_value) for dim in shape_proto.dim]
    input_shape[0] = args.batch_size
    str_shape = "x".join(str(dim) for dim in input_shape)

    # Output locations are derived from the model file stem plus the input shape.
    onnx_name = f"{Path(onnx_file).stem}_{str_shape}"
    work_dirs = Path("work_dirs") / onnx_name
    work_dirs.mkdir(exist_ok=True, parents=True)
    target_device = DeviceType.XH2a
    hmonnx_path = work_dirs / "hmonnx" / f"{onnx_name}_{target_device}.onnx"
    hmonnx_path.parent.mkdir(exist_ok=True, parents=True)
    out_hmonnx_file = str(hmonnx_path)

    # 3. Quantization settings ======================
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Random float32 tensor handed to the converter and later fed to the golden
    # session; named `dummy_input` so the builtin `input` is not shadowed.
    dummy_input = torch.randn(input_shape, dtype=torch.float32)
    quant_type = args.quant_type
    quant_scheme = QuantScheme(target_device=target_device, quant_type=quant_type)
    quant_config = create_quant_config(quant_scheme)

    # 4. Quantize and export to HMONNX ======================
    convert_onnx_to_hmonnx(
        onnx_file,
        [dummy_input],
        target_device,
        out_hmonnx_file,
        quant_config=quant_config,
        input_names=["images"],
        output_names=["cls_score"],
    )

    # 5. Generate the model in the format required by the chip ======================
    # Run the exported model once with golden saving switched on; results go
    # under <work_dirs>/hmonnx/golden.
    session = HMONNXGoldenInference(out_hmonnx_file)
    session.to(device)
    session.save_golden = True
    session.golden_dir = work_dirs / "hmonnx/golden"
    session.step = 0
    # The session is called with the input cast to half precision.
    session(dummy_input.half().to(device))
if __name__ == "__main__":
    # Command-line entry point: build the options read by main() and run it.
    parser = argparse.ArgumentParser()
    parser.add_argument("--onnx", type=str, default="data/models/resnet/resnet50_224x224.onnx")
    parser.add_argument("--debug", action="store_true", help="debug mode")
    # %(default)s makes argparse print the actual default instead of a
    # hand-written value that can drift out of date.
    parser.add_argument("--quant-type", default="w8a8_sefp", help="quant type (default: %(default)s)")
    parser.add_argument("--batch-size", type=int, default=1)
    args = parser.parse_args()
    main(args)
量化后生成的 HMONNX 模型保存在 out_hmonnx_file 参数指定的路径。用户可将上面的示例代码保存为 resnet50_quantization.py 文件,然后运行下面的命令完成模型量化和导出:
python resnet50_quantization.py --onnx {ONNX_Path} --batch-size 1
需将 {ONNX_Path} 替换为要量化的模型文件名。
8.2. YOLOv8模型量化示例
下面示例介绍如何量化一个YOLOv8 ONNX模型,并导出量化后HMONNX模型。示例展示关键步骤代码,仅供参考,不可以直接拷贝运行。
import argparse
from pathlib import Path
import torch
from xhquant.api import (
DeviceType,
QuantScheme,
convert_onnx_to_hmonnx,
create_quant_config,
get_root_logger,
xhquant_init,
)
def main(args):
    """Quantize a YOLOv8 ONNX model and export the result as an HMONNX file.

    Args:
        args: Parsed command-line namespace; ``args.onnx``, ``args.debug`` and
            ``args.quant_type`` are read.
    """
    # 1. Resolve the ONNX path and prepare the output directories ======================
    source_onnx = args.onnx
    stem = Path(source_onnx).stem
    output_root = Path("work_dirs") / stem
    output_root.mkdir(exist_ok=True, parents=True)
    device_type = DeviceType.XH2a
    hmonnx_dir = output_root / "hmonnx"
    hmonnx_dir.mkdir(exist_ok=True, parents=True)
    hmonnx_file = str(hmonnx_dir / f"{stem}_{device_type}.onnx")
    xhquant_init(None, debug=args.debug)

    # 2. Quantization settings ======================
    scheme = QuantScheme(target_device=DeviceType.XH2a, quant_type=args.quant_type)
    config = create_quant_config(scheme)

    # 3. Quantize and convert to HMONNX ======================
    # A random 1x3x640x640 float32 tensor is supplied as the model input.
    sample = torch.randn(1, 3, 640, 640, dtype=torch.float32)
    convert_onnx_to_hmonnx(
        source_onnx,
        [sample],
        DeviceType.XH2a,
        hmonnx_file,
        quant_config=config,
        input_names=["images"],
        output_names=["outs"],
    )

    # Report where the converted model was written.
    get_root_logger().info(f"Convert onnx to hmonnx success, out hmonnx file to: {hmonnx_file}")
if __name__ == "__main__":
    # Command-line entry point: build the options read by main() and run it.
    parser = argparse.ArgumentParser()
    parser.add_argument("--onnx", type=str, default="data/models/yolo/yolov8m.onnx")
    parser.add_argument("--debug", action="store_true", help="debug mode")
    # NOTE(review): main() in this example never reads args.image; the option is
    # kept so existing command lines keep parsing.
    parser.add_argument("--image", type=str, default="data/images/000000001490.jpg")
    # %(default)s makes argparse print the actual default instead of a
    # hand-written value that can drift out of date.
    parser.add_argument("--quant-type", default="w8a8_sefp", help="quant type (default: %(default)s)")
    args = parser.parse_args()
    main(args)
量化后生成的 HMONNX 模型保存在 out_hmonnx_file 参数指定的路径。用户可将上面的示例代码保存为 yolov8_quantization.py 文件,然后运行下面的命令完成模型量化和导出:
python yolov8_quantization.py --onnx {ONNX_Path}
需将 {ONNX_Path} 替换为要量化的模型文件名。
8.3. Qwen3-14B模型量化示例
下面示例介绍如何量化一个Qwen3-14B模型,并导出量化后HMONNX模型。示例展示关键步骤代码,仅供参考,不可以直接拷贝运行。
import argparse
import os.path as osp
from pathlib import Path
from xh_model_zoo.utils import MemoryTracker, TimeProfiler
from xh_model_zoo.xh_llm import LLMConverter
from xh_model_zoo.xh_llm.models.qwen2 import Qwen2ConvertConfig
from xhquant.api import DeviceType, get_root_logger, xhquant_init, QuantScheme # isort:skip
def main(args):
    """Convert and quantize a Qwen3-14B HuggingFace checkpoint.

    Args:
        args: Parsed command-line namespace; reads ``args.model``,
            ``args.quant_type``, ``args.context_length``,
            ``args.input_sequence_length``, ``args.quant_weight`` and
            ``args.debug``.
    """
    # Absolute, normalized checkpoint directory; its last component names the model.
    checkpoint_dir = osp.normpath(osp.abspath(args.model))
    checkpoint_name = Path(checkpoint_dir).name

    device_type = DeviceType.XH2a
    qtype = args.quant_type
    scheme = QuantScheme(target_device=DeviceType.XH2a, quant_type=qtype)

    # Conversion settings; batch size is fixed to 1 in this example.
    convert_cfg = Qwen2ConvertConfig(
        batch_size=1,
        context_length=args.context_length,
        input_sequence_length=args.input_sequence_length,
        quant_scheme=scheme,
        quant_weight=args.quant_weight,
    )

    # Working directory name: <model>-<device>-<context length in k>-<quant type>.
    context_k = args.context_length // 1024
    run_dir = Path("work_dirs") / f"{checkpoint_name}-{device_type}-{context_k}k-{qtype}"
    run_dir.mkdir(exist_ok=True, parents=True)

    # Initialize xhquant with a log file inside the working directory.
    xhquant_init(run_dir / "convert.log", debug=args.debug)
    log = get_root_logger()

    # Wrap the conversion in the time and memory ("cuda:0") profilers.
    with TimeProfiler("convert", log), MemoryTracker("cuda:0", "convert", log):
        LLMConverter.from_pretrained(
            checkpoint_dir,
            "Qwen3ForCausalLM_legacy",
            convert_cfg,
            str(run_dir),
        )
if __name__ == "__main__":
    # Command-line entry point: build the options read by main() and run it.
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", action="store_true", help="debug mode")
    parser.add_argument("--model", type=str, default="qwen3-14b")
    # The option must be spelled with ASCII hyphens only: argparse derives the
    # attribute name by replacing "-" with "_", so any other dash character
    # would leave main() without args.context_length.
    parser.add_argument("--context-length", type=int, default=8192, help="max sequence length")
    parser.add_argument("--input-sequence-length", type=int, default=256, help="input sequence length")
    # %(default)s makes argparse print the actual default instead of a
    # hand-written value that can drift out of date.
    parser.add_argument("--quant-type", default="w4a8_ssfp", help="quant type (default: %(default)s)")
    parser.add_argument(
        "--quant-weight",
        type=str,
        default=None,
        help="quant weight path, for example: gptq or quarot, if empty, use w8a8",
    )
    args = parser.parse_args()
    main(args)
8.4. Qwen2VL多模态模型量化示例
下面示例介绍如何量化一个Qwen2VL多模态模型,并导出量化后HMONNX模型。示例展示关键步骤代码,仅供参考,不可以直接拷贝运行。
import argparse
import os.path as osp
from pathlib import Path
from transformers import AutoConfig
from xh_model_zoo.utils import MemoryTracker, TimeProfiler
from xh_model_zoo.xh_llm import LLMConverter
from xh_model_zoo.xh_llm.models.qwen2_vl import Qwen2VLConvertConfig, VisualConfig
from xhquant.api import DeviceType, get_root_logger, xhquant_init, QuantScheme # isort:skip
def main(args):
    """Convert and quantize a Qwen2VL multimodal HuggingFace checkpoint.

    Args:
        args: Parsed command-line namespace; reads ``args.model``,
            ``args.quant_type``, ``args.context_length``,
            ``args.input_sequence_length`` and ``args.debug``. The attribute
            ``args.batch_size`` is set to 1 by this function.
    """
    # This example always runs with batch size 1 and stores it on the namespace.
    args.batch_size = 1

    # Absolute, normalized checkpoint directory; its last component names the model.
    checkpoint_dir = osp.normpath(osp.abspath(args.model))
    checkpoint_name = Path(checkpoint_dir).name

    device_type = DeviceType.XH2a
    qtype = args.quant_type
    scheme = QuantScheme(target_device=DeviceType.XH2a, quant_type=qtype)

    # Visual settings: image_max_size 1204 with patch_size 14.
    visual_cfg = VisualConfig(
        image_max_size=1204,
        patch_size=14,
    )
    convert_cfg = Qwen2VLConvertConfig(
        batch_size=args.batch_size,
        context_length=args.context_length,
        input_sequence_length=args.input_sequence_length,
        quant_scheme=scheme,
        visual_config=visual_cfg,
    )

    # Working directory name: <model>-<device>-batch_<n>-<context length in k>-<quant type>.
    context_k = args.context_length // 1024
    run_name = f"{checkpoint_name}-{device_type}-batch_{args.batch_size}-{context_k}k-{qtype}"
    run_dir = Path("work_dirs") / run_name
    run_dir.mkdir(exist_ok=True, parents=True)

    # Initialize xhquant with a log file inside the working directory.
    xhquant_init(run_dir / "convert.log", debug=args.debug)
    log = get_root_logger()

    # Wrap the conversion in the time and memory ("cuda:0") profilers.
    with TimeProfiler("convert", log), MemoryTracker("cuda:0", "convert", log):
        # The model class name argument is None in this example.
        LLMConverter.from_pretrained(checkpoint_dir, None, convert_cfg, run_dir)
if __name__ == "__main__":
    # Command-line entry point: build the options read by main() and run it.
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", action="store_true", help="debug mode")
    parser.add_argument("--model", type=str, default="data/models/Qwen2-VL-2B-Instruct-AWQ")
    parser.add_argument("--context-length", type=int, default=2048, help="max sequence length")
    parser.add_argument("--input-sequence-length", type=int, default=256, help="input sequence length")
    # %(default)s makes argparse print the actual default instead of a
    # hand-written value that can drift out of date.
    parser.add_argument("--quant-type", default="w8a8_sefp", help="quant type (default: %(default)s)")
    args = parser.parse_args()
    main(args)
8.5. Stable Diffusion 3模型量化示例
下面示例介绍如何量化一个Stable Diffusion 3模型,并导出量化后HMONNX模型。示例展示关键步骤代码,仅供参考,不可以直接拷贝运行。
import argparse
import os
from pathlib import Path
from xhquant.api import DeviceType, QuantScheme, xhquant_init
from xhquant.utils import set_random_seed
from xh_model_zoo.xh_aigc.models.sd3 import SD3ConvertConfig, SD3Converter
def main(args):
    """Quantize a Stable Diffusion 3 model and export it through SD3Converter.

    Args:
        args: Parsed command-line namespace; reads ``args.model``,
            ``args.height``, ``args.width`` and ``args.guidance_scale``.
    """
    # Normalize the model path. normpath only collapses redundant separators
    # and up-level references; it does not make the path absolute.
    model_dir = os.path.normpath(args.model)
    # The last path component is used as the model name.
    model_name = Path(model_dir).name

    # Single definition of the target device, reused everywhere below.
    target_device = DeviceType.XH2a
    height = args.height
    width = args.width

    # Working directory name: <model>_<device name>_<width>x<height>.
    cfg_name = f"{model_name}_{target_device.name}_{width}x{height}"
    work_dir = Path("work_dirs") / cfg_name
    work_dir.mkdir(exist_ok=True, parents=True)

    # Initialize xhquant with a log file in the working directory; debug is
    # always off in this example.
    log_file = work_dir / f"{cfg_name}.log"
    xhquant_init(log_file, debug=False)

    # The quantization type is fixed in this example.
    quant_type = "w8a8_sefp"
    quant_scheme = QuantScheme(target_device=target_device, quant_type=quant_type)

    # Conversion settings for the Stable Diffusion 3 model.
    convert_config = SD3ConvertConfig(
        quant_scheme=quant_scheme,
        guidance_scale=args.guidance_scale,  # forwarded from --guidance-scale
        height=height,
        width=width,
    )

    # Convert the pretrained model, writing outputs to the working directory.
    SD3Converter.from_pretrained(model_dir, convert_config, str(work_dir))
def parse_arguments(argv=None):
    """Parse the command-line options of the Stable Diffusion 3 example.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads the process command line, as before.

    Returns:
        argparse.Namespace with ``model``, ``height``, ``width``, ``seed`` and
        ``guidance_scale``.
    """
    # ArgumentDefaultsHelpFormatter appends "(default: ...)" to every option
    # that has a help string.
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--model",
        type=str,
        default="data/models/stable-diffusion-3-medium-diffusers",
    )
    parser.add_argument("--height", type=int, default=512)
    parser.add_argument("--width", type=int, default=512)
    parser.add_argument(
        "--seed",
        type=int,
        default=1024,
        help="Seed for the random number generator",
    )
    parser.add_argument(
        "--guidance-scale",
        type=float,
        default=7.0,
        help="Guidance scale passed to the SD3 convert config",
    )
    args = parser.parse_args(argv)
    return args
if __name__ == "__main__":
    # Script entry point: parse options, seed the random generators with
    # deterministic mode off, then run the conversion.
    cli_args = parse_arguments()
    set_random_seed(cli_args.seed, deterministic=False)
    main(cli_args)