8. 示例代码

8.1. ResNet50模型量化示例

下面示例介绍如何量化一个ResNet50 ONNX模型,并导出量化后HMONNX模型。示例展示关键步骤代码,仅供参考,不可以直接拷贝运行。

import argparse
from pathlib import Path
import onnx
import torch
from xhquant.api import (
    DeviceType,
    HMONNXGoldenInference,
    QuantScheme,
    convert_onnx_to_hmonnx,
    create_quant_config,
    xhquant_init,
)

def main(args):
    """Quantize a ResNet50 ONNX model and export it as an HMONNX model.

    After the export, the HMONNX file is loaded into ``HMONNXGoldenInference``
    and run once with ``save_golden`` enabled.

    Args:
        args: Parsed command-line namespace. Reads ``onnx``, ``debug``,
            ``quant_type`` and ``batch_size``.
    """
    xhquant_init(None, debug=args.debug)
    onnx_file = args.onnx

    # 1. Load the ONNX model ======================
    onnx_model = onnx.load(onnx_file)

    # 2. Configure paths ======================
    # Shape of the first graph input, with the batch dimension taken from the CLI
    # (the original referenced an undefined ``batch_size`` name here).
    first_input_dims = onnx_model.graph.input[0].type.tensor_type.shape.dim
    input_shape = [int(dim.dim_value) for dim in first_input_dims]
    input_shape[0] = args.batch_size
    str_shape = "x".join(str(dim) for dim in input_shape)
    onnx_name = f"{Path(onnx_file).stem}_{str_shape}"
    work_dirs = Path("work_dirs") / onnx_name
    work_dirs.mkdir(exist_ok=True, parents=True)
    target_device = DeviceType.XH2a
    hmonnx_path = work_dirs / "hmonnx" / f"{onnx_name}_{target_device}.onnx"
    hmonnx_path.parent.mkdir(exist_ok=True, parents=True)
    out_hmonnx_file = str(hmonnx_path)

    # 3. Set quantization parameters ======================
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Random tensor with the model's input shape, passed to the converter below
    dummy_input = torch.randn(input_shape, dtype=torch.float32)
    quant_scheme = QuantScheme(target_device=target_device, quant_type=args.quant_type)
    quant_config = create_quant_config(quant_scheme)

    # 4. Quantize and export HMONNX ======================
    convert_onnx_to_hmonnx(
        onnx_file,
        [dummy_input],
        target_device,
        out_hmonnx_file,
        quant_config=quant_config,
        input_names=["images"],
        output_names=["cls_score"],
    )

    # 5. Generate the model in the format required by the chip ======================
    session = HMONNXGoldenInference(out_hmonnx_file)
    session.to(device)
    session.save_golden = True
    session.golden_dir = work_dirs / "hmonnx/golden"
    session.step = 0
    # The session is fed a half-precision copy of the input on the chosen device
    session(dummy_input.half().to(device))


if __name__ == "__main__":
    # Command-line entry point for the ResNet50 quantization example
    parser = argparse.ArgumentParser()
    parser.add_argument("--onnx", type=str, default="data/models/resnet/resnet50_224x224.onnx")
    parser.add_argument("--debug", action="store_true", help="debug mode")
    # Help text now states the actual default value
    parser.add_argument("--quant-type", default="w8a8_sefp", help="quant type, default is w8a8_sefp")
    parser.add_argument("--batch-size", type=int, default=1)
    args = parser.parse_args()
    main(args)

量化后生成的 HMONNX 模型将保存到 out_hmonnx_file 参数指定的路径。用户可将上面的示例代码保存为 resnet50_quantization.py 文件,然后运行下面的命令完成模型量化和导出:

python resnet50_quantization.py --onnx {ONNX_Path} --batch-size 1

需将 {ONNX_Path} 替换为要量化的 ONNX 模型文件路径。

8.2. YOLOv8模型量化示例

下面示例介绍如何量化一个YOLOv8 ONNX模型,并导出量化后HMONNX模型。示例展示关键步骤代码,仅供参考,不可以直接拷贝运行。

import argparse
from pathlib import Path

import torch
from xhquant.api import (
    DeviceType,
    QuantScheme,
    convert_onnx_to_hmonnx,
    create_quant_config,
    get_root_logger,
    xhquant_init,
)


def main(args):
    """Quantize a YOLOv8 ONNX model and convert it to an HMONNX model.

    Args:
        args: Parsed command-line namespace. Reads ``onnx``, ``debug`` and
            ``quant_type``.
    """
    # 1. Resolve the input model and output locations ======================
    source_onnx = args.onnx
    stem = Path(source_onnx).stem
    run_dir = Path("work_dirs") / stem
    run_dir.mkdir(exist_ok=True, parents=True)
    chip = DeviceType.XH2a
    hmonnx_dir = run_dir / "hmonnx"
    hmonnx_dir.mkdir(exist_ok=True, parents=True)
    hmonnx_target = str(hmonnx_dir / f"{stem}_{chip}.onnx")
    xhquant_init(None, debug=args.debug)

    # 2. Build the quantization configuration ======================
    scheme = QuantScheme(target_device=DeviceType.XH2a, quant_type=args.quant_type)
    quant_settings = create_quant_config(scheme)

    # 3. Quantize and convert to HMONNX ======================
    # A single random 1x3x640x640 float32 tensor is passed as the input list
    example_inputs = [torch.randn(1, 3, 640, 640, dtype=torch.float32)]
    convert_onnx_to_hmonnx(
        source_onnx,
        example_inputs,
        DeviceType.XH2a,
        hmonnx_target,
        quant_config=quant_settings,
        input_names=["images"],
        output_names=["outs"],
    )
    get_root_logger().info(f"Convert onnx to hmonnx success, out hmonnx file to: {hmonnx_target}")


if __name__ == "__main__":
    # Command-line entry point for the YOLOv8 quantization example
    parser = argparse.ArgumentParser()
    parser.add_argument("--onnx", type=str, default="data/models/yolo/yolov8m.onnx")
    parser.add_argument("--debug", action="store_true", help="debug mode")
    # NOTE: --image is accepted for compatibility, but main() does not read it
    parser.add_argument("--image", type=str, default="data/images/000000001490.jpg")
    # Help text now states the actual default value
    parser.add_argument("--quant-type", default="w8a8_sefp", help="quant type, default is w8a8_sefp")
    args = parser.parse_args()
    main(args)

量化后生成的 HMONNX 模型将保存到 out_hmonnx_file 参数指定的路径。用户可将上面的示例代码保存为 yolov8_quantization.py 文件,然后运行下面的命令完成模型量化和导出:

python yolov8_quantization.py --onnx {ONNX_Path}

需将 {ONNX_Path} 替换为要量化的 ONNX 模型文件路径。

8.3. Qwen3-14B模型量化示例

下面示例介绍如何量化一个Qwen3-14B模型,并导出量化后HMONNX模型。示例展示关键步骤代码,仅供参考,不可以直接拷贝运行。

import argparse
import os.path as osp
from pathlib import Path
from xh_model_zoo.utils import MemoryTracker, TimeProfiler
from xh_model_zoo.xh_llm import LLMConverter
from xh_model_zoo.xh_llm.models.qwen2 import Qwen2ConvertConfig
from xhquant.api import DeviceType, get_root_logger, xhquant_init, QuantScheme  # isort:skip

def main(args):
    """Convert and quantize a Qwen3-14B HuggingFace checkpoint.

    Args:
        args: Parsed command-line namespace. Reads ``model``, ``quant_type``,
            ``context_length``, ``input_sequence_length``, ``quant_weight``
            and ``debug``.
    """
    # Absolute, normalized checkpoint directory; its last component names the run
    checkpoint_dir = osp.normpath(osp.abspath(args.model))
    checkpoint_name = Path(checkpoint_dir).name

    chip = DeviceType.XH2a
    qtype = args.quant_type
    scheme = QuantScheme(target_device=DeviceType.XH2a, quant_type=qtype)

    # NOTE(review): the Qwen2 config class is used for a Qwen3 model — confirm this is intended
    convert_cfg = Qwen2ConvertConfig(
        batch_size=1,
        context_length=args.context_length,
        input_sequence_length=args.input_sequence_length,
        quant_scheme=scheme,
        quant_weight=args.quant_weight,
    )

    # The work directory name encodes model, device, context length / 1024 and quant type
    context_k = args.context_length // 1024
    output_dir = Path("work_dirs") / f"{checkpoint_name}-{chip}-{context_k}k-{qtype}"
    output_dir.mkdir(exist_ok=True, parents=True)

    # xhquant is initialized with a log file inside the work directory
    xhquant_init(output_dir / "convert.log", debug=args.debug)
    log = get_root_logger()

    # Time and memory are profiled around the whole conversion
    with TimeProfiler("convert", log), MemoryTracker("cuda:0", "convert", log):
        LLMConverter.from_pretrained(
            checkpoint_dir,
            "Qwen3ForCausalLM_legacy",
            convert_cfg,
            str(output_dir),
        )
if __name__ == "__main__":
    # Command-line entry point for the Qwen3-14B quantization example
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", action="store_true", help="debug mode")
    parser.add_argument("--model", type=str, default="qwen3-14b")
    # The option name must use ASCII hyphens: the original contained an em-dash,
    # so argparse never created the ``context_length`` attribute that main() reads.
    parser.add_argument("--context-length", type=int, default=8192, help="max sequence length")
    parser.add_argument("--input-sequence-length", type=int, default=256, help="input sequence length")
    # Help text now states the actual default value
    parser.add_argument("--quant-type", default="w4a8_ssfp", help="quant type, default is w4a8_ssfp")
    parser.add_argument(
        "--quant-weight",
        type=str,
        default=None,
        help="quant weight path, for example: gptq or quarot, if empty, use w8a8",
    )
    args = parser.parse_args()
    main(args)

8.4. Qwen2VL多模态模型量化示例

下面示例介绍如何量化一个Qwen2VL多模态模型,并导出量化后HMONNX模型。示例展示关键步骤代码,仅供参考,不可以直接拷贝运行。

import argparse
import os.path as osp
from pathlib import Path
from transformers import AutoConfig
from xh_model_zoo.utils import MemoryTracker, TimeProfiler
from xh_model_zoo.xh_llm import LLMConverter
from xh_model_zoo.xh_llm.models.qwen2_vl import Qwen2VLConvertConfig, VisualConfig
from xhquant.api import DeviceType, get_root_logger, xhquant_init, QuantScheme  # isort:skip
def main(args):
    """Convert and quantize a Qwen2VL multimodal HuggingFace checkpoint.

    Args:
        args: Parsed command-line namespace. Reads ``model``, ``quant_type``,
            ``context_length``, ``input_sequence_length`` and ``debug``;
            ``batch_size`` is set to 1 on the namespace by this function.
    """
    # The batch size is fixed to 1 and stored on the namespace
    args.batch_size = 1

    # Absolute, normalized checkpoint directory; its last component names the run
    checkpoint_dir = osp.normpath(osp.abspath(args.model))
    checkpoint_name = Path(checkpoint_dir).name

    chip = DeviceType.XH2a
    qtype = args.quant_type
    scheme = QuantScheme(target_device=DeviceType.XH2a, quant_type=qtype)

    # NOTE(review): image_max_size=1204 may be a typo for 1024 — confirm
    vision_cfg = VisualConfig(
        image_max_size=1204,
        patch_size=14,
    )
    convert_cfg = Qwen2VLConvertConfig(
        batch_size=args.batch_size,
        context_length=args.context_length,
        input_sequence_length=args.input_sequence_length,
        quant_scheme=scheme,
        visual_config=vision_cfg,
    )

    # The work directory name encodes model, device, batch size, context length / 1024 and quant type
    context_k = args.context_length // 1024
    run_name = f"{checkpoint_name}-{chip}-batch_{args.batch_size}-{context_k}k-{qtype}"
    output_dir = Path("work_dirs") / run_name
    output_dir.mkdir(exist_ok=True, parents=True)

    # xhquant is initialized with a log file inside the work directory
    xhquant_init(output_dir / "convert.log", debug=args.debug)
    log = get_root_logger()

    # Time and memory are profiled around the whole conversion
    with TimeProfiler("convert", log), MemoryTracker("cuda:0", "convert", log):
        LLMConverter.from_pretrained(checkpoint_dir, None, convert_cfg, output_dir)
if __name__ == "__main__":
    # Command-line entry point for the Qwen2VL quantization example
    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", action="store_true", help="debug mode")
    parser.add_argument("--model", type=str, default="data/models/Qwen2-VL-2B-Instruct-AWQ")
    parser.add_argument("--context-length", type=int, default=2048, help="max sequence length")
    parser.add_argument("--input-sequence-length", type=int, default=256, help="input sequence length")
    # Help text now states the actual default value
    parser.add_argument("--quant-type", default="w8a8_sefp", help="quant type, default is w8a8_sefp")
    args = parser.parse_args()
    main(args)

8.5. Stable Diffusion 3模型量化示例

下面示例介绍如何量化一个Stable Diffusion 3模型,并导出量化后HMONNX模型。示例展示关键步骤代码,仅供参考,不可以直接拷贝运行。

import argparse
import os
from pathlib import Path

from xhquant.api import DeviceType, QuantScheme, xhquant_init
from xhquant.utils import set_random_seed
from xh_model_zoo.xh_aigc.models.sd3 import SD3ConvertConfig, SD3Converter

def main(args):
    """Convert and quantize a Stable Diffusion 3 model.

    Args:
        args: Parsed command-line namespace. Reads ``model``, ``height``,
            ``width`` and ``guidance_scale``.
    """
    # Normalize the model path; only normpath is applied, so a relative path stays relative
    model_dir = os.path.normpath(args.model)
    # The last path component is used as the model name
    model_name = Path(model_dir).name
    # Target device, assigned once and reused below
    target_device = DeviceType.XH2a
    height = args.height
    width = args.width

    # The work directory name encodes model name, device name and image size
    cfg_name = f"{model_name}_{target_device.name}_{width}x{height}"
    work_dir = Path("work_dirs") / cfg_name
    work_dir.mkdir(exist_ok=True, parents=True)
    log_file = work_dir / f"{cfg_name}.log"
    # xhquant is initialized with the log file; debug is always disabled here
    xhquant_init(log_file, debug=False)

    # The quantization type is fixed for this example
    quant_type = "w8a8_sefp"
    quant_scheme = QuantScheme(target_device=target_device, quant_type=quant_type)
    convert_config = SD3ConvertConfig(
        quant_scheme=quant_scheme,  # Quantization scheme configuration
        guidance_scale=args.guidance_scale,  # Controls prompt adherence in image generation
        height=height,  # Target image height
        width=width,  # Target image width
    )

    # Run the conversion; outputs go to the work directory
    SD3Converter.from_pretrained(model_dir, convert_config, str(work_dir))

def parse_arguments() -> argparse.Namespace:
    """Parse the command-line options of the Stable Diffusion 3 example.

    Returns:
        The parsed namespace with ``model``, ``height``, ``width``, ``seed``
        and ``guidance_scale`` attributes.
    """
    # ArgumentDefaultsHelpFormatter only shows defaults for options that have help text
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--model",
        type=str,
        default="data/models/stable-diffusion-3-medium-diffusers",
        help="Path to the Stable Diffusion 3 model directory",
    )
    parser.add_argument("--height", type=int, default=512, help="Target image height")
    parser.add_argument("--width", type=int, default=512, help="Target image width")
    # This help text was previously attached to --guidance-scale by mistake
    parser.add_argument(
        "--seed",
        type=int,
        default=1024,
        help="Seed for the random number generator",
    )
    parser.add_argument(
        "--guidance-scale",
        type=float,
        default=7.0,
        help="Guidance scale; controls prompt adherence in image generation",
    )
    return parser.parse_args()

if __name__ == "__main__":
    # Parse the CLI options first: the seed is applied before the conversion runs
    cli_args = parse_arguments()
    set_random_seed(cli_args.seed, deterministic=False)
    main(cli_args)