""" convert hugging face transformer to ONNX model requires pytorch 1.5.1 + CUDA 10.2 """ from os import listdir, makedirs from pathlib import Path from typing import Optional from transformers.convert_graph_to_onnx import ( convert_pytorch, convert_tensorflow, load_graph_from_args, generate_identified_filename, ) def convert( framework: str, model: str, model_bin: str, output: Path, opset: int, tokenizer: Optional[str] = None, use_external_format: bool = False, pipeline_name: str = "feature-extraction", ): """ Convert the pipeline object to the ONNX Intermediate Representation (IR) format. Args: framework: The framework the pipeline is backed by ("pt" or "tf") model: The name of the model to load for the pipeline model_bin: the model object that override model to suport custmized model output: The path where the ONNX graph will be stored, it has to be instance of pathlib.Path opset: The actual version of the ONNX operator set to use tokenizer: The name of the model to load for the pipeline, default to the model's name if not provided use_external_format: Split the model definition from its parameters to allow model bigger than 2GB (PyTorch only) pipeline_name: The kind of pipeline to instantiate (ner, question-answering, etc.) Returns: """ print(f"ONNX opset version set to: {opset}") # Load the pipeline nlp = load_graph_from_args(pipeline_name, framework, model, tokenizer) if not output.parent.exists(): print(f"Creating folder {output.parent}") makedirs(output.parent.as_posix()) if model_bin: nlp.model = model_bin # Export the graph if framework == "pt": convert_pytorch(nlp, opset, output, use_external_format) else: convert_tensorflow(nlp, opset, output) def quantize(onnx_model_path: Path) -> Path: """ Quantize the weights of the model from float32 to in8 to allow very efficient inference on modern CPU. Args: onnx_model_path: Path to location the exported ONNX model is stored Returns: The Path generated for the quantized """ import onnx from onnxruntime.quantization import QuantizationMode, quantize onnx_model = onnx.load(onnx_model_path.as_posix()) # Discussed with @yufenglee from ONNX runtime, this will be address in the next release of onnxruntime print( "As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n" "This limitation will be removed in the next release of onnxruntime." ) quantized_model = quantize( model=onnx_model, quantization_mode=QuantizationMode.IntegerOps, force_fusions=True, symmetric_weight=True, nbits=8 ) # Append "-quantized" at the end of the model's name quantized_model_path = generate_identified_filename(onnx_model_path, "-quantized") # Save model print(f"Quantized model has been written at {quantized_model_path}: \N{heavy check mark}") onnx.save_model(quantized_model, quantized_model_path.as_posix()) return quantized_model_path