#%%
from pathlib import Path
import datetime
import time
import json
import os
import logging

import torch
from onnxruntime_tools import optimizer
from transformers import (
    AutoConfig,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    BertTokenizerFast,
)

from src.convert_graph_to_onnx import convert, quantize
# quantize_dynamic
from src import AutoYuedlModelForSequenceClassification
from inference import BertEDL


def save_model_to_file(self, output_path):
    """Alternative save helper for models > 2 GB: writes the weights as external data."""
    from onnx import save_model
    from onnx.external_data_helper import convert_model_to_external_data

    convert_model_to_external_data(
        self.model,
        all_tensors_to_one_file=True,
        location=Path(output_path).name + ".data",
    )
    save_model(self.model, output_path)


# MODEL_PATH = "/data/yujh/model/entity_disambiguation/distillation/bert_3layers_lr5e-6_posid_aidahedwig30_epoch1_bs40_summse"
# MODEL_PATH = "/data/yujh/model/yukon/Yukonv3_en_3layers_student_4e-5_72_notoverfit"
MODEL_PATH = "/data/yujh/model/entity_disambiguation/distillation/tulrv3_3layers_lr1e-5_posid_aidahedwig900M30_epoch1_bs40_summse_evalmsit"
# MODEL_PATH = "/data/yujh/model/entity_disambiguation/TULRv3-large_crossgroup_score_aidafiltered15_hedwigoossplit_mm2_lr4e-6_bs5x3_epoch4_rand"
config_path = "/home/yujh/gitrepos/B4BEDL/py/cross_encoder/config/yuedl_config.json"

disambig = BertEDL.from_config(MODEL_PATH, config_path=config_path)
config = AutoConfig.from_pretrained(MODEL_PATH)
model = AutoYuedlModelForSequenceClassification.from_pretrained(
    MODEL_PATH,
    from_tf=bool(".ckpt" in MODEL_PATH),
    config=config,
)
model.eval()

do_onnx_model_export = 0
do_optimize_onnx = 1

#%% load sample inputs and preprocess them into inference features
with open(
    "/home/yujh/gitrepos/B4BEDL/py/cross_encoder/unittests/data/sample_eval_mentionmode.json",
    "r",
    encoding="utf-8",
) as f:
    inputs = json.load(f)
inferDataset, processed_examples = disambig.preprocess_batch_mention_input(inputs)

#%% build a single-example input tuple for the export trace
from collections import OrderedDict

sample_range = range(0, 1)
ex_input_1 = OrderedDict(
    {
        # "query_id": torch.IntTensor([inferDataset[i].query_id for i in sample_range]),
        "input_ids": torch.IntTensor([inferDataset[i].input_ids for i in sample_range]),
        "attention_mask": torch.IntTensor(
            [inferDataset[i].attention_mask for i in sample_range]
        ),
        "mention_idx": torch.LongTensor(
            [inferDataset[i].mention_idx for i in sample_range]
        ),
        "entity_idx": torch.LongTensor(
            [inferDataset[i].entity_idx for i in sample_range]
        ),
    }
)

#%% export the PyTorch model to ONNX
if do_onnx_model_export:
    print("start converting model============")
    torch.onnx.export(
        model,  # model being run
        tuple(v for k, v in ex_input_1.items()),  # model input (or a tuple for multiple inputs)
        f"{MODEL_PATH}/model.onnx",  # where to save the model (can be a file or file-like object)
        export_params=True,  # store the trained parameter weights inside the model file
        opset_version=12,  # the ONNX version to export the model to
        do_constant_folding=True,  # whether to execute constant folding for optimization
        input_names=[
            # "query_id",  # query_id is part of the model's forward signature, but the exported graph does not use it
            "input_ids",
            "attention_mask",
            "mention_idx",
            "entity_idx",
        ],  # the model's input names
        output_names=["output_0"],  # the model's output names
        dynamic_axes={
            # "query_id": {0: "batch"},
            "input_ids": {0: "batch", 1: "sequence"},  # variable-length axes
            "attention_mask": {0: "batch", 1: "sequence"},
            "mention_idx": {0: "batch"},
            "entity_idx": {0: "batch"},
            "output_0": {0: "batch"},
        },
        enable_onnx_checker=True,
        use_external_data_format=True,
    )
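#%% (optional) inspect the exported graph -- a minimal sketch, assuming model.onnx was just written
# Prints the graph's input/output names and shapes so it is easy to confirm that
# query_id was dropped and that the dynamic "batch"/"sequence" axes are present.
import onnx

exported = onnx.load(f"{MODEL_PATH}/model.onnx")
for inp in exported.graph.input:
    dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
    print("input:", inp.name, dims)
for out in exported.graph.output:
    print("output:", out.name)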
#%% verify that the exported ONNX model produces the same result
print("===== evaluate onnx ======")
import onnx

onnx_model = onnx.load(f"{MODEL_PATH}/model.onnx")
# onnx.checker.check_model(onnx_model)

import onnxruntime
import numpy as np

sample_range = range(0, 5)
query_id = [inferDataset[i].query_id for i in sample_range]
input_ids = [inferDataset[i].input_ids for i in sample_range]
attention_mask = [inferDataset[i].attention_mask for i in sample_range]
mention_idx = [inferDataset[i].mention_idx for i in sample_range]
entity_idx = [inferDataset[i].entity_idx for i in sample_range]

ort_session = onnxruntime.InferenceSession(f"{MODEL_PATH}/model.onnx")

torch_out = model(
    query_id=torch.IntTensor(query_id),
    input_ids=torch.IntTensor(input_ids),
    attention_mask=torch.IntTensor(attention_mask),
    mention_idx=torch.LongTensor(mention_idx),
    entity_idx=torch.LongTensor(entity_idx),
)
print(torch_out)

# compute the ONNX Runtime output prediction; ONNX Runtime expects numpy arrays,
# with dtypes matching the exported graph (int32 ids/mask, int64 index inputs)
inputs_onnx = {
    # "query_id": query_id,
    "input_ids": np.asarray(input_ids, dtype=np.int32),
    "attention_mask": np.asarray(attention_mask, dtype=np.int32),
    "mention_idx": np.asarray(mention_idx, dtype=np.int64),
    "entity_idx": np.asarray(entity_idx, dtype=np.int64),
    # "token_type_ids": [d["token_type_ids"] for d in inputs],  # dropped: token_type_ids is not useful here
}
ort_outs = ort_session.run(None, inputs_onnx)
print(ort_outs)

# compare ONNX Runtime and PyTorch results
# np.testing.assert_allclose(torch_out[0].detach().numpy(), ort_outs[0], rtol=1e-03, atol=1e-05)
# print("Exported model has been tested with ONNXRuntime, and the result looks good!")

#%% optimize, quantize, and convert the exported graph to fp16
if do_optimize_onnx:
    print("optimize model=====")
    optimized_model = optimizer.optimize_model(
        f"{MODEL_PATH}/model.onnx",
        model_type="bert",
        num_heads=16,
        hidden_size=1024,
    )
    optimized_model.save_model_to_file(
        f"{MODEL_PATH}/model_optimized.onnx"
    )  # use the save_model_to_file helper above (external data via onnx.save_model) when the model is > 2 GB
    quantize(Path(f"{MODEL_PATH}/model.onnx"))
    optimized_model.convert_model_float32_to_float16()
    optimized_model.save_model_to_file(f"{MODEL_PATH}/model_fp16.onnx")
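#%% (optional) sanity-check the optimized / fp16 artifacts -- a minimal sketch
# Re-runs the numpy inputs from the verification cell against whichever of the
# optimized and fp16 graphs exist, printing rough latency and the first scores so a
# large numerical drift (especially after the fp16 cast) is easy to spot. Assumes the
# optimization cell above has run; the fp16 graph may need the CUDA execution provider.
for suffix in ("model_optimized.onnx", "model_fp16.onnx"):
    path = f"{MODEL_PATH}/{suffix}"
    if not os.path.exists(path):
        continue
    try:
        sess = onnxruntime.InferenceSession(path)
        start = time.time()
        outs = sess.run(None, inputs_onnx)
        print(suffix, f"{time.time() - start:.4f}s", outs[0][:2])
    except Exception as e:  # e.g. missing fp16 kernels on the CPU execution provider
        print(suffix, "skipped:", e)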