YongtaoHuang1994 opened a new issue #7379:
URL: https://github.com/apache/tvm/issues/7379


   Hello:
   I used TVM to speed up inference of a BERT model on a CPU with AVX2, but TVM 
is slower than MXNet. Why is that?
   Device:  8  Intel(R) Xeon(R) CPU E5-1620 v3 @ 3.50GHz.
   
   The inference speed results are as follows:
   ```
   MXNet latency for batch 1 and seq length 128: 159.11 ms
   TVM latency for batch 1 and seq length 128: 732.59 ms
   ```
   The inference code is as follows:
   ```
   import time
   import argparse
   import numpy as np
   import mxnet as mx
   import gluonnlp as nlp
   import tvm
   from tvm import relay
   import tvm.contrib.graph_runtime as runtime
   
   def timer(thunk, repeat=1, number=10, dryrun=3, min_repeat_ms=1000):
       """Helper function to time a function"""
       for i in range(dryrun):
           thunk()
       ret = []
       for _ in range(repeat):
           while True:
               beg = time.time()
               for _ in range(number):
                   thunk()
               end = time.time()
               lat = (end - beg) * 1e3
               if lat >= min_repeat_ms:
                   break
               number = int(max(min_repeat_ms / (lat / number) + 1, number * 
1.618))
           ret.append(lat / number)
       return ret
   
   
   parser = argparse.ArgumentParser(description="Optimize BERT-base model from 
GluonNLP")
   parser.add_argument("-b", "--batch", type=int, default=1,
                       help="Batch size (default: 1)")
   parser.add_argument("-l", "--length", type=int, default=128,
                       help="Sequence length (default: 128)")
   args = parser.parse_args()
   batch = args.batch
   seq_length = args.length
   
   
    # Instantiate a BERT classifier using GluonNLP.
    # Pretrained weights are NOT downloaded (pretrained=False): the network is
    # randomly initialized below, which is fine for a pure latency benchmark.
    model_name = 'bert_12_768_12'  # BERT-base (per the name: 12 layers, 768 hidden, 12 heads — TODO confirm)
    dataset = 'book_corpus_wiki_en_uncased'
    mx_ctx = mx.cpu()  # run everything on CPU to match the TVM target below
    bert, _ = nlp.model.get_model(
        name=model_name,
        ctx=mx_ctx,
        dataset_name=dataset,
        pretrained=False,
        use_pooler=True,       # pooler output is kept; presumably feeds the classifier head — verify
        use_decoder=False,     # masked-LM decoder not needed for classification
        use_classifier=False)  # classification head is attached separately below
    # Wrap the backbone with a 2-class classifier head.
    model = nlp.model.BERTClassifier(bert, dropout=0.1, num_classes=2)
    model.initialize(ctx=mx_ctx)
    # NOTE(review): hybridize with static_alloc should let MXNet run the
    # symbolic graph with preallocated buffers — relevant for fair timing.
    model.hybridize(static_alloc=True)
   
    # Prepare input data.
    # Random token ids / segment ids stand in for real tokenized text; only
    # latency is measured, so the content does not matter.
    dtype = "float32"
    inputs = np.random.randint(0, 2000, size=(batch, seq_length)).astype(dtype)
    token_types = np.random.uniform(size=(batch, seq_length)).astype(dtype)
    valid_length = np.asarray([seq_length] * batch).astype(dtype)  # every row fully valid
    
    # Convert to MXNet NDArray and run the MXNet model once.
    inputs_nd = mx.nd.array(inputs, ctx=mx_ctx)
    token_types_nd = mx.nd.array(token_types, ctx=mx_ctx)
    valid_length_nd = mx.nd.array(valid_length, ctx=mx_ctx)
    mx_out = model(inputs_nd, token_types_nd, valid_length_nd)
    # MXNet executes asynchronously; wait_to_read() blocks until the output is
    # materialized, so the timing below measures real compute, not enqueueing.
    mx_out.wait_to_read()
    
    # Benchmark the MXNet latency (per-call ms, averaged over 3 samples).
    res = timer(lambda: model(inputs_nd, token_types_nd, 
valid_length_nd).wait_to_read(),
                repeat=3,
                dryrun=5,
                min_repeat_ms=1000)
    print(f"MXNet latency for batch {batch} and seq length {seq_length}: 
{np.mean(res):.2f} ms")
   
   
    ######################################
    # Optimize the BERT model using TVM
    ######################################
    
    # First, Convert the MXNet model into TVM Relay format.
    # NOTE(review): data0/data1/data2 appear to map to token ids, token types,
    # and valid lengths respectively — confirm against relay.frontend.from_mxnet.
    shape_dict = {
        'data0': (batch, seq_length),
        'data1': (batch, seq_length),
        'data2': (batch,)
    }
    mod, params = relay.frontend.from_mxnet(model, shape_dict)
    
    # Compile the imported model
    
    # core-avx2 requests AVX2 codegen; the "FastMath" pass trades exact
    # transcendental math for speed (presumably acceptable given the 1e-3
    # tolerance check below — verify).
    target = "llvm -mcpu=core-avx2"
    with relay.build_config(opt_level=3, required_pass=["FastMath"]):
        graph, lib, cparams = relay.build(mod, target, params=params)
    
    # Create the executor and set the parameters and inputs.
    ctx = tvm.cpu()
    rt = runtime.create(graph, lib, ctx)
    rt.set_input(**cparams)
    rt.set_input(data0=inputs, data1=token_types, data2=valid_length)
    
    # Run the executor once and validate correctness against the MXNet output.
    rt.run()
    out = rt.get_output(0)
    tvm.testing.assert_allclose(out.asnumpy(), mx_out.asnumpy(), rtol=1e-3, 
atol=1e-3)
    
    # Benchmark the TVM latency; time_evaluator handles warm-up/repeat
    # internally, and *1000 converts its results to milliseconds for the print.
    ftimer = rt.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=1000)
    prof_res = np.array(ftimer().results) * 1000
    print(f"TVM latency for batch {batch} and seq length {seq_length}: 
{np.mean(prof_res):.2f} ms")
   
   ```
   Thank you for your help.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to