
Commit 6387402

Albert model aware (#202)
* pass unittest
* albert model uses model-aware allocator.
* polish the albert unittest
* Support variable sequence length benchmarking for albert.
* better log for the gpu benchmark
* better log
* Polish code
* polish benchmark
* polish benchmark script
1 parent 055baa2 commit 6387402

11 files changed (+136, -114 lines)

benchmark/benchmark.py

Lines changed: 2 additions & 1 deletion
@@ -51,7 +51,8 @@ def main():
         'use_gpu': True if args['--use_gpu'] else False,
         'enable_mem_opt': True if args['--enable_mem_opt'] else False,
     }
-    if (kwargs['model_name'] != 'bert'
+    if (kwargs['model_name'] not in ['bert'
+                                     'albert']
             or args['--framework'] != 'turbo-transformers'):
         kwargs['enable_mem_opt'] = False
     if args['--framework'] == 'turbo-transformers':
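
Note that, as rendered above, the two model names in the new membership test are adjacent string literals with no comma between them; in Python adjacent literals concatenate, so ['bert' 'albert'] is the one-element list ['bertalbert'] and the guard would switch enable_mem_opt off for both models. A minimal sketch of the presumably intended check, using a hypothetical helper name:

def normalize_mem_opt(kwargs, framework):
    # Hypothetical helper mirroring the guard in benchmark.py's main():
    # memory optimization only applies to bert/albert on turbo-transformers.
    if (kwargs['model_name'] not in ['bert', 'albert']
            or framework != 'turbo-transformers'):
        kwargs['enable_mem_opt'] = False
    return kwargs


# Illustrative inputs, not taken from the diff.
print(normalize_mem_opt({'model_name': 'albert', 'enable_mem_opt': True},
                        'turbo-transformers'))  # keeps enable_mem_opt True
print(normalize_mem_opt({'model_name': 'albert', 'enable_mem_opt': True},
                        'torch'))               # forces enable_mem_opt False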

benchmark/benchmark_helper.py

Lines changed: 9 additions & 16 deletions
@@ -14,14 +14,8 @@
 enable_latency_plot = 1
 
 
-def run_model(model,
-              use_gpu,
-              num_iter,
-              batch_size,
-              seq_len,
-              framework_name,
-              num_threads=1,
-              enable_mem_opt=False):
+def run_model(model, use_gpu, num_iter, batch_size, seq_len, framework_name,
+              num_threads, enable_mem_opt, model_name):
     # warm up
     import torch
     import contexttimer
@@ -63,11 +57,13 @@ def run_model(model,
             "seq_len": seq_len,
             "framework": framework_name,
             "thread_num": num_threads,
+            "model_name": model_name
         }))
 
 
 def run_variable_model(model, use_gpu, num_iter, max_seq_len, min_seq_len,
-                       framework_name, num_threads, cfg, enable_mem_opt):
+                       framework_name, num_threads, cfg, enable_mem_opt,
+                       model_name):
     import torch
     import contexttimer
     import json
@@ -88,19 +84,15 @@ def run_variable_model(model, use_gpu, num_iter, max_seq_len, min_seq_len,
                                   device=test_device)
         request_list.append(input_ids)
 
-    # warm-up using the longest sequence
-    # TODO(jiaruifang) We now recommend you to run warm-up before inference.
-    # In the future we will refactor allocator so as to not avoid warm-up
     input_ids = torch.randint(low=0,
                               high=cfg.vocab_size - 1,
                               size=(1, max_seq_len),
                               dtype=torch.long,
                               device=test_device)
-    # model(input_ids)
     if enable_latency_plot:
-        import time
-        print(f"dump results to {framework_name}_latency_{num_threads}.txt")
-        with open(f"{framework_name}_latency_{num_threads}.txt", "w") as of:
+        file_name = f"{framework_name}_{num_threads}_{model_name}_latency.txt"
+        print(f"dump results to {file_name}")
+        with open(f"{file_name}", "w") as of:
            result_list = []
            for request in request_list:
                if use_gpu:
@@ -169,4 +161,5 @@ def run_variable_model(model, use_gpu, num_iter, max_seq_len, min_seq_len,
             "min_seq_len": min_seq_len,
             "framework": framework_name,
             "thread_num": num_iter,
+            "model_name": model_name
         }))
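
With the extra model_name parameter, both helpers tag their JSON output with the model and the per-request latency dump goes to a per-model file named {framework}_{num_threads}_{model_name}_latency.txt. A rough usage sketch of the variable-length path; the default AlbertConfig and the iteration counts below are illustrative, not taken from the commit:

import torch
import turbo_transformers
from transformers.modeling_albert import AlbertConfig, AlbertModel

import benchmark_helper

# Build a turbo-transformers Albert model the same way the benchmark code does.
cfg = AlbertConfig()
torch_model = AlbertModel(cfg)
torch_model.eval()
turbo_model = turbo_transformers.AlbertModel.from_torch(torch_model)

benchmark_helper.run_variable_model(turbo_model,
                                    use_gpu=False,
                                    num_iter=10,
                                    max_seq_len=128,
                                    min_seq_len=5,
                                    framework_name="turbo",
                                    num_threads=1,
                                    cfg=cfg,
                                    enable_mem_opt=False,
                                    model_name="albert")
# With enable_latency_plot set, per-request latencies land in
# turbo_1_albert_latency.txt.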

benchmark/jit_benchmark_helper.py

Lines changed: 4 additions & 2 deletions
@@ -20,7 +20,8 @@
 
 def benchmark_torch_jit(model_name: str, seq_len: int, batch_size: int, n: int,
                         enable_random: bool, max_seq_len: int,
-                        min_seq_len: int, num_threads: int, use_gpu: bool):
+                        min_seq_len: int, num_threads: int, use_gpu: bool,
+                        enable_mem_opt: bool):
     import transformers
     import contexttimer
     import torch.jit
@@ -59,5 +60,6 @@ def benchmark_torch_jit(model_name: str, seq_len: int, batch_size: int, n: int,
             "batch_size": batch_size,
             "seq_len": seq_len,
             "framework": "torch_jit",
-            "n_threads": num_threads
+            "n_threads": num_threads,
+            "model_name": model_name
         }))

benchmark/onnx_benchmark_helper.py

Lines changed: 10 additions & 5 deletions
@@ -89,7 +89,8 @@ def _impl_(model_name: str,
                min_seq_len: int,
                max_seq_len: int,
                num_threads: int = 1,
-               use_gpu: bool = False):
+               use_gpu: bool = False,
+               enable_mem_opt: bool = False):
         import multiprocessing
         import os
         temp_fn = "/tmp/temp_onnx.model"
@@ -154,11 +155,13 @@ def _impl_(model_name: str,
            request_list.append(input_ids)
 
         if enable_latency_plot:
-            import time
             import torch
-            print(f"dump results to onnxrt_latency_{num_threads}.txt")
+            print(
+                f"dump results to onnxrt_{num_threads}_{model_name}_latency.txt"
+            )
             result_list = []
-            with open(f"onnxrt_latency_{num_threads}.txt", "w") as of:
+            with open(f"onnxrt_{num_threads}_{model_name}_latency.txt",
+                      "w") as of:
                 for request in request_list:
                     if use_gpu:
                         start = torch.cuda.Event(enable_timing=True)
@@ -223,6 +226,7 @@ def _impl_(model_name: str,
                     "min_seq_len": min_seq_len,
                     "framework": f"onnx_rt_{backend}",
                     "thread_num": num_threads,
+                    "model_name": model_name
                 }))
         else:
             print(
@@ -233,7 +237,8 @@ def _impl_(model_name: str,
                     "batch_size": batch_size,
                     "seq_len": seq_len,
                     "framework": f"onnx_rt_{backend}",
-                    "n_threads": num_threads
+                    "n_threads": num_threads,
+                    "model_name": model_name
                 }))
 
     return _impl_
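
Every benchmark helper now emits the model name in its JSON result line, so downstream scripts can tell bert and albert runs apart even when framework and thread count match. A sketch of the record shape; only the fields visible in this diff are shown, and the values are made up:

import json

record = {
    "batch_size": 1,
    "seq_len": 100,
    "framework": "onnx_rt_gpu",  # f"onnx_rt_{backend}" in the helper
    "n_threads": 1,
    "model_name": "albert",
}
print(json.dumps(record))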

benchmark/run_gpu_fixed_benchmark.sh

Lines changed: 5 additions & 2 deletions
@@ -21,18 +21,21 @@ SEQ_LEN=(10 20 40 60 80 100 200 300 400 500)
 BATCH_SIZE=(1 20)
 
 N=150
-MODEL="bert"
+MODELS=("bert" "albert")
+for model in ${MODELS[*]}
+do
 for batch_size in ${BATCH_SIZE[*]}
 do
   for seq_len in ${SEQ_LEN[*]}
   do
     for framework in ${FRAMEWORKS[*]}
     do
-      python benchmark.py ${MODEL} --seq_len=${seq_len} --batch_size=${batch_size}\
+      python benchmark.py ${model} --seq_len=${seq_len} --batch_size=${batch_size}\
        -n ${N} --framework=${framework} --use_gpu
    done
   done
 done
+done
 
 USE_NVPROF="NO"
 if [ $USE_NVPROF == "YES" ]; then

benchmark/run_gpu_variable_benchmark.sh

Lines changed: 5 additions & 2 deletions
@@ -22,12 +22,14 @@ FRAMEWORKS=("turbo-transformers" "torch")
 MAX_SEQ_LEN=(500)
 
 N=150
-MODEL="bert"
+MODELS=("bert" "albert")
+for model in ${MODELS[*]}
+do
 for max_seq_len in ${MAX_SEQ_LEN[*]}
 do
   for framework in ${FRAMEWORKS[*]}
   do
-    python benchmark.py ${MODEL} \
+    python benchmark.py ${model} \
         --enable-random \
         --min_seq_len=5 \
         --max_seq_len=${max_seq_len} \
@@ -37,3 +39,4 @@ do
         --use_gpu
   done
 done
+done

benchmark/torch_benchmark_helper.py

Lines changed: 2 additions & 2 deletions
@@ -47,7 +47,7 @@ def benchmark_torch(model_name: str, seq_len: int, batch_size: int, n: int,
     if enable_random:
         benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                             min_seq_len, "torch", num_threads,
-                                            cfg, enable_mem_opt)
+                                            cfg, enable_mem_opt, model_name)
     else:
         input_ids = torch.randint(low=0,
                                   high=cfg.vocab_size - 1,
@@ -56,4 +56,4 @@ def benchmark_torch(model_name: str, seq_len: int, batch_size: int, n: int,
                                   device=test_device)
         benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
                                    batch_size, seq_len, "torch", num_threads,
-                                   enable_mem_opt)
+                                   enable_mem_opt, model_name)

benchmark/turbo_benchmark_helper.py

Lines changed: 3 additions & 2 deletions
@@ -59,7 +59,7 @@ def benchmark_turbo_transformers(model_name: str, seq_len: int,
             turbo_transformers.reset_allocator_schema("model-aware")
         benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                             min_seq_len, "turbo", num_threads,
-                                            cfg, enable_mem_opt)
+                                            cfg, enable_mem_opt, model_name)
         if enable_mem_opt:
             turbo_transformers.reset_allocator_schema("naive")
     else:
@@ -69,4 +69,5 @@ def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                   dtype=torch.long,
                                   device=test_device)
         benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
-                                   batch_size, seq_len, "turbo", num_threads)
+                                   batch_size, seq_len, "turbo", num_threads,
+                                   enable_mem_opt, model_name)
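
For albert the memory-optimized path now mirrors the bert one: switch the allocator schema to "model-aware", pre-allocate activations for the current (batch, seq_len) shape, run, then switch back to "naive". A minimal sketch assembled from the calls visible in this commit; the default AlbertConfig and the input shape are illustrative, and note the commit itself still leaves the memory-opt branch of the albert unit test commented out:

import torch
import turbo_transformers
from transformers.modeling_albert import AlbertConfig, AlbertModel

torch.set_grad_enabled(False)
cfg = AlbertConfig()
torch_model = AlbertModel(cfg)
torch_model.eval()
turbo_model = turbo_transformers.AlbertModel.from_torch(torch_model)

input_ids = torch.randint(low=0, high=cfg.vocab_size - 1, size=(1, 40),
                          dtype=torch.long)

turbo_transformers.reset_allocator_schema("model-aware")
try:
    # Pre-allocate activation memory for this shape, the same call the new
    # unit test makes before running the turbo model.
    turbo_transformers.bert_opt_mem_allocate_api(
        input_ids.size()[0],  # batch
        input_ids.size()[1],  # seq_len
        cfg.num_attention_heads,
        cfg.hidden_size,
        cfg.num_hidden_layers,
        "GPU" if 'cuda' in input_ids.device.type else "CPU")
    turbo_model(input_ids, attention_mask=None, head_mask=None)
finally:
    turbo_transformers.reset_allocator_schema("naive")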

turbo_transformers/python/tests/albert_model_test.py

Lines changed: 93 additions & 76 deletions
@@ -17,89 +17,106 @@
 import torch
 import turbo_transformers
 from transformers.modeling_albert import AlbertConfig, AlbertModel
-import numpy
 import os
 
 sys.path.append(os.path.dirname(__file__))
 import test_helper
 
 
-def create_test(batch_size, seq_length):
-    class TestAlbertModel(unittest.TestCase):
-        def init_data(self, use_cuda: bool) -> None:
-            self.test_device = torch.device('cuda:0') if use_cuda else \
-                torch.device('cpu:0')
-            if not use_cuda:
-                torch.set_num_threads(4)
-                turbo_transformers.set_num_threads(4)
-
-            torch.set_grad_enabled(False)
-            self.cfg = AlbertConfig()
-
-            self.torch_model = AlbertModel(self.cfg)
-            if torch.cuda.is_available():
-                self.torch_model.to(self.test_device)
-            self.torch_model.eval()
-            self.hidden_size = self.cfg.hidden_size
-            self.input_tensor = torch.randint(low=0,
-                                              high=self.cfg.vocab_size - 1,
-                                              size=(batch_size, seq_length),
-                                              device=self.test_device)
-
-            self.turbo_model = turbo_transformers.AlbertModel.from_torch(
-                self.torch_model)
-
-        def check_torch_and_turbo(self, use_cuda):
-            self.init_data(use_cuda=use_cuda)
-            device = "GPU" if use_cuda else "CPU"
-            num_iter = 1
-            turbo_model = lambda: self.turbo_model(
-                self.input_tensor, attention_mask=None, head_mask=None)
-            turbo_result, turbo_qps, turbo_time = \
-                test_helper.run_model(turbo_model, use_cuda, num_iter)
-
-            print(
-                f"AlbertLayer \"({batch_size},{seq_length:03})\" ",
-                f"{device} TurboTransform QPS, {turbo_qps}, time, {turbo_time}"
+class TestAlbertModel(unittest.TestCase):
+    def init_data(self, use_cuda: bool) -> None:
+        self.test_device = torch.device('cuda:0') if use_cuda else \
+            torch.device('cpu:0')
+        if not use_cuda:
+            torch.set_num_threads(4)
+            turbo_transformers.set_num_threads(4)
+
+        torch.set_grad_enabled(False)
+        self.cfg = AlbertConfig(hidden_size=768,
+                                num_attention_heads=12,
+                                intermediate_size=3072)
+        self.torch_model = AlbertModel(self.cfg)
+
+        if torch.cuda.is_available():
+            self.torch_model.to(self.test_device)
+        self.torch_model.eval()
+        self.hidden_size = self.cfg.hidden_size
+
+        self.turbo_model = turbo_transformers.AlbertModel.from_torch(
+            self.torch_model)
+
+    def check_torch_and_turbo(self, batch_size, seq_length, use_cuda,
+                              use_memory_opt):
+        self.init_data(use_cuda=use_cuda)
+        self.input_tensor = torch.randint(low=0,
+                                          high=self.cfg.vocab_size - 1,
+                                          size=(batch_size, seq_length),
+                                          device=self.test_device)
+
+        device = "GPU" if use_cuda else "CPU"
+        num_iter = 1
+
+        if use_memory_opt:
+            turbo_transformers.bert_opt_mem_allocate_api(
+                self.input_tensor.size()[0],  # batch
+                self.input_tensor.size()[1],  # seq_len
+                self.cfg.num_attention_heads,
+                self.cfg.hidden_size,
+                self.cfg.num_hidden_layers,
+                "GPU" if 'cuda' in self.input_tensor.device.type else "CPU")
+
+        turbo_model = lambda: self.turbo_model(
+            self.input_tensor, attention_mask=None, head_mask=None)
+        turbo_result, turbo_qps, turbo_time = \
+            test_helper.run_model(turbo_model, use_cuda, num_iter)
+
+        print(
+            f"AlbertLayer \"({batch_size},{seq_length:03})\" ",
+            f"{device} TurboTransform QPS, {turbo_qps}, time, {turbo_time}")
+        torch_model = lambda: self.torch_model(
+            input_ids=self.input_tensor, attention_mask=None, head_mask=None)
+        with turbo_transformers.pref_guard("albert_perf") as perf:
+            torch_result, torch_qps, torch_time = \
+                test_helper.run_model(torch_model, use_cuda, num_iter)
+
+        print(f"AlbertModel \"({batch_size},{seq_length:03})\" ",
+              f"{device} Torch QPS, {torch_qps}, time, {torch_time}")
+
+        # print(turbo_result[-1])
+        # print(turbo_result, torch_result[0])
+        # TODO(jiaruifang) Error is too high. Does tensor core introduce more differences?
+        tolerate_error = 1e-2
+        self.assertTrue(
+            torch.max(torch.abs(torch_result[0] -
+                                turbo_result[0])) < tolerate_error)
+
+        with open("albert_model_res.txt", "a") as fh:
+            fh.write(
+                f"\"({batch_size},{seq_length:03})\", {torch_qps}, {torch_qps}\n"
             )
-            torch_model = lambda: self.torch_model(input_ids=self.input_tensor,
-                                                   attention_mask=None,
-                                                   head_mask=None)
-            with turbo_transformers.pref_guard("albert_perf") as perf:
-                torch_result, torch_qps, torch_time = \
-                    test_helper.run_model(torch_model, use_cuda, num_iter)
-
-            print(f"AlbertModel \"({batch_size},{seq_length:03})\" ",
-                  f"{device} Torch QPS, {torch_qps}, time, {torch_time}")
-
-            # print(turbo_result[-1])
-            # print(turbo_result, torch_result[0])
-            # TODO(jiaruifang) Error is too high. Does tensor core introduce more differences?
-            tolerate_error = 1e-2
-            self.assertTrue(
-                torch.max(torch.abs(torch_result[0] -
-                                    turbo_result[0])) < tolerate_error)
-
-            with open("albert_model_res.txt", "a") as fh:
-                fh.write(
-                    f"\"({batch_size},{seq_length:03})\", {torch_qps}, {torch_qps}\n"
-                )
-
-        def test_layer(self):
-            self.check_torch_and_turbo(use_cuda=False)
-            if torch.cuda.is_available() and \
-                    turbo_transformers.config.is_compiled_with_cuda():
-                self.check_torch_and_turbo(use_cuda=True)
-
-    globals()[f"TestAlbertModel{batch_size}_{seq_length:03}"] = \
-        TestAlbertModel
-
-
-with open("albert_model_res.txt", "w") as fh:
-    fh.write(", torch, turbo_transformers\n")
-for batch_size in [1, 2]:
-    for seq_length in [10]:
-        create_test(batch_size, seq_length)
+
+    def albert_model_test_helper(self, use_memory_opt):
+        if use_memory_opt:
+            turbo_transformers.reset_allocator_schema("model-aware")
+        for batch_size in [1, 2]:
+            for seq_length in [50, 10, 64]:
+                self.check_torch_and_turbo(batch_size,
+                                           seq_length,
+                                           use_cuda=False,
+                                           use_memory_opt=True)
+                if torch.cuda.is_available() and \
+                        turbo_transformers.config.is_compiled_with_cuda():
+                    self.check_torch_and_turbo(batch_size,
+                                               seq_length,
+                                               use_cuda=True,
+                                               use_memory_opt=True)
+        if use_memory_opt:
+            turbo_transformers.reset_allocator_schema("naive")
+
+    def test(self):
+        self.albert_model_test_helper(False)
+        # self.albert_model_test_helper(True)
+
 
 if __name__ == '__main__':
     unittest.main()
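
One detail worth flagging in the rewritten test: the result line appended to albert_model_res.txt interpolates {torch_qps} twice, while the header the old module-level code wrote read ", torch, turbo_transformers", so the second column is presumably meant to carry the turbo QPS. A small sketch of the presumably intended write; the helper name and the sample values are hypothetical:

def write_result_line(path, batch_size, seq_length, torch_qps, turbo_qps):
    # Hypothetical helper: the second column records the turbo QPS instead of
    # repeating the torch QPS, matching the ", torch, turbo_transformers"
    # header the previous version of the test wrote.
    with open(path, "a") as fh:
        fh.write(
            f"\"({batch_size},{seq_length:03})\", {torch_qps}, {turbo_qps}\n")


write_result_line("albert_model_res.txt", 1, 10, 58.3, 121.7)  # made-up QPS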