# import torch
#
# # Create a multidimensional tensor (e.g., an M x N x P tensor)
# M, N, P = 300, 200, 500
# tensor1 = torch.randn(M, N, P).to('cuda')
# tensor2 = torch.randn(P, N).to('cuda')
#
# # Record the cost of the tensor x tensor matmul on the original tensors
# org_times = []
# for i in range(M):
#     start = torch.cuda.Event(enable_timing=True)
#     end = torch.cuda.Event(enable_timing=True)
#     start.record()
#     result = torch.matmul(tensor1, tensor2)
#     end.record()
#     torch.cuda.synchronize()
#     org_times.append(start.elapsed_time(end))
#
# # Set some values to zero, e.g. zero out the first 10 slices along the first dimension
# tensor1[:10, :, :] = 0
#
# # Record the cost of the same matmul on the partially zeroed tensor
# modified_times = []
# for i in range(M):
#     start = torch.cuda.Event(enable_timing=True)
#     end = torch.cuda.Event(enable_timing=True)
#     start.record()
#     result = torch.matmul(tensor1, tensor2)
#     end.record()
#     torch.cuda.synchronize()
#     modified_times.append(start.elapsed_time(end))
#
# print("Mean original time: ", sum(org_times) / len(org_times))
# print("Mean modified time: ", sum(modified_times) / len(modified_times))
import torch, time
import torch.nn.functional as F
import pandas as pd
import numpy as np
import os

# Fix the random seed for reproducibility
torch.manual_seed(0)
# np.random.seed(0)
# Parameters for Llama2-13B on one A100 GPU
batch_size = 8
sequence_length = 350
head_size = 64
num_heads = 40
embed_size_per_head = 128
num_layers = 40
min_cache_layers = 10
max_cache_layers = 25
recom_num_layers = num_layers - min_cache_layers  # number of layers to recompute
loops = 100
org_times = []
# Initialize Query, Key, Value on the GPU
Q = torch.randn(batch_size, sequence_length, head_size, num_heads).to('cuda')
K = torch.randn(batch_size, sequence_length, head_size, num_heads).to('cuda')
V = torch.randn(batch_size, sequence_length, head_size, num_heads).to('cuda')

# Mask that keeps only the last sequence in the batch
mask_tensor = torch.ones_like(Q)  # initialize to all ones
mask_tensor[:batch_size-1] = 0    # zero out the first batch_size-1 sequences
Q_m = Q * mask_tensor
K_m = K * mask_tensor
V_m = V * mask_tensor
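
# A warm-up pass is a common CUDA benchmarking practice: running the kernels once before
# timing keeps one-time launch/allocation overheads out of the first timed iterations.
# Uncomment the two lines below if the cold-start cost should be excluded from the timings.
# _ = torch.matmul(F.softmax(torch.matmul(Q_m, K_m.transpose(-2, -1)), dim=-1), V_m)
# torch.cuda.synchronize()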
# Time scaled dot-product attention over the full (mostly masked) batch
for i in range(loops):
    start_time = torch.cuda.Event(enable_timing=True)
    end_time = torch.cuda.Event(enable_timing=True)
    start_time.record()
    d_k = embed_size_per_head
    scores = torch.matmul(Q_m, K_m.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
    weights = F.softmax(scores, dim=-1)
    output = torch.matmul(weights, V_m)
    end_time.record()
    torch.cuda.synchronize()
    org_times.append(start_time.elapsed_time(end_time))

time.sleep(10)
modified_times = []
# Time attention computed over only the last (unmasked) sequence in the batch
for i in range(loops):
    start_time = torch.cuda.Event(enable_timing=True)
    end_time = torch.cuda.Event(enable_timing=True)
    start_time.record()
    d_k = embed_size_per_head
    Q_s = Q[-1:]
    K_s = K[-1:]
    V_s = V[-1:]
    scores = torch.matmul(Q_s, K_s.transpose(-2, -1)) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
    weights = F.softmax(scores, dim=-1)
    output = torch.matmul(weights, V_s)
    end_time.record()
    torch.cuda.synchronize()
    modified_times.append(start_time.elapsed_time(end_time))
print("Mean original time: ", sum(org_times) / len(org_times), "ms")
print("Mean modified time: ", sum(modified_times) / len(modified_times), "ms")
time.sleep(10)
# save to csv
mean_org_time = sum(org_times) / len(org_times)
mean_modified_time = sum(modified_times) / len(modified_times)
data = {
'sequence_length': [sequence_length],
'max_cache_layers': [max_cache_layers],
'min_cache_layers': [min_cache_layers],
'mean_org_time': [mean_org_time],
'mean_modified_time': [mean_modified_time],
'delta': [mean_org_time - mean_modified_time],
'speedup': [(mean_org_time - mean_modified_time)/mean_org_time]
}
df = pd.DataFrame(data)
df.to_csv('test_policy.csv', mode='a', header=not os.path.exists('test_policy.csv'), index=False)
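
# Read back the accumulated CSV and print a quick summary of all runs so far
# (assumes 'test_policy.csv' holds one row per run with the columns written above).
summary = pd.read_csv('test_policy.csv')
print(summary[['sequence_length', 'mean_org_time', 'mean_modified_time', 'speedup']].to_string(index=False))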