/*
Matrix add using CUDA.
Demonstrates that cache-line-friendly (coalesced) global memory access is
important for high performance on the GPU.
compile: make
run: ./test 10240 1024
output (measured on an RTX 2080 Ti):
#################### Better Cache #########################
<M, N> = <10240 1024>
Take 0.247000 ms 169.809879 GB/s
Take 0.239000 ms 175.493891 GB/s
Take 0.239000 ms 175.493891 GB/s
Take 0.239000 ms 175.493891 GB/s
Take 0.240000 ms 174.762667 GB/s
Take 0.239000 ms 175.493891 GB/s
Take 0.239000 ms 175.493891 GB/s
Take 0.240000 ms 174.762667 GB/s
Take 0.238000 ms 176.231261 GB/s
Take 0.239000 ms 175.493891 GB/s
Take 0.239000 ms 175.493891 GB/s
Take 0.239000 ms 175.493891 GB/s
Take 0.238000 ms 176.231261 GB/s
Take 0.238000 ms 176.231261 GB/s
Take 0.239000 ms 175.493891 GB/s
Take 0.239000 ms 175.493891 GB/s
Take 0.239000 ms 175.493891 GB/s
Take 0.238000 ms 176.231261 GB/s
Take 0.239000 ms 175.493891 GB/s
Take 0.240000 ms 174.762667 GB/s
#################### Worse Cache #########################
<M, N> = <10240 1024>
Take 0.760000 ms 55.188211 GB/s
Take 0.752000 ms 55.775319 GB/s
Take 0.768000 ms 54.613333 GB/s
Take 0.763000 ms 54.971219 GB/s
Take 0.767000 ms 54.684537 GB/s
Take 0.751000 ms 55.849587 GB/s
Take 0.748000 ms 56.073583 GB/s
Take 0.752000 ms 55.775319 GB/s
Take 0.750000 ms 55.924053 GB/s
Take 0.751000 ms 55.849587 GB/s
Take 0.750000 ms 55.924053 GB/s
Take 0.750000 ms 55.924053 GB/s
Take 0.761000 ms 55.115690 GB/s
Take 0.751000 ms 55.849587 GB/s
Take 0.765000 ms 54.827503 GB/s
Take 0.768000 ms 54.613333 GB/s
Take 0.751000 ms 55.849587 GB/s
Take 0.766000 ms 54.755927 GB/s
Take 0.750000 ms 55.924053 GB/s
Take 0.766000 ms 54.755927 GB/s
*/
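/*
How the GB/s figures above are computed (a worked example using a representative
0.239 ms run from the output above; the 1000000.0 divisor in main converts
bytes-per-millisecond to GB/s):
    bytes = M * N * sizeof(float) = 10240 * 1024 * 4 = 41,943,040
    41,943,040 bytes / 0.239 ms / 1e6  ~=  175.49 GB/s
Note that this counts the bytes of a single M x N matrix only; the kernel reads
A and B and writes C, so the actual memory traffic is roughly 3x this figure.
*/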
#include <stdio.h>
#include <stdlib.h>   // atoi
#include <curand.h>
#include <sys/time.h>
#include <assert.h>
#include <iostream>
#include <limits>
#include <mma.h>
#include <vector>
// Step 1: compile and run with BETTER_CACHE defined (fast, coalesced accesses).
// Step 2: comment out the line below, recompile, and run again (slow, strided accesses).
#define BETTER_CACHE
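// Access-pattern sketch (illustration only; warp size 32 and 128-byte cache
// lines assumed):
//   With BETTER_CACHE, the 32 threads of a warp have consecutive tidx and handle
//   consecutive columns of one row, so thread k touches element i*N + (j + k):
//   32 adjacent floats, i.e. a few contiguous 128-byte lines per warp.
//   Without it, thread k touches element (j + k)*N + i: addresses 4*N bytes
//   apart, so every thread pulls in its own cache line and most of each line
//   is wasted.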
// Elapsed time between two struct timeval samples, in milliseconds.
#define TIME(a,b) ((double)((b).tv_sec-(a).tv_sec) * 1000.0 + (double)((b).tv_usec-(a).tv_usec)/(double)1000.0)
// Define some error checking macros.
#define cudaErrCheck(stat) { cudaErrCheck_((stat), __FILE__, __LINE__); }
void cudaErrCheck_(cudaError_t stat, const char *file, int line) {
    if (stat != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(stat), file, line);
    }
}
__global__ void AddGpuKernel(float *C,
                             const float *A, const float *B,
                             size_t M, size_t N)
{
    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;
    const int tnumx = blockDim.x * gridDim.x;
    const int tnumy = blockDim.y * gridDim.y;
#if defined(BETTER_CACHE)
    // Coalesced: consecutive tidx -> consecutive addresses within a row.
    for (int i = tidy; i < M; i += tnumy)
    {
        for (int j = tidx; j < N; j += tnumx)
        {
            const int offset = i * N + j;
            C[offset] = A[offset] + B[offset];
        }
    }
#else
    // Strided: consecutive tidx -> addresses N floats apart down a column.
    for (int i = tidy; i < N; i += tnumy)
    {
        for (int j = tidx; j < M; j += tnumx)
        {
            const int offset = j * N + i;
            C[offset] = A[offset] + B[offset];
        }
    }
#endif
}
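// An alternative timing approach (a minimal sketch, not used by main below;
// the helper name is introduced here for illustration). CUDA events measure on
// the GPU timeline and cudaEventElapsedTime already reports milliseconds, so
// the host-side clock resolution of gettimeofday is avoided.
static float TimeAddKernelWithEvents(float *C, const float *A, const float *B,
                                     size_t M, size_t N, dim3 grid, dim3 block) {
    cudaEvent_t start, stop;
    cudaErrCheck(cudaEventCreate(&start));
    cudaErrCheck(cudaEventCreate(&stop));
    cudaErrCheck(cudaEventRecord(start));
    AddGpuKernel<<<grid, block>>>(C, A, B, M, N);
    cudaErrCheck(cudaEventRecord(stop));
    cudaErrCheck(cudaEventSynchronize(stop));
    float ms = 0.0f;
    cudaErrCheck(cudaEventElapsedTime(&ms, start, stop));
    cudaErrCheck(cudaEventDestroy(start));
    cudaErrCheck(cudaEventDestroy(stop));
    return ms;
}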
int main(int argc, char* argv[]) {
    if (argc < 3) {
        fprintf(stderr, "usage: %s M N\n", argv[0]);
        return 1;
    }
    int M = atoi(argv[1]);
    int N = atoi(argv[2]);
    printf("<M, N> = <%d %d>\n", M, N);
    // cudaSetDevice(0);
    // input data
    float *A_device;
    float *B_device;
    // output data
    float *C_device;
    // init input data
    size_t size = (size_t)M * N * sizeof(float);
    cudaErrCheck(cudaMalloc((void**)(&A_device), size));
    cudaErrCheck(cudaMalloc((void**)(&B_device), size));
    cudaErrCheck(cudaMemset(A_device, 0, size));
    cudaErrCheck(cudaMemset(B_device, 0, size));
    // init output data
    cudaErrCheck(cudaMalloc((void**)(&C_device), size));
    // used to log processing speed; this counts the bytes of one matrix only,
    // the kernel actually moves about 3x this (read A, read B, write C)
    size_t bytes = (size_t)M * N * sizeof(float);
    for (int s = 0; s < 20; s++)
    {
        dim3 grid(32, 32, 1);
        dim3 block(32, 32, 1);
        struct timeval t0, t1;
        gettimeofday(&t0, NULL);
        AddGpuKernel<<<grid, block>>>(C_device, A_device, B_device, M, N);
        cudaErrCheck(cudaGetLastError());       // catch kernel launch errors
        cudaErrCheck(cudaDeviceSynchronize());  // wait for the kernel before stopping the clock
        gettimeofday(&t1, NULL);
        printf("Take %.6f ms %.6lf GB/s\n", TIME(t0, t1), bytes / TIME(t0, t1) / 1000000.0);
    }
    cudaErrCheck(cudaFree(A_device));
    cudaErrCheck(cudaFree(B_device));
    cudaErrCheck(cudaFree(C_device));
    return 0;
}
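// Build note (a sketch; the header comment above says "compile: make", but the
// repository's Makefile is not shown here). The file can also be compiled
// directly with something like
//   nvcc -O2 -o test test.cu
// and then run as ./test 10240 1024, matching the header comment.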