-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlearn_opencl.py
68 lines (50 loc) · 2.49 KB
/
learn_opencl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
__author__ = 'toster'
import pyopencl as cl
from pyopencl import array
import numpy
if __name__ == "__main__":
    ## Demo: multiply a 4x4 matrix by a 4-component vector on an OpenCL device.
    ## Each matrix row is stored as one float4, and the kernel computes one
    ## dot product (row . vector) per work-item.
    ## NOTE(review): newer PyOpenCL releases appear to expose the vector types
    ## under pyopencl.cltypes rather than pyopencl.array.vec -- confirm against
    ## the installed version.
    vector = numpy.zeros((1, 1), cl.array.vec.float4)
    matrix = numpy.zeros((1, 4), cl.array.vec.float4)
    matrix[0, 0] = (1, 2, 4, 8)
    matrix[0, 1] = (16, 32, 64, 128)
    matrix[0, 2] = (3, 6, 9, 12)
    matrix[0, 3] = (5, 10, 15, 25)
    vector[0, 0] = (1, 2, 4, 8)

    ## Step #1. Obtain an OpenCL platform.
    platform = cl.get_platforms()[0]
    ## Call form of print so the script parses on Python 3 as well as Python 2.
    print(platform.extensions)
    ## It would be necessary to add some code to check the support for
    ## the necessary platform extensions with platform.extensions

    ## Step #2. Obtain a device id for at least one device (accelerator).
    device = platform.get_devices()[0]
    ## It would be necessary to add some code to check the support for
    ## the necessary device extensions with device.extensions

    ## Step #3. Create a context for the selected device.
    context = cl.Context([device])

    ## Step #4. Create the accelerator program from source code.
    ## Step #5. Build the program.
    ## Step #6. Create one or more kernels from the program functions.
    program = cl.Program(context, """
__kernel void matrix_dot_vector(__global const float4 *matrix,
__global const float4 *vector, __global float *result)
{
int gid = get_global_id(0);
result[gid] = dot(matrix[gid], vector[0]);
}
""").build()

    ## Step #7. Create a command queue for the target device.
    queue = cl.CommandQueue(context)

    ## Step #8. Allocate device memory and move input data from the host to the device memory.
    mem_flags = cl.mem_flags
    matrix_buf = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=matrix)
    vector_buf = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=vector)
    ## Host array that receives the four dot products (one float32 per row).
    matrix_dot_vector = numpy.zeros(4, numpy.float32)
    destination_buf = cl.Buffer(context, mem_flags.WRITE_ONLY, matrix_dot_vector.nbytes)

    ## Step #9. Associate the arguments to the kernel with kernel object.
    ## Step #10. Deploy the kernel for device execution.
    ## The global size is the result shape (4,), i.e. one work-item per matrix row.
    program.matrix_dot_vector(queue, matrix_dot_vector.shape, None, matrix_buf, vector_buf, destination_buf)

    ## Step #11. Move the kernel's output data from device memory to the host array.
    cl.enqueue_copy(queue, matrix_dot_vector, destination_buf)

    ## Step #12. Release context, program, kernels and memory.
    ## PyOpenCL performs this step for you, and therefore,
    ## you don't need to worry about cleanup code

    ## For the inputs above the four dot products are 85, 1360, 147 and 285.
    print(matrix_dot_vector)