Skip to content

Commit

Permalink
Allow switching queues, and update examples.
Browse files Browse the repository at this point in the history
  • Loading branch information
maleadt committed Sep 9, 2024
1 parent eae3030 commit b59d898
Show file tree
Hide file tree
Showing 10 changed files with 170 additions and 174 deletions.
12 changes: 5 additions & 7 deletions examples/demo.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,18 @@ const sum_kernel_src = "
a = rand(Float32, 50_000)
b = rand(Float32, 50_000)

device, ctx, queue = cl.create_compute_context()

# create opencl buffer objects
# copies to the device initiated when the kernel function is called
a_buff = cl.Buffer(Float32, ctx, length(a), (:r, :copy); hostbuf=a)
b_buff = cl.Buffer(Float32, ctx, length(b), (:r, :copy); hostbuf=b)
c_buff = cl.Buffer(Float32, ctx, length(a), :w)
a_buff = cl.Buffer(Float32, length(a), (:r, :copy); hostbuf=a)
b_buff = cl.Buffer(Float32, length(b), (:r, :copy); hostbuf=b)
c_buff = cl.Buffer(Float32, length(a), :w)

# build the program and construct a kernel object
p = cl.Program(ctx, source=sum_kernel_src) |> cl.build!
p = cl.Program(source=sum_kernel_src) |> cl.build!
sum_kernel = cl.Kernel(p, "sum")

# call the kernel object with global size set to the size our arrays
sum_kernel[queue, size(a)](a_buff, b_buff, c_buff)
sum_kernel[size(a)](a_buff, b_buff, c_buff)

# perform a blocking read of the result from the device
r = cl.read(c_buff)
Expand Down
29 changes: 12 additions & 17 deletions examples/hands_on_opencl/ex04/vadd_chain.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,8 @@ __kernel void vadd(

# create a compute context

# this selects the fastest opencl device available
# and creates a context and queue for using the
# the selected device
device, ctx, queue = cl.create_compute_context()

# create the compute program and build it
program = cl.Program(ctx, source=kernelsource) |> cl.build!
program = cl.Program(source=kernelsource) |> cl.build!

#create a, b, e, and g vectors and fill with random float values
#create empty vectors for c, d, and f
Expand All @@ -67,14 +62,14 @@ h_g = rand(Float32, LENGTH)
# {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}

# Create the input (a, b, e, g) arrays in device memory and copy data from host
d_a = cl.Buffer(Float32, ctx, length(h_a), (:r, :copy), hostbuf=h_a)
d_b = cl.Buffer(Float32, ctx, length(h_b), (:r, :copy), hostbuf=h_b)
d_e = cl.Buffer(Float32, ctx, length(h_e), (:r, :copy), hostbuf=h_e)
d_g = cl.Buffer(Float32, ctx, length(h_g), (:r, :copy), hostbuf=h_g)
d_a = cl.Buffer(Float32, length(h_a), (:r, :copy), hostbuf=h_a)
d_b = cl.Buffer(Float32, length(h_b), (:r, :copy), hostbuf=h_b)
d_e = cl.Buffer(Float32, length(h_e), (:r, :copy), hostbuf=h_e)
d_g = cl.Buffer(Float32, length(h_g), (:r, :copy), hostbuf=h_g)
# Create the output (c, d, f) array in device memory
d_c = cl.Buffer(Float32, ctx, :LENGTH, w)
d_d = cl.Buffer(Float32, ctx, :LENGTH, w)
d_f = cl.Buffer(Float32, ctx, :LENGTH, w)
d_c = cl.Buffer(Float32, LENGTH, :w)
d_d = cl.Buffer(Float32, LENGTH, :w)
d_f = cl.Buffer(Float32, LENGTH, :w)

# create the kernel
vadd = cl.Kernel(program, "vadd")
Expand All @@ -86,18 +81,18 @@ vadd = cl.Kernel(program, "vadd")
# here we call the kernel with work size set to the number of elements and a local
# work size of nothing. This enables the opencl runtime to optimize the local size
# for simple kernels
queue(vadd, size(h_a), nothing, d_a, d_b, d_c, UInt32(LENGTH))
cl.launch(vadd, size(h_a), nothing, d_a, d_b, d_c, UInt32(LENGTH))

# an alternative syntax is to create a partial function to call
# by julia's getindex syntax for Kernel types.
# here the global_size and (optional) local_size are passed in, which
# returns a partial function with these parameters set.
vadd[queue, size(h_e)](d_e, d_c, d_d, UInt32(LENGTH))
vadd[queue, size(h_g)](d_g, d_d, d_f, UInt32(LENGTH))
vadd[size(h_e)](d_e, d_c, d_d, UInt32(LENGTH))
vadd[size(h_g)](d_g, d_d, d_f, UInt32(LENGTH))

# copy back the results from the compute device
# copy!(dst, src) follows the same interface as julia's built-in copy!
cl.copy!(queue, h_f, d_f)
cl.copy!(h_f, d_f)

# test the results
correct = 0
Expand Down
15 changes: 6 additions & 9 deletions examples/hands_on_opencl/ex05/vadd_abc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,30 +38,27 @@ __kernel void vadd(
r[i] = a[i] + b[i] + c[i];
}"

# create a compute context
device, ctx, queue = cl.create_compute_context()

# create the compute program and build it
program = cl.Program(ctx, source=kernelsource) |> cl.build!
program = cl.Program(source=kernelsource) |> cl.build!

# create a, b and c vectors and fill with random float values
# (the result array will be created when reading back from the device)
h_a = rand(Float32, LENGTH)
h_b = rand(Float32, LENGTH)
h_c = rand(Float32, LENGTH)

d_a = cl.Buffer(Float32, ctx, length(h_a), (:r, :copy), hostbuf=h_a)
d_b = cl.Buffer(Float32, ctx, length(h_b), (:r, :copy), hostbuf=h_b)
d_c = cl.Buffer(Float32, ctx, length(h_c), (:r, :copy), hostbuf=h_c)
d_a = cl.Buffer(Float32, length(h_a), (:r, :copy), hostbuf=h_a)
d_b = cl.Buffer(Float32, length(h_b), (:r, :copy), hostbuf=h_b)
d_c = cl.Buffer(Float32, length(h_c), (:r, :copy), hostbuf=h_c)

# create the output (r) buffer in device memory
d_r = cl.Buffer(Float32, ctx, LENGTH, :w)
d_r = cl.Buffer(Float32, LENGTH, :w)

# create the kernel
vadd = cl.Kernel(program, "vadd")

# execute the kernel over the entire range of the input
vadd[queue, size(h_a)](d_a, d_b, d_c, d_r, UInt32(LENGTH))
vadd[size(h_a)](d_a, d_b, d_c, d_r, UInt32(LENGTH))

# read the results back from the compute device
# by convention.
Expand Down
37 changes: 17 additions & 20 deletions examples/hands_on_opencl/ex06/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,19 +104,12 @@ for i in 1:COUNT
results(Mdim, Ndim, Pdim, h_C, t2 - t1)
end

# set up OpenCL
ctx = cl.create_some_context()

# You can enable profiling events on the queue
# by calling the constructor with the :profile flag
queue = cl.CmdQueue(ctx, :profile)

# create OpenCL Buffers
d_a = cl.Buffer(Float32, ctx, length(h_A), (:r,:copy), hostbuf=h_A)
d_b = cl.Buffer(Float32, ctx, length(h_B), (:r,:copy), hostbuf=h_B)
d_c = cl.Buffer(Float32, ctx, length(h_C), :w)
d_a = cl.Buffer(Float32, length(h_A), (:r,:copy), hostbuf=h_A)
d_b = cl.Buffer(Float32, length(h_B), (:r,:copy), hostbuf=h_B)
d_c = cl.Buffer(Float32, length(h_C), :w)

prg = cl.Program(ctx, source=kernel_source) |> cl.build!
prg = cl.Program(source=kernel_source) |> cl.build!
mmul = cl.Kernel(prg, "mmul")

@info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")
Expand All @@ -125,13 +118,17 @@ for i in 1:COUNT
fill!(h_C, 0.0)

global_range = (Ndim, Mdim)
mmul_ocl = mmul[queue, global_range]

evt = mmul_ocl(Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)

# profiling events are measured in ns
run_time = evt[:profile_duration] / 1e9
cl.copy!(queue, h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
mmul_ocl = mmul[global_range]

# You can enable profiling events by temporarily switching
# to a task-local queue created with the :profile flag
cl.queue!(:profile) do
evt = mmul_ocl(Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)

# profiling events are measured in ns
run_time = evt[:profile_duration] / 1e9
cl.copy!(h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
end
end
59 changes: 28 additions & 31 deletions examples/hands_on_opencl/ex07/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,68 +85,65 @@ for i in 1:COUNT
results(Mdim, Ndim, Pdim, h_C, t2 - t1)
end

# set up OpenCL
ctx = cl.create_some_context()

# You can enable profiling events on the queue
# by calling the constructor with the :profile flag
queue = cl.CmdQueue(ctx, :profile)

# create OpenCL Buffers
d_a = cl.Buffer(Float32, ctx, length(h_A), (:r,:copy), hostbuf=h_A)
d_b = cl.Buffer(Float32, ctx, length(h_B), (:r,:copy), hostbuf=h_B)
d_c = cl.Buffer(Float32, ctx, length(h_C), :w)
d_a = cl.Buffer(Float32, length(h_A), (:r,:copy), hostbuf=h_A)
d_b = cl.Buffer(Float32, length(h_B), (:r,:copy), hostbuf=h_B)
d_c = cl.Buffer(Float32, length(h_C), :w)

#--------------------------------------------------------------------------------
# OpenCL matrix multiplication ... Naive
#--------------------------------------------------------------------------------

kernel_source = read(joinpath(src_dir, "C_elem.cl"), String)
prg = cl.Program(ctx, source=kernel_source) |> cl.build!
prg = cl.Program(source=kernel_source) |> cl.build!
mmul = cl.Kernel(prg, "mmul")

@info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")

for i in 1:COUNT
fill!(h_C, 0.0)
evt = queue(mmul, (Ndim, Mdim), nothing,
Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)
# profiling events are measured in ns
run_time = evt[:profile_duration] / 1e9
cl.copy!(queue, h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
cl.queue!(:profile) do
evt = cl.launch(mmul, (Ndim, Mdim), nothing,
Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)
# profiling events are measured in ns
run_time = evt[:profile_duration] / 1e9
cl.copy!(h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
end
end

#--------------------------------------------------------------------------------
# OpenCL matrix multiplication ... C row per work item
#--------------------------------------------------------------------------------

kernel_source = read(joinpath(src_dir, "C_row.cl"), String)
prg = cl.Program(ctx, source=kernel_source) |> cl.build!
prg = cl.Program(source=kernel_source) |> cl.build!
mmul = cl.Kernel(prg, "mmul")

@info("=== OpenCL, matrix mult, C row per work item, order $Ndim ====")

for i in 1:COUNT
fill!(h_C, 0.0)
mmul_ocl = mmul[queue, (Ndim,), (div(ORDER, 16),)]
mmul_ocl = mmul[(Ndim,), (div(ORDER, 16),)]

evt = mmul_ocl(Int32(Mdim), Int32(Ndim), Int32(Pdim), d_a, d_b, d_c)
cl.queue!(:profile) do
evt = mmul_ocl(Int32(Mdim), Int32(Ndim), Int32(Pdim), d_a, d_b, d_c)

# profiling events are measured in ns
run_time = evt[:profile_duration] / 1e9
cl.copy!(queue, h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
# profiling events are measured in ns
run_time = evt[:profile_duration] / 1e9
cl.copy!(h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
end
end

#--------------------------------------------------------------------------------
# OpenCL matrix multiplication ... C row per work item, A row in private memory
#--------------------------------------------------------------------------------
kernel_source = read(joinpath(src_dir, "C_row_priv.cl"), String)
prg = cl.Program(ctx, source=kernel_source) |> cl.build!
prg = cl.Program(source=kernel_source) |> cl.build!
mmul = cl.Kernel(prg, "mmul")
wk_size = cl.info(first(cl.devices(ctx)), :max_work_group_size)
wk_size = cl.info(cl.device(), :max_work_group_size)
if Ndim * (ORDER ÷ 16) >= wk_size
@warn("Specified work_size $(Ndim * (ORDER ÷ 16)) is bigger than $wk_size")
else
Expand All @@ -155,12 +152,12 @@ else

for i in 1:COUNT
fill!(h_C, 0.0)
evt = queue(mmul, (Ndim,), (ORDER,),
Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)
evt = cl.launch(mmul, (Ndim,), (ORDER,),
Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)
# profiling events are measured in ns
run_time = evt[:profile_duration] / 1e9
cl.copy!(queue, h_C, d_c)
cl.copy!(h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
end
end
Loading

0 comments on commit b59d898

Please sign in to comment.