Skip to content

Commit

Permalink
Allow switching queues, and update examples.
Browse files Browse the repository at this point in the history
  • Loading branch information
maleadt committed Sep 9, 2024
1 parent eae3030 commit b59d898
Show file tree
Hide file tree
Showing 10 changed files with 170 additions and 174 deletions.
12 changes: 5 additions & 7 deletions examples/demo.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,20 +12,18 @@ const sum_kernel_src = "
a = rand(Float32, 50_000)
b = rand(Float32, 50_000)

device, ctx, queue = cl.create_compute_context()

# create opencl buffer objects
# copies to the device initiated when the kernel function is called
a_buff = cl.Buffer(Float32, ctx, length(a), (:r, :copy); hostbuf=a)
b_buff = cl.Buffer(Float32, ctx, length(b), (:r, :copy); hostbuf=b)
c_buff = cl.Buffer(Float32, ctx, length(a), :w)
a_buff = cl.Buffer(Float32, length(a), (:r, :copy); hostbuf=a)
b_buff = cl.Buffer(Float32, length(b), (:r, :copy); hostbuf=b)
c_buff = cl.Buffer(Float32, length(a), :w)

# build the program and construct a kernel object
p = cl.Program(ctx, source=sum_kernel_src) |> cl.build!
p = cl.Program(source=sum_kernel_src) |> cl.build!
sum_kernel = cl.Kernel(p, "sum")

# call the kernel object with global size set to the size our arrays
sum_kernel[queue, size(a)](a_buff, b_buff, c_buff)
sum_kernel[size(a)](a_buff, b_buff, c_buff)

# perform a blocking read of the result from the device
r = cl.read(c_buff)
Expand Down
29 changes: 12 additions & 17 deletions examples/hands_on_opencl/ex04/vadd_chain.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,8 @@ __kernel void vadd(

# create a compute context

# this selects the fastest opencl device available
# and creates a context and queue for using the
# the selected device
device, ctx, queue = cl.create_compute_context()

# create the compute program and build it
program = cl.Program(ctx, source=kernelsource) |> cl.build!
program = cl.Program(source=kernelsource) |> cl.build!

#create a, b, e, and g vectors and fill with random float values
#create empty vectors for c, d, and f
Expand All @@ -67,14 +62,14 @@ h_g = rand(Float32, LENGTH)
# {:use (use host buffer), :alloc (alloc pinned memory), :copy (default)}

# Create the input (a, b, e, g) arrays in device memory and copy data from host
d_a = cl.Buffer(Float32, ctx, length(h_a), (:r, :copy), hostbuf=h_a)
d_b = cl.Buffer(Float32, ctx, length(h_b), (:r, :copy), hostbuf=h_b)
d_e = cl.Buffer(Float32, ctx, length(h_e), (:r, :copy), hostbuf=h_e)
d_g = cl.Buffer(Float32, ctx, length(h_g), (:r, :copy), hostbuf=h_g)
d_a = cl.Buffer(Float32, length(h_a), (:r, :copy), hostbuf=h_a)
d_b = cl.Buffer(Float32, length(h_b), (:r, :copy), hostbuf=h_b)
d_e = cl.Buffer(Float32, length(h_e), (:r, :copy), hostbuf=h_e)
d_g = cl.Buffer(Float32, length(h_g), (:r, :copy), hostbuf=h_g)
# Create the output (c, d, f) array in device memory
d_c = cl.Buffer(Float32, ctx, :LENGTH, w)
d_d = cl.Buffer(Float32, ctx, :LENGTH, w)
d_f = cl.Buffer(Float32, ctx, :LENGTH, w)
d_c = cl.Buffer(Float32, LENGTH, :w)
d_d = cl.Buffer(Float32, LENGTH, :w)
d_f = cl.Buffer(Float32, LENGTH, :w)

# create the kernel
vadd = cl.Kernel(program, "vadd")
Expand All @@ -86,18 +81,18 @@ vadd = cl.Kernel(program, "vadd")
# here we call the kernel with work size set to the number of elements and a local
# work size of nothing. This enables the opencl runtime to optimize the local size
# for simple kernels
queue(vadd, size(h_a), nothing, d_a, d_b, d_c, UInt32(LENGTH))
cl.launch(vadd, size(h_a), nothing, d_a, d_b, d_c, UInt32(LENGTH))

# an alternative syntax is to create a partial function to call
# by julia's getindex syntax for Kernel types.
# here the global_size and (optional) local_size are passed in, which
# returns a partial function with these parameters set.
vadd[queue, size(h_e)](d_e, d_c, d_d, UInt32(LENGTH))
vadd[queue, size(h_g)](d_g, d_d, d_f, UInt32(LENGTH))
vadd[size(h_e)](d_e, d_c, d_d, UInt32(LENGTH))
vadd[size(h_g)](d_g, d_d, d_f, UInt32(LENGTH))

# copy back the results from the compute device
# copy!(dst, src) follows the same interface as julia's built-in copy!
cl.copy!(queue, h_f, d_f)
cl.copy!(h_f, d_f)

# test the results
correct = 0
Expand Down
15 changes: 6 additions & 9 deletions examples/hands_on_opencl/ex05/vadd_abc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,30 +38,27 @@ __kernel void vadd(
r[i] = a[i] + b[i] + c[i];
}"

# create a compute context
device, ctx, queue = cl.create_compute_context()

# create the compute program and build it
program = cl.Program(ctx, source=kernelsource) |> cl.build!
program = cl.Program(source=kernelsource) |> cl.build!

# create a, b and c vectors and fill with random float values
# (the result array will be created when reading back from the device)
h_a = rand(Float32, LENGTH)
h_b = rand(Float32, LENGTH)
h_c = rand(Float32, LENGTH)

d_a = cl.Buffer(Float32, ctx, length(h_a), (:r, :copy), hostbuf=h_a)
d_b = cl.Buffer(Float32, ctx, length(h_b), (:r, :copy), hostbuf=h_b)
d_c = cl.Buffer(Float32, ctx, length(h_c), (:r, :copy), hostbuf=h_c)
d_a = cl.Buffer(Float32, length(h_a), (:r, :copy), hostbuf=h_a)
d_b = cl.Buffer(Float32, length(h_b), (:r, :copy), hostbuf=h_b)
d_c = cl.Buffer(Float32, length(h_c), (:r, :copy), hostbuf=h_c)

# create the output (r) buffer in device memory
d_r = cl.Buffer(Float32, ctx, LENGTH, :w)
d_r = cl.Buffer(Float32, LENGTH, :w)

# create the kernel
vadd = cl.Kernel(program, "vadd")

# execute the kernel over the entire range of the input
vadd[queue, size(h_a)](d_a, d_b, d_c, d_r, UInt32(LENGTH))
vadd[size(h_a)](d_a, d_b, d_c, d_r, UInt32(LENGTH))

# read the results back from the compute device
# by convention.
Expand Down
37 changes: 17 additions & 20 deletions examples/hands_on_opencl/ex06/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,19 +104,12 @@ for i in 1:COUNT
results(Mdim, Ndim, Pdim, h_C, t2 - t1)
end

# set up OpenCL
ctx = cl.create_some_context()

# You can enable profiling events on the queue
# by calling the constructor with the :profile flag
queue = cl.CmdQueue(ctx, :profile)

# create OpenCL Buffers
d_a = cl.Buffer(Float32, ctx, length(h_A), (:r,:copy), hostbuf=h_A)
d_b = cl.Buffer(Float32, ctx, length(h_B), (:r,:copy), hostbuf=h_B)
d_c = cl.Buffer(Float32, ctx, length(h_C), :w)
d_a = cl.Buffer(Float32, length(h_A), (:r,:copy), hostbuf=h_A)
d_b = cl.Buffer(Float32, length(h_B), (:r,:copy), hostbuf=h_B)
d_c = cl.Buffer(Float32, length(h_C), :w)

prg = cl.Program(ctx, source=kernel_source) |> cl.build!
prg = cl.Program(source=kernel_source) |> cl.build!
mmul = cl.Kernel(prg, "mmul")

@info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")
Expand All @@ -125,13 +118,17 @@ for i in 1:COUNT
fill!(h_C, 0.0)

global_range = (Ndim, Mdim)
mmul_ocl = mmul[queue, global_range]

evt = mmul_ocl(Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)

# profiling events are measured in ns
run_time = evt[:profile_duration] / 1e9
cl.copy!(queue, h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
mmul_ocl = mmul[global_range]

# You can enable profiling events by temporarily switching
# to a task-local queue created with the :profile flag
cl.queue!(:profile) do
evt = mmul_ocl(Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)

# profiling events are measured in ns
run_time = evt[:profile_duration] / 1e9
cl.copy!(h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
end
end
59 changes: 28 additions & 31 deletions examples/hands_on_opencl/ex07/matmul.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,68 +85,65 @@ for i in 1:COUNT
results(Mdim, Ndim, Pdim, h_C, t2 - t1)
end

# set up OpenCL
ctx = cl.create_some_context()

# You can enable profiling events on the queue
# by calling the constructor with the :profile flag
queue = cl.CmdQueue(ctx, :profile)

# create OpenCL Buffers
d_a = cl.Buffer(Float32, ctx, length(h_A), (:r,:copy), hostbuf=h_A)
d_b = cl.Buffer(Float32, ctx, length(h_B), (:r,:copy), hostbuf=h_B)
d_c = cl.Buffer(Float32, ctx, length(h_C), :w)
d_a = cl.Buffer(Float32, length(h_A), (:r,:copy), hostbuf=h_A)
d_b = cl.Buffer(Float32, length(h_B), (:r,:copy), hostbuf=h_B)
d_c = cl.Buffer(Float32, length(h_C), :w)

#--------------------------------------------------------------------------------
# OpenCL matrix multiplication ... Naive
#--------------------------------------------------------------------------------

kernel_source = read(joinpath(src_dir, "C_elem.cl"), String)
prg = cl.Program(ctx, source=kernel_source) |> cl.build!
prg = cl.Program(source=kernel_source) |> cl.build!
mmul = cl.Kernel(prg, "mmul")

@info("=== OpenCL, matrix mult, C(i, j) per work item, order $Ndim ====")

for i in 1:COUNT
fill!(h_C, 0.0)
evt = queue(mmul, (Ndim, Mdim), nothing,
Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)
# profiling events are measured in ns
run_time = evt[:profile_duration] / 1e9
cl.copy!(queue, h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
cl.queue!(:profile) do
evt = cl.launch(mmul, (Ndim, Mdim), nothing,
Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)
# profiling events are measured in ns
run_time = evt[:profile_duration] / 1e9
cl.copy!(h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
end
end

#--------------------------------------------------------------------------------
# OpenCL matrix multiplication ... C row per work item
#--------------------------------------------------------------------------------

kernel_source = read(joinpath(src_dir, "C_row.cl"), String)
prg = cl.Program(ctx, source=kernel_source) |> cl.build!
prg = cl.Program(source=kernel_source) |> cl.build!
mmul = cl.Kernel(prg, "mmul")

@info("=== OpenCL, matrix mult, C row per work item, order $Ndim ====")

for i in 1:COUNT
fill!(h_C, 0.0)
mmul_ocl = mmul[queue, (Ndim,), (div(ORDER, 16),)]
mmul_ocl = mmul[(Ndim,), (div(ORDER, 16),)]

evt = mmul_ocl(Int32(Mdim), Int32(Ndim), Int32(Pdim), d_a, d_b, d_c)
cl.queue!(:profile) do
evt = mmul_ocl(Int32(Mdim), Int32(Ndim), Int32(Pdim), d_a, d_b, d_c)

# profiling events are measured in ns
run_time = evt[:profile_duration] / 1e9
cl.copy!(queue, h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
# profiling events are measured in ns
run_time = evt[:profile_duration] / 1e9
cl.copy!(h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
end
end

#--------------------------------------------------------------------------------
# OpenCL matrix multiplication ... C row per work item, A row in private memory
#--------------------------------------------------------------------------------
kernel_source = read(joinpath(src_dir, "C_row_priv.cl"), String)
prg = cl.Program(ctx, source=kernel_source) |> cl.build!
prg = cl.Program(source=kernel_source) |> cl.build!
mmul = cl.Kernel(prg, "mmul")
wk_size = cl.info(first(cl.devices(ctx)), :max_work_group_size)
wk_size = cl.info(cl.device(), :max_work_group_size)
if Ndim * (ORDER ÷ 16) >= wk_size
@warn("Specified work_size $(Ndim * (ORDER ÷ 16)) is bigger than $wk_size")
else
Expand All @@ -155,12 +152,12 @@ else

for i in 1:COUNT
fill!(h_C, 0.0)
evt = queue(mmul, (Ndim,), (ORDER,),
Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)
evt = cl.launch(mmul, (Ndim,), (ORDER,),
Int32(Mdim), Int32(Ndim), Int32(Pdim),
d_a, d_b, d_c)
# profiling events are measured in ns
run_time = evt[:profile_duration] / 1e9
cl.copy!(queue, h_C, d_c)
cl.copy!(h_C, d_c)
results(Mdim, Ndim, Pdim, h_C, run_time)
end
end
Loading

0 comments on commit b59d898

Please sign in to comment.