Skip to content

Commit

Permalink
Add doc and test for fastmath kwarg
Browse files Browse the repository at this point in the history
  • Loading branch information
Zentrik committed Aug 14, 2023
1 parent 81cc186 commit e87fa74
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/compiler/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,7 @@ The following keyword arguments are supported:
multiprocessor
- `maxregs`: the maximum number of registers to be allocated to a single thread (only
supported on LLVM 4.0+)
- `fastmath`: use less precise square roots and flush denormals
- `name`: override the name that the kernel will have in the generated code
- `always_inline`: inline all function calls in the kernel
Expand Down
23 changes: 23 additions & 0 deletions test/core/codegen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,29 @@ end
@test occursin(r"\.func .*julia_f_expensive", asm)
end

@testset "fastmath" begin
    # Kernel whose square-root precision is controlled by the `fastmath` kwarg
    # passed at compile time (no source-level annotation).
    function sqrt_kernel(x)
        idx = threadIdx().x
        @inbounds x[idx] = sqrt(x[idx])
        return
    end

    # Kernel with an explicit source-level @fastmath division.
    function div_kernel(x)
        idx = threadIdx().x
        @fastmath @inbounds x[idx] = 1 / x[idx]
        return
    end

    # All kernels below are compiled for a 1-d global-memory Float32 array.
    tt = Tuple{CuDeviceArray{Float32,1,AS.Global}}
    ptx(kernel; kwargs...) = sprint(io -> CUDA.code_ptx(io, kernel, tt; kwargs...))

    # default compilation: rounded (IEEE) square root
    @test occursin("sqrt.r", ptx(sqrt_kernel))

    # fastmath=true switches to the approximate, flush-to-zero variant
    @test occursin("sqrt.approx.ftz", ptx(sqrt_kernel; fastmath=true))

    # division annotated with @fastmath also lowers to the approximate form
    @test occursin("div.approx.ftz", ptx(div_kernel; fastmath=true))
end

@testset "local memory stores due to byval" begin
# JuliaGPU/GPUCompiler.jl#92
function kernel(y1, y2)
Expand Down

0 comments on commit e87fa74

Please sign in to comment.