Skip to content

Commit

Permalink
Add doc and test for fastmath kwarg
Browse files Browse the repository at this point in the history
  • Loading branch information
Zentrik committed Aug 14, 2023
1 parent 81cc186 commit e87fa74
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/compiler/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,7 @@ The following keyword arguments are supported:
multiprocessor
- `maxregs`: the maximum number of registers to be allocated to a single thread (only
supported on LLVM 4.0+)
- `fastmath`: use less precise square roots and flush denormals
- `name`: override the name that the kernel will have in the generated code
- `always_inline`: inline all function calls in the kernel
Expand Down
23 changes: 23 additions & 0 deletions test/core/codegen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,29 @@ end
@test occursin(r"\.func .*julia_f_expensive", asm)
end

@testset "fastmath" begin
    # Kernel whose square-root precision is controlled by the `fastmath` kwarg
    # passed at compile time (no source-level annotation).
    function sqrt_kernel(x)
        idx = threadIdx().x
        @inbounds x[idx] = sqrt(x[idx])
        return
    end

    # Kernel with an explicit source-level @fastmath division.
    function div_kernel(x)
        idx = threadIdx().x
        @fastmath @inbounds x[idx] = 1 / x[idx]
        return
    end

    # All kernels below are compiled for a 1-d global-memory Float32 array.
    tt = Tuple{CuDeviceArray{Float32,1,AS.Global}}
    ptx(kernel; kwargs...) = sprint(io -> CUDA.code_ptx(io, kernel, tt; kwargs...))

    # default compilation: rounded (IEEE) square root
    @test occursin("sqrt.r", ptx(sqrt_kernel))

    # fastmath=true switches to the approximate, flush-to-zero variant
    @test occursin("sqrt.approx.ftz", ptx(sqrt_kernel; fastmath=true))

    # division annotated with @fastmath also lowers to the approximate form
    @test occursin("div.approx.ftz", ptx(div_kernel; fastmath=true))
end

@testset "local memory stores due to byval" begin
# JuliaGPU/GPUCompiler.jl#92
function kernel(y1, y2)
Expand Down

0 comments on commit e87fa74

Please sign in to comment.