From ed0fb5de4c0f8e1ac7e2749ad29e8cd8ee8f8d5d Mon Sep 17 00:00:00 2001 From: Lorenzo Van Munoz <66997677+lxvm@users.noreply.github.com> Date: Wed, 20 Sep 2023 14:44:55 -0400 Subject: [PATCH] Parallelization of integrand evaluations (#80) * initial commit * fix parallel segment bug * parallelize over initial segments * fix allocs test * remove threading and bisect all segments that pass rtol * draft BatchIntegrand * remove obsolete bits * simplify batching to evaluate whole segments * reduce heap allocation size * export BatchIntegrand * add examples to manual * update constructors and various bugfixes * typo * move example to docs * Apply suggestions from code review Co-authored-by: Steven G. Johnson * apply suggestions to examples * move max_batch check to quadgk * add BatchIntegrand outer constructor * Update docs/src/quadgk-examples.md Co-authored-by: Steven G. Johnson * whitespace typo * make max_batch a rought limit * update docstrings * fix codecov * idiomatic constructors * edit type parameters * add parameters for consistency * simplify constructors --------- Co-authored-by: Steven G. Johnson --- docs/src/api.md | 8 ++ docs/src/index.md | 3 +- docs/src/quadgk-examples.md | 34 +++++++ src/QuadGK.jl | 2 + src/adapt.jl | 64 +++++++----- src/batch.jl | 196 ++++++++++++++++++++++++++++++++++++ test/runtests.jl | 36 +++++++ 7 files changed, 318 insertions(+), 25 deletions(-) create mode 100644 src/batch.jl diff --git a/docs/src/api.md b/docs/src/api.md index 684b34d..dbd30c0 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -18,6 +18,14 @@ QuadGK.quadgk! QuadGK.alloc_segbuf ``` +For a vectorized integrand that can evaluate the integrand at multiple points +simultaneously, `quadgk` accepts a `BatchIntegrand` wrapper around the user's +integrand and pre-allocated input and output buffers. + +```@docs +QuadGK.BatchIntegrand +``` + ## Gauss and Gauss–Kronrod rules For more specialized applications, you may wish to construct your own Gauss or Gauss–Kronrod quadrature rules, as described in [Gauss and Gauss–Kronrod quadrature rules](@ref). To compute rules for $\int_{-1}^{+1} f(x) dx$ and $\int_a^b f(x) dx$ (unweighted integrals), use: diff --git a/docs/src/index.md b/docs/src/index.md index 10ae9f9..81fc963 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -20,6 +20,7 @@ Features of the QuadGK package include: * [Contour integrals](https://en.wikipedia.org/wiki/Contour_integration): You can specify a sequence of points in the complex plane to perform a contour integral along a piecewise-linear contour. * Arbitrary-order and custom quadrature rules: Any polynomial `order` of the Gauss–Kronrod quadrature rule can be specified, as well as generating quadrature rules and weights directly; see [Gauss and Gauss–Kronrod quadrature rules](@ref). Custom Gaussian-quadrature rules can also be constructed for arbitrary weight functions; see [Gaussian quadrature and arbitrary weight functions](@ref). * In-place integration: For memory efficiency, integrand functions that write in-place into a pre-allocated buffer (e.g. for vector-valued integrands) can be used with the [`quadgk!`](@ref) function, along with pre-allocated buffers using [`alloc_segbuf`](@ref). +* Batched integrand evaluation: Providing an integrand that can evaluate multiple points simultaneously allows for user-controlled parallelization (e.g. using threads, the GPU, or distributed memory). ## Quick start @@ -45,7 +46,7 @@ For extremely [smooth functions](https://en.wikipedia.org/wiki/Smoothness) like ## Tutorial examples -The [`quadgk` examples](@ref) chapter of this manual presents several other examples, including improper integrals, vector-valued integrands, improper integrals, singular or near-singular integrands, and Cauchy principal values. +The [`quadgk` examples](@ref) chapter of this manual presents several other examples, including improper integrals, vector-valued integrands, batched integrand evaluation, improper integrals, singular or near-singular integrands, and Cauchy principal values. ## In-place operations for array-valued integrands diff --git a/docs/src/quadgk-examples.md b/docs/src/quadgk-examples.md index cb82ee6..c26386f 100644 --- a/docs/src/quadgk-examples.md +++ b/docs/src/quadgk-examples.md @@ -222,6 +222,40 @@ Note that the return value also gives the `integral` as an `SVector` (a [statica The QuadGK package did not need any code specific to StaticArrays, and was written long before that package even existed. The fact that unrelated packages like this can be [composed](https://en.wikipedia.org/wiki/Composability) is part of the [beauty of multiple dispatch](https://www.youtube.com/watch?v=kc9HwsxE1OY) and [duck typing](https://en.wikipedia.org/wiki/Duck_typing) for [generic programming](https://en.wikipedia.org/wiki/Generic_programming). +## Batched integrand evaluation + +User-side parallelization of integrand evaluations is also possible by providing +an in-place function of the form `f!(y,x) = y .= f.(x)`, which evaluates the +integrand at multiple points simultaneously. To use this API, `quadgk` +dispatches on a [`BatchIntegrand`](@ref) type containing `f!` and buffers for +`y` and `x`. These buffers may be pre-allocated and reused for multiple +`BatchIntegrand`s with the same domain and range types. + +For example, we can perform multi-threaded integration of a highly oscillatory +function that needs to be refined globally: +``` +julia> f(x) = sin(100x) +f (generic function with 1 method) + +julia> function f!(y, x) + n = Threads.nthreads() + Threads.@threads for i in 1:n + y[i:n:end] .= f.(@view(x[i:n:end])) + end + end +f! (generic function with 1 method) + +julia> quadgk(BatchIntegrand{Float64}(f!), 0, 1) +(0.0013768112771231598, 8.493080824940099e-12) +``` + +Batching also changes how the adaptive refinement is done, which typically leads +to slightly different results and sometimes more integrand evaluations. You +can limit the maximum batch size by setting the `max_batch` parameter +of the [`BatchIntegrand`](@ref), which can be useful in order to set an +upper bound on the size of the buffers allocated by `quadgk`. + + ## Arbitrary-precision integrals `quadgk` also supports [arbitrary-precision arithmetic](https://en.wikipedia.org/wiki/Arbitrary-precision_arithmetic) using Julia's [`BigFloat` type](https://docs.julialang.org/en/v1/base/numbers/#BigFloats-and-BigInts) to compute integrals to arbitrary accuracy (albeit at increased computational cost). diff --git a/src/QuadGK.jl b/src/QuadGK.jl index f7ebfdb..90c8681 100644 --- a/src/QuadGK.jl +++ b/src/QuadGK.jl @@ -24,6 +24,7 @@ and returns the approximate `integral = 0.746824132812427` and error estimate module QuadGK export quadgk, quadgk!, gauss, kronrod, alloc_segbuf, quadgk_count, quadgk_print +export BatchIntegrand using DataStructures, LinearAlgebra import Base.Order.Reverse @@ -52,5 +53,6 @@ include("gausskronrod.jl") include("evalrule.jl") include("adapt.jl") include("weightedgauss.jl") +include("batch.jl") end # module QuadGK diff --git a/src/adapt.jl b/src/adapt.jl index db1c366..4bad6fa 100644 --- a/src/adapt.jl +++ b/src/adapt.jl @@ -7,7 +7,11 @@ function do_quadgk(f::F, s::NTuple{N,T}, n, atol, rtol, maxevals, nrm, segbuf) w x,w,gw = cachedrule(T,n) @assert N ≥ 2 - segs = ntuple(i -> evalrule(f, s[i],s[i+1], x,w,gw, nrm), Val{N-1}()) + if f isa BatchIntegrand + segs = evalrules(f, s, x,w,gw, nrm) + else + segs = ntuple(i -> evalrule(f, s[i],s[i+1], x,w,gw, nrm), Val{N-1}()) + end if f isa InplaceIntegrand I = f.I .= segs[1].I for i = 2:length(segs) @@ -34,7 +38,7 @@ function do_quadgk(f::F, s::NTuple{N,T}, n, atol, rtol, maxevals, nrm, segbuf) w segheap = segbuf === nothing ? collect(segs) : (resize!(segbuf, N-1) .= segs) heapify!(segheap, Reverse) - return adapt(f, segheap, I, E, numevals, x,w,gw,n, atol_, rtol_, maxevals, nrm) + return resum(f, adapt(f, segheap, I, E, numevals, x,w,gw,n, atol_, rtol_, maxevals, nrm)) end # internal routine to perform the h-adaptive refinement of the integration segments (segs) @@ -42,32 +46,44 @@ function adapt(f::F, segs::Vector{T}, I, E, numevals, x,w,gw,n, atol, rtol, maxe # Pop the biggest-error segment and subdivide (h-adaptation) # until convergence is achieved or maxevals is exceeded. while E > atol && E > rtol * nrm(I) && numevals < maxevals - s = heappop!(segs, Reverse) - mid = (s.a + s.b) / 2 - s1 = evalrule(f, s.a, mid, x,w,gw, nrm) - s2 = evalrule(f, mid, s.b, x,w,gw, nrm) - if f isa InplaceIntegrand - I .= (I .- s.I) .+ s1.I .+ s2.I - else - I = (I - s.I) + s1.I + s2.I - end - E = (E - s.E) + s1.E + s2.E - numevals += 4n+2 - - # handle type-unstable functions by converting to a wider type if needed - Tj = promote_type(typeof(s1), promote_type(typeof(s2), T)) - if Tj !== T - return adapt(f, heappush!(heappush!(Vector{Tj}(segs), s1, Reverse), s2, Reverse), - I, E, numevals, x,w,gw,n, atol, rtol, maxevals, nrm) - end + next = refine(f, segs, I, E, numevals, x,w,gw,n, atol, rtol, maxevals, nrm) + next isa Vector && return next # handle type-unstable functions + I, E, numevals = next + end + return segs +end - heappush!(segs, s1, Reverse) - heappush!(segs, s2, Reverse) +# internal routine to refine the segment with largest error +function refine(f::F, segs::Vector{T}, I, E, numevals, x,w,gw,n, atol, rtol, maxevals, nrm) where {F, T} + s = heappop!(segs, Reverse) + mid = (s.a + s.b) / 2 + s1 = evalrule(f, s.a, mid, x,w,gw, nrm) + s2 = evalrule(f, mid, s.b, x,w,gw, nrm) + if f isa InplaceIntegrand + I .= (I .- s.I) .+ s1.I .+ s2.I + else + I = (I - s.I) + s1.I + s2.I + end + E = (E - s.E) + s1.E + s2.E + numevals += 4n+2 + + # handle type-unstable functions by converting to a wider type if needed + Tj = promote_type(typeof(s1), promote_type(typeof(s2), T)) + if Tj !== T + return adapt(f, heappush!(heappush!(Vector{Tj}(segs), s1, Reverse), s2, Reverse), + I, E, numevals, x,w,gw,n, atol, rtol, maxevals, nrm) end - # re-sum (paranoia about accumulated roundoff) + heappush!(segs, s1, Reverse) + heappush!(segs, s2, Reverse) + + return I, E, numevals +end + +# re-sum (paranoia about accumulated roundoff) +function resum(f, segs) if f isa InplaceIntegrand - I .= segs[1].I + I = f.I .= segs[1].I E = segs[1].E for i in 2:length(segs) I .+= segs[i].I diff --git a/src/batch.jl b/src/batch.jl new file mode 100644 index 0000000..f2bf7ff --- /dev/null +++ b/src/batch.jl @@ -0,0 +1,196 @@ +""" + BatchIntegrand(f!, y::AbstractVector, [x::AbstractVector]; max_batch=typemax(Int)) + BatchIntegrand{Y,X}(f!; max_batch=typemax(Int)) where {Y,X} + BatchIntegrand{Y}(f!; max_batch=typemax(Int)) where {Y} + +Constructor for a `BatchIntegrand` accepting an integrand of the form `f!(y,x) = y .= f.(x)` +that can evaluate the integrand at multiple quadrature nodes using, for example, threads, +the GPU, or distributed memory. The arguments `y`, and optionally `x`, are pre-allocated +buffers of the correct element type for the domain and range of `f!`. Additionally, they +must be `resize!`-able since the number of evaluation points may vary between calls to `f!`. +Alternatively, the element types of those buffers, `Y` and optionally `X`, may be passed as +type parameters to the constructor. The `max_batch` keyword roughly limits the number of +nodes passed to the integrand, though at least `4*order+2` nodes will be used by the GK +rule. + +## Internal allocations + +If `x` or `X` are not specified, `quadgk` internally creates a new `BatchIntegrand` with the +user-supplied `y` buffer and a freshly-allocated `x` buffer based on the domain types. So, +if you want to reuse the `x` buffer between calls, supply `{Y,X}` or pass `y,x` explicitly. +""" +struct BatchIntegrand{Y,X,Ty<:AbstractVector{Y},Tx<:AbstractVector{X},F} + # in-place function f!(y, x) that takes an array of x values and outputs an array of results in-place + f!::F + y::Ty + x::Tx + max_batch::Int # maximum number of x to supply in parallel +end + +function BatchIntegrand(f!, y::AbstractVector, x::AbstractVector=similar(y, Nothing); max_batch::Integer=typemax(Int)) + max_batch > 0 || throw(ArgumentError("max_batch must be positive")) + return BatchIntegrand(f!, y, x, max_batch) +end +BatchIntegrand{Y,X}(f!; kws...) where {Y,X} = BatchIntegrand(f!, Y[], X[]; kws...) +BatchIntegrand{Y}(f!; kws...) where {Y} = BatchIntegrand(f!, Y[]; kws...) + +function evalrule(fx::AbstractVector{T}, a,b, x,w,gw, nrm) where {T} + l = length(x) + n = 2l - 1 # number of Kronrod points + n1 = 1 - (l & 1) # 0 if even order, 1 if odd order + s = convert(eltype(x), 0.5) * (b-a) + if n1 == 0 # even: Gauss rule does not include x == 0 + Ik = fx[l] * w[end] + Ig = zero(Ik) + else # odd: don't count x==0 twice in Gauss rule + f0 = fx[l] + Ig = f0 * gw[end] + Ik = f0 * w[end] + (fx[l-1] + fx[l+1]) * w[end-1] + end + for i = 1:length(gw)-n1 + fg = fx[2i] + fx[n-2i+1] + fk = fx[2i-1] + fx[n-2i+2] + Ig += fg * gw[i] + Ik += fg * w[2i] + fk * w[2i-1] + end + Ik_s, Ig_s = Ik * s, Ig * s # new variable since this may change the type + E = nrm(Ik_s - Ig_s) + if isnan(E) || isinf(E) + throw(DomainError(a+s, "integrand produced $E in the interval ($a, $b)")) + end + return Segment(oftype(s, a), oftype(s, b), Ik_s, E) +end + +function evalrules(f::BatchIntegrand, s::NTuple{N}, x,w,gw, nrm) where {N} + l = length(x) + m = 2l-1 # evaluations per segment + n = (N-1)*m # total evaluations + resize!(f.x, n) + resize!(f.y, n) + for i in 1:(N-1) # fill buffer with evaluation points + a = s[i]; b = s[i+1] + c = convert(eltype(x), 0.5) * (b-a) + o = (i-1)*m + f.x[l+o] = a + c + for j in 1:l-1 + f.x[j+o] = a + (1 + x[j]) * c + f.x[m+1-j+o] = a + (1 - x[j]) * c + end + end + f.f!(f.y, f.x) # evaluate integrand + return ntuple(Val(N-1)) do i + return evalrule(view(f.y, (1+(i-1)*m):(i*m)), s[i], s[i+1], x,w,gw, nrm) + end +end + +# we refine as many segments as we can fit into the buffer +function refine(f::BatchIntegrand, segs::Vector{T}, I, E, numevals, x,w,gw,n, atol, rtol, maxevals, nrm) where {T} + tol = max(atol, rtol*nrm(I)) + nsegs = 0 + len = length(segs) + l = length(x) + m = 2l-1 # == 2n+1 + + # collect as many segments that will have to be evaluated for the current tolerance + # while staying under max_batch and maxevals + while len > nsegs && 2m*nsegs < f.max_batch && E > tol && numevals < maxevals + # same as heappop!, but moves segments to end of heap/vector to avoid allocations + s = segs[1] + y = segs[len-nsegs] + segs[len-nsegs] = s + nsegs += 1 + tol += s.E + numevals += 2m + len > nsegs && DataStructures.percolate_down!(segs, 1, y, Reverse, len-nsegs) + end + + resize!(f.x, 2m*nsegs) + resize!(f.y, 2m*nsegs) + for i in 1:nsegs # fill buffer with evaluation points + s = segs[len-i+1] + mid = (s.a+s.b)/2 + for (j,a,b) in ((2,s.a,mid), (1,mid,s.b)) + c = convert(eltype(x), 0.5) * (b-a) + o = (2i-j)*m + f.x[l+o] = a + c + for k in 1:l-1 + f.x[k+o] = a + (1 + x[k]) * c + f.x[m+1-k+o] = a + (1 - x[k]) * c + end + end + end + f.f!(f.y, f.x) + + resize!(segs, len+nsegs) + for i in 1:nsegs # evaluate segments and update estimates & heap + s = segs[len-i+1] + mid = (s.a + s.b)/2 + s1 = evalrule(view(f.y, 1+2(i-1)*m:(2i-1)*m), s.a,mid, x,w,gw, nrm) + s2 = evalrule(view(f.y, 1+(2i-1)*m:2i*m), mid,s.b, x,w,gw, nrm) + I = (I - s.I) + s1.I + s2.I + E = (E - s.E) + s1.E + s2.E + segs[len-i+1] = s1 + segs[len+i] = s2 + end + for i in 1:2nsegs + DataStructures.percolate_up!(segs, len-nsegs+i, Reverse) + end + + return I, E, numevals +end + +function handle_infinities(workfunc, f::BatchIntegrand, s) + s1, s2 = s[1], s[end] + if realone(s1) && realone(s2) # check for infinite or semi-infinite intervals + inf1, inf2 = isinf(s1), isinf(s2) + if inf1 || inf2 + xtmp = f.x # buffer to store evaluation points + ytmp = f.y # original integrand may have different units + xbuf = similar(xtmp, typeof(one(eltype(f.x)))) + ybuf = similar(ytmp, typeof(oneunit(eltype(f.y))*oneunit(s1))) + if inf1 && inf2 # x = t/(1-t^2) coordinate transformation + return workfunc(BatchIntegrand((v, t) -> begin resize!(xtmp, length(t)); resize!(ytmp, length(v)); + f.f!(ytmp, xtmp .= oneunit(s1) .* t ./ (1 .- t .* t)); v .= ytmp .* (1 .+ t .* t) .* oneunit(s1) ./ (1 .- t .* t) .^ 2; end, ybuf, xbuf, f.max_batch), + map(x -> isinf(x) ? (signbit(x) ? -one(x) : one(x)) : 2x / (oneunit(x)+hypot(oneunit(x),2x)), s), + t -> oneunit(s1) * t / (1 - t^2)) + end + let (s0,si) = inf1 ? (s2,s1) : (s1,s2) # let is needed for JuliaLang/julia#15276 + if si < zero(si) # x = s0 - t/(1-t) + return workfunc(BatchIntegrand((v, t) -> begin resize!(xtmp, length(t)); resize!(ytmp, length(v)); + f.f!(ytmp, xtmp .= s0 .- oneunit(s1) .* t ./ (1 .- t)); v .= ytmp .* oneunit(s1) ./ (1 .- t) .^ 2; end, ybuf, xbuf, f.max_batch), + reverse(map(x -> 1 / (1 + oneunit(x) / (s0 - x)), s)), + t -> s0 - oneunit(s1)*t/(1-t)) + else # x = s0 + t/(1-t) + return workfunc(BatchIntegrand((v, t) -> begin resize!(xtmp, length(t)); resize!(ytmp, length(v)); + f.f!(ytmp, xtmp .= s0 .+ oneunit(s1) .* t ./ (1 .- t)); v .= ytmp .* oneunit(s1) ./ (1 .- t) .^ 2; end, ybuf, xbuf, f.max_batch), + map(x -> 1 / (1 + oneunit(x) / (x - s0)), s), + t -> s0 + oneunit(s1)*t/(1-t)) + end + end + end + end + return workfunc(f, s, identity) +end + +""" + quadgk(f::BatchIntegrand, a,b,c...; kws...) + +Like [`quadgk`](@ref), but batches evaluation points for an in-place integrand to evaluate +simultaneously. In particular, there are two differences from `quadgk` + +1. The function `f.f!` should be of the form `f!(y, x) = y .= f.(x)`. That is, it writes + the return values of the integand `f(x)` in-place into its first argument `y`. (The + return value of `f!` is ignored.) See [`BatchIntegrand`](@ref) for how to define the + integrand. + +2. `f.max_batch` changes how the adaptive refinement is done by batching multiple segments + together. Choosing `max_batch<=4*order+2` will reproduce the result of + non-batched `quadgk`, however if `max_batch=n*(4*order+2)` up to `2n` Kronrod rules will be evaluated + together, which can produce slightly different results and sometimes require more + integrand evaluations when using relative tolerances. +""" +function quadgk(f::BatchIntegrand{Y,Nothing}, segs::T...; kws...) where {Y,T} + FT = float(T) # the gk points are floating-point + g = BatchIntegrand(f.f!, f.y, similar(f.x, FT), f.max_batch) + return quadgk(g, segs...; kws...) +end diff --git a/test/runtests.jl b/test/runtests.jl index 8608626..4afed62 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -293,3 +293,39 @@ end @test sprint(io -> quadgk_print(io, x -> x^2, 0, 1, order=2), context=:compact=>true) == "f(0.5) = 0.25\nf(0.211325) = 0.0446582\nf(0.788675) = 0.622008\nf(0.03709) = 0.00137566\nf(0.96291) = 0.927196\n" end + +@testset "batch" begin + f(x) = min(abs(x), 1) # integrand requiring exactly three levels of refinement on [-2,2] + gcnt = fill(0) + f!(y, x) = (gcnt[] += 1; y .= f.(x)) + for order=1:7 + for max_batch in (4*order+2, 8*order+4) + gcnt[] = 0 + g = QuadGK.BatchIntegrand{Float64}(f!, max_batch=max_batch) + I′,E′ = quadgk(g, -2, 2, order=order) + @test quadgk(f, -2, 2, order=order) == (I′,E′) + @test gcnt[] == 2 + 1 + (max_batch < 8*order+4) # check if calls were batched + end + end + + # test constructors + ref = BatchIntegrand(f!, Float64[], Nothing[], typemax(Int)) + for b in ( + BatchIntegrand(f!, Float64[]), + BatchIntegrand(f!, Float64[], Nothing[]), + BatchIntegrand{Float64}(f!), + BatchIntegrand{Float64,Nothing}(f!), + ) + for name in (:f!, :y, :x, :max_batch) + @test getproperty(ref, name) == getproperty(b, name) + end + end +end + +@testset "batch Inf" begin + f!(v, x) = v .= exp.(-1 .* x .^ 2) + g = QuadGK.BatchIntegrand{Float64}(f!) + @test quadgk(g, 1., Inf)[1] ≈ quadgk(x -> exp(-x^2), 1., Inf)[1] + @test quadgk(g, -Inf, 1.)[1] ≈ quadgk(x -> exp(-x^2), -Inf, 1.)[1] + @test quadgk(g, -Inf, Inf)[1] ≈ quadgk(x -> exp(-x^2), -Inf, Inf)[1] +end