JuliaMath · stevengj · Sep 20, 2023 · May 25, 2023 · May 30, 2023 · May 30, 2023
diff --git a/docs/src/api.md b/docs/src/api.md
@@ -18,6 +18,14 @@ QuadGK.quadgk!
 QuadGK.alloc_segbuf
 ```
 
+For a vectorized integrand that can evaluate the integrand at multiple points
+simultaneously, `quadgk` accepts a `BatchIntegrand` wrapper around the user's
+integrand and pre-allocated input and output buffers.
+
+```@docs
+QuadGK.BatchIntegrand
+```
+
 ## Gauss and Gauss–Kronrod rules
 
 For more specialized applications, you may wish to construct your own Gauss or Gauss–Kronrod quadrature rules, as described in [Gauss and Gauss–Kronrod quadrature rules](@ref).   To compute rules for $\int_{-1}^{+1} f(x) dx$ and $\int_a^b f(x) dx$ (unweighted integrals), use:

diff --git a/docs/src/index.md b/docs/src/index.md
@@ -20,6 +20,7 @@ Features of the QuadGK package include:
 * [Contour integrals](https://en.wikipedia.org/wiki/Contour_integration): You can specify a sequence of points in the complex plane to perform a contour integral along a piecewise-linear contour.
 * Arbitrary-order and custom quadrature rules: Any polynomial `order` of the Gauss–Kronrod quadrature rule can be specified, as well as generating quadrature rules and weights directly; see [Gauss and Gauss–Kronrod quadrature rules](@ref).  Custom Gaussian-quadrature rules can also be constructed for arbitrary weight functions; see [Gaussian quadrature and arbitrary weight functions](@ref).
 * In-place integration: For memory efficiency, integrand functions that write in-place into a pre-allocated buffer (e.g. for vector-valued integrands) can be used with the [`quadgk!`](@ref) function, along with pre-allocated buffers using [`alloc_segbuf`](@ref).
+* Batched integrand evaluation: Providing an integrand that can evaluate multiple points simultaneously allows for user-controlled parallelization (e.g. using threads, the GPU, or distributed memory).
 
 ## Quick start
 
@@ -45,7 +46,7 @@ For extremely [smooth functions](https://en.wikipedia.org/wiki/Smoothness) like
 
 ## Tutorial examples
 
-The [`quadgk` examples](@ref) chapter of this manual presents several other examples, including improper integrals, vector-valued integrands, improper integrals, singular or near-singular integrands, and Cauchy principal values.
+The [`quadgk` examples](@ref) chapter of this manual presents several other examples, including improper integrals, vector-valued integrands, batched integrand evaluation, improper integrals, singular or near-singular integrands, and Cauchy principal values.
 
 ## In-place operations for array-valued integrands
 

diff --git a/docs/src/quadgk-examples.md b/docs/src/quadgk-examples.md
@@ -222,6 +222,40 @@ Note that the return value also gives the `integral` as an `SVector` (a [statica
 The QuadGK package did not need any code specific to StaticArrays, and was written long before that package even existed.  The
 fact that unrelated packages like this can be [composed](https://en.wikipedia.org/wiki/Composability) is part of the [beauty of multiple dispatch](https://www.youtube.com/watch?v=kc9HwsxE1OY) and [duck typing](https://en.wikipedia.org/wiki/Duck_typing) for [generic programming](https://en.wikipedia.org/wiki/Generic_programming).
 
+## Batched integrand evaluation
+
+User-side parallelization of integrand evaluations is also possible by providing
+an in-place function of the form `f!(y,x) = y .= f.(x)`, which evaluates the
+integrand at multiple points simultaneously. To use this API, `quadgk`
+dispatches on a [`BatchIntegrand`](@ref) type containing `f!` and buffers for
+`y` and `x`. These buffers may be pre-allocated and reused for multiple
+`BatchIntegrand`s with the same domain and range types.
+
+For example, we can perform multi-threaded integration of a highly oscillatory
+function that needs to be refined globally:
+```
+julia> f(x) = sin(100x)
+f (generic function with 1 method)
+
+julia> function f!(y, x)
+           n = Threads.nthreads()
+           Threads.@threads for i in 1:n
+                y[i:n:end] .= f.(@view(x[i:n:end]))
+           end
+       end
+f! (generic function with 1 method)
+
+julia> quadgk(BatchIntegrand{Float64}(f!), 0, 1)
+(0.0013768112771231598, 8.493080824940099e-12)
+```
+
+Batching also changes how the adaptive refinement is done, which typically leads
+to slightly different results and sometimes more integrand evaluations. You
+can limit the maximum batch size by setting the `max_batch` parameter
+of the [`BatchIntegrand`](@ref), which can be useful in order to set an
+upper bound on the size of the buffers allocated by `quadgk`.
+
+
 ## Arbitrary-precision integrals
 
 `quadgk` also supports [arbitrary-precision arithmetic](https://en.wikipedia.org/wiki/Arbitrary-precision_arithmetic) using Julia's [`BigFloat` type](https://docs.julialang.org/en/v1/base/numbers/#BigFloats-and-BigInts) to compute integrals to arbitrary accuracy (albeit at increased computational cost).

diff --git a/src/QuadGK.jl b/src/QuadGK.jl
@@ -24,6 +24,7 @@ and returns the approximate `integral = 0.746824132812427` and error estimate
 module QuadGK
 
 export quadgk, quadgk!, gauss, kronrod, alloc_segbuf, quadgk_count, quadgk_print
+export BatchIntegrand
 
 using DataStructures, LinearAlgebra
 import Base.Order.Reverse
@@ -52,5 +53,6 @@ include("gausskronrod.jl")
 include("evalrule.jl")
 include("adapt.jl")
 include("weightedgauss.jl")
+include("batch.jl")
 
 end # module QuadGK
diff --git a/src/adapt.jl b/src/adapt.jl
@@ -7,7 +7,11 @@ function do_quadgk(f::F, s::NTuple{N,T}, n, atol, rtol, maxevals, nrm, segbuf) w
     x,w,gw = cachedrule(T,n)
 
     @assert N ≥ 2
-    segs = ntuple(i -> evalrule(f, s[i],s[i+1], x,w,gw, nrm), Val{N-1}())
+    if f isa BatchIntegrand
+        segs = evalrules(f, s, x,w,gw, nrm)
+    else
+        segs = ntuple(i -> evalrule(f, s[i],s[i+1], x,w,gw, nrm), Val{N-1}())
+    end
     if f isa InplaceIntegrand
         I = f.I .= segs[1].I
         for i = 2:length(segs)
@@ -34,40 +38,52 @@ function do_quadgk(f::F, s::NTuple{N,T}, n, atol, rtol, maxevals, nrm, segbuf) w
 
     segheap = segbuf === nothing ? collect(segs) : (resize!(segbuf, N-1) .= segs)
     heapify!(segheap, Reverse)
-    return adapt(f, segheap, I, E, numevals, x,w,gw,n, atol_, rtol_, maxevals, nrm)
+    return resum(f, adapt(f, segheap, I, E, numevals, x,w,gw,n, atol_, rtol_, maxevals, nrm))
 end
 
 # internal routine to perform the h-adaptive refinement of the integration segments (segs)
 function adapt(f::F, segs::Vector{T}, I, E, numevals, x,w,gw,n, atol, rtol, maxevals, nrm) where {F, T}
     # Pop the biggest-error segment and subdivide (h-adaptation)
     # until convergence is achieved or maxevals is exceeded.
     while E > atol && E > rtol * nrm(I) && numevals < maxevals
-        s = heappop!(segs, Reverse)
-        mid = (s.a + s.b) / 2
-        s1 = evalrule(f, s.a, mid, x,w,gw, nrm)
-        s2 = evalrule(f, mid, s.b, x,w,gw, nrm)
-        if f isa InplaceIntegrand
-            I .= (I .- s.I) .+ s1.I .+ s2.I
-        else
-            I = (I - s.I) + s1.I + s2.I
-        end
-        E = (E - s.E) + s1.E + s2.E
-        numevals += 4n+2
-
-        # handle type-unstable functions by converting to a wider type if needed
-        Tj = promote_type(typeof(s1), promote_type(typeof(s2), T))
-        if Tj !== T
-            return adapt(f, heappush!(heappush!(Vector{Tj}(segs), s1, Reverse), s2, Reverse),
-                         I, E, numevals, x,w,gw,n, atol, rtol, maxevals, nrm)
-        end
+        next = refine(f, segs, I, E, numevals, x,w,gw,n, atol, rtol, maxevals, nrm)
+        next isa Vector && return next # handle type-unstable functions
+        I, E, numevals = next
+    end
+    return segs
+end
 
-        heappush!(segs, s1, Reverse)
-        heappush!(segs, s2, Reverse)
+# internal routine to refine the segment with largest error
+function refine(f::F, segs::Vector{T}, I, E, numevals, x,w,gw,n, atol, rtol, maxevals, nrm) where {F, T}
+    s = heappop!(segs, Reverse)
+    mid = (s.a + s.b) / 2
+    s1 = evalrule(f, s.a, mid, x,w,gw, nrm)
+    s2 = evalrule(f, mid, s.b, x,w,gw, nrm)
+    if f isa InplaceIntegrand
+        I .= (I .- s.I) .+ s1.I .+ s2.I
+    else
+        I = (I - s.I) + s1.I + s2.I
+    end
+    E = (E - s.E) + s1.E + s2.E
+    numevals += 4n+2
+
+    # handle type-unstable functions by converting to a wider type if needed
+    Tj = promote_type(typeof(s1), promote_type(typeof(s2), T))
+    if Tj !== T
+        return adapt(f, heappush!(heappush!(Vector{Tj}(segs), s1, Reverse), s2, Reverse),
+                     I, E, numevals, x,w,gw,n, atol, rtol, maxevals, nrm)
     end
 
-    # re-sum (paranoia about accumulated roundoff)
+    heappush!(segs, s1, Reverse)
+    heappush!(segs, s2, Reverse)
+
+    return I, E, numevals
+end
+
+# re-sum (paranoia about accumulated roundoff)
+function resum(f, segs)
     if f isa InplaceIntegrand
-        I .= segs[1].I
+        I = f.I .= segs[1].I
         E = segs[1].E
         for i in 2:length(segs)
             I .+= segs[i].I

diff --git a/src/batch.jl b/src/batch.jl
@@ -0,0 +1,196 @@
+"""
+    BatchIntegrand(f!, y::AbstractVector, [x::AbstractVector]; max_batch=typemax(Int))
+    BatchIntegrand{Y,X}(f!; max_batch=typemax(Int)) where {Y,X}
+    BatchIntegrand{Y}(f!; max_batch=typemax(Int)) where {Y}
+
+Constructor for a `BatchIntegrand` accepting an integrand of the form `f!(y,x) = y .= f.(x)`
+that can evaluate the integrand at multiple quadrature nodes using, for example, threads,
+the GPU, or distributed memory. The arguments `y`, and optionally `x`, are pre-allocated
+buffers of the correct element type for the domain and range of `f!`. Additionally, they
+must be `resize!`-able since the number of evaluation points may vary between calls to `f!`.
+Alternatively, the element types of those buffers, `Y` and optionally `X`, may be passed as
+type parameters to the constructor. The `max_batch` keyword roughly limits the number of
+nodes passed to the integrand, though at least `4*order+2` nodes will be used by the GK
+rule.
+
+## Internal allocations
+
+If `x` or `X` are not specified, `quadgk` internally creates a new `BatchIntegrand` with the
+user-supplied `y` buffer and a freshly-allocated `x` buffer based on the domain types. So,
+if you want to reuse the `x` buffer between calls, supply `{Y,X}` or pass `y,x` explicitly.
+"""
+struct BatchIntegrand{Y,X,Ty<:AbstractVector{Y},Tx<:AbstractVector{X},F}
+    # in-place function f!(y, x) that takes an array of x values and outputs an array of results in-place
+    f!::F
+    y::Ty
+    x::Tx
+    max_batch::Int # maximum number of x to supply in parallel
+end
+
+function BatchIntegrand(f!, y::AbstractVector, x::AbstractVector=similar(y, Nothing); max_batch::Integer=typemax(Int))
+    max_batch > 0 || throw(ArgumentError("max_batch must be positive"))
+    return BatchIntegrand(f!, y, x, max_batch)
+end
+BatchIntegrand{Y,X}(f!; kws...) where {Y,X} = BatchIntegrand(f!, Y[], X[]; kws...)
+BatchIntegrand{Y}(f!; kws...) where {Y} = BatchIntegrand(f!, Y[]; kws...)
+
+function evalrule(fx::AbstractVector{T}, a,b, x,w,gw, nrm) where {T}
+    l = length(x)
+    n = 2l - 1 # number of Kronrod points
+    n1 = 1 - (l & 1) # 0 if even order, 1 if odd order
+    s = convert(eltype(x), 0.5) * (b-a)
+    if n1 == 0 # even: Gauss rule does not include x == 0
+        Ik = fx[l] * w[end]
+        Ig = zero(Ik)
+    else # odd: don't count x==0 twice in Gauss rule
+        f0 = fx[l]
+        Ig = f0 * gw[end]
+        Ik = f0 * w[end] + (fx[l-1] + fx[l+1]) * w[end-1]
+    end
+    for i = 1:length(gw)-n1
+        fg = fx[2i] + fx[n-2i+1]
+        fk = fx[2i-1] + fx[n-2i+2]
+        Ig += fg * gw[i]
+        Ik += fg * w[2i] + fk * w[2i-1]
+    end
+    Ik_s, Ig_s = Ik * s, Ig * s # new variable since this may change the type
+    E = nrm(Ik_s - Ig_s)
+    if isnan(E) || isinf(E)
+        throw(DomainError(a+s, "integrand produced $E in the interval ($a, $b)"))
+    end
+    return Segment(oftype(s, a), oftype(s, b), Ik_s, E)
+end
+
+function evalrules(f::BatchIntegrand, s::NTuple{N}, x,w,gw, nrm) where {N}
+    l = length(x)
+    m = 2l-1    # evaluations per segment
+    n = (N-1)*m # total evaluations
+    resize!(f.x, n)
+    resize!(f.y, n)
+    for i in 1:(N-1)    # fill buffer with evaluation points
+        a = s[i]; b = s[i+1]
+        c = convert(eltype(x), 0.5) * (b-a)
+        o = (i-1)*m
+        f.x[l+o] = a + c
+        for j in 1:l-1
+            f.x[j+o] = a + (1 + x[j]) * c
+            f.x[m+1-j+o] = a + (1 - x[j]) * c
+        end
+    end
+    f.f!(f.y, f.x)  # evaluate integrand
+    return ntuple(Val(N-1)) do i
+        return evalrule(view(f.y, (1+(i-1)*m):(i*m)), s[i], s[i+1], x,w,gw, nrm)
+    end
+end
+
+# we refine as many segments as we can fit into the buffer
+function refine(f::BatchIntegrand, segs::Vector{T}, I, E, numevals, x,w,gw,n, atol, rtol, maxevals, nrm) where {T}
+    tol = max(atol, rtol*nrm(I))
+    nsegs = 0
+    len = length(segs)
+    l = length(x)
+    m = 2l-1 # == 2n+1
+
+    # collect as many segments that will have to be evaluated for the current tolerance
+    # while staying under max_batch and maxevals
+    while len > nsegs && 2m*nsegs < f.max_batch && E > tol && numevals < maxevals
+        # same as heappop!, but moves segments to end of heap/vector to avoid allocations
+        s = segs[1]
+        y = segs[len-nsegs]
+        segs[len-nsegs] = s
+        nsegs += 1
+        tol += s.E
+        numevals += 2m
+        len > nsegs && DataStructures.percolate_down!(segs, 1, y, Reverse, len-nsegs)
+    end
+
+    resize!(f.x, 2m*nsegs)
+    resize!(f.y, 2m*nsegs)
+    for i in 1:nsegs    # fill buffer with evaluation points
+        s = segs[len-i+1]
+        mid = (s.a+s.b)/2
+        for (j,a,b) in ((2,s.a,mid), (1,mid,s.b))
+            c = convert(eltype(x), 0.5) * (b-a)
+            o = (2i-j)*m
+            f.x[l+o] = a + c
+            for k in 1:l-1
+                f.x[k+o] = a + (1 + x[k]) * c
+                f.x[m+1-k+o] = a + (1 - x[k]) * c
+            end
+        end
+    end
+    f.f!(f.y, f.x)
+
+    resize!(segs, len+nsegs)
+    for i in 1:nsegs    # evaluate segments and update estimates & heap
+        s = segs[len-i+1]
+        mid = (s.a + s.b)/2
+        s1 = evalrule(view(f.y, 1+2(i-1)*m:(2i-1)*m), s.a,mid, x,w,gw, nrm)
+        s2 = evalrule(view(f.y, 1+(2i-1)*m:2i*m), mid,s.b, x,w,gw, nrm)
+        I = (I - s.I) + s1.I + s2.I
+        E = (E - s.E) + s1.E + s2.E
+        segs[len-i+1] = s1
+        segs[len+i]   = s2
+    end
+    for i in 1:2nsegs
+        DataStructures.percolate_up!(segs, len-nsegs+i, Reverse)
+    end
+
+    return I, E, numevals
+end
+
+function handle_infinities(workfunc, f::BatchIntegrand, s)
+    s1, s2 = s[1], s[end]
+    if realone(s1) && realone(s2) # check for infinite or semi-infinite intervals
+        inf1, inf2 = isinf(s1), isinf(s2)
+        if inf1 || inf2
+            xtmp = f.x # buffer to store evaluation points
+            ytmp = f.y # original integrand may have different units
+            xbuf = similar(xtmp, typeof(one(eltype(f.x))))
+            ybuf = similar(ytmp, typeof(oneunit(eltype(f.y))*oneunit(s1)))
+            if inf1 && inf2 # x = t/(1-t^2) coordinate transformation
+                return workfunc(BatchIntegrand((v, t) -> begin resize!(xtmp, length(t)); resize!(ytmp, length(v));
+                                            f.f!(ytmp, xtmp .= oneunit(s1) .* t ./ (1 .- t .* t)); v .= ytmp .* (1 .+ t .* t) .* oneunit(s1) ./ (1 .- t .* t) .^ 2; end, ybuf, xbuf, f.max_batch),
+                                map(x -> isinf(x) ? (signbit(x) ? -one(x) : one(x)) : 2x / (oneunit(x)+hypot(oneunit(x),2x)), s),
+                                t -> oneunit(s1) * t / (1 - t^2))
+            end
+            let (s0,si) = inf1 ? (s2,s1) : (s1,s2) # let is needed for JuliaLang/julia#15276
+                if si < zero(si) # x = s0 - t/(1-t)
+                    return workfunc(BatchIntegrand((v, t) -> begin resize!(xtmp, length(t)); resize!(ytmp, length(v));
+                                            f.f!(ytmp, xtmp .= s0 .- oneunit(s1) .* t ./ (1 .- t)); v .= ytmp .* oneunit(s1) ./ (1 .- t) .^ 2; end, ybuf, xbuf, f.max_batch),
+                                    reverse(map(x -> 1 / (1 + oneunit(x) / (s0 - x)), s)),
+                                    t -> s0 - oneunit(s1)*t/(1-t))
+                else # x = s0 + t/(1-t)
+                    return workfunc(BatchIntegrand((v, t) -> begin resize!(xtmp, length(t)); resize!(ytmp, length(v));
+                                            f.f!(ytmp, xtmp .= s0 .+ oneunit(s1) .* t ./ (1 .- t)); v .= ytmp .* oneunit(s1) ./ (1 .- t) .^ 2; end, ybuf, xbuf, f.max_batch),
+                                    map(x -> 1 / (1 + oneunit(x) / (x - s0)), s),
+                                    t -> s0 + oneunit(s1)*t/(1-t))
+                end
+            end
+        end
+    end
+    return workfunc(f, s, identity)
+end
+
+"""
+    quadgk(f::BatchIntegrand, a,b,c...; kws...)
+
+Like [`quadgk`](@ref), but batches evaluation points for an in-place integrand to evaluate
+simultaneously. In particular, there are two differences from `quadgk`
+
+1. The function `f.f!` should be of the form `f!(y, x) = y .= f.(x)`.  That is, it writes
+   the return values of the integand `f(x)` in-place into its first argument `y`. (The
+   return value of `f!` is ignored.) See [`BatchIntegrand`](@ref) for how to define the
+   integrand.
+
+2. `f.max_batch` changes how the adaptive refinement is done by batching multiple segments
+   together. Choosing `max_batch<=4*order+2` will reproduce the result of
+   non-batched `quadgk`, however if `max_batch=n*(4*order+2)` up to `2n` Kronrod rules will be evaluated
+   together, which can produce slightly different results and sometimes require more
+   integrand evaluations when using relative tolerances.
+"""
+function quadgk(f::BatchIntegrand{Y,Nothing}, segs::T...; kws...) where {Y,T}
+    FT = float(T) # the gk points are floating-point
+    g = BatchIntegrand(f.f!, f.y, similar(f.x, FT), f.max_batch)
+    return quadgk(g, segs...; kws...)
+end