diff --git a/base/floatfuncs.jl b/base/floatfuncs.jl
index 1edd20b8c49d75..b22dd7c238aaeb 100644
--- a/base/floatfuncs.jl
+++ b/base/floatfuncs.jl
@@ -358,7 +358,7 @@ end
     return hi, x-hi
 end
 
-@inline function twomul(a::Float64, b::Float64)
+function twomul(a::Float64, b::Float64)
     ahi, alo = splitbits(a)
     bhi, blo = splitbits(b)
     abhi = a*b
@@ -368,7 +368,7 @@ end
 end
 
 function fma_emulated(a::Float64, b::Float64,c::Float64)
-    abhi, ablo = twomul(a,b)
+    abhi, ablo = @inline twomul(a,b)
     if !isfinite(abhi+c) || isless(abs(abhi), nextfloat(0x1p-969)) || issubnormal(a) || issubnormal(b)
         aandbfinite = isfinite(a) && isfinite(b)
         if !(isfinite(c) && aandbfinite)
diff --git a/base/math.jl b/base/math.jl
index 5760a7b9916ab8..820c728a861656 100644
--- a/base/math.jl
+++ b/base/math.jl
@@ -41,6 +41,24 @@ end
 
 # non-type specific math functions
 
+@inline function two_mul(x::Float64, y::Float64)
+    if Core.Intrinsics.have_fma(Float64)
+        xy = x*y
+        return xy, fma(x, y, -xy)
+    end
+    return Base.twomul(x,y)
+end
+
+@inline function two_mul(x::T, y::T) where T<: Union{Float16, Float32}
+    if Core.Intrinsics.have_fma(T)
+        xy = x*y
+        return xy, fma(x, y, -xy)
+    end
+    xy = widen(x)*y
+    Txy = T(xy)
+    return Txy, T(xy-Txy)
+end
+
 """
     clamp(x, lo, hi)
 
@@ -278,8 +296,7 @@ end
     hi, lo = p[end], zero(x)
     for i in length(p)-1:-1:1
         pi = p[i]
-        prod = hi*x
-        err = fma(hi, x, -prod)
+        prod, err = two_mul(hi,x)
         hi = pi+prod
         lo = fma(lo, x, prod - (hi - pi) + err)
     end
@@ -686,7 +703,7 @@ function _hypot(x, y)
     end
     h = sqrt(muladd(ax, ax, ay*ay))
     # This branch is correctly rounded but requires a native hardware fma.
-    if Base.Math.FMA_NATIVE
+    if Core.Intrinsics.have_fma(typeof(h))
         hsquared = h*h
         axsquared = ax*ax
         h -= (fma(-ay, ay, hsquared-axsquared) + fma(h, h,-hsquared) - fma(ax, ax, -axsquared))/(2*h)
@@ -990,13 +1007,13 @@ function ^(x::Float64, n::Integer)
     n == 3 && return x*x*x # keep compatibility with literal_pow
     while n > 1
         if n&1 > 0
-            yn = x*y
-            ynlo = fma(x, y , -yn) + muladd(y, xnlo, x*ynlo)
-            y = yn
+            err = muladd(y, xnlo, x*ynlo)
+            y, ynlo = two_mul(x,y)
+            ynlo += err
         end
-        xn = x * x
-        xnlo = muladd(x, 2*xnlo, fma(x, x, -xn))
-        x = xn
+        err = x*2*xnlo
+        x, xnlo = two_mul(x, x)
+        xnlo += err
         n >>>= 1
     end
     !isfinite(x) && return x*y
diff --git a/base/special/hyperbolic.jl b/base/special/hyperbolic.jl
index d84cadcb2b6f21..74f750064c7c25 100644
--- a/base/special/hyperbolic.jl
+++ b/base/special/hyperbolic.jl
@@ -32,8 +32,7 @@ SINH_SMALL_X(::Type{Float32}) = 3.0f0
 
 # For Float64, use DoubleFloat scheme for extra accuracy
 function sinh_kernel(x::Float64)
-    x2 = x*x
-    x2lo = fma(x,x,-x2)
+    x2, x2lo = two_mul(x,x)
     hi_order = evalpoly(x2, (8.333333333336817e-3, 1.9841269840165435e-4,
                              2.7557319381151335e-6, 2.5052096530035283e-8,
                              1.6059550718903307e-10, 7.634842144412119e-13,
diff --git a/base/special/log.jl b/base/special/log.jl
index d9c91bc5659518..bca0d7143db488 100644
--- a/base/special/log.jl
+++ b/base/special/log.jl
@@ -139,10 +139,6 @@ const t_log_Float32 = (0.0,0.007782140442054949,0.015504186535965254,0.023167059
     0.6773988235918061,0.6813592248079031,0.6853040030989194,0.689233281238809,
     0.6931471805599453)
 
-# determine if hardware FMA is available
-# should probably check with LLVM, see #9855.
-const FMA_NATIVE = muladd(nextfloat(1.0),nextfloat(1.0),-nextfloat(1.0,2)) != 0
-
 # truncate lower order bits (up to 26)
 # ideally, this should be able to use ANDPD instructions, see #9868.
 @inline function truncbits(x::Float64)
@@ -209,18 +205,10 @@ end
     # 2(f-u1-u2) - f*(u1+u2) = 0
     # 2(f-u1) - f*u1 = (2+f)u2
     # u2 = (2(f-u1) - f*u1)/(2+f)
-    if FMA_NATIVE
-        return u + fma(fma(-u,f,2(f-u)), g, q)
-    else
-        u1 = truncbits(u) # round to 24 bits
-        f1 = truncbits(f)
-        f2 = f-f1
-        u2 = ((2.0*(f-u1)-u1*f1)-u1*f2)*g
-        ## Step 4
-        m_hi = logbU(Float64, base)
-        m_lo = logbL(Float64, base)
-        return fma(m_hi, u1, fma(m_hi, (u2 + q), m_lo*u1))
-    end
+
+    m_hi = logbU(Float64, base)
+    m_lo = logbL(Float64, base)
+    return fma(m_hi, u, fma(m_lo, u, m_hi*fma(fma(-u,f,2(f-u)), g, q)))
 end
@@ -417,8 +405,8 @@ end
                              0.153846227114512262845736, 0.13332981086846273921509,
                              0.117754809412463995466069, 0.103239680901072952701192,
                              0.116255524079935043668677))
-    res_hi = hi_order * x_hi
-    res_lo = fma(x_lo, hi_order, fma(hi_order, x_hi, -res_hi))
+    res_hi, res_lo = two_mul(hi_order, x_hi)
+    res_lo = fma(x_lo, hi_order, res_lo)
     ans_hi = c1hi + res_hi
     ans_lo = ((c1hi - ans_hi) + res_hi) + (res_lo + 3.80554962542412056336616e-17)
     return ans_hi, ans_lo
@@ -440,8 +428,8 @@ function _log_ext(d::Float64)
     invy = inv(mp1hi)
     xhi = (m - 1.0) * invy
    xlo = fma(-xhi, mp1lo, fma(-xhi, mp1hi, m - 1.0)) * invy
-    x2hi = xhi * xhi
-    x2lo = muladd(xhi, xlo * 2.0, fma(xhi, xhi, -x2hi))
+    x2hi, x2lo = two_mul(xhi, xhi)
+    x2lo = muladd(xhi, xlo * 2.0, x2lo)
     thi, tlo = log_ext_kernel(x2hi, x2lo)
 
     shi = 0.6931471805582987 * e
@@ -449,10 +437,10 @@ function _log_ext(d::Float64)
     shinew = muladd(xhi, 2.0, shi)
     slo = muladd(1.6465949582897082e-12, e, muladd(xlo, 2.0, (((shi - shinew) + xhi2))))
     shi = shinew
-    x3hi = x2hi * xhi
-    x3lo = muladd(x2hi, xlo, muladd(xhi, x2lo,fma(x2hi, xhi, -x3hi)))
-    x3thi = x3hi * thi
-    x3tlo = muladd(x3hi, tlo, muladd(x3lo, thi, fma(x3hi, thi, -x3thi)))
+    x3hi, x3lo = two_mul(x2hi, xhi)
+    x3lo = muladd(x2hi, xlo, muladd(xhi, x2lo,x3lo))
+    x3thi, x3tlo = two_mul(x3hi, thi)
+    x3tlo = muladd(x3hi, tlo, muladd(x3lo, thi, x3tlo))
     anshi = x3thi + shi
     anslo = slo + x3tlo - ((anshi - shi) - x3thi)
     return anshi, anslo
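
For reference, a minimal sketch (not part of the patch) of the error-free product that two_mul and the fma-based rewrites above rely on: the rounded product plus the fma-computed remainder reconstructs x*y exactly. The inputs x and y below are arbitrary illustrative values.

    # hi is the correctly rounded product, lo the rounding error;
    # fma evaluates x*y - hi with a single rounding, so lo is exact.
    x, y = 1.0 + 2.0^-30, 1.0 - 2.0^-29
    hi = x * y
    lo = fma(x, y, -hi)
    # hi + lo equals the exact product (checked here in extended precision)
    @assert big(hi) + big(lo) == big(x) * big(y)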