From 6f554ef7eedde1c904190e84f8ad82ca80ddaa67 Mon Sep 17 00:00:00 2001 From: AhmedsElysium Date: Mon, 31 Mar 2025 17:16:37 +0200 Subject: [PATCH 01/16] Added erf(x) Float64/Float32 Julia implementation --- src/erf.jl | 264 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 259 insertions(+), 5 deletions(-) diff --git a/src/erf.jl b/src/erf.jl index c878cf5d..f53724b8 100644 --- a/src/erf.jl +++ b/src/erf.jl @@ -12,9 +12,6 @@ for f in (:erf, :erfc) @eval begin $f(x::Number) = $internalf(float(x)) - $internalf(x::Float64) = ccall(($libopenlibmf, libopenlibm), Float64, (Float64,), x) - $internalf(x::Float32) = ccall(($libopenlibmf0, libopenlibm), Float32, (Float32,), x) - $internalf(x::Float16) = Float16($internalf(Float32(x))) $internalf(z::Complex{Float64}) = Complex{Float64}(ccall(($openspecfunf, libopenspecfun), Complex{Float64}, (Complex{Float64}, Float64), z, zero(Float64))) $internalf(z::Complex{Float32}) = Complex{Float32}(ccall(($openspecfunf, libopenspecfun), Complex{Float64}, (Complex{Float64}, Float64), Complex{Float64}(z), Float64(eps(Float32)))) @@ -28,6 +25,10 @@ for f in (:erf, :erfc) end end + erfc(x::Float64) = ccall((erfc, libopenlibm), Float64, (Float64,), x) + erfc(x::Float32) = ccall((erfcf, libopenlibm), Float32, (Float32,), x) + erfc(x::Float16) = Float16(erfc(Float32(x))) + for f in (:erfcx, :erfi, :dawson, :faddeeva) internalf = Symbol(:_, f) openspecfunfsym = Symbol(:Faddeeva_, f === :dawson ? :Dawson : f === :faddeeva ? :w : f) @@ -96,10 +97,263 @@ See also: [`erfinv(x)`](@ref erfinv), [`erfcinv(x)`](@ref erfcinv). # Implementation by -- `Float32`/`Float64`: C standard math library - [libm](https://en.wikipedia.org/wiki/C_mathematical_functions#libm). +- `Float32`/`Float64`: Julia implementation of https://github.com/ARM-software/optimized-routines/blob/master/math/erf.c - `BigFloat`: C library for multiple-precision floating-point [MPFR](https://www.mpfr.org/) """ + +# Fast erf implementation using a mix of +# rational and polynomial approximations. +# Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0. +function erf(x::Float64) + # Minimax approximation of erf + PA=(0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4,-0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11,0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20,-0x1.18c47fd143c5ep-23) + # Rational approximation on [0x1p-28, 0.84375] + NA=(0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6,-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16) + DA=(0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8,0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18) + # Rational approximation on [0.84375, 1.25] + NB=( -0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2, 0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5, -0x1.1bf380a96073fp-9 ) + DB=( 0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4, 0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7 ) + + # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25 + PC=( 0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2, -0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5, -0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8, -0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12, 0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16, -0x1.578c9e375d37p-19) + # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2 + PD=( 0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3, -0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2, -0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2, 0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3, -0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3, 0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10 ) + # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25 + PE=( 0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14, -0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12, 0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14, -0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20, -0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23 ) + # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4 + PF=( 0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19, -0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14, 0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11, -0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11, 0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14, -0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19 ) + + C = 0x1.b0ac16p-1 + + TwoOverSqrtPiMinusOne=0x1.06eba8214db69p-3 + + + # # top 32 bits + ix::UInt32=reinterpret(UInt64,x)>>32 + # # top 32, without sign bit + ia::UInt32=ix & 0x7fffffff + # # sign + # sign::UInt32=ix>>31 + + sign::Bool=x<0 + + + + if (ia < 0x3feb0000) + # a = |x| < 0.84375. + + if (ia < 0x3e300000) + # a < 2^(-28). + if (ia < 0x00800000) + # a < 2^(-1015). + y = fma(TwoOverSqrtPiMinusOne, x, x) + + ## case of underflow TBD + #return check_uflow (y) + return y + end + return x + TwoOverSqrtPiMinusOne * x + end + + x2 = x * x + + if (ia < 0x3fe00000) + ## a < 0.5 - Use polynomial approximation. + r1 = fma(x2, PA[2], PA[1]) + r2 = fma(x2, PA[4], PA[3]) + r3 = fma(x2, PA[6], PA[5]) + r4 = fma(x2, PA[8], PA[7]) + r5 = fma(x2, PA[10], PA[9]) + + x4 = x2 * x2 + r = r5 + r = fma(x4, r, r4) + r = fma(x4, r, r3) + r = fma(x4, r, r2) + r = fma(x4, r, r1) + return fma(r, x, x) ## This fma is crucial for accuracy. + else + ## 0.5 <= a < 0.84375 - Use rational approximation. + + r1n = fma(x2, NA[2], NA[1]) + x4 = x2 * x2 + r2n = fma(x2, NA[4], NA[3]) + x8 = x4 * x4 + r1d = fma(x2, DA[1], 1.0) + r2d = fma(x2, DA[3], DA[2]) + r3d = fma(x2, DA[5], DA[4]) + P = r1n + x4 * r2n + x8 * NA[5] + + Q = r1d + x4 * r2d + x8 * r3d + return fma(P / Q, x, x) + end + elseif (ia < 0x3ff40000) + ## 0.84375 <= |x| < 1.25. + + a = abs(x) - 1.0 + r1n = fma(a, NB[2], NB[1]) + a2 = a * a + r1d = fma(a, DB[1], 1.0) + a4 = a2 * a2 + r2n = fma(a, NB[4], NB[3]) + a6 = a4 * a2 + r2d = fma(a, DB[3], DB[2]) + r3n = fma(a, NB[6], NB[5]) + r3d = fma(a, DB[5], DB[4]) + r4n = NB[7] + r4d = DB[6] + + P = r1n + a2 * r2n + a4 * r3n + a6 * r4n + Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d + if (sign) + return -C - P / Q + else + return C + P / Q + end + elseif (ia < 0x40000000) + ## 1.25 <= |x| < 2.0. + a = abs(x) + a = a - 1.25 + + r1 = fma(a, PC[2], PC[1]) + r2 = fma(a, PC[4], PC[3]) + r3 = fma(a, PC[6], PC[5]) + r4 = fma(a, PC[8], PC[7]) + r5 = fma(a, PC[10], PC[9]) + r6 = fma(a, PC[12], PC[11]) + r7 = fma(a, PC[14], PC[13]) + r8 = fma(a, PC[16], PC[15]) + + + a2 = a * a + + r = r8 + r = fma(a2, r, r7) + r = fma(a2, r, r6) + r = fma(a2, r, r5) + r = fma(a2, r, r4) + r = fma(a2, r, r3) + r = fma(a2, r, r2) + r = fma(a2, r, r1) + + if (sign) + return -1.0 + r + else + return 1.0 - r + end + elseif (ia < 0x400a0000) + ## 2 <= |x| < 3.25. + a = abs(x) + a = fma(0.5, a, -1.0) + + r1 = fma(a, PD[2], PD[1]) + r2 = fma(a, PD[4], PD[3]) + r3 = fma(a, PD[6], PD[5]) + r4 = fma(a, PD[8], PD[7]) + r5 = fma(a, PD[10], PD[9]) + r6 = fma(a, PD[12], PD[11]) + r7 = fma(a, PD[14], PD[13]) + r8 = fma(a, PD[16], PD[15]) + r9 = fma(a, PD[18], PD[17]) + + a2 = a * a + + r = r9 + r = fma(a2, r, r8) + r = fma(a2, r, r7) + r = fma(a2, r, r6) + r = fma(a2, r, r5) + r = fma(a2, r, r4) + r = fma(a2, r, r3) + r = fma(a2, r, r2) + r = fma(a2, r, r1) + + if (sign) + return -1.0 + r + else + return 1.0 - r + end + elseif (ia < 0x40100000) + ## 3.25 <= |x| < 4.0. + a = abs(x) + a = a - 3.25 + + r1 = fma(a, PE[2], PE[1]) + r2 = fma(a, PE[4], PE[3]) + r3 = fma(a, PE[6], PE[5]) + r4 = fma(a, PE[8], PE[7]) + r5 = fma(a, PE[10], PE[9]) + r6 = fma(a, PE[12], PE[11]) + r7 = fma(a, PE[14], PE[13]) + + + a2 = a * a + + r = r7 + r = fma(a2, r, r6) + r = fma(a2, r, r5) + r = fma(a2, r, r4) + r = fma(a2, r, r3) + r = fma(a2, r, r2) + r = fma(a2, r, r1) + + if (sign) + return -1.0 + r + else + return 1.0 - r + end + elseif (ia < 0x4017a000) + ## 4 <= |x| < 5.90625. + a = abs(x) + a = fma(0.5, a, -2.0) + + r1 = fma(a, PF[2], PF[1]) + r2 = fma(a, PF[4], PF[3]) + r3 = fma(a, PF[6], PF[5]) + r4 = fma(a, PF[8], PF[7]) + r5 = fma(a, PF[10], PF[9]) + r6 = fma(a, PF[12], PF[11]) + r7 = fma(a, PF[14], PF[13]) + r8 = fma(a, PF[16], PF[15]) + + r9 = PF[17] + + a2 = a * a + + r = r9 + r = fma(a2, r, r8) + r = fma(a2, r, r7) + r = fma(a2, r, r6) + r = fma(a2, r, r5) + r = fma(a2, r, r4) + r = fma(a2, r, r3) + r = fma(a2, r, r2) + r = fma(a2, r, r1) + + if (sign) + return -1.0 + r + else + return 1.0 - r + end + else + + + if (sign) + return -1.0 + else + return 1.0 + end + + end + + +end + +erf(x::Float32)=Float32(erf(Float64(x))) + +erf(x::Float16)=Float16(erf(Float64(x))) + + function erf end """ erf(x, y) From 7f4fd2d3ba045549f33348087c4e778242232633 Mon Sep 17 00:00:00 2001 From: AhmedsElysium Date: Mon, 31 Mar 2025 17:42:00 +0200 Subject: [PATCH 02/16] changed erf to _erf, got rid of unnecessary branch --- src/erf.jl | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/src/erf.jl b/src/erf.jl index f53724b8..77f1411b 100644 --- a/src/erf.jl +++ b/src/erf.jl @@ -25,9 +25,9 @@ for f in (:erf, :erfc) end end - erfc(x::Float64) = ccall((erfc, libopenlibm), Float64, (Float64,), x) - erfc(x::Float32) = ccall((erfcf, libopenlibm), Float32, (Float32,), x) - erfc(x::Float16) = Float16(erfc(Float32(x))) + _erfc(x::Float64) = ccall((_erfc, libopenlibm), Float64, (Float64,), x) + _erfc(x::Float32) = ccall((_erfcf, libopenlibm), Float32, (Float32,), x) + _erfc(x::Float16) = Float16(_erfc(Float32(x))) for f in (:erfcx, :erfi, :dawson, :faddeeva) internalf = Symbol(:_, f) @@ -104,7 +104,7 @@ See also: # Fast erf implementation using a mix of # rational and polynomial approximations. # Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0. -function erf(x::Float64) +function _erf(x::Float64) # Minimax approximation of erf PA=(0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4,-0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11,0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20,-0x1.18c47fd143c5ep-23) # Rational approximation on [0x1p-28, 0.84375] @@ -142,19 +142,6 @@ function erf(x::Float64) if (ia < 0x3feb0000) # a = |x| < 0.84375. - if (ia < 0x3e300000) - # a < 2^(-28). - if (ia < 0x00800000) - # a < 2^(-1015). - y = fma(TwoOverSqrtPiMinusOne, x, x) - - ## case of underflow TBD - #return check_uflow (y) - return y - end - return x + TwoOverSqrtPiMinusOne * x - end - x2 = x * x if (ia < 0x3fe00000) @@ -349,9 +336,9 @@ function erf(x::Float64) end -erf(x::Float32)=Float32(erf(Float64(x))) +_erf(x::Float32)=Float32(_erf(Float64(x))) -erf(x::Float16)=Float16(erf(Float64(x))) +_erf(x::Float16)=Float16(_erf(Float64(x))) function erf end From e784c9f0834681e030b078c4d0556629d16a4cdc Mon Sep 17 00:00:00 2001 From: AhmedsElysium Date: Mon, 31 Mar 2025 17:53:36 +0200 Subject: [PATCH 03/16] fixed syntax error in ccall --- src/erf.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/erf.jl b/src/erf.jl index 77f1411b..598c925b 100644 --- a/src/erf.jl +++ b/src/erf.jl @@ -25,8 +25,8 @@ for f in (:erf, :erfc) end end - _erfc(x::Float64) = ccall((_erfc, libopenlibm), Float64, (Float64,), x) - _erfc(x::Float32) = ccall((_erfcf, libopenlibm), Float32, (Float32,), x) + _erfc(x::Float64) = ccall((erfc, libopenlibm), Float64, (Float64,), x) + _erfc(x::Float32) = ccall((erfcf, libopenlibm), Float32, (Float32,), x) _erfc(x::Float16) = Float16(_erfc(Float32(x))) for f in (:erfcx, :erfi, :dawson, :faddeeva) From da16cb124bb7c58ca4086b0e6d70bf19367cb4df Mon Sep 17 00:00:00 2001 From: AhmedsElysium Date: Mon, 31 Mar 2025 17:57:07 +0200 Subject: [PATCH 04/16] fixed syntax error in ccall 2 --- src/erf.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/erf.jl b/src/erf.jl index 598c925b..ea252819 100644 --- a/src/erf.jl +++ b/src/erf.jl @@ -25,8 +25,8 @@ for f in (:erf, :erfc) end end - _erfc(x::Float64) = ccall((erfc, libopenlibm), Float64, (Float64,), x) - _erfc(x::Float32) = ccall((erfcf, libopenlibm), Float32, (Float32,), x) + _erfc(x::Float64) = ccall((:erfc, libopenlibm), Float64, (Float64,), x) + _erfc(x::Float32) = ccall((:erfcf, libopenlibm), Float32, (Float32,), x) _erfc(x::Float16) = Float16(_erfc(Float32(x))) for f in (:erfcx, :erfi, :dawson, :faddeeva) From 0a755b6e398accfb6dfa9ac5a8a8b733ac9a8164 Mon Sep 17 00:00:00 2001 From: AhmedsElysium Date: Mon, 31 Mar 2025 18:24:47 +0200 Subject: [PATCH 05/16] NaN edge case for erf(x) --- src/erf.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/erf.jl b/src/erf.jl index ea252819..92865360 100644 --- a/src/erf.jl +++ b/src/erf.jl @@ -324,6 +324,9 @@ function _erf(x::Float64) end else + if(isnan(x)) + return NaN + end if (sign) return -1.0 From 6efcec8e1948cfab242c6048d018ac952ec0c251 Mon Sep 17 00:00:00 2001 From: AhmedsElysium Date: Mon, 31 Mar 2025 19:05:43 +0200 Subject: [PATCH 06/16] added test cases for erf(x) --- test/erf.jl | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/erf.jl b/test/erf.jl index 29127343..70f1bff0 100644 --- a/test/erf.jl +++ b/test/erf.jl @@ -2,7 +2,23 @@ @testset "real argument" begin for T in (Float16, Float32, Float64) @test @inferred(erf(T(1))) isa T + @test erf(T(0.25)) ≈ T(0.27632639016823696) rtol=2*eps(T) + @test erf(T(0.75)) ≈ T(0.7111556336535151) rtol=2*eps(T) @test erf(T(1)) ≈ T(0.84270079294971486934) rtol=2*eps(T) + @test erf(T(1.5)) ≈ T(0.9661051464753108) rtol=2*eps(T) + @test erf(T(2.5)) ≈ T(0.9995930479825551) rtol=2*eps(T) + @test erf(T(3.5)) ≈ T(0.9999992569016276) rtol=2*eps(T) + @test erf(T(4.5)) ≈ T(0.9999999998033839) rtol=2*eps(T) + @test erf(T(6)) ≈ T(1.0) rtol=2*eps(T) + + @test erf(T(-0.25)) ≈ T(-0.27632639016823696) rtol=2*eps(T) + @test erf(T(-0.75)) ≈ T(-0.7111556336535151) rtol=2*eps(T) + @test erf(T(-1)) ≈ T(-0.84270079294971486934) rtol=2*eps(T) + @test erf(T(-1.5)) ≈ T(-0.9661051464753108) rtol=2*eps(T) + @test erf(T(-2.5)) ≈ T(-0.9995930479825551) rtol=2*eps(T) + @test erf(T(-3.5)) ≈ T(-0.9999992569016276) rtol=2*eps(T) + @test erf(T(-4.5)) ≈ T(-0.9999999998033839) rtol=2*eps(T) + @test erf(T(-6)) ≈ T(-1.0) rtol=2*eps(T) @test @inferred(erfc(T(1))) isa T @test erfc(T(1)) ≈ T(0.15729920705028513066) rtol=2*eps(T) From 3cee8ce25fa49bde6a377d8235da24fc524375e8 Mon Sep 17 00:00:00 2001 From: AhmedsElysium Date: Sat, 2 Aug 2025 10:35:12 +0300 Subject: [PATCH 07/16] cleaned up erf(Float64) --- src/erf.jl | 184 ++++++++++++++--------------------------------------- 1 file changed, 46 insertions(+), 138 deletions(-) diff --git a/src/erf.jl b/src/erf.jl index 9fcdc110..8a9f1df7 100644 --- a/src/erf.jl +++ b/src/erf.jl @@ -97,7 +97,7 @@ See also: [`erfinv(x)`](@ref erfinv), [`erfcinv(x)`](@ref erfcinv). # Implementation by -- `Float32`/`Float64`: Julia implementation of https://github.com/ARM-software/optimized-routines/blob/master/math/erf.c +- `Float32`/`Float64`: Julia implementation of https://github.com/ARM-software/optimized-routines/blob/master/math/erf.c with modification - `BigFloat`: C library for multiple-precision floating-point [MPFR](https://www.mpfr.org/) """ @@ -105,28 +105,6 @@ See also: # rational and polynomial approximations. # Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0. function _erf(x::Float64) - # Minimax approximation of erf - PA=(0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4,-0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11,0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20,-0x1.18c47fd143c5ep-23) - # Rational approximation on [0x1p-28, 0.84375] - NA=(0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6,-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16) - DA=(0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8,0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18) - # Rational approximation on [0.84375, 1.25] - NB=( -0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2, 0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5, -0x1.1bf380a96073fp-9 ) - DB=( 0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4, 0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7 ) - - # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25 - PC=( 0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2, -0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5, -0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8, -0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12, 0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16, -0x1.578c9e375d37p-19) - # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2 - PD=( 0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3, -0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2, -0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2, 0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3, -0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3, 0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10 ) - # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25 - PE=( 0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14, -0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12, 0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14, -0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20, -0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23 ) - # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4 - PF=( 0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19, -0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14, 0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11, -0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11, 0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14, -0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19 ) - - C = 0x1.b0ac16p-1 - - TwoOverSqrtPiMinusOne=0x1.06eba8214db69p-3 - # # top 32 bits ix::UInt32=reinterpret(UInt64,x)>>32 @@ -146,52 +124,40 @@ function _erf(x::Float64) if (ia < 0x3fe00000) ## a < 0.5 - Use polynomial approximation. - r1 = fma(x2, PA[2], PA[1]) - r2 = fma(x2, PA[4], PA[3]) - r3 = fma(x2, PA[6], PA[5]) - r4 = fma(x2, PA[8], PA[7]) - r5 = fma(x2, PA[10], PA[9]) - - x4 = x2 * x2 - r = r5 - r = fma(x4, r, r4) - r = fma(x4, r, r3) - r = fma(x4, r, r2) - r = fma(x4, r, r1) - return fma(r, x, x) ## This fma is crucial for accuracy. + + # Minimax approximation of erf of the form x*P(x^2) approximately on the interval [0;0.5] + PA=(0.12837916709551256, -0.3761263890318287, 0.11283791670896592, -0.026866170630075903, 0.005223977428649761, -0.0008548312229989974, 0.00012054654502151287, -1.4906315067498891e-5, 1.6126444349070824e-6, -1.3074259056679966e-7) + + r= fma(x,evalpoly(x2,PA),x) ## This fma is crucial for accuracy. + + return r else ## 0.5 <= a < 0.84375 - Use rational approximation. - r1n = fma(x2, NA[2], NA[1]) - x4 = x2 * x2 - r2n = fma(x2, NA[4], NA[3]) - x8 = x4 * x4 - r1d = fma(x2, DA[1], 1.0) - r2d = fma(x2, DA[3], DA[2]) - r3d = fma(x2, DA[5], DA[4]) - P = r1n + x4 * r2n + x8 * NA[5] + # Rational approximation on [0x1p-28, 0.84375] + NA=(0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6,-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16) + DA=(1,0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8,0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18) + + P=evalpoly(x2,NA) + Q=evalpoly(x2,DA) - Q = r1d + x4 * r2d + x8 * r3d return fma(P / Q, x, x) end elseif (ia < 0x3ff40000) ## 0.84375 <= |x| < 1.25. + # Rational approximation on [0.84375, 1.25] + NB=( -0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2, 0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5, -0x1.1bf380a96073fp-9 ) + DB=( 1, 0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4, 0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7 ) + + C = 0x1.b0ac16p-1 + a = abs(x) - 1.0 - r1n = fma(a, NB[2], NB[1]) - a2 = a * a - r1d = fma(a, DB[1], 1.0) - a4 = a2 * a2 - r2n = fma(a, NB[4], NB[3]) - a6 = a4 * a2 - r2d = fma(a, DB[3], DB[2]) - r3n = fma(a, NB[6], NB[5]) - r3d = fma(a, DB[5], DB[4]) - r4n = NB[7] - r4d = DB[6] - - P = r1n + a2 * r2n + a4 * r3n + a6 * r4n - Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d + + P=evalpoly(a,NB) + Q=evalpoly(a,DB) + + if (sign) return -C - P / Q else @@ -201,27 +167,13 @@ function _erf(x::Float64) ## 1.25 <= |x| < 2.0. a = abs(x) a = a - 1.25 + + # erfc polynomial approximation + # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25 + PC=( 0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2, -0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5, -0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8, -0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12, 0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16, -0x1.578c9e375d37p-19) - r1 = fma(a, PC[2], PC[1]) - r2 = fma(a, PC[4], PC[3]) - r3 = fma(a, PC[6], PC[5]) - r4 = fma(a, PC[8], PC[7]) - r5 = fma(a, PC[10], PC[9]) - r6 = fma(a, PC[12], PC[11]) - r7 = fma(a, PC[14], PC[13]) - r8 = fma(a, PC[16], PC[15]) - - - a2 = a * a - - r = r8 - r = fma(a2, r, r7) - r = fma(a2, r, r6) - r = fma(a2, r, r5) - r = fma(a2, r, r4) - r = fma(a2, r, r3) - r = fma(a2, r, r2) - r = fma(a2, r, r1) + # Obtains erfc of |x| + r=evalpoly(a,PC) if (sign) return -1.0 + r @@ -233,27 +185,13 @@ function _erf(x::Float64) a = abs(x) a = fma(0.5, a, -1.0) - r1 = fma(a, PD[2], PD[1]) - r2 = fma(a, PD[4], PD[3]) - r3 = fma(a, PD[6], PD[5]) - r4 = fma(a, PD[8], PD[7]) - r5 = fma(a, PD[10], PD[9]) - r6 = fma(a, PD[12], PD[11]) - r7 = fma(a, PD[14], PD[13]) - r8 = fma(a, PD[16], PD[15]) - r9 = fma(a, PD[18], PD[17]) - - a2 = a * a - - r = r9 - r = fma(a2, r, r8) - r = fma(a2, r, r7) - r = fma(a2, r, r6) - r = fma(a2, r, r5) - r = fma(a2, r, r4) - r = fma(a2, r, r3) - r = fma(a2, r, r2) - r = fma(a2, r, r1) + # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2 + PD=( 0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3, -0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2, -0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2, 0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3, -0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3, 0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10 ) + + + # Obtains erfc of |x| + r=evalpoly(a,PD) + if (sign) return -1.0 + r @@ -265,24 +203,11 @@ function _erf(x::Float64) a = abs(x) a = a - 3.25 - r1 = fma(a, PE[2], PE[1]) - r2 = fma(a, PE[4], PE[3]) - r3 = fma(a, PE[6], PE[5]) - r4 = fma(a, PE[8], PE[7]) - r5 = fma(a, PE[10], PE[9]) - r6 = fma(a, PE[12], PE[11]) - r7 = fma(a, PE[14], PE[13]) - + # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25 + PE=( 0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14, -0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12, 0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14, -0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20, -0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23 ) - a2 = a * a + r=evalpoly(a,PE) - r = r7 - r = fma(a2, r, r6) - r = fma(a2, r, r5) - r = fma(a2, r, r4) - r = fma(a2, r, r3) - r = fma(a2, r, r2) - r = fma(a2, r, r1) if (sign) return -1.0 + r @@ -294,28 +219,11 @@ function _erf(x::Float64) a = abs(x) a = fma(0.5, a, -2.0) - r1 = fma(a, PF[2], PF[1]) - r2 = fma(a, PF[4], PF[3]) - r3 = fma(a, PF[6], PF[5]) - r4 = fma(a, PF[8], PF[7]) - r5 = fma(a, PF[10], PF[9]) - r6 = fma(a, PF[12], PF[11]) - r7 = fma(a, PF[14], PF[13]) - r8 = fma(a, PF[16], PF[15]) - - r9 = PF[17] - - a2 = a * a - - r = r9 - r = fma(a2, r, r8) - r = fma(a2, r, r7) - r = fma(a2, r, r6) - r = fma(a2, r, r5) - r = fma(a2, r, r4) - r = fma(a2, r, r3) - r = fma(a2, r, r2) - r = fma(a2, r, r1) + # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4 + PF=( 0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19, -0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14, 0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11, -0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11, 0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14, -0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19 ) + + r=evalpoly(a,PF) + if (sign) return -1.0 + r From b819c5896afd08be4876909e339e8c5cbcf08d06 Mon Sep 17 00:00:00 2001 From: AhmedsElysium Date: Sun, 14 Sep 2025 02:40:30 +0300 Subject: [PATCH 08/16] added erf(x::Float32) implementation --- src/erf.jl | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) diff --git a/src/erf.jl b/src/erf.jl index 8a9f1df7..76116f3c 100644 --- a/src/erf.jl +++ b/src/erf.jl @@ -97,7 +97,8 @@ See also: [`erfinv(x)`](@ref erfinv), [`erfcinv(x)`](@ref erfcinv). # Implementation by -- `Float32`/`Float64`: Julia implementation of https://github.com/ARM-software/optimized-routines/blob/master/math/erf.c with modification +- `Float32`: polynomial approximations of erf +- `Float64`: Julia implementation of https://github.com/ARM-software/optimized-routines/blob/master/math/erf.c - `BigFloat`: C library for multiple-precision floating-point [MPFR](https://www.mpfr.org/) """ @@ -247,9 +248,55 @@ function _erf(x::Float64) end -_erf(x::Float32)=Float32(_erf(Float64(x))) +function _erf(x::Float32) + xabs=abs(x) -_erf(x::Float16)=Float16(_erf(Float64(x))) + if (xabs< 0.5) + # range [0;0.5] + # # erf approximation using erf(x)=x+x*P(x^2) with degree 6 + # Sollya::remez(erf(sqrt(x))/sqrt(x)-1,6,[1e-32;0.5],1,1e-32); + + p=(0.128379167084072442015971708878153077794812875442854468252262977759538466466026108f0 ,-0.37612638677173360887708232154133809982218085424044576090554860522057182310851056f0 ,0.112837843774249243616280397250157777250507394914770984188915122197580134574091983f0 ,-2.68652865375831097248410803484628824158061934615600565461807890879374835870381413f-2 ,5.2188560068141289500478726517878958032487316358404909126782459380536157982980317f-3 ,-8.3948481565340715162646401327604259604058983841328238470236141987304564416187075f-4 ) + return copysign(fma(evalpoly(x^2,p),x,x),x) + + + elseif(xabs<1.25) + # range [0.5;1.25] + # # erfc approximation with degree 11 + # Sollya::remez(erfc(x+0.5),11,[0;1.25-0.5],1,1e-32); + + p=(0.47950012218627633266759004480928346323282597139306026470914603497045110845486174f0 ,-0.87878257867905935820797978014037919344903083907274027447236660730320123742574287f0 ,0.43939127340837007019212165648359529028950189241566682841945406944095273511955846f0 ,0.14646415648788187125149469711213532491861248937046075996222718522704324115126361f0 ,-0.18308466657525286729680773848609379638263457954975049708494183866284386220432776f0 ,-7.2864220919698610613876940798326935208926368515022370715399462640511442447273458f-3 ,4.9870470155967931820488830916894059744198728056328087545059900897748278238141407f-2 ,-4.8868244174386309272067176761136942655656143898910274020685204636212063119576779f-3 ,-1.1067663329874328692394302924580932358903686528252386541750628802256971128485086f-2 ,3.422346846104011114882331840937679141622349918363223030054135870224315763897819f-3 ,7.3027064271433070734900533407935697302293380332736344236451943042328241926267078f-4 ,-3.75817085020390291674591289563810393435170791862443379088867595819930602228183736f-4) + return copysign(1f0-evalpoly(xabs-0.5f0,p),x) + + + elseif(xabs<2.5) + # range [1.25;2.5] + # erfc approximation with degree 13 + # Sollya::remez(erfc(x+1.25),13,[1.25-1.25;2.5-1.25],1,1e-32); + + p=(7.7099871743416328126356260566547494286908999516428835479787235777958146662395595f-2 ,-0.236521122401763914017434484798130978411662107330375092318490618856395191655478216f0,0.29565140031735793699573224918347147906505950384598003349376150036910634397925794f0 ,-0.1675357298956664512006671881841788827839151996664942041368823179421003600967294f0 ,6.1585929514334837247453472606059635765901344910897544510201614098797139034234648f-3 ,4.7187118101037459805673209483303989110029907488612006887134009408435903419517292f-2 ,-2.13310233800625824496628789189780598311678632855392393834221656949343403265787497f-2 ,-3.5262544206616132931628291419365561301271536625777609043061691511329815920675659f-3 ,5.4618311186313877025711961981462734764321122794521259327727086934478142942850845f-3 ,-4.785867288141399400801105017099155449896231371492841611827465311139149075249888f-4 ,-1.27638529129849191843323454760991411930129610353375308047589690520960413915824394f-3 ,7.3386942792237055752542546076918383420248583637581325208367761430501498054660812f-4 ,-1.78316578653402766600700240536421560298308014794597944402973993319799701405750455f-4 ,1.7451623823305231302742688272669315889569143556707803172195238203096188492888552f-5) + return copysign(1f0-evalpoly(xabs-1.25f0,p),x) + + + elseif (xabs<4.0) + # range [2.5;4.0] + # erfc approximation with degree 13 + # Sollya::remez(erfc(x+2.5),13,[0;4-2.5],1,1e-32); + + p=(4.0695201730499156824979069048504659592517281936379802348048709133497117253173134f-4 ,-2.17828418992563158453260234674605666557610132217183615223667886784864793391972216f-3 ,5.4457086379161451966966115384340866958878328106709954104596733367011474611248579f-3 ,-8.3500528577413591092464067696190605106058226371648352228707616455735161644530442f-3 ,8.6220108431254600006776817786647116978577973677436276371791579294890933467761521f-3 ,-6.1151670580428489239821330412642128179703517796977571695036324539244245861906337f-3 ,2.7899458310817905157828387844634029030573127219171061924068857965376725014125688f-3 ,-5.1939500417227631511066424033635849427151215499427143347485235667556121428433924f-4 ,-3.0461048275173680879818513198875622675191891190039217228437020935156706025853604f-4 ,3.1068457426894474803882479013975662189036326925351567029013595934579911165234626f-4 ,-1.38668990773606991790025193904557899401436779771288081224199889420050208965460715f-4 ,3.6909690646602578301139502069782980693656088359691610474313757700491321631191208f-5 ,-5.6828887569336940854242289035551153938157998442490411471025066283558049904037765f-6 ,3.9297630357752347428080345871297027499036518175420666529979940028638158683656671f-7) + return copysign(1f0-evalpoly(xabs-2.5f0,p),x) + + + else + # |x| >= 4.0. + r=copysign(1f0,x) + end + return r + + +end + +_erf(x::Float16)=Float16(_erf(Float32(x))) function erf end From 57bbaf25a7b3051a936571cdaf64121713c74f6e Mon Sep 17 00:00:00 2001 From: AhmedsElysium Date: Sun, 14 Sep 2025 02:48:46 +0300 Subject: [PATCH 09/16] added NaN edge case to erf(x::Float32) --- src/erf.jl | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/erf.jl b/src/erf.jl index 76116f3c..bfb5d3af 100644 --- a/src/erf.jl +++ b/src/erf.jl @@ -288,11 +288,17 @@ function _erf(x::Float32) else - # |x| >= 4.0. - r=copysign(1f0,x) - end +# |x| >= 4.0. + + if(isnan(x)) + return NaN + end + + r=copysign(1f0,x) + return r + end end From 26b3b1f62eb2bbe3735ca64186084cd7073128aa Mon Sep 17 00:00:00 2001 From: AhmedsElysium Date: Sun, 14 Sep 2025 02:58:19 +0300 Subject: [PATCH 10/16] reversed NaN check --- src/erf.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/erf.jl b/src/erf.jl index bfb5d3af..d253d5eb 100644 --- a/src/erf.jl +++ b/src/erf.jl @@ -290,9 +290,6 @@ function _erf(x::Float32) else # |x| >= 4.0. - if(isnan(x)) - return NaN - end r=copysign(1f0,x) From 693f78878c5481ce6c4fdc75df847209f12e0433 Mon Sep 17 00:00:00 2001 From: AhmedsElysium Date: Sun, 14 Sep 2025 03:37:34 +0300 Subject: [PATCH 11/16] cleaned up float32 polynomials, added accuracy of float32, removed unnecessary whitespace, and removed explicit copysigns --- src/erf.jl | 88 +++++++++++++----------------------------------------- 1 file changed, 21 insertions(+), 67 deletions(-) diff --git a/src/erf.jl b/src/erf.jl index d253d5eb..1b59b24b 100644 --- a/src/erf.jl +++ b/src/erf.jl @@ -111,12 +111,6 @@ function _erf(x::Float64) ix::UInt32=reinterpret(UInt64,x)>>32 # # top 32, without sign bit ia::UInt32=ix & 0x7fffffff - # # sign - # sign::UInt32=ix>>31 - - sign::Bool=x<0 - - if (ia < 0x3feb0000) # a = |x| < 0.84375. @@ -130,7 +124,6 @@ function _erf(x::Float64) PA=(0.12837916709551256, -0.3761263890318287, 0.11283791670896592, -0.026866170630075903, 0.005223977428649761, -0.0008548312229989974, 0.00012054654502151287, -1.4906315067498891e-5, 1.6126444349070824e-6, -1.3074259056679966e-7) r= fma(x,evalpoly(x2,PA),x) ## This fma is crucial for accuracy. - return r else ## 0.5 <= a < 0.84375 - Use rational approximation. @@ -158,12 +151,9 @@ function _erf(x::Float64) P=evalpoly(a,NB) Q=evalpoly(a,DB) + r= C + P / Q + return copysign(r,x) - if (sign) - return -C - P / Q - else - return C + P / Q - end elseif (ia < 0x40000000) ## 1.25 <= |x| < 2.0. a = abs(x) @@ -174,13 +164,9 @@ function _erf(x::Float64) PC=( 0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2, -0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5, -0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8, -0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12, 0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16, -0x1.578c9e375d37p-19) # Obtains erfc of |x| - r=evalpoly(a,PC) + r=1.0-evalpoly(a,PC) + return copysign(r,x) - if (sign) - return -1.0 + r - else - return 1.0 - r - end elseif (ia < 0x400a0000) ## 2 <= |x| < 3.25. a = abs(x) @@ -189,16 +175,10 @@ function _erf(x::Float64) # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2 PD=( 0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3, -0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2, -0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2, 0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3, -0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3, 0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10 ) - # Obtains erfc of |x| - r=evalpoly(a,PD) + r=1.0-evalpoly(a,PD) + return copysign(r,x) - - if (sign) - return -1.0 + r - else - return 1.0 - r - end elseif (ia < 0x40100000) ## 3.25 <= |x| < 4.0. a = abs(x) @@ -207,14 +187,9 @@ function _erf(x::Float64) # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25 PE=( 0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14, -0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12, 0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14, -0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20, -0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23 ) - r=evalpoly(a,PE) - + r=1.0-evalpoly(a,PE) + return copysign(r,x) - if (sign) - return -1.0 + r - else - return 1.0 - r - end elseif (ia < 0x4017a000) ## 4 <= |x| < 5.90625. a = abs(x) @@ -223,31 +198,19 @@ function _erf(x::Float64) # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4 PF=( 0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19, -0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14, 0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11, -0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11, 0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14, -0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19 ) - r=evalpoly(a,PF) - - - if (sign) - return -1.0 + r - else - return 1.0 - r - end + r=1.0-evalpoly(a,PF) + return copysign(r,x) else - if(isnan(x)) return NaN end - - if (sign) - return -1.0 - else - return 1.0 - end - + copysign(1.0,x) end - - end +# Fast erf implementation using +# polynomial approximations of erf and erfc. +# Highest measured error is 1.12 ULPs at x = 1.232469 function _erf(x::Float32) xabs=abs(x) @@ -256,47 +219,38 @@ function _erf(x::Float32) # # erf approximation using erf(x)=x+x*P(x^2) with degree 6 # Sollya::remez(erf(sqrt(x))/sqrt(x)-1,6,[1e-32;0.5],1,1e-32); - p=(0.128379167084072442015971708878153077794812875442854468252262977759538466466026108f0 ,-0.37612638677173360887708232154133809982218085424044576090554860522057182310851056f0 ,0.112837843774249243616280397250157777250507394914770984188915122197580134574091983f0 ,-2.68652865375831097248410803484628824158061934615600565461807890879374835870381413f-2 ,5.2188560068141289500478726517878958032487316358404909126782459380536157982980317f-3 ,-8.3948481565340715162646401327604259604058983841328238470236141987304564416187075f-4 ) + p=(0.12837917f0, -0.37612638f0, 0.11283784f0, -0.026865287f0, 0.005218856f0, -0.0008394848f0) return copysign(fma(evalpoly(x^2,p),x,x),x) - elseif(xabs<1.25) # range [0.5;1.25] # # erfc approximation with degree 11 # Sollya::remez(erfc(x+0.5),11,[0;1.25-0.5],1,1e-32); - p=(0.47950012218627633266759004480928346323282597139306026470914603497045110845486174f0 ,-0.87878257867905935820797978014037919344903083907274027447236660730320123742574287f0 ,0.43939127340837007019212165648359529028950189241566682841945406944095273511955846f0 ,0.14646415648788187125149469711213532491861248937046075996222718522704324115126361f0 ,-0.18308466657525286729680773848609379638263457954975049708494183866284386220432776f0 ,-7.2864220919698610613876940798326935208926368515022370715399462640511442447273458f-3 ,4.9870470155967931820488830916894059744198728056328087545059900897748278238141407f-2 ,-4.8868244174386309272067176761136942655656143898910274020685204636212063119576779f-3 ,-1.1067663329874328692394302924580932358903686528252386541750628802256971128485086f-2 ,3.422346846104011114882331840937679141622349918363223030054135870224315763897819f-3 ,7.3027064271433070734900533407935697302293380332736344236451943042328241926267078f-4 ,-3.75817085020390291674591289563810393435170791862443379088867595819930602228183736f-4) + p=(0.47950011f0, -0.8787826f0, 0.4393913f0, 0.14646415f0, -0.18308467f0, -0.007286422f0, 0.04987047f0, -0.0048868246f0, -0.011067663f0, 0.003422347f0, 0.00073027064f0, -0.0003758171f0) return copysign(1f0-evalpoly(xabs-0.5f0,p),x) - elseif(xabs<2.5) # range [1.25;2.5] # erfc approximation with degree 13 # Sollya::remez(erfc(x+1.25),13,[1.25-1.25;2.5-1.25],1,1e-32); - p=(7.7099871743416328126356260566547494286908999516428835479787235777958146662395595f-2 ,-0.236521122401763914017434484798130978411662107330375092318490618856395191655478216f0,0.29565140031735793699573224918347147906505950384598003349376150036910634397925794f0 ,-0.1675357298956664512006671881841788827839151996664942041368823179421003600967294f0 ,6.1585929514334837247453472606059635765901344910897544510201614098797139034234648f-3 ,4.7187118101037459805673209483303989110029907488612006887134009408435903419517292f-2 ,-2.13310233800625824496628789189780598311678632855392393834221656949343403265787497f-2 ,-3.5262544206616132931628291419365561301271536625777609043061691511329815920675659f-3 ,5.4618311186313877025711961981462734764321122794521259327727086934478142942850845f-3 ,-4.785867288141399400801105017099155449896231371492841611827465311139149075249888f-4 ,-1.27638529129849191843323454760991411930129610353375308047589690520960413915824394f-3 ,7.3386942792237055752542546076918383420248583637581325208367761430501498054660812f-4 ,-1.78316578653402766600700240536421560298308014794597944402973993319799701405750455f-4 ,1.7451623823305231302742688272669315889569143556707803172195238203096188492888552f-5) + p=(0.077099875f0, -0.23652112f0, 0.2956514f0, -0.16753574f0, 0.006158593f0, 0.04718712f0, -0.021331023f0, -0.0035262543f0, 0.005461831f0, -0.00047858673f0, -0.0012763853f0, 0.00073386944f0, -0.00017831658f0, 1.7451624f-5) return copysign(1f0-evalpoly(xabs-1.25f0,p),x) - elseif (xabs<4.0) # range [2.5;4.0] # erfc approximation with degree 13 # Sollya::remez(erfc(x+2.5),13,[0;4-2.5],1,1e-32); - p=(4.0695201730499156824979069048504659592517281936379802348048709133497117253173134f-4 ,-2.17828418992563158453260234674605666557610132217183615223667886784864793391972216f-3 ,5.4457086379161451966966115384340866958878328106709954104596733367011474611248579f-3 ,-8.3500528577413591092464067696190605106058226371648352228707616455735161644530442f-3 ,8.6220108431254600006776817786647116978577973677436276371791579294890933467761521f-3 ,-6.1151670580428489239821330412642128179703517796977571695036324539244245861906337f-3 ,2.7899458310817905157828387844634029030573127219171061924068857965376725014125688f-3 ,-5.1939500417227631511066424033635849427151215499427143347485235667556121428433924f-4 ,-3.0461048275173680879818513198875622675191891190039217228437020935156706025853604f-4 ,3.1068457426894474803882479013975662189036326925351567029013595934579911165234626f-4 ,-1.38668990773606991790025193904557899401436779771288081224199889420050208965460715f-4 ,3.6909690646602578301139502069782980693656088359691610474313757700491321631191208f-5 ,-5.6828887569336940854242289035551153938157998442490411471025066283558049904037765f-6 ,3.9297630357752347428080345871297027499036518175420666529979940028638158683656671f-7) + p=(0.00040695202f0, -0.002178284f0, 0.0054457085f0, -0.008350053f0, 0.008622011f0, -0.006115167f0, 0.0027899458f0, -0.000519395f0, -0.00030461047f0, 0.00031068458f0, -0.00013866898f0, 3.6909692f-5, -5.682889f-6, 3.929763f-7) return copysign(1f0-evalpoly(xabs-2.5f0,p),x) - else -# |x| >= 4.0. - - - r=copysign(1f0,x) - - return r - + # range [4.0,inf) + r=copysign(1f0,x) + return r end - end _erf(x::Float16)=Float16(_erf(Float32(x))) From 642c1de6a0c5bf62ce6f469794618adf37d41950 Mon Sep 17 00:00:00 2001 From: Ahmed Yasser <141057309+AhmedYKadah@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:11:26 +0300 Subject: [PATCH 12/16] Explicit NaN case for erf(Float64) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: David Müller-Widmann --- src/erf.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/erf.jl b/src/erf.jl index 1b59b24b..a2870529 100644 --- a/src/erf.jl +++ b/src/erf.jl @@ -200,11 +200,10 @@ function _erf(x::Float64) r=1.0-evalpoly(a,PF) return copysign(r,x) + elseif isnan(x) + return NaN else - if(isnan(x)) - return NaN - end - copysign(1.0,x) + return copysign(1.0,x) end end From 29113124c97574ec3002f5f587d94d449911d1d3 Mon Sep 17 00:00:00 2001 From: Ahmed Yasser <141057309+AhmedYKadah@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:12:03 +0300 Subject: [PATCH 13/16] Added NaN case to erf(Float32) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: David Müller-Widmann --- src/erf.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/erf.jl b/src/erf.jl index a2870529..0c6cc881 100644 --- a/src/erf.jl +++ b/src/erf.jl @@ -245,10 +245,11 @@ function _erf(x::Float32) p=(0.00040695202f0, -0.002178284f0, 0.0054457085f0, -0.008350053f0, 0.008622011f0, -0.006115167f0, 0.0027899458f0, -0.000519395f0, -0.00030461047f0, 0.00031068458f0, -0.00013866898f0, 3.6909692f-5, -5.682889f-6, 3.929763f-7) return copysign(1f0-evalpoly(xabs-2.5f0,p),x) + elseif isnan(x) + return NaN else # range [4.0,inf) - r=copysign(1f0,x) - return r + return copysign(1f0, x) end end From bba5f9bb48ca9e18e06abcf104fd45e28fdf198c Mon Sep 17 00:00:00 2001 From: Ahmed Yasser <141057309+AhmedYKadah@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:13:57 +0300 Subject: [PATCH 14/16] Removed unneccessary indent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: David Müller-Widmann --- src/erf.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/erf.jl b/src/erf.jl index 0c6cc881..a391c260 100644 --- a/src/erf.jl +++ b/src/erf.jl @@ -25,9 +25,9 @@ for f in (:erf, :erfc) end end - _erfc(x::Float64) = ccall((:erfc, libopenlibm), Float64, (Float64,), x) - _erfc(x::Float32) = ccall((:erfcf, libopenlibm), Float32, (Float32,), x) - _erfc(x::Float16) = Float16(_erfc(Float32(x))) +_erfc(x::Float64) = ccall((:erfc, libopenlibm), Float64, (Float64,), x) +_erfc(x::Float32) = ccall((:erfcf, libopenlibm), Float32, (Float32,), x) +_erfc(x::Float16) = Float16(_erfc(Float32(x))) for f in (:erfcx, :erfi, :dawson, :faddeeva) internalf = Symbol(:_, f) From c2a37a13df7ce199cca87b2d1d0150a2247f6e02 Mon Sep 17 00:00:00 2001 From: Ahmed Yasser <141057309+AhmedYKadah@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:14:36 +0300 Subject: [PATCH 15/16] Fixed wrong NaN type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: David Müller-Widmann --- src/erf.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/erf.jl b/src/erf.jl index a391c260..a97e9496 100644 --- a/src/erf.jl +++ b/src/erf.jl @@ -246,7 +246,7 @@ function _erf(x::Float32) return copysign(1f0-evalpoly(xabs-2.5f0,p),x) elseif isnan(x) - return NaN + return NaN32 else # range [4.0,inf) return copysign(1f0, x) From 1d45644099fd44129ec5bc48e6b7be3ad2403bb5 Mon Sep 17 00:00:00 2001 From: AhmedsElysium Date: Fri, 19 Sep 2025 12:42:38 +0300 Subject: [PATCH 16/16] added test for NaN for erf --- test/erf.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/erf.jl b/test/erf.jl index 70f1bff0..2aa01387 100644 --- a/test/erf.jl +++ b/test/erf.jl @@ -20,6 +20,8 @@ @test erf(T(-4.5)) ≈ T(-0.9999999998033839) rtol=2*eps(T) @test erf(T(-6)) ≈ T(-1.0) rtol=2*eps(T) + @test isnan(erf(T(NaN))) + @test @inferred(erfc(T(1))) isa T @test erfc(T(1)) ≈ T(0.15729920705028513066) rtol=2*eps(T)