pxfm/err/
erfc.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 7/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::common::dd_fmla;
30use crate::double_double::DoubleDouble;
31use crate::err::erf::{Erf, erf_accurate, erf_fast};
32use crate::exponents::{EXP_REDUCE_T0, EXP_REDUCE_T1, ldexp};
33use crate::rounding::CpuRoundTiesEven;
34use std::hint::black_box;
35
36static ASYMPTOTIC_POLY: [[u64; 13]; 6] = [
37    [
38        0x3fe20dd750429b6d,
39        0x3c61a1feb75a48a8,
40        0xbfd20dd750429b6c,
41        0x3fdb14c2f863e403,
42        0xbff0ecf9db3af35d,
43        0x400d9eb53ca6eeed,
44        0xc030a945830d95c8,
45        0x4056e8a963e2f1f5,
46        0xc0829b7ccc8f396f,
47        0x40b15e716e83c27e,
48        0xc0e1cfdcfbcaf22a,
49        0x4111986cc7a7e8fe,
50        0xc1371f7540590a91,
51    ],
52    [
53        0x3fe20dd750429ae7,
54        0x3c863da89e801fd4,
55        0xbfd20dd750400795,
56        0x3fdb14c2f57c490c,
57        0xbff0ecf95c8c9014,
58        0x400d9e981f2321ef,
59        0xc030a81482de1506,
60        0x4056d662420a604b,
61        0xc08233c96fff7772,
62        0x40af5d62018d3e37,
63        0xc0d9ae55e9554450,
64        0x410052901e10d139,
65        0xc1166465df1385f0,
66    ],
67    [
68        0x3fe20dd75041e3fc,
69        0xbc7c9b491c4920fc,
70        0xbfd20dd74e5f1526,
71        0x3fdb14c1d35a40e0,
72        0xbff0ecdecd30e86b,
73        0x400d9b4e7f725263,
74        0xc030958b5ca8fb39,
75        0x40563e3179bf609c,
76        0xc0806bbd1cd2d0fd,
77        0x40a7b66eb6d1d2f2,
78        0xc0cce5a4b1afab75,
79        0x40e8b5c6ae6f773c,
80        0xc0f5475860326f86,
81    ],
82    [
83        0x3fe20dd75025cfe9,
84        0x3c55a92eef32fb20,
85        0xbfd20dd71eb9d4e7,
86        0x3fdb14af4c25db28,
87        0xbff0ebc78a22b3d8,
88        0x400d85287a0b3399,
89        0xc03045f751e5ca1d,
90        0x4054a0d87ddea589,
91        0xc07ac6a0981d1eee,
92        0x409f44822f567956,
93        0xc0bcba372de71349,
94        0x40d1a4a19f550ca4,
95        0xc0d52a580455ed79,
96    ],
97    [
98        0x3fe20dd74eb31d84,
99        0xbc439c4054b7c090,
100        0xbfd20dd561af98c4,
101        0x3fdb1435165d9df1,
102        0xbff0e6b60308e940,
103        0x400d3ce30c140882,
104        0xc02f2083e404c299,
105        0x40520f113d89b42a,
106        0xc0741433ebd89f19,
107        0x4092f35b6a3154f6,
108        0xc0ab020a4313cf3b,
109        0x40b90f07e92da7ee,
110        0xc0b6565e1d7665c3,
111    ],
112    [
113        0x3fe20dd744b3517b,
114        0xbc6f77ab25e01ab4,
115        0xbfd20dcc62ec4024,
116        0x3fdb125bfa4f66c1,
117        0xbff0d80e65381970,
118        0x400ca11fbcfa65b2,
119        0xc02cd9eaffb88315,
120        0x404e010db42e0da7,
121        0xc06c5c85250ef6a3,
122        0x4085e118d9c1eeaf,
123        0xc098d74be13d3d30,
124        0x40a211b1b2b5ac83,
125        0xc09900be759fc663,
126    ],
127];
128
129static ASYMPTOTIC_POLY_ACCURATE: [[u64; 30]; 10] = [
130    [
131        0x3fe20dd750429b6d,
132        0x3c61ae3a912b08f0,
133        0xbfd20dd750429b6d,
134        0xbc51ae34c0606d68,
135        0x3fdb14c2f863e924,
136        0xbc796c0f4c848fc8,
137        0xbff0ecf9db3e71b6,
138        0x3c645d756bd288b0,
139        0x400d9eb53fad4672,
140        0xbcac61629de9adf2,
141        0xc030a945f3d147ea,
142        0x3cb8fec5ad7ece20,
143        0x4056e8c02f27ca6d,
144        0xc0829d1c21c363e0,
145        0x40b17349b70be627,
146        0xc0e28a6bb4686182,
147        0x411602d1662523ca,
148        0xc14ccae7625c4111,
149        0x4184237d064f6e0d,
150        0xc1bb1e5466ca3a2f,
151        0x41e90ae06a0f6cc1,
152        0,
153        0,
154        0,
155        0,
156        0,
157        0,
158        0,
159        0,
160        0,
161    ],
162    [
163        0x3fe20dd750429b6d,
164        0x3c61adaa62435c10,
165        0xbfd20dd750429b6d,
166        0xbc441516126827c8,
167        0x3fdb14c2f863e90b,
168        0x3c7a535780ba5ed4,
169        0xbff0ecf9db3e65d6,
170        0xbc9089edde27ad07,
171        0x400d9eb53fa52f20,
172        0xbcabc9737e9464ac,
173        0xc030a945f2cd7621,
174        0xbcc589f28b700332,
175        0x4056e8bffd7e194e,
176        0xc0829d18716876e2,
177        0x40b17312abe18250,
178        0xc0e287e73592805c,
179        0x4115ebf7394a39c1,
180        0xc14c2f14d46d0cf9,
181        0x4182af3d256f955e,
182        0xc1b7041659ebd7aa,
183        0x41e6039c232e2f71,
184        0xc2070ca15c5a07cb,
185        0,
186        0,
187        0,
188        0,
189        0,
190        0,
191        0,
192        0,
193    ],
194    [
195        0x3fe20dd750429b6d,
196        0x3c5d3c35b5d37410,
197        0xbfd20dd750429b56,
198        0xbc7c028415f6f81b,
199        0x3fdb14c2f863c1cf,
200        0x3c51bb0de6470dbc,
201        0xbff0ecf9db33c363,
202        0x3c80f8068459eb16,
203        0x400d9eb53b9ce57b,
204        0x3ca20cce33e7d84a,
205        0xc030a945aa2ec4fa,
206        0xbcdf6e0fcd7c6030,
207        0x4056e8b824d2bfaa,
208        0xc0829cc372a6d0b0,
209        0x40b1703a99ddd429,
210        0xc0e2749f9a267cc6,
211        0x4115856a17271849,
212        0xc14a8bcb4ba9753f,
213        0x418035dcce882940,
214        0xc1b1e5d8c5e6e043,
215        0x41dfe3b4f365386e,
216        0xc20398fdef2b98fe,
217        0x42184234d4f4ea12,
218        0,
219        0,
220        0,
221        0,
222        0,
223        0,
224        0,
225    ],
226    [
227        0x3fe20dd750429b6a,
228        0x3c8ae622b765e9fd,
229        0xbfd20dd750428f0e,
230        0x3c703c6c67d69513,
231        0x3fdb14c2f8563e8e,
232        0x3c6766a6bd7aa89c,
233        0xbff0ecf9d8dedd48,
234        0x3c90af52e90336e3,
235        0x400d9eb4aad086fe,
236        0x3ca640d371d54a19,
237        0xc030a93f1d01cfe0,
238        0xbcc68dbd8d9c522c,
239        0x4056e842e9fd5898,
240        0xc08299886ef1fb80,
241        0x40b15e0f0162c9a0,
242        0xc0e222dbc6b04cd8,
243        0x411460268db1ebdf,
244        0xc1474f53ce065fb3,
245        0x417961ca8553f870,
246        0xc1a8788395d13798,
247        0x41d35e37b25d0e81,
248        0xc1f707b7457c8f5e,
249        0x4211ff852df1c023,
250        0xc21b75d0ec56e2cd,
251        0,
252        0,
253        0,
254        0,
255        0,
256        0,
257    ],
258    [
259        0x3fe20dd750429a8f,
260        0xbc766d8dda59bcea,
261        0xbfd20dd7503fdbab,
262        0x3c6707bdffc2b3fe,
263        0x3fdb14c2f6526025,
264        0xbc27fa4bb9541140,
265        0xbff0ecf99c417d45,
266        0xbc9748645ef7af94,
267        0x400d9eaa9c712a7d,
268        0x3ca79e478994ebb4,
269        0xc030a8ef11fbf141,
270        0x3cbb5c72d69f8954,
271        0x4056e4653e0455b1,
272        0xc08286909448e6cf,
273        0x40b113424ce76821,
274        0xc0e1346d859e76de,
275        0x4111f9f6cf2293bf,
276        0xc14258e6e3b337db,
277        0x41714029ecd465fb,
278        0xc19c530df5337a6f,
279        0x41c34bc4bbccd336,
280        0xc1e4a37c52641688,
281        0x420019707cec2974,
282        0xc21031fe736ea169,
283        0x420f6b3003de3ddf,
284        0,
285        0,
286        0,
287        0,
288        0,
289    ],
290    [
291        0x3fe20dd75042756b,
292        0x3c84ad9178b56910,
293        0xbfd20dd74feda9e8,
294        0xbc78141c70bbc8d6,
295        0x3fdb14c2cb128467,
296        0xbc709aebaa106821,
297        0xbff0ecf603921a0b,
298        0x3c97d3cb5bceaf0b,
299        0x400d9e3e1751ca59,
300        0x3c76622ae5642670,
301        0xc030a686af57f547,
302        0x3cc083b320aff6b6,
303        0x4056cf0b6c027326,
304        0xc0823afcb69443d3,
305        0x40b03ab450d9f1b9,
306        0xc0de74cdb76bcab4,
307        0x410c671b60e607f1,
308        0xc138f1376d324ce4,
309        0x4163b64276234676,
310        0xc18aff0ce13c5a8e,
311        0x41aef20247251e87,
312        0xc1cc9f5662f721f6,
313        0x41e4687858e185e1,
314        0xc1f4fa507be073c2,
315        0x41fb99ac35ee4acc,
316        0xc1f16cb585ee3fa9,
317        0,
318        0,
319        0,
320        0,
321    ],
322    [
323        0x3fe20dd7503e730d,
324        0x3c84e524a098a467,
325        0xbfd20dd7498fa6b2,
326        0x3c260a4e27751c80,
327        0x3fdb14c061bd2a0c,
328        0x3c695a8f847d2fc2,
329        0xbff0ecd0f11b8c7d,
330        0xbc94126deea76061,
331        0x400d9b1344463548,
332        0x3cafe09a4eca9b0e,
333        0xc030996ea52a87ed,
334        0xbca924f920db26c0,
335        0x40567a2264b556b0,
336        0xc0815dfc2c86b6b5,
337        0x40accc291b62efe4,
338        0xc0d81375a78e746a,
339        0x41033a6f15546329,
340        0xc12c1e9dc1216010,
341        0x4152397ea3d43fda,
342        0xc174661e5b2ea512,
343        0x4193412367ca5d45,
344        0xc1ade56b9d7f37c4,
345        0x41c2851d9722146d,
346        0xc1d19027baf0c3fe,
347        0x41d7e7b8b6ab58ac,
348        0xc1d4c446d56aaf22,
349        0x41c1492190400505,
350        0,
351        0,
352        0,
353    ],
354    [
355        0x3fe20dd74ff10852,
356        0x3c8a32f26deff875,
357        0xbfd20dd6f06c491c,
358        0x3c770c16e1793358,
359        0x3fdb14a7d5e7fd4a,
360        0x3c7479998b54db5b,
361        0xbff0ebbdb3889c5f,
362        0xbc759b853e11369c,
363        0x400d89dd249d7ef8,
364        0xbc84b5edf0c8c314,
365        0xc0306526fb386114,
366        0xbc840d04eed7c7e0,
367        0x40557ff657e429ce,
368        0xc07ef63e90d38630,
369        0x40a6d4f34c4ea3da,
370        0xc0d04542b9e36a54,
371        0x40f577bf19097738,
372        0xc119702fe47c736d,
373        0x413a7ae12b54fdc6,
374        0xc157ca3f0f7c4fa9,
375        0x417225d983963cbf,
376        0xc1871a6eac612f9e,
377        0x4198086324225e1e,
378        0xc1a3de68670a7716,
379        0x41a91674de4dcbe9,
380        0xc1a6b44cc15b76c2,
381        0x419a36dae0f30d80,
382        0xc17cffc1747ea3dc,
383        0,
384        0,
385    ],
386    [
387        0x3fe20dd74ba8f300,
388        0xbc59dd256871d210,
389        0xbfd20dd3593675bc,
390        0x3c7ec0e7ffa91ad9,
391        0x3fdb13eef86a077a,
392        0xbc74fb5d78d411b8,
393        0xbff0e5cf52a11f3a,
394        0xbc851f36c779dc8c,
395        0x400d4417a08b39d5,
396        0x3c91be9fb5956638,
397        0xc02f91b9f6ce80c3,
398        0xbccc9c99dd42829c,
399        0x405356439f45bb43,
400        0xc078c0ca12819b48,
401        0x409efcad2ecd6671,
402        0xc0c21b0af6fc1039,
403        0x40e327d215ee30c9,
404        0xc101fabda96167b0,
405        0x411d82e4373b315d,
406        0xc134ed9e2ff591e9,
407        0x41495c85dcd8eab5,
408        0xc159f016f0a3d62a,
409        0x41660e89d918b96f,
410        0xc16e97be202cba64,
411        0x4170d8a081619793,
412        0xc16c5422b4fcfc65,
413        0x4161131a9dc6aed1,
414        0xc14a457d9dced257,
415        0x4123605e980e8b86,
416        0,
417    ],
418    [
419        0x3fe20dd7319d4d25,
420        0x3c82b02992c3b7ab,
421        0xbfd20dc29c13ab1b,
422        0xbc7d78d79b4ad767,
423        0x3fdb115a57b5ab13,
424        0xbc6aa8c45be0aa2e,
425        0xbff0d58ec437efd7,
426        0xbc5994f00a15e850,
427        0x400cb1742e229f23,
428        0xbca8000471d54399,
429        0xc02d99a5edf7b946,
430        0xbcbaf76ed7e35cde,
431        0x4050a8b71058eb28,
432        0xc072d88289da5bfc,
433        0x40943ddf24168edb,
434        0xc0b3e9dfc38b6d1a,
435        0x40d18d4df97ab3df,
436        0xc0eb550fc62dcab5,
437        0x41029cb71f116ed1,
438        0xc115fc9cc4e854e3,
439        0x41265915fd0567b1,
440        0xc1335eb5fca0e46d,
441        0x413c5261ecc0d789,
442        0xc14138932dc4eafc,
443        0x414117d4eb18facd,
444        0xc13af96163e35eca,
445        0x4130454a3a63c766,
446        0xc11c2ebc1d39b44a,
447        0x40ff3327698e0e6b,
448        0xc0d094febc3dff35,
449    ],
450];
451
452// Approximation for the fast path of exp(z) for z=zh+zl,
453// with |z| < 0.000130273 < 2^-12.88 and |zl| < 2^-42.6
454// (assuming x^y does not overflow or underflow)
455#[inline]
456fn q_1(z_dd: DoubleDouble) -> DoubleDouble {
457    const C: [u64; 5] = [
458        0x3ff0000000000000,
459        0x3ff0000000000000,
460        0x3fe0000000000000,
461        0x3fc5555555995d37,
462        0x3fa55555558489dc,
463    ];
464    let z = z_dd.to_f64();
465    let mut q = dd_fmla(f64::from_bits(C[4]), z_dd.hi, f64::from_bits(C[3]));
466
467    q = dd_fmla(q, z, f64::from_bits(C[2]));
468
469    let mut v = DoubleDouble::from_exact_add(f64::from_bits(C[1]), q * z);
470    v = DoubleDouble::quick_mult(z_dd, v);
471    DoubleDouble::f64_add(f64::from_bits(C[0]), v)
472}
473
474#[inline]
475fn exp_1(x: DoubleDouble) -> DoubleDouble {
476    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe); /* |INVLOG2-2^12/log(2)| < 2^-43.4 */
477    let k = (x.hi * INVLOG2).cpu_round_ties_even();
478
479    const LOG2_DD: DoubleDouble = DoubleDouble::new(
480        f64::from_bits(0x3bbabc9e3b39803f),
481        f64::from_bits(0x3f262e42fefa39ef),
482    );
483    let k_dd = DoubleDouble::quick_f64_mult(k, LOG2_DD);
484    let mut y_dd = DoubleDouble::from_exact_add(x.hi - k_dd.hi, x.lo);
485    y_dd.lo -= k_dd.lo;
486
487    let ki: i64 = k as i64; /* Note: k is an integer, this is just a conversion. */
488    let mi = (ki >> 12).wrapping_add(0x3ff);
489    let i2: i64 = (ki >> 6) & 0x3f;
490    let i1: i64 = ki & 0x3f;
491    let t1 = DoubleDouble::new(
492        f64::from_bits(EXP_REDUCE_T0[i2 as usize].0),
493        f64::from_bits(EXP_REDUCE_T0[i2 as usize].1),
494    );
495    let t2 = DoubleDouble::new(
496        f64::from_bits(EXP_REDUCE_T1[i1 as usize].0),
497        f64::from_bits(EXP_REDUCE_T1[i1 as usize].1),
498    );
499    let mut v = DoubleDouble::quick_mult(t2, t1);
500    let q = q_1(y_dd);
501    v = DoubleDouble::quick_mult(v, q);
502
503    let scale = f64::from_bits((mi as u64) << 52);
504    v.hi *= scale;
505    v.lo *= scale;
506    v
507}
508
509struct Exp {
510    e: i32,
511    result: DoubleDouble,
512}
513
514fn exp_accurate(x_dd: DoubleDouble) -> Exp {
515    static E2: [u64; 28] = [
516        0x3ff0000000000000,
517        0xb960000000000000,
518        0x3ff0000000000000,
519        0xb9be200000000000,
520        0x3fe0000000000000,
521        0x3a03c00000000000,
522        0x3fc5555555555555,
523        0x3c655555555c78d9,
524        0x3fa5555555555555,
525        0x3c455555545616e2,
526        0x3f81111111111111,
527        0x3c011110121fc314,
528        0x3f56c16c16c16c17,
529        0xbbef49e06ee3a56e,
530        0x3f2a01a01a01a01a,
531        0x3b6b053e1eeab9c0,
532        0x3efa01a01a01a01a,
533        0x3ec71de3a556c733,
534        0x3e927e4fb7789f66,
535        0x3e5ae64567f54abe,
536        0x3e21eed8eff8958b,
537        0x3de6124613837216,
538        0x3da93974aaf26a57,
539        0x3d6ae7f4fd6d0bd9,
540        0x3d2ae7e982620b25,
541        0x3ce94e4ca59460d8,
542        0x3ca69a2a4b7ef36d,
543        0x3c6abfe1602308c9,
544    ];
545    const LOG2INV: f64 = f64::from_bits(0x3ff71547652b82fe);
546    let k: i32 = unsafe {
547        (x_dd.hi * LOG2INV)
548            .cpu_round_ties_even()
549            .to_int_unchecked::<i32>()
550    };
551
552    const LOG2_H: f64 = f64::from_bits(0x3fe62e42fefa39ef);
553    /* we approximate LOG2Lacc ~ log(2) - LOG2H with 38 bits, so that
554    k*LOG2Lacc is exact (k has at most 11 bits) */
555    const LOG2_L: f64 = f64::from_bits(0x3c7abc9e3b398000);
556    const LOG2_TINY: f64 = f64::from_bits(0x398f97b57a079a19);
557    let yh = dd_fmla(-k as f64, LOG2_H, x_dd.hi);
558    /* since |xh+xl| >= 2.92 we have |k| >= 4;
559    (|k|-1/2)*log(2) <= |x| <= (|k|+1/2)*log(2) thus
560    1-1/(2|k|) <= |x/(k*log(2))| <= 1+1/(2|k|) thus by Sterbenz theorem
561    yh is exact too */
562    let mut t = DoubleDouble::from_full_exact_add(-k as f64 * LOG2_L, x_dd.lo);
563    let mut y_dd = DoubleDouble::from_exact_add(yh, t.hi);
564    y_dd.lo = dd_fmla(-k as f64, LOG2_TINY, y_dd.lo + t.lo);
565    /* now yh+yl approximates xh + xl - k*log(2), and we approximate p(yh+yl)
566    in h + l */
567    /* Since |xh| <= 742, we assume |xl| <= ulp(742) = 2^-43. Then since
568       |k| <= round(742/log(2)) = 1070, |yl| <= 1070*LOG2L + 2^-42 < 2^-42.7.
569       Since |yh| <= log(2)/2, the contribution of yl is negligible as long
570       as |i*p[i]*yh^(i-1)*yl| < 2^-104, which holds for i >= 16.
571       Thus for coefficients of degree 16 or more, we don't take yl into account.
572    */
573    let mut h = f64::from_bits(E2[19 + 8]); // degree 19
574    for a in (16..=18).rev() {
575        h = dd_fmla(h, y_dd.hi, f64::from_bits(E2[a + 8])); // degree i
576    }
577    /* degree 15: h*(yh+yl)+E2[15 + 8] */
578    t = DoubleDouble::from_exact_mult(h, y_dd.hi);
579    t.lo = dd_fmla(h, y_dd.lo, t.lo);
580    let mut v = DoubleDouble::from_exact_add(f64::from_bits(E2[15 + 8]), t.hi);
581    v.lo += t.lo;
582    for a in (8..=14).rev() {
583        /* degree i: (h+l)*(yh+yl)+E2[i+8] */
584        t = DoubleDouble::quick_mult(v, y_dd);
585        v = DoubleDouble::from_exact_add(f64::from_bits(E2[a + 8]), t.hi);
586        v.lo += t.lo;
587    }
588    for a in (0..=7).rev() {
589        /* degree i: (h+l)*(yh+yl)+E2[2i]+E2[2i+1] */
590        t = DoubleDouble::quick_mult(v, y_dd);
591        v = DoubleDouble::from_exact_add(f64::from_bits(E2[2 * a]), t.hi);
592        v.lo += t.lo + f64::from_bits(E2[2 * a + 1]);
593    }
594
595    Exp { e: k, result: v }
596}
597
598#[cold]
599fn erfc_asympt_accurate(x: f64) -> f64 {
600    /* subnormal exceptions */
601    if x == f64::from_bits(0x403a8f7bfbd15495) {
602        return dd_fmla(
603            f64::from_bits(0x0000000000000001),
604            -0.25,
605            f64::from_bits(0x000667bd620fd95b),
606        );
607    }
608    let u_dd = DoubleDouble::from_exact_mult(x, x);
609    let exp_result = exp_accurate(DoubleDouble::new(-u_dd.lo, -u_dd.hi));
610
611    /* compute 1/x as double-double */
612    let yh = 1.0 / x;
613    /* Newton's iteration for 1/x is y -> y + y*(1-x*y) */
614    let yl = yh * dd_fmla(-x, yh, 1.0);
615    // yh+yl approximates 1/x
616    static THRESHOLD: [u64; 10] = [
617        0x3fb4500000000000,
618        0x3fbe000000000000,
619        0x3fc3f00000000000,
620        0x3fc9500000000000,
621        0x3fcf500000000000,
622        0x3fd3100000000000,
623        0x3fd7100000000000,
624        0x3fdbc00000000000,
625        0x3fe0b00000000000,
626        0x3fe3000000000000,
627    ];
628    let mut i = 0usize;
629    while i < THRESHOLD.len() && yh > f64::from_bits(THRESHOLD[i]) {
630        i += 1;
631    }
632    let p = ASYMPTOTIC_POLY_ACCURATE[i];
633    let mut u_dd = DoubleDouble::from_exact_mult(yh, yh);
634    u_dd.lo = dd_fmla(2.0 * yh, yl, u_dd.lo);
635    /* the polynomial p has degree 29+2i, and its coefficient of largest
636    degree is p[14+6+i] */
637    let mut z_dd = DoubleDouble::new(0., f64::from_bits(p[14 + 6 + i]));
638    for a in (13..=27 + 2 * i).rev().step_by(2) {
639        /* degree j: (zh+zl)*(uh+ul)+p[(j-1)/2+6]] */
640        let v = DoubleDouble::quick_mult(z_dd, u_dd);
641        z_dd = DoubleDouble::from_full_exact_add(f64::from_bits(p[(a - 1) / 2 + 6]), v.hi);
642        z_dd.lo += v.lo;
643    }
644    for a in (1..=11).rev().step_by(2) {
645        let v = DoubleDouble::quick_mult(z_dd, u_dd);
646        z_dd = DoubleDouble::from_full_exact_add(f64::from_bits(p[a - 1]), v.hi);
647        z_dd.lo += v.lo + f64::from_bits(p[a]);
648    }
649
650    /* multiply by yh+yl */
651    u_dd = DoubleDouble::quick_mult(z_dd, DoubleDouble::new(yl, yh));
652    /* now uh+ul approximates p(1/x), i.e., erfc(x)*exp(x^2) */
653    /* now multiply (uh+ul)*(eh+el), after normalizing uh+ul to reduce the
654    number of exceptional cases */
655    u_dd = DoubleDouble::from_exact_add(u_dd.hi, u_dd.lo);
656    let v = DoubleDouble::quick_mult(u_dd, exp_result.result);
657    /* multiply by 2^e */
658    /* multiply by 2^e */
659    let mut res = ldexp(v.to_f64(), exp_result.e);
660    if res < f64::from_bits(0x0010000000000000) {
661        /* for erfc(x) in the subnormal range, we have to perform a special
662        rounding */
663        let mut corr = v.hi - ldexp(res, -exp_result.e);
664        corr += v.lo;
665        /* add corr*2^e */
666        res += ldexp(corr, exp_result.e);
667    }
668    res
669}
670
671#[cold]
672fn erfc_accurate(x: f64) -> f64 {
673    if x < 0. {
674        let mut v_dd = erf_accurate(-x);
675        let t = DoubleDouble::from_exact_add(1.0, v_dd.hi);
676        v_dd.hi = t.hi;
677        v_dd.lo += t.lo;
678        return v_dd.to_f64();
679    } else if x <= f64::from_bits(0x3ffb59ffb450828c) {
680        // erfc(x) >= 2^-6
681        let mut v_dd = erf_accurate(x);
682        let t = DoubleDouble::from_exact_add(1.0, -v_dd.hi);
683        v_dd.hi = t.hi;
684        v_dd.lo = t.lo - v_dd.lo;
685        return v_dd.to_f64();
686    }
687    // now 0x1.b59ffb450828cp+0 < x < 0x1.b39dc41e48bfdp+4
688    erfc_asympt_accurate(x)
689}
690
691/* Fast path for 0x1.713786d9c7c09p+1 < x < 0x1.b39dc41e48bfdp+4,
692using the asymptotic formula erfc(x) = exp(-x^2) * p(1/x)*/
693fn erfc_asympt_fast(x: f64) -> Erf {
694    /* for x >= 0x1.9db1bb14e15cap+4, erfc(x) < 2^-970, and we might encounter
695    underflow issues in the computation of l, thus we delegate this case
696    to the accurate path */
697    if x >= f64::from_bits(0x4039db1bb14e15ca) {
698        return Erf {
699            err: 1.0,
700            result: DoubleDouble::default(),
701        };
702    }
703
704    let mut u = DoubleDouble::from_exact_mult(x, x);
705    let e_dd = exp_1(DoubleDouble::new(-u.lo, -u.hi));
706
707    /* the assumptions from exp_1 are satisfied:
708    * a_mul ensures |ul| <= ulp(uh), thus |ul/uh| <= 2^-52
709    * since |x| < 0x1.9db1bb14e15cap+4 we have
710      |ul| < ulp(0x1.9db1bb14e15cap+4^2) = 2^-43 */
711    /* eh+el approximates exp(-x^2) with maximal relative error 2^-74.139 */
712
713    /* compute 1/x as double-double */
714    let yh = 1.0 / x;
715    /* Assume 1 <= x < 2, then 0.5 <= yh <= 1,
716    and yh = 1/x + eps with |eps| <= 2^-53. */
717    /* Newton's iteration for 1/x is y -> y + y*(1-x*y) */
718    let yl = yh * dd_fmla(-x, yh, 1.0);
719    /* x*yh-1 = x*(1/x+eps)-1 = x*eps
720       with |x*eps| <= 2^-52, thus the error on the FMA is bounded by
721       ulp(2^-52.1) = 2^-105.
722       Now |yl| <= |yh| * 2^-52 <= 2^-52, thus the rounding error on
723       yh * __builtin_fma (-x, yh, 1.0) is bounded also by ulp(2^-52.1) = 2^-105.
724       From [6], Lemma 3.7, if yl was computed exactly, then yh+yl would differ
725       from 1/x by at most yh^2/theta^3*(1/x-yh)^2 for some theta in [yh,1/x]
726       or [1/x,yh].
727       Since yh, 1/x <= 1, this gives eps^2 <= 2^-106.
728       Adding the rounding errors, we have:
729       |yh + yl - 1/x| <= 2^-105 + 2^-105 + 2^-106 < 2^-103.67.
730       For the relative error, since |yh| >= 1/2, this gives:
731       |yh + yl - 1/x| < 2^-102.67 * |yh+yl|
732    */
733    const THRESHOLD: [u64; 6] = [
734        0x3fbd500000000000,
735        0x3fc59da6ca291ba6,
736        0x3fcbc00000000000,
737        0x3fd0c00000000000,
738        0x3fd3800000000000,
739        0x3fd6300000000000,
740    ];
741    let mut i = 0usize;
742    while i < THRESHOLD.len() && yh > f64::from_bits(THRESHOLD[i]) {
743        i += 1;
744    }
745    let p = ASYMPTOTIC_POLY[i];
746    u = DoubleDouble::from_exact_mult(yh, yh);
747    /* Since |yh| <= 1, we have |uh| <= 1 and |ul| <= 2^-53. */
748    u.lo = dd_fmla(2.0 * yh, yl, u.lo);
749    /* uh+ul approximates (yh+yl)^2, with absolute error bounded by
750       ulp(ul) + yl^2, where ulp(ul) is the maximal rounding error in
751       the FMA, and yl^2 is the neglected term.
752       Since |ul| <= 2^-53, ulp(ul) <= 2^-105, and since |yl| <= 2^-52,
753       this yields |uh + ul - yh^2| <= 2^-105 + 2^-104 < 2^-103.41.
754       For the relative error, since |(yh+yl)^2| >= 1/4:
755       |uh + ul - yh^2| < 2^-101.41 * |uh+ul|.
756       And relatively to 1/x^2:
757       yh + yl = 1/x * (1 + eps1)       with |eps1| < 2^-102.67
758       uh + ul = (yh+yl)^2 * (1 + eps2) with |eps2| < 2^-101.41
759       This yields:
760       |uh + ul - 1/x| < 2^-100.90 * |uh+ul|.
761    */
762
763    /* evaluate p(uh+ul) */
764    let mut zh: f64 = f64::from_bits(p[12]); // degree 23
765    zh = dd_fmla(zh, u.hi, f64::from_bits(p[11])); // degree 21
766    zh = dd_fmla(zh, u.hi, f64::from_bits(p[10])); // degree 19
767
768    /* degree 17: zh*(uh+ul)+p[i] */
769    let mut v = DoubleDouble::quick_f64_mult(zh, u);
770    let mut z_dd = DoubleDouble::from_exact_add(f64::from_bits(p[9]), v.hi);
771    z_dd.lo += v.lo;
772
773    for a in (3..=15).rev().step_by(2) {
774        v = DoubleDouble::quick_mult(z_dd, u);
775        z_dd = DoubleDouble::from_exact_add(f64::from_bits(p[((a + 1) / 2) as usize]), v.hi);
776        z_dd.lo += v.lo;
777    }
778
779    /* degree 1: (zh+zl)*(uh+ul)+p[0]+p[1] */
780    v = DoubleDouble::quick_mult(z_dd, u);
781    z_dd = DoubleDouble::from_exact_add(f64::from_bits(p[0]), v.hi);
782    z_dd.lo += v.lo + f64::from_bits(p[1]);
783    /* multiply by yh+yl */
784    u = DoubleDouble::quick_mult(z_dd, DoubleDouble::new(yl, yh));
785    /* now uh+ul approximates p(1/x) */
786    /* now multiply (uh+ul)*(eh+el) */
787    v = DoubleDouble::quick_mult(u, e_dd);
788
789    /* Write y = 1/x.  We have the following errors:
790       * the maximal mathematical error is:
791         |erfc(x)*exp(x^2) - p(y)| < 2^-71.804 * |p(y)| (for i=3) thus
792         |erfc(x) - exp(-x^2)*p(y)| < 2^-71.804 * |exp(-x^2)*p(y)|
793       * the error in approximating exp(-x^2) by eh+el:
794         |eh + el - exp(-x^2)| < 2^-74.139 * |eh + el|
795       * the fact that we evaluate p on yh+yl instead of 1/x
796         this error is bounded by |p'| * |yh+yl - 1/x|, where
797         |yh+yl - 1/x| < 2^-102.67 * |yh+yl|, and the relative
798         error is bounded by |p'/p| * |yh+yl - 1/x|.
799         Since the maximal value of |p'/p| is bounded by 27.2 (for i=0),
800         this yields 27.2 * 2^-102.67 < 2^-97.9
801       * the rounding errors when evaluating p on yh+yl: this error is bounded
802         (relatively) by 2^-67.184 (for i=5), see analyze_erfc_asympt_fast()
803         in erfc.sage
804       * the rounding error in (uh+ul)*(eh+el): we assume this error is bounded
805         by 2^-80 (relatively)
806       This yields a global relative bound of:
807       (1+2^-71.804)*(1+2^-74.139)*(1+2^-97.9)*(1+2^-67.184)*(1+2^-80)-1
808       < 2^-67.115
809    */
810    if v.hi >= f64::from_bits(0x044151b9a3fdd5c9) {
811        Erf {
812            err: f64::from_bits(0x3bbd900000000000) * v.hi,
813            result: v,
814        } /* 2^-67.115 < 0x1.d9p-68 */
815    } else {
816        Erf {
817            result: v,
818            err: f64::from_bits(0x0010000000000000),
819        } // this overestimates 0x1.d9p-68 * h
820    }
821}
822
823#[inline]
824fn erfc_fast(x: f64) -> Erf {
825    if x < 0.
826    // erfc(x) = 1 - erf(x) = 1 + erf(-x)
827    {
828        let res = erf_fast(-x);
829        /* h+l approximates erf(-x), with relative error bounded by err,
830        where err <= 0x1.78p-69 */
831        let err = res.err * res.result.hi; /* convert into absolute error */
832        let mut t = DoubleDouble::from_exact_add(1.0, res.result.hi);
833        t.lo += res.result.lo;
834        // since h <= 2, the fast_two_sum() error is bounded by 2^-105*h <= 2^-104
835        /* After the fast_two_sum() call, we have |t| <= ulp(h) <= ulp(2) = 2^-51
836        thus assuming |l| <= 2^-51 after the cr_erf_fast() call,
837        we have |t| <= 2^-50 here, thus the rounding
838        error on t -= *l is bounded by ulp(2^-50) = 2^-102.
839        The absolute error is thus bounded by err + 2^-104 + 2^-102
840        = err + 0x1.4p-102.
841        The maximal value of err here is for |x| < 0.0625, where cr_erf_fast()
842        returns 0x1.78p-69, and h=1/2, yielding err = 0x1.78p-70 here.
843        Adding 0x1.4p-102 is thus exact. */
844        return Erf {
845            err: err + f64::from_bits(0x3994000000000000),
846            result: t,
847        };
848    } else if x <= f64::from_bits(0x400713786d9c7c09) {
849        let res = erf_fast(x);
850        /* h+l approximates erf(x), with relative error bounded by err,
851        where err <= 0x1.78p-69 */
852        let err = res.err * res.result.hi; /* convert into absolute error */
853        let mut t = DoubleDouble::from_exact_add(1.0, -res.result.hi);
854        t.lo -= res.result.lo;
855        /* for x >= 0x1.e861fbb24c00ap-2, erf(x) >= 1/2, thus 1-h is exact
856        by Sterbenz theorem, thus t = 0 in fast_two_sum(), and we have t = -l
857        here, thus the absolute error is err */
858        if x >= f64::from_bits(0x3fde861fbb24c00a) {
859            return Erf { err, result: t };
860        }
861        /* for x < 0x1.e861fbb24c00ap-2, the error in fast_two_sum() is bounded
862        by 2^-105*h, and since h <= 1/2, this yields 2^-106.
863        After the fast_two_sum() call, we have |t| <= ulp(h) <= ulp(1/2) = 2^-53
864        thus assuming |l| <= 2^-53 after the cr_erf_fast() call,
865        we have |t| <= 2^-52 here, thus the rounding
866        error on t -= *l is bounded by ulp(2^-52) = 2^-104.
867        The absolute error is thus bounded by err + 2^-106 + 2^-104
868        The maximal value of err here is for x < 0.0625, where cr_erf_fast()
869        returns 0x1.78p-69, and h=1/2, yielding err = 0x1.78p-70 here.
870        Adding 0x1.4p-104 is thus exact. */
871        return Erf {
872            err: err + f64::from_bits(0x3974000000000000),
873            result: t,
874        };
875    }
876    /* Now THRESHOLD1 < x < 0x1.b39dc41e48bfdp+4 thus erfc(x) < 0.000046. */
877    /* on a i7-8700 with gcc 12.2.0, for x in [THRESHOLD1,+5.0],
878    the average reciprocal throughput is about 111 cycles
879    (among which 20 cycles for exp_1) */
880    erfc_asympt_fast(x)
881}
882
883/// Complementary error function
884///
885/// Max ulp 0.5
886pub fn f_erfc(x: f64) -> f64 {
887    let t: u64 = x.to_bits();
888    let at: u64 = t & 0x7fff_ffff_ffff_ffff;
889
890    if t >= 0x8000000000000000u64
891    // x = -NaN or x <= 0 (excluding +0)
892    {
893        // for x <= -0x1.7744f8f74e94bp2, erfc(x) rounds to 2 (to nearest)
894        if t >= 0xc017744f8f74e94bu64
895        // x = NaN or x <= -0x1.7744f8f74e94bp2
896        {
897            if t >= 0xfff0000000000000u64 {
898                // -Inf or NaN
899                if t == 0xfff0000000000000u64 {
900                    return 2.0;
901                } // -Inf
902                return x + x; // NaN
903            }
904            return black_box(2.0) - black_box(f64::from_bits(0x3c90000000000000)); // rounds to 2 or below(2)
905        }
906
907        // for -9.8390953768041405e-17 <= x <= 0, erfc(x) rounds to 1 (to nearest)
908        if f64::from_bits(0xbc9c5bf891b4ef6a) <= x {
909            return dd_fmla(-x, f64::from_bits(0x3c90000000000000), 1.0);
910        }
911    } else
912    // x = +NaN or x >= 0 (excluding -0)
913    {
914        // for x >= 0x1.b39dc41e48bfdp+4, erfc(x) < 2^-1075: rounds to 0 or 2^-1074
915        if at >= 0x403b39dc41e48bfdu64
916        // x = NaN or x >= 0x1.b39dc41e48bfdp+4
917        {
918            if at >= 0x7ff0000000000000u64 {
919                // +Inf or NaN
920                if at == 0x7ff0000000000000u64 {
921                    return 0.0;
922                } // +Inf
923                return x + x; // NaN
924            }
925            return black_box(f64::from_bits(0x0000000000000001)) * black_box(0.25); // 0 or 2^-1074 wrt rounding
926        }
927
928        // for 0 <= x <= 0x1.c5bf891b4ef6ap-55, erfc(x) rounds to 1 (to nearest)
929        if x <= f64::from_bits(0x3c8c5bf891b4ef6a) {
930            return dd_fmla(-x, f64::from_bits(0x3c90000000000000), 1.0);
931        }
932    }
933
934    /* now -0x1.7744f8f74e94bp+2 < x < -0x1.c5bf891b4ef6ap-54
935    or 0x1.c5bf891b4ef6ap-55 < x < 0x1.b39dc41e48bfdp+4 */
936
937    let result = erfc_fast(x);
938
939    let left = result.result.hi + (result.result.lo - result.err);
940    let right = result.result.hi + (result.result.lo + result.err);
941    if left == right {
942        return left;
943    }
944    erfc_accurate(x)
945}
946
947#[cfg(test)]
948mod tests {
949    use super::*;
950    #[test]
951    fn test_erfc() {
952        assert_eq!(f_erfc(1.0), 0.15729920705028513);
953        assert_eq!(f_erfc(0.5), 0.4795001221869535);
954        assert_eq!(f_erfc(0.000000005), 0.9999999943581042);
955        assert_eq!(f_erfc(-0.00000000000065465465423305), 1.0000000000007387);
956        assert!(f_erfc(f64::NAN).is_nan());
957        assert_eq!(f_erfc(f64::INFINITY), 0.0);
958        assert_eq!(f_erfc(f64::NEG_INFINITY), 2.0);
959    }
960}
pxfm/err/erfc.rs

pxfm/err/
erfc.rs