pxfm/pow_exec.rs
1/*
2 * // Copyright (c) Radzivon Bartoshyk 7/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1. Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2. Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3. Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::common::dd_fmla;
30use crate::double_double::DoubleDouble;
31use crate::dyadic_float::{DyadicFloat128, DyadicSign};
32use crate::exponents::{EXP_REDUCE_T0, EXP_REDUCE_T1, ldexp};
33use crate::exponents::{EXPM1_T0, EXPM1_T1};
34use crate::polyeval::{f_polyeval6, f_polyeval8};
35use crate::pow_tables::{EXP_T1_2_DYADIC, EXP_T2_2_DYADIC, POW_INVERSE, POW_LOG_INV};
36use crate::round::RoundFinite;
37use crate::round_ties_even::RoundTiesEven;
38
39#[inline(always)]
40pub(crate) fn log_poly_1(z: f64) -> DoubleDouble {
41 /* The following is a degree-8 polynomial generated by Sollya for
42 log(1+x)-x+x^2/2 over [-0.0040283203125,0.0040283203125]
43 with absolute error < 2^-81.63
44 and relative error < 2^-72.423 (see sollya/P_1.sollya).
45 The relative error is for x - x^2/2 + P(x) with respect to log(1+x). */
46 const P_1: [u64; 6] = [
47 0x3fd5555555555558,
48 0xbfd0000000000003,
49 0x3fc999999981f535,
50 0xbfc55555553d1eb4,
51 0x3fc2494526fd4a06,
52 0xbfc0001f0c80e8ce,
53 ];
54 let w = DoubleDouble::from_exact_mult(z, z);
55 let t = dd_fmla(f64::from_bits(P_1[5]), z, f64::from_bits(P_1[4]));
56 let mut u = dd_fmla(f64::from_bits(P_1[3]), z, f64::from_bits(P_1[2]));
57 let mut v = dd_fmla(f64::from_bits(P_1[1]), z, f64::from_bits(P_1[0]));
58 u = dd_fmla(t, w.hi, u);
59 v = dd_fmla(u, w.hi, v);
60 u = v * w.hi;
61 DoubleDouble::new(dd_fmla(u, z, -0.5 * w.lo), -0.5 * w.hi)
62}
63
/* Given 2^-1074 <= x <= 0x1.fffffffffffffp+1023, this routine puts in h+l
   an approximation of log(x) such that |l| < 2^-23.89*|h| and

   | h + l - log(x) | <= elog * |log x|

   with elog = 2^-73.527 if x < 1/sqrt(2) or sqrt(2) < x,
   and elog = 2^-67.0544 if 1/sqrt(2) < x < sqrt(2)
   (note that x cannot equal 1/sqrt(2) nor sqrt(2)).

   The returned bool is true only when the unbiased exponent is 0 (i.e. x is
   close to 1) and the low word is relatively large (|lo| > 2^-24*|hi|), in
   which case the pair has been renormalized with an exact add.
   NOTE(review): the precise meaning of this flag for the caller (likely
   "result may need the accurate path") should be confirmed at call sites.
*/
#[inline]
pub(crate) fn pow_log_1(x: f64) -> (DoubleDouble, bool) {
    /* for 181 <= i <= 362, r[i] = _INVERSE[i-181] is a 9-bit approximation of
       1/x[i], where i*2^-8 <= x[i] < (i+1)*2^-8.
       More precisely r[i] is a 9-bit value such that r[i]*y-1 is representable
       exactly on 53 bits for for any y, i*2^-8 <= y < (i+1)*2^-8.
       Moreover |r[i]*y-1| < 0.0040283203125.
       Table generated with the accompanying pow.sage file,
       with l=inverse_centered(k=8,prec=9,maxbits=53,verbose=false) */
    let x_u = x.to_bits();
    // Split x into its 52-bit mantissa field and biased exponent field.
    let mut m = x_u & 0xfffffffffffff;
    let mut e: i64 = ((x_u >> 52) & 0x7ff) as i64;

    let t;
    if e != 0 {
        // Normal number: set the implicit leading bit and unbias the exponent.
        t = m | (0x3ffu64 << 52);
        m = m.wrapping_add(1u64 << 52);
        e -= 0x3ff;
    } else {
        /* x is a subnormal double */
        // Normalize: shift the mantissa so its leading 1 lands in bit 52,
        // adjusting the exponent accordingly.
        let k = m.leading_zeros() - 11;

        e = -0x3fei64 - k as i64;
        m = m.wrapping_shl(k);
        t = m | (0x3ffu64 << 52);
    }

    /* now |x| = 2^_e*_t = 2^(_e-52)*m with 1 <= _t < 2,
       and 2^52 <= _m < 2^53 */

    // log(x) = log(t) + E * log(2)
    let mut t = f64::from_bits(t);

    // If m > sqrt(2) we divide it by 2 so ensure 1/sqrt(2) < t < sqrt(2)
    // (0x16a09e667f3bcd is the 53-bit mantissa of sqrt(2)*2^52).
    let c: usize = (m >= 0x16a09e667f3bcd) as usize;
    static CY: [f64; 2] = [1.0, 0.5];
    static CM: [u64; 2] = [44, 45];

    e = e.wrapping_add(c as i64);
    let be = e;
    let i = m >> CM[c]; /* i/2^8 <= t < (i+1)/2^8 */
    /* when c=1, we have 0x16a09e667f3bcd <= m < 2^53, thus 90 <= i <= 127;
       when c=0, we have 2^52 <= m < 0x16a09e667f3bcd, thus 128 <= i <= 181 */
    t *= CY[c];
    /* now 0x1.6a09e667f3bcdp-1 <= t < 0x1.6a09e667f3bcdp+0,
       and log(x) = E * log(2) + log(t) */

    // r ~ 1/t (9-bit), and (l2, l1) is a double-double for -log(r).
    let r = f64::from_bits(POW_INVERSE[(i - 181) as usize]);
    let l1 = f64::from_bits(POW_LOG_INV[(i - 181) as usize].1);
    let l2 = f64::from_bits(POW_LOG_INV[(i - 181) as usize].0);

    // z = r*t - 1 is exact (see table comment above), |z| < 0.0040283203125.
    let z = dd_fmla(r, t, -1.0);

    // log(2) as a double-double.
    const LOG2_DD: DoubleDouble = DoubleDouble::new(
        f64::from_bits(0x3d2ef35793c76730),
        f64::from_bits(0x3fe62e42fefa3800),
    );

    // th + tl ~ be*log(2) + log(1/r); both FMAs are exact-ish by construction
    // of the table split.
    let th = dd_fmla(be as f64, LOG2_DD.hi, l1);
    let tl = dd_fmla(be as f64, LOG2_DD.lo, l2);

    // Accumulate z (the linear term of log(1+z)) and then the polynomial
    // correction log(1+z) - z + z^2/2 from log_poly_1.
    let mut v = DoubleDouble::f64_add(th, DoubleDouble::new(tl, z));
    let p = log_poly_1(z);
    v = DoubleDouble::f64_add(v.hi, DoubleDouble::new(v.lo + p.lo, p.hi));

    // Near x = 1 (e == 0) the leading terms cancel; if the low word is
    // relatively large (> 2^-24 * |hi|), renormalize and flag the caller.
    if e == 0 && v.lo.abs() > (v.hi.abs()) * f64::from_bits(0x3e70000000000000) {
        v = DoubleDouble::from_exact_add(v.hi, v.lo);
        return (v, true);
    }

    (v, false)
}
145
146/* Given z such that |z| < 2^-12.905,
147 this routine puts in qh+ql an approximation of exp(z) such that
148
149 | (qh+ql) / exp(z) - 1 | < 2^-64.902632
150
151 and |ql| <= 2^-51.999.
152*/
153#[inline(always)]
154fn exp_poly_1(z: f64) -> DoubleDouble {
155 /* The following is a degree-4 polynomial generated by Sollya for exp(x)
156 over [-2^-12.905,2^-12.905]
157 with absolute error < 2^-74.34 (see sollya/Q_1.sollya). */
158 const Q_1: [u64; 5] = [
159 0x3ff0000000000000,
160 0x3ff0000000000000,
161 0x3fe0000000000000,
162 0x3fc5555555997996,
163 0x3fa5555555849d8d,
164 ];
165 let mut q = dd_fmla(f64::from_bits(Q_1[4]), z, f64::from_bits(Q_1[3]));
166 q = dd_fmla(q, z, f64::from_bits(Q_1[2]));
167 let h0 = dd_fmla(q, z, f64::from_bits(Q_1[1]));
168
169 let v1 = DoubleDouble::from_exact_mult(z, h0);
170 DoubleDouble::f64_add(f64::from_bits(Q_1[0]), v1)
171}
172
173/* Given z such that |z| < 2^-12.905,
174 this routine puts in qh+ql an approximation of exp(z) such that
175
176 | (qh+ql) / exp(z) - 1 | < 2^-64.902632
177
178 and |ql| <= 2^-51.999.
179*/
180
181// #[inline(always)]
182// fn exp_poly_dd(z: DoubleDouble) -> DoubleDouble {
183// /* The following is a degree-4 polynomial generated by Sollya for exp(x)
184// over [-2^-12.905,2^-12.905] */
185// const Q_1: [(u64, u64); 7] = [
186// (0x0000000000000000, 0x3ff0000000000000),
187// (0x3a20e40000000000, 0x3ff0000000000000),
188// (0x3a04820000000000, 0x3fe0000000000000),
189// (0xbc756423c5338a66, 0x3fc5555555555556),
190// (0xbc5560f74db5556c, 0x3fa5555555555556),
191// (0x3c3648eca89bc6ac, 0x3f8111111144fbee),
192// (0xbbd53d924ae90c8c, 0x3f56c16c16ffeecc),
193// ];
194// let mut p = DoubleDouble::mult(z, DoubleDouble::from_bit_pair(Q_1[6]));
195// p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[5]));
196// p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[4]));
197// p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[3]));
198// p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[2]));
199// p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[1]));
200// p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[0]));
201// p
202// }
203
/// Computes s * exp(r.hi + r.lo) as a double-double, where s is +1 or -1
/// carrying the sign of the final pow result. Out-of-range inputs return
/// saturated values (+-DBL_MAX pair) or a NaN pair that tells the caller to
/// fall through to the slower accurate phase.
#[inline]
pub(crate) fn pow_exp_1(r: DoubleDouble, s: f64) -> DoubleDouble {
    const RHO0: f64 = f64::from_bits(0xc0874910ee4e8a27);
    // #define RHO1 -0x1.577453f1799a6p+9
    /* We increase the initial value of RHO1 to avoid spurious underflow in
       the result value el. However, it is not possible to obtain a lower
       bound on |el| from the input value rh, thus this modified value of RHO1
       is obtained experimentally. */
    const RHO1: f64 = f64::from_bits(0xc08483b8cca421af);
    const RHO2: f64 = f64::from_bits(0x40862e42e709a95b);
    const RHO3: f64 = f64::from_bits(0x40862e4316ea5df9);

    // use !(rh <= RHO2) instead of rh < RHO2 to catch rh = NaN too
    #[allow(clippy::neg_cmp_op_on_partial_ord)]
    if !(r.hi <= RHO2) {
        return if r.hi > RHO3 {
            /* If rh > RHO3, we are sure there is overflow,
               For s=1 we return eh = el = DBL_MAX, which yields
               res_min = res_max = +Inf for rounding up or to nearest,
               and res_min = res_max = DBL_MAX for rounding down or toward zero,
               which will yield the correct rounding.
               For s=-1 we return eh = el = -DBL_MAX, which similarly gives
               res_min = res_max = -Inf or res_min = res_max = -DBL_MAX,
               which is the correct rounding. */
            DoubleDouble::new(
                f64::from_bits(0x7fefffffffffffff) * s,
                f64::from_bits(0x7fefffffffffffff) * s,
            )
        } else {
            /* If RHO2 < rh <= RHO3, we are in the intermediate region
               where there might be overflow or not, thus we set eh = el = NaN,
               which will set res_min = res_max = NaN, the comparison
               res_min == res_max will fail: we defer to the 2nd phase. */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }

    if r.hi < RHO1 {
        return if r.hi < RHO0 {
            /* For s=1, we have eh=el=+0 except for rounding up,
               thus res_min=+0 or -0, res_max=+0 in the main code,
               the rounding test succeeds, and we return res_max which is the
               expected result in the underflow case.
               For s=1 and rounding up, we have eh=+0, el=2^-1074,
               thus res_min = res_max = 2^-1074, which is the expected result too.
               For s=-1, we have eh=el=-0 except for rounding down,
               thus res_min=-0 or +0, res_max=-0 in the main code,
               the rounding test succeeds, and we return res_max which is the
               expected result in the underflow case.
               For s=-1 and rounding down, we have eh=-0, el=-2^-1074,
               thus res_min = res_max = -2^-1074, which is the expected result too.
            */
            DoubleDouble::new(f64::from_bits(0x0000000000000001) * (0.5 * s), 0.0 * s)
        } else {
            /* RHO0 <= rh < RHO1 or s < 0: we defer to the 2nd phase */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }
    // INVLOG2 ~ 2^12/log(2): argument reduction is done with a step of
    // log(2)/2^12 (12 = 6 + 6 bits of table index plus the power of two).
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).round_finite();

    // LOG2H + LOG2L ~ log(2)/2^12, split for exact FMA cancellation.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    // Reduced argument: zh + zl ~ r - k*log(2)/2^12.
    let zh = dd_fmla(LOG2H, -k, r.hi);
    let zl = dd_fmla(LOG2L, -k, r.lo);

    // SAFETY: r.hi passed the RHO1..RHO2 range checks above and k was produced
    // by round_finite, so k is a finite integral value of small magnitude that
    // fits in an i64.
    let bk = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    // Split k = 2^12*(mk-0x3ff) + 64*i2 + i1 for the two-level table lookup.
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    let t0 = DoubleDouble::from_bit_pair(EXP_REDUCE_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXP_REDUCE_T1[i1 as usize]);
    let mut de = DoubleDouble::quick_mult(t1, t0);
    let q = exp_poly_1(zh + zl);
    de = DoubleDouble::quick_mult(de, q);
    /* we should have 1 < M < 2047 here, since we filtered out
       potential underflow/overflow cases at the beginning of this function */

    // Scale by s * 2^(mk-0x3ff) by building the power of two directly.
    let mut du = (mk as u64).wrapping_shl(52);
    du = (f64::from_bits(du) * s).to_bits();
    de.hi *= f64::from_bits(du);
    de.lo *= f64::from_bits(du);
    de
}
291
/// Fast exp of a double-double argument r, returning a double-double.
/// Same reduction scheme as `pow_exp_1`, but with no overflow/underflow
/// filtering: the caller is responsible for keeping the result exponent in
/// the normal range (see NOTE below).
#[inline]
pub(crate) fn exp_dd_fast(r: DoubleDouble) -> DoubleDouble {
    // INVLOG2 ~ 2^12/log(2); reduction step is log(2)/2^12.
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).round_finite();

    // LOG2H + LOG2L ~ log(2)/2^12.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    // Reduced argument z ~ r - k*log(2)/2^12, renormalized so |z.lo| is
    // below ulp(z.hi).
    let mut z = DoubleDouble::mul_f64_add(DoubleDouble::new(LOG2L, LOG2H), -k, r);
    z = DoubleDouble::from_exact_add(z.hi, z.lo);

    let bk = k as i64; /* Note: k is an integer, this is just a conversion. */
    // Split k = 2^12*(mk-0x3ff) + 64*i2 + i1 for the two-level table lookup.
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    let t0 = DoubleDouble::from_bit_pair(EXP_REDUCE_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXP_REDUCE_T1[i1 as usize]);
    let mut de = DoubleDouble::quick_mult(t1, t0);
    // exp(hi + lo) = exp(hi) * exp(lo)
    let q_hi = exp_poly_1(z.hi);
    // Taylor series exp(x) ~ 1 + x since z.lo < ulp(z.h)
    let q_lo = DoubleDouble::from_exact_add(1., z.lo);
    let q = DoubleDouble::quick_mult(q_hi, q_lo);
    de = DoubleDouble::quick_mult(de, q);
    // NOTE(review): unlike pow_exp_1 there is no range filtering in this
    // function, so the caller must guarantee 1 < mk < 2047 for the 2^mk
    // scaling below to be a normal double — confirm at call sites.

    let du = (mk as u64).wrapping_shl(52);
    de.hi *= f64::from_bits(du);
    de.lo *= f64::from_bits(du);
    de
}
326
/// Computes s * exp(r.hi + r.lo) as a double-double. Identical range
/// filtering to `pow_exp_1`; differs only in that the argument reduction is
/// carried out in full double-double arithmetic (`mul_f64_add`) before the
/// polynomial is evaluated on the collapsed value.
#[inline]
pub(crate) fn pow_exp_dd(r: DoubleDouble, s: f64) -> DoubleDouble {
    const RHO0: f64 = f64::from_bits(0xc0874910ee4e8a27);
    // #define RHO1 -0x1.577453f1799a6p+9
    /* We increase the initial value of RHO1 to avoid spurious underflow in
       the result value el. However, it is not possible to obtain a lower
       bound on |el| from the input value rh, thus this modified value of RHO1
       is obtained experimentally. */
    const RHO1: f64 = f64::from_bits(0xc08483b8cca421af);
    const RHO2: f64 = f64::from_bits(0x40862e42e709a95b);
    const RHO3: f64 = f64::from_bits(0x40862e4316ea5df9);

    // use !(rh <= RHO2) instead of rh < RHO2 to catch rh = NaN too
    #[allow(clippy::neg_cmp_op_on_partial_ord)]
    if !(r.hi <= RHO2) {
        return if r.hi > RHO3 {
            /* If rh > RHO3, we are sure there is overflow,
               For s=1 we return eh = el = DBL_MAX, which yields
               res_min = res_max = +Inf for rounding up or to nearest,
               and res_min = res_max = DBL_MAX for rounding down or toward zero,
               which will yield the correct rounding.
               For s=-1 we return eh = el = -DBL_MAX, which similarly gives
               res_min = res_max = -Inf or res_min = res_max = -DBL_MAX,
               which is the correct rounding. */
            DoubleDouble::new(
                f64::from_bits(0x7fefffffffffffff) * s,
                f64::from_bits(0x7fefffffffffffff) * s,
            )
        } else {
            /* If RHO2 < rh <= RHO3, we are in the intermediate region
               where there might be overflow or not, thus we set eh = el = NaN,
               which will set res_min = res_max = NaN, the comparison
               res_min == res_max will fail: we defer to the 2nd phase. */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }

    if r.hi < RHO1 {
        return if r.hi < RHO0 {
            /* For s=1, we have eh=el=+0 except for rounding up,
               thus res_min=+0 or -0, res_max=+0 in the main code,
               the rounding test succeeds, and we return res_max which is the
               expected result in the underflow case.
               For s=1 and rounding up, we have eh=+0, el=2^-1074,
               thus res_min = res_max = 2^-1074, which is the expected result too.
               For s=-1, we have eh=el=-0 except for rounding down,
               thus res_min=-0 or +0, res_max=-0 in the main code,
               the rounding test succeeds, and we return res_max which is the
               expected result in the underflow case.
               For s=-1 and rounding down, we have eh=-0, el=-2^-1074,
               thus res_min = res_max = -2^-1074, which is the expected result too.
            */
            DoubleDouble::new(f64::from_bits(0x0000000000000001) * (0.5 * s), 0.0 * s)
        } else {
            /* RHO0 <= rh < RHO1 or s < 0: we defer to the 2nd phase */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }
    // INVLOG2 ~ 2^12/log(2); reduction step is log(2)/2^12.
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).round_finite();

    // LOG2H + LOG2L ~ log(2)/2^12.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    // Reduced argument z ~ r - k*log(2)/2^12, computed in double-double.
    let z = DoubleDouble::mul_f64_add(DoubleDouble::new(LOG2L, LOG2H), -k, r);

    // SAFETY: r.hi passed the RHO1..RHO2 range checks above and k was produced
    // by round_finite, so k is a finite integral value of small magnitude that
    // fits in an i64.
    let bk = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    // Split k = 2^12*(mk-0x3ff) + 64*i2 + i1 for the two-level table lookup.
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    let t0 = DoubleDouble::from_bit_pair(EXP_REDUCE_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXP_REDUCE_T1[i1 as usize]);
    let mut de = DoubleDouble::quick_mult(t1, t0);
    let q = exp_poly_1(z.to_f64());
    de = DoubleDouble::quick_mult(de, q);
    /* we should have 1 < M < 2047 here, since we filtered out
       potential underflow/overflow cases at the beginning of this function */

    // Scale by s * 2^(mk-0x3ff) by building the power of two directly.
    let mut du = (mk as u64).wrapping_shl(52);
    du = (f64::from_bits(du) * s).to_bits();
    de.hi *= f64::from_bits(du);
    de.lo *= f64::from_bits(du);
    de
}
413/*
414#[inline(always)]
415pub(crate) fn expm1_poly_dd(z: DoubleDouble) -> DoubleDouble {
416 /*
417 Sollya:
418 pretty = proc(u) {
419 return ~(floor(u*1000)/1000);
420 };
421
422 d = [-2^-12.905,2^-12.905];
423 f = expm1(x);
424 w = 1;
425 pf = fpminimax(f, [|1,2,3,4,5,6,7|], [|1, 1, 107...|], d, absolute, floating);
426 err_p = -log2(dirtyinfnorm(pf*w-f, d));
427 display = decimal;
428
429 for i from 1 to degree(pf) do print(coeff(pf, i));
430
431 print (pf);
432 display = decimal;
433 print ("absolute error:",pretty(err_p));
434 f = 1;
435 w = 1/expm1(x);
436 err_p = -log2(dirtyinfnorm(pf*w-f, d));
437 print ("relative error:",pretty(err_p));
438 */
439 const Q: [(u64, u64); 7] = [
440 (0x0000000000000000, 0x3ff0000000000000),
441 (0x0000000000000000, 0x3fe0000000000000),
442 (0xbc75555554d7c48c, 0x3fc5555555555556),
443 (0xbc555a40ffb472d9, 0x3fa5555555555556),
444 (0x3c24866314c38093, 0x3f8111111111110e),
445 (0x3be34665978dddb8, 0x3f56c16c16efac90),
446 (0x3baeab43b813ef24, 0x3f2a01a1e12d253c),
447 ];
448 let z2 = z * z;
449 let z4 = z2 * z2;
450
451 let b0 = DoubleDouble::quick_mul_add(
452 z,
453 DoubleDouble::from_bit_pair(Q[1]),
454 DoubleDouble::from_bit_pair(Q[0]),
455 );
456 let b1 = DoubleDouble::quick_mul_add(
457 z,
458 DoubleDouble::from_bit_pair(Q[3]),
459 DoubleDouble::from_bit_pair(Q[2]),
460 );
461 let b2 = DoubleDouble::quick_mul_add(
462 z,
463 DoubleDouble::from_bit_pair(Q[5]),
464 DoubleDouble::from_bit_pair(Q[4]),
465 );
466
467 let c0 = DoubleDouble::quick_mul_add(z2, b1, b0);
468 let c1 = DoubleDouble::quick_mul_add(z2, DoubleDouble::from_bit_pair(Q[6]), b2);
469
470 let p = DoubleDouble::quick_mul_add(z4, c1, c0);
471 DoubleDouble::quick_mult(p, z)
472}
473*/
474#[inline(always)]
475pub(crate) fn expm1_poly_fast(z: DoubleDouble) -> DoubleDouble {
476 // Polynomial generated by Sollya:
477 // d = [-2^-12.905,2^-12.905];
478 // f = expm1(x);
479 // w = 1;
480 // pf = fpminimax(f, [|1,2,3,4,5,6,7|], [|1, 1, D...|], d, absolute, floating);
481 // See ./notes/compound_m1_expm1_fast.sollya
482 let p = f_polyeval6(
483 z.hi,
484 f64::from_bits(0x3fe0000000000000),
485 f64::from_bits(0x3fc5555555555555),
486 f64::from_bits(0x3fa55555555553de),
487 f64::from_bits(0x3f81111144995a9a),
488 f64::from_bits(0x3f56c241f9a791c5),
489 f64::from_bits(0xbfad9209c6d8b9e1),
490 );
491 let px = DoubleDouble::quick_mult_f64(DoubleDouble::from_exact_mult(z.hi, p), z.hi);
492 // expm1(hi + lo) = expm1(hi) + expm1(lo)(1 + expm1(hi)) = expm1(hi) + expm1(lo)expm1(hi) + expm1(lo)
493 // expm1(lo) ~ lo
494 let expm1_hi = DoubleDouble::f64_add(z.hi, px);
495 let mut lowest_part = DoubleDouble::quick_mult_f64(expm1_hi, z.lo);
496 lowest_part = DoubleDouble::full_add_f64(lowest_part, z.lo);
497 DoubleDouble::quick_dd_add(expm1_hi, lowest_part)
498}
499
500/// |z.hi| < 2^-7
501#[inline(always)]
502pub(crate) fn expm1_poly_dd_tiny(z: DoubleDouble) -> DoubleDouble {
503 // Polynomial generated in Sollya
504 // d = [-2^-7,2^-7];
505 // f = expm1(x);
506 // w = 1;
507 // pf = fpminimax(f, [|1,2,3,4,5,6,7,8,9|], [|1, 1, 107...|], d, absolute, floating);
508 // See ./notes/compound_expm1_tiny.sollya
509 const Q: [(u64, u64); 9] = [
510 (0x0000000000000000, 0x3ff0000000000000),
511 (0x0000000000000000, 0x3fe0000000000000),
512 (0x3c6555564150ff16, 0x3fc5555555555555),
513 (0x3c4586275c26f8a5, 0x3fa5555555555555),
514 (0xbc19e6193ac658a6, 0x3f81111111111111),
515 (0xbbf025e72dc21051, 0x3f56c16c16c1500a),
516 (0x3bc2d641a7b7b9b8, 0x3f2a01a01a07dc46),
517 (0xbb42cc8aaeeb3d00, 0x3efa01a29fef3e6f),
518 (0x3b52b1589125ce82, 0x3ec71db6af553255),
519 ];
520 let z = DoubleDouble::from_exact_add(z.hi, z.lo);
521 let mut d = DoubleDouble::quick_mul_add(
522 z,
523 DoubleDouble::from_bit_pair(Q[8]),
524 DoubleDouble::from_bit_pair(Q[7]),
525 );
526 d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[6]));
527 d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[5]));
528 d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[4]));
529 d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[3]));
530 d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[2]));
531 d = DoubleDouble::quick_mul_add_f64(z, d, f64::from_bits(0x3fe0000000000000));
532 d = DoubleDouble::quick_mul_add_f64(z, d, f64::from_bits(0x3ff0000000000000));
533 DoubleDouble::quick_mult(d, z)
534}
535
/// Computes s * expm1(r.hi + r.lo) as a double-double, following the same
/// range filtering as `pow_exp_1` but returning exp(r) - 1 instead of exp(r).
#[inline]
pub(crate) fn pow_expm1_1(r: DoubleDouble, s: f64) -> DoubleDouble {
    const RHO0: f64 = f64::from_bits(0xc0874910ee4e8a27);
    // #define RHO1 -0x1.577453f1799a6p+9
    /* We increase the initial value of RHO1 to avoid spurious underflow in
       the result value el. However, it is not possible to obtain a lower
       bound on |el| from the input value rh, thus this modified value of RHO1
       is obtained experimentally. */
    const RHO1: f64 = f64::from_bits(0xc08483b8cca421af);
    const RHO2: f64 = f64::from_bits(0x40862e42e709a95b);
    const RHO3: f64 = f64::from_bits(0x40862e4316ea5df9);

    // use !(rh <= RHO2) instead of rh < RHO2 to catch rh = NaN too
    #[allow(clippy::neg_cmp_op_on_partial_ord)]
    if !(r.hi <= RHO2) {
        return if r.hi > RHO3 {
            /* If rh > RHO3, we are sure there is overflow,
               For s=1 we return eh = el = DBL_MAX, which yields
               res_min = res_max = +Inf for rounding up or to nearest,
               and res_min = res_max = DBL_MAX for rounding down or toward zero,
               which will yield the correct rounding.
               For s=-1 we return eh = el = -DBL_MAX, which similarly gives
               res_min = res_max = -Inf or res_min = res_max = -DBL_MAX,
               which is the correct rounding. */
            DoubleDouble::new(
                f64::from_bits(0x7fefffffffffffff) * s,
                f64::from_bits(0x7fefffffffffffff) * s,
            )
        } else {
            /* If RHO2 < rh <= RHO3, we are in the intermediate region
               where there might be overflow or not, thus we set eh = el = NaN,
               which will set res_min = res_max = NaN, the comparison
               res_min == res_max will fail: we defer to the 2nd phase. */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }

    if r.hi < RHO1 {
        if r.hi < RHO0 {
            /* For s=1, we have eh=el=+0 except for rounding up,
               thus res_min=+0 or -0, res_max=+0 in the main code,
               the rounding test succeeds, and we return res_max which is the
               expected result in the underflow case.
               For s=1 and rounding up, we have eh=+0, el=2^-1074,
               thus res_min = res_max = 2^-1074, which is the expected result too.
               For s=-1, we have eh=el=-0 except for rounding down,
               thus res_min=-0 or +0, res_max=-0 in the main code,
               the rounding test succeeds, and we return res_max which is the
               expected result in the underflow case.
               For s=-1 and rounding down, we have eh=-0, el=-2^-1074,
               thus res_min = res_max = -2^-1074, which is the expected result too.
            */
            // expm1 = exp - 1: subtract 1 from the underflowed exp value.
            return DoubleDouble::full_add_f64(
                DoubleDouble::new(f64::from_bits(0x0000000000000001) * (0.5 * s), 0.0 * s),
                -1.0,
            );
        } else {
            /* RHO0 <= rh < RHO1 or s < 0: we return -1 */
            return DoubleDouble::new(0., -1.);
        };
    }

    let ax = r.hi.to_bits() & 0x7fffffffffffffffu64;

    // LOG2H + LOG2L ~ log(2)/2^12 (used by the table-based path below).
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    if ax <= 0x3f80000000000000 {
        // |x| < 2^-7
        if ax < 0x3970000000000000 {
            // |x| < 2^-104
            // expm1(x) ~ x to full double-double precision here.
            return r;
        }
        // Tiny arguments: dedicated polynomial, no table reduction needed.
        let d = expm1_poly_dd_tiny(r);
        return d;
    }

    // INVLOG2 ~ 2^12/log(2); reduction step is log(2)/2^12.
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).round_ties_even_finite();

    // Reduced argument z ~ r - k*log(2)/2^12.
    let z = DoubleDouble::mul_f64_add(DoubleDouble::new(LOG2L, LOG2H), -k, r);

    // SAFETY: r.hi passed the RHO1..RHO2 range checks above and k was produced
    // by round_ties_even_finite, so k is a finite integral value of small
    // magnitude that fits in an i64.
    let bk = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    // Split k = 2^12*(mk-0x3ff) + 64*i2 + i1 for the two-level table lookup.
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    let t0 = DoubleDouble::from_bit_pair(EXPM1_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXPM1_T1[i1 as usize]);
    let tbh = DoubleDouble::quick_mult(t1, t0);
    let mut de = tbh;
    // exp(k)=2^k*exp(r) + (2^k - 1)
    let q = expm1_poly_fast(z);
    de = DoubleDouble::quick_mult(de, q);
    de = DoubleDouble::add(tbh, de);

    // ie = unbiased result exponent; everything below works in the 2^-ie
    // scale and the final ldexp restores it.
    let ie = mk - 0x3ff;

    // off encodes the "-1" term of expm1 in the 2^-ie scale: its bit pattern
    // is built so that off = -2^-ie (sign bit set via the shifted exponent).
    let off: f64 = f64::from_bits((2048i64 + 1023i64).wrapping_sub(ie).wrapping_shl(52) as u64);

    // Add off to de.hi with an exact add, ordering operands by magnitude:
    // for small ie, |off| >= |de.hi|; for 53 <= ie < 104 the roles swap;
    // for ie >= 104 the -1 term is below the result's precision entirely.
    let e: f64;
    if ie < 53 {
        let fhz = DoubleDouble::from_exact_add(off, de.hi);
        de.hi = fhz.hi;
        e = fhz.lo;
    } else if ie < 104 {
        let fhz = DoubleDouble::from_exact_add(de.hi, off);
        de.hi = fhz.hi;
        e = fhz.lo;
    } else {
        e = 0.;
    }
    de.lo += e;
    // Collapse to a single double and rescale by 2^ie.
    de.hi = ldexp(de.to_f64(), ie as i32);
    de.lo = 0.;
    de
}
654
/// Degree-7 polynomial approximation of exp(x) evaluated in 128-bit dyadic
/// arithmetic for the accurate phase; the coefficients Q_2[k] are close to
/// the Taylor coefficients 1/k! (1, 1, 1/2, 1/6, ...), with small tweaks
/// in the low mantissa bits.
fn exp_dyadic_poly(x: DyadicFloat128) -> DyadicFloat128 {
    const Q_2: [DyadicFloat128; 8] = [
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -128,
            mantissa: 0xffff_ffff_ffff_ffff_ffff_ffff_ffff_ffd0_u128,
        },
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -127,
            mantissa: 0x8000_0000_0000_0000_0000_0000_0000_0088_u128,
        },
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -128,
            mantissa: 0x8000_0000_0000_0000_0000_000c_06f3_cd29_u128,
        },
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -130,
            mantissa: 0xaaaa_aaaa_aaaa_aaaa_aaaa_aa6a_1e07_76ae_u128,
        },
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -132,
            mantissa: 0xaaaa_aaaa_aaaa_aaa3_0000_0000_0000_0000_u128,
        },
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -134,
            mantissa: 0x8888_8888_8888_8897_0000_0000_0000_0000_u128,
        },
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -137,
            mantissa: 0xb60b_60b9_3214_6a54_0000_0000_0000_0000_u128,
        },
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -140,
            mantissa: 0xd00d_00cd_9841_6862_0000_0000_0000_0000_u128,
        },
    ];
    f_polyeval8(
        x, Q_2[0], Q_2[1], Q_2[2], Q_2[3], Q_2[4], Q_2[5], Q_2[6], Q_2[7],
    )
}
702
/* put in r an approximation of exp(x), for |x| < 744.45,
   with relative error < 2^-121.70 */
#[inline]
pub(crate) fn exp_dyadic(x: DyadicFloat128) -> DyadicFloat128 {
    // ex is the exponent of x relative to the top of the mantissa; ex >= 10
    // means |x| is large enough that exp(x) certainly overflows or
    // underflows the double range.
    let ex = x.exponent + 127;
    if ex >= 10
    // underflow or overflow
    {
        // Saturate: return a value whose exponent is far below (negative x)
        // or far above (positive x) the f64 range, so the caller's final
        // conversion yields 0 or infinity respectively.
        return DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: if x.sign == DyadicSign::Neg {
                -1076
            } else {
                1025
            },
            mantissa: x.mantissa,
        };
    }

    // LOG2_INV ~ 2^12/log(2): the reduction step is log(2)/2^12, matching
    // the 6+6-bit table split below.
    const LOG2_INV: DyadicFloat128 = DyadicFloat128 {
        sign: DyadicSign::Pos,
        exponent: -115,
        mantissa: 0xb8aa_3b29_5c17_f0bc_0000_0000_0000_0000_u128,
    };

    // LOG2 ~ log(2) in 128-bit precision.
    const LOG2: DyadicFloat128 = DyadicFloat128 {
        sign: DyadicSign::Pos,
        exponent: -128,
        mantissa: 0xb172_17f7_d1cf_79ab_c9e3_b398_03f2_f6af_u128,
    };

    let mut bk = x * LOG2_INV;
    let k = bk.trunc_to_i64(); /* k = trunc(K) [rounded towards zero, exact] */
    /* The rounding error of mul_dint_int64() is bounded by 6 ulps, thus since
       |K| <= 4399162*log(2) < 3049267, the error on K is bounded by 2^-103.41.
       This error is divided by 2^12 below, thus yields < 2^-115.41. */
    // y = x - k*log(2)/2^12: negate k*log(2), scale by 2^-12, and add.
    bk = LOG2.mul_int64(k);
    bk.exponent -= 12;
    bk.sign = bk.sign.negate();
    let y = x + bk;

    // Split k = 2^12*bm + 64*i2 + i1 for the two-level table lookup.
    let bm = k >> 12;
    let i2 = (k >> 6) & 0x3f;
    let i1 = k & 0x3f;
    let mut r = exp_dyadic_poly(y);
    r = EXP_T1_2_DYADIC[i2 as usize] * r;
    r = EXP_T2_2_DYADIC[i1 as usize] * r;
    r.exponent += bm as i16; /* exact */
    r
}
753
#[cfg(test)]
mod tests {
    use super::*;
    use crate::f_expm1;

    /// exp(2.5) via the 128-bit dyadic path must round to the nearest double.
    #[test]
    fn test_exp_dyadic() {
        let k = DyadicFloat128::new_from_f64(2.5);
        assert_eq!(exp_dyadic(k).fast_as_f64(), 12.182493960703473);
    }

    /// The fast expm1 path must agree with f_expm1 to well under 1 ulp.
    /// (Previously this test only printed the two values without asserting
    /// anything, so it could never fail.)
    #[test]
    fn test_pow_expm1_fast_path() {
        let x = 2.543543543543;
        let fast = pow_expm1_1(DoubleDouble::new(0., x), 1.).to_f64();
        let reference = f_expm1(x);
        assert!(
            (fast - reference).abs() <= reference.abs() * 1e-12,
            "pow_expm1_1 -> {fast}, f_expm1 -> {reference}"
        );
    }
}