pxfm/pow_exec.rs
1/*
2 * // Copyright (c) Radzivon Bartoshyk 7/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1. Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2. Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3. Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::common::dd_fmla;
30use crate::double_double::DoubleDouble;
31use crate::dyadic_float::{DyadicFloat128, DyadicSign};
32use crate::exponents::{EXP_REDUCE_T0, EXP_REDUCE_T1, ldexp};
33use crate::exponents::{EXPM1_T0, EXPM1_T1};
34use crate::polyeval::{f_polyeval6, f_polyeval8};
35use crate::pow_tables::{EXP_T1_2_DYADIC, EXP_T2_2_DYADIC, POW_INVERSE, POW_LOG_INV};
36use crate::round::RoundFinite;
37use crate::round_ties_even::RoundTiesEven;
38
39#[inline(always)]
40pub(crate) fn log_poly_1(z: f64) -> DoubleDouble {
41 /* The following is a degree-8 polynomial generated by Sollya for
42 log(1+x)-x+x^2/2 over [-0.0040283203125,0.0040283203125]
43 with absolute error < 2^-81.63
44 and relative error < 2^-72.423 (see sollya/P_1.sollya).
45 The relative error is for x - x^2/2 + P(x) with respect to log(1+x). */
46 const P_1: [u64; 6] = [
47 0x3fd5555555555558,
48 0xbfd0000000000003,
49 0x3fc999999981f535,
50 0xbfc55555553d1eb4,
51 0x3fc2494526fd4a06,
52 0xbfc0001f0c80e8ce,
53 ];
54 let w = DoubleDouble::from_exact_mult(z, z);
55 let t = dd_fmla(f64::from_bits(P_1[5]), z, f64::from_bits(P_1[4]));
56 let mut u = dd_fmla(f64::from_bits(P_1[3]), z, f64::from_bits(P_1[2]));
57 let mut v = dd_fmla(f64::from_bits(P_1[1]), z, f64::from_bits(P_1[0]));
58 u = dd_fmla(t, w.hi, u);
59 v = dd_fmla(u, w.hi, v);
60 u = v * w.hi;
61 DoubleDouble::new(dd_fmla(u, z, -0.5 * w.lo), -0.5 * w.hi)
62}
63
/* Given 2^-1074 <= x <= 0x1.fffffffffffffp+1023, this routine puts in h+l
   an approximation of log(x) such that |l| < 2^-23.89*|h| and

   | h + l - log(x) | <= elog * |log x|

   with elog = 2^-73.527 if x < 1/sqrt(2) or sqrt(2) < x,
   and elog = 2^-67.0544 if 1/sqrt(2) < x < sqrt(2)
   (note that x cannot equal 1/sqrt(2) nor sqrt(2)).

   The returned bool is true only when the unbiased exponent is 0 (i.e. x is
   close to 1) and the low word is relatively large (|lo| > 2^-24*|hi|), in
   which case the pair has been renormalized with an exact add.
   NOTE(review): the precise meaning of this flag for the caller (likely
   "result may need the accurate path") should be confirmed at call sites.
*/
#[inline]
pub(crate) fn pow_log_1(x: f64) -> (DoubleDouble, bool) {
    /* for 181 <= i <= 362, r[i] = _INVERSE[i-181] is a 9-bit approximation of
       1/x[i], where i*2^-8 <= x[i] < (i+1)*2^-8.
       More precisely r[i] is a 9-bit value such that r[i]*y-1 is representable
       exactly on 53 bits for for any y, i*2^-8 <= y < (i+1)*2^-8.
       Moreover |r[i]*y-1| < 0.0040283203125.
       Table generated with the accompanying pow.sage file,
       with l=inverse_centered(k=8,prec=9,maxbits=53,verbose=false) */
    let x_u = x.to_bits();
    // Split x into its 52-bit mantissa field and biased exponent field.
    let mut m = x_u & 0xfffffffffffff;
    let mut e: i64 = ((x_u >> 52) & 0x7ff) as i64;

    let t;
    if e != 0 {
        // Normal number: set the implicit leading bit and unbias the exponent.
        t = m | (0x3ffu64 << 52);
        m = m.wrapping_add(1u64 << 52);
        e -= 0x3ff;
    } else {
        /* x is a subnormal double */
        // Normalize: shift the mantissa so its leading 1 lands in bit 52,
        // adjusting the exponent accordingly.
        let k = m.leading_zeros() - 11;

        e = -0x3fei64 - k as i64;
        m = m.wrapping_shl(k);
        t = m | (0x3ffu64 << 52);
    }

    /* now |x| = 2^_e*_t = 2^(_e-52)*m with 1 <= _t < 2,
       and 2^52 <= _m < 2^53 */

    // log(x) = log(t) + E * log(2)
    let mut t = f64::from_bits(t);

    // If m > sqrt(2) we divide it by 2 so ensure 1/sqrt(2) < t < sqrt(2)
    // (0x16a09e667f3bcd is the 53-bit mantissa of sqrt(2)*2^52).
    let c: usize = (m >= 0x16a09e667f3bcd) as usize;
    static CY: [f64; 2] = [1.0, 0.5];
    static CM: [u64; 2] = [44, 45];

    e = e.wrapping_add(c as i64);
    let be = e;
    let i = m >> CM[c]; /* i/2^8 <= t < (i+1)/2^8 */
    /* when c=1, we have 0x16a09e667f3bcd <= m < 2^53, thus 90 <= i <= 127;
       when c=0, we have 2^52 <= m < 0x16a09e667f3bcd, thus 128 <= i <= 181 */
    t *= CY[c];
    /* now 0x1.6a09e667f3bcdp-1 <= t < 0x1.6a09e667f3bcdp+0,
       and log(x) = E * log(2) + log(t) */

    // r ~ 1/t (9-bit), and (l2, l1) is a double-double for -log(r).
    let r = f64::from_bits(POW_INVERSE[(i - 181) as usize]);
    let l1 = f64::from_bits(POW_LOG_INV[(i - 181) as usize].1);
    let l2 = f64::from_bits(POW_LOG_INV[(i - 181) as usize].0);

    // z = r*t - 1 is exact (see table comment above), |z| < 0.0040283203125.
    let z = dd_fmla(r, t, -1.0);

    // log(2) as a double-double.
    const LOG2_DD: DoubleDouble = DoubleDouble::new(
        f64::from_bits(0x3d2ef35793c76730),
        f64::from_bits(0x3fe62e42fefa3800),
    );

    // th + tl ~ be*log(2) + log(1/r); both FMAs are exact-ish by construction
    // of the table split.
    let th = dd_fmla(be as f64, LOG2_DD.hi, l1);
    let tl = dd_fmla(be as f64, LOG2_DD.lo, l2);

    // Accumulate z (the linear term of log(1+z)) and then the polynomial
    // correction log(1+z) - z + z^2/2 from log_poly_1.
    let mut v = DoubleDouble::f64_add(th, DoubleDouble::new(tl, z));
    let p = log_poly_1(z);
    v = DoubleDouble::f64_add(v.hi, DoubleDouble::new(v.lo + p.lo, p.hi));

    // Near x = 1 (e == 0) the leading terms cancel; if the low word is
    // relatively large (> 2^-24 * |hi|), renormalize and flag the caller.
    if e == 0 && v.lo.abs() > (v.hi.abs()) * f64::from_bits(0x3e70000000000000) {
        v = DoubleDouble::from_exact_add(v.hi, v.lo);
        return (v, true);
    }

    (v, false)
}
145
146/* Given z such that |z| < 2^-12.905,
147 this routine puts in qh+ql an approximation of exp(z) such that
148
149 | (qh+ql) / exp(z) - 1 | < 2^-64.902632
150
151 and |ql| <= 2^-51.999.
152*/
153#[inline(always)]
154fn exp_poly_1(z: f64) -> DoubleDouble {
155 /* The following is a degree-4 polynomial generated by Sollya for exp(x)
156 over [-2^-12.905,2^-12.905]
157 with absolute error < 2^-74.34 (see sollya/Q_1.sollya). */
158 const Q_1: [u64; 5] = [
159 0x3ff0000000000000,
160 0x3ff0000000000000,
161 0x3fe0000000000000,
162 0x3fc5555555997996,
163 0x3fa5555555849d8d,
164 ];
165 let mut q = dd_fmla(f64::from_bits(Q_1[4]), z, f64::from_bits(Q_1[3]));
166 q = dd_fmla(q, z, f64::from_bits(Q_1[2]));
167 let h0 = dd_fmla(q, z, f64::from_bits(Q_1[1]));
168
169 let v1 = DoubleDouble::from_exact_mult(z, h0);
170 DoubleDouble::f64_add(f64::from_bits(Q_1[0]), v1)
171}
172
173/* Given z such that |z| < 2^-12.905,
174 this routine puts in qh+ql an approximation of exp(z) such that
175
176 | (qh+ql) / exp(z) - 1 | < 2^-64.902632
177
178 and |ql| <= 2^-51.999.
179*/
180
181// #[inline(always)]
182// fn exp_poly_dd(z: DoubleDouble) -> DoubleDouble {
183// /* The following is a degree-4 polynomial generated by Sollya for exp(x)
184// over [-2^-12.905,2^-12.905] */
185// const Q_1: [(u64, u64); 7] = [
186// (0x0000000000000000, 0x3ff0000000000000),
187// (0x3a20e40000000000, 0x3ff0000000000000),
188// (0x3a04820000000000, 0x3fe0000000000000),
189// (0xbc756423c5338a66, 0x3fc5555555555556),
190// (0xbc5560f74db5556c, 0x3fa5555555555556),
191// (0x3c3648eca89bc6ac, 0x3f8111111144fbee),
192// (0xbbd53d924ae90c8c, 0x3f56c16c16ffeecc),
193// ];
194// let mut p = DoubleDouble::mult(z, DoubleDouble::from_bit_pair(Q_1[6]));
195// p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[5]));
196// p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[4]));
197// p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[3]));
198// p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[2]));
199// p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[1]));
200// p = DoubleDouble::quick_mul_add(z, p, DoubleDouble::from_bit_pair(Q_1[0]));
201// p
202// }
203
/// Computes s * exp(r.hi + r.lo) as a double-double, where s is +1 or -1
/// carrying the sign of the final pow result. Out-of-range inputs return
/// saturated values (+-DBL_MAX pair) or a NaN pair that tells the caller to
/// fall through to the slower accurate phase.
#[inline]
pub(crate) fn pow_exp_1(r: DoubleDouble, s: f64) -> DoubleDouble {
    const RHO0: f64 = f64::from_bits(0xc0874910ee4e8a27);
    // #define RHO1 -0x1.577453f1799a6p+9
    /* We increase the initial value of RHO1 to avoid spurious underflow in
       the result value el. However, it is not possible to obtain a lower
       bound on |el| from the input value rh, thus this modified value of RHO1
       is obtained experimentally. */
    const RHO1: f64 = f64::from_bits(0xc08483b8cca421af);
    const RHO2: f64 = f64::from_bits(0x40862e42e709a95b);
    const RHO3: f64 = f64::from_bits(0x40862e4316ea5df9);

    // use !(rh <= RHO2) instead of rh < RHO2 to catch rh = NaN too
    #[allow(clippy::neg_cmp_op_on_partial_ord)]
    if !(r.hi <= RHO2) {
        return if r.hi > RHO3 {
            /* If rh > RHO3, we are sure there is overflow,
               For s=1 we return eh = el = DBL_MAX, which yields
               res_min = res_max = +Inf for rounding up or to nearest,
               and res_min = res_max = DBL_MAX for rounding down or toward zero,
               which will yield the correct rounding.
               For s=-1 we return eh = el = -DBL_MAX, which similarly gives
               res_min = res_max = -Inf or res_min = res_max = -DBL_MAX,
               which is the correct rounding. */
            DoubleDouble::new(
                f64::from_bits(0x7fefffffffffffff) * s,
                f64::from_bits(0x7fefffffffffffff) * s,
            )
        } else {
            /* If RHO2 < rh <= RHO3, we are in the intermediate region
               where there might be overflow or not, thus we set eh = el = NaN,
               which will set res_min = res_max = NaN, the comparison
               res_min == res_max will fail: we defer to the 2nd phase. */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }

    if r.hi < RHO1 {
        return if r.hi < RHO0 {
            /* For s=1, we have eh=el=+0 except for rounding up,
               thus res_min=+0 or -0, res_max=+0 in the main code,
               the rounding test succeeds, and we return res_max which is the
               expected result in the underflow case.
               For s=1 and rounding up, we have eh=+0, el=2^-1074,
               thus res_min = res_max = 2^-1074, which is the expected result too.
               For s=-1, we have eh=el=-0 except for rounding down,
               thus res_min=-0 or +0, res_max=-0 in the main code,
               the rounding test succeeds, and we return res_max which is the
               expected result in the underflow case.
               For s=-1 and rounding down, we have eh=-0, el=-2^-1074,
               thus res_min = res_max = -2^-1074, which is the expected result too.
            */
            DoubleDouble::new(f64::from_bits(0x0000000000000001) * (0.5 * s), 0.0 * s)
        } else {
            /* RHO0 <= rh < RHO1 or s < 0: we defer to the 2nd phase */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }
    // INVLOG2 ~ 2^12/log(2): argument reduction is done with a step of
    // log(2)/2^12 (12 = 6 + 6 bits of table index plus the power of two).
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).round_finite();

    // LOG2H + LOG2L ~ log(2)/2^12, split for exact FMA cancellation.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    // Reduced argument: zh + zl ~ r - k*log(2)/2^12.
    let zh = dd_fmla(LOG2H, -k, r.hi);
    let zl = dd_fmla(LOG2L, -k, r.lo);

    // SAFETY: r.hi passed the RHO1..RHO2 range checks above and k was produced
    // by round_finite, so k is a finite integral value of small magnitude that
    // fits in an i64.
    let bk = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    // Split k = 2^12*(mk-0x3ff) + 64*i2 + i1 for the two-level table lookup.
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    let t0 = DoubleDouble::from_bit_pair(EXP_REDUCE_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXP_REDUCE_T1[i1 as usize]);
    let mut de = DoubleDouble::quick_mult(t1, t0);
    let q = exp_poly_1(zh + zl);
    de = DoubleDouble::quick_mult(de, q);
    /* we should have 1 < M < 2047 here, since we filtered out
       potential underflow/overflow cases at the beginning of this function */

    // Scale by s * 2^(mk-0x3ff) by building the power of two directly.
    let mut du = (mk as u64).wrapping_shl(52);
    du = (f64::from_bits(du) * s).to_bits();
    de.hi *= f64::from_bits(du);
    de.lo *= f64::from_bits(du);
    de
}
291
/// Fast exp of a double-double argument r, returning a double-double.
/// Same reduction scheme as `pow_exp_1`, but with no overflow/underflow
/// filtering: the caller is responsible for keeping the result exponent in
/// the normal range (see NOTE below).
#[inline]
pub(crate) fn exp_dd_fast(r: DoubleDouble) -> DoubleDouble {
    // INVLOG2 ~ 2^12/log(2); reduction step is log(2)/2^12.
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).round_finite();

    // LOG2H + LOG2L ~ log(2)/2^12.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    // Reduced argument z ~ r - k*log(2)/2^12, renormalized so |z.lo| is
    // below ulp(z.hi).
    let mut z = DoubleDouble::mul_f64_add(DoubleDouble::new(LOG2L, LOG2H), -k, r);
    z = DoubleDouble::from_exact_add(z.hi, z.lo);

    let bk = k as i64; /* Note: k is an integer, this is just a conversion. */
    // Split k = 2^12*(mk-0x3ff) + 64*i2 + i1 for the two-level table lookup.
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    let t0 = DoubleDouble::from_bit_pair(EXP_REDUCE_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXP_REDUCE_T1[i1 as usize]);
    let mut de = DoubleDouble::quick_mult(t1, t0);
    // exp(hi + lo) = exp(hi) * exp(lo)
    let q_hi = exp_poly_1(z.hi);
    // Taylor series exp(x) ~ 1 + x since z.lo < ulp(z.h)
    let q_lo = DoubleDouble::from_exact_add(1., z.lo);
    let q = DoubleDouble::quick_mult(q_hi, q_lo);
    de = DoubleDouble::quick_mult(de, q);
    // NOTE(review): unlike pow_exp_1 there is no range filtering in this
    // function, so the caller must guarantee 1 < mk < 2047 for the 2^mk
    // scaling below to be a normal double — confirm at call sites.

    let du = (mk as u64).wrapping_shl(52);
    de.hi *= f64::from_bits(du);
    de.lo *= f64::from_bits(du);
    de
}
326
/// Computes s * exp(r.hi + r.lo) as a double-double. Identical range
/// filtering to `pow_exp_1`; differs only in that the argument reduction is
/// carried out in full double-double arithmetic (`mul_f64_add`) before the
/// polynomial is evaluated on the collapsed value.
#[inline]
pub(crate) fn pow_exp_dd(r: DoubleDouble, s: f64) -> DoubleDouble {
    const RHO0: f64 = f64::from_bits(0xc0874910ee4e8a27);
    // #define RHO1 -0x1.577453f1799a6p+9
    /* We increase the initial value of RHO1 to avoid spurious underflow in
       the result value el. However, it is not possible to obtain a lower
       bound on |el| from the input value rh, thus this modified value of RHO1
       is obtained experimentally. */
    const RHO1: f64 = f64::from_bits(0xc08483b8cca421af);
    const RHO2: f64 = f64::from_bits(0x40862e42e709a95b);
    const RHO3: f64 = f64::from_bits(0x40862e4316ea5df9);

    // use !(rh <= RHO2) instead of rh < RHO2 to catch rh = NaN too
    #[allow(clippy::neg_cmp_op_on_partial_ord)]
    if !(r.hi <= RHO2) {
        return if r.hi > RHO3 {
            /* If rh > RHO3, we are sure there is overflow,
               For s=1 we return eh = el = DBL_MAX, which yields
               res_min = res_max = +Inf for rounding up or to nearest,
               and res_min = res_max = DBL_MAX for rounding down or toward zero,
               which will yield the correct rounding.
               For s=-1 we return eh = el = -DBL_MAX, which similarly gives
               res_min = res_max = -Inf or res_min = res_max = -DBL_MAX,
               which is the correct rounding. */
            DoubleDouble::new(
                f64::from_bits(0x7fefffffffffffff) * s,
                f64::from_bits(0x7fefffffffffffff) * s,
            )
        } else {
            /* If RHO2 < rh <= RHO3, we are in the intermediate region
               where there might be overflow or not, thus we set eh = el = NaN,
               which will set res_min = res_max = NaN, the comparison
               res_min == res_max will fail: we defer to the 2nd phase. */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }

    if r.hi < RHO1 {
        return if r.hi < RHO0 {
            /* For s=1, we have eh=el=+0 except for rounding up,
               thus res_min=+0 or -0, res_max=+0 in the main code,
               the rounding test succeeds, and we return res_max which is the
               expected result in the underflow case.
               For s=1 and rounding up, we have eh=+0, el=2^-1074,
               thus res_min = res_max = 2^-1074, which is the expected result too.
               For s=-1, we have eh=el=-0 except for rounding down,
               thus res_min=-0 or +0, res_max=-0 in the main code,
               the rounding test succeeds, and we return res_max which is the
               expected result in the underflow case.
               For s=-1 and rounding down, we have eh=-0, el=-2^-1074,
               thus res_min = res_max = -2^-1074, which is the expected result too.
            */
            DoubleDouble::new(f64::from_bits(0x0000000000000001) * (0.5 * s), 0.0 * s)
        } else {
            /* RHO0 <= rh < RHO1 or s < 0: we defer to the 2nd phase */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }
    // INVLOG2 ~ 2^12/log(2); reduction step is log(2)/2^12.
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).round_finite();

    // LOG2H + LOG2L ~ log(2)/2^12.
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    // Reduced argument z ~ r - k*log(2)/2^12, computed in double-double.
    let z = DoubleDouble::mul_f64_add(DoubleDouble::new(LOG2L, LOG2H), -k, r);

    // SAFETY: r.hi passed the RHO1..RHO2 range checks above and k was produced
    // by round_finite, so k is a finite integral value of small magnitude that
    // fits in an i64.
    let bk = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    // Split k = 2^12*(mk-0x3ff) + 64*i2 + i1 for the two-level table lookup.
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    let t0 = DoubleDouble::from_bit_pair(EXP_REDUCE_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXP_REDUCE_T1[i1 as usize]);
    let mut de = DoubleDouble::quick_mult(t1, t0);
    let q = exp_poly_1(z.to_f64());
    de = DoubleDouble::quick_mult(de, q);
    /* we should have 1 < M < 2047 here, since we filtered out
       potential underflow/overflow cases at the beginning of this function */

    // Scale by s * 2^(mk-0x3ff) by building the power of two directly.
    let mut du = (mk as u64).wrapping_shl(52);
    du = (f64::from_bits(du) * s).to_bits();
    de.hi *= f64::from_bits(du);
    de.lo *= f64::from_bits(du);
    de
}
413/*
414#[inline(always)]
415pub(crate) fn expm1_poly_dd(z: DoubleDouble) -> DoubleDouble {
416 /*
417 Sollya:
418 pretty = proc(u) {
419 return ~(floor(u*1000)/1000);
420 };
421
422 d = [-2^-12.905,2^-12.905];
423 f = expm1(x);
424 w = 1;
425 pf = fpminimax(f, [|1,2,3,4,5,6,7|], [|1, 1, 107...|], d, absolute, floating);
426 err_p = -log2(dirtyinfnorm(pf*w-f, d));
427 display = decimal;
428
429 for i from 1 to degree(pf) do print(coeff(pf, i));
430
431 print (pf);
432 display = decimal;
433 print ("absolute error:",pretty(err_p));
434 f = 1;
435 w = 1/expm1(x);
436 err_p = -log2(dirtyinfnorm(pf*w-f, d));
437 print ("relative error:",pretty(err_p));
438 */
439 const Q: [(u64, u64); 7] = [
440 (0x0000000000000000, 0x3ff0000000000000),
441 (0x0000000000000000, 0x3fe0000000000000),
442 (0xbc75555554d7c48c, 0x3fc5555555555556),
443 (0xbc555a40ffb472d9, 0x3fa5555555555556),
444 (0x3c24866314c38093, 0x3f8111111111110e),
445 (0x3be34665978dddb8, 0x3f56c16c16efac90),
446 (0x3baeab43b813ef24, 0x3f2a01a1e12d253c),
447 ];
448 let z2 = z * z;
449 let z4 = z2 * z2;
450
451 let b0 = DoubleDouble::quick_mul_add(
452 z,
453 DoubleDouble::from_bit_pair(Q[1]),
454 DoubleDouble::from_bit_pair(Q[0]),
455 );
456 let b1 = DoubleDouble::quick_mul_add(
457 z,
458 DoubleDouble::from_bit_pair(Q[3]),
459 DoubleDouble::from_bit_pair(Q[2]),
460 );
461 let b2 = DoubleDouble::quick_mul_add(
462 z,
463 DoubleDouble::from_bit_pair(Q[5]),
464 DoubleDouble::from_bit_pair(Q[4]),
465 );
466
467 let c0 = DoubleDouble::quick_mul_add(z2, b1, b0);
468 let c1 = DoubleDouble::quick_mul_add(z2, DoubleDouble::from_bit_pair(Q[6]), b2);
469
470 let p = DoubleDouble::quick_mul_add(z4, c1, c0);
471 DoubleDouble::quick_mult(p, z)
472}
473*/
474#[inline(always)]
475pub(crate) fn expm1_poly_fast(z: DoubleDouble) -> DoubleDouble {
476 // Polynomial generated by Sollya:
477 // d = [-2^-12.905,2^-12.905];
478 // f = expm1(x);
479 // w = 1;
480 // pf = fpminimax(f, [|1,2,3,4,5,6,7|], [|1, 1, D...|], d, absolute, floating);
481 // See ./notes/compound_m1_expm1_fast.sollya
482 let p = f_polyeval6(
483 z.hi,
484 f64::from_bits(0x3fe0000000000000),
485 f64::from_bits(0x3fc5555555555555),
486 f64::from_bits(0x3fa55555555553de),
487 f64::from_bits(0x3f81111144995a9a),
488 f64::from_bits(0x3f56c241f9a791c5),
489 f64::from_bits(0xbfad9209c6d8b9e1),
490 );
491 let px = DoubleDouble::quick_mult_f64(DoubleDouble::from_exact_mult(z.hi, p), z.hi);
492 // expm1(hi + lo) = expm1(hi) + expm1(lo)(1 + expm1(hi)) = expm1(hi) + expm1(lo)expm1(hi) + expm1(lo)
493 // expm1(lo) ~ lo
494 let expm1_hi = DoubleDouble::f64_add(z.hi, px);
495 let mut lowest_part = DoubleDouble::quick_mult_f64(expm1_hi, z.lo);
496 lowest_part = DoubleDouble::full_add_f64(lowest_part, z.lo);
497 DoubleDouble::quick_dd_add(expm1_hi, lowest_part)
498}
499
500/// |z.hi| < 2^-7
501#[inline(always)]
502pub(crate) fn expm1_poly_dd_tiny(z: DoubleDouble) -> DoubleDouble {
503 // Polynomial generated in Sollya
504 // d = [-2^-7,2^-7];
505 // f = expm1(x);
506 // w = 1;
507 // pf = fpminimax(f, [|1,2,3,4,5,6,7,8,9|], [|1, 1, 107...|], d, absolute, floating);
508 // See ./notes/compound_expm1_tiny.sollya
509 const Q: [(u64, u64); 9] = [
510 (0x0000000000000000, 0x3ff0000000000000),
511 (0x0000000000000000, 0x3fe0000000000000),
512 (0x3c6555564150ff16, 0x3fc5555555555555),
513 (0x3c4586275c26f8a5, 0x3fa5555555555555),
514 (0xbc19e6193ac658a6, 0x3f81111111111111),
515 (0xbbf025e72dc21051, 0x3f56c16c16c1500a),
516 (0x3bc2d641a7b7b9b8, 0x3f2a01a01a07dc46),
517 (0xbb42cc8aaeeb3d00, 0x3efa01a29fef3e6f),
518 (0x3b52b1589125ce82, 0x3ec71db6af553255),
519 ];
520 let z = DoubleDouble::from_exact_add(z.hi, z.lo);
521 let mut d = DoubleDouble::quick_mul_add(
522 z,
523 DoubleDouble::from_bit_pair(Q[8]),
524 DoubleDouble::from_bit_pair(Q[7]),
525 );
526 d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[6]));
527 d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[5]));
528 d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[4]));
529 d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[3]));
530 d = DoubleDouble::quick_mul_add(z, d, DoubleDouble::from_bit_pair(Q[2]));
531 d = DoubleDouble::quick_mul_add_f64(z, d, f64::from_bits(0x3fe0000000000000));
532 d = DoubleDouble::quick_mul_add_f64(z, d, f64::from_bits(0x3ff0000000000000));
533 DoubleDouble::quick_mult(d, z)
534}
535
/// Computes s * expm1(r.hi + r.lo) as a double-double, following the same
/// range filtering as `pow_exp_1` but returning exp(r) - 1 instead of exp(r).
#[inline]
pub(crate) fn pow_expm1_1(r: DoubleDouble, s: f64) -> DoubleDouble {
    const RHO0: f64 = f64::from_bits(0xc0874910ee4e8a27);
    // #define RHO1 -0x1.577453f1799a6p+9
    /* We increase the initial value of RHO1 to avoid spurious underflow in
       the result value el. However, it is not possible to obtain a lower
       bound on |el| from the input value rh, thus this modified value of RHO1
       is obtained experimentally. */
    const RHO1: f64 = f64::from_bits(0xc08483b8cca421af);
    const RHO2: f64 = f64::from_bits(0x40862e42e709a95b);
    const RHO3: f64 = f64::from_bits(0x40862e4316ea5df9);

    // use !(rh <= RHO2) instead of rh < RHO2 to catch rh = NaN too
    #[allow(clippy::neg_cmp_op_on_partial_ord)]
    if !(r.hi <= RHO2) {
        return if r.hi > RHO3 {
            /* If rh > RHO3, we are sure there is overflow,
               For s=1 we return eh = el = DBL_MAX, which yields
               res_min = res_max = +Inf for rounding up or to nearest,
               and res_min = res_max = DBL_MAX for rounding down or toward zero,
               which will yield the correct rounding.
               For s=-1 we return eh = el = -DBL_MAX, which similarly gives
               res_min = res_max = -Inf or res_min = res_max = -DBL_MAX,
               which is the correct rounding. */
            DoubleDouble::new(
                f64::from_bits(0x7fefffffffffffff) * s,
                f64::from_bits(0x7fefffffffffffff) * s,
            )
        } else {
            /* If RHO2 < rh <= RHO3, we are in the intermediate region
               where there might be overflow or not, thus we set eh = el = NaN,
               which will set res_min = res_max = NaN, the comparison
               res_min == res_max will fail: we defer to the 2nd phase. */
            DoubleDouble::new(f64::NAN, f64::NAN)
        };
    }

    if r.hi < RHO1 {
        if r.hi < RHO0 {
            /* For s=1, we have eh=el=+0 except for rounding up,
               thus res_min=+0 or -0, res_max=+0 in the main code,
               the rounding test succeeds, and we return res_max which is the
               expected result in the underflow case.
               For s=1 and rounding up, we have eh=+0, el=2^-1074,
               thus res_min = res_max = 2^-1074, which is the expected result too.
               For s=-1, we have eh=el=-0 except for rounding down,
               thus res_min=-0 or +0, res_max=-0 in the main code,
               the rounding test succeeds, and we return res_max which is the
               expected result in the underflow case.
               For s=-1 and rounding down, we have eh=-0, el=-2^-1074,
               thus res_min = res_max = -2^-1074, which is the expected result too.
            */
            // expm1 = exp - 1: subtract 1 from the underflowed exp value.
            return DoubleDouble::full_add_f64(
                DoubleDouble::new(f64::from_bits(0x0000000000000001) * (0.5 * s), 0.0 * s),
                -1.0,
            );
        } else {
            /* RHO0 <= rh < RHO1 or s < 0: we return -1 */
            return DoubleDouble::new(0., -1.);
        };
    }

    let ax = r.hi.to_bits() & 0x7fffffffffffffffu64;

    // LOG2H + LOG2L ~ log(2)/2^12 (used by the table-based path below).
    const LOG2H: f64 = f64::from_bits(0x3f262e42fefa39ef);
    const LOG2L: f64 = f64::from_bits(0x3bbabc9e3b39803f);

    if ax <= 0x3f80000000000000 {
        // |x| < 2^-7
        if ax < 0x3970000000000000 {
            // |x| < 2^-104
            // expm1(x) ~ x to full double-double precision here.
            return r;
        }
        // Tiny arguments: dedicated polynomial, no table reduction needed.
        let d = expm1_poly_dd_tiny(r);
        return d;
    }

    // INVLOG2 ~ 2^12/log(2); reduction step is log(2)/2^12.
    const INVLOG2: f64 = f64::from_bits(0x40b71547652b82fe);

    let k = (r.hi * INVLOG2).round_ties_even_finite();

    // Reduced argument z ~ r - k*log(2)/2^12.
    let z = DoubleDouble::mul_f64_add(DoubleDouble::new(LOG2L, LOG2H), -k, r);

    // SAFETY: r.hi passed the RHO1..RHO2 range checks above and k was produced
    // by round_ties_even_finite, so k is a finite integral value of small
    // magnitude that fits in an i64.
    let bk = unsafe { k.to_int_unchecked::<i64>() }; /* Note: k is an integer, this is just a conversion. */
    // Split k = 2^12*(mk-0x3ff) + 64*i2 + i1 for the two-level table lookup.
    let mk = (bk >> 12) + 0x3ff;
    let i2 = (bk >> 6) & 0x3f;
    let i1 = bk & 0x3f;

    let t0 = DoubleDouble::from_bit_pair(EXPM1_T0[i2 as usize]);
    let t1 = DoubleDouble::from_bit_pair(EXPM1_T1[i1 as usize]);
    let tbh = DoubleDouble::quick_mult(t1, t0);
    let mut de = tbh;
    // exp(k)=2^k*exp(r) + (2^k - 1)
    let q = expm1_poly_fast(z);
    de = DoubleDouble::quick_mult(de, q);
    de = DoubleDouble::add(tbh, de);

    // ie = unbiased result exponent; everything below works in the 2^-ie
    // scale and the final ldexp restores it.
    let ie = mk - 0x3ff;

    // off encodes the "-1" term of expm1 in the 2^-ie scale: its bit pattern
    // is built so that off = -2^-ie (sign bit set via the shifted exponent).
    let off: f64 = f64::from_bits((2048i64 + 1023i64).wrapping_sub(ie).wrapping_shl(52) as u64);

    // Add off to de.hi with an exact add, ordering operands by magnitude:
    // for small ie, |off| >= |de.hi|; for 53 <= ie < 104 the roles swap;
    // for ie >= 104 the -1 term is below the result's precision entirely.
    let e: f64;
    if ie < 53 {
        let fhz = DoubleDouble::from_exact_add(off, de.hi);
        de.hi = fhz.hi;
        e = fhz.lo;
    } else if ie < 104 {
        let fhz = DoubleDouble::from_exact_add(de.hi, off);
        de.hi = fhz.hi;
        e = fhz.lo;
    } else {
        e = 0.;
    }
    de.lo += e;
    // Collapse to a single double and rescale by 2^ie.
    de.hi = ldexp(de.to_f64(), ie as i32);
    de.lo = 0.;
    de
}
654
/// Degree-7 polynomial approximation of exp(x) evaluated in 128-bit dyadic
/// arithmetic for the accurate phase; the coefficients Q_2[k] are close to
/// the Taylor coefficients 1/k! (1, 1, 1/2, 1/6, ...), with small tweaks
/// in the low mantissa bits.
fn exp_dyadic_poly(x: DyadicFloat128) -> DyadicFloat128 {
    const Q_2: [DyadicFloat128; 8] = [
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -128,
            mantissa: 0xffff_ffff_ffff_ffff_ffff_ffff_ffff_ffd0_u128,
        },
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -127,
            mantissa: 0x8000_0000_0000_0000_0000_0000_0000_0088_u128,
        },
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -128,
            mantissa: 0x8000_0000_0000_0000_0000_000c_06f3_cd29_u128,
        },
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -130,
            mantissa: 0xaaaa_aaaa_aaaa_aaaa_aaaa_aa6a_1e07_76ae_u128,
        },
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -132,
            mantissa: 0xaaaa_aaaa_aaaa_aaa3_0000_0000_0000_0000_u128,
        },
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -134,
            mantissa: 0x8888_8888_8888_8897_0000_0000_0000_0000_u128,
        },
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -137,
            mantissa: 0xb60b_60b9_3214_6a54_0000_0000_0000_0000_u128,
        },
        DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: -140,
            mantissa: 0xd00d_00cd_9841_6862_0000_0000_0000_0000_u128,
        },
    ];
    f_polyeval8(
        x, Q_2[0], Q_2[1], Q_2[2], Q_2[3], Q_2[4], Q_2[5], Q_2[6], Q_2[7],
    )
}
702
/* put in r an approximation of exp(x), for |x| < 744.45,
   with relative error < 2^-121.70 */
#[inline]
pub(crate) fn exp_dyadic(x: DyadicFloat128) -> DyadicFloat128 {
    // ex is the exponent of x relative to the top of the mantissa; ex >= 10
    // means |x| is large enough that exp(x) certainly overflows or
    // underflows the double range.
    let ex = x.exponent + 127;
    if ex >= 10
    // underflow or overflow
    {
        // Saturate: return a value whose exponent is far below (negative x)
        // or far above (positive x) the f64 range, so the caller's final
        // conversion yields 0 or infinity respectively.
        return DyadicFloat128 {
            sign: DyadicSign::Pos,
            exponent: if x.sign == DyadicSign::Neg {
                -1076
            } else {
                1025
            },
            mantissa: x.mantissa,
        };
    }

    // LOG2_INV ~ 2^12/log(2): the reduction step is log(2)/2^12, matching
    // the 6+6-bit table split below.
    const LOG2_INV: DyadicFloat128 = DyadicFloat128 {
        sign: DyadicSign::Pos,
        exponent: -115,
        mantissa: 0xb8aa_3b29_5c17_f0bc_0000_0000_0000_0000_u128,
    };

    // LOG2 ~ log(2) in 128-bit precision.
    const LOG2: DyadicFloat128 = DyadicFloat128 {
        sign: DyadicSign::Pos,
        exponent: -128,
        mantissa: 0xb172_17f7_d1cf_79ab_c9e3_b398_03f2_f6af_u128,
    };

    let mut bk = x * LOG2_INV;
    let k = bk.trunc_to_i64(); /* k = trunc(K) [rounded towards zero, exact] */
    /* The rounding error of mul_dint_int64() is bounded by 6 ulps, thus since
       |K| <= 4399162*log(2) < 3049267, the error on K is bounded by 2^-103.41.
       This error is divided by 2^12 below, thus yields < 2^-115.41. */
    // y = x - k*log(2)/2^12: negate k*log(2), scale by 2^-12, and add.
    bk = LOG2.mul_int64(k);
    bk.exponent -= 12;
    bk.sign = bk.sign.negate();
    let y = x + bk;

    // Split k = 2^12*bm + 64*i2 + i1 for the two-level table lookup.
    let bm = k >> 12;
    let i2 = (k >> 6) & 0x3f;
    let i1 = k & 0x3f;
    let mut r = exp_dyadic_poly(y);
    r = EXP_T1_2_DYADIC[i2 as usize] * r;
    r = EXP_T2_2_DYADIC[i1 as usize] * r;
    r.exponent += bm as i16; /* exact */
    r
}
753
#[cfg(test)]
mod tests {
    use super::*;
    use crate::f_expm1;

    /// exp(2.5) via the 128-bit dyadic path must round to the nearest double.
    #[test]
    fn test_exp_dyadic() {
        let k = DyadicFloat128::new_from_f64(2.5);
        assert_eq!(exp_dyadic(k).fast_as_f64(), 12.182493960703473);
    }

    /// The fast expm1 path must agree with f_expm1 to well under 1 ulp.
    /// (Previously this test only printed the two values without asserting
    /// anything, so it could never fail.)
    #[test]
    fn test_pow_expm1_fast_path() {
        let x = 2.543543543543;
        let fast = pow_expm1_1(DoubleDouble::new(0., x), 1.).to_f64();
        let reference = f_expm1(x);
        assert!(
            (fast - reference).abs() <= reference.abs() * 1e-12,
            "pow_expm1_1 -> {fast}, f_expm1 -> {reference}"
        );
    }
}