pxfm/
powf.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 4/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::bits::biased_exponent_f64;
30use crate::common::*;
31use crate::double_double::DoubleDouble;
32use crate::exponents::expf;
33use crate::logf;
34use crate::logs::LOG2_R;
35use crate::polyeval::{f_polyeval3, f_polyeval6, f_polyeval10};
36use crate::pow_tables::EXP2_MID1;
37use crate::powf_tables::{LOG2_R_TD, LOG2_R2_DD, POWF_R2};
38use crate::round::RoundFinite;
39
40/// Power function for given value for const context.
41/// This is simplified version just to make a good approximation on const context.
42#[inline]
43pub const fn powf(d: f32, n: f32) -> f32 {
44    let value = d.abs();
45    let c = expf(n * logf(value));
46    if n == 1. {
47        return d;
48    }
49    if d < 0.0 {
50        let y = n as i32;
51        if y % 2 == 0 { c } else { -c }
52    } else {
53        c
54    }
55}
56
57#[inline]
58const fn larger_exponent(a: f64, b: f64) -> bool {
59    biased_exponent_f64(a) >= biased_exponent_f64(b)
60}
61
62// Calculate 2^(y * log2(x)) in double-double precision.
63// At this point we can reuse the following values:
64//   idx_x: index for extra precision of log2 for the middle part of log2(x).
65//   dx: the reduced argument for log2(x)
66//   y6: 2^6 * y.
67//   lo6_hi: the high part of 2^6 * (y - (hi + mid))
68//   exp2_hi_mid: high part of 2^(hi + mid)
69#[cold]
70#[inline(never)]
71fn powf_dd(idx_x: i32, dx: f64, y6: f64, lo6_hi: f64, exp2_hi_mid: DoubleDouble) -> f64 {
72    // Perform a second range reduction step:
73    //   idx2 = round(2^14 * (dx  + 2^-8)) = round ( dx * 2^14 + 2^6)
74    //   dx2 = (1 + dx) * r2 - 1
75    // Output range:
76    //   -0x1.3ffcp-15 <= dx2 <= 0x1.3e3dp-15
77    let idx2 = f_fmla(
78        dx,
79        f64::from_bits(0x40d0000000000000),
80        f64::from_bits(0x4050000000000000),
81    )
82    .round_finite() as usize;
83    let dx2 = f_fmla(1.0 + dx, f64::from_bits(POWF_R2[idx2]), -1.0); // Exact
84
85    const COEFFS: [(u64, u64); 6] = [
86        (0x3c7777d0ffda25e0, 0x3ff71547652b82fe),
87        (0xbc6777d101cf0a84, 0xbfe71547652b82fe),
88        (0x3c7ce04b5140d867, 0x3fdec709dc3a03fd),
89        (0x3c7137b47e635be5, 0xbfd71547652b82fb),
90        (0xbc5b5a30b3bdb318, 0x3fd2776c516a92a2),
91        (0x3c62d2fbd081e657, 0xbfcec70af1929ca6),
92    ];
93    let dx_dd = DoubleDouble::new(0.0, dx2);
94    let p = f_polyeval6(
95        dx_dd,
96        DoubleDouble::from_bit_pair(COEFFS[0]),
97        DoubleDouble::from_bit_pair(COEFFS[1]),
98        DoubleDouble::from_bit_pair(COEFFS[2]),
99        DoubleDouble::from_bit_pair(COEFFS[3]),
100        DoubleDouble::from_bit_pair(COEFFS[4]),
101        DoubleDouble::from_bit_pair(COEFFS[5]),
102    );
103    // log2(1 + dx2) ~ dx2 * P(dx2)
104    let log2_x_lo = DoubleDouble::quick_mult_f64(p, dx2);
105    // Lower parts of (e_x - log2(r1)) of the first range reduction constant
106    let log2_r_td = LOG2_R_TD[idx_x as usize];
107    let log2_x_mid = DoubleDouble::new(f64::from_bits(log2_r_td.0), f64::from_bits(log2_r_td.1));
108    // -log2(r2) + lower part of (e_x - log2(r1))
109    let log2_x_m = DoubleDouble::add(DoubleDouble::from_bit_pair(LOG2_R2_DD[idx2]), log2_x_mid);
110    // log2(1 + dx2) - log2(r2) + lower part of (e_x - log2(r1))
111    // Since we don't know which one has larger exponent to apply Fast2Sum
112    // algorithm, we need to check them before calling double-double addition.
113    let log2_x = if larger_exponent(log2_x_m.hi, log2_x_lo.hi) {
114        DoubleDouble::add(log2_x_m, log2_x_lo)
115    } else {
116        DoubleDouble::add(log2_x_lo, log2_x_m)
117    };
118    let lo6_hi_dd = DoubleDouble::new(0.0, lo6_hi);
119    // 2^6 * y * (log2(1 + dx2) - log2(r2) + lower part of (e_x - log2(r1)))
120    let prod = DoubleDouble::quick_mult_f64(log2_x, y6);
121    // 2^6 * (y * log2(x) - (hi + mid)) = 2^6 * lo
122    let lo6 = if larger_exponent(prod.hi, lo6_hi) {
123        DoubleDouble::add(prod, lo6_hi_dd)
124    } else {
125        DoubleDouble::add(lo6_hi_dd, prod)
126    };
127
128    const EXP2_COEFFS: [(u64, u64); 10] = [
129        (0x0000000000000000, 0x3ff0000000000000),
130        (0x3c1abc9e3b398024, 0x3f862e42fefa39ef),
131        (0xbba5e43a5429bddb, 0x3f0ebfbdff82c58f),
132        (0xbb2d33162491268f, 0x3e8c6b08d704a0c0),
133        (0x3a94fb32d240a14e, 0x3e03b2ab6fba4e77),
134        (0x39ee84e916be83e0, 0x3d75d87fe78a6731),
135        (0xb989a447bfddc5e6, 0x3ce430912f86bfb8),
136        (0xb8e31a55719de47f, 0x3c4ffcbfc588ded9),
137        (0xb850ba57164eb36b, 0x3bb62c034beb8339),
138        (0xb7b8483eabd9642d, 0x3b1b5251ff97bee1),
139    ];
140
141    let pp = f_polyeval10(
142        lo6,
143        DoubleDouble::from_bit_pair(EXP2_COEFFS[0]),
144        DoubleDouble::from_bit_pair(EXP2_COEFFS[1]),
145        DoubleDouble::from_bit_pair(EXP2_COEFFS[2]),
146        DoubleDouble::from_bit_pair(EXP2_COEFFS[3]),
147        DoubleDouble::from_bit_pair(EXP2_COEFFS[4]),
148        DoubleDouble::from_bit_pair(EXP2_COEFFS[5]),
149        DoubleDouble::from_bit_pair(EXP2_COEFFS[6]),
150        DoubleDouble::from_bit_pair(EXP2_COEFFS[7]),
151        DoubleDouble::from_bit_pair(EXP2_COEFFS[8]),
152        DoubleDouble::from_bit_pair(EXP2_COEFFS[9]),
153    );
154    let rr = DoubleDouble::quick_mult(exp2_hi_mid, pp);
155
156    // Make sure the sum is normalized:
157    let r = DoubleDouble::from_exact_add(rr.hi, rr.lo);
158
159    const FRACTION_MASK: u64 = (1u64 << 52) - 1;
160
161    let mut r_bits = r.hi.to_bits();
162    if ((r_bits & 0xfff_ffff) == 0) && (r.lo != 0.0) {
163        let hi_sign = r.hi.to_bits() >> 63;
164        let lo_sign = r.lo.to_bits() >> 63;
165        if hi_sign == lo_sign {
166            r_bits = r_bits.wrapping_add(1);
167        } else if (r_bits & FRACTION_MASK) > 0 {
168            r_bits = r_bits.wrapping_sub(1);
169        }
170    }
171
172    f64::from_bits(r_bits)
173}
174
175/// Power function
176///
177/// Max found ULP 0.5
178#[inline]
179pub fn f_powf(x: f32, y: f32) -> f32 {
180    let mut x_u = x.to_bits();
181    let x_abs = x_u & 0x7fff_ffff;
182    let mut y = y;
183    let y_u = y.to_bits();
184    let y_abs = y_u & 0x7fff_ffff;
185    let mut x = x;
186
187    if ((y_abs & 0x0007_ffff) == 0) || (y_abs > 0x4f170000) {
188        // y is signaling NaN
189        if x.is_nan() || y.is_nan() {
190            if y.abs() == 0. {
191                return 1.;
192            }
193            if x == 1. {
194                return 1.;
195            }
196            return f32::NAN;
197        }
198
199        // Exceptional exponents.
200        if y == 0.0 {
201            return 1.0;
202        }
203
204        match y_abs {
205            0x7f80_0000 => {
206                if x_abs > 0x7f80_0000 {
207                    // pow(NaN, +-Inf) = NaN
208                    return x;
209                }
210                if x_abs == 0x3f80_0000 {
211                    // pow(+-1, +-Inf) = 1.0f
212                    return 1.0;
213                }
214                if x == 0.0 && y_u == 0xff80_0000 {
215                    // pow(+-0, -Inf) = +inf and raise FE_DIVBYZERO
216                    return f32::INFINITY;
217                }
218                // pow (|x| < 1, -inf) = +inf
219                // pow (|x| < 1, +inf) = 0.0f
220                // pow (|x| > 1, -inf) = 0.0f
221                // pow (|x| > 1, +inf) = +inf
222                return if (x_abs < 0x3f80_0000) == (y_u == 0xff80_0000) {
223                    f32::INFINITY
224                } else {
225                    0.
226                };
227            }
228            _ => {
229                match y_u {
230                    0x3f00_0000 => {
231                        // pow(x, 1/2) = sqrt(x)
232                        if x == 0.0 || x_u == 0xff80_0000 {
233                            // pow(-0, 1/2) = +0
234                            // pow(-inf, 1/2) = +inf
235                            // Make sure it is correct for FTZ/DAZ.
236                            return x * x;
237                        }
238                        let r = x.sqrt();
239                        return if r.to_bits() != 0x8000_0000 { r } else { 0.0 };
240                    }
241                    0x3f80_0000 => {
242                        return x;
243                    } // y = 1.0f
244                    0x4000_0000 => return x * x, // y = 2.0f
245                    _ => {
246                        let is_int = is_integerf(y);
247                        if is_int && (y_u > 0x4000_0000) && (y_u <= 0x41c0_0000) {
248                            // Check for exact cases when 2 < y < 25 and y is an integer.
249                            let mut msb: i32 = if x_abs == 0 {
250                                32 - 2
251                            } else {
252                                x_abs.leading_zeros() as i32
253                            };
254                            msb = if msb > 8 { msb } else { 8 };
255                            let mut lsb: i32 = if x_abs == 0 {
256                                0
257                            } else {
258                                x_abs.trailing_zeros() as i32
259                            };
260                            lsb = if lsb > 23 { 23 } else { lsb };
261                            let extra_bits: i32 = 32 - 2 - lsb - msb;
262                            let iter = y as i32;
263
264                            if extra_bits * iter <= 23 + 2 {
265                                // The result is either exact or exactly half-way.
266                                // But it is exactly representable in double precision.
267                                let x_d = x as f64;
268                                let mut result = x_d;
269                                for _ in 1..iter {
270                                    result *= x_d;
271                                }
272                                return result as f32;
273                            }
274                        }
275
276                        if y_abs > 0x4f17_0000 {
277                            // if y is NaN
278                            if y_abs > 0x7f80_0000 {
279                                if x_u == 0x3f80_0000 {
280                                    // x = 1.0f
281                                    // pow(1, NaN) = 1
282                                    return 1.0;
283                                }
284                                // pow(x, NaN) = NaN
285                                return y;
286                            }
287                            // x^y will be overflow / underflow in single precision.  Set y to a
288                            // large enough exponent but not too large, so that the computations
289                            // won't be overflow in double precision.
290                            y = f32::from_bits((y_u & 0x8000_0000).wrapping_add(0x4f800000u32));
291                        }
292                    }
293                }
294            }
295        }
296    }
297
298    const E_BIAS: u32 = (1u32 << (8 - 1u32)) - 1u32;
299    let mut ex = -(E_BIAS as i32);
300    let mut sign: u64 = 0;
301
302    if ((x_u & 0x801f_ffffu32) == 0) || x_u >= 0x7f80_0000u32 || x_u < 0x0080_0000u32 {
303        if x.is_nan() {
304            return f32::NAN;
305        }
306
307        if x_u == 0x3f80_0000 {
308            return 1.;
309        }
310
311        let x_is_neg = x.to_bits() > 0x8000_0000;
312
313        if x == 0.0 {
314            let out_is_neg = x_is_neg && is_odd_integerf(f32::from_bits(y_u));
315            if y_u > 0x8000_0000u32 {
316                // pow(0, negative number) = inf
317                return if x_is_neg {
318                    f32::NEG_INFINITY
319                } else {
320                    f32::INFINITY
321                };
322            }
323            // pow(0, positive number) = 0
324            return if out_is_neg { -0.0 } else { 0.0 };
325        }
326
327        if x_abs == 0x7f80_0000u32 {
328            // x = +-Inf
329            let out_is_neg = x_is_neg && is_odd_integerf(f32::from_bits(y_u));
330            if y_u >= 0x7fff_ffff {
331                return if out_is_neg { -0.0 } else { 0.0 };
332            }
333            return if out_is_neg {
334                f32::NEG_INFINITY
335            } else {
336                f32::INFINITY
337            };
338        }
339
340        if x_abs > 0x7f80_0000 {
341            // x is NaN.
342            // pow (aNaN, 0) is already taken care above.
343            return x;
344        }
345
346        // Normalize denormal inputs.
347        if x_abs < 0x0080_0000u32 {
348            ex = ex.wrapping_sub(64);
349            x *= f32::from_bits(0x5f800000);
350        }
351
352        // x is finite and negative, and y is a finite integer.
353        if x.is_sign_negative() {
354            if is_integerf(y) {
355                x = -x;
356                if is_odd_integerf(y) {
357                    sign = 0x8000_0000_0000_0000u64;
358                }
359            } else {
360                // pow( negative, non-integer ) = NaN
361                return f32::NAN;
362            }
363        }
364    }
365
366    // x^y = 2^( y * log2(x) )
367    //     = 2^( y * ( e_x + log2(m_x) ) )
368    // First we compute log2(x) = e_x + log2(m_x)
369    x_u = x.to_bits();
370
371    // Extract exponent field of x.
372    ex = ex.wrapping_add((x_u >> 23) as i32);
373    let e_x = ex as f64;
374    // Use the highest 7 fractional bits of m_x as the index for look up tables.
375    let x_mant = x_u & ((1u32 << 23) - 1);
376    let idx_x = (x_mant >> (23 - 7)) as i32;
377    // Add the hidden bit to the mantissa.
378    // 1 <= m_x < 2
379    let m_x = f32::from_bits(x_mant | 0x3f800000);
380
381    // Reduced argument for log2(m_x):
382    //   dx = r * m_x - 1.
383    // The computation is exact, and -2^-8 <= dx < 2^-7.
384    // Then m_x = (1 + dx) / r, and
385    //   log2(m_x) = log2( (1 + dx) / r )
386    //             = log2(1 + dx) - log2(r).
387
388    let dx;
389    #[cfg(any(
390        all(
391            any(target_arch = "x86", target_arch = "x86_64"),
392            target_feature = "fma"
393        ),
394        all(target_arch = "aarch64", target_feature = "neon")
395    ))]
396    {
397        use crate::logs::LOG_REDUCTION_F32;
398        dx = f_fmlaf(
399            m_x,
400            f32::from_bits(LOG_REDUCTION_F32.0[idx_x as usize]),
401            -1.0,
402        ) as f64; // Exact.
403    }
404    #[cfg(not(any(
405        all(
406            any(target_arch = "x86", target_arch = "x86_64"),
407            target_feature = "fma"
408        ),
409        all(target_arch = "aarch64", target_feature = "neon")
410    )))]
411    {
412        use crate::logs::LOG_RANGE_REDUCTION;
413        dx = f_fmla(
414            m_x as f64,
415            f64::from_bits(LOG_RANGE_REDUCTION[idx_x as usize]),
416            -1.0,
417        ); // Exact
418    }
419
420    // Degree-5 polynomial approximation:
421    //   dx * P(dx) ~ log2(1 + dx)
422    // Generated by Sollya with:
423    // > P = fpminimax(log2(1 + x)/x, 5, [|D...|], [-2^-8, 2^-7]);
424    // > dirtyinfnorm(log2(1 + x)/x - P, [-2^-8, 2^-7]);
425    //   0x1.653...p-52
426    const COEFFS: [u64; 6] = [
427        0x3ff71547652b82fe,
428        0xbfe71547652b7a07,
429        0x3fdec709dc458db1,
430        0xbfd715479c2266c9,
431        0x3fd2776ae1ddf8f0,
432        0xbfce7b2178870157,
433    ];
434
435    let dx2 = dx * dx; // Exact
436    let c0 = f_fmla(dx, f64::from_bits(COEFFS[1]), f64::from_bits(COEFFS[0]));
437    let c1 = f_fmla(dx, f64::from_bits(COEFFS[3]), f64::from_bits(COEFFS[2]));
438    let c2 = f_fmla(dx, f64::from_bits(COEFFS[5]), f64::from_bits(COEFFS[4]));
439
440    let p = f_polyeval3(dx2, c0, c1, c2);
441
442    // s = e_x - log2(r) + dx * P(dx)
443    // Approximation errors:
444    //   |log2(x) - s| < ulp(e_x) + (bounds on dx) * (error bounds of P(dx))
445    //                 = ulp(e_x) + 2^-7 * 2^-51
446    //                 < 2^8 * 2^-52 + 2^-7 * 2^-43
447    //                 ~ 2^-44 + 2^-50
448    let s = f_fmla(dx, p, f64::from_bits(LOG2_R[idx_x as usize]) + e_x);
449
450    // To compute 2^(y * log2(x)), we break the exponent into 3 parts:
451    //   y * log(2) = hi + mid + lo, where
452    //   hi is an integer
453    //   mid * 2^6 is an integer
454    //   |lo| <= 2^-7
455    // Then:
456    //   x^y = 2^(y * log2(x)) = 2^hi * 2^mid * 2^lo,
457    // In which 2^mid is obtained from a look-up table of size 2^6 = 64 elements,
458    // and 2^lo ~ 1 + lo * P(lo).
459    // Thus, we have:
460    //   hi + mid = 2^-6 * round( 2^6 * y * log2(x) )
461    // If we restrict the output such that |hi| < 150, (hi + mid) uses (8 + 6)
462    // bits, hence, if we use double precision to perform
463    //   round( 2^6 * y * log2(x))
464    // the lo part is bounded by 2^-7 + 2^(-(52 - 14)) = 2^-7 + 2^-38
465
466    // In the following computations:
467    //   y6  = 2^6 * y
468    //   hm  = 2^6 * (hi + mid) = round(2^6 * y * log2(x)) ~ round(y6 * s)
469    //   lo6 = 2^6 * lo = 2^6 * (y - (hi + mid)) = y6 * log2(x) - hm.
470    let y6 = (y * f32::from_bits(0x42800000)) as f64; // Exact.
471    let hm = (s * y6).round_finite();
472
473    // let log2_rr = LOG2_R2_DD[idx_x as usize];
474
475    // // lo6 = 2^6 * lo.
476    // let lo6_hi = f_fmla(y6, e_x + f64::from_bits(log2_rr.1), -hm); // Exact
477    // // Error bounds:
478    // //   | (y*log2(x) - hm * 2^-6 - lo) / y| < err(dx * p) + err(LOG2_R_DD.lo)
479    // //                                       < 2^-51 + 2^-75
480    // let lo6 = f_fmla(y6, f_fmla(dx, p, f64::from_bits(log2_rr.0)), lo6_hi);
481
482    // lo6 = 2^6 * lo.
483    let lo6_hi = f_fmla(y6, e_x + f64::from_bits(LOG2_R_TD[idx_x as usize].2), -hm); // Exact
484    // Error bounds:
485    //   | (y*log2(x) - hm * 2^-6 - lo) / y| < err(dx * p) + err(LOG2_R_DD.lo)
486    //                                       < 2^-51 + 2^-75
487    let lo6 = f_fmla(
488        y6,
489        f_fmla(dx, p, f64::from_bits(LOG2_R_TD[idx_x as usize].1)),
490        lo6_hi,
491    );
492
493    // |2^(hi + mid) - exp2_hi_mid| <= ulp(exp2_hi_mid) / 2
494    // Clamp the exponent part into smaller range that fits double precision.
495    // For those exponents that are out of range, the final conversion will round
496    // them correctly to inf/max float or 0/min float accordingly.
497    let mut hm_i = unsafe { hm.to_int_unchecked::<i64>() };
498    hm_i = if hm_i > (1i64 << 15) {
499        1 << 15
500    } else if hm_i < (-(1i64 << 15)) {
501        -(1 << 15)
502    } else {
503        hm_i
504    };
505
506    let idx_y = hm_i & 0x3f;
507
508    // 2^hi
509    let exp_hi_i = (hm_i >> 6).wrapping_shl(52);
510    // 2^mid
511    let exp_mid_i = EXP2_MID1[idx_y as usize].1;
512    // (-1)^sign * 2^hi * 2^mid
513    // Error <= 2^hi * 2^-53
514    let exp2_hi_mid_i = (exp_hi_i.wrapping_add(exp_mid_i as i64) as u64).wrapping_add(sign);
515    let exp2_hi_mid = f64::from_bits(exp2_hi_mid_i);
516
517    // Degree-5 polynomial approximation P(lo6) ~ 2^(lo6 / 2^6) = 2^(lo).
518    // Generated by Sollya with:
519    // > P = fpminimax(2^(x/64), 5, [|1, D...|], [-2^-1, 2^-1]);
520    // > dirtyinfnorm(2^(x/64) - P, [-0.5, 0.5]);
521    // 0x1.a2b77e618f5c4c176fd11b7659016cde5de83cb72p-60
522    const EXP2_COEFFS: [u64; 6] = [
523        0x3ff0000000000000,
524        0x3f862e42fefa39ef,
525        0x3f0ebfbdff82a23a,
526        0x3e8c6b08d7076268,
527        0x3e03b2ad33f8b48b,
528        0x3d75d870c4d84445,
529    ];
530
531    let lo6_sqr = lo6 * lo6;
532    let d0 = f_fmla(
533        lo6,
534        f64::from_bits(EXP2_COEFFS[1]),
535        f64::from_bits(EXP2_COEFFS[0]),
536    );
537    let d1 = f_fmla(
538        lo6,
539        f64::from_bits(EXP2_COEFFS[3]),
540        f64::from_bits(EXP2_COEFFS[2]),
541    );
542    let d2 = f_fmla(
543        lo6,
544        f64::from_bits(EXP2_COEFFS[5]),
545        f64::from_bits(EXP2_COEFFS[4]),
546    );
547    let pp = f_polyeval3(lo6_sqr, d0, d1, d2);
548
549    let r = pp * exp2_hi_mid;
550    let r_u = r.to_bits();
551
552    #[cfg(any(
553        all(
554            any(target_arch = "x86", target_arch = "x86_64"),
555            target_feature = "fma"
556        ),
557        all(target_arch = "aarch64", target_feature = "neon")
558    ))]
559    const ERR: u64 = 64;
560    #[cfg(not(any(
561        all(
562            any(target_arch = "x86", target_arch = "x86_64"),
563            target_feature = "fma"
564        ),
565        all(target_arch = "aarch64", target_feature = "neon")
566    )))]
567    const ERR: u64 = 128;
568
569    let r_upper = f64::from_bits(r_u + ERR) as f32;
570    let r_lower = f64::from_bits(r_u - ERR) as f32;
571    if r_upper == r_lower {
572        return r_upper;
573    }
574
575    // Scale lower part of 2^(hi + mid)
576    let exp2_hi_mid_dd = DoubleDouble {
577        lo: if idx_y != 0 {
578            f64::from_bits((exp_hi_i as u64).wrapping_add(EXP2_MID1[idx_y as usize].0))
579        } else {
580            0.
581        },
582        hi: exp2_hi_mid,
583    };
584
585    let r_dd = powf_dd(idx_x, dx, y6, lo6_hi, exp2_hi_mid_dd);
586    r_dd as f32
587}
588
589/// Dirty fast pow
590#[inline]
591pub fn dirty_powf(d: f32, n: f32) -> f32 {
592    use crate::exponents::dirty_exp2f;
593    use crate::logs::dirty_log2f;
594    let value = d.abs();
595    let lg = dirty_log2f(value);
596    let c = dirty_exp2f(n * lg);
597    if d < 0.0 {
598        let y = n as i32;
599        if y % 2 == 0 { c } else { -c }
600    } else {
601        c
602    }
603}
604
605#[cfg(test)]
606mod tests {
607    use super::*;
608
609    #[test]
610    fn powf_test() {
611        assert!(
612            (powf(2f32, 3f32) - 8f32).abs() < 1e-6,
613            "Invalid result {}",
614            powf(2f32, 3f32)
615        );
616        assert!(
617            (powf(0.5f32, 2f32) - 0.25f32).abs() < 1e-6,
618            "Invalid result {}",
619            powf(0.5f32, 2f32)
620        );
621    }
622
623    #[test]
624    fn f_powf_test() {
625        assert!(
626            (f_powf(2f32, 3f32) - 8f32).abs() < 1e-6,
627            "Invalid result {}",
628            f_powf(2f32, 3f32)
629        );
630        assert!(
631            (f_powf(0.5f32, 2f32) - 0.25f32).abs() < 1e-6,
632            "Invalid result {}",
633            f_powf(0.5f32, 2f32)
634        );
635        assert_eq!(f_powf(0.5f32, 1.5432f32), 0.34312353);
636        assert_eq!(
637            f_powf(f32::INFINITY, 0.00000000000000000000000000000000038518824),
638            f32::INFINITY
639        );
640        assert_eq!(f_powf(f32::NAN, 0.0), 1.);
641        assert_eq!(f_powf(1., f32::NAN), 1.);
642    }
643
644    #[test]
645    fn dirty_powf_test() {
646        assert!(
647            (dirty_powf(2f32, 3f32) - 8f32).abs() < 1e-6,
648            "Invalid result {}",
649            dirty_powf(2f32, 3f32)
650        );
651        assert!(
652            (dirty_powf(0.5f32, 2f32) - 0.25f32).abs() < 1e-6,
653            "Invalid result {}",
654            dirty_powf(0.5f32, 2f32)
655        );
656    }
657}