1use crate::bits::biased_exponent_f64;
30use crate::common::*;
31use crate::double_double::DoubleDouble;
32use crate::exponents::expf;
33use crate::logf;
34use crate::logs::LOG2_R;
35use crate::polyeval::{f_polyeval3, f_polyeval6, f_polyeval10};
36use crate::pow_tables::EXP2_MID1;
37use crate::powf_tables::{LOG2_R_TD, LOG2_R2_DD, POWF_R2};
38use crate::rounding::CpuRound;
39
40pub const fn powf(d: f32, n: f32) -> f32 {
43 let value = d.abs();
44 let c = expf(n * logf(value));
45 if n == 1. {
46 return d;
47 }
48 if d < 0.0 {
49 let y = n as i32;
50 if y % 2 == 0 { c } else { -c }
51 } else {
52 c
53 }
54}
55
56#[inline]
57const fn larger_exponent(a: f64, b: f64) -> bool {
58 biased_exponent_f64(a) >= biased_exponent_f64(b)
59}
60
61#[cold]
69#[inline(never)]
70fn powf_dd(idx_x: i32, dx: f64, y6: f64, lo6_hi: f64, exp2_hi_mid: DoubleDouble) -> f64 {
71 let idx2 = f_fmla(
77 dx,
78 f64::from_bits(0x40d0000000000000),
79 f64::from_bits(0x4050000000000000),
80 )
81 .cpu_round() as usize;
82 let dx2 = f_fmla(1.0 + dx, f64::from_bits(POWF_R2[idx2]), -1.0); const COEFFS: [(u64, u64); 6] = [
85 (0x3c7777d0ffda25e0, 0x3ff71547652b82fe),
86 (0xbc6777d101cf0a84, 0xbfe71547652b82fe),
87 (0x3c7ce04b5140d867, 0x3fdec709dc3a03fd),
88 (0x3c7137b47e635be5, 0xbfd71547652b82fb),
89 (0xbc5b5a30b3bdb318, 0x3fd2776c516a92a2),
90 (0x3c62d2fbd081e657, 0xbfcec70af1929ca6),
91 ];
92 let dx_dd = DoubleDouble::new(0.0, dx2);
93 let p = f_polyeval6(
94 dx_dd,
95 DoubleDouble::from_bit_pair(COEFFS[0]),
96 DoubleDouble::from_bit_pair(COEFFS[1]),
97 DoubleDouble::from_bit_pair(COEFFS[2]),
98 DoubleDouble::from_bit_pair(COEFFS[3]),
99 DoubleDouble::from_bit_pair(COEFFS[4]),
100 DoubleDouble::from_bit_pair(COEFFS[5]),
101 );
102 let log2_x_lo = DoubleDouble::quick_mult_f64(p, dx2);
104 let log2_r_td = LOG2_R_TD[idx_x as usize];
106 let log2_x_mid = DoubleDouble::new(f64::from_bits(log2_r_td.0), f64::from_bits(log2_r_td.1));
107 let log2_x_m = DoubleDouble::add(DoubleDouble::from_bit_pair(LOG2_R2_DD[idx2]), log2_x_mid);
109 let log2_x = if larger_exponent(log2_x_m.hi, log2_x_lo.hi) {
113 DoubleDouble::add(log2_x_m, log2_x_lo)
114 } else {
115 DoubleDouble::add(log2_x_lo, log2_x_m)
116 };
117 let lo6_hi_dd = DoubleDouble::new(0.0, lo6_hi);
118 let prod = DoubleDouble::quick_mult_f64(log2_x, y6);
120 let lo6 = if larger_exponent(prod.hi, lo6_hi) {
122 DoubleDouble::add(prod, lo6_hi_dd)
123 } else {
124 DoubleDouble::add(lo6_hi_dd, prod)
125 };
126
127 const EXP2_COEFFS: [(u64, u64); 10] = [
128 (0x0000000000000000, 0x3ff0000000000000),
129 (0x3c1abc9e3b398024, 0x3f862e42fefa39ef),
130 (0xbba5e43a5429bddb, 0x3f0ebfbdff82c58f),
131 (0xbb2d33162491268f, 0x3e8c6b08d704a0c0),
132 (0x3a94fb32d240a14e, 0x3e03b2ab6fba4e77),
133 (0x39ee84e916be83e0, 0x3d75d87fe78a6731),
134 (0xb989a447bfddc5e6, 0x3ce430912f86bfb8),
135 (0xb8e31a55719de47f, 0x3c4ffcbfc588ded9),
136 (0xb850ba57164eb36b, 0x3bb62c034beb8339),
137 (0xb7b8483eabd9642d, 0x3b1b5251ff97bee1),
138 ];
139
140 let pp = f_polyeval10(
141 lo6,
142 DoubleDouble::from_bit_pair(EXP2_COEFFS[0]),
143 DoubleDouble::from_bit_pair(EXP2_COEFFS[1]),
144 DoubleDouble::from_bit_pair(EXP2_COEFFS[2]),
145 DoubleDouble::from_bit_pair(EXP2_COEFFS[3]),
146 DoubleDouble::from_bit_pair(EXP2_COEFFS[4]),
147 DoubleDouble::from_bit_pair(EXP2_COEFFS[5]),
148 DoubleDouble::from_bit_pair(EXP2_COEFFS[6]),
149 DoubleDouble::from_bit_pair(EXP2_COEFFS[7]),
150 DoubleDouble::from_bit_pair(EXP2_COEFFS[8]),
151 DoubleDouble::from_bit_pair(EXP2_COEFFS[9]),
152 );
153 let rr = DoubleDouble::quick_mult(exp2_hi_mid, pp);
154
155 let r = DoubleDouble::from_exact_add(rr.hi, rr.lo);
157
158 const FRACTION_MASK: u64 = (1u64 << 52) - 1;
159
160 let mut r_bits = r.hi.to_bits();
161 if ((r_bits & 0xfff_ffff) == 0) && (r.lo != 0.0) {
162 let hi_sign = r.hi.to_bits() >> 63;
163 let lo_sign = r.lo.to_bits() >> 63;
164 if hi_sign == lo_sign {
165 r_bits = r_bits.wrapping_add(1);
166 } else if (r_bits & FRACTION_MASK) > 0 {
167 r_bits = r_bits.wrapping_sub(1);
168 }
169 }
170
171 f64::from_bits(r_bits)
172}
173
174pub fn f_powf(x: f32, y: f32) -> f32 {
178 let mut x_u = x.to_bits();
179 let x_abs = x_u & 0x7fff_ffff;
180 let mut y = y;
181 let y_u = y.to_bits();
182 let y_abs = y_u & 0x7fff_ffff;
183 let mut x = x;
184
185 if ((y_abs & 0x0007_ffff) == 0) || (y_abs > 0x4f170000) {
186 if x.is_nan() || y.is_nan() {
188 if y.abs() == 0. {
189 return 1.;
190 }
191 if x == 1. {
192 return 1.;
193 }
194 return f32::NAN;
195 }
196
197 if y == 0.0 {
199 return 1.0;
200 }
201
202 match y_abs {
203 0x7f80_0000 => {
204 if x_abs > 0x7f80_0000 {
205 return x;
207 }
208 if x_abs == 0x3f80_0000 {
209 return 1.0;
211 }
212 if x == 0.0 && y_u == 0xff80_0000 {
213 return f32::INFINITY;
215 }
216 return if (x_abs < 0x3f80_0000) == (y_u == 0xff80_0000) {
221 f32::INFINITY
222 } else {
223 0.
224 };
225 }
226 _ => {
227 match y_u {
228 0x3f00_0000 => {
229 if x == 0.0 || x_u == 0xff80_0000 {
231 return x * x;
235 }
236 let r = x.sqrt();
237 return if r.to_bits() != 0x8000_0000 { r } else { 0.0 };
238 }
239 0x3f80_0000 => {
240 return x;
241 } 0x4000_0000 => return x * x, _ => {
244 let is_int = is_integerf(y);
245 if is_int && (y_u > 0x4000_0000) && (y_u <= 0x41c0_0000) {
246 let mut msb: i32 = if x_abs == 0 {
248 32 - 2
249 } else {
250 x_abs.leading_zeros() as i32
251 };
252 msb = if msb > 8 { msb } else { 8 };
253 let mut lsb: i32 = if x_abs == 0 {
254 0
255 } else {
256 x_abs.trailing_zeros() as i32
257 };
258 lsb = if lsb > 23 { 23 } else { lsb };
259 let extra_bits: i32 = 32 - 2 - lsb - msb;
260 let iter = y as i32;
261
262 if extra_bits * iter <= 23 + 2 {
263 let x_d = x as f64;
266 let mut result = x_d;
267 for _ in 1..iter {
268 result *= x_d;
269 }
270 return result as f32;
271 }
272 }
273
274 if y_abs > 0x4f17_0000 {
275 if y_abs > 0x7f80_0000 {
277 if x_u == 0x3f80_0000 {
278 return 1.0;
281 }
282 return y;
284 }
285 y = f32::from_bits((y_u & 0x8000_0000).wrapping_add(0x4f800000u32));
289 }
290 }
291 }
292 }
293 }
294 }
295
296 const E_BIAS: u32 = (1u32 << (8 - 1u32)) - 1u32;
297 let mut ex = -(E_BIAS as i32);
298 let mut sign: u64 = 0;
299
300 if ((x_u & 0x801f_ffffu32) == 0) || x_u >= 0x7f80_0000u32 || x_u < 0x0080_0000u32 {
301 if x.is_nan() {
302 return f32::NAN;
303 }
304
305 if x_u == 0x3f80_0000 {
306 return 1.;
307 }
308
309 let x_is_neg = x.to_bits() > 0x8000_0000;
310
311 if x == 0.0 {
312 let out_is_neg = x_is_neg && is_odd_integerf(f32::from_bits(y_u));
313 if y_u > 0x8000_0000u32 {
314 return if x_is_neg {
316 f32::NEG_INFINITY
317 } else {
318 f32::INFINITY
319 };
320 }
321 return if out_is_neg { -0.0 } else { 0.0 };
323 }
324
325 if x_abs == 0x7f80_0000u32 {
326 let out_is_neg = x_is_neg && is_odd_integerf(f32::from_bits(y_u));
328 if y_u >= 0x7fff_ffff {
329 return if out_is_neg { -0.0 } else { 0.0 };
330 }
331 return if out_is_neg {
332 f32::NEG_INFINITY
333 } else {
334 f32::INFINITY
335 };
336 }
337
338 if x_abs > 0x7f80_0000 {
339 return x;
342 }
343
344 if x_abs < 0x0080_0000u32 {
346 ex = ex.wrapping_sub(64);
347 x *= f32::from_bits(0x5f800000);
348 }
349
350 if x.is_sign_negative() {
352 if is_integerf(y) {
353 x = -x;
354 if is_odd_integerf(y) {
355 sign = 0x8000_0000_0000_0000u64;
356 }
357 } else {
358 return f32::NAN;
360 }
361 }
362 }
363
364 x_u = x.to_bits();
368
369 ex = ex.wrapping_add((x_u >> 23) as i32);
371 let e_x = ex as f64;
372 let x_mant = x_u & ((1u32 << 23) - 1);
374 let idx_x = (x_mant >> (23 - 7)) as i32;
375 let m_x = f32::from_bits(x_mant | 0x3f800000);
378
379 let dx;
387 #[cfg(any(
388 all(
389 any(target_arch = "x86", target_arch = "x86_64"),
390 target_feature = "fma"
391 ),
392 target_arch = "aarch64"
393 ))]
394 {
395 use crate::logs::LOG_REDUCTION_F32;
396 dx = f_fmlaf(
397 m_x,
398 f32::from_bits(LOG_REDUCTION_F32.0[idx_x as usize]),
399 -1.0,
400 ) as f64; }
402 #[cfg(not(any(
403 all(
404 any(target_arch = "x86", target_arch = "x86_64"),
405 target_feature = "fma"
406 ),
407 target_arch = "aarch64"
408 )))]
409 {
410 use crate::logs::LOG_RANGE_REDUCTION;
411 dx = f_fmla(
412 m_x as f64,
413 f64::from_bits(LOG_RANGE_REDUCTION[idx_x as usize]),
414 -1.0,
415 ); }
417
418 const COEFFS: [u64; 6] = [
425 0x3ff71547652b82fe,
426 0xbfe71547652b7a07,
427 0x3fdec709dc458db1,
428 0xbfd715479c2266c9,
429 0x3fd2776ae1ddf8f0,
430 0xbfce7b2178870157,
431 ];
432
433 let dx2 = dx * dx; let c0 = f_fmla(dx, f64::from_bits(COEFFS[1]), f64::from_bits(COEFFS[0]));
435 let c1 = f_fmla(dx, f64::from_bits(COEFFS[3]), f64::from_bits(COEFFS[2]));
436 let c2 = f_fmla(dx, f64::from_bits(COEFFS[5]), f64::from_bits(COEFFS[4]));
437
438 let p = f_polyeval3(dx2, c0, c1, c2);
439
440 let s = f_fmla(dx, p, f64::from_bits(LOG2_R[idx_x as usize]) + e_x);
447
448 let y6 = (y * f32::from_bits(0x42800000)) as f64; let hm = (s * y6).cpu_round();
470
471 let lo6_hi = f_fmla(y6, e_x + f64::from_bits(LOG2_R_TD[idx_x as usize].2), -hm); let lo6 = f_fmla(
486 y6,
487 f_fmla(dx, p, f64::from_bits(LOG2_R_TD[idx_x as usize].1)),
488 lo6_hi,
489 );
490
491 let mut hm_i = unsafe { hm.to_int_unchecked::<i64>() };
496 hm_i = if hm_i > (1i64 << 15) {
497 1 << 15
498 } else if hm_i < (-(1i64 << 15)) {
499 -(1 << 15)
500 } else {
501 hm_i
502 };
503
504 let idx_y = hm_i & 0x3f;
505
506 let exp_hi_i = (hm_i >> 6).wrapping_shl(52);
508 let exp_mid_i = EXP2_MID1[idx_y as usize].1;
510 let exp2_hi_mid_i = (exp_hi_i.wrapping_add(exp_mid_i as i64) as u64).wrapping_add(sign);
513 let exp2_hi_mid = f64::from_bits(exp2_hi_mid_i);
514
515 const EXP2_COEFFS: [u64; 6] = [
521 0x3ff0000000000000,
522 0x3f862e42fefa39ef,
523 0x3f0ebfbdff82a23a,
524 0x3e8c6b08d7076268,
525 0x3e03b2ad33f8b48b,
526 0x3d75d870c4d84445,
527 ];
528
529 let lo6_sqr = lo6 * lo6;
530 let d0 = f_fmla(
531 lo6,
532 f64::from_bits(EXP2_COEFFS[1]),
533 f64::from_bits(EXP2_COEFFS[0]),
534 );
535 let d1 = f_fmla(
536 lo6,
537 f64::from_bits(EXP2_COEFFS[3]),
538 f64::from_bits(EXP2_COEFFS[2]),
539 );
540 let d2 = f_fmla(
541 lo6,
542 f64::from_bits(EXP2_COEFFS[5]),
543 f64::from_bits(EXP2_COEFFS[4]),
544 );
545 let pp = f_polyeval3(lo6_sqr, d0, d1, d2);
546
547 let r = pp * exp2_hi_mid;
548 let r_u = r.to_bits();
549
550 #[cfg(any(
551 all(
552 any(target_arch = "x86", target_arch = "x86_64"),
553 target_feature = "fma"
554 ),
555 target_arch = "aarch64"
556 ))]
557 const ERR: u64 = 64;
558 #[cfg(not(any(
559 all(
560 any(target_arch = "x86", target_arch = "x86_64"),
561 target_feature = "fma"
562 ),
563 target_arch = "aarch64"
564 )))]
565 const ERR: u64 = 128;
566
567 let r_upper = f64::from_bits(r_u + ERR) as f32;
568 let r_lower = f64::from_bits(r_u - ERR) as f32;
569 if r_upper == r_lower {
570 return r_upper;
571 }
572
573 let exp2_hi_mid_dd = DoubleDouble {
575 lo: if idx_y != 0 {
576 f64::from_bits((exp_hi_i as u64).wrapping_add(EXP2_MID1[idx_y as usize].0))
577 } else {
578 0.
579 },
580 hi: exp2_hi_mid,
581 };
582
583 let r_dd = powf_dd(idx_x, dx, y6, lo6_hi, exp2_hi_mid_dd);
584 r_dd as f32
585}
586
587#[inline]
589pub fn dirty_powf(d: f32, n: f32) -> f32 {
590 use crate::exponents::dirty_exp2f;
591 use crate::logs::dirty_log2f;
592 let value = d.abs();
593 let lg = dirty_log2f(value);
594 let c = dirty_exp2f(n * lg);
595 if d < 0.0 {
596 let y = n as i32;
597 if y % 2 == 0 { c } else { -c }
598 } else {
599 c
600 }
601}
602
603#[cfg(test)]
604mod tests {
605 use super::*;
606
607 #[test]
608 fn powf_test() {
609 assert!(
610 (powf(2f32, 3f32) - 8f32).abs() < 1e-6,
611 "Invalid result {}",
612 powf(2f32, 3f32)
613 );
614 assert!(
615 (powf(0.5f32, 2f32) - 0.25f32).abs() < 1e-6,
616 "Invalid result {}",
617 powf(0.5f32, 2f32)
618 );
619 }
620
621 #[test]
622 fn f_powf_test() {
623 assert!(
624 (f_powf(2f32, 3f32) - 8f32).abs() < 1e-6,
625 "Invalid result {}",
626 f_powf(2f32, 3f32)
627 );
628 assert!(
629 (f_powf(0.5f32, 2f32) - 0.25f32).abs() < 1e-6,
630 "Invalid result {}",
631 f_powf(0.5f32, 2f32)
632 );
633 assert_eq!(f_powf(0.5f32, 1.5432f32), 0.34312353);
634 assert_eq!(
635 f_powf(f32::INFINITY, 0.00000000000000000000000000000000038518824),
636 f32::INFINITY
637 );
638 assert_eq!(f_powf(f32::NAN, 0.0), 1.);
639 assert_eq!(f_powf(1., f32::NAN), 1.);
640 }
641
642 #[test]
643 fn dirty_powf_test() {
644 assert!(
645 (dirty_powf(2f32, 3f32) - 8f32).abs() < 1e-6,
646 "Invalid result {}",
647 dirty_powf(2f32, 3f32)
648 );
649 assert!(
650 (dirty_powf(0.5f32, 2f32) - 0.25f32).abs() < 1e-6,
651 "Invalid result {}",
652 dirty_powf(0.5f32, 2f32)
653 );
654 }
655}