pxfm/square_root/
rsqrt.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 8/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::double_double::DoubleDouble;
30
31/// Computes 1/sqrt(x)
32///
33/// Max ULP 0.5
34pub fn f_rsqrt(x: f64) -> f64 {
35    let ix = x.to_bits();
36    let r: f64 = if ix < 1u64 << 52 {
37        // 0 <= x < 0x1p-1022
38        if ix != 0 {
39            // x <> +0
40            x.sqrt() / x
41        } else {
42            return f64::INFINITY; // case x = +0
43        }
44    } else if ix >= 0x7ffu64 << 52 {
45        // NaN, Inf, x <= 0
46        if ix.wrapping_shl(1) == 0 {
47            return f64::NEG_INFINITY; // x=-0
48        }
49        if ix > 0xfff0000000000000u64 {
50            return x + x;
51        } // -NaN
52        if (ix >> 63) != 0 {
53            // x < 0
54            return f64::NAN;
55        }
56        if ix.wrapping_shl(12) == 0 {
57            return 0.0;
58        } // +/-Inf
59        return x + x; // +NaN
60    } else {
61        // 0x1p-1022 <= x < 2^1024
62        if ix > 0x7fd000000000000u64 {
63            // x > 2^1022
64            // avoid spurious underflow in 1/x
65            (4.0 / x) * (0.25 * x.sqrt())
66        } else {
67            (1.0 / x) * x.sqrt()
68        }
69    };
70
71    #[cfg(any(
72        all(
73            any(target_arch = "x86", target_arch = "x86_64"),
74            target_feature = "fma"
75        ),
76        all(target_arch = "aarch64", target_feature = "neon")
77    ))]
78    {
79        let d2x = DoubleDouble::from_exact_mult(r, x);
80        use crate::common::f_fmla;
81        let h = f_fmla(r, d2x.lo, f_fmla(r, d2x.hi, -1.0));
82        let dr = (r * 0.5) * h;
83        r - dr
84    }
85    #[cfg(not(any(
86        all(
87            any(target_arch = "x86", target_arch = "x86_64"),
88            target_feature = "fma"
89        ),
90        all(target_arch = "aarch64", target_feature = "neon")
91    )))]
92    {
93        use crate::double_double::two_product_compatible;
94        if !two_product_compatible(x) {
95            recip_hard_dyadic(x, r)
96        } else {
97            let d2x = DoubleDouble::from_exact_mult(r, x);
98            let DoubleDouble { hi: h, lo: pr } = DoubleDouble::quick_mult_f64(d2x, r);
99            let DoubleDouble { hi: p, lo: q } = DoubleDouble::from_full_exact_add(-1.0, h);
100            let h = DoubleDouble::from_exact_add(p, pr + q);
101            let dr = DoubleDouble::quick_mult_f64(h, r * 0.5);
102            r - dr.hi - dr.lo
103        }
104    }
105}
106
107#[cfg(not(any(
108    all(
109        any(target_arch = "x86", target_arch = "x86_64"),
110        target_feature = "fma"
111    ),
112    all(target_arch = "aarch64", target_feature = "neon")
113)))]
114#[cold]
115#[inline(never)]
116fn recip_hard_dyadic(x: f64, r: f64) -> f64 {
117    use crate::dyadic_float::{DyadicFloat128, DyadicSign};
118    let dx = DyadicFloat128::new_from_f64(x);
119    let dr = DyadicFloat128::new_from_f64(r);
120    const M_ONE: DyadicFloat128 = DyadicFloat128 {
121        sign: DyadicSign::Neg,
122        exponent: -127,
123        mantissa: 0x80000000_00000000_00000000_00000000_u128,
124    };
125    let d2 = dx * dr;
126    let h = d2 * dr + M_ONE;
127    let mut half_dr = dr;
128    half_dr.exponent -= 1; // * 0.5;
129    let ddr = half_dr * h;
130    (dr - ddr).fast_as_f64()
131}
132
133#[cfg(test)]
134mod tests {
135    use super::*;
136
137    #[test]
138    fn test_rsqrt() {
139        assert_eq!(f_rsqrt(7518001163502890000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.),
140                   0.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000011533172976634968);
141        assert_eq!(f_rsqrt(0.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001984274103353),
142                   709903255474595300000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.);
143        assert_eq!(f_rsqrt(0.0), f64::INFINITY);
144        assert_eq!(f_rsqrt(4.0), 0.5);
145        assert_eq!(f_rsqrt(9.0), 1. / 3.);
146        assert_eq!(f_rsqrt(-0.0), f64::NEG_INFINITY);
147        assert!(f_rsqrt(f64::NAN).is_nan());
148    }
149}