pxfm/square_root/
sqrt1pm1.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 9/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::common::f_fmla;
30use crate::double_double::DoubleDouble;
31use crate::polyeval::f_polyeval6;
32
33#[cold]
34fn sqrt1pm1_near_zero_hard(x: f64) -> f64 {
35    const C: [(u64, u64); 10] = [
36        (0x3af2bd7b2a000000, 0x3fe0000000000000),
37        (0xbb1e743d7fe00000, 0xbfc0000000000000),
38        (0xbc18d267496ae7f8, 0x3fb0000000000000),
39        (0x3c2d7a2a887e1af8, 0xbfa4000000000000),
40        (0xbc3a4965117e40c8, 0x3f9c00000000150b),
41        (0x3c3dc057cb5bf82c, 0xbf9500000000209e),
42        (0x3c02d0756979aa48, 0x3f907ffff9c1db3d),
43        (0xbbe30b3d9b8a1020, 0xbf8acffff10fbdbc),
44        (0xbc16d26d5f2efc04, 0x3f8659830d1bf014),
45        (0x3c212aabd12c483e, 0xbf82ff830f9799c4),
46    ];
47    let mut p = DoubleDouble::quick_mul_f64_add(
48        DoubleDouble::from_bit_pair(C[9]),
49        x,
50        DoubleDouble::from_bit_pair(C[8]),
51    );
52    p = DoubleDouble::quick_mul_f64_add(p, x, DoubleDouble::from_bit_pair(C[7]));
53    p = DoubleDouble::quick_mul_f64_add(p, x, DoubleDouble::from_bit_pair(C[6]));
54    p = DoubleDouble::quick_mul_f64_add(p, x, DoubleDouble::from_bit_pair(C[5]));
55    p = DoubleDouble::quick_mul_f64_add(p, x, DoubleDouble::from_bit_pair(C[4]));
56    p = DoubleDouble::quick_mul_f64_add(p, x, DoubleDouble::from_bit_pair(C[3]));
57    p = DoubleDouble::quick_mul_f64_add(p, x, DoubleDouble::from_bit_pair(C[2]));
58    p = DoubleDouble::quick_mul_f64_add(p, x, DoubleDouble::from_bit_pair(C[1]));
59    p = DoubleDouble::quick_mul_f64_add(p, x, DoubleDouble::from_bit_pair(C[0]));
60    p = DoubleDouble::quick_mult_f64(p, x);
61    p.to_f64()
62}
63
64/// Computes sqrt(1+x) - 1
65pub fn f_sqrt1pm1(x: f64) -> f64 {
66    let ix = x.to_bits();
67    let ux = ix.wrapping_shl(1);
68
69    if ux >= 0x7ffu64 << 53 || ux <= 0x7960000000000000u64 {
70        // |x| == NaN, x == inf, |x| == 0, |x| <= f64::EPSILON
71        if ux == 0 {
72            // |x| == 0
73            return 0.;
74        }
75        if ux.wrapping_shl(11) == 0 {
76            // |x| == Inf
77            return if x.is_sign_negative() {
78                f64::NAN
79            } else {
80                f64::INFINITY
81            };
82        }
83
84        if ux <= 0x7960000000000000u64 {
85            // |x| <= f64::EPSILON
86            #[cfg(any(
87                all(
88                    any(target_arch = "x86", target_arch = "x86_64"),
89                    target_feature = "fma"
90                ),
91                all(target_arch = "aarch64", target_feature = "neon")
92            ))]
93            {
94                return f_fmla(-0.25 * x, 0.25 * x, x * 0.5);
95            }
96            #[cfg(not(any(
97                all(
98                    any(target_arch = "x86", target_arch = "x86_64"),
99                    target_feature = "fma"
100                ),
101                all(target_arch = "aarch64", target_feature = "neon")
102            )))]
103            {
104                use crate::common::dyad_fmla;
105                return if x < 1e-150 {
106                    dyad_fmla(-0.25 * x, 0.25 * x, x * 0.5)
107                } else {
108                    f_fmla(-0.25 * x, 0.25 * x, x * 0.5)
109                };
110            }
111        }
112
113        return f64::NAN; // x == NaN
114    }
115
116    if (ix >> 63) != 0 && ux >= 0x7fe0000000000000u64 {
117        // x < 0 and x >= 1
118        if ux == 0x7fe0000000000000u64 {
119            // x == -1
120            return -1.;
121        }
122        return f64::NAN;
123    }
124
125    if ux <= 0x7f1126e978d4fdf4u64 {
126        // |x| <= 0.012
127
128        // Polynomial generated by Sollya:
129        // d = [-0.012, 0.012];
130        // sqrt1pm1 = sqrt(x + 1) - 1;
131        // Q = fpminimax(sqrt1pm1, [|1,2,3,4,5,6,7|], [|1, D...|], d);
132        const C: [u64; 7] = [
133            0x3fe0000000000000,
134            0xbfc000000000009a,
135            0x3fb0000000000202,
136            0xbfa3fffffdf5a853,
137            0x3f9bfffffb3c75fe,
138            0xbf9500dd6aee8501,
139            0x3f9080d2ece21348,
140        ];
141        let p = f_polyeval6(
142            x,
143            f64::from_bits(C[1]),
144            f64::from_bits(C[2]),
145            f64::from_bits(C[3]),
146            f64::from_bits(C[4]),
147            f64::from_bits(C[5]),
148            f64::from_bits(C[6]),
149        ) * x;
150        let r = DoubleDouble::from_exact_add(f64::from_bits(C[0]), p);
151        let q = DoubleDouble::quick_mult_f64(r, x);
152        let err = f_fmla(
153            q.hi,
154            f64::from_bits(0x3c50000000000000), // 2^-58
155            f64::from_bits(0x3bf0000000000000), // 2^-64
156        );
157        let ub = q.hi + (q.lo + err);
158        let lb = q.hi + (q.lo - err);
159        // Ziv's accuracy test
160        if ub == lb {
161            return ub;
162        }
163        return sqrt1pm1_near_zero_hard(x);
164    }
165    // |x| > 0.012
166
167    #[cfg(any(
168        all(
169            any(target_arch = "x86", target_arch = "x86_64"),
170            target_feature = "fma"
171        ),
172        all(target_arch = "aarch64", target_feature = "neon")
173    ))]
174    {
175        let r = DoubleDouble::from_full_exact_add(x, 1.0);
176        let v_sqrt = r.fast_sqrt();
177        let q = DoubleDouble::full_add_f64(v_sqrt, -1.0);
178        q.to_f64()
179    }
180    #[cfg(not(any(
181        all(
182            any(target_arch = "x86", target_arch = "x86_64"),
183            target_feature = "fma"
184        ),
185        all(target_arch = "aarch64", target_feature = "neon")
186    )))]
187    {
188        use crate::double_double::two_product_compatible;
189        if !two_product_compatible(x) {
190            // x is very big, thus adding + 1 is negligible in ulp terms
191            let r = x + 1.;
192            let v_sqrt = r.sqrt();
193            DoubleDouble::from_full_exact_sub(v_sqrt, 1.0).to_f64()
194        } else {
195            let r = DoubleDouble::from_full_exact_add(x, 1.0);
196            let v_sqrt = r.fast_sqrt();
197            let q = DoubleDouble::full_add_f64(v_sqrt, -1.0);
198            q.to_f64()
199        }
200    }
201}
202
203#[cfg(test)]
204mod tests {
205    use super::*;
206
207    #[test]
208    fn test_sqrt1pm1() {
209        assert_eq!(f_sqrt1pm1(15.), 3.0);
210        assert_eq!(f_sqrt1pm1(24.), 4.0);
211        assert_eq!(f_sqrt1pm1(8.), 2.0);
212        assert_eq!(f_sqrt1pm1(-0.75), -0.5);
213        assert_eq!(f_sqrt1pm1(0.5), 0.22474487139158905);
214        assert_eq!(f_sqrt1pm1(0.0005233212), 0.00026162637581973774);
215        assert_eq!(f_sqrt1pm1(-0.0005233212), -0.0002616948420951896);
216        assert_eq!(f_sqrt1pm1(-0.00000000000000000000005233212), -2.616606e-23);
217        assert_eq!(f_sqrt1pm1(0.00000000000000000000005233212), 2.616606e-23);
218        assert_eq!(f_sqrt1pm1(0.), 0.);
219        assert_eq!(f_sqrt1pm1(-0.), 0.);
220        assert_eq!(f_sqrt1pm1(f64::INFINITY), f64::INFINITY);
221        assert!(f_sqrt1pm1(f64::NEG_INFINITY).is_nan());
222        assert!(f_sqrt1pm1(f64::NAN).is_nan());
223        assert!(f_sqrt1pm1(-1.0001).is_nan());
224    }
225}