pxfm/
dyadic_float.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 6/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::bits::EXP_MASK;
30use crate::common::f_fmla;
31use std::ops::{Add, Mul, Sub};
32
33#[repr(u8)]
34#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Debug)]
35pub(crate) enum DyadicSign {
36    Pos = 0,
37    Neg = 1,
38}
39
40impl DyadicSign {
41    #[inline]
42    pub(crate) fn negate(self) -> Self {
43        match self {
44            DyadicSign::Pos => DyadicSign::Neg,
45            DyadicSign::Neg => DyadicSign::Pos,
46        }
47    }
48
49    #[inline]
50    pub(crate) const fn to_bit(self) -> u8 {
51        match self {
52            DyadicSign::Pos => 0,
53            DyadicSign::Neg => 1,
54        }
55    }
56
57    #[inline]
58    pub(crate) const fn mult(self, rhs: Self) -> Self {
59        if (self as u8) ^ (rhs as u8) != 0 {
60            DyadicSign::Neg
61        } else {
62            DyadicSign::Pos
63        }
64    }
65}
66
67const BITS: u32 = 128;
68
69#[derive(Copy, Clone, Debug)]
70pub(crate) struct DyadicFloat128 {
71    pub(crate) sign: DyadicSign,
72    pub(crate) exponent: i16,
73    pub(crate) mantissa: u128,
74}
75
76#[inline]
77pub(crate) const fn f64_from_parts(sign: DyadicSign, exp: u64, mantissa: u64) -> f64 {
78    let r_sign = (if sign.to_bit() == 0 { 0u64 } else { 1u64 }).wrapping_shl(63);
79    let r_exp = exp.wrapping_shl(52);
80    f64::from_bits(r_sign | r_exp | mantissa)
81}
82
83#[inline]
84pub(crate) fn mulhi_u128(a: u128, b: u128) -> u128 {
85    let a_lo = a as u64 as u128;
86    let a_hi = (a >> 64) as u64 as u128;
87    let b_lo = b as u64 as u128;
88    let b_hi = (b >> 64) as u64 as u128;
89
90    let lo_lo = a_lo * b_lo;
91    let lo_hi = a_lo * b_hi;
92    let hi_lo = a_hi * b_lo;
93    let hi_hi = a_hi * b_hi;
94
95    let carry = (lo_lo >> 64)
96        .wrapping_add(lo_hi & 0xffff_ffff_ffff_ffff)
97        .wrapping_add(hi_lo & 0xffff_ffff_ffff_ffff);
98    let mid = (lo_hi >> 64)
99        .wrapping_add(hi_lo >> 64)
100        .wrapping_add(carry >> 64);
101
102    hi_hi.wrapping_add(mid)
103}
104
105#[inline]
106const fn explicit_exponent(x: f64) -> i16 {
107    let exp = ((x.to_bits() >> 52) & ((1u64 << 11) - 1u64)) as i16 - 1023;
108    if x == 0. {
109        return 0;
110    } else if x.is_subnormal() {
111        const EXP_BIAS: u64 = (1u64 << (11 - 1u64)) - 1u64;
112        return 1i16 - EXP_BIAS as i16;
113    }
114    exp
115}
116
117#[inline]
118const fn explicit_mantissa(x: f64) -> u64 {
119    const MASK: u64 = (1u64 << 52) - 1;
120    let sig_bits = x.to_bits() & MASK;
121    if x.is_subnormal() || x == 0. {
122        return sig_bits;
123    }
124    (1u64 << 52) | sig_bits
125}
126
127impl DyadicFloat128 {
128    #[inline]
129    pub(crate) const fn zero() -> Self {
130        Self {
131            sign: DyadicSign::Pos,
132            exponent: 0,
133            mantissa: 0,
134        }
135    }
136
137    #[inline]
138    pub(crate) const fn new_from_f64(x: f64) -> Self {
139        let sign = if x.is_sign_negative() {
140            DyadicSign::Neg
141        } else {
142            DyadicSign::Pos
143        };
144        let exponent = explicit_exponent(x) - 52;
145        let mantissa = explicit_mantissa(x) as u128;
146        let mut new_val = Self {
147            sign,
148            exponent,
149            mantissa,
150        };
151        new_val.normalize();
152        new_val
153    }
154
155    #[inline]
156    pub(crate) fn new(sign: DyadicSign, exponent: i16, mantissa: u128) -> Self {
157        let mut new_item = DyadicFloat128 {
158            sign,
159            exponent,
160            mantissa,
161        };
162        new_item.normalize();
163        new_item
164    }
165
166    #[inline]
167    pub(crate) fn accurate_reciprocal(a: f64) -> Self {
168        let mut r = DyadicFloat128::new_from_f64(4.0 / a); /* accurate to about 53 bits */
169        r.exponent -= 2;
170        /* we use Newton's iteration: r -> r + r*(1-a*r) */
171        let ba = DyadicFloat128::new_from_f64(-a);
172        let mut q = ba * r;
173        const F128_ONE: DyadicFloat128 = DyadicFloat128 {
174            sign: DyadicSign::Pos,
175            exponent: -127,
176            mantissa: 0x8000_0000_0000_0000_0000_0000_0000_0000_u128,
177        };
178        q = F128_ONE + q;
179        q = r * q;
180        r + q
181    }
182
183    #[inline]
184    pub(crate) fn from_div_f64(a: f64, b: f64) -> Self {
185        let reciprocal = DyadicFloat128::accurate_reciprocal(b);
186        let da = DyadicFloat128::new_from_f64(a);
187        reciprocal * da
188    }
189
190    /// Multiply self by integer scalar `b`.
191    /// Returns a new normalized DyadicFloat128.
192    #[inline]
193    pub(crate) fn mul_int64(&self, b: i64) -> DyadicFloat128 {
194        if b == 0 {
195            return DyadicFloat128::zero();
196        }
197
198        let abs_b = b.unsigned_abs();
199        let sign = if (b < 0) ^ (self.sign == DyadicSign::Neg) {
200            DyadicSign::Neg
201        } else {
202            DyadicSign::Pos
203        };
204
205        let mut hi_prod = (self.mantissa >> 64).wrapping_mul(abs_b as u128);
206        let m = hi_prod.leading_zeros();
207        hi_prod <<= m;
208
209        let mut lo_prod = (self.mantissa & 0xffff_ffff_ffff_ffff).wrapping_mul(abs_b as u128);
210        lo_prod = (lo_prod << (m - 1)) >> 63;
211
212        let (mut product, overflow) = hi_prod.overflowing_add(lo_prod);
213
214        let mut result = DyadicFloat128 {
215            sign,
216            exponent: self.exponent + 64 - m as i16,
217            mantissa: product,
218        };
219
220        if overflow {
221            // Overflow means an implicit bit in the 129th place, which we shift down.
222            product += product & 0x1;
223            result.mantissa = (product >> 1) | (1u128 << 127);
224            result.shift_right(1);
225        }
226
227        result.normalize();
228        result
229    }
230
231    #[inline]
232    fn shift_right(&mut self, amount: u32) {
233        if amount < BITS {
234            self.exponent += amount as i16;
235            self.mantissa = self.mantissa.wrapping_shr(amount);
236        } else {
237            self.exponent = 0;
238            self.mantissa = 0;
239        }
240    }
241
242    #[inline]
243    fn shift_left(&mut self, amount: u32) {
244        if amount < BITS {
245            self.exponent -= amount as i16;
246            self.mantissa = self.mantissa.wrapping_shl(amount);
247        } else {
248            self.exponent = 0;
249            self.mantissa = 0;
250        }
251    }
252
253    // Don't forget to call if manually created
254    #[inline]
255    pub(crate) const fn normalize(&mut self) {
256        if self.mantissa != 0 {
257            let shift_length = self.mantissa.leading_zeros();
258            self.exponent -= shift_length as i16;
259            self.mantissa = self.mantissa.wrapping_shl(shift_length);
260        }
261    }
262
263    #[inline]
264    pub(crate) fn negated(&self) -> Self {
265        Self {
266            sign: self.sign.negate(),
267            exponent: self.exponent,
268            mantissa: self.mantissa,
269        }
270    }
271
272    #[inline]
273    pub(crate) fn quick_sub(&self, rhs: &Self) -> Self {
274        self.quick_add(&rhs.negated())
275    }
276
277    #[inline]
278    pub(crate) fn quick_add(&self, rhs: &Self) -> Self {
279        if self.mantissa == 0 {
280            return *rhs;
281        }
282        if rhs.mantissa == 0 {
283            return *self;
284        }
285        let mut a = *self;
286        let mut b = *rhs;
287
288        let exp_diff = a.exponent.wrapping_sub(b.exponent);
289
290        // If exponent difference is too large, b is negligible
291        if exp_diff.abs() >= BITS as i16 {
292            return if a.sign == b.sign {
293                // Adding very small number to large: return a
294                return if a.exponent > b.exponent { a } else { b };
295            } else if a.exponent > b.exponent {
296                a
297            } else {
298                b
299            };
300        }
301
302        // Align exponents
303        if a.exponent > b.exponent {
304            b.shift_right((a.exponent - b.exponent) as u32);
305        } else if b.exponent > a.exponent {
306            a.shift_right((b.exponent - a.exponent) as u32);
307        }
308
309        let mut result = DyadicFloat128::zero();
310
311        if a.sign == b.sign {
312            // Addition
313            result.sign = a.sign;
314            result.exponent = a.exponent;
315            result.mantissa = a.mantissa;
316            let (sum, is_overflow) = result.mantissa.overflowing_add(b.mantissa);
317            result.mantissa = sum;
318            if is_overflow {
319                // Mantissa addition overflow.
320                result.shift_right(1);
321                result.mantissa |= 1u128 << 127;
322            }
323            // Result is already normalized.
324            return result;
325        }
326
327        // Subtraction
328        if a.mantissa >= b.mantissa {
329            result.sign = a.sign;
330            result.exponent = a.exponent;
331            result.mantissa = a.mantissa.wrapping_sub(b.mantissa);
332        } else {
333            result.sign = b.sign;
334            result.exponent = b.exponent;
335            result.mantissa = b.mantissa.wrapping_sub(a.mantissa);
336        }
337
338        result.normalize();
339        result
340    }
341
342    #[inline]
343    pub(crate) fn quick_mul(&self, rhs: &Self) -> Self {
344        let mut result = DyadicFloat128 {
345            sign: if self.sign != rhs.sign {
346                DyadicSign::Neg
347            } else {
348                DyadicSign::Pos
349            },
350            exponent: self.exponent + rhs.exponent + BITS as i16,
351            mantissa: 0,
352        };
353
354        if !(self.mantissa == 0 || rhs.mantissa == 0) {
355            result.mantissa = mulhi_u128(self.mantissa, rhs.mantissa);
356            // Check the leading bit directly, should be faster than using clz in
357            // normalize().
358            if result.mantissa >> 127 == 0 {
359                result.shift_left(1);
360            }
361        } else {
362            result.mantissa = 0;
363        }
364        result
365    }
366
367    #[inline]
368    pub(crate) fn fast_as_f64(&self) -> f64 {
369        if self.mantissa == 0 {
370            return if self.sign == DyadicSign::Pos {
371                0.
372            } else {
373                -0.0
374            };
375        }
376
377        // Assume that it is normalized, and output is also normal.
378        const PRECISION: u32 = 52 + 1;
379
380        // SIG_MASK - FRACTION_MASK
381        const SIG_MASK: u64 = (1u64 << 52) - 1;
382        const FRACTION_MASK: u64 = (1u64 << 52) - 1;
383        const IMPLICIT_MASK: u64 = SIG_MASK - FRACTION_MASK;
384        const EXP_BIAS: u64 = (1u64 << (11 - 1u64)) - 1u64;
385
386        let mut exp_hi = self.exponent as i32 + ((BITS - 1) as i32 + EXP_BIAS as i32);
387
388        if exp_hi > 2 * EXP_BIAS as i32 {
389            // Results overflow.
390            let d_hi = f64_from_parts(self.sign, 2 * EXP_BIAS, IMPLICIT_MASK);
391            // volatile prevents constant propagation that would result in infinity
392            // always being returned no matter the current rounding mode.
393            let two = 2.0f64;
394            let r = two * d_hi;
395            return r;
396        }
397
398        let mut denorm = false;
399        let mut shift = BITS - PRECISION;
400        if exp_hi <= 0 {
401            // Output is denormal.
402            denorm = true;
403            shift = (BITS - PRECISION) + (1 - exp_hi) as u32;
404
405            exp_hi = EXP_BIAS as i32;
406        }
407
408        let exp_lo = exp_hi.wrapping_sub(PRECISION as i32).wrapping_sub(1);
409
410        let m_hi = if shift >= BITS {
411            0
412        } else {
413            self.mantissa >> shift
414        };
415
416        let d_hi = f64_from_parts(
417            self.sign,
418            exp_hi as u64,
419            (m_hi as u64 & SIG_MASK) | IMPLICIT_MASK,
420        );
421
422        let round_mask = if shift > BITS {
423            0
424        } else {
425            1u128.wrapping_shl(shift.wrapping_sub(1))
426        };
427        let sticky_mask = round_mask.wrapping_sub(1u128);
428
429        let round_bit = (self.mantissa & round_mask) != 0;
430        let sticky_bit = (self.mantissa & sticky_mask) != 0;
431        let round_and_sticky = round_bit as i32 * 2 + sticky_bit as i32;
432
433        let d_lo: f64;
434
435        if exp_lo <= 0 {
436            // d_lo is denormal, but the output is normal.
437            let scale_up_exponent = 1 - exp_lo;
438            let scale_up_factor = f64_from_parts(
439                DyadicSign::Pos,
440                EXP_BIAS + scale_up_exponent as u64,
441                IMPLICIT_MASK,
442            );
443            let scale_down_factor = f64_from_parts(
444                DyadicSign::Pos,
445                EXP_BIAS - scale_up_exponent as u64,
446                IMPLICIT_MASK,
447            );
448
449            d_lo = f64_from_parts(
450                self.sign,
451                (exp_lo + scale_up_exponent) as u64,
452                IMPLICIT_MASK,
453            );
454
455            return f_fmla(d_lo, round_and_sticky as f64, d_hi * scale_up_factor)
456                * scale_down_factor;
457        }
458
459        d_lo = f64_from_parts(self.sign, exp_lo as u64, IMPLICIT_MASK);
460
461        // Still correct without FMA instructions if `d_lo` is not underflow.
462        let r = f_fmla(d_lo, round_and_sticky as f64, d_hi);
463
464        if denorm {
465            const SIG_LEN: u64 = 52;
466            // Exponent before rounding is in denormal range, simply clear the
467            // exponent field.
468            let clear_exp: u64 = (exp_hi as u64) << SIG_LEN;
469            let mut r_bits: u64 = r.to_bits() - clear_exp;
470
471            if r_bits & EXP_MASK == 0 {
472                // Output is denormal after rounding, clear the implicit bit for 80-bit
473                // long double.
474                r_bits -= IMPLICIT_MASK;
475            }
476
477            return f64::from_bits(r_bits);
478        }
479
480        r
481    }
482
483    // Approximate reciprocal - given a nonzero `a`, make a good approximation to 1/a.
484    // The method is Newton-Raphson iteration, based on quick_mul.
485    #[inline]
486    pub(crate) fn reciprocal(self) -> DyadicFloat128 {
487        // Computes the reciprocal using Newton-Raphson iteration:
488        // Given an approximation x ≈ 1/a, we refine via:
489        //     x' = x * (2 - a * x)
490        // This squares the error term: if ax ≈ 1 - e, then ax' ≈ 1 - e².
491
492        let guess = 1. / self.fast_as_f64();
493        let mut x = DyadicFloat128::new_from_f64(guess);
494
495        // The constant 2, which we'll need in every iteration
496        let twos = DyadicFloat128 {
497            sign: DyadicSign::Pos,
498            exponent: -126,
499            mantissa: 0x80000000_00000000_00000000_00000000_u128,
500        };
501
502        x = x * (twos - (self * x));
503        x = x * (twos - (self * x));
504        x
505    }
506
507    // // Approximate reciprocal - given a nonzero `a`, make a good approximation to 1/a.
508    // // The method is Newton-Raphson iteration, based on quick_mul.
509    // // *This is very crude guess*
510    // #[inline]
511    // fn approximate_reciprocal(&self) -> DyadicFloat128 {
512    //     // Given an approximation x to 1/a, a better one is x' = x(2-ax).
513    //     //
514    //     // You can derive this by using the Newton-Raphson formula with the function
515    //     // f(x) = 1/x - a. But another way to see that it works is to say: suppose
516    //     // that ax = 1-e for some small error e. Then ax' = ax(2-ax) = (1-e)(1+e) =
517    //     // 1-e^2. So the error in x' is the square of the error in x, i.e. the number
518    //     // of correct bits in x' is double the number in x.
519    //
520    //     // An initial approximation to the reciprocal
521    //     let mut x = DyadicFloat128 {
522    //         sign: DyadicSign::Pos,
523    //         exponent: -32 - self.exponent - BITS as i16,
524    //         mantissa: self.mantissa >> (BITS - 32),
525    //     };
526    //     x.normalize();
527    //
528    //     // The constant 2, which we'll need in every iteration
529    //     let two = DyadicFloat128::new(DyadicSign::Pos, 1, 1);
530    //
531    //     // We expect at least 31 correct bits from our 32-bit starting approximation
532    //     let mut ok_bits = 31usize;
533    //
534    //     // The number of good bits doubles in each iteration, except that rounding
535    //     // errors introduce a little extra each time. Subtract a bit from our
536    //     // accuracy assessment to account for that.
537    //     while ok_bits < BITS as usize {
538    //         x = x * (two - (*self * x));
539    //         ok_bits = 2 * ok_bits - 1;
540    //     }
541    //
542    //     x
543    // }
544}
545
546impl Add<DyadicFloat128> for DyadicFloat128 {
547    type Output = DyadicFloat128;
548    #[inline]
549    fn add(self, rhs: DyadicFloat128) -> Self::Output {
550        self.quick_add(&rhs)
551    }
552}
553
554impl DyadicFloat128 {
555    #[inline]
556    pub(crate) fn biased_exponent(&self) -> i16 {
557        self.exponent + (BITS as i16 - 1)
558    }
559
560    #[inline]
561    pub(crate) fn trunc_to_i64(&self) -> i64 {
562        if self.exponent <= -(BITS as i16) {
563            // Absolute value of x is greater than equal to 0.5 but less than 1.
564            return 0;
565        }
566        let hi = self.mantissa >> 64;
567        let norm_exp = self.biased_exponent();
568        if norm_exp > 63 {
569            return if self.sign == DyadicSign::Neg {
570                i64::MIN
571            } else {
572                i64::MAX
573            };
574        }
575        let r: i64 = (hi >> (63 - norm_exp)) as i64;
576
577        if self.sign == DyadicSign::Neg { -r } else { r }
578    }
579
580    #[inline]
581    pub(crate) fn round_to_nearest(&self) -> DyadicFloat128 {
582        if self.exponent == -(BITS as i16) {
583            // Absolute value of x is greater than equal to 0.5 but less than 1.
584            return DyadicFloat128 {
585                sign: self.sign,
586                exponent: -(BITS as i16 - 1),
587                mantissa: 0x80000000_00000000_00000000_00000000_u128,
588            };
589        }
590        if self.exponent <= -((BITS + 1) as i16) {
591            // Absolute value of x is greater than equal to 0.5 but less than 1.
592            return DyadicFloat128 {
593                sign: self.sign,
594                exponent: 0,
595                mantissa: 0u128,
596            };
597        }
598        const FRACTION_LENGTH: u32 = BITS - 1;
599        let trim_size =
600            (FRACTION_LENGTH as i64).wrapping_sub(self.exponent as i64 + (BITS - 1) as i64) as u128;
601        let half_bit_set =
602            self.mantissa & (1u128.wrapping_shl(trim_size.wrapping_sub(1) as u32)) != 0;
603        let trunc_u: u128 = self
604            .mantissa
605            .wrapping_shr(trim_size as u32)
606            .wrapping_shl(trim_size as u32);
607        if trunc_u == self.mantissa {
608            return *self;
609        }
610
611        let truncated = DyadicFloat128::new(self.sign, self.exponent, trunc_u);
612
613        if !half_bit_set {
614            // Franctional part is less than 0.5 so round value is the
615            // same as the trunc value.
616            truncated
617        } else if self.sign == DyadicSign::Neg {
618            let ones = DyadicFloat128 {
619                sign: DyadicSign::Pos,
620                exponent: -(BITS as i16 - 1),
621                mantissa: 0x8000_0000_0000_0000_0000_0000_0000_0000_u128,
622            };
623            truncated - ones
624        } else {
625            let ones = DyadicFloat128 {
626                sign: DyadicSign::Pos,
627                exponent: -(BITS as i16 - 1),
628                mantissa: 0x8000_0000_0000_0000_0000_0000_0000_0000_u128,
629            };
630            truncated + ones
631        }
632    }
633
634    #[inline]
635    pub(crate) fn round_to_nearest_f64(&self) -> f64 {
636        self.round_to_nearest().fast_as_f64()
637    }
638}
639
640impl Sub<DyadicFloat128> for DyadicFloat128 {
641    type Output = DyadicFloat128;
642    #[inline]
643    fn sub(self, rhs: DyadicFloat128) -> Self::Output {
644        self.quick_sub(&rhs)
645    }
646}
647
648impl Mul<DyadicFloat128> for DyadicFloat128 {
649    type Output = DyadicFloat128;
650    #[inline]
651    fn mul(self, rhs: DyadicFloat128) -> Self::Output {
652        self.quick_mul(&rhs)
653    }
654}
655
656#[cfg(test)]
657mod tests {
658    use super::*;
659
660    #[test]
661    fn test_dyadic_float() {
662        let ones = DyadicFloat128 {
663            sign: DyadicSign::Pos,
664            exponent: -127,
665            mantissa: 0x80000000_00000000_00000000_00000000_u128,
666        };
667        let cvt = ones.fast_as_f64();
668        assert_eq!(cvt, 1.0);
669
670        let minus_0_5 = DyadicFloat128 {
671            sign: DyadicSign::Neg,
672            exponent: -128,
673            mantissa: 0x80000000_00000000_00000000_00000000_u128,
674        };
675        let cvt0 = minus_0_5.fast_as_f64();
676        assert_eq!(cvt0, -1.0 / 2.0);
677
678        let minus_1_f4 = DyadicFloat128 {
679            sign: DyadicSign::Neg,
680            exponent: -132,
681            mantissa: 0xaaaaaaaa_aaaaaaaa_aaaaaaaa_aaaaaaab_u128,
682        };
683        let cvt0 = minus_1_f4.fast_as_f64();
684        assert_eq!(cvt0, -1.0 / 24.0);
685
686        let minus_1_f8 = DyadicFloat128 {
687            sign: DyadicSign::Pos,
688            exponent: -143,
689            mantissa: 0xd00d00d0_0d00d00d_00d00d00_d00d00d0_u128,
690        };
691        let cvt0 = minus_1_f8.fast_as_f64();
692        assert_eq!(cvt0, 1.0 / 40320.0);
693    }
694
695    #[test]
696    fn dyadic_float_add() {
697        let ones = DyadicFloat128 {
698            sign: DyadicSign::Pos,
699            exponent: -127,
700            mantissa: 0x80000000_00000000_00000000_00000000_u128,
701        };
702
703        let cvt = ones.fast_as_f64();
704        assert_eq!(cvt, 1.0);
705
706        let minus_0_5 = DyadicFloat128 {
707            sign: DyadicSign::Neg,
708            exponent: -128,
709            mantissa: 0x80000000_00000000_00000000_00000000_u128,
710        };
711        let cvt0 = ones.quick_add(&minus_0_5).fast_as_f64();
712        assert_eq!(cvt0, 0.5);
713    }
714
715    #[test]
716    fn dyadic_float_mul() {
717        let ones = DyadicFloat128 {
718            sign: DyadicSign::Pos,
719            exponent: -127,
720            mantissa: 0x80000000_00000000_00000000_00000000_u128,
721        };
722
723        let cvt = ones.fast_as_f64();
724        assert_eq!(cvt, 1.0);
725
726        let minus_0_5 = DyadicFloat128 {
727            sign: DyadicSign::Neg,
728            exponent: -128,
729            mantissa: 0x80000000_00000000_00000000_00000000_u128,
730        };
731        let product = ones.quick_mul(&minus_0_5);
732        let cvt0 = product.fast_as_f64();
733        assert_eq!(cvt0, -0.5);
734
735        let twos = DyadicFloat128 {
736            sign: DyadicSign::Pos,
737            exponent: -126,
738            mantissa: 0x80000000_00000000_00000000_00000000_u128,
739        };
740
741        let cvt = twos.fast_as_f64();
742        assert_eq!(cvt, 2.0);
743    }
744
745    #[test]
746    fn dyadic_round_trip() {
747        let z00 = 0.0;
748        let zvt00 = DyadicFloat128::new_from_f64(z00);
749        let b00 = zvt00.fast_as_f64();
750        assert_eq!(b00, z00);
751
752        let zvt000 = DyadicFloat128 {
753            sign: DyadicSign::Pos,
754            exponent: 0,
755            mantissa: 0,
756        };
757        let b000 = zvt000.fast_as_f64();
758        assert_eq!(b000, z00);
759
760        let z0 = 1.0;
761        let zvt0 = DyadicFloat128::new_from_f64(z0);
762        let b0 = zvt0.fast_as_f64();
763        assert_eq!(b0, z0);
764
765        let z1 = 0.5;
766        let zvt1 = DyadicFloat128::new_from_f64(z1);
767        let b1 = zvt1.fast_as_f64();
768        assert_eq!(b1, z1);
769
770        let z2 = -0.5;
771        let zvt2 = DyadicFloat128::new_from_f64(z2);
772        let b2 = zvt2.fast_as_f64();
773        assert_eq!(b2, z2);
774
775        let z3 = -532322.54324324232;
776        let zvt3 = DyadicFloat128::new_from_f64(z3);
777        let b3 = zvt3.fast_as_f64();
778        assert_eq!(b3, z3);
779    }
780
781    #[test]
782    fn dyadic_float_reciprocal() {
783        let ones = DyadicFloat128 {
784            sign: DyadicSign::Pos,
785            exponent: -127,
786            mantissa: 0x80000000_00000000_00000000_00000000_u128,
787        }
788        .reciprocal();
789
790        let cvt = ones.fast_as_f64();
791        assert_eq!(cvt, 1.0);
792
793        let minus_0_5 = DyadicFloat128::new_from_f64(4.).reciprocal();
794        let cvt0 = minus_0_5.fast_as_f64();
795        assert_eq!(cvt0, 0.25);
796    }
797
798    #[test]
799    fn dyadic_float_from_div() {
800        let from_div = DyadicFloat128::from_div_f64(1.0, 4.0);
801        let cvt = from_div.fast_as_f64();
802        assert_eq!(cvt, 0.25);
803    }
804
805    #[test]
806    fn dyadic_float_accurate_reciprocal() {
807        let from_div = DyadicFloat128::accurate_reciprocal(4.0);
808        let cvt = from_div.fast_as_f64();
809        assert_eq!(cvt, 0.25);
810    }
811
812    #[test]
813    fn dyadic_float_mul_int() {
814        let from_div = DyadicFloat128::new_from_f64(4.0);
815        let m1 = from_div.mul_int64(-2);
816        assert_eq!(m1.fast_as_f64(), -8.0);
817
818        let from_div = DyadicFloat128::new_from_f64(-4.0);
819        let m1 = from_div.mul_int64(-2);
820        assert_eq!(m1.fast_as_f64(), 8.0);
821
822        let from_div = DyadicFloat128::new_from_f64(2.5);
823        let m1 = from_div.mul_int64(2);
824        assert_eq!(m1.fast_as_f64(), 5.0);
825    }
826
827    #[test]
828    fn dyadic_float_round() {
829        let from_div = DyadicFloat128::new_from_f64(2.5);
830        let m1 = from_div.round_to_nearest_f64();
831        assert_eq!(m1, 3.0);
832
833        let from_div = DyadicFloat128::new_from_f64(0.5);
834        let m1 = from_div.round_to_nearest_f64();
835        assert_eq!(m1, 1.0);
836
837        let from_div = DyadicFloat128::new_from_f64(-0.5);
838        let m1 = from_div.round_to_nearest_f64();
839        assert_eq!(m1, -1.0);
840
841        let from_div = DyadicFloat128::new_from_f64(-0.351);
842        let m1 = from_div.round_to_nearest_f64();
843        assert_eq!(m1, (-0.351f64).round());
844
845        let from_div = DyadicFloat128::new_from_f64(0.351);
846        let m1 = from_div.round_to_nearest_f64();
847        assert_eq!(m1, 0.351f64.round());
848
849        let z00 = 25.6;
850        let zvt00 = DyadicFloat128::new_from_f64(z00);
851        let b00 = zvt00.round_to_nearest_f64();
852        assert_eq!(b00, 26.);
853    }
854
855    #[test]
856    fn dyadic_int_trunc() {
857        let from_div = DyadicFloat128::new_from_f64(-2.5);
858        let m1 = from_div.trunc_to_i64();
859        assert_eq!(m1, -2);
860
861        let from_div = DyadicFloat128::new_from_f64(2.5);
862        let m1 = from_div.trunc_to_i64();
863        assert_eq!(m1, 2);
864
865        let from_div = DyadicFloat128::new_from_f64(0.5);
866        let m1 = from_div.trunc_to_i64();
867        assert_eq!(m1, 0);
868
869        let from_div = DyadicFloat128::new_from_f64(-0.5);
870        let m1 = from_div.trunc_to_i64();
871        assert_eq!(m1, 0);
872
873        let from_div = DyadicFloat128::new_from_f64(-0.351);
874        let m1 = from_div.trunc_to_i64();
875        assert_eq!(m1, 0);
876
877        let from_div = DyadicFloat128::new_from_f64(0.351);
878        let m1 = from_div.trunc_to_i64();
879        assert_eq!(m1, 0);
880    }
881}
pxfm/dyadic_float.rs

pxfm/
dyadic_float.rs