moxcms/conversions/avx/
interpolator.rs

/*
 * // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
 * //
 * // Redistribution and use in source and binary forms, with or without modification,
 * // are permitted provided that the following conditions are met:
 * //
 * // 1.  Redistributions of source code must retain the above copyright notice, this
 * // list of conditions and the following disclaimer.
 * //
 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
 * // this list of conditions and the following disclaimer in the documentation
 * // and/or other materials provided with the distribution.
 * //
 * // 3.  Neither the name of the copyright holder nor the names of its
 * // contributors may be used to endorse or promote products derived from
 * // this software without specific prior written permission.
 * //
 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
use crate::conversions::interpolator::BarycentricWeight;
use crate::math::{FusedMultiplyAdd, FusedMultiplyNegAdd};
use num_traits::AsPrimitive;
use std::arch::x86_64::*;
use std::ops::{Add, Mul, Sub};

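/// A 16-byte aligned row of four `f32` values, so each LUT entry can be read with an
/// aligned `_mm_load_ps`.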
#[repr(align(16), C)]
pub(crate) struct SseAlignedF32(pub(crate) [f32; 4]);

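// Single-table interpolators: each borrows one flattened GRID_SIZE^3 lattice of
// 4-float entries and samples it with 128-bit SSE/FMA arithmetic.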
#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxFma<'a, const GRID_SIZE: usize> {
    pub(crate) cube: &'a [SseAlignedF32],
}

#[cfg(feature = "options")]
pub(crate) struct PyramidalAvxFma<'a, const GRID_SIZE: usize> {
    pub(crate) cube: &'a [SseAlignedF32],
}

#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxFma<'a, const GRID_SIZE: usize> {
    pub(crate) cube: &'a [SseAlignedF32],
}

pub(crate) struct TrilinearAvxFma<'a, const GRID_SIZE: usize> {
    pub(crate) cube: &'a [SseAlignedF32],
}

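// Dual-table interpolators: the same sampling strategies, but two LUTs are evaluated per
// call, packed into a single 256-bit register where possible.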
#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxFmaDouble<'a, const GRID_SIZE: usize> {
    pub(crate) cube0: &'a [SseAlignedF32],
    pub(crate) cube1: &'a [SseAlignedF32],
}

pub(crate) struct TrilinearAvxFmaDouble<'a, const GRID_SIZE: usize> {
    pub(crate) cube0: &'a [SseAlignedF32],
    pub(crate) cube1: &'a [SseAlignedF32],
}

#[cfg(feature = "options")]
pub(crate) struct PyramidAvxFmaDouble<'a, const GRID_SIZE: usize> {
    pub(crate) cube0: &'a [SseAlignedF32],
    pub(crate) cube1: &'a [SseAlignedF32],
}

#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxFmaDouble<'a, const GRID_SIZE: usize> {
    pub(crate) cube0: &'a [SseAlignedF32],
    pub(crate) cube1: &'a [SseAlignedF32],
}

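/// 3D interpolation over two LUTs at once; returns one 4-float result per table.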
pub(crate) trait AvxMdInterpolationDouble<'a, const GRID_SIZE: usize> {
    fn new(table0: &'a [SseAlignedF32], table1: &'a [SseAlignedF32]) -> Self;
    fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
    ) -> (AvxVectorSse, AvxVectorSse);
}

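/// 3D interpolation over a single LUT, producing one 4-float result.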
pub(crate) trait AvxMdInterpolation<'a, const GRID_SIZE: usize> {
    fn new(table: &'a [SseAlignedF32]) -> Self;
    fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
    ) -> AvxVectorSse;
}

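/// Reads the LUT entry at lattice coordinates `(x, y, z)`.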
trait Fetcher<T> {
    fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}

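/// Thin wrapper over a 128-bit register holding four `f32` lanes.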
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorSse {
    pub(crate) v: __m128,
}

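/// Thin wrapper over a 256-bit register; in the dual-table paths the low and high 128-bit
/// halves carry the lanes of the two independent LUTs.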
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVector {
    pub(crate) v: __m256,
}

impl AvxVector {
    #[inline(always)]
    pub(crate) fn from_sse(lo: AvxVectorSse, hi: AvxVectorSse) -> AvxVector {
        unsafe {
            AvxVector {
                v: _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(lo.v), hi.v),
            }
        }
    }

    #[inline(always)]
    pub(crate) fn split(self) -> (AvxVectorSse, AvxVectorSse) {
        unsafe {
            (
                AvxVectorSse {
                    v: _mm256_castps256_ps128(self.v),
                },
                AvxVectorSse {
                    v: _mm256_extractf128_ps::<1>(self.v),
                },
            )
        }
    }
}

impl From<f32> for AvxVectorSse {
    #[inline(always)]
    fn from(v: f32) -> Self {
        AvxVectorSse {
            v: unsafe { _mm_set1_ps(v) },
        }
    }
}

impl From<f32> for AvxVector {
    #[inline(always)]
    fn from(v: f32) -> Self {
        AvxVector {
            v: unsafe { _mm256_set1_ps(v) },
        }
    }
}

impl Sub<AvxVectorSse> for AvxVectorSse {
    type Output = Self;
    #[inline(always)]
    fn sub(self, rhs: AvxVectorSse) -> Self::Output {
        AvxVectorSse {
            v: unsafe { _mm_sub_ps(self.v, rhs.v) },
        }
    }
}

impl Sub<AvxVector> for AvxVector {
    type Output = Self;
    #[inline(always)]
    fn sub(self, rhs: AvxVector) -> Self::Output {
        AvxVector {
            v: unsafe { _mm256_sub_ps(self.v, rhs.v) },
        }
    }
}

impl Add<AvxVectorSse> for AvxVectorSse {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: AvxVectorSse) -> Self::Output {
        AvxVectorSse {
            v: unsafe { _mm_add_ps(self.v, rhs.v) },
        }
    }
}

impl Mul<AvxVectorSse> for AvxVectorSse {
    type Output = Self;
    #[inline(always)]
    fn mul(self, rhs: AvxVectorSse) -> Self::Output {
        AvxVectorSse {
            v: unsafe { _mm_mul_ps(self.v, rhs.v) },
        }
    }
}

impl AvxVector {
    #[inline(always)]
    pub(crate) fn neg_mla(self, b: AvxVector, c: AvxVector) -> Self {
        Self {
            v: unsafe { _mm256_fnmadd_ps(b.v, c.v, self.v) },
        }
    }
}

impl FusedMultiplyNegAdd<AvxVectorSse> for AvxVectorSse {
    #[inline(always)]
    fn neg_mla(&self, b: AvxVectorSse, c: AvxVectorSse) -> Self {
        Self {
            v: unsafe { _mm_fnmadd_ps(b.v, c.v, self.v) },
        }
    }
}

impl Add<AvxVector> for AvxVector {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: AvxVector) -> Self::Output {
        AvxVector {
            v: unsafe { _mm256_add_ps(self.v, rhs.v) },
        }
    }
}

impl Mul<AvxVector> for AvxVector {
    type Output = Self;
    #[inline(always)]
    fn mul(self, rhs: AvxVector) -> Self::Output {
        AvxVector {
            v: unsafe { _mm256_mul_ps(self.v, rhs.v) },
        }
    }
}

impl FusedMultiplyAdd<AvxVectorSse> for AvxVectorSse {
    #[inline(always)]
    fn mla(&self, b: AvxVectorSse, c: AvxVectorSse) -> AvxVectorSse {
        AvxVectorSse {
            v: unsafe { _mm_fmadd_ps(b.v, c.v, self.v) },
        }
    }
}

impl FusedMultiplyAdd<AvxVector> for AvxVector {
    #[inline(always)]
    fn mla(&self, b: AvxVector, c: AvxVector) -> AvxVector {
        AvxVector {
            v: unsafe { _mm256_fmadd_ps(b.v, c.v, self.v) },
        }
    }
}

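/// Fetches one 4-float entry from a single cube as a 128-bit vector.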
struct TetrahedralAvxSseFetchVector<'a, const GRID_SIZE: usize> {
    cube: &'a [SseAlignedF32],
}

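/// Fetches matching entries from two cubes and packs them into one 256-bit vector
/// (cube0 in the low half, cube1 in the high half).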
struct TetrahedralAvxFetchVector<'a, const GRID_SIZE: usize> {
    cube0: &'a [SseAlignedF32],
    cube1: &'a [SseAlignedF32],
}

impl<const GRID_SIZE: usize> Fetcher<AvxVector> for TetrahedralAvxFetchVector<'_, GRID_SIZE> {
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVector {
        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
            + y as u32 * GRID_SIZE as u32
            + z as u32) as usize;
        let jx0 = unsafe { self.cube0.get_unchecked(offset..) };
        let jx1 = unsafe { self.cube1.get_unchecked(offset..) };
        AvxVector {
            v: unsafe {
                _mm256_insertf128_ps::<1>(
                    _mm256_castps128_ps256(_mm_load_ps(jx0.as_ptr() as *const f32)),
                    _mm_load_ps(jx1.as_ptr() as *const f32),
                )
            },
        }
    }
}

impl<const GRID_SIZE: usize> Fetcher<AvxVectorSse> for TetrahedralAvxSseFetchVector<'_, GRID_SIZE> {
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorSse {
        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
            + y as u32 * GRID_SIZE as u32
            + z as u32) as usize;
        let jx = unsafe { self.cube.get_unchecked(offset..) };
        AvxVectorSse {
            v: unsafe { _mm_load_ps(jx.as_ptr() as *const f32) },
        }
    }
}

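// Tetrahedral interpolation: the lattice cell around the sample is split into six
// tetrahedra, the ordering of the fractional offsets (rx, ry, rz) selects the one that
// contains the sample, and the result is accumulated with three fused multiply-adds.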
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxFma<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let c2;
        let c1;
        let c3;
        if rx >= ry {
            if ry >= rz {
                //rx >= ry && ry >= rz
                c1 = r.fetch(x_n, y, z) - c0;
                c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
                c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
            } else if rx >= rz {
                //rx >= rz && rz >= ry
                c1 = r.fetch(x_n, y, z) - c0;
                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
                c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
            } else {
                //rz > rx && rx >= ry
                c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
                c3 = r.fetch(x, y, z_n) - c0;
            }
        } else if rx >= rz {
            //ry > rx && rx >= rz
            c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
            c2 = r.fetch(x, y_n, z) - c0;
            c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
        } else if ry >= rz {
            //ry >= rz && rz > rx
            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
            c2 = r.fetch(x, y_n, z) - c0;
            c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
        } else {
            //rz > ry && ry > rx
            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
            c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
            c3 = r.fetch(x, y, z_n) - c0;
        }
        let s0 = c0.mla(c1, AvxVectorSse::from(rx));
        let s1 = s0.mla(c2, AvxVectorSse::from(ry));
        s1.mla(c3, AvxVectorSse::from(rz))
    }
}

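// Implements `AvxMdInterpolation` for a single-table interpolator by delegating to its
// `interpolate` method with a single-cube fetcher.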
macro_rules! define_interp_avx {
    ($interpolator: ident) => {
        impl<'a, const GRID_SIZE: usize> AvxMdInterpolation<'a, GRID_SIZE>
            for $interpolator<'a, GRID_SIZE>
        {
            #[inline(always)]
            fn new(table: &'a [SseAlignedF32]) -> Self {
                Self { cube: table }
            }

            #[inline(always)]
            fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
                &self,
                in_r: U,
                in_g: U,
                in_b: U,
                lut: &[BarycentricWeight<f32>; BINS],
            ) -> AvxVectorSse {
                self.interpolate(
                    in_r,
                    in_g,
                    in_b,
                    lut,
                    TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: self.cube },
                )
            }
        }
    };
}

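// Same as above for the dual-table trait: the interpolator receives one 128-bit fetcher
// per cube.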
#[cfg(feature = "options")]
macro_rules! define_interp_avx_d {
    ($interpolator: ident) => {
        impl<'a, const GRID_SIZE: usize> AvxMdInterpolationDouble<'a, GRID_SIZE>
            for $interpolator<'a, GRID_SIZE>
        {
            #[inline(always)]
            fn new(table0: &'a [SseAlignedF32], table1: &'a [SseAlignedF32]) -> Self {
                Self {
                    cube0: table0,
                    cube1: table1,
                }
            }

            #[inline(always)]
            fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
                &self,
                in_r: U,
                in_g: U,
                in_b: U,
                lut: &[BarycentricWeight<f32>; BINS],
            ) -> (AvxVectorSse, AvxVectorSse) {
                self.interpolate(
                    in_r,
                    in_g,
                    in_b,
                    lut,
                    TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: self.cube0 },
                    TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: self.cube1 },
                )
            }
        }
    };
}

#[cfg(feature = "options")]
define_interp_avx!(TetrahedralAvxFma);
#[cfg(feature = "options")]
define_interp_avx!(PyramidalAvxFma);
#[cfg(feature = "options")]
define_interp_avx!(PrismaticAvxFma);
define_interp_avx!(TrilinearAvxFma);
#[cfg(feature = "options")]
define_interp_avx_d!(PrismaticAvxFmaDouble);
#[cfg(feature = "options")]
define_interp_avx_d!(PyramidAvxFmaDouble);

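// The tetrahedral and trilinear dual-table variants are written out by hand; they go
// through the packed 256-bit `TetrahedralAvxFetchVector` rather than two 128-bit fetchers.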
#[cfg(feature = "options")]
impl<'a, const GRID_SIZE: usize> AvxMdInterpolationDouble<'a, GRID_SIZE>
    for TetrahedralAvxFmaDouble<'a, GRID_SIZE>
{
    #[inline(always)]
    fn new(table0: &'a [SseAlignedF32], table1: &'a [SseAlignedF32]) -> Self {
        Self {
            cube0: table0,
            cube1: table1,
        }
    }

    #[inline(always)]
    fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
    ) -> (AvxVectorSse, AvxVectorSse) {
        self.interpolate(
            in_r,
            in_g,
            in_b,
            lut,
            TetrahedralAvxFetchVector::<GRID_SIZE> {
                cube0: self.cube0,
                cube1: self.cube1,
            },
        )
    }
}

impl<'a, const GRID_SIZE: usize> AvxMdInterpolationDouble<'a, GRID_SIZE>
    for TrilinearAvxFmaDouble<'a, GRID_SIZE>
{
    #[inline(always)]
    fn new(table0: &'a [SseAlignedF32], table1: &'a [SseAlignedF32]) -> Self {
        Self {
            cube0: table0,
            cube1: table1,
        }
    }

    #[inline(always)]
    fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
    ) -> (AvxVectorSse, AvxVectorSse) {
        self.interpolate(
            in_r,
            in_g,
            in_b,
            lut,
            TetrahedralAvxFetchVector::<GRID_SIZE> {
                cube0: self.cube0,
                cube1: self.cube1,
            },
        )
    }
}

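// Pyramidal interpolation: one of three blending cases is chosen by comparing the
// fractional offsets (effectively, which component is smallest), then four corner
// differences plus one bilinear cross term are accumulated with FMA.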
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidalAvxFma<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let w0 = AvxVectorSse::from(db);
        let w1 = AvxVectorSse::from(dr);
        let w2 = AvxVectorSse::from(dg);

        if dr > db && dg > db {
            let w3 = AvxVectorSse::from(dr * dg);
            let x0 = r.fetch(x_n, y_n, z_n);
            let x1 = r.fetch(x_n, y_n, z);
            let x2 = r.fetch(x_n, y, z);
            let x3 = r.fetch(x, y_n, z);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        } else if db > dr && dg > dr {
            let w3 = AvxVectorSse::from(dg * db);

            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y_n, z_n);
            let x2 = r.fetch(x, y_n, z_n);
            let x3 = r.fetch(x, y_n, z);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        } else {
            let w3 = AvxVectorSse::from(db * dr);

            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y, z);
            let x2 = r.fetch(x_n, y, z_n);
            let x3 = r.fetch(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        }
    }
}

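// Prismatic interpolation: the `db > dr` comparison selects one of two triangular prisms
// of the cell; five corner fetches and two bilinear cross terms are blended with FMA.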
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxFma<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let w0 = AvxVectorSse::from(db);
        let w1 = AvxVectorSse::from(dr);
        let w2 = AvxVectorSse::from(dg);
        let w3 = AvxVectorSse::from(dg * db);
        let w4 = AvxVectorSse::from(dr * dg);

        if db > dr {
            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y, z_n);
            let x2 = r.fetch(x, y_n, z);
            let x3 = r.fetch(x, y_n, z_n);
            let x4 = r.fetch(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4)
        } else {
            let x0 = r.fetch(x_n, y, z);
            let x1 = r.fetch(x_n, y, z_n);
            let x2 = r.fetch(x, y_n, z);
            let x3 = r.fetch(x_n, y_n, z);
            let x4 = r.fetch(x_n, y_n, z_n);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4)
        }
    }
}

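// Dual-table prismatic interpolation: corner rows from both cubes are packed into 256-bit
// vectors so the two LUTs are blended in a single pass, then split back into two results.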
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxFmaDouble<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        r0: impl Fetcher<AvxVectorSse>,
        r1: impl Fetcher<AvxVectorSse>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0_0 = r0.fetch(x, y, z);
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVector::from(db);
        let w1 = AvxVector::from(dr);
        let w2 = AvxVector::from(dg);
        let w3 = AvxVector::from(dg * db);
        let w4 = AvxVector::from(dr * dg);

        let c0 = AvxVector::from_sse(c0_0, c0_1);

        if db > dr {
            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x, y_n, z_n);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x, y_n, z_n);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);
            let x4 = AvxVector::from_sse(x4_0, x4_1);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        } else {
            let x0_0 = r0.fetch(x_n, y, z);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x_n, y_n, z);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x_n, y, z);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x_n, y_n, z);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);
            let x4 = AvxVector::from_sse(x4_0, x4_1);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        }
    }
}

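// Dual-table pyramidal interpolation: the same pyramid selection as the single-table path,
// evaluated for both cubes at once in 256-bit registers.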
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidAvxFmaDouble<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        r0: impl Fetcher<AvxVectorSse>,
        r1: impl Fetcher<AvxVectorSse>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0_0 = r0.fetch(x, y, z);
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVector::from(db);
        let w1 = AvxVector::from(dr);
        let w2 = AvxVector::from(dg);

        let c0 = AvxVector::from_sse(c0_0, c0_1);

        if dr > db && dg > db {
            let w3 = AvxVector::from(dr * dg);

            let x0_0 = r0.fetch(x_n, y_n, z_n);
            let x1_0 = r0.fetch(x_n, y_n, z);
            let x2_0 = r0.fetch(x_n, y, z);
            let x3_0 = r0.fetch(x, y_n, z);

            let x0_1 = r1.fetch(x_n, y_n, z_n);
            let x1_1 = r1.fetch(x_n, y_n, z);
            let x2_1 = r1.fetch(x_n, y, z);
            let x3_1 = r1.fetch(x, y_n, z);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        } else if db > dr && dg > dr {
            let w3 = AvxVector::from(dg * db);

            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y_n, z_n);
            let x2_0 = r0.fetch(x, y_n, z_n);
            let x3_0 = r0.fetch(x, y_n, z);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y_n, z_n);
            let x2_1 = r1.fetch(x, y_n, z_n);
            let x3_1 = r1.fetch(x, y_n, z);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        } else {
            let w3 = AvxVector::from(db * dr);

            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z);
            let x2_0 = r0.fetch(x_n, y, z_n);
            let x3_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z);
            let x2_1 = r1.fetch(x_n, y, z_n);
            let x3_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        }
    }
}

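// Dual-table tetrahedral interpolation: the packed fetcher already delivers both cubes'
// entries in one 256-bit vector, so the blend runs once and the result is split at the end.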
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxFmaDouble<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        rv: impl Fetcher<AvxVector>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let c0 = rv.fetch(x, y, z);

        let w0 = AvxVector::from(rx);
        let w1 = AvxVector::from(ry);
        let w2 = AvxVector::from(rz);

        let c2;
        let c1;
        let c3;
        if rx >= ry {
            if ry >= rz {
                //rx >= ry && ry >= rz
                c1 = rv.fetch(x_n, y, z) - c0;
                c2 = rv.fetch(x_n, y_n, z) - rv.fetch(x_n, y, z);
                c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
            } else if rx >= rz {
                //rx >= rz && rz >= ry
                c1 = rv.fetch(x_n, y, z) - c0;
                c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
                c3 = rv.fetch(x_n, y, z_n) - rv.fetch(x_n, y, z);
            } else {
                //rz > rx && rx >= ry
                c1 = rv.fetch(x_n, y, z_n) - rv.fetch(x, y, z_n);
                c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
                c3 = rv.fetch(x, y, z_n) - c0;
            }
        } else if rx >= rz {
            //ry > rx && rx >= rz
            c1 = rv.fetch(x_n, y_n, z) - rv.fetch(x, y_n, z);
            c2 = rv.fetch(x, y_n, z) - c0;
            c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
        } else if ry >= rz {
            //ry >= rz && rz > rx
            c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
            c2 = rv.fetch(x, y_n, z) - c0;
            c3 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y_n, z);
        } else {
            //rz > ry && ry > rx
            c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
            c2 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y, z_n);
            c3 = rv.fetch(x, y, z_n) - c0;
        }
        let s0 = c0.mla(c1, w0);
        let s1 = s0.mla(c2, w1);
        s1.mla(c3, w2).split()
    }
}

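// Dual-table trilinear interpolation: a straight lerp along x, then y, then z, with the
// (1 - w) terms formed by fused negated multiply-add (`neg_mla`).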
impl<const GRID_SIZE: usize> TrilinearAvxFmaDouble<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        rv: impl Fetcher<AvxVector>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let w0 = AvxVector::from(rx);
        let w1 = AvxVector::from(ry);
        let w2 = AvxVector::from(rz);

        let c000 = rv.fetch(x, y, z);
        let c100 = rv.fetch(x_n, y, z);
        let c010 = rv.fetch(x, y_n, z);
        let c110 = rv.fetch(x_n, y_n, z);
        let c001 = rv.fetch(x, y, z_n);
        let c101 = rv.fetch(x_n, y, z_n);
        let c011 = rv.fetch(x, y_n, z_n);
        let c111 = rv.fetch(x_n, y_n, z_n);

        let dx = AvxVector::from(rx);

        let c00 = c000.neg_mla(c000, dx).mla(c100, w0);
        let c10 = c010.neg_mla(c010, dx).mla(c110, w0);
        let c01 = c001.neg_mla(c001, dx).mla(c101, w0);
        let c11 = c011.neg_mla(c011, dx).mla(c111, w0);

        let dy = AvxVector::from(ry);

        let c0 = c00.neg_mla(c00, dy).mla(c10, w1);
        let c1 = c01.neg_mla(c01, dy).mla(c11, w1);

        let dz = AvxVector::from(rz);

        c0.neg_mla(c0, dz).mla(c1, w2).split()
    }
}

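// Single-table trilinear interpolation: the z and z_n faces are packed into the two halves
// of a 256-bit vector, lerped along x and y together, then split for the final lerp
// along z.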
impl<const GRID_SIZE: usize> TrilinearAvxFma<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let w0 = AvxVector::from(dr);
        let w1 = AvxVector::from(dg);
        let w2 = AvxVectorSse::from(db);

        let c000 = r.fetch(x, y, z);
        let c100 = r.fetch(x_n, y, z);
        let c010 = r.fetch(x, y_n, z);
        let c110 = r.fetch(x_n, y_n, z);
        let c001 = r.fetch(x, y, z_n);
        let c101 = r.fetch(x_n, y, z_n);
        let c011 = r.fetch(x, y_n, z_n);
        let c111 = r.fetch(x_n, y_n, z_n);

        let x000 = AvxVector::from_sse(c000, c001);
        let x010 = AvxVector::from_sse(c010, c011);
        let x011 = AvxVector::from_sse(c100, c101);
        let x111 = AvxVector::from_sse(c110, c111);

        let c00 = x000.neg_mla(x000, w0).mla(x011, w0);
        let c10 = x010.neg_mla(x010, w0).mla(x111, w0);

        let z0 = c00.neg_mla(c00, w1).mla(c10, w1);

        let (c0, c1) = z0.split();

        c0.neg_mla(c0, w2).mla(c1, w2)
    }
}