moxcms/conversions/avx/
interpolator.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::conversions::interpolator::BarycentricWeight;
30use crate::math::{FusedMultiplyAdd, FusedMultiplyNegAdd};
31use std::arch::x86_64::*;
32use std::ops::{Add, Mul, Sub};
33
/// A 16-byte aligned quadruple of `f32` LUT values.
/// The alignment guarantee is what permits the aligned `_mm_load_ps` loads
/// in the fetchers below.
#[repr(align(16), C)]
pub(crate) struct SseAlignedF32(pub(crate) [f32; 4]);
36
// Zero-sized dispatch markers; each selects one 3D interpolation kernel over a
// `GRID_SIZE`-per-axis lattice. The `*Double` variants evaluate two LUTs for
// the same coordinates in a single call (both results packed per 256-bit op).

#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxFma<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PyramidalAvxFma<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxFma<const GRID_SIZE: usize> {}

pub(crate) struct TrilinearAvxFma<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxFmaDouble<const GRID_SIZE: usize> {}

pub(crate) struct TrilinearAvxFmaDouble<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PyramidAvxFmaDouble<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxFmaDouble<const GRID_SIZE: usize> {}
58
/// Interpolates two LUTs (`table0`, `table1`) at the same (r, g, b)
/// coordinates, returning one 4-lane result per table.
pub(crate) trait AvxMdInterpolationDouble {
    fn inter3_sse(
        &self,
        table0: &[SseAlignedF32],
        table1: &[SseAlignedF32],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
    ) -> (AvxVectorSse, AvxVectorSse);
}

/// Interpolates a single LUT at (r, g, b); `lut` maps each input index to its
/// lattice cell and fractional weight.
pub(crate) trait AvxMdInterpolation {
    fn inter3_sse(
        &self,
        table: &[SseAlignedF32],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
    ) -> AvxVectorSse;
}

/// Abstracts loading the LUT entry at lattice point (x, y, z).
trait Fetcher<T> {
    fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}
85
/// Thin newtype over a 128-bit float vector (4 lanes).
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorSse {
    pub(crate) v: __m128,
}

/// Thin newtype over a 256-bit float vector (8 lanes, i.e. two `AvxVectorSse`).
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVector {
    pub(crate) v: __m256,
}
97
impl AvxVector {
    /// Packs two 128-bit vectors into one 256-bit vector:
    /// `lo` occupies the low 128 bits, `hi` the high 128 bits.
    #[inline(always)]
    pub(crate) fn from_sse(lo: AvxVectorSse, hi: AvxVectorSse) -> AvxVector {
        unsafe {
            AvxVector {
                v: _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(lo.v), hi.v),
            }
        }
    }

    /// Inverse of [`AvxVector::from_sse`]: returns (low half, high half).
    #[inline(always)]
    pub(crate) fn split(self) -> (AvxVectorSse, AvxVectorSse) {
        unsafe {
            (
                AvxVectorSse {
                    v: _mm256_castps256_ps128(self.v),
                },
                AvxVectorSse {
                    v: _mm256_extractf128_ps::<1>(self.v),
                },
            )
        }
    }
}
122
/// Broadcasts a scalar to all 4 lanes.
impl From<f32> for AvxVectorSse {
    #[inline(always)]
    fn from(v: f32) -> Self {
        AvxVectorSse {
            v: unsafe { _mm_set1_ps(v) },
        }
    }
}

/// Broadcasts a scalar to all 8 lanes.
impl From<f32> for AvxVector {
    #[inline(always)]
    fn from(v: f32) -> Self {
        AvxVector {
            v: unsafe { _mm256_set1_ps(v) },
        }
    }
}
140
// Lane-wise arithmetic on the 128-bit wrapper.

impl Sub<AvxVectorSse> for AvxVectorSse {
    type Output = Self;
    #[inline(always)]
    fn sub(self, rhs: AvxVectorSse) -> Self::Output {
        AvxVectorSse {
            v: unsafe { _mm_sub_ps(self.v, rhs.v) },
        }
    }
}

// Lane-wise subtraction on the 256-bit wrapper.
impl Sub<AvxVector> for AvxVector {
    type Output = Self;
    #[inline(always)]
    fn sub(self, rhs: AvxVector) -> Self::Output {
        AvxVector {
            v: unsafe { _mm256_sub_ps(self.v, rhs.v) },
        }
    }
}

impl Add<AvxVectorSse> for AvxVectorSse {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: AvxVectorSse) -> Self::Output {
        AvxVectorSse {
            v: unsafe { _mm_add_ps(self.v, rhs.v) },
        }
    }
}

impl Mul<AvxVectorSse> for AvxVectorSse {
    type Output = Self;
    #[inline(always)]
    fn mul(self, rhs: AvxVectorSse) -> Self::Output {
        AvxVectorSse {
            v: unsafe { _mm_mul_ps(self.v, rhs.v) },
        }
    }
}
180
impl AvxVector {
    /// Fused negative multiply-add: `self - b * c` in one FNMADD instruction.
    #[inline(always)]
    pub(crate) fn neg_mla(self, b: AvxVector, c: AvxVector) -> Self {
        Self {
            v: unsafe { _mm256_fnmadd_ps(b.v, c.v, self.v) },
        }
    }
}

/// `self - b * c` (fused) for the 128-bit wrapper.
impl FusedMultiplyNegAdd<AvxVectorSse> for AvxVectorSse {
    #[inline(always)]
    fn neg_mla(&self, b: AvxVectorSse, c: AvxVectorSse) -> Self {
        Self {
            v: unsafe { _mm_fnmadd_ps(b.v, c.v, self.v) },
        }
    }
}

impl Add<AvxVector> for AvxVector {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: AvxVector) -> Self::Output {
        AvxVector {
            v: unsafe { _mm256_add_ps(self.v, rhs.v) },
        }
    }
}

impl Mul<AvxVector> for AvxVector {
    type Output = Self;
    #[inline(always)]
    fn mul(self, rhs: AvxVector) -> Self::Output {
        AvxVector {
            v: unsafe { _mm256_mul_ps(self.v, rhs.v) },
        }
    }
}

/// `self + b * c` (fused) for the 128-bit wrapper.
impl FusedMultiplyAdd<AvxVectorSse> for AvxVectorSse {
    #[inline(always)]
    fn mla(&self, b: AvxVectorSse, c: AvxVectorSse) -> AvxVectorSse {
        AvxVectorSse {
            v: unsafe { _mm_fmadd_ps(b.v, c.v, self.v) },
        }
    }
}

/// `self + b * c` (fused) for the 256-bit wrapper.
impl FusedMultiplyAdd<AvxVector> for AvxVector {
    #[inline(always)]
    fn mla(&self, b: AvxVector, c: AvxVector) -> AvxVector {
        AvxVector {
            v: unsafe { _mm256_fmadd_ps(b.v, c.v, self.v) },
        }
    }
}
236
/// Fetcher over a single LUT cube; yields 128-bit entries.
struct TetrahedralAvxSseFetchVector<'a, const GRID_SIZE: usize> {
    cube: &'a [SseAlignedF32],
}

/// Fetcher over two LUT cubes; yields one 256-bit entry holding
/// `cube0`'s value in the low half and `cube1`'s in the high half.
struct TetrahedralAvxFetchVector<'a, const GRID_SIZE: usize> {
    cube0: &'a [SseAlignedF32],
    cube1: &'a [SseAlignedF32],
}
245
impl<const GRID_SIZE: usize> Fetcher<AvxVector> for TetrahedralAvxFetchVector<'_, GRID_SIZE> {
    /// Loads the entry at lattice point (x, y, z) from both cubes; low 128
    /// bits come from `cube0`, high 128 bits from `cube1`.
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVector {
        // Row-major 3D index: x strides by GRID_SIZE², y by GRID_SIZE.
        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
            + y as u32 * GRID_SIZE as u32
            + z as u32) as usize;
        // NOTE(review): unchecked — assumes x/y/z index a GRID_SIZE³ cube and
        // both slices hold at least that many entries; confirm at call sites.
        let jx0 = unsafe { self.cube0.get_unchecked(offset..) };
        let jx1 = unsafe { self.cube1.get_unchecked(offset..) };
        AvxVector {
            // SAFETY: `SseAlignedF32` is #[repr(align(16))], satisfying the
            // alignment requirement of `_mm_load_ps`.
            v: unsafe {
                _mm256_insertf128_ps::<1>(
                    _mm256_castps128_ps256(_mm_load_ps(jx0.as_ptr() as *const f32)),
                    _mm_load_ps(jx1.as_ptr() as *const f32),
                )
            },
        }
    }
}

impl<const GRID_SIZE: usize> Fetcher<AvxVectorSse> for TetrahedralAvxSseFetchVector<'_, GRID_SIZE> {
    /// Loads the single-cube entry at lattice point (x, y, z).
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorSse {
        // Same row-major indexing as the double-cube fetcher above.
        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
            + y as u32 * GRID_SIZE as u32
            + z as u32) as usize;
        // NOTE(review): unchecked slice access — bounds must be guaranteed by
        // the caller's clamped lattice coordinates.
        let jx = unsafe { self.cube.get_unchecked(offset..) };
        AvxVectorSse {
            // SAFETY: 16-byte alignment provided by `SseAlignedF32`.
            v: unsafe { _mm_load_ps(jx.as_ptr() as *const f32) },
        }
    }
}
277
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxFma<GRID_SIZE> {
    /// Tetrahedral interpolation of one LUT at (in_r, in_g, in_b).
    ///
    /// The unit cube is partitioned into six tetrahedra; the weight ordering
    /// (rx/ry/rz comparisons) selects which one contains the fractional
    /// point, and the result is c0 + rx*c1 + ry*c2 + rz*c3.
    ///
    /// # Safety
    /// Caller must ensure AVX2 and FMA are available, and that the input
    /// indices are within `lut`'s bounds (reads are unchecked).
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = unsafe { lut.get_unchecked(in_r) };
        let lut_g = unsafe { lut.get_unchecked(in_g) };
        let lut_b = unsafe { lut.get_unchecked(in_b) };

        // Lower lattice corner per axis.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        // Upper lattice corner per axis.
        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Fractional weights within the cell.
        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let c2;
        let c1;
        let c3;
        if rx >= ry {
            if ry >= rz {
                //rx >= ry && ry >= rz
                c1 = r.fetch(x_n, y, z) - c0;
                c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
                c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
            } else if rx >= rz {
                //rx >= rz && rz >= ry
                c1 = r.fetch(x_n, y, z) - c0;
                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
                c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
            } else {
                //rz > rx && rx >= ry
                c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
                c3 = r.fetch(x, y, z_n) - c0;
            }
        } else if rx >= rz {
            //ry > rx && rx >= rz
            c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
            c2 = r.fetch(x, y_n, z) - c0;
            c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
        } else if ry >= rz {
            //ry >= rz && rz > rx
            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
            c2 = r.fetch(x, y_n, z) - c0;
            c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
        } else {
            //rz > ry && ry > rx
            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
            c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
            c3 = r.fetch(x, y, z_n) - c0;
        }
        // Accumulate c0 + rx*c1 + ry*c2 + rz*c3 with fused multiply-adds.
        let s0 = c0.mla(c1, AvxVectorSse::from(rx));
        let s1 = s0.mla(c2, AvxVectorSse::from(ry));
        s1.mla(c3, AvxVectorSse::from(rz))
    }
}
348
/// Implements [`AvxMdInterpolation`] for a single-LUT kernel by forwarding to
/// its inherent `interpolate` with a single-cube fetcher.
macro_rules! define_interp_avx {
    ($interpolator: ident) => {
        impl<const GRID_SIZE: usize> AvxMdInterpolation for $interpolator<GRID_SIZE> {
            fn inter3_sse(
                &self,
                table: &[SseAlignedF32],
                in_r: usize,
                in_g: usize,
                in_b: usize,
                lut: &[BarycentricWeight<f32>],
            ) -> AvxVectorSse {
                // NOTE(review): `interpolate` is a #[target_feature] fn; this
                // unsafe call assumes AVX2+FMA were checked at dispatch time.
                unsafe {
                    self.interpolate(
                        in_r,
                        in_g,
                        in_b,
                        lut,
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table },
                    )
                }
            }
        }
    };
}

/// Implements [`AvxMdInterpolationDouble`] for kernels that take two separate
/// single-cube fetchers (one per table).
#[cfg(feature = "options")]
macro_rules! define_interp_avx_d {
    ($interpolator: ident) => {
        impl<const GRID_SIZE: usize> AvxMdInterpolationDouble for $interpolator<GRID_SIZE> {
            fn inter3_sse(
                &self,
                table0: &[SseAlignedF32],
                table1: &[SseAlignedF32],
                in_r: usize,
                in_g: usize,
                in_b: usize,
                lut: &[BarycentricWeight<f32>],
            ) -> (AvxVectorSse, AvxVectorSse) {
                // NOTE(review): same target-feature assumption as above.
                unsafe {
                    self.interpolate(
                        in_r,
                        in_g,
                        in_b,
                        lut,
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table0 },
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table1 },
                    )
                }
            }
        }
    };
}

// Wire each kernel marker to its trait implementation.
#[cfg(feature = "options")]
define_interp_avx!(TetrahedralAvxFma);
#[cfg(feature = "options")]
define_interp_avx!(PyramidalAvxFma);
#[cfg(feature = "options")]
define_interp_avx!(PrismaticAvxFma);
define_interp_avx!(TrilinearAvxFma);
#[cfg(feature = "options")]
define_interp_avx_d!(PrismaticAvxFmaDouble);
#[cfg(feature = "options")]
define_interp_avx_d!(PyramidAvxFmaDouble);
413
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> AvxMdInterpolationDouble for TetrahedralAvxFmaDouble<GRID_SIZE> {
    /// Forwards to the 256-bit tetrahedral kernel with a dual-cube fetcher
    /// (both tables loaded per fetch, processed in one 256-bit lane pair).
    fn inter3_sse(
        &self,
        table0: &[SseAlignedF32],
        table1: &[SseAlignedF32],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
    ) -> (AvxVectorSse, AvxVectorSse) {
        // NOTE(review): assumes AVX2+FMA availability was verified by the
        // code path that selected this implementation — confirm at dispatch.
        unsafe {
            self.interpolate(
                in_r,
                in_g,
                in_b,
                lut,
                TetrahedralAvxFetchVector::<GRID_SIZE> {
                    cube0: table0,
                    cube1: table1,
                },
            )
        }
    }
}
439
impl<const GRID_SIZE: usize> AvxMdInterpolationDouble for TrilinearAvxFmaDouble<GRID_SIZE> {
    /// Forwards to the 256-bit trilinear kernel with a dual-cube fetcher.
    fn inter3_sse(
        &self,
        table0: &[SseAlignedF32],
        table1: &[SseAlignedF32],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
    ) -> (AvxVectorSse, AvxVectorSse) {
        // NOTE(review): assumes AVX2+FMA availability was verified by the
        // code path that selected this implementation — confirm at dispatch.
        unsafe {
            self.interpolate(
                in_r,
                in_g,
                in_b,
                lut,
                TetrahedralAvxFetchVector::<GRID_SIZE> {
                    cube0: table0,
                    cube1: table1,
                },
            )
        }
    }
}
464
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidalAvxFma<GRID_SIZE> {
    /// Pyramidal interpolation of one LUT at (in_r, in_g, in_b).
    ///
    /// The cell is split into three pyramids; the comparisons on the
    /// fractional weights pick the pyramid containing the point, and the
    /// result is c0 + w0*c1 + w1*c2 + w2*c3 + w3*c4, where c4 carries the
    /// bilinear cross term.
    ///
    /// # Safety
    /// Caller must ensure AVX2 and FMA are available and indices are within
    /// `lut`'s bounds (reads are unchecked).
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = unsafe { lut.get_unchecked(in_r) };
        let lut_g = unsafe { lut.get_unchecked(in_g) };
        let lut_b = unsafe { lut.get_unchecked(in_b) };

        // Lower and upper lattice corners plus fractional weights per axis.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let w0 = AvxVectorSse::from(db);
        let w1 = AvxVectorSse::from(dr);
        let w2 = AvxVectorSse::from(dg);

        if dr > db && dg > db {
            // Pyramid with apex along the b axis.
            let w3 = AvxVectorSse::from(dr * dg);
            let x0 = r.fetch(x_n, y_n, z_n);
            let x1 = r.fetch(x_n, y_n, z);
            let x2 = r.fetch(x_n, y, z);
            let x3 = r.fetch(x, y_n, z);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        } else if db > dr && dg > dr {
            // Pyramid with apex along the r axis.
            let w3 = AvxVectorSse::from(dg * db);

            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y_n, z_n);
            let x2 = r.fetch(x, y_n, z_n);
            let x3 = r.fetch(x, y_n, z);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        } else {
            // Remaining pyramid (apex along the g axis).
            let w3 = AvxVectorSse::from(db * dr);

            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y, z);
            let x2 = r.fetch(x_n, y, z_n);
            let x3 = r.fetch(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        }
    }
}
551
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxFma<GRID_SIZE> {
    /// Prismatic interpolation of one LUT at (in_r, in_g, in_b).
    ///
    /// The cell is split into two triangular prisms by the `db > dr`
    /// diagonal; the selected prism contributes five difference terms with
    /// weights db, dr, dg, dg*db and dr*dg.
    ///
    /// # Safety
    /// Caller must ensure AVX2 and FMA are available and indices are within
    /// `lut`'s bounds (reads are unchecked).
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = unsafe { lut.get_unchecked(in_r) };
        let lut_g = unsafe { lut.get_unchecked(in_g) };
        let lut_b = unsafe { lut.get_unchecked(in_b) };

        // Lattice corners and fractional weights per axis.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let w0 = AvxVectorSse::from(db);
        let w1 = AvxVectorSse::from(dr);
        let w2 = AvxVectorSse::from(dg);
        let w3 = AvxVectorSse::from(dg * db);
        let w4 = AvxVectorSse::from(dr * dg);

        if db > dr {
            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y, z_n);
            let x2 = r.fetch(x, y_n, z);
            let x3 = r.fetch(x, y_n, z_n);
            let x4 = r.fetch(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4)
        } else {
            let x0 = r.fetch(x_n, y, z);
            let x1 = r.fetch(x_n, y, z_n);
            let x2 = r.fetch(x, y_n, z);
            let x3 = r.fetch(x_n, y_n, z);
            let x4 = r.fetch(x_n, y_n, z_n);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4)
        }
    }
}
626
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxFmaDouble<GRID_SIZE> {
    /// Prismatic interpolation evaluated for two LUTs at once.
    ///
    /// Each corner is fetched from both tables, the pair is packed into one
    /// 256-bit vector (table0 in the low half, table1 in the high half) so
    /// the FMA arithmetic is shared, and the result is split back into the
    /// two per-table 128-bit values.
    ///
    /// # Safety
    /// Caller must ensure AVX2 and FMA are available and indices are within
    /// `lut`'s bounds (reads are unchecked).
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
        r0: impl Fetcher<AvxVectorSse>,
        r1: impl Fetcher<AvxVectorSse>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = unsafe { lut.get_unchecked(in_r) };
        let lut_g = unsafe { lut.get_unchecked(in_g) };
        let lut_b = unsafe { lut.get_unchecked(in_b) };

        // Lattice corners and fractional weights per axis.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        // Base corner from EACH table. BUGFIX: `c0_1` was previously fetched
        // from `r0`, duplicating table0's value and corrupting the second
        // output (cf. `PyramidAvxFmaDouble::interpolate`, which uses `r1`).
        let c0_0 = r0.fetch(x, y, z);
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVector::from(db);
        let w1 = AvxVector::from(dr);
        let w2 = AvxVector::from(dg);
        let w3 = AvxVector::from(dg * db);
        let w4 = AvxVector::from(dr * dg);

        let c0 = AvxVector::from_sse(c0_0, c0_1);

        // Prism selection mirrors the single-table `PrismaticAvxFma` kernel.
        if db > dr {
            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x, y_n, z_n);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x, y_n, z_n);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);
            let x4 = AvxVector::from_sse(x4_0, x4_1);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        } else {
            let x0_0 = r0.fetch(x_n, y, z);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x_n, y_n, z);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x_n, y, z);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x_n, y_n, z);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);
            let x4 = AvxVector::from_sse(x4_0, x4_1);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        }
    }
}
729
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidAvxFmaDouble<GRID_SIZE> {
    /// Pyramidal interpolation evaluated for two LUTs at once.
    ///
    /// Each corner is fetched from both tables and packed into a 256-bit
    /// vector (table0 low half, table1 high half) so the FMA chain is shared;
    /// the result is split back into the two per-table 128-bit values.
    ///
    /// # Safety
    /// Caller must ensure AVX2 and FMA are available and indices are within
    /// `lut`'s bounds (reads are unchecked).
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
        r0: impl Fetcher<AvxVectorSse>,
        r1: impl Fetcher<AvxVectorSse>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = unsafe { lut.get_unchecked(in_r) };
        let lut_g = unsafe { lut.get_unchecked(in_g) };
        let lut_b = unsafe { lut.get_unchecked(in_b) };

        // Lattice corners and fractional weights per axis.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        // Base corner from each table.
        let c0_0 = r0.fetch(x, y, z);
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVector::from(db);
        let w1 = AvxVector::from(dr);
        let w2 = AvxVector::from(dg);

        let c0 = AvxVector::from_sse(c0_0, c0_1);

        // Pyramid selection mirrors the single-table `PyramidalAvxFma` kernel.
        if dr > db && dg > db {
            let w3 = AvxVector::from(dr * dg);

            let x0_0 = r0.fetch(x_n, y_n, z_n);
            let x1_0 = r0.fetch(x_n, y_n, z);
            let x2_0 = r0.fetch(x_n, y, z);
            let x3_0 = r0.fetch(x, y_n, z);

            let x0_1 = r1.fetch(x_n, y_n, z_n);
            let x1_1 = r1.fetch(x_n, y_n, z);
            let x2_1 = r1.fetch(x_n, y, z);
            let x3_1 = r1.fetch(x, y_n, z);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        } else if db > dr && dg > dr {
            let w3 = AvxVector::from(dg * db);

            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y_n, z_n);
            let x2_0 = r0.fetch(x, y_n, z_n);
            let x3_0 = r0.fetch(x, y_n, z);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y_n, z_n);
            let x2_1 = r1.fetch(x, y_n, z_n);
            let x3_1 = r1.fetch(x, y_n, z);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        } else {
            let w3 = AvxVector::from(db * dr);

            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z);
            let x2_0 = r0.fetch(x_n, y, z_n);
            let x3_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z);
            let x2_1 = r1.fetch(x_n, y, z_n);
            let x3_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        }
    }
}
851
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxFmaDouble<GRID_SIZE> {
    /// Tetrahedral interpolation for two LUTs at once, using a 256-bit
    /// fetcher that returns both tables' entries per lattice point.
    ///
    /// Branch structure is identical to `TetrahedralAvxFma::interpolate`;
    /// only the vector width differs. Returns (table0 result, table1 result).
    ///
    /// # Safety
    /// Caller must ensure AVX2 and FMA are available and indices are within
    /// `lut`'s bounds (reads are unchecked).
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
        rv: impl Fetcher<AvxVector>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = unsafe { lut.get_unchecked(in_r) };
        let lut_g = unsafe { lut.get_unchecked(in_g) };
        let lut_b = unsafe { lut.get_unchecked(in_b) };

        // Lattice corners and fractional weights per axis.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let c0 = rv.fetch(x, y, z);

        let w0 = AvxVector::from(rx);
        let w1 = AvxVector::from(ry);
        let w2 = AvxVector::from(rz);

        let c2;
        let c1;
        let c3;
        if rx >= ry {
            if ry >= rz {
                //rx >= ry && ry >= rz
                c1 = rv.fetch(x_n, y, z) - c0;
                c2 = rv.fetch(x_n, y_n, z) - rv.fetch(x_n, y, z);
                c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
            } else if rx >= rz {
                //rx >= rz && rz >= ry
                c1 = rv.fetch(x_n, y, z) - c0;
                c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
                c3 = rv.fetch(x_n, y, z_n) - rv.fetch(x_n, y, z);
            } else {
                //rz > rx && rx >= ry
                c1 = rv.fetch(x_n, y, z_n) - rv.fetch(x, y, z_n);
                c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
                c3 = rv.fetch(x, y, z_n) - c0;
            }
        } else if rx >= rz {
            //ry > rx && rx >= rz
            c1 = rv.fetch(x_n, y_n, z) - rv.fetch(x, y_n, z);
            c2 = rv.fetch(x, y_n, z) - c0;
            c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
        } else if ry >= rz {
            //ry >= rz && rz > rx
            c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
            c2 = rv.fetch(x, y_n, z) - c0;
            c3 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y_n, z);
        } else {
            //rz > ry && ry > rx
            c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
            c2 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y, z_n);
            c3 = rv.fetch(x, y, z_n) - c0;
        }
        // c0 + rx*c1 + ry*c2 + rz*c3 across both tables simultaneously.
        let s0 = c0.mla(c1, w0);
        let s1 = s0.mla(c2, w1);
        s1.mla(c3, w2).split()
    }
}
926
927impl<const GRID_SIZE: usize> TrilinearAvxFmaDouble<GRID_SIZE> {
928    #[target_feature(enable = "avx2", enable = "fma")]
929    unsafe fn interpolate(
930        &self,
931        in_r: usize,
932        in_g: usize,
933        in_b: usize,
934        lut: &[BarycentricWeight<f32>],
935        rv: impl Fetcher<AvxVector>,
936    ) -> (AvxVectorSse, AvxVectorSse) {
937        let lut_r = unsafe { lut.get_unchecked(in_r) };
938        let lut_g = unsafe { lut.get_unchecked(in_g) };
939        let lut_b = unsafe { lut.get_unchecked(in_b) };
940
941        let x: i32 = lut_r.x;
942        let y: i32 = lut_g.x;
943        let z: i32 = lut_b.x;
944
945        let x_n: i32 = lut_r.x_n;
946        let y_n: i32 = lut_g.x_n;
947        let z_n: i32 = lut_b.x_n;
948
949        let rx = lut_r.w;
950        let ry = lut_g.w;
951        let rz = lut_b.w;
952
953        let w0 = AvxVector::from(rx);
954        let w1 = AvxVector::from(ry);
955        let w2 = AvxVector::from(rz);
956
957        let c000 = rv.fetch(x, y, z);
958        let c100 = rv.fetch(x_n, y, z);
959        let c010 = rv.fetch(x, y_n, z);
960        let c110 = rv.fetch(x_n, y_n, z);
961        let c001 = rv.fetch(x, y, z_n);
962        let c101 = rv.fetch(x_n, y, z_n);
963        let c011 = rv.fetch(x, y_n, z_n);
964        let c111 = rv.fetch(x_n, y_n, z_n);
965
966        let dx = AvxVector::from(rx);
967
968        let c00 = c000.neg_mla(c000, dx).mla(c100, w0);
969        let c10 = c010.neg_mla(c010, dx).mla(c110, w0);
970        let c01 = c001.neg_mla(c001, dx).mla(c101, w0);
971        let c11 = c011.neg_mla(c011, dx).mla(c111, w0);
972
973        let dy = AvxVector::from(ry);
974
975        let c0 = c00.neg_mla(c00, dy).mla(c10, w1);
976        let c1 = c01.neg_mla(c01, dy).mla(c11, w1);
977
978        let dz = AvxVector::from(rz);
979
980        c0.neg_mla(c0, dz).mla(c1, w2).split()
981    }
982}
983
984impl<const GRID_SIZE: usize> TrilinearAvxFma<GRID_SIZE> {
985    #[target_feature(enable = "avx2", enable = "fma")]
986    unsafe fn interpolate(
987        &self,
988        in_r: usize,
989        in_g: usize,
990        in_b: usize,
991        lut: &[BarycentricWeight<f32>],
992        r: impl Fetcher<AvxVectorSse>,
993    ) -> AvxVectorSse {
994        let lut_r = unsafe { lut.get_unchecked(in_r) };
995        let lut_g = unsafe { lut.get_unchecked(in_g) };
996        let lut_b = unsafe { lut.get_unchecked(in_b) };
997
998        let x: i32 = lut_r.x;
999        let y: i32 = lut_g.x;
1000        let z: i32 = lut_b.x;
1001
1002        let x_n: i32 = lut_r.x_n;
1003        let y_n: i32 = lut_g.x_n;
1004        let z_n: i32 = lut_b.x_n;
1005
1006        let dr = lut_r.w;
1007        let dg = lut_g.w;
1008        let db = lut_b.w;
1009
1010        let w0 = AvxVector::from(dr);
1011        let w1 = AvxVector::from(dg);
1012        let w2 = AvxVectorSse::from(db);
1013
1014        let c000 = r.fetch(x, y, z);
1015        let c100 = r.fetch(x_n, y, z);
1016        let c010 = r.fetch(x, y_n, z);
1017        let c110 = r.fetch(x_n, y_n, z);
1018        let c001 = r.fetch(x, y, z_n);
1019        let c101 = r.fetch(x_n, y, z_n);
1020        let c011 = r.fetch(x, y_n, z_n);
1021        let c111 = r.fetch(x_n, y_n, z_n);
1022
1023        let x000 = AvxVector::from_sse(c000, c001);
1024        let x010 = AvxVector::from_sse(c010, c011);
1025        let x011 = AvxVector::from_sse(c100, c101);
1026        let x111 = AvxVector::from_sse(c110, c111);
1027
1028        let c00 = x000.neg_mla(x000, w0).mla(x011, w0);
1029        let c10 = x010.neg_mla(x010, w0).mla(x111, w0);
1030
1031        let z0 = c00.neg_mla(c00, w1).mla(c10, w1);
1032
1033        let (c0, c1) = z0.split();
1034
1035        c0.neg_mla(c0, w2).mla(c1, w2)
1036    }
1037}