moxcms/conversions/avx/
interpolator_q0_15.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::conversions::interpolator::BarycentricWeight;
30use crate::math::FusedMultiplyAdd;
31use std::arch::x86_64::*;
32use std::ops::{Add, Mul, Sub};
33
/// One LUT table entry: four `i16` values stored contiguously with 8-byte alignment.
///
/// The fetchers in this file read a whole entry with a single 64-bit load
/// (`_mm_loadu_si64`), so one entry fills the low four 16-bit lanes of a vector.
#[repr(align(8), C)]
pub(crate) struct AvxAlignedI16(pub(crate) [i16; 4]);
36
/// Field-less selector type for the single-table tetrahedral kernel.
///
/// `GRID_SIZE` is the cube edge length used when `(x, y, z)` is linearised
/// into a table offset. Only compiled with the `options` feature.
#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxQ0_15<const GRID_SIZE: usize> {}

/// Field-less selector type for the single-table pyramidal kernel.
/// Only compiled with the `options` feature.
#[cfg(feature = "options")]
pub(crate) struct PyramidalAvxQ0_15<const GRID_SIZE: usize> {}

/// Field-less selector type for the single-table prismatic kernel.
/// Only compiled with the `options` feature.
#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxQ0_15<const GRID_SIZE: usize> {}

/// Field-less selector type for the single-table trilinear kernel.
/// Always compiled (no feature gate).
pub(crate) struct TrilinearAvxQ0_15<const GRID_SIZE: usize> {}

/// Field-less selector type for the two-table prismatic kernel, which
/// interpolates both tables in one 256-bit vector.
/// Only compiled with the `options` feature.
#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxQ0_15Double<const GRID_SIZE: usize> {}

/// Field-less selector type for the two-table trilinear kernel.
/// Always compiled (no feature gate).
pub(crate) struct TrilinearAvxQ0_15Double<const GRID_SIZE: usize> {}

/// Field-less selector type for the two-table pyramidal kernel.
/// Only compiled with the `options` feature.
#[cfg(feature = "options")]
pub(crate) struct PyramidAvxFmaQ0_15Double<const GRID_SIZE: usize> {}

/// Field-less selector type for the two-table tetrahedral kernel.
/// Only compiled with the `options` feature.
#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxQ0_15Double<const GRID_SIZE: usize> {}
58
/// Interpolation over two LUT tables at once.
///
/// Implementors in this file forward to their own `interpolate` kernel.
pub(crate) trait AvxMdInterpolationQ0_15Double {
    /// Interpolates `table0` and `table1` at the same grid position.
    ///
    /// * `table0`, `table1` - the two tables, one [`AvxAlignedI16`] per grid node.
    /// * `in_r`, `in_g`, `in_b` - indices into `lut`, one per axis.
    /// * `lut` - per-index grid coordinates and weights.
    ///
    /// Returns the pair `(result for table0, result for table1)`.
    fn inter3_sse(
        &self,
        table0: &[AvxAlignedI16],
        table1: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse);
}
70
/// Interpolation over a single LUT table.
///
/// Implementors in this file forward to their own `interpolate` kernel.
pub(crate) trait AvxMdInterpolationQ0_15 {
    /// Interpolates `table` at the grid position described by `lut`.
    ///
    /// * `table` - the table, one [`AvxAlignedI16`] per grid node.
    /// * `in_r`, `in_g`, `in_b` - indices into `lut`, one per axis.
    /// * `lut` - per-index grid coordinates and weights.
    fn inter3_sse(
        &self,
        table: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> AvxVectorQ0_15Sse;
}
81
/// Loads the table value stored at a grid node.
trait Fetcher<T> {
    /// Returns the value at grid coordinates `(x, y, z)`.
    fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}
85
/// 128-bit vector of 16-bit lanes (transparent wrapper over `__m128i`).
///
/// The operator impls in this file treat the lanes as fixed-point values:
/// `+`/`-` are lane-wise 16-bit add/sub and `*` is `_mm_mulhrs_epi16`.
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorQ0_15Sse {
    pub(crate) v: __m128i,
}
91
/// 256-bit vector of 16-bit lanes (transparent wrapper over `__m256i`).
///
/// The operator impls in this file treat the lanes as fixed-point values:
/// `+`/`-` are lane-wise 16-bit add/sub and `*` is `_mm256_mulhrs_epi16`.
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorQ0_15 {
    pub(crate) v: __m256i,
}
97
98impl AvxVectorQ0_15 {
99    #[inline(always)]
100    pub(crate) fn from_sse(lo: AvxVectorQ0_15Sse, hi: AvxVectorQ0_15Sse) -> AvxVectorQ0_15 {
101        unsafe {
102            AvxVectorQ0_15 {
103                v: _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(lo.v), hi.v),
104            }
105        }
106    }
107
108    #[inline(always)]
109    pub(crate) fn split(self) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
110        unsafe {
111            (
112                AvxVectorQ0_15Sse {
113                    v: _mm256_castsi256_si128(self.v),
114                },
115                AvxVectorQ0_15Sse {
116                    v: _mm256_extracti128_si256::<1>(self.v),
117                },
118            )
119        }
120    }
121}
122
123impl From<i16> for AvxVectorQ0_15Sse {
124    #[inline(always)]
125    fn from(v: i16) -> Self {
126        AvxVectorQ0_15Sse {
127            v: unsafe { _mm_set1_epi16(v) },
128        }
129    }
130}
131
132impl From<i16> for AvxVectorQ0_15 {
133    #[inline(always)]
134    fn from(v: i16) -> Self {
135        AvxVectorQ0_15 {
136            v: unsafe { _mm256_set1_epi16(v) },
137        }
138    }
139}
140
141impl Sub<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
142    type Output = Self;
143    #[inline(always)]
144    fn sub(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
145        AvxVectorQ0_15Sse {
146            v: unsafe { _mm_sub_epi16(self.v, rhs.v) },
147        }
148    }
149}
150
151impl Sub<AvxVectorQ0_15> for AvxVectorQ0_15 {
152    type Output = Self;
153    #[inline(always)]
154    fn sub(self, rhs: AvxVectorQ0_15) -> Self::Output {
155        AvxVectorQ0_15 {
156            v: unsafe { _mm256_sub_epi16(self.v, rhs.v) },
157        }
158    }
159}
160
161impl Add<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
162    type Output = Self;
163    #[inline(always)]
164    fn add(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
165        AvxVectorQ0_15Sse {
166            v: unsafe { _mm_add_epi16(self.v, rhs.v) },
167        }
168    }
169}
170
171impl Mul<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
172    type Output = Self;
173    #[inline(always)]
174    fn mul(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
175        AvxVectorQ0_15Sse {
176            v: unsafe { _mm_mulhrs_epi16(self.v, rhs.v) },
177        }
178    }
179}
180
181impl Add<AvxVectorQ0_15> for AvxVectorQ0_15 {
182    type Output = Self;
183    #[inline(always)]
184    fn add(self, rhs: AvxVectorQ0_15) -> Self::Output {
185        AvxVectorQ0_15 {
186            v: unsafe { _mm256_add_epi16(self.v, rhs.v) },
187        }
188    }
189}
190
191impl Mul<AvxVectorQ0_15> for AvxVectorQ0_15 {
192    type Output = Self;
193    #[inline(always)]
194    fn mul(self, rhs: AvxVectorQ0_15) -> Self::Output {
195        AvxVectorQ0_15 {
196            v: unsafe { _mm256_mulhrs_epi16(self.v, rhs.v) },
197        }
198    }
199}
200
201impl FusedMultiplyAdd<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
202    #[inline(always)]
203    fn mla(&self, b: AvxVectorQ0_15Sse, c: AvxVectorQ0_15Sse) -> AvxVectorQ0_15Sse {
204        AvxVectorQ0_15Sse {
205            v: unsafe { _mm_add_epi16(_mm_mulhrs_epi16(b.v, c.v), self.v) },
206        }
207    }
208}
209
210impl FusedMultiplyAdd<AvxVectorQ0_15> for AvxVectorQ0_15 {
211    #[inline(always)]
212    fn mla(&self, b: AvxVectorQ0_15, c: AvxVectorQ0_15) -> AvxVectorQ0_15 {
213        AvxVectorQ0_15 {
214            v: unsafe { _mm256_add_epi16(_mm256_mulhrs_epi16(b.v, c.v), self.v) },
215        }
216    }
217}
218
/// Fetcher over a single table; yields one 128-bit vector per grid node.
///
/// `GRID_SIZE` is the cube edge length used to linearise `(x, y, z)`.
struct TetrahedralAvxSseFetchVector<'a, const GRID_SIZE: usize> {
    // Table entries, one per grid node.
    cube: &'a [AvxAlignedI16],
}
222
/// Fetcher over two tables; yields one 256-bit vector per grid node with the
/// `cube0` entry in the lower half and the `cube1` entry in the upper half.
///
/// `GRID_SIZE` is the cube edge length used to linearise `(x, y, z)`.
struct TetrahedralAvxFetchVector<'a, const GRID_SIZE: usize> {
    // Table whose entry lands in the lower 128-bit half.
    cube0: &'a [AvxAlignedI16],
    // Table whose entry lands in the upper 128-bit half.
    cube1: &'a [AvxAlignedI16],
}
227
228impl<const GRID_SIZE: usize> Fetcher<AvxVectorQ0_15> for TetrahedralAvxFetchVector<'_, GRID_SIZE> {
229    #[inline(always)]
230    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorQ0_15 {
231        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
232            + y as u32 * GRID_SIZE as u32
233            + z as u32) as usize;
234        let jx0 = unsafe { self.cube0.get_unchecked(offset..) };
235        let jx1 = unsafe { self.cube1.get_unchecked(offset..) };
236        AvxVectorQ0_15 {
237            v: unsafe {
238                _mm256_inserti128_si256::<1>(
239                    _mm256_castsi128_si256(_mm_loadu_si64(jx0.as_ptr() as *const _)),
240                    _mm_loadu_si64(jx1.as_ptr() as *const _),
241                )
242            },
243        }
244    }
245}
246
247impl<const GRID_SIZE: usize> Fetcher<AvxVectorQ0_15Sse>
248    for TetrahedralAvxSseFetchVector<'_, GRID_SIZE>
249{
250    #[inline(always)]
251    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorQ0_15Sse {
252        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
253            + y as u32 * GRID_SIZE as u32
254            + z as u32) as usize;
255        let jx = unsafe { self.cube.get_unchecked(offset..) };
256        AvxVectorQ0_15Sse {
257            v: unsafe { _mm_loadu_si64(jx.as_ptr() as *const _) },
258        }
259    }
260}
261
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxQ0_15<GRID_SIZE> {
    /// Tetrahedral interpolation of one table.
    ///
    /// The ordering of the three axis weights selects one of six edge paths
    /// from the base corner `(x, y, z)` to the far corner `(x_n, y_n, z_n)`.
    /// The result is `c000 + d_x * rx + d_y * ry + d_z * rz`, with the deltas
    /// taken along that path.
    ///
    /// # Safety
    /// Requires AVX2. `in_r`, `in_g`, `in_b` must be valid indices into `lut`
    /// (they are read with `get_unchecked`).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        let (axis_r, axis_g, axis_b) = unsafe {
            (
                *lut.get_unchecked(in_r),
                *lut.get_unchecked(in_g),
                *lut.get_unchecked(in_b),
            )
        };

        let (x, y, z): (i32, i32, i32) = (axis_r.x, axis_g.x, axis_b.x);
        let (x_n, y_n, z_n): (i32, i32, i32) = (axis_r.x_n, axis_g.x_n, axis_b.x_n);
        let (rx, ry, rz) = (axis_r.w, axis_g.w, axis_b.w);

        // Every path starts at c000 and ends at c111.
        let c000 = r.fetch(x, y, z);
        let c111 = r.fetch(x_n, y_n, z_n);

        let (d_x, d_y, d_z) = if rx >= ry {
            if ry >= rz {
                // rx >= ry >= rz
                let c100 = r.fetch(x_n, y, z);
                let c110 = r.fetch(x_n, y_n, z);
                (c100 - c000, c110 - c100, c111 - c110)
            } else if rx >= rz {
                // rx >= rz > ry
                let c100 = r.fetch(x_n, y, z);
                let c101 = r.fetch(x_n, y, z_n);
                (c100 - c000, c111 - c101, c101 - c100)
            } else {
                // rz > rx >= ry
                let c001 = r.fetch(x, y, z_n);
                let c101 = r.fetch(x_n, y, z_n);
                (c101 - c001, c111 - c101, c001 - c000)
            }
        } else if rx >= rz {
            // ry > rx >= rz
            let c010 = r.fetch(x, y_n, z);
            let c110 = r.fetch(x_n, y_n, z);
            (c110 - c010, c010 - c000, c111 - c110)
        } else if ry >= rz {
            // ry >= rz > rx
            let c010 = r.fetch(x, y_n, z);
            let c011 = r.fetch(x, y_n, z_n);
            (c111 - c011, c010 - c000, c011 - c010)
        } else {
            // rz > ry > rx
            let c001 = r.fetch(x, y, z_n);
            let c011 = r.fetch(x, y_n, z_n);
            (c111 - c011, c011 - c001, c001 - c000)
        };

        c000.mla(d_x, AvxVectorQ0_15Sse::from(rx))
            .mla(d_y, AvxVectorQ0_15Sse::from(ry))
            .mla(d_z, AvxVectorQ0_15Sse::from(rz))
    }
}
332
// Generates the `AvxMdInterpolationQ0_15` impl for a single-table kernel type.
// The generated method wraps `table` in a `TetrahedralAvxSseFetchVector` and
// forwards to the type's own `interpolate`.
macro_rules! define_interp_avx {
    ($kernel: ident) => {
        impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15 for $kernel<GRID_SIZE> {
            fn inter3_sse(
                &self,
                table: &[AvxAlignedI16],
                in_r: usize,
                in_g: usize,
                in_b: usize,
                lut: &[BarycentricWeight<i16>],
            ) -> AvxVectorQ0_15Sse {
                let fetcher = TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table };
                // NOTE(review): `interpolate` is an AVX2 `unsafe fn`; presumably AVX2
                // support is verified before this type is selected — confirm.
                unsafe { self.interpolate(in_r, in_g, in_b, lut, fetcher) }
            }
        }
    };
}
357
// Generates the `AvxMdInterpolationQ0_15Double` impl for a two-table kernel type
// whose `interpolate` takes one 128-bit fetcher per table.
#[cfg(feature = "options")]
macro_rules! define_interp_avx_d {
    ($kernel: ident) => {
        impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double for $kernel<GRID_SIZE> {
            fn inter3_sse(
                &self,
                table0: &[AvxAlignedI16],
                table1: &[AvxAlignedI16],
                in_r: usize,
                in_g: usize,
                in_b: usize,
                lut: &[BarycentricWeight<i16>],
            ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
                let first = TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table0 };
                let second = TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table1 };
                // NOTE(review): `interpolate` is an AVX2 `unsafe fn`; presumably AVX2
                // support is verified before this type is selected — confirm.
                unsafe { self.interpolate(in_r, in_g, in_b, lut, first, second) }
            }
        }
    };
}
385
// Single-table `AvxMdInterpolationQ0_15` impls generated by `define_interp_avx!`.
#[cfg(feature = "options")]
define_interp_avx!(TetrahedralAvxQ0_15);
#[cfg(feature = "options")]
define_interp_avx!(PyramidalAvxQ0_15);
#[cfg(feature = "options")]
define_interp_avx!(PrismaticAvxQ0_15);
// `TrilinearAvxQ0_15` has no feature gate, so its impl is always generated.
define_interp_avx!(TrilinearAvxQ0_15);
// Two-table `AvxMdInterpolationQ0_15Double` impls generated by `define_interp_avx_d!`.
#[cfg(feature = "options")]
define_interp_avx_d!(PrismaticAvxQ0_15Double);
#[cfg(feature = "options")]
define_interp_avx_d!(PyramidAvxFmaQ0_15Double);
397
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double
    for TetrahedralAvxQ0_15Double<GRID_SIZE>
{
    /// Interpolates both tables at once: the fetcher packs `table0` into the
    /// lower 128-bit half and `table1` into the upper half of each vector.
    fn inter3_sse(
        &self,
        table0: &[AvxAlignedI16],
        table1: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let paired = TetrahedralAvxFetchVector::<GRID_SIZE> {
            cube0: table0,
            cube1: table1,
        };
        // NOTE(review): `interpolate` is an AVX2 `unsafe fn`; presumably AVX2
        // support is verified before this type is selected — confirm.
        unsafe { self.interpolate(in_r, in_g, in_b, lut, paired) }
    }
}
425
426impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double for TrilinearAvxQ0_15Double<GRID_SIZE> {
427    fn inter3_sse(
428        &self,
429        table0: &[AvxAlignedI16],
430        table1: &[AvxAlignedI16],
431        in_r: usize,
432        in_g: usize,
433        in_b: usize,
434        lut: &[BarycentricWeight<i16>],
435    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
436        unsafe {
437            self.interpolate(
438                in_r,
439                in_g,
440                in_b,
441                lut,
442                TetrahedralAvxFetchVector::<GRID_SIZE> {
443                    cube0: table0,
444                    cube1: table1,
445                },
446            )
447        }
448    }
449}
450
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidalAvxQ0_15<GRID_SIZE> {
    /// Pyramidal interpolation of one table.
    ///
    /// The smallest of the three weights picks one of three branches. Each
    /// branch yields three first-order deltas (weighted by `db`, `dr`, `dg`)
    /// and one mixed delta weighted by the product of the two larger weights.
    ///
    /// # Safety
    /// Requires AVX2. `in_r`, `in_g`, `in_b` must be valid indices into `lut`
    /// (they are read with `get_unchecked`).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        let (axis_r, axis_g, axis_b) = unsafe {
            (
                *lut.get_unchecked(in_r),
                *lut.get_unchecked(in_g),
                *lut.get_unchecked(in_b),
            )
        };

        let (x, y, z): (i32, i32, i32) = (axis_r.x, axis_g.x, axis_b.x);
        let (x_n, y_n, z_n): (i32, i32, i32) = (axis_r.x_n, axis_g.x_n, axis_b.x_n);
        let (dr, dg, db) = (axis_r.w, axis_g.w, axis_b.w);

        let c000 = r.fetch(x, y, z);

        let weight_b = AvxVectorQ0_15Sse::from(db);
        let weight_r = AvxVectorQ0_15Sse::from(dr);
        let weight_g = AvxVectorQ0_15Sse::from(dg);

        // Each branch produces (delta_b, delta_r, delta_g, delta_mixed, weight_mixed).
        let (delta_b, delta_r, delta_g, delta_mixed, weight_mixed) = if dr > db && dg > db {
            // b is the smallest weight.
            let c111 = r.fetch(x_n, y_n, z_n);
            let c110 = r.fetch(x_n, y_n, z);
            let c100 = r.fetch(x_n, y, z);
            let c010 = r.fetch(x, y_n, z);
            (
                c111 - c110,
                c100 - c000,
                c010 - c000,
                c000 - c010 - c100 + c110,
                weight_r * weight_g,
            )
        } else if db > dr && dg > dr {
            // r is the smallest weight.
            let c001 = r.fetch(x, y, z_n);
            let c111 = r.fetch(x_n, y_n, z_n);
            let c011 = r.fetch(x, y_n, z_n);
            let c010 = r.fetch(x, y_n, z);
            (
                c001 - c000,
                c111 - c011,
                c010 - c000,
                c000 - c010 - c001 + c011,
                weight_g * weight_b,
            )
        } else {
            // Remaining cases, including ties.
            let c001 = r.fetch(x, y, z_n);
            let c100 = r.fetch(x_n, y, z);
            let c101 = r.fetch(x_n, y, z_n);
            let c111 = r.fetch(x_n, y_n, z_n);
            (
                c001 - c000,
                c100 - c000,
                c111 - c101,
                c000 - c100 - c001 + c101,
                weight_b * weight_r,
            )
        };

        c000.mla(delta_b, weight_b)
            .mla(delta_r, weight_r)
            .mla(delta_g, weight_g)
            .mla(delta_mixed, weight_mixed)
    }
}
537
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxQ0_15<GRID_SIZE> {
    /// Prismatic interpolation of one table.
    ///
    /// Comparing `db` with `dr` selects one of two branches. Each branch yields
    /// three first-order deltas (weighted by `db`, `dr`, `dg`) and two mixed
    /// deltas weighted by `dg * db` and `dr * dg`.
    ///
    /// # Safety
    /// Requires AVX2. `in_r`, `in_g`, `in_b` must be valid indices into `lut`
    /// (they are read with `get_unchecked`).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        let (axis_r, axis_g, axis_b) = unsafe {
            (
                *lut.get_unchecked(in_r),
                *lut.get_unchecked(in_g),
                *lut.get_unchecked(in_b),
            )
        };

        let (x, y, z): (i32, i32, i32) = (axis_r.x, axis_g.x, axis_b.x);
        let (x_n, y_n, z_n): (i32, i32, i32) = (axis_r.x_n, axis_g.x_n, axis_b.x_n);
        let (dr, dg, db) = (axis_r.w, axis_g.w, axis_b.w);

        let c000 = r.fetch(x, y, z);

        let weight_b = AvxVectorQ0_15Sse::from(db);
        let weight_r = AvxVectorQ0_15Sse::from(dr);
        let weight_g = AvxVectorQ0_15Sse::from(dg);
        let weight_gb = weight_g * weight_b;
        let weight_rg = weight_r * weight_g;

        // Each branch produces (delta_b, delta_r, delta_g, delta_gb, delta_rg).
        let (delta_b, delta_r, delta_g, delta_gb, delta_rg) = if db > dr {
            let c001 = r.fetch(x, y, z_n);
            let c101 = r.fetch(x_n, y, z_n);
            let c010 = r.fetch(x, y_n, z);
            let c011 = r.fetch(x, y_n, z_n);
            let c111 = r.fetch(x_n, y_n, z_n);
            (
                c001 - c000,
                c101 - c001,
                c010 - c000,
                c000 - c010 - c001 + c011,
                c001 - c011 - c101 + c111,
            )
        } else {
            let c100 = r.fetch(x_n, y, z);
            let c101 = r.fetch(x_n, y, z_n);
            let c010 = r.fetch(x, y_n, z);
            let c110 = r.fetch(x_n, y_n, z);
            let c111 = r.fetch(x_n, y_n, z_n);
            (
                c101 - c100,
                c100 - c000,
                c010 - c000,
                c100 - c110 - c101 + c111,
                c000 - c010 - c100 + c110,
            )
        };

        c000.mla(delta_b, weight_b)
            .mla(delta_r, weight_r)
            .mla(delta_g, weight_g)
            .mla(delta_gb, weight_gb)
            .mla(delta_rg, weight_rg)
    }
}
612
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxQ0_15Double<GRID_SIZE> {
    /// Prismatic interpolation of two tables at once.
    ///
    /// Every vertex is fetched from both tables and packed into one 256-bit
    /// vector (`r0` in the lower half, `r1` in the upper half), so a single
    /// pass computes both results. Comparing `db` with `dr` selects one of two
    /// branches. Each branch yields three first-order deltas (weighted by
    /// `db`, `dr`, `dg`) and two mixed deltas weighted by `dg * db` and
    /// `dr * dg`.
    ///
    /// Returns `(result for r0, result for r1)`.
    ///
    /// # Safety
    /// Requires AVX2. `in_r`, `in_g`, `in_b` must be valid indices into `lut`
    /// (they are read with `get_unchecked`).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r0: impl Fetcher<AvxVectorQ0_15Sse>,
        r1: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        // Each table contributes its own base vertex; the upper half must come
        // from `r1`, otherwise the second result starts from table 0's entry.
        let c0_0 = r0.fetch(x, y, z);
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVectorQ0_15::from(db);
        let w1 = AvxVectorQ0_15::from(dr);
        let w2 = AvxVectorQ0_15::from(dg);
        let w3 = AvxVectorQ0_15::from(dg) * AvxVectorQ0_15::from(db);
        let w4 = AvxVectorQ0_15::from(dr) * AvxVectorQ0_15::from(dg);

        let c0 = AvxVectorQ0_15::from_sse(c0_0, c0_1);

        if db > dr {
            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x, y_n, z_n);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x, y_n, z_n);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            // Pack table 0 (lower half) and table 1 (upper half) per vertex.
            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);
            let x4 = AvxVectorQ0_15::from_sse(x4_0, x4_1);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        } else {
            let x0_0 = r0.fetch(x_n, y, z);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x_n, y_n, z);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x_n, y, z);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x_n, y_n, z);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            // Pack table 0 (lower half) and table 1 (upper half) per vertex.
            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);
            let x4 = AvxVectorQ0_15::from_sse(x4_0, x4_1);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        }
    }
}
715
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidAvxFmaQ0_15Double<GRID_SIZE> {
    /// Pyramidal interpolation of two tables at once.
    ///
    /// Every vertex is fetched from both tables and packed into one 256-bit
    /// vector (`r0` in the lower half, `r1` in the upper half). The smallest of
    /// the three weights picks one of three branches. Each branch yields three
    /// first-order deltas (weighted by `db`, `dr`, `dg`) and one mixed delta
    /// weighted by the product of the two larger weights.
    ///
    /// Returns `(result for r0, result for r1)`.
    ///
    /// # Safety
    /// Requires AVX2. `in_r`, `in_g`, `in_b` must be valid indices into `lut`
    /// (they are read with `get_unchecked`).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r0: impl Fetcher<AvxVectorQ0_15Sse>,
        r1: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let (axis_r, axis_g, axis_b) = unsafe {
            (
                *lut.get_unchecked(in_r),
                *lut.get_unchecked(in_g),
                *lut.get_unchecked(in_b),
            )
        };

        let (x, y, z): (i32, i32, i32) = (axis_r.x, axis_g.x, axis_b.x);
        let (x_n, y_n, z_n): (i32, i32, i32) = (axis_r.x_n, axis_g.x_n, axis_b.x_n);
        let (dr, dg, db) = (axis_r.w, axis_g.w, axis_b.w);

        let c000 = AvxVectorQ0_15::from_sse(r0.fetch(x, y, z), r1.fetch(x, y, z));

        let weight_b = AvxVectorQ0_15::from(db);
        let weight_r = AvxVectorQ0_15::from(dr);
        let weight_g = AvxVectorQ0_15::from(dg);

        // Each branch produces (delta_b, delta_r, delta_g, delta_mixed, weight_mixed).
        let (delta_b, delta_r, delta_g, delta_mixed, weight_mixed) = if dr > db && dg > db {
            // b is the smallest weight.
            let c111 = AvxVectorQ0_15::from_sse(r0.fetch(x_n, y_n, z_n), r1.fetch(x_n, y_n, z_n));
            let c110 = AvxVectorQ0_15::from_sse(r0.fetch(x_n, y_n, z), r1.fetch(x_n, y_n, z));
            let c100 = AvxVectorQ0_15::from_sse(r0.fetch(x_n, y, z), r1.fetch(x_n, y, z));
            let c010 = AvxVectorQ0_15::from_sse(r0.fetch(x, y_n, z), r1.fetch(x, y_n, z));
            (
                c111 - c110,
                c100 - c000,
                c010 - c000,
                c000 - c010 - c100 + c110,
                weight_r * weight_g,
            )
        } else if db > dr && dg > dr {
            // r is the smallest weight.
            let c001 = AvxVectorQ0_15::from_sse(r0.fetch(x, y, z_n), r1.fetch(x, y, z_n));
            let c111 = AvxVectorQ0_15::from_sse(r0.fetch(x_n, y_n, z_n), r1.fetch(x_n, y_n, z_n));
            let c011 = AvxVectorQ0_15::from_sse(r0.fetch(x, y_n, z_n), r1.fetch(x, y_n, z_n));
            let c010 = AvxVectorQ0_15::from_sse(r0.fetch(x, y_n, z), r1.fetch(x, y_n, z));
            (
                c001 - c000,
                c111 - c011,
                c010 - c000,
                c000 - c010 - c001 + c011,
                weight_g * weight_b,
            )
        } else {
            // Remaining cases, including ties.
            let c001 = AvxVectorQ0_15::from_sse(r0.fetch(x, y, z_n), r1.fetch(x, y, z_n));
            let c100 = AvxVectorQ0_15::from_sse(r0.fetch(x_n, y, z), r1.fetch(x_n, y, z));
            let c101 = AvxVectorQ0_15::from_sse(r0.fetch(x_n, y, z_n), r1.fetch(x_n, y, z_n));
            let c111 = AvxVectorQ0_15::from_sse(r0.fetch(x_n, y_n, z_n), r1.fetch(x_n, y_n, z_n));
            (
                c001 - c000,
                c100 - c000,
                c111 - c101,
                c000 - c100 - c001 + c101,
                weight_b * weight_r,
            )
        };

        c000.mla(delta_b, weight_b)
            .mla(delta_r, weight_r)
            .mla(delta_g, weight_g)
            .mla(delta_mixed, weight_mixed)
            .split()
    }
}
837
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxQ0_15Double<GRID_SIZE> {
    /// Tetrahedral interpolation of two tables at once.
    ///
    /// `rv` yields 256-bit vectors holding both tables' entries for a vertex.
    /// The ordering of the three axis weights selects one of six edge paths
    /// from `(x, y, z)` to `(x_n, y_n, z_n)`. The result is
    /// `c000 + d_x * rx + d_y * ry + d_z * rz`, split into its two 128-bit
    /// halves.
    ///
    /// # Safety
    /// Requires AVX2. `in_r`, `in_g`, `in_b` must be valid indices into `lut`
    /// (they are read with `get_unchecked`).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        rv: impl Fetcher<AvxVectorQ0_15>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let (axis_r, axis_g, axis_b) = unsafe {
            (
                *lut.get_unchecked(in_r),
                *lut.get_unchecked(in_g),
                *lut.get_unchecked(in_b),
            )
        };

        let (x, y, z): (i32, i32, i32) = (axis_r.x, axis_g.x, axis_b.x);
        let (x_n, y_n, z_n): (i32, i32, i32) = (axis_r.x_n, axis_g.x_n, axis_b.x_n);
        let (rx, ry, rz) = (axis_r.w, axis_g.w, axis_b.w);

        // Every path starts at c000 and ends at c111.
        let c000 = rv.fetch(x, y, z);
        let c111 = rv.fetch(x_n, y_n, z_n);

        let weight_x = AvxVectorQ0_15::from(rx);
        let weight_y = AvxVectorQ0_15::from(ry);
        let weight_z = AvxVectorQ0_15::from(rz);

        let (d_x, d_y, d_z) = if rx >= ry {
            if ry >= rz {
                // rx >= ry >= rz
                let c100 = rv.fetch(x_n, y, z);
                let c110 = rv.fetch(x_n, y_n, z);
                (c100 - c000, c110 - c100, c111 - c110)
            } else if rx >= rz {
                // rx >= rz > ry
                let c100 = rv.fetch(x_n, y, z);
                let c101 = rv.fetch(x_n, y, z_n);
                (c100 - c000, c111 - c101, c101 - c100)
            } else {
                // rz > rx >= ry
                let c001 = rv.fetch(x, y, z_n);
                let c101 = rv.fetch(x_n, y, z_n);
                (c101 - c001, c111 - c101, c001 - c000)
            }
        } else if rx >= rz {
            // ry > rx >= rz
            let c010 = rv.fetch(x, y_n, z);
            let c110 = rv.fetch(x_n, y_n, z);
            (c110 - c010, c010 - c000, c111 - c110)
        } else if ry >= rz {
            // ry >= rz > rx
            let c010 = rv.fetch(x, y_n, z);
            let c011 = rv.fetch(x, y_n, z_n);
            (c111 - c011, c010 - c000, c011 - c010)
        } else {
            // rz > ry > rx
            let c001 = rv.fetch(x, y, z_n);
            let c011 = rv.fetch(x, y_n, z_n);
            (c111 - c011, c011 - c001, c001 - c000)
        };

        c000.mla(d_x, weight_x)
            .mla(d_y, weight_y)
            .mla(d_z, weight_z)
            .split()
    }
}
912
913impl<const GRID_SIZE: usize> TrilinearAvxQ0_15Double<GRID_SIZE> {
914    #[target_feature(enable = "avx2")]
915    unsafe fn interpolate(
916        &self,
917        in_r: usize,
918        in_g: usize,
919        in_b: usize,
920        lut: &[BarycentricWeight<i16>],
921        rv: impl Fetcher<AvxVectorQ0_15>,
922    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
923        let lut_r = unsafe { *lut.get_unchecked(in_r) };
924        let lut_g = unsafe { *lut.get_unchecked(in_g) };
925        let lut_b = unsafe { *lut.get_unchecked(in_b) };
926
927        let x: i32 = lut_r.x;
928        let y: i32 = lut_g.x;
929        let z: i32 = lut_b.x;
930
931        let x_n: i32 = lut_r.x_n;
932        let y_n: i32 = lut_g.x_n;
933        let z_n: i32 = lut_b.x_n;
934
935        let rx = lut_r.w;
936        let ry = lut_g.w;
937        let rz = lut_b.w;
938
939        const Q_MAX: i16 = ((1i32 << 15i32) - 1) as i16;
940
941        let q_max = AvxVectorQ0_15::from(Q_MAX);
942        let w0 = AvxVectorQ0_15::from(rx);
943        let w1 = AvxVectorQ0_15::from(ry);
944        let w2 = AvxVectorQ0_15::from(rz);
945        let dx = q_max - w0;
946        let dy = q_max - w1;
947        let dz = q_max - w2;
948
949        let c000 = rv.fetch(x, y, z);
950        let c100 = rv.fetch(x_n, y, z);
951        let c010 = rv.fetch(x, y_n, z);
952        let c110 = rv.fetch(x_n, y_n, z);
953        let c001 = rv.fetch(x, y, z_n);
954        let c101 = rv.fetch(x_n, y, z_n);
955        let c011 = rv.fetch(x, y_n, z_n);
956        let c111 = rv.fetch(x_n, y_n, z_n);
957
958        let c00 = (c000 * dx).mla(c100, w0);
959        let c10 = (c010 * dx).mla(c110, w0);
960        let c01 = (c001 * dx).mla(c101, w0);
961        let c11 = (c011 * dx).mla(c111, w0);
962
963        let c0 = (c00 * dy).mla(c10, w1);
964        let c1 = (c01 * dy).mla(c11, w1);
965
966        (c0 * dz).mla(c1, w2).split()
967    }
968}
969
970impl<const GRID_SIZE: usize> TrilinearAvxQ0_15<GRID_SIZE> {
971    #[target_feature(enable = "avx2")]
972    unsafe fn interpolate(
973        &self,
974        in_r: usize,
975        in_g: usize,
976        in_b: usize,
977        lut: &[BarycentricWeight<i16>],
978        r: impl Fetcher<AvxVectorQ0_15Sse>,
979    ) -> AvxVectorQ0_15Sse {
980        let lut_r = unsafe { *lut.get_unchecked(in_r) };
981        let lut_g = unsafe { *lut.get_unchecked(in_g) };
982        let lut_b = unsafe { *lut.get_unchecked(in_b) };
983
984        let x: i32 = lut_r.x;
985        let y: i32 = lut_g.x;
986        let z: i32 = lut_b.x;
987
988        let x_n: i32 = lut_r.x_n;
989        let y_n: i32 = lut_g.x_n;
990        let z_n: i32 = lut_b.x_n;
991
992        let dr = lut_r.w;
993        let dg = lut_g.w;
994        let db = lut_b.w;
995
996        const Q_MAX: i16 = ((1i32 << 15i32) - 1) as i16;
997
998        let q_max = AvxVectorQ0_15Sse::from(Q_MAX);
999        let q_max_avx = AvxVectorQ0_15::from(Q_MAX);
1000        let w0 = AvxVectorQ0_15::from(dr);
1001        let w1 = AvxVectorQ0_15::from(dg);
1002        let w2 = AvxVectorQ0_15Sse::from(db);
1003        let dx = q_max_avx - w0;
1004        let dy = q_max_avx - w1;
1005        let dz = q_max - w2;
1006
1007        let c000 = r.fetch(x, y, z);
1008        let c100 = r.fetch(x_n, y, z);
1009        let c010 = r.fetch(x, y_n, z);
1010        let c110 = r.fetch(x_n, y_n, z);
1011        let c001 = r.fetch(x, y, z_n);
1012        let c101 = r.fetch(x_n, y, z_n);
1013        let c011 = r.fetch(x, y_n, z_n);
1014        let c111 = r.fetch(x_n, y_n, z_n);
1015
1016        let x000 = AvxVectorQ0_15::from_sse(c000, c001);
1017        let x010 = AvxVectorQ0_15::from_sse(c010, c011);
1018        let x011 = AvxVectorQ0_15::from_sse(c100, c101);
1019        let x111 = AvxVectorQ0_15::from_sse(c110, c111);
1020
1021        let c00 = (x000 * dx).mla(x011, w0);
1022        let c10 = (x010 * dx).mla(x111, w0);
1023
1024        let c0 = (c00 * dy).mla(c10, w1);
1025
1026        let (c0, c1) = c0.split();
1027
1028        (c0 * dz).mla(c1, w2)
1029    }
1030}