moxcms/conversions/avx/
interpolator_q0_15.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::conversions::interpolator::BarycentricWeight;
30use crate::math::FusedMultiplyAdd;
31use num_traits::AsPrimitive;
32use std::arch::x86_64::*;
33use std::ops::{Add, Mul, Sub};
34
/// Four `i16` values stored with 8-byte alignment and C layout, so a single
/// entry is exactly 64 bits wide.
///
/// The fetchers in this file read one entry with a single 64-bit load
/// (`_mm_loadu_si64`).
#[repr(align(8), C)]
pub(crate) struct AvxAlignedI16(pub(crate) [i16; 4]);
37
/// Interpolator over a single table of [`AvxAlignedI16`] entries whose
/// `interpolate` method selects one of six corner paths from the ordering of
/// the three per-axis weights.
///
/// `GRID_SIZE` is the per-axis stride the fetchers use to flatten `(x, y, z)`.
/// Only compiled when the `options` feature is enabled.
#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxQ0_15<'a, const GRID_SIZE: usize> {
    /// Flattened table, addressed as `x * GRID_SIZE * GRID_SIZE + y * GRID_SIZE + z`.
    pub(crate) cube: &'a [AvxAlignedI16],
}
42
/// Interpolator over a single table of [`AvxAlignedI16`] entries whose
/// `interpolate` method chooses between three branches depending on which
/// per-axis weight is the smallest.
///
/// `GRID_SIZE` is the per-axis stride the fetchers use to flatten `(x, y, z)`.
/// Only compiled when the `options` feature is enabled.
#[cfg(feature = "options")]
pub(crate) struct PyramidalAvxQ0_15<'a, const GRID_SIZE: usize> {
    /// Flattened table, addressed as `x * GRID_SIZE * GRID_SIZE + y * GRID_SIZE + z`.
    pub(crate) cube: &'a [AvxAlignedI16],
}
47
/// Interpolator over a single table of [`AvxAlignedI16`] entries whose
/// `interpolate` method chooses between two branches by comparing the third
/// axis weight against the first.
///
/// `GRID_SIZE` is the per-axis stride the fetchers use to flatten `(x, y, z)`.
/// Only compiled when the `options` feature is enabled.
#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxQ0_15<'a, const GRID_SIZE: usize> {
    /// Flattened table, addressed as `x * GRID_SIZE * GRID_SIZE + y * GRID_SIZE + z`.
    pub(crate) cube: &'a [AvxAlignedI16],
}
52
/// Interpolator over a single table of [`AvxAlignedI16`] entries; always
/// compiled (not gated behind the `options` feature).
///
/// `GRID_SIZE` is the per-axis stride the fetchers use to flatten `(x, y, z)`.
pub(crate) struct TrilinearAvxQ0_15<'a, const GRID_SIZE: usize> {
    /// Flattened table, addressed as `x * GRID_SIZE * GRID_SIZE + y * GRID_SIZE + z`.
    pub(crate) cube: &'a [AvxAlignedI16],
}
56
/// Two-table variant of the prismatic interpolator: both tables are sampled
/// at the same coordinates and processed together in one 256-bit vector.
///
/// Only compiled when the `options` feature is enabled.
#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxQ0_15Double<'a, const GRID_SIZE: usize> {
    /// First table; its result is returned as the first tuple element.
    pub(crate) cube0: &'a [AvxAlignedI16],
    /// Second table; its result is returned as the second tuple element.
    pub(crate) cube1: &'a [AvxAlignedI16],
}
62
/// Two-table variant of the trilinear interpolator; always compiled (not
/// gated behind the `options` feature).
pub(crate) struct TrilinearAvxQ0_15Double<'a, const GRID_SIZE: usize> {
    /// First table (low 128-bit half when fetched as a pair).
    pub(crate) cube0: &'a [AvxAlignedI16],
    /// Second table (high 128-bit half when fetched as a pair).
    pub(crate) cube1: &'a [AvxAlignedI16],
}
67
/// Two-table variant of the pyramidal interpolator: both tables are sampled
/// at the same coordinates and processed together in one 256-bit vector.
///
/// Only compiled when the `options` feature is enabled.
#[cfg(feature = "options")]
pub(crate) struct PyramidAvxFmaQ0_15Double<'a, const GRID_SIZE: usize> {
    /// First table; its result is returned as the first tuple element.
    pub(crate) cube0: &'a [AvxAlignedI16],
    /// Second table; its result is returned as the second tuple element.
    pub(crate) cube1: &'a [AvxAlignedI16],
}
73
/// Two-table variant of the tetrahedral interpolator; corners of both tables
/// are fetched as one 256-bit vector.
///
/// Only compiled when the `options` feature is enabled.
#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxQ0_15Double<'a, const GRID_SIZE: usize> {
    /// First table (low 128-bit half of each fetched pair).
    pub(crate) cube0: &'a [AvxAlignedI16],
    /// Second table (high 128-bit half of each fetched pair).
    pub(crate) cube1: &'a [AvxAlignedI16],
}
79
/// Common interface of the interpolators that evaluate two tables at once.
pub(crate) trait AvxMdInterpolationQ0_15Double<'a, const GRID_SIZE: usize> {
    /// Creates the interpolator from two borrowed tables.
    fn new(table0: &'a [AvxAlignedI16], table1: &'a [AvxAlignedI16]) -> Self;
    /// Interpolates both tables for one input triple.
    ///
    /// `in_r`, `in_g` and `in_b` are converted to `usize` and used as indices
    /// into `lut`; each looked-up entry supplies the lower/upper grid index
    /// and the weight for one axis. Returns the result for the first table
    /// followed by the result for the second table.
    fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<i16>; BINS],
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse);
}
90
/// Common interface of the interpolators that evaluate a single table.
pub(crate) trait AvxMdInterpolationQ0_15<'a, const GRID_SIZE: usize> {
    /// Creates the interpolator from a borrowed table.
    fn new(table: &'a [AvxAlignedI16]) -> Self;
    /// Interpolates the table for one input triple.
    ///
    /// `in_r`, `in_g` and `in_b` are converted to `usize` and used as indices
    /// into `lut`; each looked-up entry supplies the lower/upper grid index
    /// and the weight for one axis.
    fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<i16>; BINS],
    ) -> AvxVectorQ0_15Sse;
}
101
/// Abstraction over "read the table entry at grid position `(x, y, z)`",
/// letting the interpolation routines stay generic over the vector type `T`.
trait Fetcher<T> {
    /// Returns the entry stored at grid coordinates `(x, y, z)`.
    fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}
105
/// Thin `repr(transparent)` wrapper around a 128-bit integer vector; the
/// operator impls in this file treat it as lanes of 16-bit integers.
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorQ0_15Sse {
    /// Raw 128-bit register value.
    pub(crate) v: __m128i,
}
111
/// Thin `repr(transparent)` wrapper around a 256-bit integer vector; the
/// operator impls in this file treat it as lanes of 16-bit integers. It is
/// used to carry two 128-bit [`AvxVectorQ0_15Sse`] values side by side.
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorQ0_15 {
    /// Raw 256-bit register value.
    pub(crate) v: __m256i,
}
117
118impl AvxVectorQ0_15 {
119    #[inline(always)]
120    pub(crate) fn from_sse(lo: AvxVectorQ0_15Sse, hi: AvxVectorQ0_15Sse) -> AvxVectorQ0_15 {
121        unsafe {
122            AvxVectorQ0_15 {
123                v: _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(lo.v), hi.v),
124            }
125        }
126    }
127
128    #[inline(always)]
129    pub(crate) fn split(self) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
130        unsafe {
131            (
132                AvxVectorQ0_15Sse {
133                    v: _mm256_castsi256_si128(self.v),
134                },
135                AvxVectorQ0_15Sse {
136                    v: _mm256_extracti128_si256::<1>(self.v),
137                },
138            )
139        }
140    }
141}
142
143impl From<i16> for AvxVectorQ0_15Sse {
144    #[inline(always)]
145    fn from(v: i16) -> Self {
146        AvxVectorQ0_15Sse {
147            v: unsafe { _mm_set1_epi16(v) },
148        }
149    }
150}
151
152impl From<i16> for AvxVectorQ0_15 {
153    #[inline(always)]
154    fn from(v: i16) -> Self {
155        AvxVectorQ0_15 {
156            v: unsafe { _mm256_set1_epi16(v) },
157        }
158    }
159}
160
161impl Sub<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
162    type Output = Self;
163    #[inline(always)]
164    fn sub(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
165        AvxVectorQ0_15Sse {
166            v: unsafe { _mm_sub_epi16(self.v, rhs.v) },
167        }
168    }
169}
170
171impl Sub<AvxVectorQ0_15> for AvxVectorQ0_15 {
172    type Output = Self;
173    #[inline(always)]
174    fn sub(self, rhs: AvxVectorQ0_15) -> Self::Output {
175        AvxVectorQ0_15 {
176            v: unsafe { _mm256_sub_epi16(self.v, rhs.v) },
177        }
178    }
179}
180
181impl Add<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
182    type Output = Self;
183    #[inline(always)]
184    fn add(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
185        AvxVectorQ0_15Sse {
186            v: unsafe { _mm_add_epi16(self.v, rhs.v) },
187        }
188    }
189}
190
191impl Mul<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
192    type Output = Self;
193    #[inline(always)]
194    fn mul(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
195        AvxVectorQ0_15Sse {
196            v: unsafe { _mm_mulhrs_epi16(self.v, rhs.v) },
197        }
198    }
199}
200
201impl Add<AvxVectorQ0_15> for AvxVectorQ0_15 {
202    type Output = Self;
203    #[inline(always)]
204    fn add(self, rhs: AvxVectorQ0_15) -> Self::Output {
205        AvxVectorQ0_15 {
206            v: unsafe { _mm256_add_epi16(self.v, rhs.v) },
207        }
208    }
209}
210
211impl Mul<AvxVectorQ0_15> for AvxVectorQ0_15 {
212    type Output = Self;
213    #[inline(always)]
214    fn mul(self, rhs: AvxVectorQ0_15) -> Self::Output {
215        AvxVectorQ0_15 {
216            v: unsafe { _mm256_mulhrs_epi16(self.v, rhs.v) },
217        }
218    }
219}
220
221impl FusedMultiplyAdd<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
222    #[inline(always)]
223    fn mla(&self, b: AvxVectorQ0_15Sse, c: AvxVectorQ0_15Sse) -> AvxVectorQ0_15Sse {
224        AvxVectorQ0_15Sse {
225            v: unsafe { _mm_add_epi16(_mm_mulhrs_epi16(b.v, c.v), self.v) },
226        }
227    }
228}
229
230impl FusedMultiplyAdd<AvxVectorQ0_15> for AvxVectorQ0_15 {
231    #[inline(always)]
232    fn mla(&self, b: AvxVectorQ0_15, c: AvxVectorQ0_15) -> AvxVectorQ0_15 {
233        AvxVectorQ0_15 {
234            v: unsafe { _mm256_add_epi16(_mm256_mulhrs_epi16(b.v, c.v), self.v) },
235        }
236    }
237}
238
/// [`Fetcher`] over a single table that yields one 128-bit vector per grid
/// position. Despite the name it is the fetcher handed to every single-table
/// interpolator generated by `define_interp_avx!`.
struct TetrahedralAvxSseFetchVector<'a, const GRID_SIZE: usize> {
    /// Flattened table, addressed as `x * GRID_SIZE * GRID_SIZE + y * GRID_SIZE + z`.
    cube: &'a [AvxAlignedI16],
}
242
/// [`Fetcher`] over two tables that yields one 256-bit vector per grid
/// position: the `cube0` entry in the low half and the `cube1` entry in the
/// high half.
struct TetrahedralAvxFetchVector<'a, const GRID_SIZE: usize> {
    /// Table read into the low 128 bits.
    cube0: &'a [AvxAlignedI16],
    /// Table read into the high 128 bits.
    cube1: &'a [AvxAlignedI16],
}
247
248impl<const GRID_SIZE: usize> Fetcher<AvxVectorQ0_15> for TetrahedralAvxFetchVector<'_, GRID_SIZE> {
249    #[inline(always)]
250    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorQ0_15 {
251        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
252            + y as u32 * GRID_SIZE as u32
253            + z as u32) as usize;
254        let jx0 = unsafe { self.cube0.get_unchecked(offset..) };
255        let jx1 = unsafe { self.cube1.get_unchecked(offset..) };
256        AvxVectorQ0_15 {
257            v: unsafe {
258                _mm256_inserti128_si256::<1>(
259                    _mm256_castsi128_si256(_mm_loadu_si64(jx0.as_ptr() as *const _)),
260                    _mm_loadu_si64(jx1.as_ptr() as *const _),
261                )
262            },
263        }
264    }
265}
266
267impl<const GRID_SIZE: usize> Fetcher<AvxVectorQ0_15Sse>
268    for TetrahedralAvxSseFetchVector<'_, GRID_SIZE>
269{
270    #[inline(always)]
271    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorQ0_15Sse {
272        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
273            + y as u32 * GRID_SIZE as u32
274            + z as u32) as usize;
275        let jx = unsafe { self.cube.get_unchecked(offset..) };
276        AvxVectorQ0_15Sse {
277            v: unsafe { _mm_loadu_si64(jx.as_ptr() as *const _) },
278        }
279    }
280}
281
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxQ0_15<'_, GRID_SIZE> {
    /// Interpolates one sample from the entries supplied by `r`.
    ///
    /// `in_r`, `in_g`, `in_b` index `lut`; each entry gives the lower grid
    /// index (`x`), the upper grid index (`x_n`) and the weight (`w`) for one
    /// axis. The ordering of the three weights selects which corner
    /// differences are used, and the result is
    /// `base + d_x * rx + d_y * ry + d_z * rz`.
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<i16>; BINS],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        let bin_r = lut[in_r.as_()];
        let bin_g = lut[in_g.as_()];
        let bin_b = lut[in_b.as_()];

        let (x, y, z): (i32, i32, i32) = (bin_r.x, bin_g.x, bin_b.x);
        let (x_n, y_n, z_n): (i32, i32, i32) = (bin_r.x_n, bin_g.x_n, bin_b.x_n);
        let (rx, ry, rz) = (bin_r.w, bin_g.w, bin_b.w);

        let base = r.fetch(x, y, z);

        // Each arm yields the differences paired with (rx, ry, rz) in that order.
        let (d_x, d_y, d_z) = if rx >= ry {
            if ry >= rz {
                // rx >= ry >= rz
                (
                    r.fetch(x_n, y, z) - base,
                    r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z),
                    r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z),
                )
            } else if rx >= rz {
                // rx >= rz > ry
                (
                    r.fetch(x_n, y, z) - base,
                    r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n),
                    r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z),
                )
            } else {
                // rz > rx >= ry
                (
                    r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n),
                    r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n),
                    r.fetch(x, y, z_n) - base,
                )
            }
        } else if rx >= rz {
            // ry > rx >= rz
            (
                r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z),
                r.fetch(x, y_n, z) - base,
                r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z),
            )
        } else if ry >= rz {
            // ry >= rz > rx
            (
                r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n),
                r.fetch(x, y_n, z) - base,
                r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z),
            )
        } else {
            // rz > ry > rx
            (
                r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n),
                r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n),
                r.fetch(x, y, z_n) - base,
            )
        };

        base.mla(d_x, AvxVectorQ0_15Sse::from(rx))
            .mla(d_y, AvxVectorQ0_15Sse::from(ry))
            .mla(d_z, AvxVectorQ0_15Sse::from(rz))
    }
}
352
/// Implements [`AvxMdInterpolationQ0_15`] for a single-table interpolator type.
///
/// The target type must have a `cube` field and an inherent `interpolate`
/// method taking `(in_r, in_g, in_b, lut, fetcher)`.
macro_rules! define_interp_avx {
    ($target:ident) => {
        impl<'a, const GRID_SIZE: usize> AvxMdInterpolationQ0_15<'a, GRID_SIZE>
            for $target<'a, GRID_SIZE>
        {
            #[inline(always)]
            fn new(table: &'a [AvxAlignedI16]) -> Self {
                Self { cube: table }
            }

            #[inline(always)]
            fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
                &self,
                in_r: U,
                in_g: U,
                in_b: U,
                lut: &[BarycentricWeight<i16>; BINS],
            ) -> AvxVectorQ0_15Sse {
                let fetcher = TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: self.cube };
                self.interpolate(in_r, in_g, in_b, lut, fetcher)
            }
        }
    };
}
382
/// Implements [`AvxMdInterpolationQ0_15Double`] for a two-table interpolator
/// whose inherent `interpolate` method takes one 128-bit fetcher per table.
///
/// The target type must have `cube0` and `cube1` fields.
#[cfg(feature = "options")]
macro_rules! define_interp_avx_d {
    ($target:ident) => {
        impl<'a, const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double<'a, GRID_SIZE>
            for $target<'a, GRID_SIZE>
        {
            #[inline(always)]
            fn new(table0: &'a [AvxAlignedI16], table1: &'a [AvxAlignedI16]) -> Self {
                Self {
                    cube0: table0,
                    cube1: table1,
                }
            }

            #[inline(always)]
            fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
                &self,
                in_r: U,
                in_g: U,
                in_b: U,
                lut: &[BarycentricWeight<i16>; BINS],
            ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
                let first = TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: self.cube0 };
                let second = TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: self.cube1 };
                self.interpolate(in_r, in_g, in_b, lut, first, second)
            }
        }
    };
}
417
// Trait implementations generated by the macros above. Only the trilinear
// single-table interpolator is unconditional; the others exist only when the
// `options` feature is enabled.
#[cfg(feature = "options")]
define_interp_avx!(TetrahedralAvxQ0_15);
#[cfg(feature = "options")]
define_interp_avx!(PyramidalAvxQ0_15);
#[cfg(feature = "options")]
define_interp_avx!(PrismaticAvxQ0_15);
define_interp_avx!(TrilinearAvxQ0_15);
// Two-table interpolators that take one 128-bit fetcher per table.
#[cfg(feature = "options")]
define_interp_avx_d!(PrismaticAvxQ0_15Double);
#[cfg(feature = "options")]
define_interp_avx_d!(PyramidAvxFmaQ0_15Double);
429
#[cfg(feature = "options")]
impl<'a, const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double<'a, GRID_SIZE>
    for TetrahedralAvxQ0_15Double<'a, GRID_SIZE>
{
    /// Stores the two borrowed tables.
    #[inline(always)]
    fn new(table0: &'a [AvxAlignedI16], table1: &'a [AvxAlignedI16]) -> Self {
        Self {
            cube0: table0,
            cube1: table1,
        }
    }

    /// Delegates to the inherent `interpolate`, handing it a fetcher that
    /// reads both tables into one 256-bit vector.
    #[inline(always)]
    fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<i16>; BINS],
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let paired = TetrahedralAvxFetchVector::<GRID_SIZE> {
            cube0: self.cube0,
            cube1: self.cube1,
        };
        self.interpolate(in_r, in_g, in_b, lut, paired)
    }
}
462
463impl<'a, const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double<'a, GRID_SIZE>
464    for TrilinearAvxQ0_15Double<'a, GRID_SIZE>
465{
466    #[inline(always)]
467    fn new(table0: &'a [AvxAlignedI16], table1: &'a [AvxAlignedI16]) -> Self {
468        Self {
469            cube0: table0,
470            cube1: table1,
471        }
472    }
473
474    #[inline(always)]
475    fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
476        &self,
477        in_r: U,
478        in_g: U,
479        in_b: U,
480        lut: &[BarycentricWeight<i16>; BINS],
481    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
482        self.interpolate(
483            in_r,
484            in_g,
485            in_b,
486            lut,
487            TetrahedralAvxFetchVector::<GRID_SIZE> {
488                cube0: self.cube0,
489                cube1: self.cube1,
490            },
491        )
492    }
493}
494
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidalAvxQ0_15<'_, GRID_SIZE> {
    /// Interpolates one sample from the entries supplied by `r`.
    ///
    /// `in_r`, `in_g`, `in_b` index `lut`; each entry gives the lower grid
    /// index (`x`), the upper grid index (`x_n`) and the weight (`w`) for one
    /// axis. One of three branches is taken depending on which weight is
    /// strictly the smallest; the result is
    /// `c000 + d_b * db + d_r * dr + d_g * dg + d_cross * w_cross`.
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<i16>; BINS],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        let bin_r = lut[in_r.as_()];
        let bin_g = lut[in_g.as_()];
        let bin_b = lut[in_b.as_()];

        let (x, y, z): (i32, i32, i32) = (bin_r.x, bin_g.x, bin_b.x);
        let (x_n, y_n, z_n): (i32, i32, i32) = (bin_r.x_n, bin_g.x_n, bin_b.x_n);
        let (dr, dg, db) = (bin_r.w, bin_g.w, bin_b.w);

        // Corner naming: cXYZ, where a 1 digit means the `_n` index on that axis.
        let c000 = r.fetch(x, y, z);

        let w_b = AvxVectorQ0_15Sse::from(db);
        let w_r = AvxVectorQ0_15Sse::from(dr);
        let w_g = AvxVectorQ0_15Sse::from(dg);

        let (d_b, d_r, d_g, d_cross, w_cross) = if dr > db && dg > db {
            let c111 = r.fetch(x_n, y_n, z_n);
            let c110 = r.fetch(x_n, y_n, z);
            let c100 = r.fetch(x_n, y, z);
            let c010 = r.fetch(x, y_n, z);
            (
                c111 - c110,
                c100 - c000,
                c010 - c000,
                c000 - c010 - c100 + c110,
                w_r * w_g,
            )
        } else if db > dr && dg > dr {
            let c001 = r.fetch(x, y, z_n);
            let c111 = r.fetch(x_n, y_n, z_n);
            let c011 = r.fetch(x, y_n, z_n);
            let c010 = r.fetch(x, y_n, z);
            (
                c001 - c000,
                c111 - c011,
                c010 - c000,
                c000 - c010 - c001 + c011,
                w_g * w_b,
            )
        } else {
            let c001 = r.fetch(x, y, z_n);
            let c100 = r.fetch(x_n, y, z);
            let c101 = r.fetch(x_n, y, z_n);
            let c111 = r.fetch(x_n, y_n, z_n);
            (
                c001 - c000,
                c100 - c000,
                c111 - c101,
                c000 - c100 - c001 + c101,
                w_b * w_r,
            )
        };

        c000.mla(d_b, w_b)
            .mla(d_r, w_r)
            .mla(d_g, w_g)
            .mla(d_cross, w_cross)
    }
}
581
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxQ0_15<'_, GRID_SIZE> {
    /// Interpolates one sample from the entries supplied by `r`.
    ///
    /// `in_r`, `in_g`, `in_b` index `lut`; each entry gives the lower grid
    /// index (`x`), the upper grid index (`x_n`) and the weight (`w`) for one
    /// axis. The branch is chosen by `db > dr`; the result is
    /// `c000 + d_b * db + d_r * dr + d_g * dg + d_gb * (dg * db) + d_rg * (dr * dg)`.
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<i16>; BINS],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        let bin_r = lut[in_r.as_()];
        let bin_g = lut[in_g.as_()];
        let bin_b = lut[in_b.as_()];

        let (x, y, z): (i32, i32, i32) = (bin_r.x, bin_g.x, bin_b.x);
        let (x_n, y_n, z_n): (i32, i32, i32) = (bin_r.x_n, bin_g.x_n, bin_b.x_n);
        let (dr, dg, db) = (bin_r.w, bin_g.w, bin_b.w);

        // Corner naming: cXYZ, where a 1 digit means the `_n` index on that axis.
        let c000 = r.fetch(x, y, z);

        let w_b = AvxVectorQ0_15Sse::from(db);
        let w_r = AvxVectorQ0_15Sse::from(dr);
        let w_g = AvxVectorQ0_15Sse::from(dg);
        let w_gb = w_g * w_b;
        let w_rg = w_r * w_g;

        let (d_b, d_r, d_g, d_gb, d_rg) = if db > dr {
            let c001 = r.fetch(x, y, z_n);
            let c101 = r.fetch(x_n, y, z_n);
            let c010 = r.fetch(x, y_n, z);
            let c011 = r.fetch(x, y_n, z_n);
            let c111 = r.fetch(x_n, y_n, z_n);
            (
                c001 - c000,
                c101 - c001,
                c010 - c000,
                c000 - c010 - c001 + c011,
                c001 - c011 - c101 + c111,
            )
        } else {
            let c100 = r.fetch(x_n, y, z);
            let c101 = r.fetch(x_n, y, z_n);
            let c010 = r.fetch(x, y_n, z);
            let c110 = r.fetch(x_n, y_n, z);
            let c111 = r.fetch(x_n, y_n, z_n);
            (
                c101 - c100,
                c100 - c000,
                c010 - c000,
                c100 - c110 - c101 + c111,
                c000 - c010 - c100 + c110,
            )
        };

        c000.mla(d_b, w_b)
            .mla(d_r, w_r)
            .mla(d_g, w_g)
            .mla(d_gb, w_gb)
            .mla(d_rg, w_rg)
    }
}
656
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxQ0_15Double<'_, GRID_SIZE> {
    /// Interpolates one sample from two tables at once.
    ///
    /// `in_r`, `in_g`, `in_b` index `lut`; each entry gives the lower grid
    /// index (`x`), the upper grid index (`x_n`) and the weight (`w`) for one
    /// axis. Every corner is read from `r0` (low 128 bits) and `r1` (high
    /// 128 bits), the arithmetic runs on the combined 256-bit vector, and the
    /// result is split back into `(table 0 result, table 1 result)`.
    ///
    /// The branch is chosen by `db > dr`; the result is
    /// `c000 + d_b * db + d_r * dr + d_g * dg + d_gb * (dg * db) + d_rg * (dr * dg)`.
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<i16>; BINS],
        r0: impl Fetcher<AvxVectorQ0_15Sse>,
        r1: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        // Reads the same grid position from both tables. Routing every corner
        // through this helper guarantees the high half always comes from `r1`;
        // previously the base corner's high half was read from `r0`, so the
        // second table's output was computed from the first table's base value.
        let corner = |cx: i32, cy: i32, cz: i32| {
            AvxVectorQ0_15::from_sse(r0.fetch(cx, cy, cz), r1.fetch(cx, cy, cz))
        };

        let w0 = AvxVectorQ0_15::from(db);
        let w1 = AvxVectorQ0_15::from(dr);
        let w2 = AvxVectorQ0_15::from(dg);
        let w3 = AvxVectorQ0_15::from(dg) * AvxVectorQ0_15::from(db);
        let w4 = AvxVectorQ0_15::from(dr) * AvxVectorQ0_15::from(dg);

        let c0 = corner(x, y, z);

        if db > dr {
            let x0 = corner(x, y, z_n);
            let x1 = corner(x_n, y, z_n);
            let x2 = corner(x, y_n, z);
            let x3 = corner(x, y_n, z_n);
            let x4 = corner(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        } else {
            let x0 = corner(x_n, y, z);
            let x1 = corner(x_n, y, z_n);
            let x2 = corner(x, y_n, z);
            let x3 = corner(x_n, y_n, z);
            let x4 = corner(x_n, y_n, z_n);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        }
    }
}
759
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidAvxFmaQ0_15Double<'_, GRID_SIZE> {
    /// Interpolates one sample from two tables at once.
    ///
    /// `in_r`, `in_g`, `in_b` index `lut`; each entry gives the lower grid
    /// index (`x`), the upper grid index (`x_n`) and the weight (`w`) for one
    /// axis. Every corner is read from `r0` (low 128 bits) and `r1` (high
    /// 128 bits), the arithmetic runs on the combined 256-bit vector, and the
    /// result is split back into `(table 0 result, table 1 result)`.
    ///
    /// One of three branches is taken depending on which weight is strictly
    /// the smallest; the result is
    /// `c000 + d_b * db + d_r * dr + d_g * dg + d_cross * w_cross`.
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<i16>; BINS],
        r0: impl Fetcher<AvxVectorQ0_15Sse>,
        r1: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let bin_r = lut[in_r.as_()];
        let bin_g = lut[in_g.as_()];
        let bin_b = lut[in_b.as_()];

        let (x, y, z): (i32, i32, i32) = (bin_r.x, bin_g.x, bin_b.x);
        let (x_n, y_n, z_n): (i32, i32, i32) = (bin_r.x_n, bin_g.x_n, bin_b.x_n);
        let (dr, dg, db) = (bin_r.w, bin_g.w, bin_b.w);

        // Reads the same grid position from both tables into one 256-bit vector.
        let corner = |cx: i32, cy: i32, cz: i32| {
            AvxVectorQ0_15::from_sse(r0.fetch(cx, cy, cz), r1.fetch(cx, cy, cz))
        };

        // Corner naming: cXYZ, where a 1 digit means the `_n` index on that axis.
        let c000 = corner(x, y, z);

        let w_b = AvxVectorQ0_15::from(db);
        let w_r = AvxVectorQ0_15::from(dr);
        let w_g = AvxVectorQ0_15::from(dg);

        let (d_b, d_r, d_g, d_cross, w_cross) = if dr > db && dg > db {
            let c111 = corner(x_n, y_n, z_n);
            let c110 = corner(x_n, y_n, z);
            let c100 = corner(x_n, y, z);
            let c010 = corner(x, y_n, z);
            (
                c111 - c110,
                c100 - c000,
                c010 - c000,
                c000 - c010 - c100 + c110,
                w_r * w_g,
            )
        } else if db > dr && dg > dr {
            let c001 = corner(x, y, z_n);
            let c111 = corner(x_n, y_n, z_n);
            let c011 = corner(x, y_n, z_n);
            let c010 = corner(x, y_n, z);
            (
                c001 - c000,
                c111 - c011,
                c010 - c000,
                c000 - c010 - c001 + c011,
                w_g * w_b,
            )
        } else {
            let c001 = corner(x, y, z_n);
            let c100 = corner(x_n, y, z);
            let c101 = corner(x_n, y, z_n);
            let c111 = corner(x_n, y_n, z_n);
            (
                c001 - c000,
                c100 - c000,
                c111 - c101,
                c000 - c100 - c001 + c101,
                w_b * w_r,
            )
        };

        c000.mla(d_b, w_b)
            .mla(d_r, w_r)
            .mla(d_g, w_g)
            .mla(d_cross, w_cross)
            .split()
    }
}
881
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxQ0_15Double<'_, GRID_SIZE> {
    /// Interpolates one sample from two tables at once.
    ///
    /// `in_r`, `in_g`, `in_b` index `lut`; each entry gives the lower grid
    /// index (`x`), the upper grid index (`x_n`) and the weight (`w`) for one
    /// axis. `rv` returns each corner as a 256-bit vector; the ordering of the
    /// three weights selects which corner differences are used, the result is
    /// `base + d_x * rx + d_y * ry + d_z * rz`, and it is split into its two
    /// 128-bit halves on return.
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<i16>; BINS],
        rv: impl Fetcher<AvxVectorQ0_15>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let bin_r = lut[in_r.as_()];
        let bin_g = lut[in_g.as_()];
        let bin_b = lut[in_b.as_()];

        let (x, y, z): (i32, i32, i32) = (bin_r.x, bin_g.x, bin_b.x);
        let (x_n, y_n, z_n): (i32, i32, i32) = (bin_r.x_n, bin_g.x_n, bin_b.x_n);
        let (rx, ry, rz) = (bin_r.w, bin_g.w, bin_b.w);

        let base = rv.fetch(x, y, z);

        let w_x = AvxVectorQ0_15::from(rx);
        let w_y = AvxVectorQ0_15::from(ry);
        let w_z = AvxVectorQ0_15::from(rz);

        // Each arm yields the differences paired with (rx, ry, rz) in that order.
        let (d_x, d_y, d_z) = if rx >= ry {
            if ry >= rz {
                // rx >= ry >= rz
                (
                    rv.fetch(x_n, y, z) - base,
                    rv.fetch(x_n, y_n, z) - rv.fetch(x_n, y, z),
                    rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z),
                )
            } else if rx >= rz {
                // rx >= rz > ry
                (
                    rv.fetch(x_n, y, z) - base,
                    rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n),
                    rv.fetch(x_n, y, z_n) - rv.fetch(x_n, y, z),
                )
            } else {
                // rz > rx >= ry
                (
                    rv.fetch(x_n, y, z_n) - rv.fetch(x, y, z_n),
                    rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n),
                    rv.fetch(x, y, z_n) - base,
                )
            }
        } else if rx >= rz {
            // ry > rx >= rz
            (
                rv.fetch(x_n, y_n, z) - rv.fetch(x, y_n, z),
                rv.fetch(x, y_n, z) - base,
                rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z),
            )
        } else if ry >= rz {
            // ry >= rz > rx
            (
                rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n),
                rv.fetch(x, y_n, z) - base,
                rv.fetch(x, y_n, z_n) - rv.fetch(x, y_n, z),
            )
        } else {
            // rz > ry > rx
            (
                rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n),
                rv.fetch(x, y_n, z_n) - rv.fetch(x, y, z_n),
                rv.fetch(x, y, z_n) - base,
            )
        };

        base.mla(d_x, w_x).mla(d_y, w_y).mla(d_z, w_z).split()
    }
}
956
957impl<const GRID_SIZE: usize> TrilinearAvxQ0_15Double<'_, GRID_SIZE> {
958    #[inline(always)]
959    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
960        &self,
961        in_r: U,
962        in_g: U,
963        in_b: U,
964        lut: &[BarycentricWeight<i16>; BINS],
965        rv: impl Fetcher<AvxVectorQ0_15>,
966    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
967        let lut_r = lut[in_r.as_()];
968        let lut_g = lut[in_g.as_()];
969        let lut_b = lut[in_b.as_()];
970
971        let x: i32 = lut_r.x;
972        let y: i32 = lut_g.x;
973        let z: i32 = lut_b.x;
974
975        let x_n: i32 = lut_r.x_n;
976        let y_n: i32 = lut_g.x_n;
977        let z_n: i32 = lut_b.x_n;
978
979        let rx = lut_r.w;
980        let ry = lut_g.w;
981        let rz = lut_b.w;
982
983        const Q_MAX: i16 = ((1i32 << 15i32) - 1) as i16;
984
985        let q_max = AvxVectorQ0_15::from(Q_MAX);
986        let w0 = AvxVectorQ0_15::from(rx);
987        let w1 = AvxVectorQ0_15::from(ry);
988        let w2 = AvxVectorQ0_15::from(rz);
989        let dx = q_max - w0;
990        let dy = q_max - w1;
991        let dz = q_max - w2;
992
993        let c000 = rv.fetch(x, y, z);
994        let c100 = rv.fetch(x_n, y, z);
995        let c010 = rv.fetch(x, y_n, z);
996        let c110 = rv.fetch(x_n, y_n, z);
997        let c001 = rv.fetch(x, y, z_n);
998        let c101 = rv.fetch(x_n, y, z_n);
999        let c011 = rv.fetch(x, y_n, z_n);
1000        let c111 = rv.fetch(x_n, y_n, z_n);
1001
1002        let c00 = (c000 * dx).mla(c100, w0);
1003        let c10 = (c010 * dx).mla(c110, w0);
1004        let c01 = (c001 * dx).mla(c101, w0);
1005        let c11 = (c011 * dx).mla(c111, w0);
1006
1007        let c0 = (c00 * dy).mla(c10, w1);
1008        let c1 = (c01 * dy).mla(c11, w1);
1009
1010        (c0 * dz).mla(c1, w2).split()
1011    }
1012}
1013
1014impl<const GRID_SIZE: usize> TrilinearAvxQ0_15<'_, GRID_SIZE> {
1015    #[inline(always)]
1016    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
1017        &self,
1018        in_r: U,
1019        in_g: U,
1020        in_b: U,
1021        lut: &[BarycentricWeight<i16>; BINS],
1022        r: impl Fetcher<AvxVectorQ0_15Sse>,
1023    ) -> AvxVectorQ0_15Sse {
1024        let lut_r = lut[in_r.as_()];
1025        let lut_g = lut[in_g.as_()];
1026        let lut_b = lut[in_b.as_()];
1027
1028        let x: i32 = lut_r.x;
1029        let y: i32 = lut_g.x;
1030        let z: i32 = lut_b.x;
1031
1032        let x_n: i32 = lut_r.x_n;
1033        let y_n: i32 = lut_g.x_n;
1034        let z_n: i32 = lut_b.x_n;
1035
1036        let dr = lut_r.w;
1037        let dg = lut_g.w;
1038        let db = lut_b.w;
1039
1040        const Q_MAX: i16 = ((1i32 << 15i32) - 1) as i16;
1041
1042        let q_max = AvxVectorQ0_15Sse::from(Q_MAX);
1043        let q_max_avx = AvxVectorQ0_15::from(Q_MAX);
1044        let w0 = AvxVectorQ0_15::from(dr);
1045        let w1 = AvxVectorQ0_15::from(dg);
1046        let w2 = AvxVectorQ0_15Sse::from(db);
1047        let dx = q_max_avx - w0;
1048        let dy = q_max_avx - w1;
1049        let dz = q_max - w2;
1050
1051        let c000 = r.fetch(x, y, z);
1052        let c100 = r.fetch(x_n, y, z);
1053        let c010 = r.fetch(x, y_n, z);
1054        let c110 = r.fetch(x_n, y_n, z);
1055        let c001 = r.fetch(x, y, z_n);
1056        let c101 = r.fetch(x_n, y, z_n);
1057        let c011 = r.fetch(x, y_n, z_n);
1058        let c111 = r.fetch(x_n, y_n, z_n);
1059
1060        let x000 = AvxVectorQ0_15::from_sse(c000, c001);
1061        let x010 = AvxVectorQ0_15::from_sse(c010, c011);
1062        let x011 = AvxVectorQ0_15::from_sse(c100, c101);
1063        let x111 = AvxVectorQ0_15::from_sse(c110, c111);
1064
1065        let c00 = (x000 * dx).mla(x011, w0);
1066        let c10 = (x010 * dx).mla(x111, w0);
1067
1068        let c0 = (c00 * dy).mla(c10, w1);
1069
1070        let (c0, c1) = c0.split();
1071
1072        (c0 * dz).mla(c1, w2)
1073    }
1074}