moxcms/conversions/avx/hypercube.rs

/*
 * // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
 * //
 * // Redistribution and use in source and binary forms, with or without modification,
 * // are permitted provided that the following conditions are met:
 * //
 * // 1.  Redistributions of source code must retain the above copyright notice, this
 * // list of conditions and the following disclaimer.
 * //
 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
 * // this list of conditions and the following disclaimer in the documentation
 * // and/or other materials provided with the distribution.
 * //
 * // 3.  Neither the name of the copyright holder nor the names of its
 * // contributors may be used to endorse or promote products derived from
 * // this software without specific prior written permission.
 * //
 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
use crate::conversions::avx::interpolator::AvxVectorSse;
use crate::math::{FusedMultiplyAdd, FusedMultiplyNegAdd};
use crate::nd_array::lerp;
use std::arch::x86_64::*;
use std::ops::{Add, Mul, Sub};

/// 4D CLUT helper.
///
/// Represents a hypercube: a 4D lattice of grid nodes over which the
/// interpolation methods below operate.
pub(crate) struct HypercubeAvx<'a> {
    array: &'a [f32],
    x_stride: u32,
    y_stride: u32,
    z_stride: u32,
    grid_size: [u8; 4],
}

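// Layout note (stated here as an assumption, matching `Fetcher4` below and
// the stride construction in `new`): the table is flat with `w` innermost,
// so node (x, y, z, w) starts at
// `(x * x_stride + y * y_stride + z * z_stride + w) * components` in `array`.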
/// Fetches the CLUT node at integer lattice coordinates `(x, y, z, w)`.
trait Fetcher4<T> {
    fn fetch(&self, x: i32, y: i32, z: i32, w: i32) -> T;
}

/// Fetches 3-component (RGB-like) nodes from a flat CLUT as an [`AvxVectorSse`].
struct Fetch4Vec3<'a> {
    array: &'a [f32],
    x_stride: u32,
    y_stride: u32,
    z_stride: u32,
}

impl Fetcher4<AvxVectorSse> for Fetch4Vec3<'_> {
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32, w: i32) -> AvxVectorSse {
        let start = (x as u32 * self.x_stride
            + y as u32 * self.y_stride
            + z as u32 * self.z_stride
            + w as u32) as usize
            * 3;
        unsafe {
            let k = self.array.get_unchecked(start..);
            // Load the first two f32 lanes with one 64-bit load (`movq`
            // zero-extends, so lane 3 stays zero), then insert the third
            // component into lane 2.
            let lo = _mm_loadu_si64(k.as_ptr() as *const _);
            let hi = _mm_insert_epi32::<2>(
                lo,
                k.get_unchecked(2..).as_ptr().read_unaligned().to_bits() as i32,
            );
            AvxVectorSse {
                v: _mm_castsi128_ps(hi),
            }
        }
    }
}

impl<'a> HypercubeAvx<'a> {
    pub(crate) fn new(arr: &'a [f32], grid: [u8; 4], components: usize) -> Self {
        // Safety precondition: the array length must exactly equal the full
        // grid size times the component count. The unchecked indexing in
        // `Fetcher4::fetch` relies on this invariant holding everywhere.
        assert_eq!(
            grid[0] as usize * grid[1] as usize * grid[2] as usize * grid[3] as usize * components,
            arr.len()
        );
        // Strides for a row-major [x][y][z][w] layout with `w` innermost
        // (stride 1): each stride is the product of the extents of all
        // faster-varying dimensions.
        let z_stride = grid[3] as u32;
        let y_stride = z_stride * grid[2] as u32;
        let x_stride = y_stride * grid[1] as u32;
        HypercubeAvx {
            array: arr,
            x_stride,
            y_stride,
            z_stride,
            grid_size: grid,
        }
    }

    /// Quadrilinear (4D multilinear) interpolation.
    #[inline(always)]
    fn quadlinear<
        T: From<f32>
            + Add<T, Output = T>
            + Mul<T, Output = T>
            + FusedMultiplyAdd<T>
            + Sub<T, Output = T>
            + Copy
            + FusedMultiplyNegAdd<T>,
    >(
        &self,
        lin_x: f32,
        lin_y: f32,
        lin_z: f32,
        lin_w: f32,
        r: impl Fetcher4<T>,
    ) -> T {
        let lin_x = lin_x.max(0.0).min(1.0);
        let lin_y = lin_y.max(0.0).min(1.0);
        let lin_z = lin_z.max(0.0).min(1.0);
        let lin_w = lin_w.max(0.0).min(1.0);

        let scale_x = (self.grid_size[0] as i32 - 1) as f32;
        let scale_y = (self.grid_size[1] as i32 - 1) as f32;
        let scale_z = (self.grid_size[2] as i32 - 1) as f32;
        let scale_w = (self.grid_size[3] as i32 - 1) as f32;

        let x = (lin_x * scale_x).floor() as i32;
        let y = (lin_y * scale_y).floor() as i32;
        let z = (lin_z * scale_z).floor() as i32;
        let w = (lin_w * scale_w).floor() as i32;

        let x_n = (lin_x * scale_x).ceil() as i32;
        let y_n = (lin_y * scale_y).ceil() as i32;
        let z_n = (lin_z * scale_z).ceil() as i32;
        let w_n = (lin_w * scale_w).ceil() as i32;

        // Fractional position inside the cell along each axis.
        let x_d = T::from(lin_x * scale_x - x as f32);
        let y_d = T::from(lin_y * scale_y - y as f32);
        let z_d = T::from(lin_z * scale_z - z as f32);
        let w_d = T::from(lin_w * scale_w - w as f32);

        // Trilinear interpolation on the `w` face...
        let r_x1 = lerp(r.fetch(x, y, z, w), r.fetch(x_n, y, z, w), x_d);
        let r_x2 = lerp(r.fetch(x, y_n, z, w), r.fetch(x_n, y_n, z, w), x_d);
        let r_y1 = lerp(r_x1, r_x2, y_d);
        let r_x3 = lerp(r.fetch(x, y, z_n, w), r.fetch(x_n, y, z_n, w), x_d);
        let r_x4 = lerp(r.fetch(x, y_n, z_n, w), r.fetch(x_n, y_n, z_n, w), x_d);
        let r_y2 = lerp(r_x3, r_x4, y_d);
        let r_z1 = lerp(r_y1, r_y2, z_d);

        // ...and on the `w_n` face, then blend the two faces along `w`.
        let r_x1 = lerp(r.fetch(x, y, z, w_n), r.fetch(x_n, y, z, w_n), x_d);
        let r_x2 = lerp(r.fetch(x, y_n, z, w_n), r.fetch(x_n, y_n, z, w_n), x_d);
        let r_y1 = lerp(r_x1, r_x2, y_d);
        let r_x3 = lerp(r.fetch(x, y, z_n, w_n), r.fetch(x_n, y, z_n, w_n), x_d);
        let r_x4 = lerp(r.fetch(x, y_n, z_n, w_n), r.fetch(x_n, y_n, z_n, w_n), x_d);
        let r_y2 = lerp(r_x3, r_x4, y_d);
        let r_z2 = lerp(r_y1, r_y2, z_d);
        lerp(r_z1, r_z2, w_d)
    }

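    // The reduction in `quadlinear` above, in equation form: with per-axis
    // cell fractions (x_d, y_d, z_d, w_d),
    //
    //     result = lerp(trilinear(face w), trilinear(face w_n), w_d)
    //
    // which expands to 16 node fetches and 15 lerps, exactly as written out.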
    #[inline(always)]
    pub(crate) fn quadlinear_vec3(
        &self,
        lin_x: f32,
        lin_y: f32,
        lin_z: f32,
        lin_w: f32,
    ) -> AvxVectorSse {
        self.quadlinear(
            lin_x,
            lin_y,
            lin_z,
            lin_w,
            Fetch4Vec3 {
                array: self.array,
                x_stride: self.x_stride,
                y_stride: self.y_stride,
                z_stride: self.z_stride,
            },
        )
    }

    /// Pyramidal interpolation.
    ///
    /// Splits each 3D subcube into three four-sided pyramids, picks the one
    /// containing the sample point, evaluates it on both `w` faces, and
    /// blends the faces linearly along `w`.
    #[cfg(feature = "options")]
    #[inline(always)]
    fn pyramid<
        T: From<f32>
            + Add<T, Output = T>
            + Mul<T, Output = T>
            + FusedMultiplyAdd<T>
            + Sub<T, Output = T>
            + Copy
            + FusedMultiplyNegAdd<T>,
    >(
        &self,
        lin_x: f32,
        lin_y: f32,
        lin_z: f32,
        lin_w: f32,
        r: impl Fetcher4<T>,
    ) -> T {
        let lin_x = lin_x.max(0.0).min(1.0);
        let lin_y = lin_y.max(0.0).min(1.0);
        let lin_z = lin_z.max(0.0).min(1.0);
        let lin_w = lin_w.max(0.0).min(1.0);

        let scale_x = (self.grid_size[0] as i32 - 1) as f32;
        let scale_y = (self.grid_size[1] as i32 - 1) as f32;
        let scale_z = (self.grid_size[2] as i32 - 1) as f32;
        let scale_w = (self.grid_size[3] as i32 - 1) as f32;

        let x = (lin_x * scale_x).floor() as i32;
        let y = (lin_y * scale_y).floor() as i32;
        let z = (lin_z * scale_z).floor() as i32;
        let w = (lin_w * scale_w).floor() as i32;

        let x_n = (lin_x * scale_x).ceil() as i32;
        let y_n = (lin_y * scale_y).ceil() as i32;
        let z_n = (lin_z * scale_z).ceil() as i32;
        let w_n = (lin_w * scale_w).ceil() as i32;

        // Cell fractions; `dr`/`dg`/`db` keep the classic RGB naming for the
        // x/y/z axes, `dw` is the fourth axis.
        let dr = lin_x * scale_x - x as f32;
        let dg = lin_y * scale_y - y as f32;
        let db = lin_z * scale_z - z as f32;
        let dw = lin_w * scale_w - w as f32;

        let c0 = r.fetch(x, y, z, w);

        let w0 = if dr > db && dg > db {
            let x0 = r.fetch(x_n, y_n, z_n, w);
            let x1 = r.fetch(x_n, y_n, z, w);
            let x2 = r.fetch(x_n, y, z, w);
            let x3 = r.fetch(x, y_n, z, w);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            let s0 = c0.mla(c1, T::from(db));
            let s1 = s0.mla(c2, T::from(dr));
            let s2 = s1.mla(c3, T::from(dg));
            s2.mla(c4, T::from(dr * dg))
        } else if db > dr && dg > dr {
            let x0 = r.fetch(x, y, z_n, w);
            let x1 = r.fetch(x_n, y_n, z_n, w);
            let x2 = r.fetch(x, y_n, z_n, w);
            let x3 = r.fetch(x, y_n, z, w);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, T::from(db));
            let s1 = s0.mla(c2, T::from(dr));
            let s2 = s1.mla(c3, T::from(dg));
            s2.mla(c4, T::from(dg * db))
        } else {
            let x0 = r.fetch(x, y, z_n, w);
            let x1 = r.fetch(x_n, y, z, w);
            let x2 = r.fetch(x_n, y, z_n, w);
            let x3 = r.fetch(x_n, y_n, z_n, w);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, T::from(db));
            let s1 = s0.mla(c2, T::from(dr));
            let s2 = s1.mla(c3, T::from(dg));
            s2.mla(c4, T::from(db * dr))
        };

        // Same pyramid on the `w_n` face.
        let c0 = r.fetch(x, y, z, w_n);

        let w1 = if dr > db && dg > db {
            let x0 = r.fetch(x_n, y_n, z_n, w_n);
            let x1 = r.fetch(x_n, y_n, z, w_n);
            let x2 = r.fetch(x_n, y, z, w_n);
            let x3 = r.fetch(x, y_n, z, w_n);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            let s0 = c0.mla(c1, T::from(db));
            let s1 = s0.mla(c2, T::from(dr));
            let s2 = s1.mla(c3, T::from(dg));
            s2.mla(c4, T::from(dr * dg))
        } else if db > dr && dg > dr {
            let x0 = r.fetch(x, y, z_n, w_n);
            let x1 = r.fetch(x_n, y_n, z_n, w_n);
            let x2 = r.fetch(x, y_n, z_n, w_n);
            let x3 = r.fetch(x, y_n, z, w_n);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, T::from(db));
            let s1 = s0.mla(c2, T::from(dr));
            let s2 = s1.mla(c3, T::from(dg));
            s2.mla(c4, T::from(dg * db))
        } else {
            let x0 = r.fetch(x, y, z_n, w_n);
            let x1 = r.fetch(x_n, y, z, w_n);
            let x2 = r.fetch(x_n, y, z_n, w_n);
            let x3 = r.fetch(x_n, y_n, z_n, w_n);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, T::from(db));
            let s1 = s0.mla(c2, T::from(dr));
            let s2 = s1.mla(c3, T::from(dg));
            s2.mla(c4, T::from(db * dr))
        };
        // lerp(w0, w1, dw): w0 * (1 - dw) + w1 * dw.
        w0.neg_mla(w0, T::from(dw)).mla(w1, T::from(dw))
    }

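    // Region selection in `pyramid` above (both `w` faces use the same test):
    // the first branch handles cells where `db` is the smallest fraction, the
    // second where `dr` is smallest, the last where `dg` is smallest (or on
    // ties). Each branch touches only 5 nodes per face instead of the 8 that
    // a full trilinear evaluation would need.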
    #[cfg(feature = "options")]
    #[inline(always)]
    pub(crate) fn pyramid_vec3(
        &self,
        lin_x: f32,
        lin_y: f32,
        lin_z: f32,
        lin_w: f32,
    ) -> AvxVectorSse {
        self.pyramid(
            lin_x,
            lin_y,
            lin_z,
            lin_w,
            Fetch4Vec3 {
                array: self.array,
                x_stride: self.x_stride,
                y_stride: self.y_stride,
                z_stride: self.z_stride,
            },
        )
    }

    /// Prismatic interpolation.
    ///
    /// Splits each 3D subcube into two triangular prisms along the `dr`/`db`
    /// diagonal, evaluates the containing prism on both `w` faces, and blends
    /// the faces linearly along `w`.
    #[cfg(feature = "options")]
    #[inline(always)]
    fn prism<
        T: From<f32>
            + Add<T, Output = T>
            + Mul<T, Output = T>
            + FusedMultiplyAdd<T>
            + Sub<T, Output = T>
            + Copy
            + FusedMultiplyNegAdd<T>,
    >(
        &self,
        lin_x: f32,
        lin_y: f32,
        lin_z: f32,
        lin_w: f32,
        r: impl Fetcher4<T>,
    ) -> T {
        let lin_x = lin_x.max(0.0).min(1.0);
        let lin_y = lin_y.max(0.0).min(1.0);
        let lin_z = lin_z.max(0.0).min(1.0);
        let lin_w = lin_w.max(0.0).min(1.0);

        let scale_x = (self.grid_size[0] as i32 - 1) as f32;
        let scale_y = (self.grid_size[1] as i32 - 1) as f32;
        let scale_z = (self.grid_size[2] as i32 - 1) as f32;
        let scale_w = (self.grid_size[3] as i32 - 1) as f32;

        let x = (lin_x * scale_x).floor() as i32;
        let y = (lin_y * scale_y).floor() as i32;
        let z = (lin_z * scale_z).floor() as i32;
        let w = (lin_w * scale_w).floor() as i32;

        let x_n = (lin_x * scale_x).ceil() as i32;
        let y_n = (lin_y * scale_y).ceil() as i32;
        let z_n = (lin_z * scale_z).ceil() as i32;
        let w_n = (lin_w * scale_w).ceil() as i32;

        let dr = lin_x * scale_x - x as f32;
        let dg = lin_y * scale_y - y as f32;
        let db = lin_z * scale_z - z as f32;
        let dw = lin_w * scale_w - w as f32;

        let c0 = r.fetch(x, y, z, w);

        let w0 = if db >= dr {
            let x0 = r.fetch(x, y, z_n, w);
            let x1 = r.fetch(x_n, y, z_n, w);
            let x2 = r.fetch(x, y_n, z, w);
            let x3 = r.fetch(x, y_n, z_n, w);
            let x4 = r.fetch(x_n, y_n, z_n, w);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, T::from(db));
            let s1 = s0.mla(c2, T::from(dr));
            let s2 = s1.mla(c3, T::from(dg));
            let s3 = s2.mla(c4, T::from(dg * db));
            s3.mla(c5, T::from(dr * dg))
        } else {
            let x0 = r.fetch(x_n, y, z, w);
            let x1 = r.fetch(x_n, y, z_n, w);
            let x2 = r.fetch(x, y_n, z, w);
            let x3 = r.fetch(x_n, y_n, z, w);
            let x4 = r.fetch(x_n, y_n, z_n, w);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, T::from(db));
            let s1 = s0.mla(c2, T::from(dr));
            let s2 = s1.mla(c3, T::from(dg));
            let s3 = s2.mla(c4, T::from(dg * db));
            s3.mla(c5, T::from(dr * dg))
        };

        // Same prism on the `w_n` face.
        let c0 = r.fetch(x, y, z, w_n);

        let w1 = if db >= dr {
            let x0 = r.fetch(x, y, z_n, w_n);
            let x1 = r.fetch(x_n, y, z_n, w_n);
            let x2 = r.fetch(x, y_n, z, w_n);
            let x3 = r.fetch(x, y_n, z_n, w_n);
            let x4 = r.fetch(x_n, y_n, z_n, w_n);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, T::from(db));
            let s1 = s0.mla(c2, T::from(dr));
            let s2 = s1.mla(c3, T::from(dg));
            let s3 = s2.mla(c4, T::from(dg * db));
            s3.mla(c5, T::from(dr * dg))
        } else {
            let x0 = r.fetch(x_n, y, z, w_n);
            let x1 = r.fetch(x_n, y, z_n, w_n);
            let x2 = r.fetch(x, y_n, z, w_n);
            let x3 = r.fetch(x_n, y_n, z, w_n);
            let x4 = r.fetch(x_n, y_n, z_n, w_n);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, T::from(db));
            let s1 = s0.mla(c2, T::from(dr));
            let s2 = s1.mla(c3, T::from(dg));
            let s3 = s2.mla(c4, T::from(dg * db));
            s3.mla(c5, T::from(dr * dg))
        };
        // lerp(w0, w1, dw): w0 * (1 - dw) + w1 * dw.
        w0.neg_mla(w0, T::from(dw)).mla(w1, T::from(dw))
    }

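    // In `prism` above, `db >= dr` selects which of the two prisms contains
    // the sample point; each half needs only 6 node fetches per `w` face
    // (`c0` plus `x0..x4`) instead of 8.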
    #[cfg(feature = "options")]
    #[inline(always)]
    pub(crate) fn prism_vec3(
        &self,
        lin_x: f32,
        lin_y: f32,
        lin_z: f32,
        lin_w: f32,
    ) -> AvxVectorSse {
        self.prism(
            lin_x,
            lin_y,
            lin_z,
            lin_w,
            Fetch4Vec3 {
                array: self.array,
                x_stride: self.x_stride,
                y_stride: self.y_stride,
                z_stride: self.z_stride,
            },
        )
    }

    /// Tetrahedral interpolation.
    ///
    /// The six orderings of the cell fractions `(rx, ry, rz)` split each 3D
    /// subcube into six tetrahedra; the containing tetrahedron is evaluated
    /// on both `w` faces, which are then blended linearly along `w`.
    #[cfg(feature = "options")]
    #[inline(always)]
    fn tetra<
        T: From<f32>
            + Add<T, Output = T>
            + Mul<T, Output = T>
            + FusedMultiplyAdd<T>
            + Sub<T, Output = T>
            + Copy
            + FusedMultiplyNegAdd<T>,
    >(
        &self,
        lin_x: f32,
        lin_y: f32,
        lin_z: f32,
        lin_w: f32,
        r: impl Fetcher4<T>,
    ) -> T {
        let lin_x = lin_x.max(0.0).min(1.0);
        let lin_y = lin_y.max(0.0).min(1.0);
        let lin_z = lin_z.max(0.0).min(1.0);
        let lin_w = lin_w.max(0.0).min(1.0);

        let scale_x = (self.grid_size[0] as i32 - 1) as f32;
        let scale_y = (self.grid_size[1] as i32 - 1) as f32;
        let scale_z = (self.grid_size[2] as i32 - 1) as f32;
        let scale_w = (self.grid_size[3] as i32 - 1) as f32;

        let x = (lin_x * scale_x).floor() as i32;
        let y = (lin_y * scale_y).floor() as i32;
        let z = (lin_z * scale_z).floor() as i32;
        let w = (lin_w * scale_w).floor() as i32;

        let x_n = (lin_x * scale_x).ceil() as i32;
        let y_n = (lin_y * scale_y).ceil() as i32;
        let z_n = (lin_z * scale_z).ceil() as i32;
        let w_n = (lin_w * scale_w).ceil() as i32;

        let rx = lin_x * scale_x - x as f32;
        let ry = lin_y * scale_y - y as f32;
        let rz = lin_z * scale_z - z as f32;
        let rw = lin_w * scale_w - w as f32;

        let c0 = r.fetch(x, y, z, w);
        let c1;
        let c2;
        let c3;
        if rx >= ry {
            if ry >= rz {
                // rx >= ry && ry >= rz
                c1 = r.fetch(x_n, y, z, w) - c0;
                c2 = r.fetch(x_n, y_n, z, w) - r.fetch(x_n, y, z, w);
                c3 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x_n, y_n, z, w);
            } else if rx >= rz {
                // rx >= rz && rz >= ry
                c1 = r.fetch(x_n, y, z, w) - c0;
                c2 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x_n, y, z_n, w);
                c3 = r.fetch(x_n, y, z_n, w) - r.fetch(x_n, y, z, w);
            } else {
                // rz > rx && rx >= ry
                c1 = r.fetch(x_n, y, z_n, w) - r.fetch(x, y, z_n, w);
                c2 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x_n, y, z_n, w);
                c3 = r.fetch(x, y, z_n, w) - c0;
            }
        } else if rx >= rz {
            // ry > rx && rx >= rz
            c1 = r.fetch(x_n, y_n, z, w) - r.fetch(x, y_n, z, w);
            c2 = r.fetch(x, y_n, z, w) - c0;
            c3 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x_n, y_n, z, w);
        } else if ry >= rz {
            // ry >= rz && rz > rx
            c1 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x, y_n, z_n, w);
            c2 = r.fetch(x, y_n, z, w) - c0;
            c3 = r.fetch(x, y_n, z_n, w) - r.fetch(x, y_n, z, w);
        } else {
            // rz > ry && ry > rx
            c1 = r.fetch(x_n, y_n, z_n, w) - r.fetch(x, y_n, z_n, w);
            c2 = r.fetch(x, y_n, z_n, w) - r.fetch(x, y, z_n, w);
            c3 = r.fetch(x, y, z_n, w) - c0;
        }
        let s0 = c0.mla(c1, T::from(rx));
        let s1 = s0.mla(c2, T::from(ry));
        let w0 = s1.mla(c3, T::from(rz));

        // Same tetrahedron on the `w_n` face.
        let c0 = r.fetch(x, y, z, w_n);
        let c1;
        let c2;
        let c3;
        if rx >= ry {
            if ry >= rz {
                // rx >= ry && ry >= rz
                c1 = r.fetch(x_n, y, z, w_n) - c0;
                c2 = r.fetch(x_n, y_n, z, w_n) - r.fetch(x_n, y, z, w_n);
                c3 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x_n, y_n, z, w_n);
            } else if rx >= rz {
                // rx >= rz && rz >= ry
                c1 = r.fetch(x_n, y, z, w_n) - c0;
                c2 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x_n, y, z_n, w_n);
                c3 = r.fetch(x_n, y, z_n, w_n) - r.fetch(x_n, y, z, w_n);
            } else {
                // rz > rx && rx >= ry
                c1 = r.fetch(x_n, y, z_n, w_n) - r.fetch(x, y, z_n, w_n);
                c2 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x_n, y, z_n, w_n);
                c3 = r.fetch(x, y, z_n, w_n) - c0;
            }
        } else if rx >= rz {
            // ry > rx && rx >= rz
            c1 = r.fetch(x_n, y_n, z, w_n) - r.fetch(x, y_n, z, w_n);
            c2 = r.fetch(x, y_n, z, w_n) - c0;
            c3 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x_n, y_n, z, w_n);
        } else if ry >= rz {
            // ry >= rz && rz > rx
            c1 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x, y_n, z_n, w_n);
            c2 = r.fetch(x, y_n, z, w_n) - c0;
            c3 = r.fetch(x, y_n, z_n, w_n) - r.fetch(x, y_n, z, w_n);
        } else {
            // rz > ry && ry > rx
            c1 = r.fetch(x_n, y_n, z_n, w_n) - r.fetch(x, y_n, z_n, w_n);
            c2 = r.fetch(x, y_n, z_n, w_n) - r.fetch(x, y, z_n, w_n);
            c3 = r.fetch(x, y, z_n, w_n) - c0;
        }
        let s0 = c0.mla(c1, T::from(rx));
        let s1 = s0.mla(c2, T::from(ry));
        let w1 = s1.mla(c3, T::from(rz));
        // lerp(w0, w1, rw): w0 * (1 - rw) + w1 * rw.
        w0.neg_mla(w0, T::from(rw)).mla(w1, T::from(rw))
    }

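    // `tetra` is the cheapest of the interpolators here: each branch reads
    // only 4 distinct lattice nodes per `w` face (some of them twice as
    // written), versus 5 for `pyramid`, 6 for `prism`, and 8 for the full
    // trilinear faces of `quadlinear`.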
    #[cfg(feature = "options")]
    #[inline(always)]
    pub(crate) fn tetra_vec3(
        &self,
        lin_x: f32,
        lin_y: f32,
        lin_z: f32,
        lin_w: f32,
    ) -> AvxVectorSse {
        self.tetra(
            lin_x,
            lin_y,
            lin_z,
            lin_w,
            Fetch4Vec3 {
                array: self.array,
                x_stride: self.x_stride,
                y_stride: self.y_stride,
                z_stride: self.z_stride,
            },
        )
    }
}
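
#[cfg(test)]
mod tests {
    use super::*;

    // A minimal sanity sketch added for illustration; it assumes
    // `AvxVectorSse` exposes its raw `__m128` as the field `v`, as the code
    // above does. A 2x2x2x2 CLUT stores (x, y, z) at every node, so
    // quadrilinear interpolation must reproduce node values exactly.
    #[test]
    fn quadlinear_reproduces_nodes() {
        // The fetcher uses SSE4.1 intrinsics (`_mm_insert_epi32`).
        if !std::arch::is_x86_feature_detected!("sse4.1") {
            return;
        }
        let mut table = vec![0f32; 2 * 2 * 2 * 2 * 3];
        for x in 0..2usize {
            for y in 0..2usize {
                for z in 0..2usize {
                    for w in 0..2usize {
                        // Strides for a [2, 2, 2, 2] grid: x: 8, y: 4, z: 2, w: 1.
                        let idx = (x * 8 + y * 4 + z * 2 + w) * 3;
                        table[idx] = x as f32;
                        table[idx + 1] = y as f32;
                        table[idx + 2] = z as f32;
                    }
                }
            }
        }
        let cube = HypercubeAvx::new(&table, [2, 2, 2, 2], 3);
        // Any `w` must yield the same (x, y, z) at a lattice point.
        let v = cube.quadlinear_vec3(1.0, 0.0, 1.0, 0.5);
        let mut out = [0f32; 4];
        unsafe { _mm_storeu_ps(out.as_mut_ptr(), v.v) };
        assert_eq!(out[0], 1.0);
        assert_eq!(out[1], 0.0);
        assert_eq!(out[2], 1.0);
    }
}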