use crate::conversions::interpolator::BarycentricWeight;
use crate::math::{FusedMultiplyAdd, FusedMultiplyNegAdd};
use num_traits::AsPrimitive;
use std::arch::x86_64::*;
use std::ops::{Add, Mul, Sub};

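/// A `[f32; 4]` LUT entry aligned to 16 bytes so that `_mm_load_ps` aligned loads are valid.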
#[repr(align(16), C)]
pub(crate) struct SseAlignedF32(pub(crate) [f32; 4]);

#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxFma<'a, const GRID_SIZE: usize> {
    pub(crate) cube: &'a [SseAlignedF32],
}

#[cfg(feature = "options")]
pub(crate) struct PyramidalAvxFma<'a, const GRID_SIZE: usize> {
    pub(crate) cube: &'a [SseAlignedF32],
}

#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxFma<'a, const GRID_SIZE: usize> {
    pub(crate) cube: &'a [SseAlignedF32],
}

pub(crate) struct TrilinearAvxFma<'a, const GRID_SIZE: usize> {
    pub(crate) cube: &'a [SseAlignedF32],
}

#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxFmaDouble<'a, const GRID_SIZE: usize> {
    pub(crate) cube0: &'a [SseAlignedF32],
    pub(crate) cube1: &'a [SseAlignedF32],
}

pub(crate) struct TrilinearAvxFmaDouble<'a, const GRID_SIZE: usize> {
    pub(crate) cube0: &'a [SseAlignedF32],
    pub(crate) cube1: &'a [SseAlignedF32],
}

#[cfg(feature = "options")]
pub(crate) struct PyramidAvxFmaDouble<'a, const GRID_SIZE: usize> {
    pub(crate) cube0: &'a [SseAlignedF32],
    pub(crate) cube1: &'a [SseAlignedF32],
}

#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxFmaDouble<'a, const GRID_SIZE: usize> {
    pub(crate) cube0: &'a [SseAlignedF32],
    pub(crate) cube1: &'a [SseAlignedF32],
}

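/// Interpolates two LUTs at the same (r, g, b) coordinates, returning one result per table.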
pub(crate) trait AvxMdInterpolationDouble<'a, const GRID_SIZE: usize> {
    fn new(table0: &'a [SseAlignedF32], table1: &'a [SseAlignedF32]) -> Self;
    fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
    ) -> (AvxVectorSse, AvxVectorSse);
}

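/// Interpolates a single LUT at the given (r, g, b) coordinates.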
pub(crate) trait AvxMdInterpolation<'a, const GRID_SIZE: usize> {
    fn new(table: &'a [SseAlignedF32]) -> Self;
    fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
    ) -> AvxVectorSse;
}

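/// Loads the LUT entry at integer grid coordinates (x, y, z).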
trait Fetcher<T> {
    fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}

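// Thin newtypes over `__m128` / `__m256` so arithmetic operators and the FMA traits
// can be implemented directly on SIMD registers.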
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorSse {
    pub(crate) v: __m128,
}

#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVector {
    pub(crate) v: __m256,
}

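// `from_sse` packs two 128-bit lanes into one 256-bit register; `split` is the inverse.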
impl AvxVector {
    #[inline(always)]
    pub(crate) fn from_sse(lo: AvxVectorSse, hi: AvxVectorSse) -> AvxVector {
        unsafe {
            AvxVector {
                v: _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(lo.v), hi.v),
            }
        }
    }

    #[inline(always)]
    pub(crate) fn split(self) -> (AvxVectorSse, AvxVectorSse) {
        unsafe {
            (
                AvxVectorSse {
                    v: _mm256_castps256_ps128(self.v),
                },
                AvxVectorSse {
                    v: _mm256_extractf128_ps::<1>(self.v),
                },
            )
        }
    }
}

impl From<f32> for AvxVectorSse {
    #[inline(always)]
    fn from(v: f32) -> Self {
        AvxVectorSse {
            v: unsafe { _mm_set1_ps(v) },
        }
    }
}

impl From<f32> for AvxVector {
    #[inline(always)]
    fn from(v: f32) -> Self {
        AvxVector {
            v: unsafe { _mm256_set1_ps(v) },
        }
    }
}

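// Element-wise arithmetic and fused operations: `mla` computes `self + b * c`,
// `neg_mla` computes `self - b * c`.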
impl Sub<AvxVectorSse> for AvxVectorSse {
    type Output = Self;
    #[inline(always)]
    fn sub(self, rhs: AvxVectorSse) -> Self::Output {
        AvxVectorSse {
            v: unsafe { _mm_sub_ps(self.v, rhs.v) },
        }
    }
}

impl Sub<AvxVector> for AvxVector {
    type Output = Self;
    #[inline(always)]
    fn sub(self, rhs: AvxVector) -> Self::Output {
        AvxVector {
            v: unsafe { _mm256_sub_ps(self.v, rhs.v) },
        }
    }
}

impl Add<AvxVectorSse> for AvxVectorSse {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: AvxVectorSse) -> Self::Output {
        AvxVectorSse {
            v: unsafe { _mm_add_ps(self.v, rhs.v) },
        }
    }
}

impl Mul<AvxVectorSse> for AvxVectorSse {
    type Output = Self;
    #[inline(always)]
    fn mul(self, rhs: AvxVectorSse) -> Self::Output {
        AvxVectorSse {
            v: unsafe { _mm_mul_ps(self.v, rhs.v) },
        }
    }
}

impl AvxVector {
    #[inline(always)]
    pub(crate) fn neg_mla(self, b: AvxVector, c: AvxVector) -> Self {
        Self {
            v: unsafe { _mm256_fnmadd_ps(b.v, c.v, self.v) },
        }
    }
}

impl FusedMultiplyNegAdd<AvxVectorSse> for AvxVectorSse {
    #[inline(always)]
    fn neg_mla(&self, b: AvxVectorSse, c: AvxVectorSse) -> Self {
        Self {
            v: unsafe { _mm_fnmadd_ps(b.v, c.v, self.v) },
        }
    }
}

impl Add<AvxVector> for AvxVector {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: AvxVector) -> Self::Output {
        AvxVector {
            v: unsafe { _mm256_add_ps(self.v, rhs.v) },
        }
    }
}

impl Mul<AvxVector> for AvxVector {
    type Output = Self;
    #[inline(always)]
    fn mul(self, rhs: AvxVector) -> Self::Output {
        AvxVector {
            v: unsafe { _mm256_mul_ps(self.v, rhs.v) },
        }
    }
}

impl FusedMultiplyAdd<AvxVectorSse> for AvxVectorSse {
    #[inline(always)]
    fn mla(&self, b: AvxVectorSse, c: AvxVectorSse) -> AvxVectorSse {
        AvxVectorSse {
            v: unsafe { _mm_fmadd_ps(b.v, c.v, self.v) },
        }
    }
}

impl FusedMultiplyAdd<AvxVector> for AvxVector {
    #[inline(always)]
    fn mla(&self, b: AvxVector, c: AvxVector) -> AvxVector {
        AvxVector {
            v: unsafe { _mm256_fmadd_ps(b.v, c.v, self.v) },
        }
    }
}

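// LUT fetchers. Tables are laid out as GRID_SIZE^3 aligned entries indexed by
// x * GRID_SIZE^2 + y * GRID_SIZE + z; the AVX variant loads the matching entry
// from both tables into the two lanes of a 256-bit register.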
struct TetrahedralAvxSseFetchVector<'a, const GRID_SIZE: usize> {
    cube: &'a [SseAlignedF32],
}

struct TetrahedralAvxFetchVector<'a, const GRID_SIZE: usize> {
    cube0: &'a [SseAlignedF32],
    cube1: &'a [SseAlignedF32],
}

impl<const GRID_SIZE: usize> Fetcher<AvxVector> for TetrahedralAvxFetchVector<'_, GRID_SIZE> {
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVector {
        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
            + y as u32 * GRID_SIZE as u32
            + z as u32) as usize;
        let jx0 = unsafe { self.cube0.get_unchecked(offset..) };
        let jx1 = unsafe { self.cube1.get_unchecked(offset..) };
        AvxVector {
            v: unsafe {
                _mm256_insertf128_ps::<1>(
                    _mm256_castps128_ps256(_mm_load_ps(jx0.as_ptr() as *const f32)),
                    _mm_load_ps(jx1.as_ptr() as *const f32),
                )
            },
        }
    }
}

impl<const GRID_SIZE: usize> Fetcher<AvxVectorSse> for TetrahedralAvxSseFetchVector<'_, GRID_SIZE> {
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorSse {
        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
            + y as u32 * GRID_SIZE as u32
            + z as u32) as usize;
        let jx = unsafe { self.cube.get_unchecked(offset..) };
        AvxVectorSse {
            v: unsafe { _mm_load_ps(jx.as_ptr() as *const f32) },
        }
    }
}

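// Tetrahedral interpolation: the ordering of the barycentric weights rx/ry/rz selects
// one of six tetrahedra in the grid cell, and the result is c0 + c1*rx + c2*ry + c3*rz
// evaluated with fused multiply-adds.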
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxFma<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let c2;
        let c1;
        let c3;
        if rx >= ry {
            if ry >= rz {
                c1 = r.fetch(x_n, y, z) - c0;
                c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
                c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
            } else if rx >= rz {
                c1 = r.fetch(x_n, y, z) - c0;
                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
                c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
            } else {
                c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
                c3 = r.fetch(x, y, z_n) - c0;
            }
        } else if rx >= rz {
            c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
            c2 = r.fetch(x, y_n, z) - c0;
            c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
        } else if ry >= rz {
            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
            c2 = r.fetch(x, y_n, z) - c0;
            c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
        } else {
            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
            c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
            c3 = r.fetch(x, y, z_n) - c0;
        }
        let s0 = c0.mla(c1, AvxVectorSse::from(rx));
        let s1 = s0.mla(c2, AvxVectorSse::from(ry));
        s1.mla(c3, AvxVectorSse::from(rz))
    }
}

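// Boilerplate impls: the macros below wire each interpolator's `interpolate` method into
// the `AvxMdInterpolation` / `AvxMdInterpolationDouble` traits.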
macro_rules! define_interp_avx {
    ($interpolator: ident) => {
        impl<'a, const GRID_SIZE: usize> AvxMdInterpolation<'a, GRID_SIZE>
            for $interpolator<'a, GRID_SIZE>
        {
            #[inline(always)]
            fn new(table: &'a [SseAlignedF32]) -> Self {
                Self { cube: table }
            }

            #[inline(always)]
            fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
                &self,
                in_r: U,
                in_g: U,
                in_b: U,
                lut: &[BarycentricWeight<f32>; BINS],
            ) -> AvxVectorSse {
                self.interpolate(
                    in_r,
                    in_g,
                    in_b,
                    lut,
                    TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: self.cube },
                )
            }
        }
    };
}

#[cfg(feature = "options")]
macro_rules! define_interp_avx_d {
    ($interpolator: ident) => {
        impl<'a, const GRID_SIZE: usize> AvxMdInterpolationDouble<'a, GRID_SIZE>
            for $interpolator<'a, GRID_SIZE>
        {
            #[inline(always)]
            fn new(table0: &'a [SseAlignedF32], table1: &'a [SseAlignedF32]) -> Self {
                Self {
                    cube0: table0,
                    cube1: table1,
                }
            }

            #[inline(always)]
            fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
                &self,
                in_r: U,
                in_g: U,
                in_b: U,
                lut: &[BarycentricWeight<f32>; BINS],
            ) -> (AvxVectorSse, AvxVectorSse) {
                self.interpolate(
                    in_r,
                    in_g,
                    in_b,
                    lut,
                    TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: self.cube0 },
                    TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: self.cube1 },
                )
            }
        }
    };
}

#[cfg(feature = "options")]
define_interp_avx!(TetrahedralAvxFma);
#[cfg(feature = "options")]
define_interp_avx!(PyramidalAvxFma);
#[cfg(feature = "options")]
define_interp_avx!(PrismaticAvxFma);
define_interp_avx!(TrilinearAvxFma);
#[cfg(feature = "options")]
define_interp_avx_d!(PrismaticAvxFmaDouble);
#[cfg(feature = "options")]
define_interp_avx_d!(PyramidAvxFmaDouble);

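// The tetrahedral and trilinear double variants are implemented by hand because they fetch
// from both tables at once through `TetrahedralAvxFetchVector` instead of two per-table
// SSE fetchers.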
#[cfg(feature = "options")]
impl<'a, const GRID_SIZE: usize> AvxMdInterpolationDouble<'a, GRID_SIZE>
    for TetrahedralAvxFmaDouble<'a, GRID_SIZE>
{
    #[inline(always)]
    fn new(table0: &'a [SseAlignedF32], table1: &'a [SseAlignedF32]) -> Self {
        Self {
            cube0: table0,
            cube1: table1,
        }
    }

    #[inline(always)]
    fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
    ) -> (AvxVectorSse, AvxVectorSse) {
        self.interpolate(
            in_r,
            in_g,
            in_b,
            lut,
            TetrahedralAvxFetchVector::<GRID_SIZE> {
                cube0: self.cube0,
                cube1: self.cube1,
            },
        )
    }
}

impl<'a, const GRID_SIZE: usize> AvxMdInterpolationDouble<'a, GRID_SIZE>
    for TrilinearAvxFmaDouble<'a, GRID_SIZE>
{
    #[inline(always)]
    fn new(table0: &'a [SseAlignedF32], table1: &'a [SseAlignedF32]) -> Self {
        Self {
            cube0: table0,
            cube1: table1,
        }
    }

    #[inline(always)]
    fn inter3_sse<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
    ) -> (AvxVectorSse, AvxVectorSse) {
        self.interpolate(
            in_r,
            in_g,
            in_b,
            lut,
            TetrahedralAvxFetchVector::<GRID_SIZE> {
                cube0: self.cube0,
                cube1: self.cube1,
            },
        )
    }
}

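// Pyramidal interpolation: the smallest of dr/dg/db selects one of three pyramids, and a
// bilinear correction term c4 is added, weighted by the product of the two larger weights.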
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidalAvxFma<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let w0 = AvxVectorSse::from(db);
        let w1 = AvxVectorSse::from(dr);
        let w2 = AvxVectorSse::from(dg);

        if dr > db && dg > db {
            let w3 = AvxVectorSse::from(dr * dg);
            let x0 = r.fetch(x_n, y_n, z_n);
            let x1 = r.fetch(x_n, y_n, z);
            let x2 = r.fetch(x_n, y, z);
            let x3 = r.fetch(x, y_n, z);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        } else if db > dr && dg > dr {
            let w3 = AvxVectorSse::from(dg * db);

            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y_n, z_n);
            let x2 = r.fetch(x, y_n, z_n);
            let x3 = r.fetch(x, y_n, z);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        } else {
            let w3 = AvxVectorSse::from(db * dr);

            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y, z);
            let x2 = r.fetch(x_n, y, z_n);
            let x3 = r.fetch(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        }
    }
}

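// Prismatic interpolation: the db > dr comparison splits the cell into two triangular
// prisms; two bilinear correction terms (c4, c5) are applied on top of the linear terms.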
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxFma<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let w0 = AvxVectorSse::from(db);
        let w1 = AvxVectorSse::from(dr);
        let w2 = AvxVectorSse::from(dg);
        let w3 = AvxVectorSse::from(dg * db);
        let w4 = AvxVectorSse::from(dr * dg);

        if db > dr {
            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y, z_n);
            let x2 = r.fetch(x, y_n, z);
            let x3 = r.fetch(x, y_n, z_n);
            let x4 = r.fetch(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4)
        } else {
            let x0 = r.fetch(x_n, y, z);
            let x1 = r.fetch(x_n, y, z_n);
            let x2 = r.fetch(x, y_n, z);
            let x3 = r.fetch(x_n, y_n, z);
            let x4 = r.fetch(x_n, y_n, z_n);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4)
        }
    }
}

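// Prismatic scheme for two LUTs at once: per-table SSE fetches are packed into 256-bit
// vectors so all five FMA steps run on both tables simultaneously.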
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxFmaDouble<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        r0: impl Fetcher<AvxVectorSse>,
        r1: impl Fetcher<AvxVectorSse>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        // Base corner of the cell, one per table (the second must come from `r1`).
        let c0_0 = r0.fetch(x, y, z);
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVector::from(db);
        let w1 = AvxVector::from(dr);
        let w2 = AvxVector::from(dg);
        let w3 = AvxVector::from(dg * db);
        let w4 = AvxVector::from(dr * dg);

        let c0 = AvxVector::from_sse(c0_0, c0_1);

        if db > dr {
            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x, y_n, z_n);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x, y_n, z_n);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);
            let x4 = AvxVector::from_sse(x4_0, x4_1);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        } else {
            let x0_0 = r0.fetch(x_n, y, z);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x_n, y_n, z);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x_n, y, z);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x_n, y_n, z);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);
            let x4 = AvxVector::from_sse(x4_0, x4_1);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        }
    }
}

#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidAvxFmaDouble<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        r0: impl Fetcher<AvxVectorSse>,
        r1: impl Fetcher<AvxVectorSse>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0_0 = r0.fetch(x, y, z);
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVector::from(db);
        let w1 = AvxVector::from(dr);
        let w2 = AvxVector::from(dg);

        let c0 = AvxVector::from_sse(c0_0, c0_1);

        if dr > db && dg > db {
            let w3 = AvxVector::from(dr * dg);

            let x0_0 = r0.fetch(x_n, y_n, z_n);
            let x1_0 = r0.fetch(x_n, y_n, z);
            let x2_0 = r0.fetch(x_n, y, z);
            let x3_0 = r0.fetch(x, y_n, z);

            let x0_1 = r1.fetch(x_n, y_n, z_n);
            let x1_1 = r1.fetch(x_n, y_n, z);
            let x2_1 = r1.fetch(x_n, y, z);
            let x3_1 = r1.fetch(x, y_n, z);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        } else if db > dr && dg > dr {
            let w3 = AvxVector::from(dg * db);

            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y_n, z_n);
            let x2_0 = r0.fetch(x, y_n, z_n);
            let x3_0 = r0.fetch(x, y_n, z);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y_n, z_n);
            let x2_1 = r1.fetch(x, y_n, z_n);
            let x3_1 = r1.fetch(x, y_n, z);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        } else {
            let w3 = AvxVector::from(db * dr);

            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z);
            let x2_0 = r0.fetch(x_n, y, z_n);
            let x3_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z);
            let x2_1 = r1.fetch(x_n, y, z_n);
            let x3_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        }
    }
}

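// Tetrahedral scheme for two LUTs: each fetch already returns both tables packed in one
// 256-bit register, so the corner differences and FMAs are evaluated once.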
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxFmaDouble<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        rv: impl Fetcher<AvxVector>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let c0 = rv.fetch(x, y, z);

        let w0 = AvxVector::from(rx);
        let w1 = AvxVector::from(ry);
        let w2 = AvxVector::from(rz);

        let c2;
        let c1;
        let c3;
        if rx >= ry {
            if ry >= rz {
                c1 = rv.fetch(x_n, y, z) - c0;
                c2 = rv.fetch(x_n, y_n, z) - rv.fetch(x_n, y, z);
                c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
            } else if rx >= rz {
                c1 = rv.fetch(x_n, y, z) - c0;
                c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
                c3 = rv.fetch(x_n, y, z_n) - rv.fetch(x_n, y, z);
            } else {
                c1 = rv.fetch(x_n, y, z_n) - rv.fetch(x, y, z_n);
                c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
                c3 = rv.fetch(x, y, z_n) - c0;
            }
        } else if rx >= rz {
            c1 = rv.fetch(x_n, y_n, z) - rv.fetch(x, y_n, z);
            c2 = rv.fetch(x, y_n, z) - c0;
            c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
        } else if ry >= rz {
            c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
            c2 = rv.fetch(x, y_n, z) - c0;
            c3 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y_n, z);
        } else {
            c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
            c2 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y, z_n);
            c3 = rv.fetch(x, y, z_n) - c0;
        }
        let s0 = c0.mla(c1, w0);
        let s1 = s0.mla(c2, w1);
        s1.mla(c3, w2).split()
    }
}

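// Trilinear interpolation: each `v.neg_mla(v, d).mla(u, w)` step is the lerp
// v * (1 - d) + u * d, applied along r, then g, then b.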
impl<const GRID_SIZE: usize> TrilinearAvxFmaDouble<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        rv: impl Fetcher<AvxVector>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let w0 = AvxVector::from(rx);
        let w1 = AvxVector::from(ry);
        let w2 = AvxVector::from(rz);

        let c000 = rv.fetch(x, y, z);
        let c100 = rv.fetch(x_n, y, z);
        let c010 = rv.fetch(x, y_n, z);
        let c110 = rv.fetch(x_n, y_n, z);
        let c001 = rv.fetch(x, y, z_n);
        let c101 = rv.fetch(x_n, y, z_n);
        let c011 = rv.fetch(x, y_n, z_n);
        let c111 = rv.fetch(x_n, y_n, z_n);

        let dx = AvxVector::from(rx);

        let c00 = c000.neg_mla(c000, dx).mla(c100, w0);
        let c10 = c010.neg_mla(c010, dx).mla(c110, w0);
        let c01 = c001.neg_mla(c001, dx).mla(c101, w0);
        let c11 = c011.neg_mla(c011, dx).mla(c111, w0);

        let dy = AvxVector::from(ry);

        let c0 = c00.neg_mla(c00, dy).mla(c10, w1);
        let c1 = c01.neg_mla(c01, dy).mla(c11, w1);

        let dz = AvxVector::from(rz);

        c0.neg_mla(c0, dz).mla(c1, w2).split()
    }
}

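// Single-table trilinear: the z and z_n corner planes are packed into the low and high
// 128-bit lanes, so the r and g lerps run on both planes at once; the final b lerp is
// done on 128-bit lanes after the split.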
impl<const GRID_SIZE: usize> TrilinearAvxFma<'_, GRID_SIZE> {
    #[inline(always)]
    fn interpolate<U: AsPrimitive<usize>, const BINS: usize>(
        &self,
        in_r: U,
        in_g: U,
        in_b: U,
        lut: &[BarycentricWeight<f32>; BINS],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = lut[in_r.as_()];
        let lut_g = lut[in_g.as_()];
        let lut_b = lut[in_b.as_()];

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let w0 = AvxVector::from(dr);
        let w1 = AvxVector::from(dg);
        let w2 = AvxVectorSse::from(db);

        let c000 = r.fetch(x, y, z);
        let c100 = r.fetch(x_n, y, z);
        let c010 = r.fetch(x, y_n, z);
        let c110 = r.fetch(x_n, y_n, z);
        let c001 = r.fetch(x, y, z_n);
        let c101 = r.fetch(x_n, y, z_n);
        let c011 = r.fetch(x, y_n, z_n);
        let c111 = r.fetch(x_n, y_n, z_n);

        let x000 = AvxVector::from_sse(c000, c001);
        let x010 = AvxVector::from_sse(c010, c011);
        let x011 = AvxVector::from_sse(c100, c101);
        let x111 = AvxVector::from_sse(c110, c111);

        let c00 = x000.neg_mla(x000, w0).mla(x011, w0);
        let c10 = x010.neg_mla(x010, w0).mla(x111, w0);

        let z0 = c00.neg_mla(c00, w1).mla(c10, w1);

        let (c0, c1) = z0.split();

        c0.neg_mla(c0, w2).mla(c1, w2)
    }
}