use crate::conversions::interpolator::BarycentricWeight;
use crate::math::{FusedMultiplyAdd, FusedMultiplyNegAdd};
use std::arch::x86_64::*;
use std::ops::{Add, Mul, Sub};

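/// Four `f32` lanes with 16-byte alignment, so the fetchers below can use the
/// aligned `_mm_load_ps` load soundly.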
#[repr(C, align(16))]
pub(crate) struct SseAlignedF32(pub(crate) [f32; 4]);

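// Marker types selecting an interpolation scheme (tetrahedral, pyramidal,
// prismatic or trilinear) over a 3D LUT with GRID_SIZE nodes per axis.
// The `*Double` variants interpolate two LUTs at the same coordinates in one
// pass.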
#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxFma<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PyramidalAvxFma<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxFma<const GRID_SIZE: usize> {}

pub(crate) struct TrilinearAvxFma<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxFmaDouble<const GRID_SIZE: usize> {}

pub(crate) struct TrilinearAvxFmaDouble<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PyramidAvxFmaDouble<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxFmaDouble<const GRID_SIZE: usize> {}

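/// Interpolates one (r, g, b) sample in two LUTs at once, returning one result
/// per table.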
pub(crate) trait AvxMdInterpolationDouble {
    fn inter3_sse(
        &self,
        table0: &[SseAlignedF32],
        table1: &[SseAlignedF32],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
    ) -> (AvxVectorSse, AvxVectorSse);
}

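/// Interpolates one (r, g, b) sample in a single LUT.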
pub(crate) trait AvxMdInterpolation {
    fn inter3_sse(
        &self,
        table: &[SseAlignedF32],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
    ) -> AvxVectorSse;
}

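/// Fetches the LUT entry at grid node (x, y, z). Tables are laid out
/// row-major, so the linear offset is `x * GRID_SIZE * GRID_SIZE + y * GRID_SIZE + z`.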
trait Fetcher<T> {
    fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}

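/// Newtype over a 128-bit SSE register (four `f32` lanes).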
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorSse {
    pub(crate) v: __m128,
}

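/// Newtype over a 256-bit AVX register; its two 128-bit halves carry two
/// independent `AvxVectorSse` values, letting the `*Double` kernels process
/// both LUTs with one instruction stream.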
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVector {
    pub(crate) v: __m256,
}

impl AvxVector {
    #[inline(always)]
    pub(crate) fn from_sse(lo: AvxVectorSse, hi: AvxVectorSse) -> AvxVector {
        unsafe {
            AvxVector {
                v: _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(lo.v), hi.v),
            }
        }
    }

    #[inline(always)]
    pub(crate) fn split(self) -> (AvxVectorSse, AvxVectorSse) {
        unsafe {
            (
                AvxVectorSse {
                    v: _mm256_castps256_ps128(self.v),
                },
                AvxVectorSse {
                    v: _mm256_extractf128_ps::<1>(self.v),
                },
            )
        }
    }
}

impl From<f32> for AvxVectorSse {
    #[inline(always)]
    fn from(v: f32) -> Self {
        AvxVectorSse {
            v: unsafe { _mm_set1_ps(v) },
        }
    }
}

impl From<f32> for AvxVector {
    #[inline(always)]
    fn from(v: f32) -> Self {
        AvxVector {
            v: unsafe { _mm256_set1_ps(v) },
        }
    }
}

impl Sub<AvxVectorSse> for AvxVectorSse {
    type Output = Self;
    #[inline(always)]
    fn sub(self, rhs: AvxVectorSse) -> Self::Output {
        AvxVectorSse {
            v: unsafe { _mm_sub_ps(self.v, rhs.v) },
        }
    }
}

impl Sub<AvxVector> for AvxVector {
    type Output = Self;
    #[inline(always)]
    fn sub(self, rhs: AvxVector) -> Self::Output {
        AvxVector {
            v: unsafe { _mm256_sub_ps(self.v, rhs.v) },
        }
    }
}

impl Add<AvxVectorSse> for AvxVectorSse {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: AvxVectorSse) -> Self::Output {
        AvxVectorSse {
            v: unsafe { _mm_add_ps(self.v, rhs.v) },
        }
    }
}

impl Mul<AvxVectorSse> for AvxVectorSse {
    type Output = Self;
    #[inline(always)]
    fn mul(self, rhs: AvxVectorSse) -> Self::Output {
        AvxVectorSse {
            v: unsafe { _mm_mul_ps(self.v, rhs.v) },
        }
    }
}

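// Fused helpers: `a.neg_mla(b, c)` computes `a - b * c` via a single fnmadd,
// and `a.mla(b, c)` computes `a + b * c` via a single fmadd.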
impl AvxVector {
    #[inline(always)]
    pub(crate) fn neg_mla(self, b: AvxVector, c: AvxVector) -> Self {
        Self {
            v: unsafe { _mm256_fnmadd_ps(b.v, c.v, self.v) },
        }
    }
}

impl FusedMultiplyNegAdd<AvxVectorSse> for AvxVectorSse {
    #[inline(always)]
    fn neg_mla(&self, b: AvxVectorSse, c: AvxVectorSse) -> Self {
        Self {
            v: unsafe { _mm_fnmadd_ps(b.v, c.v, self.v) },
        }
    }
}

impl Add<AvxVector> for AvxVector {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: AvxVector) -> Self::Output {
        AvxVector {
            v: unsafe { _mm256_add_ps(self.v, rhs.v) },
        }
    }
}

impl Mul<AvxVector> for AvxVector {
    type Output = Self;
    #[inline(always)]
    fn mul(self, rhs: AvxVector) -> Self::Output {
        AvxVector {
            v: unsafe { _mm256_mul_ps(self.v, rhs.v) },
        }
    }
}

impl FusedMultiplyAdd<AvxVectorSse> for AvxVectorSse {
    #[inline(always)]
    fn mla(&self, b: AvxVectorSse, c: AvxVectorSse) -> AvxVectorSse {
        AvxVectorSse {
            v: unsafe { _mm_fmadd_ps(b.v, c.v, self.v) },
        }
    }
}

impl FusedMultiplyAdd<AvxVector> for AvxVector {
    #[inline(always)]
    fn mla(&self, b: AvxVector, c: AvxVector) -> AvxVector {
        AvxVector {
            v: unsafe { _mm256_fmadd_ps(b.v, c.v, self.v) },
        }
    }
}

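// Fetchers resolving grid coordinates to LUT entries: the SSE variant reads one
// table; the AVX variant reads the same node from two tables and packs both
// into the halves of a 256-bit register.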
struct TetrahedralAvxSseFetchVector<'a, const GRID_SIZE: usize> {
    cube: &'a [SseAlignedF32],
}

struct TetrahedralAvxFetchVector<'a, const GRID_SIZE: usize> {
    cube0: &'a [SseAlignedF32],
    cube1: &'a [SseAlignedF32],
}

impl<const GRID_SIZE: usize> Fetcher<AvxVector> for TetrahedralAvxFetchVector<'_, GRID_SIZE> {
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVector {
        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
            + y as u32 * GRID_SIZE as u32
            + z as u32) as usize;
        let jx0 = unsafe { self.cube0.get_unchecked(offset..) };
        let jx1 = unsafe { self.cube1.get_unchecked(offset..) };
        AvxVector {
            v: unsafe {
                _mm256_insertf128_ps::<1>(
                    _mm256_castps128_ps256(_mm_load_ps(jx0.as_ptr() as *const f32)),
                    _mm_load_ps(jx1.as_ptr() as *const f32),
                )
            },
        }
    }
}

impl<const GRID_SIZE: usize> Fetcher<AvxVectorSse> for TetrahedralAvxSseFetchVector<'_, GRID_SIZE> {
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorSse {
        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
            + y as u32 * GRID_SIZE as u32
            + z as u32) as usize;
        let jx = unsafe { self.cube.get_unchecked(offset..) };
        AvxVectorSse {
            v: unsafe { _mm_load_ps(jx.as_ptr() as *const f32) },
        }
    }
}

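// Tetrahedral interpolation: the ordering of the fractional weights (rx, ry,
// rz) selects one of the six tetrahedra a unit cube decomposes into, and the
// result is c0 + c1 * rx + c2 * ry + c3 * rz, evaluated as a chain of FMAs.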
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxFma<GRID_SIZE> {
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = unsafe { lut.get_unchecked(in_r) };
        let lut_g = unsafe { lut.get_unchecked(in_g) };
        let lut_b = unsafe { lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let c1;
        let c2;
        let c3;
        if rx >= ry {
            if ry >= rz {
                c1 = r.fetch(x_n, y, z) - c0;
                c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
                c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
            } else if rx >= rz {
                c1 = r.fetch(x_n, y, z) - c0;
                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
                c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
            } else {
                c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
                c3 = r.fetch(x, y, z_n) - c0;
            }
        } else if rx >= rz {
            c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
            c2 = r.fetch(x, y_n, z) - c0;
            c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
        } else if ry >= rz {
            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
            c2 = r.fetch(x, y_n, z) - c0;
            c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
        } else {
            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
            c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
            c3 = r.fetch(x, y, z_n) - c0;
        }
        let s0 = c0.mla(c1, AvxVectorSse::from(rx));
        let s1 = s0.mla(c2, AvxVectorSse::from(ry));
        s1.mla(c3, AvxVectorSse::from(rz))
    }
}

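// Wires a marker type to its trait by delegating to that type's
// `#[target_feature]` kernel with the appropriate fetcher(s).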
macro_rules! define_interp_avx {
    ($interpolator: ident) => {
        impl<const GRID_SIZE: usize> AvxMdInterpolation for $interpolator<GRID_SIZE> {
            fn inter3_sse(
                &self,
                table: &[SseAlignedF32],
                in_r: usize,
                in_g: usize,
                in_b: usize,
                lut: &[BarycentricWeight<f32>],
            ) -> AvxVectorSse {
                unsafe {
                    self.interpolate(
                        in_r,
                        in_g,
                        in_b,
                        lut,
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table },
                    )
                }
            }
        }
    };
}

#[cfg(feature = "options")]
macro_rules! define_interp_avx_d {
    ($interpolator: ident) => {
        impl<const GRID_SIZE: usize> AvxMdInterpolationDouble for $interpolator<GRID_SIZE> {
            fn inter3_sse(
                &self,
                table0: &[SseAlignedF32],
                table1: &[SseAlignedF32],
                in_r: usize,
                in_g: usize,
                in_b: usize,
                lut: &[BarycentricWeight<f32>],
            ) -> (AvxVectorSse, AvxVectorSse) {
                unsafe {
                    self.interpolate(
                        in_r,
                        in_g,
                        in_b,
                        lut,
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table0 },
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table1 },
                    )
                }
            }
        }
    };
}

#[cfg(feature = "options")]
define_interp_avx!(TetrahedralAvxFma);
#[cfg(feature = "options")]
define_interp_avx!(PyramidalAvxFma);
#[cfg(feature = "options")]
define_interp_avx!(PrismaticAvxFma);
define_interp_avx!(TrilinearAvxFma);
#[cfg(feature = "options")]
define_interp_avx_d!(PrismaticAvxFmaDouble);
#[cfg(feature = "options")]
define_interp_avx_d!(PyramidAvxFmaDouble);

#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> AvxMdInterpolationDouble for TetrahedralAvxFmaDouble<GRID_SIZE> {
    fn inter3_sse(
        &self,
        table0: &[SseAlignedF32],
        table1: &[SseAlignedF32],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
    ) -> (AvxVectorSse, AvxVectorSse) {
        unsafe {
            self.interpolate(
                in_r,
                in_g,
                in_b,
                lut,
                TetrahedralAvxFetchVector::<GRID_SIZE> {
                    cube0: table0,
                    cube1: table1,
                },
            )
        }
    }
}

impl<const GRID_SIZE: usize> AvxMdInterpolationDouble for TrilinearAvxFmaDouble<GRID_SIZE> {
    fn inter3_sse(
        &self,
        table0: &[SseAlignedF32],
        table1: &[SseAlignedF32],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
    ) -> (AvxVectorSse, AvxVectorSse) {
        unsafe {
            self.interpolate(
                in_r,
                in_g,
                in_b,
                lut,
                TetrahedralAvxFetchVector::<GRID_SIZE> {
                    cube0: table0,
                    cube1: table1,
                },
            )
        }
    }
}

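// Pyramidal interpolation: the branch picks the pyramid of the cube containing
// (dr, dg, db) and adds a bilinear correction term c4 weighted by the product
// of the two dominant weights.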
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidalAvxFma<GRID_SIZE> {
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = unsafe { lut.get_unchecked(in_r) };
        let lut_g = unsafe { lut.get_unchecked(in_g) };
        let lut_b = unsafe { lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let w0 = AvxVectorSse::from(db);
        let w1 = AvxVectorSse::from(dr);
        let w2 = AvxVectorSse::from(dg);

        if dr > db && dg > db {
            let w3 = AvxVectorSse::from(dr * dg);
            let x0 = r.fetch(x_n, y_n, z_n);
            let x1 = r.fetch(x_n, y_n, z);
            let x2 = r.fetch(x_n, y, z);
            let x3 = r.fetch(x, y_n, z);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        } else if db > dr && dg > dr {
            let w3 = AvxVectorSse::from(dg * db);

            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y_n, z_n);
            let x2 = r.fetch(x, y_n, z_n);
            let x3 = r.fetch(x, y_n, z);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        } else {
            let w3 = AvxVectorSse::from(db * dr);

            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y, z);
            let x2 = r.fetch(x_n, y, z_n);
            let x3 = r.fetch(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        }
    }
}

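// Prismatic interpolation: the `db > dr` test splits the cube into two
// triangular prisms; each branch interpolates within one prism, with two
// bilinear correction terms c4 and c5.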
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxFma<GRID_SIZE> {
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = unsafe { lut.get_unchecked(in_r) };
        let lut_g = unsafe { lut.get_unchecked(in_g) };
        let lut_b = unsafe { lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let w0 = AvxVectorSse::from(db);
        let w1 = AvxVectorSse::from(dr);
        let w2 = AvxVectorSse::from(dg);
        let w3 = AvxVectorSse::from(dg * db);
        let w4 = AvxVectorSse::from(dr * dg);

        if db > dr {
            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y, z_n);
            let x2 = r.fetch(x, y_n, z);
            let x3 = r.fetch(x, y_n, z_n);
            let x4 = r.fetch(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4)
        } else {
            let x0 = r.fetch(x_n, y, z);
            let x1 = r.fetch(x_n, y, z_n);
            let x2 = r.fetch(x, y_n, z);
            let x3 = r.fetch(x_n, y_n, z);
            let x4 = r.fetch(x_n, y_n, z_n);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4)
        }
    }
}

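// Two-table prismatic kernel: fetches the five corners from each table, packs
// each pair into a 256-bit vector with `AvxVector::from_sse`, evaluates both
// interpolations at once, then splits the result back into per-table halves.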
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxFmaDouble<GRID_SIZE> {
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
        r0: impl Fetcher<AvxVectorSse>,
        r1: impl Fetcher<AvxVectorSse>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = unsafe { lut.get_unchecked(in_r) };
        let lut_g = unsafe { lut.get_unchecked(in_g) };
        let lut_b = unsafe { lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        // The base corner must be fetched from each table separately.
        let c0_0 = r0.fetch(x, y, z);
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVector::from(db);
        let w1 = AvxVector::from(dr);
        let w2 = AvxVector::from(dg);
        let w3 = AvxVector::from(dg * db);
        let w4 = AvxVector::from(dr * dg);

        let c0 = AvxVector::from_sse(c0_0, c0_1);

        if db > dr {
            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x, y_n, z_n);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x, y_n, z_n);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);
            let x4 = AvxVector::from_sse(x4_0, x4_1);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        } else {
            let x0_0 = r0.fetch(x_n, y, z);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x_n, y_n, z);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x_n, y, z);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x_n, y_n, z);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);
            let x4 = AvxVector::from_sse(x4_0, x4_1);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        }
    }
}

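// Two-table pyramidal kernel: same region selection as the single-table
// version, with both tables' corners packed into one 256-bit vector.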
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidAvxFmaDouble<GRID_SIZE> {
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
        r0: impl Fetcher<AvxVectorSse>,
        r1: impl Fetcher<AvxVectorSse>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = unsafe { lut.get_unchecked(in_r) };
        let lut_g = unsafe { lut.get_unchecked(in_g) };
        let lut_b = unsafe { lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0_0 = r0.fetch(x, y, z);
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVector::from(db);
        let w1 = AvxVector::from(dr);
        let w2 = AvxVector::from(dg);

        let c0 = AvxVector::from_sse(c0_0, c0_1);

        if dr > db && dg > db {
            let w3 = AvxVector::from(dr * dg);

            let x0_0 = r0.fetch(x_n, y_n, z_n);
            let x1_0 = r0.fetch(x_n, y_n, z);
            let x2_0 = r0.fetch(x_n, y, z);
            let x3_0 = r0.fetch(x, y_n, z);

            let x0_1 = r1.fetch(x_n, y_n, z_n);
            let x1_1 = r1.fetch(x_n, y_n, z);
            let x2_1 = r1.fetch(x_n, y, z);
            let x3_1 = r1.fetch(x, y_n, z);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        } else if db > dr && dg > dr {
            let w3 = AvxVector::from(dg * db);

            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y_n, z_n);
            let x2_0 = r0.fetch(x, y_n, z_n);
            let x3_0 = r0.fetch(x, y_n, z);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y_n, z_n);
            let x2_1 = r1.fetch(x, y_n, z_n);
            let x3_1 = r1.fetch(x, y_n, z);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        } else {
            let w3 = AvxVector::from(db * dr);

            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z);
            let x2_0 = r0.fetch(x_n, y, z_n);
            let x3_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z);
            let x2_1 = r1.fetch(x_n, y, z_n);
            let x3_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVector::from_sse(x0_0, x0_1);
            let x1 = AvxVector::from_sse(x1_0, x1_1);
            let x2 = AvxVector::from_sse(x2_0, x2_1);
            let x3 = AvxVector::from_sse(x3_0, x3_1);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        }
    }
}

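// Two-table tetrahedral kernel: the AVX fetcher loads the same node from both
// tables into one 256-bit register, so the six-case selection runs only once.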
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxFmaDouble<GRID_SIZE> {
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
        rv: impl Fetcher<AvxVector>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = unsafe { lut.get_unchecked(in_r) };
        let lut_g = unsafe { lut.get_unchecked(in_g) };
        let lut_b = unsafe { lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let c0 = rv.fetch(x, y, z);

        let w0 = AvxVector::from(rx);
        let w1 = AvxVector::from(ry);
        let w2 = AvxVector::from(rz);

        let c1;
        let c2;
        let c3;
        if rx >= ry {
            if ry >= rz {
                c1 = rv.fetch(x_n, y, z) - c0;
                c2 = rv.fetch(x_n, y_n, z) - rv.fetch(x_n, y, z);
                c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
            } else if rx >= rz {
                c1 = rv.fetch(x_n, y, z) - c0;
                c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
                c3 = rv.fetch(x_n, y, z_n) - rv.fetch(x_n, y, z);
            } else {
                c1 = rv.fetch(x_n, y, z_n) - rv.fetch(x, y, z_n);
                c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
                c3 = rv.fetch(x, y, z_n) - c0;
            }
        } else if rx >= rz {
            c1 = rv.fetch(x_n, y_n, z) - rv.fetch(x, y_n, z);
            c2 = rv.fetch(x, y_n, z) - c0;
            c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
        } else if ry >= rz {
            c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
            c2 = rv.fetch(x, y_n, z) - c0;
            c3 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y_n, z);
        } else {
            c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
            c2 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y, z_n);
            c3 = rv.fetch(x, y, z_n) - c0;
        }
        let s0 = c0.mla(c1, w0);
        let s1 = s0.mla(c2, w1);
        s1.mla(c3, w2).split()
    }
}

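// Trilinear interpolation as three chained lerps along x, y and z, where
// lerp(a, b, t) is computed as a.neg_mla(a, t).mla(b, t) = a - a * t + b * t.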
impl<const GRID_SIZE: usize> TrilinearAvxFmaDouble<GRID_SIZE> {
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
        rv: impl Fetcher<AvxVector>,
    ) -> (AvxVectorSse, AvxVectorSse) {
        let lut_r = unsafe { lut.get_unchecked(in_r) };
        let lut_g = unsafe { lut.get_unchecked(in_g) };
        let lut_b = unsafe { lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let w0 = AvxVector::from(rx);
        let w1 = AvxVector::from(ry);
        let w2 = AvxVector::from(rz);

        let c000 = rv.fetch(x, y, z);
        let c100 = rv.fetch(x_n, y, z);
        let c010 = rv.fetch(x, y_n, z);
        let c110 = rv.fetch(x_n, y_n, z);
        let c001 = rv.fetch(x, y, z_n);
        let c101 = rv.fetch(x_n, y, z_n);
        let c011 = rv.fetch(x, y_n, z_n);
        let c111 = rv.fetch(x_n, y_n, z_n);

        let c00 = c000.neg_mla(c000, w0).mla(c100, w0);
        let c10 = c010.neg_mla(c010, w0).mla(c110, w0);
        let c01 = c001.neg_mla(c001, w0).mla(c101, w0);
        let c11 = c011.neg_mla(c011, w0).mla(c111, w0);

        let c0 = c00.neg_mla(c00, w1).mla(c10, w1);
        let c1 = c01.neg_mla(c01, w1).mla(c11, w1);

        c0.neg_mla(c0, w2).mla(c1, w2).split()
    }
}

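// Single-table trilinear kernel: the z and z_n samples of each (x, y) corner
// are packed into the two halves of one AVX register, so the x and y lerps run
// on both z planes at once; the final z lerp happens after the split.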
impl<const GRID_SIZE: usize> TrilinearAvxFma<GRID_SIZE> {
    #[target_feature(enable = "avx2", enable = "fma")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<f32>],
        r: impl Fetcher<AvxVectorSse>,
    ) -> AvxVectorSse {
        let lut_r = unsafe { lut.get_unchecked(in_r) };
        let lut_g = unsafe { lut.get_unchecked(in_g) };
        let lut_b = unsafe { lut.get_unchecked(in_b) };

        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let w0 = AvxVector::from(dr);
        let w1 = AvxVector::from(dg);
        let w2 = AvxVectorSse::from(db);

        let c000 = r.fetch(x, y, z);
        let c100 = r.fetch(x_n, y, z);
        let c010 = r.fetch(x, y_n, z);
        let c110 = r.fetch(x_n, y_n, z);
        let c001 = r.fetch(x, y, z_n);
        let c101 = r.fetch(x_n, y, z_n);
        let c011 = r.fetch(x, y_n, z_n);
        let c111 = r.fetch(x_n, y_n, z_n);

        // Pack the z and z_n samples of each (x, y) corner into one register;
        // the digits in the names are the (x, y) indices of the corner.
        let x00 = AvxVector::from_sse(c000, c001);
        let x01 = AvxVector::from_sse(c010, c011);
        let x10 = AvxVector::from_sse(c100, c101);
        let x11 = AvxVector::from_sse(c110, c111);

        let c00 = x00.neg_mla(x00, w0).mla(x10, w0);
        let c10 = x01.neg_mla(x01, w0).mla(x11, w0);

        let z0 = c00.neg_mla(c00, w1).mla(c10, w1);

        let (c0, c1) = z0.split();

        c0.neg_mla(c0, w2).mla(c1, w2)
    }
}