1use crate::conversions::interpolator::BarycentricWeight;
30use crate::math::FusedMultiplyAdd;
31use std::arch::x86_64::*;
32use std::ops::{Add, Mul, Sub};
33
/// 8-byte-aligned storage for one LUT entry of four `i16` Q0.15 lanes.
/// Sized so a single `_mm_loadu_si64` in the fetchers below reads exactly
/// one entry.
#[repr(align(8), C)]
pub(crate) struct AvxAlignedI16(pub(crate) [i16; 4]);

// The following zero-sized marker types select an interpolation kernel at
// compile time; `GRID_SIZE` is the LUT edge length. The `*Double` variants
// interpolate two LUT tables in one pass.

#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxQ0_15<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PyramidalAvxQ0_15<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxQ0_15<const GRID_SIZE: usize> {}

pub(crate) struct TrilinearAvxQ0_15<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct PrismaticAvxQ0_15Double<const GRID_SIZE: usize> {}

pub(crate) struct TrilinearAvxQ0_15Double<const GRID_SIZE: usize> {}

// NOTE(review): named "PyramidAvxFma…" while the single-table variant is
// "PyramidalAvx…" and no FMA float ops are involved — renaming would touch
// callers, so left as-is.
#[cfg(feature = "options")]
pub(crate) struct PyramidAvxFmaQ0_15Double<const GRID_SIZE: usize> {}

#[cfg(feature = "options")]
pub(crate) struct TetrahedralAvxQ0_15Double<const GRID_SIZE: usize> {}
58
/// Multidimensional Q0.15 interpolation over two LUT tables at once.
/// Returns one 128-bit result per table.
pub(crate) trait AvxMdInterpolationQ0_15Double {
    fn inter3_sse(
        &self,
        table0: &[AvxAlignedI16],
        table1: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse);
}

/// Multidimensional Q0.15 interpolation over a single LUT table.
pub(crate) trait AvxMdInterpolationQ0_15 {
    fn inter3_sse(
        &self,
        table: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> AvxVectorQ0_15Sse;
}

/// Reads the LUT sample at integer lattice coordinates `(x, y, z)`.
trait Fetcher<T> {
    fn fetch(&self, x: i32, y: i32, z: i32) -> T;
}
85
/// Eight Q0.15 lanes in a 128-bit SSE register (the kernels here use the
/// low four lanes loaded from one `AvxAlignedI16`).
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorQ0_15Sse {
    pub(crate) v: __m128i,
}

/// Sixteen Q0.15 lanes in a 256-bit AVX register; used to process two
/// 128-bit halves with shared weight arithmetic.
#[derive(Copy, Clone)]
#[repr(transparent)]
pub(crate) struct AvxVectorQ0_15 {
    pub(crate) v: __m256i,
}
97
impl AvxVectorQ0_15 {
    /// Packs two 128-bit vectors into one 256-bit vector:
    /// `lo` becomes the low lane, `hi` the high lane.
    #[inline(always)]
    pub(crate) fn from_sse(lo: AvxVectorQ0_15Sse, hi: AvxVectorQ0_15Sse) -> AvxVectorQ0_15 {
        unsafe {
            AvxVectorQ0_15 {
                v: _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(lo.v), hi.v),
            }
        }
    }

    /// Inverse of [`Self::from_sse`]: returns `(low lane, high lane)`.
    #[inline(always)]
    pub(crate) fn split(self) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        unsafe {
            (
                AvxVectorQ0_15Sse {
                    v: _mm256_castsi256_si128(self.v),
                },
                AvxVectorQ0_15Sse {
                    v: _mm256_extracti128_si256::<1>(self.v),
                },
            )
        }
    }
}
122
/// Broadcasts a Q0.15 scalar to all lanes.
impl From<i16> for AvxVectorQ0_15Sse {
    #[inline(always)]
    fn from(v: i16) -> Self {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_set1_epi16(v) },
        }
    }
}

/// Broadcasts a Q0.15 scalar to all lanes.
impl From<i16> for AvxVectorQ0_15 {
    #[inline(always)]
    fn from(v: i16) -> Self {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_set1_epi16(v) },
        }
    }
}

/// Lane-wise wrapping i16 subtraction.
impl Sub<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
    type Output = Self;
    #[inline(always)]
    fn sub(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_sub_epi16(self.v, rhs.v) },
        }
    }
}

/// Lane-wise wrapping i16 subtraction.
impl Sub<AvxVectorQ0_15> for AvxVectorQ0_15 {
    type Output = Self;
    #[inline(always)]
    fn sub(self, rhs: AvxVectorQ0_15) -> Self::Output {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_sub_epi16(self.v, rhs.v) },
        }
    }
}

/// Lane-wise wrapping i16 addition.
impl Add<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_add_epi16(self.v, rhs.v) },
        }
    }
}

/// Rounded Q0.15 multiply: `_mm_mulhrs_epi16` computes
/// `(a * b + 0x4000) >> 15` per lane.
impl Mul<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
    type Output = Self;
    #[inline(always)]
    fn mul(self, rhs: AvxVectorQ0_15Sse) -> Self::Output {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_mulhrs_epi16(self.v, rhs.v) },
        }
    }
}

/// Lane-wise wrapping i16 addition.
impl Add<AvxVectorQ0_15> for AvxVectorQ0_15 {
    type Output = Self;
    #[inline(always)]
    fn add(self, rhs: AvxVectorQ0_15) -> Self::Output {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_add_epi16(self.v, rhs.v) },
        }
    }
}

/// Rounded Q0.15 multiply (256-bit `mulhrs`).
impl Mul<AvxVectorQ0_15> for AvxVectorQ0_15 {
    type Output = Self;
    #[inline(always)]
    fn mul(self, rhs: AvxVectorQ0_15) -> Self::Output {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_mulhrs_epi16(self.v, rhs.v) },
        }
    }
}

/// `self + b * c` with the rounded Q0.15 product; lowered as
/// `mulhrs` + `add` (no single hardware instruction).
impl FusedMultiplyAdd<AvxVectorQ0_15Sse> for AvxVectorQ0_15Sse {
    #[inline(always)]
    fn mla(&self, b: AvxVectorQ0_15Sse, c: AvxVectorQ0_15Sse) -> AvxVectorQ0_15Sse {
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_add_epi16(_mm_mulhrs_epi16(b.v, c.v), self.v) },
        }
    }
}

/// `self + b * c` with the rounded Q0.15 product (256-bit form).
impl FusedMultiplyAdd<AvxVectorQ0_15> for AvxVectorQ0_15 {
    #[inline(always)]
    fn mla(&self, b: AvxVectorQ0_15, c: AvxVectorQ0_15) -> AvxVectorQ0_15 {
        AvxVectorQ0_15 {
            v: unsafe { _mm256_add_epi16(_mm256_mulhrs_epi16(b.v, c.v), self.v) },
        }
    }
}
218
/// Fetcher over a single LUT table; yields 128-bit samples.
struct TetrahedralAvxSseFetchVector<'a, const GRID_SIZE: usize> {
    cube: &'a [AvxAlignedI16],
}

/// Fetcher over two LUT tables at once; yields 256-bit samples with
/// `cube0`'s entry in the low lane and `cube1`'s in the high lane.
struct TetrahedralAvxFetchVector<'a, const GRID_SIZE: usize> {
    cube0: &'a [AvxAlignedI16],
    cube1: &'a [AvxAlignedI16],
}

impl<const GRID_SIZE: usize> Fetcher<AvxVectorQ0_15> for TetrahedralAvxFetchVector<'_, GRID_SIZE> {
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorQ0_15 {
        // Row-major index: x * N^2 + y * N + z.
        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
            + y as u32 * GRID_SIZE as u32
            + z as u32) as usize;
        // NOTE(review): unchecked indexing — callers must guarantee the
        // coordinates lie inside the GRID_SIZE^3 cube; verify at call sites.
        let jx0 = unsafe { self.cube0.get_unchecked(offset..) };
        let jx1 = unsafe { self.cube1.get_unchecked(offset..) };
        AvxVectorQ0_15 {
            v: unsafe {
                // Two 64-bit entry loads packed into one 256-bit vector.
                _mm256_inserti128_si256::<1>(
                    _mm256_castsi128_si256(_mm_loadu_si64(jx0.as_ptr() as *const _)),
                    _mm_loadu_si64(jx1.as_ptr() as *const _),
                )
            },
        }
    }
}

impl<const GRID_SIZE: usize> Fetcher<AvxVectorQ0_15Sse>
    for TetrahedralAvxSseFetchVector<'_, GRID_SIZE>
{
    #[inline(always)]
    fn fetch(&self, x: i32, y: i32, z: i32) -> AvxVectorQ0_15Sse {
        // Row-major index: x * N^2 + y * N + z.
        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
            + y as u32 * GRID_SIZE as u32
            + z as u32) as usize;
        // NOTE(review): unchecked indexing — bounds are the caller's contract.
        let jx = unsafe { self.cube.get_unchecked(offset..) };
        AvxVectorQ0_15Sse {
            v: unsafe { _mm_loadu_si64(jx.as_ptr() as *const _) },
        }
    }
}
261
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxQ0_15<GRID_SIZE> {
    /// Tetrahedral interpolation of one Q0.15 sample from a 3-D LUT.
    ///
    /// Selects one of the six tetrahedra of the enclosing lattice cell from
    /// the ordering of the fractional weights `rx >= ry >= rz`, then blends
    /// the three edge deltas `c1..c3` onto the base corner `c0` with Q0.15
    /// fused multiply-adds.
    ///
    /// NOTE(review): `in_r`/`in_g`/`in_b` must be in bounds of `lut`
    /// (unchecked below) — the caller's contract.
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower lattice corner of the cell…
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        // …upper corner…
        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // …and Q0.15 fractional weights along each axis.
        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let c0 = r.fetch(x, y, z);

        // Edge deltas of the selected tetrahedron.
        let c2;
        let c1;
        let c3;
        if rx >= ry {
            if ry >= rz {
                // rx >= ry >= rz
                c1 = r.fetch(x_n, y, z) - c0;
                c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
                c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
            } else if rx >= rz {
                // rx >= rz > ry
                c1 = r.fetch(x_n, y, z) - c0;
                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
                c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
            } else {
                // rz > rx >= ry
                c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
                c3 = r.fetch(x, y, z_n) - c0;
            }
        } else if rx >= rz {
            // ry > rx >= rz
            c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
            c2 = r.fetch(x, y_n, z) - c0;
            c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
        } else if ry >= rz {
            // ry >= rz > rx
            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
            c2 = r.fetch(x, y_n, z) - c0;
            c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
        } else {
            // rz > ry > rx
            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
            c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
            c3 = r.fetch(x, y, z_n) - c0;
        }
        // result = c0 + c1*rx + c2*ry + c3*rz (Q0.15, rounded).
        let s0 = c0.mla(c1, AvxVectorQ0_15Sse::from(rx));
        let s1 = s0.mla(c2, AvxVectorQ0_15Sse::from(ry));
        s1.mla(c3, AvxVectorQ0_15Sse::from(rz))
    }
}
332
// Implements the single-table trait for a kernel marker type by delegating
// to its inherent `interpolate` with a single-table fetcher.
macro_rules! define_interp_avx {
    ($interpolator: ident) => {
        impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15 for $interpolator<GRID_SIZE> {
            fn inter3_sse(
                &self,
                table: &[AvxAlignedI16],
                in_r: usize,
                in_g: usize,
                in_b: usize,
                lut: &[BarycentricWeight<i16>],
            ) -> AvxVectorQ0_15Sse {
                // SAFETY-NOTE(review): relies on `interpolate`'s avx2
                // target-feature being available — confirm callers gate on
                // CPU feature detection.
                unsafe {
                    self.interpolate(
                        in_r,
                        in_g,
                        in_b,
                        lut,
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table },
                    )
                }
            }
        }
    };
}
357
// Implements the double-table trait for kernels whose inherent `interpolate`
// takes two independent single-table fetchers (one per table).
#[cfg(feature = "options")]
macro_rules! define_interp_avx_d {
    ($interpolator: ident) => {
        impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double for $interpolator<GRID_SIZE> {
            fn inter3_sse(
                &self,
                table0: &[AvxAlignedI16],
                table1: &[AvxAlignedI16],
                in_r: usize,
                in_g: usize,
                in_b: usize,
                lut: &[BarycentricWeight<i16>],
            ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
                // SAFETY-NOTE(review): same avx2 availability contract as
                // `define_interp_avx!`.
                unsafe {
                    self.interpolate(
                        in_r,
                        in_g,
                        in_b,
                        lut,
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table0 },
                        TetrahedralAvxSseFetchVector::<GRID_SIZE> { cube: table1 },
                    )
                }
            }
        }
    };
}
385
// Wire the single-table trait to each single-table kernel…
#[cfg(feature = "options")]
define_interp_avx!(TetrahedralAvxQ0_15);
#[cfg(feature = "options")]
define_interp_avx!(PyramidalAvxQ0_15);
#[cfg(feature = "options")]
define_interp_avx!(PrismaticAvxQ0_15);
define_interp_avx!(TrilinearAvxQ0_15);
// …and the double-table trait to the kernels that take two separate
// fetchers. The tetrahedral/trilinear doubles use a combined 256-bit
// fetcher instead, so their impls are written out by hand below.
#[cfg(feature = "options")]
define_interp_avx_d!(PrismaticAvxQ0_15Double);
#[cfg(feature = "options")]
define_interp_avx_d!(PyramidAvxFmaQ0_15Double);
397
// Hand-written (not via macro) because this kernel fetches both tables in a
// single 256-bit load through `TetrahedralAvxFetchVector`.
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double
    for TetrahedralAvxQ0_15Double<GRID_SIZE>
{
    fn inter3_sse(
        &self,
        table0: &[AvxAlignedI16],
        table1: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        // SAFETY-NOTE(review): relies on avx2 being available at runtime —
        // confirm callers gate on feature detection.
        unsafe {
            self.interpolate(
                in_r,
                in_g,
                in_b,
                lut,
                TetrahedralAvxFetchVector::<GRID_SIZE> {
                    cube0: table0,
                    cube1: table1,
                },
            )
        }
    }
}
425
// Hand-written for the same reason as the tetrahedral double: it uses the
// combined two-table 256-bit fetcher.
impl<const GRID_SIZE: usize> AvxMdInterpolationQ0_15Double for TrilinearAvxQ0_15Double<GRID_SIZE> {
    fn inter3_sse(
        &self,
        table0: &[AvxAlignedI16],
        table1: &[AvxAlignedI16],
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        // SAFETY-NOTE(review): relies on avx2 being available at runtime —
        // confirm callers gate on feature detection.
        unsafe {
            self.interpolate(
                in_r,
                in_g,
                in_b,
                lut,
                TetrahedralAvxFetchVector::<GRID_SIZE> {
                    cube0: table0,
                    cube1: table1,
                },
            )
        }
    }
}
450
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidalAvxQ0_15<GRID_SIZE> {
    /// Pyramidal interpolation of one Q0.15 sample from a 3-D LUT.
    ///
    /// Splits the lattice cell into three pyramids, picks one by comparing
    /// the fractional weights, and evaluates
    /// `c0 + c1*db + c2*dr + c3*dg + c4*w3` where `c4` is the bilinear
    /// cross term of the pyramid's base face.
    ///
    /// NOTE(review): indices must be in bounds of `lut` (unchecked below).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower / upper lattice corners per axis.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Q0.15 fractional weights.
        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let w0 = AvxVectorQ0_15Sse::from(db);
        let w1 = AvxVectorQ0_15Sse::from(dr);
        let w2 = AvxVectorQ0_15Sse::from(dg);

        if dr > db && dg > db {
            // Pyramid with apex along +z: cross weight dr*dg.
            let w3 = AvxVectorQ0_15Sse::from(dr) * AvxVectorQ0_15Sse::from(dg);
            let x0 = r.fetch(x_n, y_n, z_n);
            let x1 = r.fetch(x_n, y_n, z);
            let x2 = r.fetch(x_n, y, z);
            let x3 = r.fetch(x, y_n, z);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        } else if db > dr && dg > dr {
            // Pyramid with apex along +x: cross weight dg*db.
            let w3 = AvxVectorQ0_15Sse::from(dg) * AvxVectorQ0_15Sse::from(db);

            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y_n, z_n);
            let x2 = r.fetch(x, y_n, z_n);
            let x3 = r.fetch(x, y_n, z);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        } else {
            // Remaining pyramid: cross weight db*dr.
            let w3 = AvxVectorQ0_15Sse::from(db) * AvxVectorQ0_15Sse::from(dr);

            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y, z);
            let x2 = r.fetch(x_n, y, z_n);
            let x3 = r.fetch(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3)
        }
    }
}
537
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxQ0_15<GRID_SIZE> {
    /// Prismatic interpolation of one Q0.15 sample from a 3-D LUT.
    ///
    /// Splits the cell into two triangular prisms by the `db > dr` diagonal
    /// and evaluates five blended terms:
    /// `c0 + c1*db + c2*dr + c3*dg + c4*(dg*db) + c5*(dr*dg)`.
    ///
    /// NOTE(review): indices must be in bounds of `lut` (unchecked below).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> AvxVectorQ0_15Sse {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower / upper lattice corners per axis.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Q0.15 fractional weights.
        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0 = r.fetch(x, y, z);

        let w0 = AvxVectorQ0_15Sse::from(db);
        let w1 = AvxVectorQ0_15Sse::from(dr);
        let w2 = AvxVectorQ0_15Sse::from(dg);
        let w3 = AvxVectorQ0_15Sse::from(dg) * AvxVectorQ0_15Sse::from(db);
        let w4 = AvxVectorQ0_15Sse::from(dr) * AvxVectorQ0_15Sse::from(dg);

        if db > dr {
            // Prism on the z-major side of the diagonal.
            let x0 = r.fetch(x, y, z_n);
            let x1 = r.fetch(x_n, y, z_n);
            let x2 = r.fetch(x, y_n, z);
            let x3 = r.fetch(x, y_n, z_n);
            let x4 = r.fetch(x_n, y_n, z_n);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4)
        } else {
            // Prism on the x-major side of the diagonal.
            let x0 = r.fetch(x_n, y, z);
            let x1 = r.fetch(x_n, y, z_n);
            let x2 = r.fetch(x, y_n, z);
            let x3 = r.fetch(x_n, y_n, z);
            let x4 = r.fetch(x_n, y_n, z_n);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4)
        }
    }
}
612
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PrismaticAvxQ0_15Double<GRID_SIZE> {
    /// Prismatic interpolation of two LUT tables at once in Q0.15 fixed
    /// point.
    ///
    /// Each pair of per-table 128-bit samples is packed into one 256-bit
    /// vector (table0 in the low lane, table1 in the high lane) so both
    /// tables share the weight arithmetic; the blended result is split back
    /// into the two 128-bit halves.
    ///
    /// `r0` fetches from the first table, `r1` from the second.
    ///
    /// NOTE(review): indices must be in bounds of `lut` (unchecked below).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r0: impl Fetcher<AvxVectorQ0_15Sse>,
        r1: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower / upper lattice corners per axis.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Q0.15 fractional weights.
        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0_0 = r0.fetch(x, y, z);
        // BUGFIX: the second table's base corner must come from `r1`; this
        // previously fetched from `r0` again, corrupting the second output
        // whenever the tables differ (cf. PyramidAvxFmaQ0_15Double).
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVectorQ0_15::from(db);
        let w1 = AvxVectorQ0_15::from(dr);
        let w2 = AvxVectorQ0_15::from(dg);
        let w3 = AvxVectorQ0_15::from(dg) * AvxVectorQ0_15::from(db);
        let w4 = AvxVectorQ0_15::from(dr) * AvxVectorQ0_15::from(dg);

        let c0 = AvxVectorQ0_15::from_sse(c0_0, c0_1);

        if db > dr {
            // Prism on the z-major side of the diagonal.
            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x, y_n, z_n);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x, y_n, z_n);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);
            let x4 = AvxVectorQ0_15::from_sse(x4_0, x4_1);

            let c1 = x0 - c0;
            let c2 = x1 - x0;
            let c3 = x2 - c0;
            let c4 = c0 - x2 - x0 + x3;
            let c5 = x0 - x3 - x1 + x4;

            // c0 + c1*db + c2*dr + c3*dg + c4*(dg*db) + c5*(dr*dg)
            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        } else {
            // Prism on the x-major side of the diagonal.
            let x0_0 = r0.fetch(x_n, y, z);
            let x1_0 = r0.fetch(x_n, y, z_n);
            let x2_0 = r0.fetch(x, y_n, z);
            let x3_0 = r0.fetch(x_n, y_n, z);
            let x4_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x_n, y, z);
            let x1_1 = r1.fetch(x_n, y, z_n);
            let x2_1 = r1.fetch(x, y_n, z);
            let x3_1 = r1.fetch(x_n, y_n, z);
            let x4_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);
            let x4 = AvxVectorQ0_15::from_sse(x4_0, x4_1);

            let c1 = x1 - x0;
            let c2 = x0 - c0;
            let c3 = x2 - c0;
            let c4 = x0 - x3 - x1 + x4;
            let c5 = c0 - x2 - x0 + x3;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            let s3 = s2.mla(c4, w3);
            s3.mla(c5, w4).split()
        }
    }
}
715
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> PyramidAvxFmaQ0_15Double<GRID_SIZE> {
    /// Pyramidal interpolation of two LUT tables at once in Q0.15 fixed
    /// point; per-table samples are packed low/high into 256-bit vectors so
    /// both tables share the weight arithmetic, then split back.
    ///
    /// `r0` fetches from the first table, `r1` from the second.
    ///
    /// NOTE(review): indices must be in bounds of `lut` (unchecked below).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        r0: impl Fetcher<AvxVectorQ0_15Sse>,
        r1: impl Fetcher<AvxVectorQ0_15Sse>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower / upper lattice corners per axis.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Q0.15 fractional weights.
        let dr = lut_r.w;
        let dg = lut_g.w;
        let db = lut_b.w;

        let c0_0 = r0.fetch(x, y, z);
        let c0_1 = r1.fetch(x, y, z);

        let w0 = AvxVectorQ0_15::from(db);
        let w1 = AvxVectorQ0_15::from(dr);
        let w2 = AvxVectorQ0_15::from(dg);

        let c0 = AvxVectorQ0_15::from_sse(c0_0, c0_1);

        if dr > db && dg > db {
            // Pyramid selected when db is smallest: cross weight dr*dg.
            let w3 = AvxVectorQ0_15::from(dr) * AvxVectorQ0_15::from(dg);

            let x0_0 = r0.fetch(x_n, y_n, z_n);
            let x1_0 = r0.fetch(x_n, y_n, z);
            let x2_0 = r0.fetch(x_n, y, z);
            let x3_0 = r0.fetch(x, y_n, z);

            let x0_1 = r1.fetch(x_n, y_n, z_n);
            let x1_1 = r1.fetch(x_n, y_n, z);
            let x2_1 = r1.fetch(x_n, y, z);
            let x3_1 = r1.fetch(x, y_n, z);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);

            let c1 = x0 - x1;
            let c2 = x2 - c0;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x2 + x1;

            // c0 + c1*db + c2*dr + c3*dg + c4*w3
            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        } else if db > dr && dg > dr {
            // Pyramid selected when dr is smallest: cross weight dg*db.
            let w3 = AvxVectorQ0_15::from(dg) * AvxVectorQ0_15::from(db);

            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y_n, z_n);
            let x2_0 = r0.fetch(x, y_n, z_n);
            let x3_0 = r0.fetch(x, y_n, z);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y_n, z_n);
            let x2_1 = r1.fetch(x, y_n, z_n);
            let x3_1 = r1.fetch(x, y_n, z);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);

            let c1 = x0 - c0;
            let c2 = x1 - x2;
            let c3 = x3 - c0;
            let c4 = c0 - x3 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        } else {
            // Remaining pyramid: cross weight db*dr.
            let w3 = AvxVectorQ0_15::from(db) * AvxVectorQ0_15::from(dr);

            let x0_0 = r0.fetch(x, y, z_n);
            let x1_0 = r0.fetch(x_n, y, z);
            let x2_0 = r0.fetch(x_n, y, z_n);
            let x3_0 = r0.fetch(x_n, y_n, z_n);

            let x0_1 = r1.fetch(x, y, z_n);
            let x1_1 = r1.fetch(x_n, y, z);
            let x2_1 = r1.fetch(x_n, y, z_n);
            let x3_1 = r1.fetch(x_n, y_n, z_n);

            let x0 = AvxVectorQ0_15::from_sse(x0_0, x0_1);
            let x1 = AvxVectorQ0_15::from_sse(x1_0, x1_1);
            let x2 = AvxVectorQ0_15::from_sse(x2_0, x2_1);
            let x3 = AvxVectorQ0_15::from_sse(x3_0, x3_1);

            let c1 = x0 - c0;
            let c2 = x1 - c0;
            let c3 = x3 - x2;
            let c4 = c0 - x1 - x0 + x2;

            let s0 = c0.mla(c1, w0);
            let s1 = s0.mla(c2, w1);
            let s2 = s1.mla(c3, w2);
            s2.mla(c4, w3).split()
        }
    }
}
837
#[cfg(feature = "options")]
impl<const GRID_SIZE: usize> TetrahedralAvxQ0_15Double<GRID_SIZE> {
    /// Tetrahedral interpolation of two LUT tables at once; `rv` yields
    /// 256-bit samples carrying both tables (low lane table0, high lane
    /// table1), so one pass of the Q0.15 blend serves both. Tetrahedron
    /// selection mirrors [`TetrahedralAvxQ0_15::interpolate`].
    ///
    /// NOTE(review): indices must be in bounds of `lut` (unchecked below).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        rv: impl Fetcher<AvxVectorQ0_15>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower / upper lattice corners per axis.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Q0.15 fractional weights.
        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        let c0 = rv.fetch(x, y, z);

        let w0 = AvxVectorQ0_15::from(rx);
        let w1 = AvxVectorQ0_15::from(ry);
        let w2 = AvxVectorQ0_15::from(rz);

        // Edge deltas of the tetrahedron selected by the weight ordering.
        let c2;
        let c1;
        let c3;
        if rx >= ry {
            if ry >= rz {
                // rx >= ry >= rz
                c1 = rv.fetch(x_n, y, z) - c0;
                c2 = rv.fetch(x_n, y_n, z) - rv.fetch(x_n, y, z);
                c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
            } else if rx >= rz {
                // rx >= rz > ry
                c1 = rv.fetch(x_n, y, z) - c0;
                c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
                c3 = rv.fetch(x_n, y, z_n) - rv.fetch(x_n, y, z);
            } else {
                // rz > rx >= ry
                c1 = rv.fetch(x_n, y, z_n) - rv.fetch(x, y, z_n);
                c2 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y, z_n);
                c3 = rv.fetch(x, y, z_n) - c0;
            }
        } else if rx >= rz {
            // ry > rx >= rz
            c1 = rv.fetch(x_n, y_n, z) - rv.fetch(x, y_n, z);
            c2 = rv.fetch(x, y_n, z) - c0;
            c3 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x_n, y_n, z);
        } else if ry >= rz {
            // ry >= rz > rx
            c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
            c2 = rv.fetch(x, y_n, z) - c0;
            c3 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y_n, z);
        } else {
            // rz > ry > rx
            c1 = rv.fetch(x_n, y_n, z_n) - rv.fetch(x, y_n, z_n);
            c2 = rv.fetch(x, y_n, z_n) - rv.fetch(x, y, z_n);
            c3 = rv.fetch(x, y, z_n) - c0;
        }
        // result = c0 + c1*rx + c2*ry + c3*rz, split into per-table halves.
        let s0 = c0.mla(c1, w0);
        let s1 = s0.mla(c2, w1);
        s1.mla(c3, w2).split()
    }
}
912
impl<const GRID_SIZE: usize> TrilinearAvxQ0_15Double<GRID_SIZE> {
    /// Trilinear interpolation of two LUT tables at once; `rv` yields
    /// 256-bit samples carrying both tables. Standard reduction order:
    /// x first, then y, then z, each lerp done as
    /// `a*(Q_MAX - w) + b*w` with rounded Q0.15 multiplies.
    ///
    /// NOTE(review): indices must be in bounds of `lut` (unchecked below).
    #[target_feature(enable = "avx2")]
    unsafe fn interpolate(
        &self,
        in_r: usize,
        in_g: usize,
        in_b: usize,
        lut: &[BarycentricWeight<i16>],
        rv: impl Fetcher<AvxVectorQ0_15>,
    ) -> (AvxVectorQ0_15Sse, AvxVectorQ0_15Sse) {
        let lut_r = unsafe { *lut.get_unchecked(in_r) };
        let lut_g = unsafe { *lut.get_unchecked(in_g) };
        let lut_b = unsafe { *lut.get_unchecked(in_b) };

        // Lower / upper lattice corners per axis.
        let x: i32 = lut_r.x;
        let y: i32 = lut_g.x;
        let z: i32 = lut_b.x;

        let x_n: i32 = lut_r.x_n;
        let y_n: i32 = lut_g.x_n;
        let z_n: i32 = lut_b.x_n;

        // Q0.15 fractional weights.
        let rx = lut_r.w;
        let ry = lut_g.w;
        let rz = lut_b.w;

        // 0.99997 in Q0.15 — stands in for 1.0 when forming (1 - w).
        const Q_MAX: i16 = ((1i32 << 15i32) - 1) as i16;

        let q_max = AvxVectorQ0_15::from(Q_MAX);
        let w0 = AvxVectorQ0_15::from(rx);
        let w1 = AvxVectorQ0_15::from(ry);
        let w2 = AvxVectorQ0_15::from(rz);
        // Complementary weights (1 - w) per axis.
        let dx = q_max - w0;
        let dy = q_max - w1;
        let dz = q_max - w2;

        // All eight cell corners (c{x}{y}{z} naming).
        let c000 = rv.fetch(x, y, z);
        let c100 = rv.fetch(x_n, y, z);
        let c010 = rv.fetch(x, y_n, z);
        let c110 = rv.fetch(x_n, y_n, z);
        let c001 = rv.fetch(x, y, z_n);
        let c101 = rv.fetch(x_n, y, z_n);
        let c011 = rv.fetch(x, y_n, z_n);
        let c111 = rv.fetch(x_n, y_n, z_n);

        // Reduce along x…
        let c00 = (c000 * dx).mla(c100, w0);
        let c10 = (c010 * dx).mla(c110, w0);
        let c01 = (c001 * dx).mla(c101, w0);
        let c11 = (c011 * dx).mla(c111, w0);

        // …then y…
        let c0 = (c00 * dy).mla(c10, w1);
        let c1 = (c01 * dy).mla(c11, w1);

        // …then z, and split into per-table halves.
        (c0 * dz).mla(c1, w2).split()
    }
}
969
970impl<const GRID_SIZE: usize> TrilinearAvxQ0_15<GRID_SIZE> {
971 #[target_feature(enable = "avx2")]
972 unsafe fn interpolate(
973 &self,
974 in_r: usize,
975 in_g: usize,
976 in_b: usize,
977 lut: &[BarycentricWeight<i16>],
978 r: impl Fetcher<AvxVectorQ0_15Sse>,
979 ) -> AvxVectorQ0_15Sse {
980 let lut_r = unsafe { *lut.get_unchecked(in_r) };
981 let lut_g = unsafe { *lut.get_unchecked(in_g) };
982 let lut_b = unsafe { *lut.get_unchecked(in_b) };
983
984 let x: i32 = lut_r.x;
985 let y: i32 = lut_g.x;
986 let z: i32 = lut_b.x;
987
988 let x_n: i32 = lut_r.x_n;
989 let y_n: i32 = lut_g.x_n;
990 let z_n: i32 = lut_b.x_n;
991
992 let dr = lut_r.w;
993 let dg = lut_g.w;
994 let db = lut_b.w;
995
996 const Q_MAX: i16 = ((1i32 << 15i32) - 1) as i16;
997
998 let q_max = AvxVectorQ0_15Sse::from(Q_MAX);
999 let q_max_avx = AvxVectorQ0_15::from(Q_MAX);
1000 let w0 = AvxVectorQ0_15::from(dr);
1001 let w1 = AvxVectorQ0_15::from(dg);
1002 let w2 = AvxVectorQ0_15Sse::from(db);
1003 let dx = q_max_avx - w0;
1004 let dy = q_max_avx - w1;
1005 let dz = q_max - w2;
1006
1007 let c000 = r.fetch(x, y, z);
1008 let c100 = r.fetch(x_n, y, z);
1009 let c010 = r.fetch(x, y_n, z);
1010 let c110 = r.fetch(x_n, y_n, z);
1011 let c001 = r.fetch(x, y, z_n);
1012 let c101 = r.fetch(x_n, y, z_n);
1013 let c011 = r.fetch(x, y_n, z_n);
1014 let c111 = r.fetch(x_n, y_n, z_n);
1015
1016 let x000 = AvxVectorQ0_15::from_sse(c000, c001);
1017 let x010 = AvxVectorQ0_15::from_sse(c010, c011);
1018 let x011 = AvxVectorQ0_15::from_sse(c100, c101);
1019 let x111 = AvxVectorQ0_15::from_sse(c110, c111);
1020
1021 let c00 = (x000 * dx).mla(x011, w0);
1022 let c10 = (x010 * dx).mla(x111, w0);
1023
1024 let c0 = (c00 * dy).mla(c10, w1);
1025
1026 let (c0, c1) = c0.split();
1027
1028 (c0 * dz).mla(c1, w2)
1029 }
1030}