moxcms/conversions/sse/
interpolator.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::conversions::interpolator::BarycentricWeight;
30use crate::math::FusedMultiplyAdd;
31#[cfg(target_arch = "x86")]
32use std::arch::x86::*;
33#[cfg(target_arch = "x86_64")]
34use std::arch::x86_64::*;
35use std::ops::{Add, Mul, Sub};
36
37#[repr(align(16), C)]
38pub(crate) struct SseAlignedF32(pub(crate) [f32; 4]);
39
40#[cfg(feature = "options")]
41pub(crate) struct TetrahedralSse<const GRID_SIZE: usize> {}
42
43#[cfg(feature = "options")]
44pub(crate) struct PyramidalSse<const GRID_SIZE: usize> {}
45
46#[cfg(feature = "options")]
47pub(crate) struct PrismaticSse<const GRID_SIZE: usize> {}
48
49pub(crate) struct TrilinearSse<const GRID_SIZE: usize> {}
50
51trait Fetcher<T> {
52    fn fetch(&self, x: i32, y: i32, z: i32) -> T;
53}
54
55#[derive(Copy, Clone)]
56#[repr(transparent)]
57pub(crate) struct SseVector {
58    pub(crate) v: __m128,
59}
60
61impl From<f32> for SseVector {
62    #[inline(always)]
63    fn from(v: f32) -> Self {
64        SseVector {
65            v: unsafe { _mm_set1_ps(v) },
66        }
67    }
68}
69
70impl Sub<SseVector> for SseVector {
71    type Output = Self;
72    #[inline(always)]
73    fn sub(self, rhs: SseVector) -> Self::Output {
74        SseVector {
75            v: unsafe { _mm_sub_ps(self.v, rhs.v) },
76        }
77    }
78}
79
80impl Add<SseVector> for SseVector {
81    type Output = Self;
82    #[inline(always)]
83    fn add(self, rhs: SseVector) -> Self::Output {
84        SseVector {
85            v: unsafe { _mm_add_ps(self.v, rhs.v) },
86        }
87    }
88}
89
90impl Mul<SseVector> for SseVector {
91    type Output = Self;
92    #[inline(always)]
93    fn mul(self, rhs: SseVector) -> Self::Output {
94        SseVector {
95            v: unsafe { _mm_mul_ps(self.v, rhs.v) },
96        }
97    }
98}
99
100impl FusedMultiplyAdd<SseVector> for SseVector {
101    #[inline(always)]
102    fn mla(&self, b: SseVector, c: SseVector) -> SseVector {
103        SseVector {
104            v: unsafe { _mm_add_ps(self.v, _mm_mul_ps(b.v, c.v)) },
105        }
106    }
107}
108
109struct TetrahedralSseFetchVector<'a, const GRID_SIZE: usize> {
110    cube: &'a [SseAlignedF32],
111}
112
113impl<const GRID_SIZE: usize> Fetcher<SseVector> for TetrahedralSseFetchVector<'_, GRID_SIZE> {
114    #[inline(always)]
115    fn fetch(&self, x: i32, y: i32, z: i32) -> SseVector {
116        let offset = (x as u32 * (GRID_SIZE as u32 * GRID_SIZE as u32)
117            + y as u32 * GRID_SIZE as u32
118            + z as u32) as usize;
119        let jx = unsafe { self.cube.get_unchecked(offset..) };
120        SseVector {
121            v: unsafe { _mm_load_ps(jx.as_ptr() as *const _) },
122        }
123    }
124}
125
126pub(crate) trait SseMdInterpolation {
127    fn inter3_sse(
128        &self,
129        table: &[SseAlignedF32],
130        in_r: usize,
131        in_g: usize,
132        in_b: usize,
133        lut: &[BarycentricWeight<f32>],
134    ) -> SseVector;
135}
136
137#[cfg(feature = "options")]
138impl<const GRID_SIZE: usize> TetrahedralSse<GRID_SIZE> {
139    #[target_feature(enable = "sse4.1")]
140    unsafe fn interpolate(
141        &self,
142        in_r: usize,
143        in_g: usize,
144        in_b: usize,
145        lut: &[BarycentricWeight<f32>],
146        r: impl Fetcher<SseVector>,
147    ) -> SseVector {
148        let lut_r = unsafe { *lut.get_unchecked(in_r) };
149        let lut_g = unsafe { *lut.get_unchecked(in_g) };
150        let lut_b = unsafe { *lut.get_unchecked(in_b) };
151
152        let x: i32 = lut_r.x;
153        let y: i32 = lut_g.x;
154        let z: i32 = lut_b.x;
155
156        let x_n: i32 = lut_r.x_n;
157        let y_n: i32 = lut_g.x_n;
158        let z_n: i32 = lut_b.x_n;
159
160        let rx = lut_r.w;
161        let ry = lut_g.w;
162        let rz = lut_b.w;
163
164        let c0 = r.fetch(x, y, z);
165
166        let c2;
167        let c1;
168        let c3;
169        if rx >= ry {
170            if ry >= rz {
171                //rx >= ry && ry >= rz
172                c1 = r.fetch(x_n, y, z) - c0;
173                c2 = r.fetch(x_n, y_n, z) - r.fetch(x_n, y, z);
174                c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
175            } else if rx >= rz {
176                //rx >= rz && rz >= ry
177                c1 = r.fetch(x_n, y, z) - c0;
178                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
179                c3 = r.fetch(x_n, y, z_n) - r.fetch(x_n, y, z);
180            } else {
181                //rz > rx && rx >= ry
182                c1 = r.fetch(x_n, y, z_n) - r.fetch(x, y, z_n);
183                c2 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y, z_n);
184                c3 = r.fetch(x, y, z_n) - c0;
185            }
186        } else if rx >= rz {
187            //ry > rx && rx >= rz
188            c1 = r.fetch(x_n, y_n, z) - r.fetch(x, y_n, z);
189            c2 = r.fetch(x, y_n, z) - c0;
190            c3 = r.fetch(x_n, y_n, z_n) - r.fetch(x_n, y_n, z);
191        } else if ry >= rz {
192            //ry >= rz && rz > rx
193            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
194            c2 = r.fetch(x, y_n, z) - c0;
195            c3 = r.fetch(x, y_n, z_n) - r.fetch(x, y_n, z);
196        } else {
197            //rz > ry && ry > rx
198            c1 = r.fetch(x_n, y_n, z_n) - r.fetch(x, y_n, z_n);
199            c2 = r.fetch(x, y_n, z_n) - r.fetch(x, y, z_n);
200            c3 = r.fetch(x, y, z_n) - c0;
201        }
202        let s0 = c0.mla(c1, SseVector::from(rx));
203        let s1 = s0.mla(c2, SseVector::from(ry));
204        s1.mla(c3, SseVector::from(rz))
205    }
206}
207
208macro_rules! define_inter_sse {
209    ($interpolator: ident) => {
210        impl<const GRID_SIZE: usize> SseMdInterpolation for $interpolator<GRID_SIZE> {
211            fn inter3_sse(
212                &self,
213                table: &[SseAlignedF32],
214                in_r: usize,
215                in_g: usize,
216                in_b: usize,
217                lut: &[BarycentricWeight<f32>],
218            ) -> SseVector {
219                unsafe {
220                    self.interpolate(
221                        in_r,
222                        in_g,
223                        in_b,
224                        lut,
225                        TetrahedralSseFetchVector::<GRID_SIZE> { cube: table },
226                    )
227                }
228            }
229        }
230    };
231}
232
233#[cfg(feature = "options")]
234define_inter_sse!(TetrahedralSse);
235#[cfg(feature = "options")]
236define_inter_sse!(PyramidalSse);
237#[cfg(feature = "options")]
238define_inter_sse!(PrismaticSse);
239define_inter_sse!(TrilinearSse);
240
241#[cfg(feature = "options")]
242impl<const GRID_SIZE: usize> PyramidalSse<GRID_SIZE> {
243    #[target_feature(enable = "sse4.1")]
244    unsafe fn interpolate(
245        &self,
246        in_r: usize,
247        in_g: usize,
248        in_b: usize,
249        lut: &[BarycentricWeight<f32>],
250        r: impl Fetcher<SseVector>,
251    ) -> SseVector {
252        let lut_r = unsafe { *lut.get_unchecked(in_r) };
253        let lut_g = unsafe { *lut.get_unchecked(in_g) };
254        let lut_b = unsafe { *lut.get_unchecked(in_b) };
255
256        let x: i32 = lut_r.x;
257        let y: i32 = lut_g.x;
258        let z: i32 = lut_b.x;
259
260        let x_n: i32 = lut_r.x_n;
261        let y_n: i32 = lut_g.x_n;
262        let z_n: i32 = lut_b.x_n;
263
264        let dr = lut_r.w;
265        let dg = lut_g.w;
266        let db = lut_b.w;
267
268        let c0 = r.fetch(x, y, z);
269
270        if dr > db && dg > db {
271            let x0 = r.fetch(x_n, y_n, z_n);
272            let x1 = r.fetch(x_n, y_n, z);
273            let x2 = r.fetch(x_n, y, z);
274            let x3 = r.fetch(x, y_n, z);
275
276            let c1 = x0 - x1;
277            let c2 = x2 - c0;
278            let c3 = x3 - c0;
279            let c4 = c0 - x3 - x2 + x1;
280
281            let s0 = c0.mla(c1, SseVector::from(db));
282            let s1 = s0.mla(c2, SseVector::from(dr));
283            let s2 = s1.mla(c3, SseVector::from(dg));
284            s2.mla(c4, SseVector::from(dr * dg))
285        } else if db > dr && dg > dr {
286            let x0 = r.fetch(x, y, z_n);
287            let x1 = r.fetch(x_n, y_n, z_n);
288            let x2 = r.fetch(x, y_n, z_n);
289            let x3 = r.fetch(x, y_n, z);
290
291            let c1 = x0 - c0;
292            let c2 = x1 - x2;
293            let c3 = x3 - c0;
294            let c4 = c0 - x3 - x0 + x2;
295
296            let s0 = c0.mla(c1, SseVector::from(db));
297            let s1 = s0.mla(c2, SseVector::from(dr));
298            let s2 = s1.mla(c3, SseVector::from(dg));
299            s2.mla(c4, SseVector::from(dg * db))
300        } else {
301            let x0 = r.fetch(x, y, z_n);
302            let x1 = r.fetch(x_n, y, z);
303            let x2 = r.fetch(x_n, y, z_n);
304            let x3 = r.fetch(x_n, y_n, z_n);
305
306            let c1 = x0 - c0;
307            let c2 = x1 - c0;
308            let c3 = x3 - x2;
309            let c4 = c0 - x1 - x0 + x2;
310
311            let s0 = c0.mla(c1, SseVector::from(db));
312            let s1 = s0.mla(c2, SseVector::from(dr));
313            let s2 = s1.mla(c3, SseVector::from(dg));
314            s2.mla(c4, SseVector::from(db * dr))
315        }
316    }
317}
318
319#[cfg(feature = "options")]
320impl<const GRID_SIZE: usize> PrismaticSse<GRID_SIZE> {
321    #[target_feature(enable = "sse4.1")]
322    unsafe fn interpolate(
323        &self,
324        in_r: usize,
325        in_g: usize,
326        in_b: usize,
327        lut: &[BarycentricWeight<f32>],
328        r: impl Fetcher<SseVector>,
329    ) -> SseVector {
330        let lut_r = unsafe { *lut.get_unchecked(in_r) };
331        let lut_g = unsafe { *lut.get_unchecked(in_g) };
332        let lut_b = unsafe { *lut.get_unchecked(in_b) };
333
334        let x: i32 = lut_r.x;
335        let y: i32 = lut_g.x;
336        let z: i32 = lut_b.x;
337
338        let x_n: i32 = lut_r.x_n;
339        let y_n: i32 = lut_g.x_n;
340        let z_n: i32 = lut_b.x_n;
341
342        let dr = lut_r.w;
343        let dg = lut_g.w;
344        let db = lut_b.w;
345
346        let c0 = r.fetch(x, y, z);
347
348        if db > dr {
349            let x0 = r.fetch(x, y, z_n);
350            let x1 = r.fetch(x_n, y, z_n);
351            let x2 = r.fetch(x, y_n, z);
352            let x3 = r.fetch(x, y_n, z_n);
353            let x4 = r.fetch(x_n, y_n, z_n);
354
355            let c1 = x0 - c0;
356            let c2 = x1 - x0;
357            let c3 = x2 - c0;
358            let c4 = c0 - x2 - x0 + x3;
359            let c5 = x0 - x3 - x1 + x4;
360
361            let s0 = c0.mla(c1, SseVector::from(db));
362            let s1 = s0.mla(c2, SseVector::from(dr));
363            let s2 = s1.mla(c3, SseVector::from(dg));
364            let s3 = s2.mla(c4, SseVector::from(dg * db));
365            s3.mla(c5, SseVector::from(dr * dg))
366        } else {
367            let x0 = r.fetch(x_n, y, z);
368            let x1 = r.fetch(x_n, y, z_n);
369            let x2 = r.fetch(x, y_n, z);
370            let x3 = r.fetch(x_n, y_n, z);
371            let x4 = r.fetch(x_n, y_n, z_n);
372
373            let c1 = x1 - x0;
374            let c2 = x0 - c0;
375            let c3 = x2 - c0;
376            let c4 = x0 - x3 - x1 + x4;
377            let c5 = c0 - x2 - x0 + x3;
378
379            let s0 = c0.mla(c1, SseVector::from(db));
380            let s1 = s0.mla(c2, SseVector::from(dr));
381            let s2 = s1.mla(c3, SseVector::from(dg));
382            let s3 = s2.mla(c4, SseVector::from(dg * db));
383            s3.mla(c5, SseVector::from(dr * dg))
384        }
385    }
386}
387
388impl<const GRID_SIZE: usize> TrilinearSse<GRID_SIZE> {
389    #[target_feature(enable = "sse4.1")]
390    unsafe fn interpolate(
391        &self,
392        in_r: usize,
393        in_g: usize,
394        in_b: usize,
395        lut: &[BarycentricWeight<f32>],
396        r: impl Fetcher<SseVector>,
397    ) -> SseVector {
398        let lut_r = unsafe { *lut.get_unchecked(in_r) };
399        let lut_g = unsafe { *lut.get_unchecked(in_g) };
400        let lut_b = unsafe { *lut.get_unchecked(in_b) };
401
402        let x: i32 = lut_r.x;
403        let y: i32 = lut_g.x;
404        let z: i32 = lut_b.x;
405
406        let x_n: i32 = lut_r.x_n;
407        let y_n: i32 = lut_g.x_n;
408        let z_n: i32 = lut_b.x_n;
409
410        let dr = lut_r.w;
411        let dg = lut_g.w;
412        let db = lut_b.w;
413
414        let w0 = SseVector::from(dr);
415        let w1 = SseVector::from(dg);
416        let w2 = SseVector::from(db);
417
418        let c000 = r.fetch(x, y, z);
419        let c100 = r.fetch(x_n, y, z);
420        let c010 = r.fetch(x, y_n, z);
421        let c110 = r.fetch(x_n, y_n, z);
422        let c001 = r.fetch(x, y, z_n);
423        let c101 = r.fetch(x_n, y, z_n);
424        let c011 = r.fetch(x, y_n, z_n);
425        let c111 = r.fetch(x_n, y_n, z_n);
426
427        let dx = SseVector::from(1.0 - dr);
428
429        let c00 = (c000 * dx).mla(c100, w0);
430        let c10 = (c010 * dx).mla(c110, w0);
431        let c01 = (c001 * dx).mla(c101, w0);
432        let c11 = (c011 * dx).mla(c111, w0);
433
434        let dy = SseVector::from(1.0 - dg);
435
436        let c0 = (c00 * dy).mla(c10, w1);
437        let c1 = (c01 * dy).mla(c11, w1);
438
439        let dz = SseVector::from(1.0 - db);
440
441        (c0 * dz).mla(c1, w2)
442    }
443}
moxcms/conversions/sse/interpolator.rs

moxcms/conversions/sse/
interpolator.rs