moxcms/conversions/sse/
t_lut3_to_3.rs

/*
 * // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
 * //
 * // Redistribution and use in source and binary forms, with or without modification,
 * // are permitted provided that the following conditions are met:
 * //
 * // 1.  Redistributions of source code must retain the above copyright notice, this
 * // list of conditions and the following disclaimer.
 * //
 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
 * // this list of conditions and the following disclaimer in the documentation
 * // and/or other materials provided with the distribution.
 * //
 * // 3.  Neither the name of the copyright holder nor the names of its
 * // contributors may be used to endorse or promote products derived from
 * // this software without specific prior written permission.
 * //
 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
29use crate::conversions::LutBarycentricReduction;
30use crate::conversions::interpolator::BarycentricWeight;
31use crate::conversions::lut_transforms::Lut3x3Factory;
32use crate::conversions::sse::interpolator::*;
33use crate::conversions::sse::interpolator_q0_15::SseAlignedI16x4;
34use crate::conversions::sse::t_lut3_to_3_q0_15::TransformLut3x3SseQ0_15;
35use crate::transform::PointeeSizeExpressible;
36use crate::{
37    BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
38    TransformExecutor, TransformOptions,
39};
40use num_traits::AsPrimitive;
41#[cfg(target_arch = "x86")]
42use std::arch::x86::*;
43#[cfg(target_arch = "x86_64")]
44use std::arch::x86_64::*;
45use std::marker::PhantomData;
46
47struct TransformLut3x3Sse<
48    T,
49    U,
50    const SRC_LAYOUT: u8,
51    const DST_LAYOUT: u8,
52    const GRID_SIZE: usize,
53    const BIT_DEPTH: usize,
54    const BINS: usize,
55    const BARYCENTRIC_BINS: usize,
56> {
57    lut: Vec<SseAlignedF32>,
58    _phantom: PhantomData<T>,
59    _phantom2: PhantomData<U>,
60    interpolation_method: InterpolationMethod,
61    weights: Box<[BarycentricWeight<f32>; BINS]>,
62    color_space: DataColorSpace,
63    is_linear: bool,
64}
65
66impl<
67    T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
68    U: AsPrimitive<usize>,
69    const SRC_LAYOUT: u8,
70    const DST_LAYOUT: u8,
71    const GRID_SIZE: usize,
72    const BIT_DEPTH: usize,
73    const BINS: usize,
74    const BARYCENTRIC_BINS: usize,
75> TransformLut3x3Sse<T, U, SRC_LAYOUT, DST_LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
76where
77    f32: AsPrimitive<T>,
78    u32: AsPrimitive<T>,
79    (): LutBarycentricReduction<T, U>,
80{
81    #[allow(unused_unsafe)]
82    #[target_feature(enable = "sse4.1")]
83    unsafe fn transform_chunk<'b, Interpolator: SseMdInterpolation<'b, GRID_SIZE>>(
84        &'b self,
85        src: &[T],
86        dst: &mut [T],
87    ) {
88        let src_cn = Layout::from(SRC_LAYOUT);
89        let src_channels = src_cn.channels();
90
91        let dst_cn = Layout::from(DST_LAYOUT);
92        let dst_channels = dst_cn.channels();
93
94        let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
95        let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
96
97        for (src, dst) in src
98            .chunks_exact(src_channels)
99            .zip(dst.chunks_exact_mut(dst_channels))
100        {
101            let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
102                src[src_cn.r_i()],
103            );
104            let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
105                src[src_cn.g_i()],
106            );
107            let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
108                src[src_cn.b_i()],
109            );
110
111            let a = if src_channels == 4 {
112                src[src_cn.a_i()]
113            } else {
114                max_value
115            };
116
117            let tetrahedral = Interpolator::new(&self.lut);
118            let v = tetrahedral.inter3_sse(x, y, z, &self.weights);
119            if T::FINITE {
120                unsafe {
121                    let mut r = _mm_mul_ps(v.v, value_scale);
122                    r = _mm_max_ps(r, _mm_setzero_ps());
123                    r = _mm_min_ps(r, value_scale);
124                    let jvz = _mm_cvtps_epi32(r);
125
126                    let x = _mm_extract_epi32::<0>(jvz);
127                    let y = _mm_extract_epi32::<1>(jvz);
128                    let z = _mm_extract_epi32::<2>(jvz);
129
130                    dst[dst_cn.r_i()] = (x as u32).as_();
131                    dst[dst_cn.g_i()] = (y as u32).as_();
132                    dst[dst_cn.b_i()] = (z as u32).as_();
133                }
134            } else {
135                unsafe {
136                    dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v.v) as u32).as_();
137                    dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v.v) as u32).as_();
138                    dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v.v) as u32).as_();
139                }
140            }
141            if dst_channels == 4 {
142                dst[dst_cn.a_i()] = a;
143            }
144        }
145    }
146}
147
148impl<
149    T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
150    U: AsPrimitive<usize>,
151    const SRC_LAYOUT: u8,
152    const DST_LAYOUT: u8,
153    const GRID_SIZE: usize,
154    const BIT_DEPTH: usize,
155    const BINS: usize,
156    const BARYCENTRIC_BINS: usize,
157> TransformExecutor<T>
158    for TransformLut3x3Sse<
159        T,
160        U,
161        SRC_LAYOUT,
162        DST_LAYOUT,
163        GRID_SIZE,
164        BIT_DEPTH,
165        BINS,
166        BARYCENTRIC_BINS,
167    >
168where
169    f32: AsPrimitive<T>,
170    u32: AsPrimitive<T>,
171    (): LutBarycentricReduction<T, U>,
172{
173    fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
174        let src_cn = Layout::from(SRC_LAYOUT);
175        let src_channels = src_cn.channels();
176
177        let dst_cn = Layout::from(DST_LAYOUT);
178        let dst_channels = dst_cn.channels();
179        if src.len() % src_channels != 0 {
180            return Err(CmsError::LaneMultipleOfChannels);
181        }
182        if dst.len() % dst_channels != 0 {
183            return Err(CmsError::LaneMultipleOfChannels);
184        }
185        let src_chunks = src.len() / src_channels;
186        let dst_chunks = dst.len() / dst_channels;
187        if src_chunks != dst_chunks {
188            return Err(CmsError::LaneSizeMismatch);
189        }
190
191        unsafe {
192            if self.color_space == DataColorSpace::Lab
193                || (self.is_linear && self.color_space == DataColorSpace::Rgb)
194                || self.color_space == DataColorSpace::Xyz
195            {
196                self.transform_chunk::<TrilinearSse<GRID_SIZE>>(src, dst);
197            } else {
198                match self.interpolation_method {
199                    #[cfg(feature = "options")]
200                    InterpolationMethod::Tetrahedral => {
201                        self.transform_chunk::<TetrahedralSse<GRID_SIZE>>(src, dst);
202                    }
203                    #[cfg(feature = "options")]
204                    InterpolationMethod::Pyramid => {
205                        self.transform_chunk::<PyramidalSse<GRID_SIZE>>(src, dst);
206                    }
207                    #[cfg(feature = "options")]
208                    InterpolationMethod::Prism => {
209                        self.transform_chunk::<PrismaticSse<GRID_SIZE>>(src, dst);
210                    }
211                    InterpolationMethod::Linear => {
212                        self.transform_chunk::<TrilinearSse<GRID_SIZE>>(src, dst);
213                    }
214                }
215            }
216        }
217        Ok(())
218    }
219}
220
221pub(crate) struct SseLut3x3Factory {}
222
223impl Lut3x3Factory for SseLut3x3Factory {
224    fn make_transform_3x3<
225        T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
226        const SRC_LAYOUT: u8,
227        const DST_LAYOUT: u8,
228        const GRID_SIZE: usize,
229        const BIT_DEPTH: usize,
230    >(
231        lut: Vec<f32>,
232        options: TransformOptions,
233        color_space: DataColorSpace,
234        is_linear: bool,
235    ) -> Box<dyn TransformExecutor<T> + Sync + Send>
236    where
237        f32: AsPrimitive<T>,
238        u32: AsPrimitive<T>,
239        (): LutBarycentricReduction<T, u8>,
240        (): LutBarycentricReduction<T, u16>,
241    {
242        if options.prefer_fixed_point && BIT_DEPTH < 16 {
243            let q: f32 = if T::FINITE {
244                ((1i32 << BIT_DEPTH as i32) - 1) as f32
245            } else {
246                ((1i32 << 14i32) - 1) as f32
247            };
248            let lut = lut
249                .chunks_exact(3)
250                .map(|x| {
251                    SseAlignedI16x4([
252                        (x[0] * q).round() as i16,
253                        (x[1] * q).round() as i16,
254                        (x[2] * q).round() as i16,
255                        0,
256                    ])
257                })
258                .collect::<Vec<_>>();
259            return match options.barycentric_weight_scale {
260                BarycentricWeightScale::Low => Box::new(TransformLut3x3SseQ0_15::<
261                    T,
262                    u8,
263                    SRC_LAYOUT,
264                    DST_LAYOUT,
265                    GRID_SIZE,
266                    BIT_DEPTH,
267                    256,
268                    256,
269                > {
270                    lut,
271                    _phantom: PhantomData,
272                    _phantom2: PhantomData,
273                    interpolation_method: options.interpolation_method,
274                    weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
275                    color_space,
276                    is_linear,
277                }),
278                #[cfg(feature = "options")]
279                BarycentricWeightScale::High => Box::new(TransformLut3x3SseQ0_15::<
280                    T,
281                    u16,
282                    SRC_LAYOUT,
283                    DST_LAYOUT,
284                    GRID_SIZE,
285                    BIT_DEPTH,
286                    65536,
287                    65536,
288                > {
289                    lut,
290                    _phantom: PhantomData,
291                    _phantom2: PhantomData,
292                    interpolation_method: options.interpolation_method,
293                    weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
294                    color_space,
295                    is_linear,
296                }),
297            };
298        }
299        let lut = lut
300            .chunks_exact(3)
301            .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
302            .collect::<Vec<_>>();
303        match options.barycentric_weight_scale {
304            BarycentricWeightScale::Low => Box::new(TransformLut3x3Sse::<
305                T,
306                u8,
307                SRC_LAYOUT,
308                DST_LAYOUT,
309                GRID_SIZE,
310                BIT_DEPTH,
311                256,
312                256,
313            > {
314                lut,
315                _phantom: PhantomData,
316                _phantom2: PhantomData,
317                interpolation_method: options.interpolation_method,
318                weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
319                color_space,
320                is_linear,
321            }),
322            #[cfg(feature = "options")]
323            BarycentricWeightScale::High => Box::new(TransformLut3x3Sse::<
324                T,
325                u16,
326                SRC_LAYOUT,
327                DST_LAYOUT,
328                GRID_SIZE,
329                BIT_DEPTH,
330                65536,
331                65536,
332            > {
333                lut,
334                _phantom: PhantomData,
335                _phantom2: PhantomData,
336                interpolation_method: options.interpolation_method,
337                weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
338                color_space,
339                is_linear,
340            }),
341        }
342    }
343}