moxcms/conversions/avx/
t_lut3_to_3.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::conversions::LutBarycentricReduction;
30use crate::conversions::avx::interpolator::*;
31use crate::conversions::avx::interpolator_q0_15::AvxAlignedI16;
32use crate::conversions::avx::t_lut3_to_3_q0_15::TransformLut3x3AvxQ0_15;
33use crate::conversions::interpolator::BarycentricWeight;
34use crate::conversions::lut_transforms::Lut3x3Factory;
35use crate::transform::PointeeSizeExpressible;
36use crate::{
37    BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
38    TransformExecutor, TransformOptions,
39};
40use num_traits::AsPrimitive;
41use std::arch::x86_64::*;
42use std::marker::PhantomData;
43
44struct TransformLut3x3AvxFma<
45    T,
46    U,
47    const SRC_LAYOUT: u8,
48    const DST_LAYOUT: u8,
49    const GRID_SIZE: usize,
50    const BIT_DEPTH: usize,
51    const BINS: usize,
52    const BARYCENTRIC_BINS: usize,
53> {
54    lut: Vec<SseAlignedF32>,
55    _phantom: PhantomData<T>,
56    _phantom2: PhantomData<U>,
57    interpolation_method: InterpolationMethod,
58    weights: Box<[BarycentricWeight<f32>; BINS]>,
59    color_space: DataColorSpace,
60    is_linear: bool,
61}
62
63impl<
64    T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
65    U: AsPrimitive<usize>,
66    const SRC_LAYOUT: u8,
67    const DST_LAYOUT: u8,
68    const GRID_SIZE: usize,
69    const BIT_DEPTH: usize,
70    const BINS: usize,
71    const BARYCENTRIC_BINS: usize,
72> TransformLut3x3AvxFma<T, U, SRC_LAYOUT, DST_LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
73where
74    f32: AsPrimitive<T>,
75    u32: AsPrimitive<T>,
76    (): LutBarycentricReduction<T, U>,
77{
78    #[allow(unused_unsafe)]
79    #[target_feature(enable = "avx2", enable = "fma")]
80    unsafe fn transform_chunk<'b, Interpolator: AvxMdInterpolation<'b, GRID_SIZE>>(
81        &'b self,
82        src: &[T],
83        dst: &mut [T],
84    ) {
85        let src_cn = Layout::from(SRC_LAYOUT);
86        let src_channels = src_cn.channels();
87
88        let dst_cn = Layout::from(DST_LAYOUT);
89        let dst_channels = dst_cn.channels();
90
91        let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
92        let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
93
94        for (src, dst) in src
95            .chunks_exact(src_channels)
96            .zip(dst.chunks_exact_mut(dst_channels))
97        {
98            let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
99                src[src_cn.r_i()],
100            );
101            let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
102                src[src_cn.g_i()],
103            );
104            let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
105                src[src_cn.b_i()],
106            );
107
108            let a = if src_channels == 4 {
109                src[src_cn.a_i()]
110            } else {
111                max_value
112            };
113
114            let tetrahedral = Interpolator::new(&self.lut);
115            let v = tetrahedral.inter3_sse(x, y, z, &self.weights);
116            if T::FINITE {
117                unsafe {
118                    let mut r = _mm_mul_ps(v.v, value_scale);
119                    r = _mm_max_ps(r, _mm_setzero_ps());
120                    r = _mm_min_ps(r, value_scale);
121                    let jvz = _mm_cvtps_epi32(r);
122
123                    let x = _mm_extract_epi32::<0>(jvz);
124                    let y = _mm_extract_epi32::<1>(jvz);
125                    let z = _mm_extract_epi32::<2>(jvz);
126
127                    dst[dst_cn.r_i()] = (x as u32).as_();
128                    dst[dst_cn.g_i()] = (y as u32).as_();
129                    dst[dst_cn.b_i()] = (z as u32).as_();
130                }
131            } else {
132                unsafe {
133                    dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v.v) as u32).as_();
134                    dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v.v) as u32).as_();
135                    dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v.v) as u32).as_();
136                }
137            }
138            if dst_channels == 4 {
139                dst[dst_cn.a_i()] = a;
140            }
141        }
142    }
143}
144
145impl<
146    T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
147    U: AsPrimitive<usize>,
148    const SRC_LAYOUT: u8,
149    const DST_LAYOUT: u8,
150    const GRID_SIZE: usize,
151    const BIT_DEPTH: usize,
152    const BINS: usize,
153    const BARYCENTRIC_BINS: usize,
154> TransformExecutor<T>
155    for TransformLut3x3AvxFma<
156        T,
157        U,
158        SRC_LAYOUT,
159        DST_LAYOUT,
160        GRID_SIZE,
161        BIT_DEPTH,
162        BINS,
163        BARYCENTRIC_BINS,
164    >
165where
166    f32: AsPrimitive<T>,
167    u32: AsPrimitive<T>,
168    (): LutBarycentricReduction<T, U>,
169{
170    fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
171        let src_cn = Layout::from(SRC_LAYOUT);
172        let src_channels = src_cn.channels();
173
174        let dst_cn = Layout::from(DST_LAYOUT);
175        let dst_channels = dst_cn.channels();
176        if src.len() % src_channels != 0 {
177            return Err(CmsError::LaneMultipleOfChannels);
178        }
179        if dst.len() % dst_channels != 0 {
180            return Err(CmsError::LaneMultipleOfChannels);
181        }
182        let src_chunks = src.len() / src_channels;
183        let dst_chunks = dst.len() / dst_channels;
184        if src_chunks != dst_chunks {
185            return Err(CmsError::LaneSizeMismatch);
186        }
187
188        unsafe {
189            if self.color_space == DataColorSpace::Lab
190                || (self.is_linear && self.color_space == DataColorSpace::Rgb)
191                || self.color_space == DataColorSpace::Xyz
192            {
193                self.transform_chunk::<TrilinearAvxFma<GRID_SIZE>>(src, dst);
194            } else {
195                match self.interpolation_method {
196                    #[cfg(feature = "options")]
197                    InterpolationMethod::Tetrahedral => {
198                        self.transform_chunk::<TetrahedralAvxFma<GRID_SIZE>>(src, dst);
199                    }
200                    #[cfg(feature = "options")]
201                    InterpolationMethod::Pyramid => {
202                        self.transform_chunk::<PyramidalAvxFma<GRID_SIZE>>(src, dst);
203                    }
204                    #[cfg(feature = "options")]
205                    InterpolationMethod::Prism => {
206                        self.transform_chunk::<PrismaticAvxFma<GRID_SIZE>>(src, dst);
207                    }
208                    InterpolationMethod::Linear => {
209                        self.transform_chunk::<TrilinearAvxFma<GRID_SIZE>>(src, dst);
210                    }
211                }
212            }
213        }
214        Ok(())
215    }
216}
217
/// Stateless factory type that implements `Lut3x3Factory` to build the AVX
/// 3-channel to 3-channel LUT transform executors defined for this module.
pub(crate) struct AvxLut3x3Factory {}
219
220impl Lut3x3Factory for AvxLut3x3Factory {
221    fn make_transform_3x3<
222        T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
223        const SRC_LAYOUT: u8,
224        const DST_LAYOUT: u8,
225        const GRID_SIZE: usize,
226        const BIT_DEPTH: usize,
227    >(
228        lut: Vec<f32>,
229        options: TransformOptions,
230        color_space: DataColorSpace,
231        is_linear: bool,
232    ) -> Box<dyn TransformExecutor<T> + Send + Sync>
233    where
234        f32: AsPrimitive<T>,
235        u32: AsPrimitive<T>,
236        (): LutBarycentricReduction<T, u8>,
237        (): LutBarycentricReduction<T, u16>,
238    {
239        if options.prefer_fixed_point && BIT_DEPTH < 16 {
240            let q: f32 = if T::FINITE {
241                ((1i32 << BIT_DEPTH as i32) - 1) as f32
242            } else {
243                ((1i32 << 14i32) - 1) as f32
244            };
245            let lut = lut
246                .chunks_exact(3)
247                .map(|x| {
248                    AvxAlignedI16([
249                        (x[0] * q).round() as i16,
250                        (x[1] * q).round() as i16,
251                        (x[2] * q).round() as i16,
252                        0,
253                    ])
254                })
255                .collect::<Vec<_>>();
256            return match options.barycentric_weight_scale {
257                BarycentricWeightScale::Low => Box::new(TransformLut3x3AvxQ0_15::<
258                    T,
259                    u8,
260                    SRC_LAYOUT,
261                    DST_LAYOUT,
262                    GRID_SIZE,
263                    BIT_DEPTH,
264                    256,
265                    256,
266                > {
267                    lut,
268                    _phantom: PhantomData,
269                    _phantom2: PhantomData,
270                    interpolation_method: options.interpolation_method,
271                    weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
272                    color_space,
273                    is_linear,
274                }),
275                #[cfg(feature = "options")]
276                BarycentricWeightScale::High => Box::new(TransformLut3x3AvxQ0_15::<
277                    T,
278                    u16,
279                    SRC_LAYOUT,
280                    DST_LAYOUT,
281                    GRID_SIZE,
282                    BIT_DEPTH,
283                    65536,
284                    65536,
285                > {
286                    lut,
287                    _phantom: PhantomData,
288                    _phantom2: PhantomData,
289                    interpolation_method: options.interpolation_method,
290                    weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
291                    color_space,
292                    is_linear,
293                }),
294            };
295        }
296        assert!(
297            std::arch::is_x86_feature_detected!("fma"),
298            "Internal configuration error, this might not be called without `fma` feature"
299        );
300        let lut = lut
301            .chunks_exact(3)
302            .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
303            .collect::<Vec<_>>();
304        match options.barycentric_weight_scale {
305            BarycentricWeightScale::Low => Box::new(TransformLut3x3AvxFma::<
306                T,
307                u8,
308                SRC_LAYOUT,
309                DST_LAYOUT,
310                GRID_SIZE,
311                BIT_DEPTH,
312                256,
313                256,
314            > {
315                lut,
316                _phantom: PhantomData,
317                _phantom2: PhantomData,
318                interpolation_method: options.interpolation_method,
319                weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
320                color_space,
321                is_linear,
322            }),
323            #[cfg(feature = "options")]
324            BarycentricWeightScale::High => Box::new(TransformLut3x3AvxFma::<
325                T,
326                u16,
327                SRC_LAYOUT,
328                DST_LAYOUT,
329                GRID_SIZE,
330                BIT_DEPTH,
331                65536,
332                65536,
333            > {
334                lut,
335                _phantom: PhantomData,
336                _phantom2: PhantomData,
337                interpolation_method: options.interpolation_method,
338                weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
339                color_space,
340                is_linear,
341            }),
342        }
343    }
344}