moxcms/conversions/avx/
preheat_lut4x3.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::conversions::avx::hypercube::HypercubeAvx;
30use crate::conversions::avx::interpolator::AvxVectorSse;
31use crate::trc::{lut_interp_linear_float, lut_interp_linear_float_clamped};
32use crate::{CmsError, DataColorSpace, InterpolationMethod, Stage};
33use std::arch::x86_64::*;
34
35#[derive(Default)]
36pub(crate) struct Lut4x3AvxFma {
37    pub(crate) linearization: [Vec<f32>; 4],
38    pub(crate) clut: Vec<f32>,
39    pub(crate) grid_size: u8,
40    pub(crate) output: [Vec<f32>; 3],
41    pub(crate) interpolation_method: InterpolationMethod,
42    pub(crate) pcs: DataColorSpace,
43}
44
45impl Lut4x3AvxFma {
46    #[allow(unused_unsafe)]
47    #[target_feature(enable = "avx2", enable = "fma")]
48    unsafe fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> AvxVectorSse>(
49        &self,
50        src: &[f32],
51        dst: &mut [f32],
52        fetch: Fetch,
53    ) -> Result<(), CmsError> {
54        let linearization_0 = &self.linearization[0];
55        let linearization_1 = &self.linearization[1];
56        let linearization_2 = &self.linearization[2];
57        let linearization_3 = &self.linearization[3];
58        unsafe {
59            let ones = _mm_set1_ps(1.);
60            for (dest, src) in dst.chunks_exact_mut(3).zip(src.chunks_exact(4)) {
61                debug_assert!(self.grid_size as i32 >= 1);
62                let linear_x = lut_interp_linear_float(src[0], linearization_0);
63                let linear_y = lut_interp_linear_float(src[1], linearization_1);
64                let linear_z = lut_interp_linear_float(src[2], linearization_2);
65                let linear_w = lut_interp_linear_float(src[3], linearization_3);
66
67                let mut v = fetch(linear_x, linear_y, linear_z, linear_w).v;
68                v = _mm_max_ps(v, _mm_setzero_ps());
69                v = _mm_min_ps(v, ones);
70
71                let pcs_x = lut_interp_linear_float_clamped(
72                    f32::from_bits(_mm_extract_ps::<0>(v) as u32),
73                    &self.output[0],
74                );
75                let pcs_y = lut_interp_linear_float_clamped(
76                    f32::from_bits(_mm_extract_ps::<1>(v) as u32),
77                    &self.output[1],
78                );
79                let pcs_z = lut_interp_linear_float_clamped(
80                    f32::from_bits(_mm_extract_ps::<2>(v) as u32),
81                    &self.output[2],
82                );
83                dest[0] = pcs_x;
84                dest[1] = pcs_y;
85                dest[2] = pcs_z;
86            }
87        }
88        Ok(())
89    }
90}
91
92impl Stage for Lut4x3AvxFma {
93    fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
94        let l_tbl = HypercubeAvx::new(
95            &self.clut,
96            [
97                self.grid_size,
98                self.grid_size,
99                self.grid_size,
100                self.grid_size,
101            ],
102            3,
103        );
104
105        assert!(std::arch::is_x86_feature_detected!("avx2"));
106        assert!(std::arch::is_x86_feature_detected!("fma"));
107
108        unsafe {
109            // If Source PCS is LAB trilinear should be used
110            if self.pcs == DataColorSpace::Lab {
111                return self
112                    .transform_impl(src, dst, |x, y, z, w| l_tbl.quadlinear_vec3(x, y, z, w));
113            }
114
115            match self.interpolation_method {
116                #[cfg(feature = "options")]
117                InterpolationMethod::Tetrahedral => {
118                    self.transform_impl(src, dst, |x, y, z, w| l_tbl.tetra_vec3(x, y, z, w))?;
119                }
120                #[cfg(feature = "options")]
121                InterpolationMethod::Pyramid => {
122                    self.transform_impl(src, dst, |x, y, z, w| l_tbl.pyramid_vec3(x, y, z, w))?;
123                }
124                #[cfg(feature = "options")]
125                InterpolationMethod::Prism => {
126                    self.transform_impl(src, dst, |x, y, z, w| l_tbl.prism_vec3(x, y, z, w))?
127                }
128                InterpolationMethod::Linear => {
129                    self.transform_impl(src, dst, |x, y, z, w| l_tbl.quadlinear_vec3(x, y, z, w))?
130                }
131            }
132        }
133        Ok(())
134    }
135}