moxcms/conversions/avx/
a_curves4x3.rs

1// /*
2//  * // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
3//  * //
4//  * // Redistribution and use in source and binary forms, with or without modification,
5//  * // are permitted provided that the following conditions are met:
6//  * //
7//  * // 1.  Redistributions of source code must retain the above copyright notice, this
8//  * // list of conditions and the following disclaimer.
9//  * //
10//  * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11//  * // this list of conditions and the following disclaimer in the documentation
12//  * // and/or other materials provided with the distribution.
13//  * //
14//  * // 3.  Neither the name of the copyright holder nor the names of its
15//  * // contributors may be used to endorse or promote products derived from
16//  * // this software without specific prior written permission.
17//  * //
18//  * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19//  * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20//  * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21//  * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22//  * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23//  * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24//  * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25//  * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26//  * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27//  * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28//  */
29use crate::conversions::avx::hypercube::HypercubeAvx;
30use crate::conversions::avx::interpolator::AvxVectorSse;
31use crate::{CmsError, DataColorSpace, InterpolationMethod, Stage};
32use std::arch::x86_64::*;
33
34pub(crate) struct ACurves4x3AvxFma<'a, const DEPTH: usize> {
35    pub(crate) curve0: Box<[f32; 65536]>,
36    pub(crate) curve1: Box<[f32; 65536]>,
37    pub(crate) curve2: Box<[f32; 65536]>,
38    pub(crate) curve3: Box<[f32; 65536]>,
39    pub(crate) clut: &'a [f32],
40    pub(crate) grid_size: [u8; 4],
41    pub(crate) interpolation_method: InterpolationMethod,
42    pub(crate) pcs: DataColorSpace,
43}
44
45pub(crate) struct ACurves4x3AvxFmaOptimized<'a> {
46    pub(crate) clut: &'a [f32],
47    pub(crate) grid_size: [u8; 4],
48    pub(crate) interpolation_method: InterpolationMethod,
49    pub(crate) pcs: DataColorSpace,
50}
51
52impl<const DEPTH: usize> ACurves4x3AvxFma<'_, DEPTH> {
53    #[allow(unused_unsafe)]
54    #[target_feature(enable = "avx2", enable = "fma")]
55    unsafe fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> AvxVectorSse>(
56        &self,
57        src: &[f32],
58        dst: &mut [f32],
59        fetch: Fetch,
60    ) -> Result<(), CmsError> {
61        let scale_value = (DEPTH - 1) as f32;
62
63        assert_eq!(src.len() / 4, dst.len() / 3);
64
65        unsafe {
66            for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(3)) {
67                let a0 = (src[0] * scale_value).round().min(scale_value) as u16;
68                let a1 = (src[1] * scale_value).round().min(scale_value) as u16;
69                let a2 = (src[2] * scale_value).round().min(scale_value) as u16;
70                let a3 = (src[3] * scale_value).round().min(scale_value) as u16;
71                let c = self.curve0[a0 as usize];
72                let m = self.curve1[a1 as usize];
73                let y = self.curve2[a2 as usize];
74                let k = self.curve3[a3 as usize];
75
76                let v = fetch(c, m, y, k).v;
77                dst[0] = f32::from_bits(_mm_extract_ps::<0>(v) as u32);
78                dst[1] = f32::from_bits(_mm_extract_ps::<1>(v) as u32);
79                dst[2] = f32::from_bits(_mm_extract_ps::<2>(v) as u32);
80            }
81        }
82        Ok(())
83    }
84}
85
86impl ACurves4x3AvxFmaOptimized<'_> {
87    #[allow(unused_unsafe)]
88    #[target_feature(enable = "avx2", enable = "fma")]
89    unsafe fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> AvxVectorSse>(
90        &self,
91        src: &[f32],
92        dst: &mut [f32],
93        fetch: Fetch,
94    ) -> Result<(), CmsError> {
95        assert_eq!(src.len() / 4, dst.len() / 3);
96        unsafe {
97            for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(3)) {
98                let c = src[0];
99                let m = src[1];
100                let y = src[2];
101                let k = src[3];
102
103                let v = fetch(c, m, y, k).v;
104                dst[0] = f32::from_bits(_mm_extract_ps::<0>(v) as u32);
105                dst[1] = f32::from_bits(_mm_extract_ps::<1>(v) as u32);
106                dst[2] = f32::from_bits(_mm_extract_ps::<2>(v) as u32);
107            }
108        }
109        Ok(())
110    }
111}
112
113impl<const DEPTH: usize> Stage for ACurves4x3AvxFma<'_, DEPTH> {
114    fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
115        let lut = HypercubeAvx::new(self.clut, self.grid_size, 3);
116
117        assert!(std::arch::is_x86_feature_detected!("avx2"));
118        assert!(std::arch::is_x86_feature_detected!("fma"));
119
120        unsafe {
121            // If PCS is LAB then linear interpolation should be used
122            if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
123                return self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w));
124            }
125
126            match self.interpolation_method {
127                #[cfg(feature = "options")]
128                InterpolationMethod::Tetrahedral => {
129                    self.transform_impl(src, dst, |x, y, z, w| lut.tetra_vec3(x, y, z, w))?;
130                }
131                #[cfg(feature = "options")]
132                InterpolationMethod::Pyramid => {
133                    self.transform_impl(src, dst, |x, y, z, w| lut.pyramid_vec3(x, y, z, w))?;
134                }
135                #[cfg(feature = "options")]
136                InterpolationMethod::Prism => {
137                    self.transform_impl(src, dst, |x, y, z, w| lut.prism_vec3(x, y, z, w))?;
138                }
139                InterpolationMethod::Linear => {
140                    self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w))?;
141                }
142            }
143        }
144
145        Ok(())
146    }
147}
148
149impl Stage for ACurves4x3AvxFmaOptimized<'_> {
150    fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
151        let lut = HypercubeAvx::new(self.clut, self.grid_size, 3);
152
153        assert!(std::arch::is_x86_feature_detected!("avx2"));
154        assert!(std::arch::is_x86_feature_detected!("fma"));
155
156        unsafe {
157            // If PCS is LAB then linear interpolation should be used
158            if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
159                return self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w));
160            }
161
162            match self.interpolation_method {
163                #[cfg(feature = "options")]
164                InterpolationMethod::Tetrahedral => {
165                    self.transform_impl(src, dst, |x, y, z, w| lut.tetra_vec3(x, y, z, w))?;
166                }
167                #[cfg(feature = "options")]
168                InterpolationMethod::Pyramid => {
169                    self.transform_impl(src, dst, |x, y, z, w| lut.pyramid_vec3(x, y, z, w))?;
170                }
171                #[cfg(feature = "options")]
172                InterpolationMethod::Prism => {
173                    self.transform_impl(src, dst, |x, y, z, w| lut.prism_vec3(x, y, z, w))?;
174                }
175                InterpolationMethod::Linear => {
176                    self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w))?;
177                }
178            }
179        }
180        Ok(())
181    }
182}