moxcms/conversions/avx/
rgb_xyz.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::conversions::TransformMatrixShaper;
30use crate::transform::PointeeSizeExpressible;
31use crate::{CmsError, Layout, TransformExecutor};
32use num_traits::AsPrimitive;
33use std::arch::x86_64::*;
34
/// Sixteen `u16` values (32 bytes) stored with 32-byte alignment.
///
/// The alignment makes the buffer a valid target for aligned 128-bit and
/// 256-bit SIMD stores; the transform in this file uses it as scratch space
/// to move converted integer lanes from a vector register into memory.
#[derive(Debug)]
#[repr(C, align(32))]
pub(crate) struct AvxAlignedU16(pub(crate) [u16; 16]);
38
39pub(crate) struct TransformShaperRgbAvx<
40    T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
41    const SRC_LAYOUT: u8,
42    const DST_LAYOUT: u8,
43    const LINEAR_CAP: usize,
44    const GAMMA_LUT: usize,
45> {
46    pub(crate) profile: TransformMatrixShaper<T, LINEAR_CAP>,
47    pub(crate) bit_depth: usize,
48}
49
50impl<
51    T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
52    const SRC_LAYOUT: u8,
53    const DST_LAYOUT: u8,
54    const LINEAR_CAP: usize,
55    const GAMMA_LUT: usize,
56> TransformShaperRgbAvx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
57where
58    u32: AsPrimitive<T>,
59{
60    #[inline(always)]
61    unsafe fn transform_impl<const FMA: bool>(
62        &self,
63        src: &[T],
64        dst: &mut [T],
65    ) -> Result<(), CmsError> {
66        let src_cn = Layout::from(SRC_LAYOUT);
67        let dst_cn = Layout::from(DST_LAYOUT);
68        let src_channels = src_cn.channels();
69        let dst_channels = dst_cn.channels();
70
71        let mut temporary0 = AvxAlignedU16([0; 16]);
72
73        if src.len() / src_channels != dst.len() / dst_channels {
74            return Err(CmsError::LaneSizeMismatch);
75        }
76        if src.len() % src_channels != 0 {
77            return Err(CmsError::LaneMultipleOfChannels);
78        }
79        if dst.len() % dst_channels != 0 {
80            return Err(CmsError::LaneMultipleOfChannels);
81        }
82
83        let t = self.profile.adaptation_matrix.transpose();
84
85        let scale = (GAMMA_LUT - 1) as f32;
86        let max_colors: T = ((1 << self.bit_depth) - 1).as_();
87
88        unsafe {
89            let m0 = _mm256_setr_ps(
90                t.v[0][0], t.v[0][1], t.v[0][2], 0., t.v[0][0], t.v[0][1], t.v[0][2], 0.,
91            );
92            let m1 = _mm256_setr_ps(
93                t.v[1][0], t.v[1][1], t.v[1][2], 0., t.v[1][0], t.v[1][1], t.v[1][2], 0.,
94            );
95            let m2 = _mm256_setr_ps(
96                t.v[2][0], t.v[2][1], t.v[2][2], 0., t.v[2][0], t.v[2][1], t.v[2][2], 0.,
97            );
98
99            let zeros = _mm_setzero_ps();
100
101            let v_scale = _mm256_set1_ps(scale);
102
103            let mut src = src;
104            let mut dst = dst;
105
106            let mut src_iter = src.chunks_exact(src_channels * 2);
107            let dst_iter = dst.chunks_exact_mut(dst_channels * 2);
108
109            let (mut r0, mut g0, mut b0, mut a0);
110            let (mut r1, mut g1, mut b1, mut a1);
111
112            if let Some(src) = src_iter.next() {
113                r0 = _mm_broadcast_ss(&self.profile.r_linear[src[src_cn.r_i()]._as_usize()]);
114                g0 = _mm_broadcast_ss(&self.profile.g_linear[src[src_cn.g_i()]._as_usize()]);
115                b0 = _mm_broadcast_ss(&self.profile.b_linear[src[src_cn.b_i()]._as_usize()]);
116                r1 = _mm_broadcast_ss(
117                    &self.profile.r_linear[src[src_cn.r_i() + src_channels]._as_usize()],
118                );
119                g1 = _mm_broadcast_ss(
120                    &self.profile.g_linear[src[src_cn.g_i() + src_channels]._as_usize()],
121                );
122                b1 = _mm_broadcast_ss(
123                    &self.profile.b_linear[src[src_cn.b_i() + src_channels]._as_usize()],
124                );
125                a0 = if src_channels == 4 {
126                    src[src_cn.a_i()]
127                } else {
128                    max_colors
129                };
130                a1 = if src_channels == 4 {
131                    src[src_cn.a_i() + src_channels]
132                } else {
133                    max_colors
134                };
135            } else {
136                r0 = _mm_setzero_ps();
137                g0 = _mm_setzero_ps();
138                b0 = _mm_setzero_ps();
139                a0 = max_colors;
140                r1 = _mm_setzero_ps();
141                g1 = _mm_setzero_ps();
142                b1 = _mm_setzero_ps();
143                a1 = max_colors;
144            }
145
146            for (src, dst) in src_iter.zip(dst_iter) {
147                let r = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(r0), r1);
148                let g = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(g0), g1);
149                let b = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(b0), b1);
150
151                let mut v = if FMA {
152                    let v0 = _mm256_mul_ps(r, m0);
153                    let v1 = _mm256_fmadd_ps(g, m1, v0);
154                    _mm256_fmadd_ps(b, m2, v1)
155                } else {
156                    let v0 = _mm256_mul_ps(r, m0);
157                    let v1 = _mm256_mul_ps(g, m1);
158                    let v2 = _mm256_mul_ps(b, m2);
159
160                    _mm256_add_ps(_mm256_add_ps(v0, v1), v2)
161                };
162
163                v = _mm256_max_ps(v, _mm256_setzero_ps());
164                v = _mm256_mul_ps(v, v_scale);
165                v = _mm256_min_ps(v, v_scale);
166
167                let zx = _mm256_cvtps_epi32(v);
168                _mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, zx);
169
170                r0 = _mm_broadcast_ss(&self.profile.r_linear[src[src_cn.r_i()]._as_usize()]);
171                g0 = _mm_broadcast_ss(&self.profile.g_linear[src[src_cn.g_i()]._as_usize()]);
172                b0 = _mm_broadcast_ss(&self.profile.b_linear[src[src_cn.b_i()]._as_usize()]);
173                r1 = _mm_broadcast_ss(
174                    &self.profile.r_linear[src[src_cn.r_i() + src_channels]._as_usize()],
175                );
176                g1 = _mm_broadcast_ss(
177                    &self.profile.g_linear[src[src_cn.g_i() + src_channels]._as_usize()],
178                );
179                b1 = _mm_broadcast_ss(
180                    &self.profile.b_linear[src[src_cn.b_i() + src_channels]._as_usize()],
181                );
182
183                dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
184                dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
185                dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
186                if dst_channels == 4 {
187                    dst[dst_cn.a_i()] = a0;
188                }
189
190                dst[dst_cn.r_i() + dst_channels] = self.profile.r_gamma[temporary0.0[8] as usize];
191                dst[dst_cn.g_i() + dst_channels] = self.profile.g_gamma[temporary0.0[10] as usize];
192                dst[dst_cn.b_i() + dst_channels] = self.profile.b_gamma[temporary0.0[12] as usize];
193                if dst_channels == 4 {
194                    dst[dst_cn.a_i() + dst_channels] = a1;
195                }
196
197                a0 = if src_channels == 4 {
198                    src[src_cn.a_i()]
199                } else {
200                    max_colors
201                };
202                a1 = if src_channels == 4 {
203                    src[src_cn.a_i() + src_channels]
204                } else {
205                    max_colors
206                };
207            }
208
209            if let Some(dst) = dst.chunks_exact_mut(dst_channels * 2).last() {
210                let r = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(r0), r1);
211                let g = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(g0), g1);
212                let b = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(b0), b1);
213
214                let mut v = if FMA {
215                    let v0 = _mm256_mul_ps(r, m0);
216                    let v1 = _mm256_fmadd_ps(g, m1, v0);
217                    _mm256_fmadd_ps(b, m2, v1)
218                } else {
219                    let v0 = _mm256_mul_ps(r, m0);
220                    let v1 = _mm256_mul_ps(g, m1);
221                    let v2 = _mm256_mul_ps(b, m2);
222
223                    _mm256_add_ps(_mm256_add_ps(v0, v1), v2)
224                };
225
226                v = _mm256_max_ps(v, _mm256_setzero_ps());
227                v = _mm256_mul_ps(v, v_scale);
228                v = _mm256_min_ps(v, v_scale);
229
230                let zx = _mm256_cvtps_epi32(v);
231                _mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, zx);
232
233                dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
234                dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
235                dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
236                if dst_channels == 4 {
237                    dst[dst_cn.a_i()] = a0;
238                }
239
240                dst[dst_cn.r_i() + dst_channels] = self.profile.r_gamma[temporary0.0[8] as usize];
241                dst[dst_cn.g_i() + dst_channels] = self.profile.g_gamma[temporary0.0[10] as usize];
242                dst[dst_cn.b_i() + dst_channels] = self.profile.b_gamma[temporary0.0[12] as usize];
243                if dst_channels == 4 {
244                    dst[dst_cn.a_i() + dst_channels] = a1;
245                }
246            }
247
248            src = src.chunks_exact(src_channels * 2).remainder();
249            dst = dst.chunks_exact_mut(dst_channels * 2).into_remainder();
250
251            for (src, dst) in src
252                .chunks_exact(src_channels)
253                .zip(dst.chunks_exact_mut(dst_channels))
254            {
255                let r = _mm_broadcast_ss(&self.profile.r_linear[src[src_cn.r_i()]._as_usize()]);
256                let g = _mm_broadcast_ss(&self.profile.g_linear[src[src_cn.g_i()]._as_usize()]);
257                let b = _mm_broadcast_ss(&self.profile.b_linear[src[src_cn.b_i()]._as_usize()]);
258                let a = if src_channels == 4 {
259                    src[src_cn.a_i()]
260                } else {
261                    max_colors
262                };
263
264                let mut v = if FMA {
265                    let v0 = _mm_mul_ps(r, _mm256_castps256_ps128(m0));
266                    let v1 = _mm_fmadd_ps(g, _mm256_castps256_ps128(m1), v0);
267                    _mm_fmadd_ps(b, _mm256_castps256_ps128(m2), v1)
268                } else {
269                    let v0 = _mm_mul_ps(r, _mm256_castps256_ps128(m0));
270                    let v1 = _mm_mul_ps(g, _mm256_castps256_ps128(m1));
271                    let v2 = _mm_mul_ps(b, _mm256_castps256_ps128(m2));
272
273                    _mm_add_ps(_mm_add_ps(v0, v1), v2)
274                };
275
276                v = _mm_max_ps(v, zeros);
277                v = _mm_mul_ps(v, _mm256_castps256_ps128(v_scale));
278                v = _mm_min_ps(v, _mm256_castps256_ps128(v_scale));
279
280                let zx = _mm_cvtps_epi32(v);
281                _mm_store_si128(temporary0.0.as_mut_ptr() as *mut _, zx);
282
283                dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
284                dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
285                dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
286                if dst_channels == 4 {
287                    dst[dst_cn.a_i()] = a;
288                }
289            }
290        }
291
292        Ok(())
293    }
294
295    #[target_feature(enable = "avx2", enable = "fma")]
296    unsafe fn transform_fma(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
297        unsafe { self.transform_impl::<true>(src, dst) }
298    }
299
300    #[target_feature(enable = "avx2")]
301    unsafe fn transform_avx(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
302        unsafe { self.transform_impl::<false>(src, dst) }
303    }
304}
305
306impl<
307    T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
308    const SRC_LAYOUT: u8,
309    const DST_LAYOUT: u8,
310    const LINEAR_CAP: usize,
311    const GAMMA_LUT: usize,
312> TransformExecutor<T> for TransformShaperRgbAvx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
313where
314    u32: AsPrimitive<T>,
315{
316    fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
317        unsafe {
318            if std::arch::is_x86_feature_detected!("fma") {
319                self.transform_fma(src, dst)
320            } else {
321                self.transform_avx(src, dst)
322            }
323        }
324    }
325}