moxcms/conversions/avx/
rgb_xyz_q2_13.rs

1/*
2 * // Copyright (c) Radzivon Bartoshyk 3/2025. All rights reserved.
3 * //
4 * // Redistribution and use in source and binary forms, with or without modification,
5 * // are permitted provided that the following conditions are met:
6 * //
7 * // 1.  Redistributions of source code must retain the above copyright notice, this
8 * // list of conditions and the following disclaimer.
9 * //
10 * // 2.  Redistributions in binary form must reproduce the above copyright notice,
11 * // this list of conditions and the following disclaimer in the documentation
12 * // and/or other materials provided with the distribution.
13 * //
14 * // 3.  Neither the name of the copyright holder nor the names of its
15 * // contributors may be used to endorse or promote products derived from
16 * // this software without specific prior written permission.
17 * //
18 * // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 * // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 * // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 * // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 * // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29use crate::conversions::avx::rgb_xyz::AvxAlignedU16;
30use crate::conversions::rgbxyz_fixed::TransformMatrixShaperFp;
31use crate::transform::PointeeSizeExpressible;
32use crate::{CmsError, Layout, TransformExecutor};
33use num_traits::AsPrimitive;
34use std::arch::x86_64::*;
35
36pub(crate) struct TransformShaperRgbQ2_13Avx<
37    T: Copy,
38    const SRC_LAYOUT: u8,
39    const DST_LAYOUT: u8,
40    const PRECISION: i32,
41> {
42    pub(crate) profile: TransformMatrixShaperFp<i32, T>,
43    pub(crate) bit_depth: usize,
44    pub(crate) gamma_lut: usize,
45}
46
/// Loads the 32-bit integer behind `f` and replicates it into all four 32-bit lanes of a
/// `__m128i`.
///
/// The integer is reinterpreted as an `f32` only so that the `_mm_broadcast_ss`
/// load-and-broadcast intrinsic can be used; the bit pattern is not converted.
///
/// # Safety
///
/// `_mm_broadcast_ss` is an AVX intrinsic, so the executing CPU must support AVX.
#[inline(always)]
pub(crate) unsafe fn _xmm_broadcast_epi32(f: &i32) -> __m128i {
    // SAFETY: `f` is a valid, aligned reference to 4 bytes and `f32` has the same size and
    // alignment as `i32`, so viewing it as `&f32` is sound. AVX availability is the
    // caller's obligation.
    unsafe {
        let reinterpreted: &f32 = &*(f as *const i32).cast::<f32>();
        let lanes = _mm_broadcast_ss(reinterpreted);
        _mm_castps_si128(lanes)
    }
}
52
53impl<
54    T: Copy + PointeeSizeExpressible + 'static,
55    const SRC_LAYOUT: u8,
56    const DST_LAYOUT: u8,
57    const PRECISION: i32,
58> TransformShaperRgbQ2_13Avx<T, SRC_LAYOUT, DST_LAYOUT, PRECISION>
59where
60    u32: AsPrimitive<T>,
61{
62    #[target_feature(enable = "avx2")]
63    unsafe fn transform_avx2(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
64        let src_cn = Layout::from(SRC_LAYOUT);
65        let dst_cn = Layout::from(DST_LAYOUT);
66        let src_channels = src_cn.channels();
67        let dst_channels = dst_cn.channels();
68
69        let mut temporary0 = AvxAlignedU16([0; 16]);
70
71        if src.len() / src_channels != dst.len() / dst_channels {
72            return Err(CmsError::LaneSizeMismatch);
73        }
74        if src.len() % src_channels != 0 {
75            return Err(CmsError::LaneMultipleOfChannels);
76        }
77        if dst.len() % dst_channels != 0 {
78            return Err(CmsError::LaneMultipleOfChannels);
79        }
80
81        let t = self.profile.adaptation_matrix.transpose();
82
83        let max_colors = ((1 << self.bit_depth) - 1).as_();
84
85        // safety precondition for linearization table
86        if T::FINITE {
87            let cap = (1 << self.bit_depth) - 1;
88            assert!(self.profile.r_linear.len() >= cap);
89            assert!(self.profile.g_linear.len() >= cap);
90            assert!(self.profile.b_linear.len() >= cap);
91        } else {
92            assert!(self.profile.r_linear.len() >= T::NOT_FINITE_LINEAR_TABLE_SIZE);
93            assert!(self.profile.g_linear.len() >= T::NOT_FINITE_LINEAR_TABLE_SIZE);
94            assert!(self.profile.b_linear.len() >= T::NOT_FINITE_LINEAR_TABLE_SIZE);
95        }
96
97        let r_lin = &self.profile.r_linear;
98        let g_lin = &self.profile.g_linear;
99        let b_lin = &self.profile.b_linear;
100
101        unsafe {
102            let m0 = _mm256_setr_epi16(
103                t.v[0][0], t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0, t.v[0][0],
104                t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0,
105            );
106            let m2 = _mm256_setr_epi16(
107                t.v[2][0], 1, t.v[2][1], 1, t.v[2][2], 1, 0, 0, t.v[2][0], 1, t.v[2][1], 1,
108                t.v[2][2], 1, 0, 0,
109            );
110
111            let rnd_val = ((1i32 << (PRECISION - 1)) as i16).to_ne_bytes();
112            let rnd = _mm256_set1_epi32(i32::from_ne_bytes([0, 0, rnd_val[0], rnd_val[1]]));
113
114            let zeros = _mm256_setzero_si256();
115
116            let v_max_value = _mm256_set1_epi32(self.gamma_lut as i32 - 1);
117
118            let mut src = src;
119            let mut dst = dst;
120
121            let mut src_iter = src.chunks_exact(src_channels * 2);
122            let dst_iter = dst.chunks_exact_mut(dst_channels * 2);
123
124            let (mut r0, mut g0, mut b0, mut a0);
125            let (mut r1, mut g1, mut b1, mut a1);
126
127            if let Some(src) = src_iter.next() {
128                r0 = _xmm_broadcast_epi32(r_lin.get_unchecked(src[src_cn.r_i()]._as_usize()));
129                g0 = _xmm_broadcast_epi32(g_lin.get_unchecked(src[src_cn.g_i()]._as_usize()));
130                b0 = _xmm_broadcast_epi32(b_lin.get_unchecked(src[src_cn.b_i()]._as_usize()));
131                r1 = _xmm_broadcast_epi32(
132                    r_lin.get_unchecked(src[src_cn.r_i() + src_channels]._as_usize()),
133                );
134                g1 = _xmm_broadcast_epi32(
135                    g_lin.get_unchecked(src[src_cn.g_i() + src_channels]._as_usize()),
136                );
137                b1 = _xmm_broadcast_epi32(
138                    b_lin.get_unchecked(src[src_cn.b_i() + src_channels]._as_usize()),
139                );
140                a0 = if src_channels == 4 {
141                    src[src_cn.a_i()]
142                } else {
143                    max_colors
144                };
145                a1 = if src_channels == 4 {
146                    src[src_cn.a_i() + src_channels]
147                } else {
148                    max_colors
149                };
150            } else {
151                r0 = _mm_setzero_si128();
152                g0 = _mm_setzero_si128();
153                b0 = _mm_setzero_si128();
154                a0 = max_colors;
155                r1 = _mm_setzero_si128();
156                g1 = _mm_setzero_si128();
157                b1 = _mm_setzero_si128();
158                a1 = max_colors;
159            }
160
161            for (src, dst) in src_iter.zip(dst_iter) {
162                let zr0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(r0), r1);
163                let mut zg0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(g0), g1);
164                let zb0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(b0), b1);
165                zg0 = _mm256_slli_epi32::<16>(zg0);
166
167                let zrg0 = _mm256_or_si256(zr0, zg0);
168                let zbz0 = _mm256_or_si256(zb0, rnd);
169
170                let va0 = _mm256_madd_epi16(zrg0, m0);
171                let va1 = _mm256_madd_epi16(zbz0, m2);
172
173                let mut v0 = _mm256_add_epi32(va0, va1);
174
175                v0 = _mm256_srai_epi32::<PRECISION>(v0);
176                v0 = _mm256_max_epi32(v0, zeros);
177                v0 = _mm256_min_epi32(v0, v_max_value);
178
179                _mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, v0);
180
181                r0 = _xmm_broadcast_epi32(r_lin.get_unchecked(src[src_cn.r_i()]._as_usize()));
182                g0 = _xmm_broadcast_epi32(g_lin.get_unchecked(src[src_cn.g_i()]._as_usize()));
183                b0 = _xmm_broadcast_epi32(b_lin.get_unchecked(src[src_cn.b_i()]._as_usize()));
184                r1 = _xmm_broadcast_epi32(
185                    r_lin.get_unchecked(src[src_cn.r_i() + src_channels]._as_usize()),
186                );
187                g1 = _xmm_broadcast_epi32(
188                    g_lin.get_unchecked(src[src_cn.g_i() + src_channels]._as_usize()),
189                );
190                b1 = _xmm_broadcast_epi32(
191                    b_lin.get_unchecked(src[src_cn.b_i() + src_channels]._as_usize()),
192                );
193
194                dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
195                dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
196                dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
197                if dst_channels == 4 {
198                    dst[dst_cn.a_i()] = a0;
199                }
200
201                dst[dst_cn.r_i() + dst_channels] = self.profile.r_gamma[temporary0.0[8] as usize];
202                dst[dst_cn.g_i() + dst_channels] = self.profile.g_gamma[temporary0.0[10] as usize];
203                dst[dst_cn.b_i() + dst_channels] = self.profile.b_gamma[temporary0.0[12] as usize];
204                if dst_channels == 4 {
205                    dst[dst_cn.a_i() + dst_channels] = a1;
206                }
207
208                a0 = if src_channels == 4 {
209                    src[src_cn.a_i()]
210                } else {
211                    max_colors
212                };
213                a1 = if src_channels == 4 {
214                    src[src_cn.a_i() + src_channels]
215                } else {
216                    max_colors
217                };
218            }
219
220            if let Some(dst) = dst.chunks_exact_mut(dst_channels * 2).last() {
221                let zr0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(r0), r1);
222                let mut zg0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(g0), g1);
223                let zb0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(b0), b1);
224                zg0 = _mm256_slli_epi32::<16>(zg0);
225
226                let zrg0 = _mm256_or_si256(zr0, zg0);
227                let zbz0 = _mm256_or_si256(zb0, rnd);
228
229                let va0 = _mm256_madd_epi16(zrg0, m0);
230                let va1 = _mm256_madd_epi16(zbz0, m2);
231
232                let mut v0 = _mm256_add_epi32(va0, va1);
233
234                v0 = _mm256_srai_epi32::<PRECISION>(v0);
235                v0 = _mm256_max_epi32(v0, zeros);
236                v0 = _mm256_min_epi32(v0, v_max_value);
237
238                _mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, v0);
239
240                dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
241                dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
242                dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
243                if dst_channels == 4 {
244                    dst[dst_cn.a_i()] = a0;
245                }
246
247                dst[dst_cn.r_i() + dst_channels] = self.profile.r_gamma[temporary0.0[8] as usize];
248                dst[dst_cn.g_i() + dst_channels] = self.profile.g_gamma[temporary0.0[10] as usize];
249                dst[dst_cn.b_i() + dst_channels] = self.profile.b_gamma[temporary0.0[12] as usize];
250                if dst_channels == 4 {
251                    dst[dst_cn.a_i() + dst_channels] = a1;
252                }
253            }
254
255            src = src.chunks_exact(src_channels * 2).remainder();
256            dst = dst.chunks_exact_mut(dst_channels * 2).into_remainder();
257
258            for (src, dst) in src
259                .chunks_exact(src_channels)
260                .zip(dst.chunks_exact_mut(dst_channels))
261            {
262                let r = _xmm_broadcast_epi32(r_lin.get_unchecked(src[src_cn.r_i()]._as_usize()));
263                let mut g =
264                    _xmm_broadcast_epi32(g_lin.get_unchecked(src[src_cn.g_i()]._as_usize()));
265                let b = _xmm_broadcast_epi32(b_lin.get_unchecked(src[src_cn.b_i()]._as_usize()));
266
267                g = _mm_slli_epi32::<16>(g);
268
269                let a = if src_channels == 4 {
270                    src[src_cn.a_i()]
271                } else {
272                    max_colors
273                };
274
275                let zrg0 = _mm_or_si128(r, g);
276                let zbz0 = _mm_or_si128(b, _mm256_castsi256_si128(rnd));
277
278                let v0 = _mm_madd_epi16(zrg0, _mm256_castsi256_si128(m0));
279                let v1 = _mm_madd_epi16(zbz0, _mm256_castsi256_si128(m2));
280
281                let mut v = _mm_add_epi32(v0, v1);
282
283                v = _mm_srai_epi32::<PRECISION>(v);
284                v = _mm_max_epi32(v, _mm_setzero_si128());
285                v = _mm_min_epi32(v, _mm256_castsi256_si128(v_max_value));
286
287                _mm_store_si128(temporary0.0.as_mut_ptr() as *mut _, v);
288
289                dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
290                dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
291                dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
292                if dst_channels == 4 {
293                    dst[dst_cn.a_i()] = a;
294                }
295            }
296        }
297
298        Ok(())
299    }
300}
301
302impl<
303    T: Copy + PointeeSizeExpressible + 'static + Default,
304    const SRC_LAYOUT: u8,
305    const DST_LAYOUT: u8,
306    const PRECISION: i32,
307> TransformExecutor<T> for TransformShaperRgbQ2_13Avx<T, SRC_LAYOUT, DST_LAYOUT, PRECISION>
308where
309    u32: AsPrimitive<T>,
310{
311    fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
312        unsafe { self.transform_avx2(src, dst) }
313    }
314}