1use crate::conversions::avx::rgb_xyz::AvxAlignedU16;
30use crate::conversions::rgbxyz_fixed::TransformMatrixShaperFixedPoint;
31use crate::transform::PointeeSizeExpressible;
32use crate::{CmsError, Layout, TransformExecutor};
33use num_traits::AsPrimitive;
34use std::arch::x86_64::*;
35
36pub(crate) struct TransformShaperRgbQ2_13Avx<
37 T: Copy,
38 const SRC_LAYOUT: u8,
39 const DST_LAYOUT: u8,
40 const LINEAR_CAP: usize,
41 const GAMMA_LUT: usize,
42 const PRECISION: i32,
43> {
44 pub(crate) profile: TransformMatrixShaperFixedPoint<i32, T, LINEAR_CAP>,
45 pub(crate) bit_depth: usize,
46}
47
/// Broadcasts the 32-bit integer behind `f` into all four lanes of a 128-bit
/// vector.
///
/// Implemented with `_mm_set1_epi32`, which only needs SSE2 (always available
/// on x86_64), so the helper carries no hidden AVX requirement and does not
/// have to reinterpret the reference as `&f32`. When inlined into a function
/// compiled with AVX/AVX2 the compiler is free to select a broadcast-from-memory
/// instruction.
///
/// # Safety
///
/// The function stays `unsafe` only to keep its signature compatible with
/// existing call sites; it has no preconditions beyond `f` being a valid
/// reference, which the type system already guarantees.
#[inline(always)]
pub(crate) unsafe fn _xmm_broadcast_epi32(f: &i32) -> __m128i {
    // SAFETY: `_mm_set1_epi32` requires SSE2, which is part of the x86_64 baseline.
    unsafe { _mm_set1_epi32(*f) }
}
53
54impl<
55 T: Copy + PointeeSizeExpressible + 'static,
56 const SRC_LAYOUT: u8,
57 const DST_LAYOUT: u8,
58 const LINEAR_CAP: usize,
59 const GAMMA_LUT: usize,
60 const PRECISION: i32,
61> TransformShaperRgbQ2_13Avx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
62where
63 u32: AsPrimitive<T>,
64{
65 #[target_feature(enable = "avx2")]
66 unsafe fn transform_avx2(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
67 let src_cn = Layout::from(SRC_LAYOUT);
68 let dst_cn = Layout::from(DST_LAYOUT);
69 let src_channels = src_cn.channels();
70 let dst_channels = dst_cn.channels();
71
72 let mut temporary0 = AvxAlignedU16([0; 16]);
73
74 if src.len() / src_channels != dst.len() / dst_channels {
75 return Err(CmsError::LaneSizeMismatch);
76 }
77 if src.len() % src_channels != 0 {
78 return Err(CmsError::LaneMultipleOfChannels);
79 }
80 if dst.len() % dst_channels != 0 {
81 return Err(CmsError::LaneMultipleOfChannels);
82 }
83
84 let t = self.profile.adaptation_matrix.transpose();
85
86 let max_colors = ((1 << self.bit_depth) - 1).as_();
87
88 unsafe {
89 let m0 = _mm256_setr_epi16(
90 t.v[0][0], t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0, t.v[0][0],
91 t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0,
92 );
93 let m2 = _mm256_setr_epi16(
94 t.v[2][0], 1, t.v[2][1], 1, t.v[2][2], 1, 0, 0, t.v[2][0], 1, t.v[2][1], 1,
95 t.v[2][2], 1, 0, 0,
96 );
97
98 let rnd_val = ((1i32 << (PRECISION - 1)) as i16).to_ne_bytes();
99 let rnd = _mm256_set1_epi32(i32::from_ne_bytes([0, 0, rnd_val[0], rnd_val[1]]));
100
101 let zeros = _mm256_setzero_si256();
102
103 let v_max_value = _mm256_set1_epi32(GAMMA_LUT as i32 - 1);
104
105 let mut src = src;
106 let mut dst = dst;
107
108 let mut src_iter = src.chunks_exact(src_channels * 2);
109 let dst_iter = dst.chunks_exact_mut(dst_channels * 2);
110
111 let (mut r0, mut g0, mut b0, mut a0);
112 let (mut r1, mut g1, mut b1, mut a1);
113
114 if let Some(src) = src_iter.next() {
115 r0 = _xmm_broadcast_epi32(&self.profile.r_linear[src[src_cn.r_i()]._as_usize()]);
116 g0 = _xmm_broadcast_epi32(&self.profile.g_linear[src[src_cn.g_i()]._as_usize()]);
117 b0 = _xmm_broadcast_epi32(&self.profile.b_linear[src[src_cn.b_i()]._as_usize()]);
118 r1 = _xmm_broadcast_epi32(
119 &self.profile.r_linear[src[src_cn.r_i() + src_channels]._as_usize()],
120 );
121 g1 = _xmm_broadcast_epi32(
122 &self.profile.g_linear[src[src_cn.g_i() + src_channels]._as_usize()],
123 );
124 b1 = _xmm_broadcast_epi32(
125 &self.profile.b_linear[src[src_cn.b_i() + src_channels]._as_usize()],
126 );
127 a0 = if src_channels == 4 {
128 src[src_cn.a_i()]
129 } else {
130 max_colors
131 };
132 a1 = if src_channels == 4 {
133 src[src_cn.a_i() + src_channels]
134 } else {
135 max_colors
136 };
137 } else {
138 r0 = _mm_setzero_si128();
139 g0 = _mm_setzero_si128();
140 b0 = _mm_setzero_si128();
141 a0 = max_colors;
142 r1 = _mm_setzero_si128();
143 g1 = _mm_setzero_si128();
144 b1 = _mm_setzero_si128();
145 a1 = max_colors;
146 }
147
148 for (src, dst) in src_iter.zip(dst_iter) {
149 let zr0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(r0), r1);
150 let mut zg0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(g0), g1);
151 let zb0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(b0), b1);
152 zg0 = _mm256_slli_epi32::<16>(zg0);
153
154 let zrg0 = _mm256_or_si256(zr0, zg0);
155 let zbz0 = _mm256_or_si256(zb0, rnd);
156
157 let va0 = _mm256_madd_epi16(zrg0, m0);
158 let va1 = _mm256_madd_epi16(zbz0, m2);
159
160 let mut v0 = _mm256_add_epi32(va0, va1);
161
162 v0 = _mm256_srai_epi32::<PRECISION>(v0);
163 v0 = _mm256_max_epi32(v0, zeros);
164 v0 = _mm256_min_epi32(v0, v_max_value);
165
166 _mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, v0);
167
168 r0 = _xmm_broadcast_epi32(&self.profile.r_linear[src[src_cn.r_i()]._as_usize()]);
169 g0 = _xmm_broadcast_epi32(&self.profile.g_linear[src[src_cn.g_i()]._as_usize()]);
170 b0 = _xmm_broadcast_epi32(&self.profile.b_linear[src[src_cn.b_i()]._as_usize()]);
171 r1 = _xmm_broadcast_epi32(
172 &self.profile.r_linear[src[src_cn.r_i() + src_channels]._as_usize()],
173 );
174 g1 = _xmm_broadcast_epi32(
175 &self.profile.g_linear[src[src_cn.g_i() + src_channels]._as_usize()],
176 );
177 b1 = _xmm_broadcast_epi32(
178 &self.profile.b_linear[src[src_cn.b_i() + src_channels]._as_usize()],
179 );
180
181 dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
182 dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
183 dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
184 if dst_channels == 4 {
185 dst[dst_cn.a_i()] = a0;
186 }
187
188 dst[dst_cn.r_i() + dst_channels] = self.profile.r_gamma[temporary0.0[8] as usize];
189 dst[dst_cn.g_i() + dst_channels] = self.profile.g_gamma[temporary0.0[10] as usize];
190 dst[dst_cn.b_i() + dst_channels] = self.profile.b_gamma[temporary0.0[12] as usize];
191 if dst_channels == 4 {
192 dst[dst_cn.a_i() + dst_channels] = a1;
193 }
194
195 a0 = if src_channels == 4 {
196 src[src_cn.a_i()]
197 } else {
198 max_colors
199 };
200 a1 = if src_channels == 4 {
201 src[src_cn.a_i() + src_channels]
202 } else {
203 max_colors
204 };
205 }
206
207 if let Some(dst) = dst.chunks_exact_mut(dst_channels * 2).last() {
208 let zr0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(r0), r1);
209 let mut zg0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(g0), g1);
210 let zb0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(b0), b1);
211 zg0 = _mm256_slli_epi32::<16>(zg0);
212
213 let zrg0 = _mm256_or_si256(zr0, zg0);
214 let zbz0 = _mm256_or_si256(zb0, rnd);
215
216 let va0 = _mm256_madd_epi16(zrg0, m0);
217 let va1 = _mm256_madd_epi16(zbz0, m2);
218
219 let mut v0 = _mm256_add_epi32(va0, va1);
220
221 v0 = _mm256_srai_epi32::<PRECISION>(v0);
222 v0 = _mm256_max_epi32(v0, zeros);
223 v0 = _mm256_min_epi32(v0, v_max_value);
224
225 _mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, v0);
226
227 dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
228 dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
229 dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
230 if dst_channels == 4 {
231 dst[dst_cn.a_i()] = a0;
232 }
233
234 dst[dst_cn.r_i() + dst_channels] = self.profile.r_gamma[temporary0.0[8] as usize];
235 dst[dst_cn.g_i() + dst_channels] = self.profile.g_gamma[temporary0.0[10] as usize];
236 dst[dst_cn.b_i() + dst_channels] = self.profile.b_gamma[temporary0.0[12] as usize];
237 if dst_channels == 4 {
238 dst[dst_cn.a_i() + dst_channels] = a1;
239 }
240 }
241
242 src = src.chunks_exact(src_channels * 2).remainder();
243 dst = dst.chunks_exact_mut(dst_channels * 2).into_remainder();
244
245 for (src, dst) in src
246 .chunks_exact(src_channels)
247 .zip(dst.chunks_exact_mut(dst_channels))
248 {
249 let r = _xmm_broadcast_epi32(&self.profile.r_linear[src[src_cn.r_i()]._as_usize()]);
250 let mut g =
251 _xmm_broadcast_epi32(&self.profile.g_linear[src[src_cn.g_i()]._as_usize()]);
252 let b = _xmm_broadcast_epi32(&self.profile.b_linear[src[src_cn.b_i()]._as_usize()]);
253
254 g = _mm_slli_epi32::<16>(g);
255
256 let a = if src_channels == 4 {
257 src[src_cn.a_i()]
258 } else {
259 max_colors
260 };
261
262 let zrg0 = _mm_or_si128(r, g);
263 let zbz0 = _mm_or_si128(b, _mm256_castsi256_si128(rnd));
264
265 let v0 = _mm_madd_epi16(zrg0, _mm256_castsi256_si128(m0));
266 let v1 = _mm_madd_epi16(zbz0, _mm256_castsi256_si128(m2));
267
268 let mut v = _mm_add_epi32(v0, v1);
269
270 v = _mm_srai_epi32::<PRECISION>(v);
271 v = _mm_max_epi32(v, _mm_setzero_si128());
272 v = _mm_min_epi32(v, _mm256_castsi256_si128(v_max_value));
273
274 _mm_store_si128(temporary0.0.as_mut_ptr() as *mut _, v);
275
276 dst[dst_cn.r_i()] = self.profile.r_gamma[temporary0.0[0] as usize];
277 dst[dst_cn.g_i()] = self.profile.g_gamma[temporary0.0[2] as usize];
278 dst[dst_cn.b_i()] = self.profile.b_gamma[temporary0.0[4] as usize];
279 if dst_channels == 4 {
280 dst[dst_cn.a_i()] = a;
281 }
282 }
283 }
284
285 Ok(())
286 }
287}
288
289impl<
290 T: Copy + PointeeSizeExpressible + 'static + Default,
291 const SRC_LAYOUT: u8,
292 const DST_LAYOUT: u8,
293 const LINEAR_CAP: usize,
294 const GAMMA_LUT: usize,
295 const PRECISION: i32,
296> TransformExecutor<T>
297 for TransformShaperRgbQ2_13Avx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
298where
299 u32: AsPrimitive<T>,
300{
301 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
302 unsafe { self.transform_avx2(src, dst) }
303 }
304}