1use crate::conversions::avx::rgb_xyz::AvxAlignedU16;
30use crate::conversions::avx::rgb_xyz_q2_13::_xmm_broadcast_epi32;
31use crate::conversions::rgbxyz_fixed::TransformMatrixShaperFixedPointOpt;
32use crate::transform::PointeeSizeExpressible;
33use crate::{CmsError, Layout, TransformExecutor};
34use num_traits::AsPrimitive;
35use std::arch::x86_64::*;
36
37pub(crate) struct TransformShaperRgbQ2_13OptAvx<
38 T: Copy,
39 const SRC_LAYOUT: u8,
40 const DST_LAYOUT: u8,
41 const LINEAR_CAP: usize,
42 const GAMMA_LUT: usize,
43 const PRECISION: i32,
44> {
45 pub(crate) profile: TransformMatrixShaperFixedPointOpt<i32, i16, T, LINEAR_CAP>,
46 pub(crate) bit_depth: usize,
47}
48
49impl<
50 T: Copy + PointeeSizeExpressible + 'static,
51 const SRC_LAYOUT: u8,
52 const DST_LAYOUT: u8,
53 const LINEAR_CAP: usize,
54 const GAMMA_LUT: usize,
55 const PRECISION: i32,
56> TransformShaperRgbQ2_13OptAvx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
57where
58 u32: AsPrimitive<T>,
59{
60 #[target_feature(enable = "avx2")]
61 unsafe fn transform_avx2(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
62 let src_cn = Layout::from(SRC_LAYOUT);
63 let dst_cn = Layout::from(DST_LAYOUT);
64 let src_channels = src_cn.channels();
65 let dst_channels = dst_cn.channels();
66
67 let mut temporary0 = AvxAlignedU16([0; 16]);
68
69 if src.len() / src_channels != dst.len() / dst_channels {
70 return Err(CmsError::LaneSizeMismatch);
71 }
72 if src.len() % src_channels != 0 {
73 return Err(CmsError::LaneMultipleOfChannels);
74 }
75 if dst.len() % dst_channels != 0 {
76 return Err(CmsError::LaneMultipleOfChannels);
77 }
78
79 let t = self.profile.adaptation_matrix.transpose();
80
81 let max_colors = ((1 << self.bit_depth) - 1).as_();
82
83 unsafe {
84 let m0 = _mm256_setr_epi16(
85 t.v[0][0], t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0, t.v[0][0],
86 t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0,
87 );
88 let m2 = _mm256_setr_epi16(
89 t.v[2][0], 1, t.v[2][1], 1, t.v[2][2], 1, 0, 0, t.v[2][0], 1, t.v[2][1], 1,
90 t.v[2][2], 1, 0, 0,
91 );
92
93 let rnd_val = ((1i32 << (PRECISION - 1)) as i16).to_ne_bytes();
94 let rnd = _mm256_set1_epi32(i32::from_ne_bytes([0, 0, rnd_val[0], rnd_val[1]]));
95
96 let zeros = _mm256_setzero_si256();
97
98 let v_max_value = _mm256_set1_epi32(GAMMA_LUT as i32 - 1);
99
100 let (mut r0, mut g0, mut b0, mut a0);
101 let (mut r1, mut g1, mut b1, mut a1);
102
103 let mut src_iter = src.chunks_exact(src_channels * 2);
104
105 if let Some(src0) = src_iter.next() {
106 r0 = _xmm_broadcast_epi32(&self.profile.linear[src0[src_cn.r_i()]._as_usize()]);
107 g0 = _xmm_broadcast_epi32(&self.profile.linear[src0[src_cn.g_i()]._as_usize()]);
108 b0 = _xmm_broadcast_epi32(&self.profile.linear[src0[src_cn.b_i()]._as_usize()]);
109
110 r1 = _xmm_broadcast_epi32(
111 &self.profile.linear[src0[src_cn.r_i() + src_channels]._as_usize()],
112 );
113 g1 = _xmm_broadcast_epi32(
114 &self.profile.linear[src0[src_cn.g_i() + src_channels]._as_usize()],
115 );
116 b1 = _xmm_broadcast_epi32(
117 &self.profile.linear[src0[src_cn.b_i() + src_channels]._as_usize()],
118 );
119
120 a0 = if src_channels == 4 {
121 src0[src_cn.a_i()]
122 } else {
123 max_colors
124 };
125 a1 = if src_channels == 4 {
126 src0[src_cn.a_i() + src_channels]
127 } else {
128 max_colors
129 };
130 } else {
131 r0 = _mm_setzero_si128();
132 g0 = _mm_setzero_si128();
133 b0 = _mm_setzero_si128();
134 a0 = max_colors;
135 r1 = _mm_setzero_si128();
136 g1 = _mm_setzero_si128();
137 b1 = _mm_setzero_si128();
138 a1 = max_colors;
139 }
140
141 for (src, dst) in src_iter.zip(dst.chunks_exact_mut(dst_channels * 2)) {
142 let zr0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(r0), r1);
143 let mut zg0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(g0), g1);
144 let zb0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(b0), b1);
145 zg0 = _mm256_slli_epi32::<16>(zg0);
146
147 let zrg0 = _mm256_or_si256(zr0, zg0);
148 let zbz0 = _mm256_or_si256(zb0, rnd);
149
150 let va0 = _mm256_madd_epi16(zrg0, m0);
151 let va1 = _mm256_madd_epi16(zbz0, m2);
152
153 let mut v0 = _mm256_add_epi32(va0, va1);
154
155 v0 = _mm256_srai_epi32::<PRECISION>(v0);
156 v0 = _mm256_max_epi32(v0, zeros);
157 v0 = _mm256_min_epi32(v0, v_max_value);
158
159 _mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, v0);
160
161 r0 = _xmm_broadcast_epi32(&self.profile.linear[src[src_cn.r_i()]._as_usize()]);
162 g0 = _xmm_broadcast_epi32(&self.profile.linear[src[src_cn.g_i()]._as_usize()]);
163 b0 = _xmm_broadcast_epi32(&self.profile.linear[src[src_cn.b_i()]._as_usize()]);
164
165 r1 = _xmm_broadcast_epi32(
166 &self.profile.linear[src[src_cn.r_i() + src_channels]._as_usize()],
167 );
168 g1 = _xmm_broadcast_epi32(
169 &self.profile.linear[src[src_cn.g_i() + src_channels]._as_usize()],
170 );
171 b1 = _xmm_broadcast_epi32(
172 &self.profile.linear[src[src_cn.b_i() + src_channels]._as_usize()],
173 );
174
175 dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
176 dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
177 dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
178 if dst_channels == 4 {
179 dst[dst_cn.a_i()] = a0;
180 }
181
182 dst[dst_cn.r_i() + dst_channels] = self.profile.gamma[temporary0.0[8] as usize];
183 dst[dst_cn.g_i() + dst_channels] = self.profile.gamma[temporary0.0[10] as usize];
184 dst[dst_cn.b_i() + dst_channels] = self.profile.gamma[temporary0.0[12] as usize];
185 if dst_channels == 4 {
186 dst[dst_cn.a_i() + dst_channels] = a1;
187 }
188
189 a0 = if src_channels == 4 {
190 src[src_cn.a_i()]
191 } else {
192 max_colors
193 };
194 a1 = if src_channels == 4 {
195 src[src_cn.a_i() + src_channels]
196 } else {
197 max_colors
198 };
199 }
200
201 if let Some(dst) = dst.chunks_exact_mut(dst_channels * 2).last() {
202 let zr0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(r0), r1);
203 let mut zg0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(g0), g1);
204 let zb0 = _mm256_inserti128_si256::<1>(_mm256_castsi128_si256(b0), b1);
205 zg0 = _mm256_slli_epi32::<16>(zg0);
206
207 let zrg0 = _mm256_or_si256(zr0, zg0);
208 let zbz0 = _mm256_or_si256(zb0, rnd);
209
210 let va0 = _mm256_madd_epi16(zrg0, m0);
211 let va1 = _mm256_madd_epi16(zbz0, m2);
212
213 let mut v0 = _mm256_add_epi32(va0, va1);
214
215 v0 = _mm256_srai_epi32::<PRECISION>(v0);
216 v0 = _mm256_max_epi32(v0, zeros);
217 v0 = _mm256_min_epi32(v0, v_max_value);
218
219 _mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, v0);
220
221 dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
222 dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
223 dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
224 if dst_channels == 4 {
225 dst[dst_cn.a_i()] = a0;
226 }
227
228 dst[dst_cn.r_i() + dst_channels] = self.profile.gamma[temporary0.0[8] as usize];
229 dst[dst_cn.g_i() + dst_channels] = self.profile.gamma[temporary0.0[10] as usize];
230 dst[dst_cn.b_i() + dst_channels] = self.profile.gamma[temporary0.0[12] as usize];
231 if dst_channels == 4 {
232 dst[dst_cn.a_i() + dst_channels] = a1;
233 }
234 }
235
236 let src = src.chunks_exact(src_channels * 2).remainder();
237 let dst = dst.chunks_exact_mut(dst_channels * 2).into_remainder();
238
239 for (src, dst) in src
240 .chunks_exact(src_channels)
241 .zip(dst.chunks_exact_mut(dst_channels))
242 {
243 let r = _xmm_broadcast_epi32(&self.profile.linear[src[src_cn.r_i()]._as_usize()]);
244 let mut g =
245 _xmm_broadcast_epi32(&self.profile.linear[src[src_cn.g_i()]._as_usize()]);
246 let b = _xmm_broadcast_epi32(&self.profile.linear[src[src_cn.b_i()]._as_usize()]);
247
248 g = _mm_slli_epi32::<16>(g);
249
250 let a = if src_channels == 4 {
251 src[src_cn.a_i()]
252 } else {
253 max_colors
254 };
255
256 let zrg0 = _mm_or_si128(r, g);
257 let zbz0 = _mm_or_si128(b, _mm256_castsi256_si128(rnd));
258
259 let v0 = _mm_madd_epi16(zrg0, _mm256_castsi256_si128(m0));
260 let v1 = _mm_madd_epi16(zbz0, _mm256_castsi256_si128(m2));
261
262 let mut v = _mm_add_epi32(v0, v1);
263
264 v = _mm_srai_epi32::<PRECISION>(v);
265 v = _mm_max_epi32(v, _mm_setzero_si128());
266 v = _mm_min_epi32(v, _mm256_castsi256_si128(v_max_value));
267
268 _mm_store_si128(temporary0.0.as_mut_ptr() as *mut _, v);
269
270 dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
271 dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
272 dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
273 if dst_channels == 4 {
274 dst[dst_cn.a_i()] = a;
275 }
276 }
277 }
278
279 Ok(())
280 }
281}
282
283impl<
284 T: Copy + PointeeSizeExpressible + 'static + Default,
285 const SRC_LAYOUT: u8,
286 const DST_LAYOUT: u8,
287 const LINEAR_CAP: usize,
288 const GAMMA_LUT: usize,
289 const PRECISION: i32,
290> TransformExecutor<T>
291 for TransformShaperRgbQ2_13OptAvx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT, PRECISION>
292where
293 u32: AsPrimitive<T>,
294{
295 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
296 unsafe { self.transform_avx2(src, dst) }
297 }
298}