1use crate::conversions::TransformMatrixShaperOptimized;
30use crate::conversions::avx::rgb_xyz::AvxAlignedU16;
31use crate::transform::PointeeSizeExpressible;
32use crate::{CmsError, Layout, TransformExecutor};
33use num_traits::AsPrimitive;
34use std::arch::x86_64::*;
35
36pub(crate) struct TransformShaperRgbOptAvx<
37 T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
38 const SRC_LAYOUT: u8,
39 const DST_LAYOUT: u8,
40 const LINEAR_CAP: usize,
41 const GAMMA_LUT: usize,
42> {
43 pub(crate) profile: TransformMatrixShaperOptimized<T, LINEAR_CAP>,
44 pub(crate) bit_depth: usize,
45}
46
47impl<
48 T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
49 const SRC_LAYOUT: u8,
50 const DST_LAYOUT: u8,
51 const LINEAR_CAP: usize,
52 const GAMMA_LUT: usize,
53> TransformShaperRgbOptAvx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
54where
55 u32: AsPrimitive<T>,
56{
57 #[inline(always)]
58 unsafe fn transform_impl<const FMA: bool>(
59 &self,
60 src: &[T],
61 dst: &mut [T],
62 ) -> Result<(), CmsError> {
63 let src_cn = Layout::from(SRC_LAYOUT);
64 let dst_cn = Layout::from(DST_LAYOUT);
65 let src_channels = src_cn.channels();
66 let dst_channels = dst_cn.channels();
67
68 let mut temporary0 = AvxAlignedU16([0; 16]);
69
70 if src.len() / src_channels != dst.len() / dst_channels {
71 return Err(CmsError::LaneSizeMismatch);
72 }
73 if src.len() % src_channels != 0 {
74 return Err(CmsError::LaneMultipleOfChannels);
75 }
76 if dst.len() % dst_channels != 0 {
77 return Err(CmsError::LaneMultipleOfChannels);
78 }
79
80 let t = self.profile.adaptation_matrix.transpose();
81
82 let scale = (GAMMA_LUT - 1) as f32;
83 let max_colors: T = ((1 << self.bit_depth) - 1).as_();
84
85 unsafe {
86 let m0 = _mm256_setr_ps(
87 t.v[0][0], t.v[0][1], t.v[0][2], 0., t.v[0][0], t.v[0][1], t.v[0][2], 0.,
88 );
89 let m1 = _mm256_setr_ps(
90 t.v[1][0], t.v[1][1], t.v[1][2], 0., t.v[1][0], t.v[1][1], t.v[1][2], 0.,
91 );
92 let m2 = _mm256_setr_ps(
93 t.v[2][0], t.v[2][1], t.v[2][2], 0., t.v[2][0], t.v[2][1], t.v[2][2], 0.,
94 );
95
96 let zeros = _mm_setzero_ps();
97
98 let v_scale = _mm256_set1_ps(scale);
99
100 let mut src = src;
101 let mut dst = dst;
102
103 let mut src_iter = src.chunks_exact(src_channels * 2);
104 let dst_iter = dst.chunks_exact_mut(dst_channels * 2);
105
106 let (mut r0, mut g0, mut b0, mut a0);
107 let (mut r1, mut g1, mut b1, mut a1);
108
109 if let Some(src) = src_iter.next() {
110 r0 = _mm_broadcast_ss(&self.profile.linear[src[src_cn.r_i()]._as_usize()]);
111 g0 = _mm_broadcast_ss(&self.profile.linear[src[src_cn.g_i()]._as_usize()]);
112 b0 = _mm_broadcast_ss(&self.profile.linear[src[src_cn.b_i()]._as_usize()]);
113 r1 = _mm_broadcast_ss(
114 &self.profile.linear[src[src_cn.r_i() + src_channels]._as_usize()],
115 );
116 g1 = _mm_broadcast_ss(
117 &self.profile.linear[src[src_cn.g_i() + src_channels]._as_usize()],
118 );
119 b1 = _mm_broadcast_ss(
120 &self.profile.linear[src[src_cn.b_i() + src_channels]._as_usize()],
121 );
122 a0 = if src_channels == 4 {
123 src[src_cn.a_i()]
124 } else {
125 max_colors
126 };
127 a1 = if src_channels == 4 {
128 src[src_cn.a_i() + src_channels]
129 } else {
130 max_colors
131 };
132 } else {
133 r0 = _mm_setzero_ps();
134 g0 = _mm_setzero_ps();
135 b0 = _mm_setzero_ps();
136 a0 = max_colors;
137 r1 = _mm_setzero_ps();
138 g1 = _mm_setzero_ps();
139 b1 = _mm_setzero_ps();
140 a1 = max_colors;
141 }
142
143 for (src, dst) in src_iter.zip(dst_iter) {
144 let r = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(r0), r1);
145 let g = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(g0), g1);
146 let b = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(b0), b1);
147
148 let mut v = if FMA {
149 let v0 = _mm256_mul_ps(r, m0);
150 let v1 = _mm256_fmadd_ps(g, m1, v0);
151 _mm256_fmadd_ps(b, m2, v1)
152 } else {
153 let v0 = _mm256_mul_ps(r, m0);
154 let v1 = _mm256_mul_ps(g, m1);
155 let v2 = _mm256_mul_ps(b, m2);
156
157 _mm256_add_ps(_mm256_add_ps(v0, v1), v2)
158 };
159
160 v = _mm256_max_ps(v, _mm256_setzero_ps());
161 v = _mm256_mul_ps(v, v_scale);
162 v = _mm256_min_ps(v, v_scale);
163
164 let zx = _mm256_cvtps_epi32(v);
165 _mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, zx);
166
167 r0 = _mm_broadcast_ss(&self.profile.linear[src[src_cn.r_i()]._as_usize()]);
168 g0 = _mm_broadcast_ss(&self.profile.linear[src[src_cn.g_i()]._as_usize()]);
169 b0 = _mm_broadcast_ss(&self.profile.linear[src[src_cn.b_i()]._as_usize()]);
170 r1 = _mm_broadcast_ss(
171 &self.profile.linear[src[src_cn.r_i() + src_channels]._as_usize()],
172 );
173 g1 = _mm_broadcast_ss(
174 &self.profile.linear[src[src_cn.g_i() + src_channels]._as_usize()],
175 );
176 b1 = _mm_broadcast_ss(
177 &self.profile.linear[src[src_cn.b_i() + src_channels]._as_usize()],
178 );
179
180 dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
181 dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
182 dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
183 if dst_channels == 4 {
184 dst[dst_cn.a_i()] = a0;
185 }
186
187 dst[dst_cn.r_i() + dst_channels] = self.profile.gamma[temporary0.0[8] as usize];
188 dst[dst_cn.g_i() + dst_channels] = self.profile.gamma[temporary0.0[10] as usize];
189 dst[dst_cn.b_i() + dst_channels] = self.profile.gamma[temporary0.0[12] as usize];
190 if dst_channels == 4 {
191 dst[dst_cn.a_i() + dst_channels] = a1;
192 }
193
194 a0 = if src_channels == 4 {
195 src[src_cn.a_i()]
196 } else {
197 max_colors
198 };
199 a1 = if src_channels == 4 {
200 src[src_cn.a_i() + src_channels]
201 } else {
202 max_colors
203 };
204 }
205
206 if let Some(dst) = dst.chunks_exact_mut(dst_channels * 2).last() {
207 let r = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(r0), r1);
208 let g = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(g0), g1);
209 let b = _mm256_insertf128_ps::<1>(_mm256_castps128_ps256(b0), b1);
210
211 let mut v = if FMA {
212 let v0 = _mm256_mul_ps(r, m0);
213 let v1 = _mm256_fmadd_ps(g, m1, v0);
214 _mm256_fmadd_ps(b, m2, v1)
215 } else {
216 let v0 = _mm256_mul_ps(r, m0);
217 let v1 = _mm256_mul_ps(g, m1);
218 let v2 = _mm256_mul_ps(b, m2);
219
220 _mm256_add_ps(_mm256_add_ps(v0, v1), v2)
221 };
222
223 v = _mm256_max_ps(v, _mm256_setzero_ps());
224 v = _mm256_mul_ps(v, v_scale);
225 v = _mm256_min_ps(v, v_scale);
226
227 let zx = _mm256_cvtps_epi32(v);
228 _mm256_store_si256(temporary0.0.as_mut_ptr() as *mut _, zx);
229
230 dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
231 dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
232 dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
233 if dst_channels == 4 {
234 dst[dst_cn.a_i()] = a0;
235 }
236
237 dst[dst_cn.r_i() + dst_channels] = self.profile.gamma[temporary0.0[8] as usize];
238 dst[dst_cn.g_i() + dst_channels] = self.profile.gamma[temporary0.0[10] as usize];
239 dst[dst_cn.b_i() + dst_channels] = self.profile.gamma[temporary0.0[12] as usize];
240 if dst_channels == 4 {
241 dst[dst_cn.a_i() + dst_channels] = a1;
242 }
243 }
244
245 src = src.chunks_exact(src_channels * 2).remainder();
246 dst = dst.chunks_exact_mut(dst_channels * 2).into_remainder();
247
248 for (src, dst) in src
249 .chunks_exact(src_channels)
250 .zip(dst.chunks_exact_mut(dst_channels))
251 {
252 let r = _mm_broadcast_ss(&self.profile.linear[src[src_cn.r_i()]._as_usize()]);
253 let g = _mm_broadcast_ss(&self.profile.linear[src[src_cn.g_i()]._as_usize()]);
254 let b = _mm_broadcast_ss(&self.profile.linear[src[src_cn.b_i()]._as_usize()]);
255 let a = if src_channels == 4 {
256 src[src_cn.a_i()]
257 } else {
258 max_colors
259 };
260
261 let mut v = if FMA {
262 let v0 = _mm_mul_ps(r, _mm256_castps256_ps128(m0));
263 let v1 = _mm_fmadd_ps(g, _mm256_castps256_ps128(m1), v0);
264 _mm_fmadd_ps(b, _mm256_castps256_ps128(m2), v1)
265 } else {
266 let v0 = _mm_mul_ps(r, _mm256_castps256_ps128(m0));
267 let v1 = _mm_mul_ps(g, _mm256_castps256_ps128(m1));
268 let v2 = _mm_mul_ps(b, _mm256_castps256_ps128(m2));
269
270 _mm_add_ps(_mm_add_ps(v0, v1), v2)
271 };
272
273 v = _mm_max_ps(v, zeros);
274 v = _mm_mul_ps(v, _mm256_castps256_ps128(v_scale));
275 v = _mm_min_ps(v, _mm256_castps256_ps128(v_scale));
276
277 let zx = _mm_cvtps_epi32(v);
278 _mm_store_si128(temporary0.0.as_mut_ptr() as *mut _, zx);
279
280 dst[dst_cn.r_i()] = self.profile.gamma[temporary0.0[0] as usize];
281 dst[dst_cn.g_i()] = self.profile.gamma[temporary0.0[2] as usize];
282 dst[dst_cn.b_i()] = self.profile.gamma[temporary0.0[4] as usize];
283 if dst_channels == 4 {
284 dst[dst_cn.a_i()] = a;
285 }
286 }
287 }
288
289 Ok(())
290 }
291
292 #[target_feature(enable = "avx2", enable = "fma")]
293 unsafe fn transform_fma(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
294 unsafe { self.transform_impl::<true>(src, dst) }
295 }
296
297 #[target_feature(enable = "avx2")]
298 unsafe fn transform_avx(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
299 unsafe { self.transform_impl::<false>(src, dst) }
300 }
301}
302
303impl<
304 T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
305 const SRC_LAYOUT: u8,
306 const DST_LAYOUT: u8,
307 const LINEAR_CAP: usize,
308 const GAMMA_LUT: usize,
309> TransformExecutor<T>
310 for TransformShaperRgbOptAvx<T, SRC_LAYOUT, DST_LAYOUT, LINEAR_CAP, GAMMA_LUT>
311where
312 u32: AsPrimitive<T>,
313{
314 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
315 unsafe {
316 if std::arch::is_x86_feature_detected!("fma") {
317 self.transform_fma(src, dst)
318 } else {
319 self.transform_avx(src, dst)
320 }
321 }
322 }
323}