1use crate::conversions::LutBarycentricReduction;
30use crate::conversions::avx::interpolator::*;
31use crate::conversions::avx::interpolator_q0_15::AvxAlignedI16;
32use crate::conversions::avx::t_lut3_to_3_q0_15::TransformLut3x3AvxQ0_15;
33use crate::conversions::interpolator::BarycentricWeight;
34use crate::conversions::lut_transforms::Lut3x3Factory;
35use crate::transform::PointeeSizeExpressible;
36use crate::{
37 BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
38 TransformExecutor, TransformOptions,
39};
40use num_traits::AsPrimitive;
41use std::arch::x86_64::*;
42use std::marker::PhantomData;
43
44struct TransformLut3x3AvxFma<
45 T,
46 U,
47 const SRC_LAYOUT: u8,
48 const DST_LAYOUT: u8,
49 const GRID_SIZE: usize,
50 const BIT_DEPTH: usize,
51 const BINS: usize,
52 const BARYCENTRIC_BINS: usize,
53> {
54 lut: Vec<SseAlignedF32>,
55 _phantom: PhantomData<T>,
56 _phantom2: PhantomData<U>,
57 interpolation_method: InterpolationMethod,
58 weights: Box<[BarycentricWeight<f32>; BINS]>,
59 color_space: DataColorSpace,
60 is_linear: bool,
61}
62
63impl<
64 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
65 U: AsPrimitive<usize>,
66 const SRC_LAYOUT: u8,
67 const DST_LAYOUT: u8,
68 const GRID_SIZE: usize,
69 const BIT_DEPTH: usize,
70 const BINS: usize,
71 const BARYCENTRIC_BINS: usize,
72> TransformLut3x3AvxFma<T, U, SRC_LAYOUT, DST_LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
73where
74 f32: AsPrimitive<T>,
75 u32: AsPrimitive<T>,
76 (): LutBarycentricReduction<T, U>,
77{
78 #[allow(unused_unsafe)]
79 #[target_feature(enable = "avx2", enable = "fma")]
80 unsafe fn transform_chunk(
81 &self,
82 src: &[T],
83 dst: &mut [T],
84 interpolator: Box<dyn AvxMdInterpolation + Send + Sync>,
85 ) {
86 let src_cn = Layout::from(SRC_LAYOUT);
87 let src_channels = src_cn.channels();
88
89 let dst_cn = Layout::from(DST_LAYOUT);
90 let dst_channels = dst_cn.channels();
91
92 let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
93 let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
94
95 for (src, dst) in src
96 .chunks_exact(src_channels)
97 .zip(dst.chunks_exact_mut(dst_channels))
98 {
99 let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
100 src[src_cn.r_i()],
101 );
102 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
103 src[src_cn.g_i()],
104 );
105 let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
106 src[src_cn.b_i()],
107 );
108
109 let a = if src_channels == 4 {
110 src[src_cn.a_i()]
111 } else {
112 max_value
113 };
114
115 let v = interpolator.inter3_sse(
116 &self.lut,
117 x.as_(),
118 y.as_(),
119 z.as_(),
120 self.weights.as_slice(),
121 );
122 if T::FINITE {
123 unsafe {
124 let mut r = _mm_mul_ps(v.v, value_scale);
125 r = _mm_max_ps(r, _mm_setzero_ps());
126 r = _mm_min_ps(r, value_scale);
127 let jvz = _mm_cvtps_epi32(r);
128
129 let x = _mm_extract_epi32::<0>(jvz);
130 let y = _mm_extract_epi32::<1>(jvz);
131 let z = _mm_extract_epi32::<2>(jvz);
132
133 dst[dst_cn.r_i()] = (x as u32).as_();
134 dst[dst_cn.g_i()] = (y as u32).as_();
135 dst[dst_cn.b_i()] = (z as u32).as_();
136 }
137 } else {
138 unsafe {
139 dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v.v) as u32).as_();
140 dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v.v) as u32).as_();
141 dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v.v) as u32).as_();
142 }
143 }
144 if dst_channels == 4 {
145 dst[dst_cn.a_i()] = a;
146 }
147 }
148 }
149}
150
151impl<
152 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
153 U: AsPrimitive<usize>,
154 const SRC_LAYOUT: u8,
155 const DST_LAYOUT: u8,
156 const GRID_SIZE: usize,
157 const BIT_DEPTH: usize,
158 const BINS: usize,
159 const BARYCENTRIC_BINS: usize,
160> TransformExecutor<T>
161 for TransformLut3x3AvxFma<
162 T,
163 U,
164 SRC_LAYOUT,
165 DST_LAYOUT,
166 GRID_SIZE,
167 BIT_DEPTH,
168 BINS,
169 BARYCENTRIC_BINS,
170 >
171where
172 f32: AsPrimitive<T>,
173 u32: AsPrimitive<T>,
174 (): LutBarycentricReduction<T, U>,
175{
176 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
177 let src_cn = Layout::from(SRC_LAYOUT);
178 let src_channels = src_cn.channels();
179
180 let dst_cn = Layout::from(DST_LAYOUT);
181 let dst_channels = dst_cn.channels();
182 if src.len() % src_channels != 0 {
183 return Err(CmsError::LaneMultipleOfChannels);
184 }
185 if dst.len() % dst_channels != 0 {
186 return Err(CmsError::LaneMultipleOfChannels);
187 }
188 let src_chunks = src.len() / src_channels;
189 let dst_chunks = dst.len() / dst_channels;
190 if src_chunks != dst_chunks {
191 return Err(CmsError::LaneSizeMismatch);
192 }
193
194 unsafe {
195 if self.color_space == DataColorSpace::Lab
196 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
197 || self.color_space == DataColorSpace::Xyz
198 {
199 self.transform_chunk(src, dst, Box::new(TrilinearAvxFma::<GRID_SIZE> {}));
200 } else {
201 match self.interpolation_method {
202 #[cfg(feature = "options")]
203 InterpolationMethod::Tetrahedral => {
204 self.transform_chunk(src, dst, Box::new(TetrahedralAvxFma::<GRID_SIZE> {}));
205 }
206 #[cfg(feature = "options")]
207 InterpolationMethod::Pyramid => {
208 self.transform_chunk(src, dst, Box::new(PyramidalAvxFma::<GRID_SIZE> {}));
209 }
210 #[cfg(feature = "options")]
211 InterpolationMethod::Prism => {
212 self.transform_chunk(src, dst, Box::new(PrismaticAvxFma::<GRID_SIZE> {}));
213 }
214 InterpolationMethod::Linear => {
215 self.transform_chunk(src, dst, Box::new(TrilinearAvxFma::<GRID_SIZE> {}));
216 }
217 }
218 }
219 }
220 Ok(())
221 }
222}
223
224pub(crate) struct AvxLut3x3Factory {}
225
226impl Lut3x3Factory for AvxLut3x3Factory {
227 fn make_transform_3x3<
228 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
229 const SRC_LAYOUT: u8,
230 const DST_LAYOUT: u8,
231 const GRID_SIZE: usize,
232 const BIT_DEPTH: usize,
233 >(
234 lut: Vec<f32>,
235 options: TransformOptions,
236 color_space: DataColorSpace,
237 is_linear: bool,
238 ) -> Box<dyn TransformExecutor<T> + Send + Sync>
239 where
240 f32: AsPrimitive<T>,
241 u32: AsPrimitive<T>,
242 (): LutBarycentricReduction<T, u8>,
243 (): LutBarycentricReduction<T, u16>,
244 {
245 if options.prefer_fixed_point && BIT_DEPTH < 16 {
246 let q: f32 = if T::FINITE {
247 ((1i32 << BIT_DEPTH as i32) - 1) as f32
248 } else {
249 ((1i32 << 14i32) - 1) as f32
250 };
251 let lut = lut
252 .chunks_exact(3)
253 .map(|x| {
254 AvxAlignedI16([
255 (x[0] * q).round() as i16,
256 (x[1] * q).round() as i16,
257 (x[2] * q).round() as i16,
258 0,
259 ])
260 })
261 .collect::<Vec<_>>();
262 return match options.barycentric_weight_scale {
263 BarycentricWeightScale::Low => Box::new(TransformLut3x3AvxQ0_15::<
264 T,
265 u8,
266 SRC_LAYOUT,
267 DST_LAYOUT,
268 GRID_SIZE,
269 BIT_DEPTH,
270 256,
271 256,
272 > {
273 lut,
274 _phantom: PhantomData,
275 _phantom2: PhantomData,
276 interpolation_method: options.interpolation_method,
277 weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
278 color_space,
279 is_linear,
280 }),
281 #[cfg(feature = "options")]
282 BarycentricWeightScale::High => Box::new(TransformLut3x3AvxQ0_15::<
283 T,
284 u16,
285 SRC_LAYOUT,
286 DST_LAYOUT,
287 GRID_SIZE,
288 BIT_DEPTH,
289 65536,
290 65536,
291 > {
292 lut,
293 _phantom: PhantomData,
294 _phantom2: PhantomData,
295 interpolation_method: options.interpolation_method,
296 weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
297 color_space,
298 is_linear,
299 }),
300 };
301 }
302 assert!(
303 std::arch::is_x86_feature_detected!("fma"),
304 "Internal configuration error, this might not be called without `fma` feature"
305 );
306 let lut = lut
307 .chunks_exact(3)
308 .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
309 .collect::<Vec<_>>();
310 match options.barycentric_weight_scale {
311 BarycentricWeightScale::Low => Box::new(TransformLut3x3AvxFma::<
312 T,
313 u8,
314 SRC_LAYOUT,
315 DST_LAYOUT,
316 GRID_SIZE,
317 BIT_DEPTH,
318 256,
319 256,
320 > {
321 lut,
322 _phantom: PhantomData,
323 _phantom2: PhantomData,
324 interpolation_method: options.interpolation_method,
325 weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
326 color_space,
327 is_linear,
328 }),
329 #[cfg(feature = "options")]
330 BarycentricWeightScale::High => Box::new(TransformLut3x3AvxFma::<
331 T,
332 u16,
333 SRC_LAYOUT,
334 DST_LAYOUT,
335 GRID_SIZE,
336 BIT_DEPTH,
337 65536,
338 65536,
339 > {
340 lut,
341 _phantom: PhantomData,
342 _phantom2: PhantomData,
343 interpolation_method: options.interpolation_method,
344 weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
345 color_space,
346 is_linear,
347 }),
348 }
349 }
350}