1use crate::conversions::LutBarycentricReduction;
30use crate::conversions::avx::interpolator::*;
31use crate::conversions::avx::interpolator_q0_15::AvxAlignedI16;
32use crate::conversions::avx::t_lut3_to_3_q0_15::TransformLut3x3AvxQ0_15;
33use crate::conversions::interpolator::BarycentricWeight;
34use crate::conversions::lut_transforms::Lut3x3Factory;
35use crate::transform::PointeeSizeExpressible;
36use crate::{
37 BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
38 TransformExecutor, TransformOptions,
39};
40use num_traits::AsPrimitive;
41use std::arch::x86_64::*;
42use std::marker::PhantomData;
43
44struct TransformLut3x3AvxFma<
45 T,
46 U,
47 const SRC_LAYOUT: u8,
48 const DST_LAYOUT: u8,
49 const GRID_SIZE: usize,
50 const BIT_DEPTH: usize,
51 const BINS: usize,
52 const BARYCENTRIC_BINS: usize,
53> {
54 lut: Vec<SseAlignedF32>,
55 _phantom: PhantomData<T>,
56 _phantom2: PhantomData<U>,
57 interpolation_method: InterpolationMethod,
58 weights: Box<[BarycentricWeight<f32>; BINS]>,
59 color_space: DataColorSpace,
60 is_linear: bool,
61}
62
63impl<
64 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
65 U: AsPrimitive<usize>,
66 const SRC_LAYOUT: u8,
67 const DST_LAYOUT: u8,
68 const GRID_SIZE: usize,
69 const BIT_DEPTH: usize,
70 const BINS: usize,
71 const BARYCENTRIC_BINS: usize,
72> TransformLut3x3AvxFma<T, U, SRC_LAYOUT, DST_LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
73where
74 f32: AsPrimitive<T>,
75 u32: AsPrimitive<T>,
76 (): LutBarycentricReduction<T, U>,
77{
78 #[allow(unused_unsafe)]
79 #[target_feature(enable = "avx2", enable = "fma")]
80 unsafe fn transform_chunk<'b, Interpolator: AvxMdInterpolation<'b, GRID_SIZE>>(
81 &'b self,
82 src: &[T],
83 dst: &mut [T],
84 ) {
85 let src_cn = Layout::from(SRC_LAYOUT);
86 let src_channels = src_cn.channels();
87
88 let dst_cn = Layout::from(DST_LAYOUT);
89 let dst_channels = dst_cn.channels();
90
91 let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
92 let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
93
94 for (src, dst) in src
95 .chunks_exact(src_channels)
96 .zip(dst.chunks_exact_mut(dst_channels))
97 {
98 let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
99 src[src_cn.r_i()],
100 );
101 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
102 src[src_cn.g_i()],
103 );
104 let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
105 src[src_cn.b_i()],
106 );
107
108 let a = if src_channels == 4 {
109 src[src_cn.a_i()]
110 } else {
111 max_value
112 };
113
114 let tetrahedral = Interpolator::new(&self.lut);
115 let v = tetrahedral.inter3_sse(x, y, z, &self.weights);
116 if T::FINITE {
117 unsafe {
118 let mut r = _mm_mul_ps(v.v, value_scale);
119 r = _mm_max_ps(r, _mm_setzero_ps());
120 r = _mm_min_ps(r, value_scale);
121 let jvz = _mm_cvtps_epi32(r);
122
123 let x = _mm_extract_epi32::<0>(jvz);
124 let y = _mm_extract_epi32::<1>(jvz);
125 let z = _mm_extract_epi32::<2>(jvz);
126
127 dst[dst_cn.r_i()] = (x as u32).as_();
128 dst[dst_cn.g_i()] = (y as u32).as_();
129 dst[dst_cn.b_i()] = (z as u32).as_();
130 }
131 } else {
132 unsafe {
133 dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v.v) as u32).as_();
134 dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v.v) as u32).as_();
135 dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v.v) as u32).as_();
136 }
137 }
138 if dst_channels == 4 {
139 dst[dst_cn.a_i()] = a;
140 }
141 }
142 }
143}
144
145impl<
146 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
147 U: AsPrimitive<usize>,
148 const SRC_LAYOUT: u8,
149 const DST_LAYOUT: u8,
150 const GRID_SIZE: usize,
151 const BIT_DEPTH: usize,
152 const BINS: usize,
153 const BARYCENTRIC_BINS: usize,
154> TransformExecutor<T>
155 for TransformLut3x3AvxFma<
156 T,
157 U,
158 SRC_LAYOUT,
159 DST_LAYOUT,
160 GRID_SIZE,
161 BIT_DEPTH,
162 BINS,
163 BARYCENTRIC_BINS,
164 >
165where
166 f32: AsPrimitive<T>,
167 u32: AsPrimitive<T>,
168 (): LutBarycentricReduction<T, U>,
169{
170 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
171 let src_cn = Layout::from(SRC_LAYOUT);
172 let src_channels = src_cn.channels();
173
174 let dst_cn = Layout::from(DST_LAYOUT);
175 let dst_channels = dst_cn.channels();
176 if src.len() % src_channels != 0 {
177 return Err(CmsError::LaneMultipleOfChannels);
178 }
179 if dst.len() % dst_channels != 0 {
180 return Err(CmsError::LaneMultipleOfChannels);
181 }
182 let src_chunks = src.len() / src_channels;
183 let dst_chunks = dst.len() / dst_channels;
184 if src_chunks != dst_chunks {
185 return Err(CmsError::LaneSizeMismatch);
186 }
187
188 unsafe {
189 if self.color_space == DataColorSpace::Lab
190 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
191 || self.color_space == DataColorSpace::Xyz
192 {
193 self.transform_chunk::<TrilinearAvxFma<GRID_SIZE>>(src, dst);
194 } else {
195 match self.interpolation_method {
196 #[cfg(feature = "options")]
197 InterpolationMethod::Tetrahedral => {
198 self.transform_chunk::<TetrahedralAvxFma<GRID_SIZE>>(src, dst);
199 }
200 #[cfg(feature = "options")]
201 InterpolationMethod::Pyramid => {
202 self.transform_chunk::<PyramidalAvxFma<GRID_SIZE>>(src, dst);
203 }
204 #[cfg(feature = "options")]
205 InterpolationMethod::Prism => {
206 self.transform_chunk::<PrismaticAvxFma<GRID_SIZE>>(src, dst);
207 }
208 InterpolationMethod::Linear => {
209 self.transform_chunk::<TrilinearAvxFma<GRID_SIZE>>(src, dst);
210 }
211 }
212 }
213 }
214 Ok(())
215 }
216}
217
218pub(crate) struct AvxLut3x3Factory {}
219
220impl Lut3x3Factory for AvxLut3x3Factory {
221 fn make_transform_3x3<
222 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
223 const SRC_LAYOUT: u8,
224 const DST_LAYOUT: u8,
225 const GRID_SIZE: usize,
226 const BIT_DEPTH: usize,
227 >(
228 lut: Vec<f32>,
229 options: TransformOptions,
230 color_space: DataColorSpace,
231 is_linear: bool,
232 ) -> Box<dyn TransformExecutor<T> + Send + Sync>
233 where
234 f32: AsPrimitive<T>,
235 u32: AsPrimitive<T>,
236 (): LutBarycentricReduction<T, u8>,
237 (): LutBarycentricReduction<T, u16>,
238 {
239 if options.prefer_fixed_point && BIT_DEPTH < 16 {
240 let q: f32 = if T::FINITE {
241 ((1i32 << BIT_DEPTH as i32) - 1) as f32
242 } else {
243 ((1i32 << 14i32) - 1) as f32
244 };
245 let lut = lut
246 .chunks_exact(3)
247 .map(|x| {
248 AvxAlignedI16([
249 (x[0] * q).round() as i16,
250 (x[1] * q).round() as i16,
251 (x[2] * q).round() as i16,
252 0,
253 ])
254 })
255 .collect::<Vec<_>>();
256 return match options.barycentric_weight_scale {
257 BarycentricWeightScale::Low => Box::new(TransformLut3x3AvxQ0_15::<
258 T,
259 u8,
260 SRC_LAYOUT,
261 DST_LAYOUT,
262 GRID_SIZE,
263 BIT_DEPTH,
264 256,
265 256,
266 > {
267 lut,
268 _phantom: PhantomData,
269 _phantom2: PhantomData,
270 interpolation_method: options.interpolation_method,
271 weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
272 color_space,
273 is_linear,
274 }),
275 #[cfg(feature = "options")]
276 BarycentricWeightScale::High => Box::new(TransformLut3x3AvxQ0_15::<
277 T,
278 u16,
279 SRC_LAYOUT,
280 DST_LAYOUT,
281 GRID_SIZE,
282 BIT_DEPTH,
283 65536,
284 65536,
285 > {
286 lut,
287 _phantom: PhantomData,
288 _phantom2: PhantomData,
289 interpolation_method: options.interpolation_method,
290 weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
291 color_space,
292 is_linear,
293 }),
294 };
295 }
296 assert!(
297 std::arch::is_x86_feature_detected!("fma"),
298 "Internal configuration error, this might not be called without `fma` feature"
299 );
300 let lut = lut
301 .chunks_exact(3)
302 .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
303 .collect::<Vec<_>>();
304 match options.barycentric_weight_scale {
305 BarycentricWeightScale::Low => Box::new(TransformLut3x3AvxFma::<
306 T,
307 u8,
308 SRC_LAYOUT,
309 DST_LAYOUT,
310 GRID_SIZE,
311 BIT_DEPTH,
312 256,
313 256,
314 > {
315 lut,
316 _phantom: PhantomData,
317 _phantom2: PhantomData,
318 interpolation_method: options.interpolation_method,
319 weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
320 color_space,
321 is_linear,
322 }),
323 #[cfg(feature = "options")]
324 BarycentricWeightScale::High => Box::new(TransformLut3x3AvxFma::<
325 T,
326 u16,
327 SRC_LAYOUT,
328 DST_LAYOUT,
329 GRID_SIZE,
330 BIT_DEPTH,
331 65536,
332 65536,
333 > {
334 lut,
335 _phantom: PhantomData,
336 _phantom2: PhantomData,
337 interpolation_method: options.interpolation_method,
338 weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
339 color_space,
340 is_linear,
341 }),
342 }
343 }
344}