1use crate::conversions::LutBarycentricReduction;
30use crate::conversions::avx::interpolator::*;
31use crate::conversions::avx::interpolator_q0_15::AvxAlignedI16;
32use crate::conversions::avx::lut4_to_3_q0_15::TransformLut4To3AvxQ0_15;
33use crate::conversions::interpolator::BarycentricWeight;
34use crate::conversions::lut_transforms::Lut4x3Factory;
35use crate::transform::PointeeSizeExpressible;
36use crate::{
37 BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
38 TransformExecutor, TransformOptions,
39};
40use num_traits::AsPrimitive;
41use std::arch::x86_64::*;
42use std::marker::PhantomData;
43
44struct TransformLut4To3Avx<
45 T,
46 U,
47 const LAYOUT: u8,
48 const GRID_SIZE: usize,
49 const BIT_DEPTH: usize,
50 const BINS: usize,
51 const BARYCENTRIC_BINS: usize,
52> {
53 lut: Vec<SseAlignedF32>,
54 _phantom: PhantomData<T>,
55 _phantom1: PhantomData<U>,
56 interpolation_method: InterpolationMethod,
57 weights: Box<[BarycentricWeight<f32>; BINS]>,
58 color_space: DataColorSpace,
59 is_linear: bool,
60}
61
62impl<
63 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
64 U: AsPrimitive<usize>,
65 const LAYOUT: u8,
66 const GRID_SIZE: usize,
67 const BIT_DEPTH: usize,
68 const BINS: usize,
69 const BARYCENTRIC_BINS: usize,
70> TransformLut4To3Avx<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
71where
72 f32: AsPrimitive<T>,
73 u32: AsPrimitive<T>,
74 (): LutBarycentricReduction<T, U>,
75{
76 #[allow(unused_unsafe)]
77 #[target_feature(enable = "avx2", enable = "fma")]
78 unsafe fn transform_chunk(
79 &self,
80 src: &[T],
81 dst: &mut [T],
82 interpolator: Box<dyn AvxMdInterpolationDouble + Send + Sync>,
83 ) {
84 let cn = Layout::from(LAYOUT);
85 let channels = cn.channels();
86 let grid_size = GRID_SIZE as i32;
87 let grid_size3 = grid_size * grid_size * grid_size;
88
89 let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
90 let max_value = ((1 << BIT_DEPTH) - 1u32).as_();
91
92 for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(channels)) {
93 let c = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
94 src[0],
95 );
96 let m = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
97 src[1],
98 );
99 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
100 src[2],
101 );
102 let k = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
103 src[3],
104 );
105
106 let k_weights = self.weights[k.as_()];
107
108 let w: i32 = k_weights.x;
109 let w_n: i32 = k_weights.x_n;
110 let t: f32 = k_weights.w;
111
112 let table1 = &self.lut[(w * grid_size3) as usize..];
113 let table2 = &self.lut[(w_n * grid_size3) as usize..];
114
115 let v = interpolator.inter3_sse(
116 table1,
117 table2,
118 c.as_(),
119 m.as_(),
120 y.as_(),
121 self.weights.as_slice(),
122 );
123 let (a0, b0) = (v.0.v, v.1.v);
124
125 if T::FINITE {
126 unsafe {
127 let t0 = _mm_set1_ps(t);
128 let hp = _mm_fnmadd_ps(a0, t0, a0);
129 let mut v = _mm_fmadd_ps(b0, t0, hp);
130 v = _mm_max_ps(v, _mm_setzero_ps());
131 v = _mm_mul_ps(v, value_scale);
132 v = _mm_min_ps(v, value_scale);
133 let jvz = _mm_cvtps_epi32(v);
134
135 let x = _mm_extract_epi32::<0>(jvz);
136 let y = _mm_extract_epi32::<1>(jvz);
137 let z = _mm_extract_epi32::<2>(jvz);
138
139 dst[cn.r_i()] = (x as u32).as_();
140 dst[cn.g_i()] = (y as u32).as_();
141 dst[cn.b_i()] = (z as u32).as_();
142 }
143 } else {
144 unsafe {
145 let t0 = _mm_set1_ps(t);
146 let hp = _mm_fnmadd_ps(a0, t0, a0);
147 let v = _mm_fmadd_ps(b0, t0, hp);
148 dst[cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v) as u32).as_();
149 dst[cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v) as u32).as_();
150 dst[cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v) as u32).as_();
151 }
152 }
153 if channels == 4 {
154 dst[cn.a_i()] = max_value;
155 }
156 }
157 }
158}
159
160impl<
161 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
162 U: AsPrimitive<usize>,
163 const LAYOUT: u8,
164 const GRID_SIZE: usize,
165 const BIT_DEPTH: usize,
166 const BINS: usize,
167 const BARYCENTRIC_BINS: usize,
168> TransformExecutor<T>
169 for TransformLut4To3Avx<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
170where
171 f32: AsPrimitive<T>,
172 u32: AsPrimitive<T>,
173 (): LutBarycentricReduction<T, U>,
174{
175 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
176 let cn = Layout::from(LAYOUT);
177 let channels = cn.channels();
178 if src.len() % 4 != 0 {
179 return Err(CmsError::LaneMultipleOfChannels);
180 }
181 if dst.len() % channels != 0 {
182 return Err(CmsError::LaneMultipleOfChannels);
183 }
184 let src_chunks = src.len() / 4;
185 let dst_chunks = dst.len() / channels;
186 if src_chunks != dst_chunks {
187 return Err(CmsError::LaneSizeMismatch);
188 }
189
190 unsafe {
191 if self.color_space == DataColorSpace::Lab
192 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
193 || self.color_space == DataColorSpace::Xyz
194 {
195 self.transform_chunk(src, dst, Box::new(TrilinearAvxFmaDouble::<GRID_SIZE> {}));
196 } else {
197 match self.interpolation_method {
198 #[cfg(feature = "options")]
199 InterpolationMethod::Tetrahedral => {
200 self.transform_chunk(
201 src,
202 dst,
203 Box::new(TetrahedralAvxFmaDouble::<GRID_SIZE> {}),
204 );
205 }
206 #[cfg(feature = "options")]
207 InterpolationMethod::Pyramid => {
208 self.transform_chunk(
209 src,
210 dst,
211 Box::new(PyramidAvxFmaDouble::<GRID_SIZE> {}),
212 );
213 }
214 #[cfg(feature = "options")]
215 InterpolationMethod::Prism => {
216 self.transform_chunk(
217 src,
218 dst,
219 Box::new(PrismaticAvxFmaDouble::<GRID_SIZE> {}),
220 );
221 }
222 InterpolationMethod::Linear => {
223 self.transform_chunk(
224 src,
225 dst,
226 Box::new(TrilinearAvxFmaDouble::<GRID_SIZE> {}),
227 );
228 }
229 }
230 }
231 }
232
233 Ok(())
234 }
235}
236
237pub(crate) struct AvxLut4x3Factory {}
238
239impl Lut4x3Factory for AvxLut4x3Factory {
240 fn make_transform_4x3<
241 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
242 const LAYOUT: u8,
243 const GRID_SIZE: usize,
244 const BIT_DEPTH: usize,
245 >(
246 lut: Vec<f32>,
247 options: TransformOptions,
248 color_space: DataColorSpace,
249 is_linear: bool,
250 ) -> Box<dyn TransformExecutor<T> + Send + Sync>
251 where
252 f32: AsPrimitive<T>,
253 u32: AsPrimitive<T>,
254 (): LutBarycentricReduction<T, u8>,
255 (): LutBarycentricReduction<T, u16>,
256 {
257 if options.prefer_fixed_point && BIT_DEPTH < 16 {
258 let q: f32 = if T::FINITE {
259 ((1i32 << BIT_DEPTH as i32) - 1) as f32
260 } else {
261 ((1i32 << 14i32) - 1) as f32
262 };
263 let lut = lut
264 .chunks_exact(3)
265 .map(|x| {
266 AvxAlignedI16([
267 (x[0] * q).round() as i16,
268 (x[1] * q).round() as i16,
269 (x[2] * q).round() as i16,
270 0,
271 ])
272 })
273 .collect::<Vec<_>>();
274 return match options.barycentric_weight_scale {
275 BarycentricWeightScale::Low => Box::new(TransformLut4To3AvxQ0_15::<
276 T,
277 u8,
278 LAYOUT,
279 GRID_SIZE,
280 BIT_DEPTH,
281 256,
282 256,
283 > {
284 lut,
285 interpolation_method: options.interpolation_method,
286 weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
287 _phantom: PhantomData,
288 _phantom1: PhantomData,
289 color_space,
290 is_linear,
291 }),
292 #[cfg(feature = "options")]
293 BarycentricWeightScale::High => Box::new(TransformLut4To3AvxQ0_15::<
294 T,
295 u16,
296 LAYOUT,
297 GRID_SIZE,
298 BIT_DEPTH,
299 65536,
300 65536,
301 > {
302 lut,
303 interpolation_method: options.interpolation_method,
304 weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
305 _phantom: PhantomData,
306 _phantom1: PhantomData,
307 color_space,
308 is_linear,
309 }),
310 };
311 }
312 assert!(
313 std::arch::is_x86_feature_detected!("fma"),
314 "Internal configuration error, this feature might not be called without `fma` feature"
315 );
316 let lut = lut
317 .chunks_exact(3)
318 .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
319 .collect::<Vec<_>>();
320 match options.barycentric_weight_scale {
321 BarycentricWeightScale::Low => {
322 Box::new(
323 TransformLut4To3Avx::<T, u8, LAYOUT, GRID_SIZE, BIT_DEPTH, 256, 256> {
324 lut,
325 interpolation_method: options.interpolation_method,
326 weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
327 _phantom: PhantomData,
328 _phantom1: PhantomData,
329 color_space,
330 is_linear,
331 },
332 )
333 }
334 #[cfg(feature = "options")]
335 BarycentricWeightScale::High => {
336 Box::new(
337 TransformLut4To3Avx::<T, u16, LAYOUT, GRID_SIZE, BIT_DEPTH, 65536, 65536> {
338 lut,
339 interpolation_method: options.interpolation_method,
340 weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
341 _phantom: PhantomData,
342 _phantom1: PhantomData,
343 color_space,
344 is_linear,
345 },
346 )
347 }
348 }
349 }
350}