1use crate::conversions::LutBarycentricReduction;
30use crate::conversions::avx::interpolator::*;
31use crate::conversions::avx::interpolator_q0_15::AvxAlignedI16;
32use crate::conversions::avx::lut4_to_3_q0_15::TransformLut4To3AvxQ0_15;
33use crate::conversions::interpolator::BarycentricWeight;
34use crate::conversions::lut_transforms::Lut4x3Factory;
35use crate::transform::PointeeSizeExpressible;
36use crate::{
37 BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
38 TransformExecutor, TransformOptions,
39};
40use num_traits::AsPrimitive;
41use std::arch::x86_64::*;
42use std::marker::PhantomData;
43
44struct TransformLut4To3Avx<
45 T,
46 U,
47 const LAYOUT: u8,
48 const GRID_SIZE: usize,
49 const BIT_DEPTH: usize,
50 const BINS: usize,
51 const BARYCENTRIC_BINS: usize,
52> {
53 lut: Vec<SseAlignedF32>,
54 _phantom: PhantomData<T>,
55 _phantom1: PhantomData<U>,
56 interpolation_method: InterpolationMethod,
57 weights: Box<[BarycentricWeight<f32>; BINS]>,
58 color_space: DataColorSpace,
59 is_linear: bool,
60}
61
62impl<
63 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
64 U: AsPrimitive<usize>,
65 const LAYOUT: u8,
66 const GRID_SIZE: usize,
67 const BIT_DEPTH: usize,
68 const BINS: usize,
69 const BARYCENTRIC_BINS: usize,
70> TransformLut4To3Avx<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
71where
72 f32: AsPrimitive<T>,
73 u32: AsPrimitive<T>,
74 (): LutBarycentricReduction<T, U>,
75{
76 #[allow(unused_unsafe)]
77 #[target_feature(enable = "avx2", enable = "fma")]
78 unsafe fn transform_chunk<'b, Interpolator: AvxMdInterpolationDouble<'b, GRID_SIZE>>(
79 &'b self,
80 src: &[T],
81 dst: &mut [T],
82 ) {
83 let cn = Layout::from(LAYOUT);
84 let channels = cn.channels();
85 let grid_size = GRID_SIZE as i32;
86 let grid_size3 = grid_size * grid_size * grid_size;
87
88 let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
89 let max_value = ((1 << BIT_DEPTH) - 1u32).as_();
90
91 for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(channels)) {
92 let c = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
93 src[0],
94 );
95 let m = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
96 src[1],
97 );
98 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
99 src[2],
100 );
101 let k = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
102 src[3],
103 );
104
105 let k_weights = self.weights[k.as_()];
106
107 let w: i32 = k_weights.x;
108 let w_n: i32 = k_weights.x_n;
109 let t: f32 = k_weights.w;
110
111 let table1 = &self.lut[(w * grid_size3) as usize..];
112 let table2 = &self.lut[(w_n * grid_size3) as usize..];
113
114 let interpolator = Interpolator::new(table1, table2);
115 let v = interpolator.inter3_sse(c, m, y, &self.weights);
116 let (a0, b0) = (v.0.v, v.1.v);
117
118 if T::FINITE {
119 unsafe {
120 let t0 = _mm_set1_ps(t);
121 let hp = _mm_fnmadd_ps(a0, t0, a0);
122 let mut v = _mm_fmadd_ps(b0, t0, hp);
123 v = _mm_max_ps(v, _mm_setzero_ps());
124 v = _mm_mul_ps(v, value_scale);
125 v = _mm_min_ps(v, value_scale);
126 let jvz = _mm_cvtps_epi32(v);
127
128 let x = _mm_extract_epi32::<0>(jvz);
129 let y = _mm_extract_epi32::<1>(jvz);
130 let z = _mm_extract_epi32::<2>(jvz);
131
132 dst[cn.r_i()] = (x as u32).as_();
133 dst[cn.g_i()] = (y as u32).as_();
134 dst[cn.b_i()] = (z as u32).as_();
135 }
136 } else {
137 unsafe {
138 let t0 = _mm_set1_ps(t);
139 let hp = _mm_fnmadd_ps(a0, t0, a0);
140 let v = _mm_fmadd_ps(b0, t0, hp);
141 dst[cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v) as u32).as_();
142 dst[cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v) as u32).as_();
143 dst[cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v) as u32).as_();
144 }
145 }
146 if channels == 4 {
147 dst[cn.a_i()] = max_value;
148 }
149 }
150 }
151}
152
153impl<
154 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
155 U: AsPrimitive<usize>,
156 const LAYOUT: u8,
157 const GRID_SIZE: usize,
158 const BIT_DEPTH: usize,
159 const BINS: usize,
160 const BARYCENTRIC_BINS: usize,
161> TransformExecutor<T>
162 for TransformLut4To3Avx<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
163where
164 f32: AsPrimitive<T>,
165 u32: AsPrimitive<T>,
166 (): LutBarycentricReduction<T, U>,
167{
168 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
169 let cn = Layout::from(LAYOUT);
170 let channels = cn.channels();
171 if src.len() % 4 != 0 {
172 return Err(CmsError::LaneMultipleOfChannels);
173 }
174 if dst.len() % channels != 0 {
175 return Err(CmsError::LaneMultipleOfChannels);
176 }
177 let src_chunks = src.len() / 4;
178 let dst_chunks = dst.len() / channels;
179 if src_chunks != dst_chunks {
180 return Err(CmsError::LaneSizeMismatch);
181 }
182
183 unsafe {
184 if self.color_space == DataColorSpace::Lab
185 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
186 || self.color_space == DataColorSpace::Xyz
187 {
188 self.transform_chunk::<TrilinearAvxFmaDouble<GRID_SIZE>>(src, dst);
189 } else {
190 match self.interpolation_method {
191 #[cfg(feature = "options")]
192 InterpolationMethod::Tetrahedral => {
193 self.transform_chunk::<TetrahedralAvxFmaDouble<GRID_SIZE>>(src, dst);
194 }
195 #[cfg(feature = "options")]
196 InterpolationMethod::Pyramid => {
197 self.transform_chunk::<PyramidAvxFmaDouble<GRID_SIZE>>(src, dst);
198 }
199 #[cfg(feature = "options")]
200 InterpolationMethod::Prism => {
201 self.transform_chunk::<PrismaticAvxFmaDouble<GRID_SIZE>>(src, dst);
202 }
203 InterpolationMethod::Linear => {
204 self.transform_chunk::<TrilinearAvxFmaDouble<GRID_SIZE>>(src, dst);
205 }
206 }
207 }
208 }
209
210 Ok(())
211 }
212}
213
214pub(crate) struct AvxLut4x3Factory {}
215
216impl Lut4x3Factory for AvxLut4x3Factory {
217 fn make_transform_4x3<
218 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
219 const LAYOUT: u8,
220 const GRID_SIZE: usize,
221 const BIT_DEPTH: usize,
222 >(
223 lut: Vec<f32>,
224 options: TransformOptions,
225 color_space: DataColorSpace,
226 is_linear: bool,
227 ) -> Box<dyn TransformExecutor<T> + Send + Sync>
228 where
229 f32: AsPrimitive<T>,
230 u32: AsPrimitive<T>,
231 (): LutBarycentricReduction<T, u8>,
232 (): LutBarycentricReduction<T, u16>,
233 {
234 if options.prefer_fixed_point && BIT_DEPTH < 16 {
235 let q: f32 = if T::FINITE {
236 ((1i32 << BIT_DEPTH as i32) - 1) as f32
237 } else {
238 ((1i32 << 14i32) - 1) as f32
239 };
240 let lut = lut
241 .chunks_exact(3)
242 .map(|x| {
243 AvxAlignedI16([
244 (x[0] * q).round() as i16,
245 (x[1] * q).round() as i16,
246 (x[2] * q).round() as i16,
247 0,
248 ])
249 })
250 .collect::<Vec<_>>();
251 return match options.barycentric_weight_scale {
252 BarycentricWeightScale::Low => Box::new(TransformLut4To3AvxQ0_15::<
253 T,
254 u8,
255 LAYOUT,
256 GRID_SIZE,
257 BIT_DEPTH,
258 256,
259 256,
260 > {
261 lut,
262 interpolation_method: options.interpolation_method,
263 weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
264 _phantom: PhantomData,
265 _phantom1: PhantomData,
266 color_space,
267 is_linear,
268 }),
269 #[cfg(feature = "options")]
270 BarycentricWeightScale::High => Box::new(TransformLut4To3AvxQ0_15::<
271 T,
272 u16,
273 LAYOUT,
274 GRID_SIZE,
275 BIT_DEPTH,
276 65536,
277 65536,
278 > {
279 lut,
280 interpolation_method: options.interpolation_method,
281 weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
282 _phantom: PhantomData,
283 _phantom1: PhantomData,
284 color_space,
285 is_linear,
286 }),
287 };
288 }
289 assert!(
290 std::arch::is_x86_feature_detected!("fma"),
291 "Internal configuration error, this might not be called without `fma` feature"
292 );
293 let lut = lut
294 .chunks_exact(3)
295 .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
296 .collect::<Vec<_>>();
297 match options.barycentric_weight_scale {
298 BarycentricWeightScale::Low => {
299 Box::new(
300 TransformLut4To3Avx::<T, u8, LAYOUT, GRID_SIZE, BIT_DEPTH, 256, 256> {
301 lut,
302 interpolation_method: options.interpolation_method,
303 weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
304 _phantom: PhantomData,
305 _phantom1: PhantomData,
306 color_space,
307 is_linear,
308 },
309 )
310 }
311 #[cfg(feature = "options")]
312 BarycentricWeightScale::High => {
313 Box::new(
314 TransformLut4To3Avx::<T, u16, LAYOUT, GRID_SIZE, BIT_DEPTH, 65536, 65536> {
315 lut,
316 interpolation_method: options.interpolation_method,
317 weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
318 _phantom: PhantomData,
319 _phantom1: PhantomData,
320 color_space,
321 is_linear,
322 },
323 )
324 }
325 }
326 }
327}