1use crate::conversions::LutBarycentricReduction;
30use crate::conversions::interpolator::BarycentricWeight;
31use crate::conversions::lut_transforms::Lut3x3Factory;
32use crate::conversions::sse::interpolator::*;
33use crate::conversions::sse::interpolator_q0_15::SseAlignedI16x4;
34use crate::conversions::sse::t_lut3_to_3_q0_15::TransformLut3x3SseQ0_15;
35use crate::transform::PointeeSizeExpressible;
36use crate::{
37 BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
38 TransformExecutor, TransformOptions,
39};
40use num_traits::AsPrimitive;
41#[cfg(target_arch = "x86")]
42use std::arch::x86::*;
43#[cfg(target_arch = "x86_64")]
44use std::arch::x86_64::*;
45use std::marker::PhantomData;
46
47struct TransformLut3x3Sse<
48 T,
49 U,
50 const SRC_LAYOUT: u8,
51 const DST_LAYOUT: u8,
52 const GRID_SIZE: usize,
53 const BIT_DEPTH: usize,
54 const BINS: usize,
55 const BARYCENTRIC_BINS: usize,
56> {
57 lut: Vec<SseAlignedF32>,
58 _phantom: PhantomData<T>,
59 _phantom2: PhantomData<U>,
60 interpolation_method: InterpolationMethod,
61 weights: Box<[BarycentricWeight<f32>; BINS]>,
62 color_space: DataColorSpace,
63 is_linear: bool,
64}
65
66impl<
67 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
68 U: AsPrimitive<usize>,
69 const SRC_LAYOUT: u8,
70 const DST_LAYOUT: u8,
71 const GRID_SIZE: usize,
72 const BIT_DEPTH: usize,
73 const BINS: usize,
74 const BARYCENTRIC_BINS: usize,
75> TransformLut3x3Sse<T, U, SRC_LAYOUT, DST_LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
76where
77 f32: AsPrimitive<T>,
78 u32: AsPrimitive<T>,
79 (): LutBarycentricReduction<T, U>,
80{
81 #[allow(unused_unsafe)]
82 #[target_feature(enable = "sse4.1")]
83 unsafe fn transform_chunk(
84 &self,
85 src: &[T],
86 dst: &mut [T],
87 interpolator: Box<dyn SseMdInterpolation + Send + Sync>,
88 ) {
89 let src_cn = Layout::from(SRC_LAYOUT);
90 let src_channels = src_cn.channels();
91
92 let dst_cn = Layout::from(DST_LAYOUT);
93 let dst_channels = dst_cn.channels();
94
95 let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
96 let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
97
98 for (src, dst) in src
99 .chunks_exact(src_channels)
100 .zip(dst.chunks_exact_mut(dst_channels))
101 {
102 let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
103 src[src_cn.r_i()],
104 );
105 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
106 src[src_cn.g_i()],
107 );
108 let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
109 src[src_cn.b_i()],
110 );
111
112 let a = if src_channels == 4 {
113 src[src_cn.a_i()]
114 } else {
115 max_value
116 };
117
118 let v = interpolator.inter3_sse(
119 &self.lut,
120 x.as_(),
121 y.as_(),
122 z.as_(),
123 self.weights.as_slice(),
124 );
125 if T::FINITE {
126 unsafe {
127 let mut r = _mm_mul_ps(v.v, value_scale);
128 r = _mm_max_ps(r, _mm_setzero_ps());
129 r = _mm_min_ps(r, value_scale);
130 let jvz = _mm_cvtps_epi32(r);
131
132 let x = _mm_extract_epi32::<0>(jvz);
133 let y = _mm_extract_epi32::<1>(jvz);
134 let z = _mm_extract_epi32::<2>(jvz);
135
136 dst[dst_cn.r_i()] = (x as u32).as_();
137 dst[dst_cn.g_i()] = (y as u32).as_();
138 dst[dst_cn.b_i()] = (z as u32).as_();
139 }
140 } else {
141 unsafe {
142 dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v.v) as u32).as_();
143 dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v.v) as u32).as_();
144 dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v.v) as u32).as_();
145 }
146 }
147 if dst_channels == 4 {
148 dst[dst_cn.a_i()] = a;
149 }
150 }
151 }
152}
153
154impl<
155 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
156 U: AsPrimitive<usize>,
157 const SRC_LAYOUT: u8,
158 const DST_LAYOUT: u8,
159 const GRID_SIZE: usize,
160 const BIT_DEPTH: usize,
161 const BINS: usize,
162 const BARYCENTRIC_BINS: usize,
163> TransformExecutor<T>
164 for TransformLut3x3Sse<
165 T,
166 U,
167 SRC_LAYOUT,
168 DST_LAYOUT,
169 GRID_SIZE,
170 BIT_DEPTH,
171 BINS,
172 BARYCENTRIC_BINS,
173 >
174where
175 f32: AsPrimitive<T>,
176 u32: AsPrimitive<T>,
177 (): LutBarycentricReduction<T, U>,
178{
179 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
180 let src_cn = Layout::from(SRC_LAYOUT);
181 let src_channels = src_cn.channels();
182
183 let dst_cn = Layout::from(DST_LAYOUT);
184 let dst_channels = dst_cn.channels();
185 if src.len() % src_channels != 0 {
186 return Err(CmsError::LaneMultipleOfChannels);
187 }
188 if dst.len() % dst_channels != 0 {
189 return Err(CmsError::LaneMultipleOfChannels);
190 }
191 let src_chunks = src.len() / src_channels;
192 let dst_chunks = dst.len() / dst_channels;
193 if src_chunks != dst_chunks {
194 return Err(CmsError::LaneSizeMismatch);
195 }
196
197 unsafe {
198 if self.color_space == DataColorSpace::Lab
199 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
200 || self.color_space == DataColorSpace::Xyz
201 {
202 self.transform_chunk(src, dst, Box::new(TrilinearSse::<GRID_SIZE> {}));
203 } else {
204 match self.interpolation_method {
205 #[cfg(feature = "options")]
206 InterpolationMethod::Tetrahedral => {
207 self.transform_chunk(src, dst, Box::new(TetrahedralSse::<GRID_SIZE> {}));
208 }
209 #[cfg(feature = "options")]
210 InterpolationMethod::Pyramid => {
211 self.transform_chunk(src, dst, Box::new(PyramidalSse::<GRID_SIZE> {}));
212 }
213 #[cfg(feature = "options")]
214 InterpolationMethod::Prism => {
215 self.transform_chunk(src, dst, Box::new(PrismaticSse::<GRID_SIZE> {}));
216 }
217 InterpolationMethod::Linear => {
218 self.transform_chunk(src, dst, Box::new(TrilinearSse::<GRID_SIZE> {}));
219 }
220 }
221 }
222 }
223 Ok(())
224 }
225}
226
227pub(crate) struct SseLut3x3Factory {}
228
229impl Lut3x3Factory for SseLut3x3Factory {
230 fn make_transform_3x3<
231 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
232 const SRC_LAYOUT: u8,
233 const DST_LAYOUT: u8,
234 const GRID_SIZE: usize,
235 const BIT_DEPTH: usize,
236 >(
237 lut: Vec<f32>,
238 options: TransformOptions,
239 color_space: DataColorSpace,
240 is_linear: bool,
241 ) -> Box<dyn TransformExecutor<T> + Sync + Send>
242 where
243 f32: AsPrimitive<T>,
244 u32: AsPrimitive<T>,
245 (): LutBarycentricReduction<T, u8>,
246 (): LutBarycentricReduction<T, u16>,
247 {
248 if options.prefer_fixed_point && BIT_DEPTH < 16 {
249 let q: f32 = if T::FINITE {
250 ((1i32 << BIT_DEPTH as i32) - 1) as f32
251 } else {
252 ((1i32 << 14i32) - 1) as f32
253 };
254 let lut = lut
255 .chunks_exact(3)
256 .map(|x| {
257 SseAlignedI16x4([
258 (x[0] * q).round() as i16,
259 (x[1] * q).round() as i16,
260 (x[2] * q).round() as i16,
261 0,
262 ])
263 })
264 .collect::<Vec<_>>();
265 return match options.barycentric_weight_scale {
266 BarycentricWeightScale::Low => Box::new(TransformLut3x3SseQ0_15::<
267 T,
268 u8,
269 SRC_LAYOUT,
270 DST_LAYOUT,
271 GRID_SIZE,
272 BIT_DEPTH,
273 256,
274 256,
275 > {
276 lut,
277 _phantom: PhantomData,
278 _phantom2: PhantomData,
279 interpolation_method: options.interpolation_method,
280 weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
281 color_space,
282 is_linear,
283 }),
284 #[cfg(feature = "options")]
285 BarycentricWeightScale::High => Box::new(TransformLut3x3SseQ0_15::<
286 T,
287 u16,
288 SRC_LAYOUT,
289 DST_LAYOUT,
290 GRID_SIZE,
291 BIT_DEPTH,
292 65536,
293 65536,
294 > {
295 lut,
296 _phantom: PhantomData,
297 _phantom2: PhantomData,
298 interpolation_method: options.interpolation_method,
299 weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
300 color_space,
301 is_linear,
302 }),
303 };
304 }
305 let lut = lut
306 .chunks_exact(3)
307 .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
308 .collect::<Vec<_>>();
309 match options.barycentric_weight_scale {
310 BarycentricWeightScale::Low => Box::new(TransformLut3x3Sse::<
311 T,
312 u8,
313 SRC_LAYOUT,
314 DST_LAYOUT,
315 GRID_SIZE,
316 BIT_DEPTH,
317 256,
318 256,
319 > {
320 lut,
321 _phantom: PhantomData,
322 _phantom2: PhantomData,
323 interpolation_method: options.interpolation_method,
324 weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
325 color_space,
326 is_linear,
327 }),
328 #[cfg(feature = "options")]
329 BarycentricWeightScale::High => Box::new(TransformLut3x3Sse::<
330 T,
331 u16,
332 SRC_LAYOUT,
333 DST_LAYOUT,
334 GRID_SIZE,
335 BIT_DEPTH,
336 65536,
337 65536,
338 > {
339 lut,
340 _phantom: PhantomData,
341 _phantom2: PhantomData,
342 interpolation_method: options.interpolation_method,
343 weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
344 color_space,
345 is_linear,
346 }),
347 }
348 }
349}