1use crate::conversions::LutBarycentricReduction;
30use crate::conversions::interpolator::BarycentricWeight;
31use crate::conversions::lut_transforms::Lut3x3Factory;
32use crate::conversions::sse::interpolator::*;
33use crate::conversions::sse::interpolator_q0_15::SseAlignedI16x4;
34use crate::conversions::sse::t_lut3_to_3_q0_15::TransformLut3x3SseQ0_15;
35use crate::transform::PointeeSizeExpressible;
36use crate::{
37 BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
38 TransformExecutor, TransformOptions,
39};
40use num_traits::AsPrimitive;
41#[cfg(target_arch = "x86")]
42use std::arch::x86::*;
43#[cfg(target_arch = "x86_64")]
44use std::arch::x86_64::*;
45use std::marker::PhantomData;
46
47struct TransformLut3x3Sse<
48 T,
49 U,
50 const SRC_LAYOUT: u8,
51 const DST_LAYOUT: u8,
52 const GRID_SIZE: usize,
53 const BIT_DEPTH: usize,
54 const BINS: usize,
55 const BARYCENTRIC_BINS: usize,
56> {
57 lut: Vec<SseAlignedF32>,
58 _phantom: PhantomData<T>,
59 _phantom2: PhantomData<U>,
60 interpolation_method: InterpolationMethod,
61 weights: Box<[BarycentricWeight<f32>; BINS]>,
62 color_space: DataColorSpace,
63 is_linear: bool,
64}
65
66impl<
67 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
68 U: AsPrimitive<usize>,
69 const SRC_LAYOUT: u8,
70 const DST_LAYOUT: u8,
71 const GRID_SIZE: usize,
72 const BIT_DEPTH: usize,
73 const BINS: usize,
74 const BARYCENTRIC_BINS: usize,
75> TransformLut3x3Sse<T, U, SRC_LAYOUT, DST_LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
76where
77 f32: AsPrimitive<T>,
78 u32: AsPrimitive<T>,
79 (): LutBarycentricReduction<T, U>,
80{
81 #[allow(unused_unsafe)]
82 #[target_feature(enable = "sse4.1")]
83 unsafe fn transform_chunk<'b, Interpolator: SseMdInterpolation<'b, GRID_SIZE>>(
84 &'b self,
85 src: &[T],
86 dst: &mut [T],
87 ) {
88 let src_cn = Layout::from(SRC_LAYOUT);
89 let src_channels = src_cn.channels();
90
91 let dst_cn = Layout::from(DST_LAYOUT);
92 let dst_channels = dst_cn.channels();
93
94 let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
95 let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
96
97 for (src, dst) in src
98 .chunks_exact(src_channels)
99 .zip(dst.chunks_exact_mut(dst_channels))
100 {
101 let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
102 src[src_cn.r_i()],
103 );
104 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
105 src[src_cn.g_i()],
106 );
107 let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
108 src[src_cn.b_i()],
109 );
110
111 let a = if src_channels == 4 {
112 src[src_cn.a_i()]
113 } else {
114 max_value
115 };
116
117 let tetrahedral = Interpolator::new(&self.lut);
118 let v = tetrahedral.inter3_sse(x, y, z, &self.weights);
119 if T::FINITE {
120 unsafe {
121 let mut r = _mm_mul_ps(v.v, value_scale);
122 r = _mm_max_ps(r, _mm_setzero_ps());
123 r = _mm_min_ps(r, value_scale);
124 let jvz = _mm_cvtps_epi32(r);
125
126 let x = _mm_extract_epi32::<0>(jvz);
127 let y = _mm_extract_epi32::<1>(jvz);
128 let z = _mm_extract_epi32::<2>(jvz);
129
130 dst[dst_cn.r_i()] = (x as u32).as_();
131 dst[dst_cn.g_i()] = (y as u32).as_();
132 dst[dst_cn.b_i()] = (z as u32).as_();
133 }
134 } else {
135 unsafe {
136 dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v.v) as u32).as_();
137 dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v.v) as u32).as_();
138 dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v.v) as u32).as_();
139 }
140 }
141 if dst_channels == 4 {
142 dst[dst_cn.a_i()] = a;
143 }
144 }
145 }
146}
147
148impl<
149 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
150 U: AsPrimitive<usize>,
151 const SRC_LAYOUT: u8,
152 const DST_LAYOUT: u8,
153 const GRID_SIZE: usize,
154 const BIT_DEPTH: usize,
155 const BINS: usize,
156 const BARYCENTRIC_BINS: usize,
157> TransformExecutor<T>
158 for TransformLut3x3Sse<
159 T,
160 U,
161 SRC_LAYOUT,
162 DST_LAYOUT,
163 GRID_SIZE,
164 BIT_DEPTH,
165 BINS,
166 BARYCENTRIC_BINS,
167 >
168where
169 f32: AsPrimitive<T>,
170 u32: AsPrimitive<T>,
171 (): LutBarycentricReduction<T, U>,
172{
173 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
174 let src_cn = Layout::from(SRC_LAYOUT);
175 let src_channels = src_cn.channels();
176
177 let dst_cn = Layout::from(DST_LAYOUT);
178 let dst_channels = dst_cn.channels();
179 if src.len() % src_channels != 0 {
180 return Err(CmsError::LaneMultipleOfChannels);
181 }
182 if dst.len() % dst_channels != 0 {
183 return Err(CmsError::LaneMultipleOfChannels);
184 }
185 let src_chunks = src.len() / src_channels;
186 let dst_chunks = dst.len() / dst_channels;
187 if src_chunks != dst_chunks {
188 return Err(CmsError::LaneSizeMismatch);
189 }
190
191 unsafe {
192 if self.color_space == DataColorSpace::Lab
193 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
194 || self.color_space == DataColorSpace::Xyz
195 {
196 self.transform_chunk::<TrilinearSse<GRID_SIZE>>(src, dst);
197 } else {
198 match self.interpolation_method {
199 #[cfg(feature = "options")]
200 InterpolationMethod::Tetrahedral => {
201 self.transform_chunk::<TetrahedralSse<GRID_SIZE>>(src, dst);
202 }
203 #[cfg(feature = "options")]
204 InterpolationMethod::Pyramid => {
205 self.transform_chunk::<PyramidalSse<GRID_SIZE>>(src, dst);
206 }
207 #[cfg(feature = "options")]
208 InterpolationMethod::Prism => {
209 self.transform_chunk::<PrismaticSse<GRID_SIZE>>(src, dst);
210 }
211 InterpolationMethod::Linear => {
212 self.transform_chunk::<TrilinearSse<GRID_SIZE>>(src, dst);
213 }
214 }
215 }
216 }
217 Ok(())
218 }
219}
220
221pub(crate) struct SseLut3x3Factory {}
222
223impl Lut3x3Factory for SseLut3x3Factory {
224 fn make_transform_3x3<
225 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
226 const SRC_LAYOUT: u8,
227 const DST_LAYOUT: u8,
228 const GRID_SIZE: usize,
229 const BIT_DEPTH: usize,
230 >(
231 lut: Vec<f32>,
232 options: TransformOptions,
233 color_space: DataColorSpace,
234 is_linear: bool,
235 ) -> Box<dyn TransformExecutor<T> + Sync + Send>
236 where
237 f32: AsPrimitive<T>,
238 u32: AsPrimitive<T>,
239 (): LutBarycentricReduction<T, u8>,
240 (): LutBarycentricReduction<T, u16>,
241 {
242 if options.prefer_fixed_point && BIT_DEPTH < 16 {
243 let q: f32 = if T::FINITE {
244 ((1i32 << BIT_DEPTH as i32) - 1) as f32
245 } else {
246 ((1i32 << 14i32) - 1) as f32
247 };
248 let lut = lut
249 .chunks_exact(3)
250 .map(|x| {
251 SseAlignedI16x4([
252 (x[0] * q).round() as i16,
253 (x[1] * q).round() as i16,
254 (x[2] * q).round() as i16,
255 0,
256 ])
257 })
258 .collect::<Vec<_>>();
259 return match options.barycentric_weight_scale {
260 BarycentricWeightScale::Low => Box::new(TransformLut3x3SseQ0_15::<
261 T,
262 u8,
263 SRC_LAYOUT,
264 DST_LAYOUT,
265 GRID_SIZE,
266 BIT_DEPTH,
267 256,
268 256,
269 > {
270 lut,
271 _phantom: PhantomData,
272 _phantom2: PhantomData,
273 interpolation_method: options.interpolation_method,
274 weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
275 color_space,
276 is_linear,
277 }),
278 #[cfg(feature = "options")]
279 BarycentricWeightScale::High => Box::new(TransformLut3x3SseQ0_15::<
280 T,
281 u16,
282 SRC_LAYOUT,
283 DST_LAYOUT,
284 GRID_SIZE,
285 BIT_DEPTH,
286 65536,
287 65536,
288 > {
289 lut,
290 _phantom: PhantomData,
291 _phantom2: PhantomData,
292 interpolation_method: options.interpolation_method,
293 weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
294 color_space,
295 is_linear,
296 }),
297 };
298 }
299 let lut = lut
300 .chunks_exact(3)
301 .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
302 .collect::<Vec<_>>();
303 match options.barycentric_weight_scale {
304 BarycentricWeightScale::Low => Box::new(TransformLut3x3Sse::<
305 T,
306 u8,
307 SRC_LAYOUT,
308 DST_LAYOUT,
309 GRID_SIZE,
310 BIT_DEPTH,
311 256,
312 256,
313 > {
314 lut,
315 _phantom: PhantomData,
316 _phantom2: PhantomData,
317 interpolation_method: options.interpolation_method,
318 weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
319 color_space,
320 is_linear,
321 }),
322 #[cfg(feature = "options")]
323 BarycentricWeightScale::High => Box::new(TransformLut3x3Sse::<
324 T,
325 u16,
326 SRC_LAYOUT,
327 DST_LAYOUT,
328 GRID_SIZE,
329 BIT_DEPTH,
330 65536,
331 65536,
332 > {
333 lut,
334 _phantom: PhantomData,
335 _phantom2: PhantomData,
336 interpolation_method: options.interpolation_method,
337 weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
338 color_space,
339 is_linear,
340 }),
341 }
342 }
343}