1use crate::conversions::LutBarycentricReduction;
30use crate::conversions::interpolator::BarycentricWeight;
31use crate::conversions::lut_transforms::Lut4x3Factory;
32use crate::conversions::sse::interpolator::*;
33use crate::conversions::sse::interpolator_q0_15::SseAlignedI16x4;
34use crate::conversions::sse::lut4_to_3_q0_15::TransformLut4To3SseQ0_15;
35use crate::transform::PointeeSizeExpressible;
36use crate::{
37 BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
38 TransformExecutor, TransformOptions,
39};
40use num_traits::AsPrimitive;
41#[cfg(target_arch = "x86")]
42use std::arch::x86::*;
43#[cfg(target_arch = "x86_64")]
44use std::arch::x86_64::*;
45use std::marker::PhantomData;
46
47struct TransformLut4To3Sse<
48 T,
49 U,
50 const LAYOUT: u8,
51 const GRID_SIZE: usize,
52 const BIT_DEPTH: usize,
53 const BINS: usize,
54 const BARYCENTRIC_BINS: usize,
55> {
56 lut: Vec<SseAlignedF32>,
57 _phantom: PhantomData<T>,
58 _phantom1: PhantomData<U>,
59 interpolation_method: InterpolationMethod,
60 weights: Box<[BarycentricWeight<f32>; BINS]>,
61 color_space: DataColorSpace,
62 is_linear: bool,
63}
64
65impl<
66 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
67 U: AsPrimitive<usize>,
68 const LAYOUT: u8,
69 const GRID_SIZE: usize,
70 const BIT_DEPTH: usize,
71 const BINS: usize,
72 const BARYCENTRIC_BINS: usize,
73> TransformLut4To3Sse<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
74where
75 f32: AsPrimitive<T>,
76 u32: AsPrimitive<T>,
77 (): LutBarycentricReduction<T, U>,
78{
79 #[allow(unused_unsafe)]
80 #[target_feature(enable = "sse4.1")]
81 unsafe fn transform_chunk(
82 &self,
83 src: &[T],
84 dst: &mut [T],
85 interpolator: Box<dyn SseMdInterpolation + Send + Sync>,
86 ) {
87 let cn = Layout::from(LAYOUT);
88 let channels = cn.channels();
89 let grid_size = GRID_SIZE as i32;
90 let grid_size3 = grid_size * grid_size * grid_size;
91
92 let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
93 let max_value = ((1 << BIT_DEPTH) - 1u32).as_();
94
95 for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(channels)) {
96 let c = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
97 src[0],
98 );
99 let m = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
100 src[1],
101 );
102 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
103 src[2],
104 );
105 let k = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
106 src[3],
107 );
108
109 let k_weights = self.weights[k.as_()];
110
111 let w: i32 = k_weights.x;
112 let w_n: i32 = k_weights.x_n;
113 let t: f32 = k_weights.w;
114
115 let table1 = &self.lut[(w * grid_size3) as usize..];
116 let table2 = &self.lut[(w_n * grid_size3) as usize..];
117
118 let a0 = interpolator
119 .inter3_sse(table1, c.as_(), m.as_(), y.as_(), self.weights.as_slice())
120 .v;
121 let b0 = interpolator
122 .inter3_sse(table2, c.as_(), m.as_(), y.as_(), self.weights.as_slice())
123 .v;
124
125 if T::FINITE {
126 unsafe {
127 let t0 = _mm_set1_ps(t);
128 let ones = _mm_set1_ps(1f32);
129 let hp = _mm_mul_ps(a0, _mm_sub_ps(ones, t0));
130 let mut v = _mm_add_ps(_mm_mul_ps(b0, t0), hp);
131 v = _mm_max_ps(v, _mm_setzero_ps());
132 v = _mm_mul_ps(v, value_scale);
133 v = _mm_min_ps(v, value_scale);
134 let jvz = _mm_cvtps_epi32(v);
135
136 let x = _mm_extract_epi32::<0>(jvz);
137 let y = _mm_extract_epi32::<1>(jvz);
138 let z = _mm_extract_epi32::<2>(jvz);
139
140 dst[cn.r_i()] = (x as u32).as_();
141 dst[cn.g_i()] = (y as u32).as_();
142 dst[cn.b_i()] = (z as u32).as_();
143 }
144 } else {
145 unsafe {
146 let t0 = _mm_set1_ps(t);
147 let ones = _mm_set1_ps(1f32);
148 let hp = _mm_mul_ps(a0, _mm_sub_ps(ones, t0));
149 let v = _mm_add_ps(_mm_mul_ps(b0, t0), hp);
150
151 dst[cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v) as u32).as_();
152 dst[cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v) as u32).as_();
153 dst[cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v) as u32).as_();
154 }
155 }
156 if channels == 4 {
157 dst[cn.a_i()] = max_value;
158 }
159 }
160 }
161}
162
163impl<
164 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
165 U: AsPrimitive<usize>,
166 const LAYOUT: u8,
167 const GRID_SIZE: usize,
168 const BIT_DEPTH: usize,
169 const BINS: usize,
170 const BARYCENTRIC_BINS: usize,
171> TransformExecutor<T>
172 for TransformLut4To3Sse<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
173where
174 f32: AsPrimitive<T>,
175 u32: AsPrimitive<T>,
176 (): LutBarycentricReduction<T, U>,
177{
178 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
179 let cn = Layout::from(LAYOUT);
180 let channels = cn.channels();
181 if src.len() % 4 != 0 {
182 return Err(CmsError::LaneMultipleOfChannels);
183 }
184 if dst.len() % channels != 0 {
185 return Err(CmsError::LaneMultipleOfChannels);
186 }
187 let src_chunks = src.len() / 4;
188 let dst_chunks = dst.len() / channels;
189 if src_chunks != dst_chunks {
190 return Err(CmsError::LaneSizeMismatch);
191 }
192
193 unsafe {
194 if self.color_space == DataColorSpace::Lab
195 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
196 || self.color_space == DataColorSpace::Xyz
197 {
198 self.transform_chunk(src, dst, Box::new(TrilinearSse::<GRID_SIZE> {}));
199 } else {
200 match self.interpolation_method {
201 #[cfg(feature = "options")]
202 InterpolationMethod::Tetrahedral => {
203 self.transform_chunk(src, dst, Box::new(TetrahedralSse::<GRID_SIZE> {}));
204 }
205 #[cfg(feature = "options")]
206 InterpolationMethod::Pyramid => {
207 self.transform_chunk(src, dst, Box::new(PyramidalSse::<GRID_SIZE> {}));
208 }
209 #[cfg(feature = "options")]
210 InterpolationMethod::Prism => {
211 self.transform_chunk(src, dst, Box::new(PrismaticSse::<GRID_SIZE> {}));
212 }
213 InterpolationMethod::Linear => {
214 self.transform_chunk(src, dst, Box::new(TrilinearSse::<GRID_SIZE> {}));
215 }
216 }
217 }
218 }
219
220 Ok(())
221 }
222}
223
/// Field-less marker type whose `Lut4x3Factory` implementation builds the
/// SSE 4-channel → 3-channel LUT executors.
pub(crate) struct SseLut4x3Factory {}
225
226impl Lut4x3Factory for SseLut4x3Factory {
227 fn make_transform_4x3<
228 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
229 const LAYOUT: u8,
230 const GRID_SIZE: usize,
231 const BIT_DEPTH: usize,
232 >(
233 lut: Vec<f32>,
234 options: TransformOptions,
235 color_space: DataColorSpace,
236 is_linear: bool,
237 ) -> Box<dyn TransformExecutor<T> + Sync + Send>
238 where
239 f32: AsPrimitive<T>,
240 u32: AsPrimitive<T>,
241 (): LutBarycentricReduction<T, u8>,
242 (): LutBarycentricReduction<T, u16>,
243 {
244 if options.prefer_fixed_point && BIT_DEPTH < 16 {
245 let q: f32 = if T::FINITE {
246 ((1i32 << BIT_DEPTH as i32) - 1) as f32
247 } else {
248 ((1i32 << 14i32) - 1) as f32
249 };
250 let lut = lut
251 .chunks_exact(3)
252 .map(|x| {
253 SseAlignedI16x4([
254 (x[0] * q).round() as i16,
255 (x[1] * q).round() as i16,
256 (x[2] * q).round() as i16,
257 0,
258 ])
259 })
260 .collect::<Vec<_>>();
261 return match options.barycentric_weight_scale {
262 BarycentricWeightScale::Low => Box::new(TransformLut4To3SseQ0_15::<
263 T,
264 u8,
265 LAYOUT,
266 GRID_SIZE,
267 BIT_DEPTH,
268 256,
269 256,
270 > {
271 lut,
272 interpolation_method: options.interpolation_method,
273 weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
274 _phantom: PhantomData,
275 _phantom1: PhantomData,
276 color_space,
277 is_linear,
278 }),
279 #[cfg(feature = "options")]
280 BarycentricWeightScale::High => Box::new(TransformLut4To3SseQ0_15::<
281 T,
282 u16,
283 LAYOUT,
284 GRID_SIZE,
285 BIT_DEPTH,
286 65536,
287 65536,
288 > {
289 lut,
290 interpolation_method: options.interpolation_method,
291 weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
292 _phantom: PhantomData,
293 _phantom1: PhantomData,
294 color_space,
295 is_linear,
296 }),
297 };
298 }
299 let lut = lut
300 .chunks_exact(3)
301 .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
302 .collect::<Vec<_>>();
303 match options.barycentric_weight_scale {
304 BarycentricWeightScale::Low => {
305 Box::new(
306 TransformLut4To3Sse::<T, u8, LAYOUT, GRID_SIZE, BIT_DEPTH, 256, 256> {
307 lut,
308 _phantom: PhantomData,
309 _phantom1: PhantomData,
310 interpolation_method: options.interpolation_method,
311 weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
312 color_space,
313 is_linear,
314 },
315 )
316 }
317 #[cfg(feature = "options")]
318 BarycentricWeightScale::High => {
319 Box::new(
320 TransformLut4To3Sse::<T, u16, LAYOUT, GRID_SIZE, BIT_DEPTH, 65536, 65536> {
321 lut,
322 _phantom: PhantomData,
323 _phantom1: PhantomData,
324 interpolation_method: options.interpolation_method,
325 weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
326 color_space,
327 is_linear,
328 },
329 )
330 }
331 }
332 }
333}