1use crate::conversions::LutBarycentricReduction;
30use crate::conversions::interpolator::BarycentricWeight;
31use crate::conversions::lut_transforms::Lut4x3Factory;
32use crate::conversions::sse::interpolator::*;
33use crate::conversions::sse::interpolator_q0_15::SseAlignedI16x4;
34use crate::conversions::sse::lut4_to_3_q0_15::TransformLut4To3SseQ0_15;
35use crate::transform::PointeeSizeExpressible;
36use crate::{
37 BarycentricWeightScale, CmsError, DataColorSpace, InterpolationMethod, Layout,
38 TransformExecutor, TransformOptions,
39};
40use num_traits::AsPrimitive;
41#[cfg(target_arch = "x86")]
42use std::arch::x86::*;
43#[cfg(target_arch = "x86_64")]
44use std::arch::x86_64::*;
45use std::marker::PhantomData;
46
47struct TransformLut4To3Sse<
48 T,
49 U,
50 const LAYOUT: u8,
51 const GRID_SIZE: usize,
52 const BIT_DEPTH: usize,
53 const BINS: usize,
54 const BARYCENTRIC_BINS: usize,
55> {
56 lut: Vec<SseAlignedF32>,
57 _phantom: PhantomData<T>,
58 _phantom1: PhantomData<U>,
59 interpolation_method: InterpolationMethod,
60 weights: Box<[BarycentricWeight<f32>; BINS]>,
61 color_space: DataColorSpace,
62 is_linear: bool,
63}
64
65impl<
66 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
67 U: AsPrimitive<usize>,
68 const LAYOUT: u8,
69 const GRID_SIZE: usize,
70 const BIT_DEPTH: usize,
71 const BINS: usize,
72 const BARYCENTRIC_BINS: usize,
73> TransformLut4To3Sse<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
74where
75 f32: AsPrimitive<T>,
76 u32: AsPrimitive<T>,
77 (): LutBarycentricReduction<T, U>,
78{
79 #[allow(unused_unsafe)]
80 #[target_feature(enable = "sse4.1")]
81 unsafe fn transform_chunk<'b, Interpolator: SseMdInterpolation<'b, GRID_SIZE>>(
82 &'b self,
83 src: &[T],
84 dst: &mut [T],
85 ) {
86 let cn = Layout::from(LAYOUT);
87 let channels = cn.channels();
88 let grid_size = GRID_SIZE as i32;
89 let grid_size3 = grid_size * grid_size * grid_size;
90
91 let value_scale = unsafe { _mm_set1_ps(((1 << BIT_DEPTH) - 1) as f32) };
92 let max_value = ((1 << BIT_DEPTH) - 1u32).as_();
93
94 for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(channels)) {
95 let c = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
96 src[0],
97 );
98 let m = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
99 src[1],
100 );
101 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
102 src[2],
103 );
104 let k = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
105 src[3],
106 );
107
108 let k_weights = self.weights[k.as_()];
109
110 let w: i32 = k_weights.x;
111 let w_n: i32 = k_weights.x_n;
112 let t: f32 = k_weights.w;
113
114 let table1 = &self.lut[(w * grid_size3) as usize..];
115 let table2 = &self.lut[(w_n * grid_size3) as usize..];
116
117 let tetrahedral1 = Interpolator::new(table1);
118 let tetrahedral2 = Interpolator::new(table2);
119 let a0 = tetrahedral1.inter3_sse(c, m, y, &self.weights).v;
120 let b0 = tetrahedral2.inter3_sse(c, m, y, &self.weights).v;
121
122 if T::FINITE {
123 unsafe {
124 let t0 = _mm_set1_ps(t);
125 let ones = _mm_set1_ps(1f32);
126 let hp = _mm_mul_ps(a0, _mm_sub_ps(ones, t0));
127 let mut v = _mm_add_ps(_mm_mul_ps(b0, t0), hp);
128 v = _mm_max_ps(v, _mm_setzero_ps());
129 v = _mm_mul_ps(v, value_scale);
130 v = _mm_min_ps(v, value_scale);
131 let jvz = _mm_cvtps_epi32(v);
132
133 let x = _mm_extract_epi32::<0>(jvz);
134 let y = _mm_extract_epi32::<1>(jvz);
135 let z = _mm_extract_epi32::<2>(jvz);
136
137 dst[cn.r_i()] = (x as u32).as_();
138 dst[cn.g_i()] = (y as u32).as_();
139 dst[cn.b_i()] = (z as u32).as_();
140 }
141 } else {
142 unsafe {
143 let t0 = _mm_set1_ps(t);
144 let ones = _mm_set1_ps(1f32);
145 let hp = _mm_mul_ps(a0, _mm_sub_ps(ones, t0));
146 let v = _mm_add_ps(_mm_mul_ps(b0, t0), hp);
147
148 dst[cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(v) as u32).as_();
149 dst[cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(v) as u32).as_();
150 dst[cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(v) as u32).as_();
151 }
152 }
153 if channels == 4 {
154 dst[cn.a_i()] = max_value;
155 }
156 }
157 }
158}
159
160impl<
161 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
162 U: AsPrimitive<usize>,
163 const LAYOUT: u8,
164 const GRID_SIZE: usize,
165 const BIT_DEPTH: usize,
166 const BINS: usize,
167 const BARYCENTRIC_BINS: usize,
168> TransformExecutor<T>
169 for TransformLut4To3Sse<T, U, LAYOUT, GRID_SIZE, BIT_DEPTH, BINS, BARYCENTRIC_BINS>
170where
171 f32: AsPrimitive<T>,
172 u32: AsPrimitive<T>,
173 (): LutBarycentricReduction<T, U>,
174{
175 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
176 let cn = Layout::from(LAYOUT);
177 let channels = cn.channels();
178 if src.len() % 4 != 0 {
179 return Err(CmsError::LaneMultipleOfChannels);
180 }
181 if dst.len() % channels != 0 {
182 return Err(CmsError::LaneMultipleOfChannels);
183 }
184 let src_chunks = src.len() / 4;
185 let dst_chunks = dst.len() / channels;
186 if src_chunks != dst_chunks {
187 return Err(CmsError::LaneSizeMismatch);
188 }
189
190 unsafe {
191 if self.color_space == DataColorSpace::Lab
192 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
193 || self.color_space == DataColorSpace::Xyz
194 {
195 self.transform_chunk::<TrilinearSse<GRID_SIZE>>(src, dst);
196 } else {
197 match self.interpolation_method {
198 #[cfg(feature = "options")]
199 InterpolationMethod::Tetrahedral => {
200 self.transform_chunk::<TetrahedralSse<GRID_SIZE>>(src, dst);
201 }
202 #[cfg(feature = "options")]
203 InterpolationMethod::Pyramid => {
204 self.transform_chunk::<PyramidalSse<GRID_SIZE>>(src, dst);
205 }
206 #[cfg(feature = "options")]
207 InterpolationMethod::Prism => {
208 self.transform_chunk::<PrismaticSse<GRID_SIZE>>(src, dst);
209 }
210 InterpolationMethod::Linear => {
211 self.transform_chunk::<TrilinearSse<GRID_SIZE>>(src, dst);
212 }
213 }
214 }
215 }
216
217 Ok(())
218 }
219}
220
221pub(crate) struct SseLut4x3Factory {}
222
223impl Lut4x3Factory for SseLut4x3Factory {
224 fn make_transform_4x3<
225 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible + 'static + Send + Sync,
226 const LAYOUT: u8,
227 const GRID_SIZE: usize,
228 const BIT_DEPTH: usize,
229 >(
230 lut: Vec<f32>,
231 options: TransformOptions,
232 color_space: DataColorSpace,
233 is_linear: bool,
234 ) -> Box<dyn TransformExecutor<T> + Sync + Send>
235 where
236 f32: AsPrimitive<T>,
237 u32: AsPrimitive<T>,
238 (): LutBarycentricReduction<T, u8>,
239 (): LutBarycentricReduction<T, u16>,
240 {
241 if options.prefer_fixed_point && BIT_DEPTH < 16 {
242 let q: f32 = if T::FINITE {
243 ((1i32 << BIT_DEPTH as i32) - 1) as f32
244 } else {
245 ((1i32 << 14i32) - 1) as f32
246 };
247 let lut = lut
248 .chunks_exact(3)
249 .map(|x| {
250 SseAlignedI16x4([
251 (x[0] * q).round() as i16,
252 (x[1] * q).round() as i16,
253 (x[2] * q).round() as i16,
254 0,
255 ])
256 })
257 .collect::<Vec<_>>();
258 return match options.barycentric_weight_scale {
259 BarycentricWeightScale::Low => Box::new(TransformLut4To3SseQ0_15::<
260 T,
261 u8,
262 LAYOUT,
263 GRID_SIZE,
264 BIT_DEPTH,
265 256,
266 256,
267 > {
268 lut,
269 interpolation_method: options.interpolation_method,
270 weights: BarycentricWeight::<i16>::create_ranged_256::<GRID_SIZE>(),
271 _phantom: PhantomData,
272 _phantom1: PhantomData,
273 color_space,
274 is_linear,
275 }),
276 #[cfg(feature = "options")]
277 BarycentricWeightScale::High => Box::new(TransformLut4To3SseQ0_15::<
278 T,
279 u16,
280 LAYOUT,
281 GRID_SIZE,
282 BIT_DEPTH,
283 65536,
284 65536,
285 > {
286 lut,
287 interpolation_method: options.interpolation_method,
288 weights: BarycentricWeight::<i16>::create_binned::<GRID_SIZE, 65536>(),
289 _phantom: PhantomData,
290 _phantom1: PhantomData,
291 color_space,
292 is_linear,
293 }),
294 };
295 }
296 let lut = lut
297 .chunks_exact(3)
298 .map(|x| SseAlignedF32([x[0], x[1], x[2], 0f32]))
299 .collect::<Vec<_>>();
300 match options.barycentric_weight_scale {
301 BarycentricWeightScale::Low => {
302 Box::new(
303 TransformLut4To3Sse::<T, u8, LAYOUT, GRID_SIZE, BIT_DEPTH, 256, 256> {
304 lut,
305 _phantom: PhantomData,
306 _phantom1: PhantomData,
307 interpolation_method: options.interpolation_method,
308 weights: BarycentricWeight::<f32>::create_ranged_256::<GRID_SIZE>(),
309 color_space,
310 is_linear,
311 },
312 )
313 }
314 #[cfg(feature = "options")]
315 BarycentricWeightScale::High => {
316 Box::new(
317 TransformLut4To3Sse::<T, u16, LAYOUT, GRID_SIZE, BIT_DEPTH, 65536, 65536> {
318 lut,
319 _phantom: PhantomData,
320 _phantom1: PhantomData,
321 interpolation_method: options.interpolation_method,
322 weights: BarycentricWeight::<f32>::create_binned::<GRID_SIZE, 65536>(),
323 color_space,
324 is_linear,
325 },
326 )
327 }
328 }
329 }
330}