1use crate::conversions::LutBarycentricReduction;
30use crate::conversions::interpolator::BarycentricWeight;
31use crate::conversions::sse::interpolator_q0_15::*;
32use crate::transform::PointeeSizeExpressible;
33use crate::{CmsError, DataColorSpace, InterpolationMethod, Layout, TransformExecutor};
34use num_traits::AsPrimitive;
35#[cfg(target_arch = "x86")]
36use std::arch::x86::*;
37#[cfg(target_arch = "x86_64")]
38use std::arch::x86_64::*;
39use std::marker::PhantomData;
40
41pub(crate) struct TransformLut3x3SseQ0_15<
42 T,
43 U,
44 const SRC_LAYOUT: u8,
45 const DST_LAYOUT: u8,
46 const GRID_SIZE: usize,
47 const BIT_DEPTH: usize,
48 const BINS: usize,
49 const BARYCENTRIC_BINS: usize,
50> {
51 pub(crate) lut: Vec<SseAlignedI16x4>,
52 pub(crate) _phantom: PhantomData<T>,
53 pub(crate) _phantom2: PhantomData<U>,
54 pub(crate) interpolation_method: InterpolationMethod,
55 pub(crate) weights: Box<[BarycentricWeight<i16>; BINS]>,
56 pub(crate) color_space: DataColorSpace,
57 pub(crate) is_linear: bool,
58}
59
60impl<
61 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
62 U: AsPrimitive<usize>,
63 const SRC_LAYOUT: u8,
64 const DST_LAYOUT: u8,
65 const GRID_SIZE: usize,
66 const BIT_DEPTH: usize,
67 const BINS: usize,
68 const BARYCENTRIC_BINS: usize,
69>
70 TransformLut3x3SseQ0_15<
71 T,
72 U,
73 SRC_LAYOUT,
74 DST_LAYOUT,
75 GRID_SIZE,
76 BIT_DEPTH,
77 BINS,
78 BARYCENTRIC_BINS,
79 >
80where
81 f32: AsPrimitive<T>,
82 u32: AsPrimitive<T>,
83 (): LutBarycentricReduction<T, U>,
84{
85 #[allow(unused_unsafe)]
86 #[target_feature(enable = "sse4.1")]
87 #[inline(never)]
88 unsafe fn transform_chunk(
89 &self,
90 src: &[T],
91 dst: &mut [T],
92 interpolator: Box<dyn SseMdInterpolationQ0_15 + Send + Sync>,
93 ) {
94 unsafe {
95 let src_cn = Layout::from(SRC_LAYOUT);
96 let src_channels = src_cn.channels();
97
98 let dst_cn = Layout::from(DST_LAYOUT);
99 let dst_channels = dst_cn.channels();
100
101 let f_value_scale = _mm_set1_ps(1. / ((1 << 14i32) - 1) as f32);
102 let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
103 let v_max_scale = if T::FINITE {
104 _mm_set1_epi16(((1i32 << BIT_DEPTH) - 1) as i16)
105 } else {
106 _mm_set1_epi16(((1i32 << 14i32) - 1) as i16)
107 };
108
109 for (src, dst) in src
110 .chunks_exact(src_channels)
111 .zip(dst.chunks_exact_mut(dst_channels))
112 {
113 let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
114 src[src_cn.r_i()],
115 );
116 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
117 src[src_cn.g_i()],
118 );
119 let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
120 src[src_cn.b_i()],
121 );
122
123 let a = if src_channels == 4 {
124 src[src_cn.a_i()]
125 } else {
126 max_value
127 };
128
129 let v = interpolator.inter3_sse(
130 &self.lut,
131 x.as_(),
132 y.as_(),
133 z.as_(),
134 self.weights.as_slice(),
135 );
136 if T::FINITE {
137 let mut o = _mm_max_epi16(v.v, _mm_setzero_si128());
138 o = _mm_min_epi16(o, v_max_scale);
139 let x = _mm_extract_epi16::<0>(o);
140 let y = _mm_extract_epi16::<1>(o);
141 let z = _mm_extract_epi16::<2>(o);
142
143 dst[dst_cn.r_i()] = (x as u32).as_();
144 dst[dst_cn.g_i()] = (y as u32).as_();
145 dst[dst_cn.b_i()] = (z as u32).as_();
146 } else {
147 let mut r = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(v.v));
148 r = _mm_mul_ps(r, f_value_scale);
149 dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(r) as u32).as_();
150 dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(r) as u32).as_();
151 dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(r) as u32).as_();
152 }
153 if dst_channels == 4 {
154 dst[dst_cn.a_i()] = a;
155 }
156 }
157 }
158 }
159}
160
161impl<
162 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
163 U: AsPrimitive<usize>,
164 const SRC_LAYOUT: u8,
165 const DST_LAYOUT: u8,
166 const GRID_SIZE: usize,
167 const BIT_DEPTH: usize,
168 const BINS: usize,
169 const BARYCENTRIC_BINS: usize,
170> TransformExecutor<T>
171 for TransformLut3x3SseQ0_15<
172 T,
173 U,
174 SRC_LAYOUT,
175 DST_LAYOUT,
176 GRID_SIZE,
177 BIT_DEPTH,
178 BINS,
179 BARYCENTRIC_BINS,
180 >
181where
182 f32: AsPrimitive<T>,
183 u32: AsPrimitive<T>,
184 (): LutBarycentricReduction<T, U>,
185{
186 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
187 let src_cn = Layout::from(SRC_LAYOUT);
188 let src_channels = src_cn.channels();
189
190 let dst_cn = Layout::from(DST_LAYOUT);
191 let dst_channels = dst_cn.channels();
192 if src.len() % src_channels != 0 {
193 return Err(CmsError::LaneMultipleOfChannels);
194 }
195 if dst.len() % dst_channels != 0 {
196 return Err(CmsError::LaneMultipleOfChannels);
197 }
198 let src_chunks = src.len() / src_channels;
199 let dst_chunks = dst.len() / dst_channels;
200 if src_chunks != dst_chunks {
201 return Err(CmsError::LaneSizeMismatch);
202 }
203
204 unsafe {
205 if self.color_space == DataColorSpace::Lab
206 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
207 || self.color_space == DataColorSpace::Xyz
208 {
209 self.transform_chunk(src, dst, Box::new(TrilinearSseQ0_15::<GRID_SIZE> {}));
210 } else {
211 match self.interpolation_method {
212 #[cfg(feature = "options")]
213 InterpolationMethod::Tetrahedral => {
214 self.transform_chunk(
215 src,
216 dst,
217 Box::new(TetrahedralSseQ0_15::<GRID_SIZE> {}),
218 );
219 }
220 #[cfg(feature = "options")]
221 InterpolationMethod::Pyramid => {
222 self.transform_chunk(src, dst, Box::new(PyramidalSseQ0_15::<GRID_SIZE> {}));
223 }
224 #[cfg(feature = "options")]
225 InterpolationMethod::Prism => {
226 self.transform_chunk(src, dst, Box::new(PrismaticSseQ0_15::<GRID_SIZE> {}));
227 }
228 InterpolationMethod::Linear => {
229 self.transform_chunk(src, dst, Box::new(TrilinearSseQ0_15::<GRID_SIZE> {}));
230 }
231 }
232 }
233 }
234 Ok(())
235 }
236}