1use crate::conversions::LutBarycentricReduction;
30use crate::conversions::avx::interpolator_q0_15::*;
31use crate::conversions::interpolator::BarycentricWeight;
32use crate::transform::PointeeSizeExpressible;
33use crate::{CmsError, DataColorSpace, InterpolationMethod, Layout, TransformExecutor};
34use num_traits::AsPrimitive;
35use std::arch::x86_64::*;
36use std::marker::PhantomData;
37
38pub(crate) struct TransformLut3x3AvxQ0_15<
39 T,
40 U,
41 const SRC_LAYOUT: u8,
42 const DST_LAYOUT: u8,
43 const GRID_SIZE: usize,
44 const BIT_DEPTH: usize,
45 const BINS: usize,
46 const BARYCENTRIC_BINS: usize,
47> {
48 pub(crate) lut: Vec<AvxAlignedI16>,
49 pub(crate) _phantom: PhantomData<T>,
50 pub(crate) _phantom2: PhantomData<U>,
51 pub(crate) interpolation_method: InterpolationMethod,
52 pub(crate) weights: Box<[BarycentricWeight<i16>; BINS]>,
53 pub(crate) color_space: DataColorSpace,
54 pub(crate) is_linear: bool,
55}
56
57impl<
58 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
59 U: AsPrimitive<usize>,
60 const SRC_LAYOUT: u8,
61 const DST_LAYOUT: u8,
62 const GRID_SIZE: usize,
63 const BIT_DEPTH: usize,
64 const BINS: usize,
65 const BARYCENTRIC_BINS: usize,
66>
67 TransformLut3x3AvxQ0_15<
68 T,
69 U,
70 SRC_LAYOUT,
71 DST_LAYOUT,
72 GRID_SIZE,
73 BIT_DEPTH,
74 BINS,
75 BARYCENTRIC_BINS,
76 >
77where
78 f32: AsPrimitive<T>,
79 u32: AsPrimitive<T>,
80 (): LutBarycentricReduction<T, U>,
81{
82 #[allow(unused_unsafe)]
83 #[target_feature(enable = "avx2")]
84 unsafe fn transform_chunk(
85 &self,
86 src: &[T],
87 dst: &mut [T],
88 interpolator: Box<dyn AvxMdInterpolationQ0_15 + Send + Sync>,
89 ) {
90 unsafe {
91 let src_cn = Layout::from(SRC_LAYOUT);
92 let src_channels = src_cn.channels();
93
94 let dst_cn = Layout::from(DST_LAYOUT);
95 let dst_channels = dst_cn.channels();
96
97 let f_value_scale = _mm_set1_ps(1. / ((1 << 14i32) - 1) as f32);
98 let max_value = ((1u32 << BIT_DEPTH) - 1).as_();
99 let v_max_scale = if T::FINITE {
100 _mm_set1_epi16(((1i32 << BIT_DEPTH) - 1) as i16)
101 } else {
102 _mm_set1_epi16(((1i32 << 14i32) - 1) as i16)
103 };
104
105 for (src, dst) in src
106 .chunks_exact(src_channels)
107 .zip(dst.chunks_exact_mut(dst_channels))
108 {
109 let x = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
110 src[src_cn.r_i()],
111 );
112 let y = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
113 src[src_cn.g_i()],
114 );
115 let z = <() as LutBarycentricReduction<T, U>>::reduce::<BIT_DEPTH, BARYCENTRIC_BINS>(
116 src[src_cn.b_i()],
117 );
118
119 let a = if src_channels == 4 {
120 src[src_cn.a_i()]
121 } else {
122 max_value
123 };
124
125 let v = interpolator.inter3_sse(
126 &self.lut,
127 x.as_(),
128 y.as_(),
129 z.as_(),
130 self.weights.as_slice(),
131 );
132 if T::FINITE {
133 let mut o = _mm_max_epi16(v.v, _mm_setzero_si128());
134 o = _mm_min_epi16(o, v_max_scale);
135 let x = _mm_extract_epi16::<0>(o);
136 let y = _mm_extract_epi16::<1>(o);
137 let z = _mm_extract_epi16::<2>(o);
138
139 dst[dst_cn.r_i()] = (x as u32).as_();
140 dst[dst_cn.g_i()] = (y as u32).as_();
141 dst[dst_cn.b_i()] = (z as u32).as_();
142 } else {
143 let mut r = _mm_cvtepi32_ps(_mm_cvtepi16_epi32(v.v));
144 r = _mm_mul_ps(r, f_value_scale);
145 dst[dst_cn.r_i()] = f32::from_bits(_mm_extract_ps::<0>(r) as u32).as_();
146 dst[dst_cn.g_i()] = f32::from_bits(_mm_extract_ps::<1>(r) as u32).as_();
147 dst[dst_cn.b_i()] = f32::from_bits(_mm_extract_ps::<2>(r) as u32).as_();
148 }
149 if dst_channels == 4 {
150 dst[dst_cn.a_i()] = a;
151 }
152 }
153 }
154 }
155}
156
157impl<
158 T: Copy + AsPrimitive<f32> + Default + PointeeSizeExpressible,
159 U: AsPrimitive<usize>,
160 const SRC_LAYOUT: u8,
161 const DST_LAYOUT: u8,
162 const GRID_SIZE: usize,
163 const BIT_DEPTH: usize,
164 const BINS: usize,
165 const BARYCENTRIC_BINS: usize,
166> TransformExecutor<T>
167 for TransformLut3x3AvxQ0_15<
168 T,
169 U,
170 SRC_LAYOUT,
171 DST_LAYOUT,
172 GRID_SIZE,
173 BIT_DEPTH,
174 BINS,
175 BARYCENTRIC_BINS,
176 >
177where
178 f32: AsPrimitive<T>,
179 u32: AsPrimitive<T>,
180 (): LutBarycentricReduction<T, U>,
181{
182 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
183 let src_cn = Layout::from(SRC_LAYOUT);
184 let src_channels = src_cn.channels();
185
186 let dst_cn = Layout::from(DST_LAYOUT);
187 let dst_channels = dst_cn.channels();
188 if src.len() % src_channels != 0 {
189 return Err(CmsError::LaneMultipleOfChannels);
190 }
191 if dst.len() % dst_channels != 0 {
192 return Err(CmsError::LaneMultipleOfChannels);
193 }
194 let src_chunks = src.len() / src_channels;
195 let dst_chunks = dst.len() / dst_channels;
196 if src_chunks != dst_chunks {
197 return Err(CmsError::LaneSizeMismatch);
198 }
199
200 unsafe {
201 if self.color_space == DataColorSpace::Lab
202 || (self.is_linear && self.color_space == DataColorSpace::Rgb)
203 || self.color_space == DataColorSpace::Xyz
204 {
205 self.transform_chunk(src, dst, Box::new(TrilinearAvxQ0_15::<GRID_SIZE> {}));
206 } else {
207 match self.interpolation_method {
208 #[cfg(feature = "options")]
209 InterpolationMethod::Tetrahedral => {
210 self.transform_chunk(
211 src,
212 dst,
213 Box::new(TetrahedralAvxQ0_15::<GRID_SIZE> {}),
214 );
215 }
216 #[cfg(feature = "options")]
217 InterpolationMethod::Pyramid => {
218 self.transform_chunk(src, dst, Box::new(PyramidalAvxQ0_15::<GRID_SIZE> {}));
219 }
220 #[cfg(feature = "options")]
221 InterpolationMethod::Prism => {
222 self.transform_chunk(src, dst, Box::new(PrismaticAvxQ0_15::<GRID_SIZE> {}));
223 }
224 InterpolationMethod::Linear => {
225 self.transform_chunk(src, dst, Box::new(TrilinearAvxQ0_15::<GRID_SIZE> {}));
226 }
227 }
228 }
229 }
230 Ok(())
231 }
232}