moxcms/conversions/sse/
rgb_xyz_q2_13.rs1use crate::conversions::rgbxyz_fixed::TransformMatrixShaperFp;
30use crate::conversions::sse::rgb_xyz::SseAlignedU16;
31use crate::transform::PointeeSizeExpressible;
32use crate::{CmsError, Layout, TransformExecutor};
33use num_traits::AsPrimitive;
34#[cfg(target_arch = "x86")]
35use std::arch::x86::*;
36#[cfg(target_arch = "x86_64")]
37use std::arch::x86_64::*;
38
39pub(crate) struct TransformShaperQ2_13Sse<
40 T: Copy,
41 const SRC_LAYOUT: u8,
42 const DST_LAYOUT: u8,
43 const PRECISION: i32,
44> {
45 pub(crate) profile: TransformMatrixShaperFp<i32, T>,
46 pub(crate) bit_depth: usize,
47 pub(crate) gamma_lut: usize,
48}
49
/// Loads one `i32` into the lowest 32-bit lane of an SSE register.
///
/// The integer is read bit-for-bit through `_mm_load_ss`, which fills the
/// lowest lane and zeroes the remaining three, and the result is then
/// reinterpreted as an integer vector.
///
/// # Safety
///
/// The executing CPU must support the SSE instructions behind `_mm_load_ss`.
#[inline(always)]
pub(crate) unsafe fn _xmm_load_epi32(f: &i32) -> __m128i {
    // `i32` and `f32` share size and alignment, so the pointer can be re-typed as is.
    let scalar_ptr = (f as *const i32).cast::<f32>();
    // SAFETY: `scalar_ptr` originates from a live `&i32`, so it is valid for a 4-byte read.
    let low_lane = unsafe { _mm_load_ss(scalar_ptr) };
    // SAFETY: a pure register reinterpretation, no memory is touched.
    unsafe { _mm_castps_si128(low_lane) }
}
55
56impl<
57 T: Copy + PointeeSizeExpressible + 'static,
58 const SRC_LAYOUT: u8,
59 const DST_LAYOUT: u8,
60 const PRECISION: i32,
61> TransformShaperQ2_13Sse<T, SRC_LAYOUT, DST_LAYOUT, PRECISION>
62where
63 u32: AsPrimitive<T>,
64{
65 #[target_feature(enable = "sse4.1")]
66 unsafe fn transform_impl(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
67 let src_cn = Layout::from(SRC_LAYOUT);
68 let dst_cn = Layout::from(DST_LAYOUT);
69 let src_channels = src_cn.channels();
70 let dst_channels = dst_cn.channels();
71
72 let mut temporary = SseAlignedU16([0; 8]);
73
74 if src.len() / src_channels != dst.len() / dst_channels {
75 return Err(CmsError::LaneSizeMismatch);
76 }
77 if src.len() % src_channels != 0 {
78 return Err(CmsError::LaneMultipleOfChannels);
79 }
80 if dst.len() % dst_channels != 0 {
81 return Err(CmsError::LaneMultipleOfChannels);
82 }
83
84 let t = self.profile.adaptation_matrix.transpose();
85
86 let max_colors = ((1 << self.bit_depth) - 1).as_();
87
88 if T::FINITE {
90 let cap = (1 << self.bit_depth) - 1;
91 assert!(self.profile.r_linear.len() >= cap);
92 assert!(self.profile.g_linear.len() >= cap);
93 assert!(self.profile.b_linear.len() >= cap);
94 } else {
95 assert!(self.profile.r_linear.len() >= T::NOT_FINITE_LINEAR_TABLE_SIZE);
96 assert!(self.profile.g_linear.len() >= T::NOT_FINITE_LINEAR_TABLE_SIZE);
97 assert!(self.profile.b_linear.len() >= T::NOT_FINITE_LINEAR_TABLE_SIZE);
98 }
99
100 let r_lin = &self.profile.r_linear;
101 let g_lin = &self.profile.g_linear;
102 let b_lin = &self.profile.b_linear;
103
104 unsafe {
105 let m0 = _mm_setr_epi16(
106 t.v[0][0], t.v[1][0], t.v[0][1], t.v[1][1], t.v[0][2], t.v[1][2], 0, 0,
107 );
108 let m2 = _mm_setr_epi16(t.v[2][0], 1, t.v[2][1], 1, t.v[2][2], 1, 0, 0);
109
110 let rnd_val = ((1i32 << (PRECISION - 1)) as i16).to_ne_bytes();
111 let rnd = _mm_set1_epi32(i32::from_ne_bytes([0, 0, rnd_val[0], rnd_val[1]]));
112
113 let v_max_value = _mm_set1_epi32(self.gamma_lut as i32 - 1);
114
115 for (src, dst) in src
116 .chunks_exact(src_channels)
117 .zip(dst.chunks_exact_mut(dst_channels))
118 {
119 let rp = r_lin.get_unchecked(src[src_cn.r_i()]._as_usize());
120 let gp = g_lin.get_unchecked(src[src_cn.g_i()]._as_usize());
121 let bp = b_lin.get_unchecked(src[src_cn.b_i()]._as_usize());
122
123 let mut r = _xmm_load_epi32(rp);
124 let mut g = _xmm_load_epi32(gp);
125 let mut b = _xmm_load_epi32(bp);
126 let a = if src_channels == 4 {
127 src[src_cn.a_i()]
128 } else {
129 max_colors
130 };
131
132 r = _mm_shuffle_epi32::<0>(r);
133 g = _mm_shuffle_epi32::<0>(g);
134 b = _mm_shuffle_epi32::<0>(b);
135
136 g = _mm_slli_epi32::<16>(g);
137
138 let zrg0 = _mm_or_si128(r, g);
139 let zbz0 = _mm_or_si128(b, rnd);
140
141 let v0 = _mm_madd_epi16(zrg0, m0);
142 let v1 = _mm_madd_epi16(zbz0, m2);
143
144 let mut v = _mm_add_epi32(v0, v1);
145
146 v = _mm_srai_epi32::<PRECISION>(v);
147 v = _mm_max_epi32(v, _mm_setzero_si128());
148 v = _mm_min_epi32(v, v_max_value);
149
150 _mm_store_si128(temporary.0.as_mut_ptr() as *mut _, v);
151
152 dst[dst_cn.r_i()] = self.profile.r_gamma[temporary.0[0] as usize];
153 dst[dst_cn.g_i()] = self.profile.g_gamma[temporary.0[2] as usize];
154 dst[dst_cn.b_i()] = self.profile.b_gamma[temporary.0[4] as usize];
155 if dst_channels == 4 {
156 dst[dst_cn.a_i()] = a;
157 }
158 }
159 }
160
161 Ok(())
162 }
163}
164
165impl<
166 T: Copy + PointeeSizeExpressible + 'static + Default,
167 const SRC_LAYOUT: u8,
168 const DST_LAYOUT: u8,
169 const PRECISION: i32,
170> TransformExecutor<T> for TransformShaperQ2_13Sse<T, SRC_LAYOUT, DST_LAYOUT, PRECISION>
171where
172 u32: AsPrimitive<T>,
173{
174 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
175 unsafe { self.transform_impl(src, dst) }
176 }
177}