moxcms/conversions/sse/
rgb_xyz_opt.rs1use crate::conversions::rgbxyz::TransformMatrixShaperOptimizedV;
30use crate::conversions::sse::rgb_xyz::SseAlignedU16;
31use crate::transform::PointeeSizeExpressible;
32use crate::{CmsError, Layout, TransformExecutor};
33use num_traits::AsPrimitive;
34#[cfg(target_arch = "x86")]
35use std::arch::x86::*;
36#[cfg(target_arch = "x86_64")]
37use std::arch::x86_64::*;
38
39pub(crate) struct TransformShaperRgbOptSse<
40 T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
41 const SRC_LAYOUT: u8,
42 const DST_LAYOUT: u8,
43> {
44 pub(crate) profile: TransformMatrixShaperOptimizedV<T>,
45 pub(crate) bit_depth: usize,
46 pub(crate) gamma_lut: usize,
47}
48
49impl<
50 T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
51 const SRC_LAYOUT: u8,
52 const DST_LAYOUT: u8,
53> TransformShaperRgbOptSse<T, SRC_LAYOUT, DST_LAYOUT>
54where
55 u32: AsPrimitive<T>,
56{
57 #[target_feature(enable = "sse4.1")]
58 unsafe fn transform_impl(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
59 let src_cn = Layout::from(SRC_LAYOUT);
60 let dst_cn = Layout::from(DST_LAYOUT);
61 let src_channels = src_cn.channels();
62 let dst_channels = dst_cn.channels();
63
64 let mut temporary = SseAlignedU16([0; 8]);
65
66 if src.len() / src_channels != dst.len() / dst_channels {
67 return Err(CmsError::LaneSizeMismatch);
68 }
69 if src.len() % src_channels != 0 {
70 return Err(CmsError::LaneMultipleOfChannels);
71 }
72 if dst.len() % dst_channels != 0 {
73 return Err(CmsError::LaneMultipleOfChannels);
74 }
75
76 let t = self.profile.adaptation_matrix.transpose();
77
78 let scale = (self.gamma_lut - 1) as f32;
79 let max_colors: T = ((1 << self.bit_depth) - 1).as_();
80
81 if T::FINITE {
83 let cap = (1 << self.bit_depth) - 1;
84 assert!(self.profile.linear.len() >= cap);
85 } else {
86 assert!(self.profile.linear.len() >= T::NOT_FINITE_LINEAR_TABLE_SIZE);
87 }
88
89 let lut_lin = &self.profile.linear;
90
91 unsafe {
92 let m0 = _mm_setr_ps(t.v[0][0], t.v[0][1], t.v[0][2], 0f32);
93 let m1 = _mm_setr_ps(t.v[1][0], t.v[1][1], t.v[1][2], 0f32);
94 let m2 = _mm_setr_ps(t.v[2][0], t.v[2][1], t.v[2][2], 0f32);
95
96 let zeros = _mm_setzero_ps();
97
98 let v_scale = _mm_set1_ps(scale);
99
100 for (src, dst) in src
101 .chunks_exact(src_channels)
102 .zip(dst.chunks_exact_mut(dst_channels))
103 {
104 let rp = lut_lin.get_unchecked(src[src_cn.r_i()]._as_usize());
105 let gp = lut_lin.get_unchecked(src[src_cn.g_i()]._as_usize());
106 let bp = lut_lin.get_unchecked(src[src_cn.b_i()]._as_usize());
107
108 let mut r = _mm_load_ss(rp);
109 let mut g = _mm_load_ss(gp);
110 let mut b = _mm_load_ss(bp);
111 let a = if src_channels == 4 {
112 src[src_cn.a_i()]
113 } else {
114 max_colors
115 };
116
117 r = _mm_shuffle_ps::<0>(r, r);
118 g = _mm_shuffle_ps::<0>(g, g);
119 b = _mm_shuffle_ps::<0>(b, b);
120
121 let v0 = _mm_mul_ps(r, m0);
122 let v1 = _mm_mul_ps(g, m1);
123 let v2 = _mm_mul_ps(b, m2);
124
125 let mut v = _mm_add_ps(_mm_add_ps(v0, v1), v2);
126 v = _mm_max_ps(v, zeros);
127 v = _mm_mul_ps(v, v_scale);
128 v = _mm_min_ps(v, v_scale);
129
130 let zx = _mm_cvtps_epi32(v);
131 _mm_store_si128(temporary.0.as_mut_ptr() as *mut _, zx);
132
133 dst[dst_cn.r_i()] = self.profile.gamma[temporary.0[0] as usize];
134 dst[dst_cn.g_i()] = self.profile.gamma[temporary.0[2] as usize];
135 dst[dst_cn.b_i()] = self.profile.gamma[temporary.0[4] as usize];
136 if dst_channels == 4 {
137 dst[dst_cn.a_i()] = a;
138 }
139 }
140 }
141
142 Ok(())
143 }
144}
145
146impl<
147 T: Clone + Copy + 'static + PointeeSizeExpressible + Default,
148 const SRC_LAYOUT: u8,
149 const DST_LAYOUT: u8,
150> TransformExecutor<T> for TransformShaperRgbOptSse<T, SRC_LAYOUT, DST_LAYOUT>
151where
152 u32: AsPrimitive<T>,
153{
154 fn transform(&self, src: &[T], dst: &mut [T]) -> Result<(), CmsError> {
155 unsafe { self.transform_impl(src, dst) }
156 }
157}