1use crate::conversions::avx::hypercube::HypercubeAvx;
30use crate::conversions::avx::interpolator::AvxVectorSse;
31use crate::{CmsError, DataColorSpace, InterpolationMethod, Stage};
32use std::arch::x86_64::*;
33
34pub(crate) struct ACurves4x3AvxFma<'a, const DEPTH: usize> {
35 pub(crate) curve0: Box<[f32; 65536]>,
36 pub(crate) curve1: Box<[f32; 65536]>,
37 pub(crate) curve2: Box<[f32; 65536]>,
38 pub(crate) curve3: Box<[f32; 65536]>,
39 pub(crate) clut: &'a [f32],
40 pub(crate) grid_size: [u8; 4],
41 pub(crate) interpolation_method: InterpolationMethod,
42 pub(crate) pcs: DataColorSpace,
43}
44
45pub(crate) struct ACurves4x3AvxFmaOptimized<'a> {
46 pub(crate) clut: &'a [f32],
47 pub(crate) grid_size: [u8; 4],
48 pub(crate) interpolation_method: InterpolationMethod,
49 pub(crate) pcs: DataColorSpace,
50}
51
52impl<const DEPTH: usize> ACurves4x3AvxFma<'_, DEPTH> {
53 #[allow(unused_unsafe)]
54 #[target_feature(enable = "avx2", enable = "fma")]
55 unsafe fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> AvxVectorSse>(
56 &self,
57 src: &[f32],
58 dst: &mut [f32],
59 fetch: Fetch,
60 ) -> Result<(), CmsError> {
61 let scale_value = (DEPTH - 1) as f32;
62
63 assert_eq!(src.len() / 4, dst.len() / 3);
64
65 unsafe {
66 for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(3)) {
67 let a0 = (src[0] * scale_value).round().min(scale_value) as u16;
68 let a1 = (src[1] * scale_value).round().min(scale_value) as u16;
69 let a2 = (src[2] * scale_value).round().min(scale_value) as u16;
70 let a3 = (src[3] * scale_value).round().min(scale_value) as u16;
71 let c = self.curve0[a0 as usize];
72 let m = self.curve1[a1 as usize];
73 let y = self.curve2[a2 as usize];
74 let k = self.curve3[a3 as usize];
75
76 let v = fetch(c, m, y, k).v;
77 dst[0] = f32::from_bits(_mm_extract_ps::<0>(v) as u32);
78 dst[1] = f32::from_bits(_mm_extract_ps::<1>(v) as u32);
79 dst[2] = f32::from_bits(_mm_extract_ps::<2>(v) as u32);
80 }
81 }
82 Ok(())
83 }
84}
85
86impl ACurves4x3AvxFmaOptimized<'_> {
87 #[allow(unused_unsafe)]
88 #[target_feature(enable = "avx2", enable = "fma")]
89 unsafe fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> AvxVectorSse>(
90 &self,
91 src: &[f32],
92 dst: &mut [f32],
93 fetch: Fetch,
94 ) -> Result<(), CmsError> {
95 assert_eq!(src.len() / 4, dst.len() / 3);
96 unsafe {
97 for (src, dst) in src.chunks_exact(4).zip(dst.chunks_exact_mut(3)) {
98 let c = src[0];
99 let m = src[1];
100 let y = src[2];
101 let k = src[3];
102
103 let v = fetch(c, m, y, k).v;
104 dst[0] = f32::from_bits(_mm_extract_ps::<0>(v) as u32);
105 dst[1] = f32::from_bits(_mm_extract_ps::<1>(v) as u32);
106 dst[2] = f32::from_bits(_mm_extract_ps::<2>(v) as u32);
107 }
108 }
109 Ok(())
110 }
111}
112
113impl<const DEPTH: usize> Stage for ACurves4x3AvxFma<'_, DEPTH> {
114 fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
115 let lut = HypercubeAvx::new(self.clut, self.grid_size, 3);
116
117 assert!(std::arch::is_x86_feature_detected!("avx2"));
118 assert!(std::arch::is_x86_feature_detected!("fma"));
119
120 unsafe {
121 if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
123 return self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w));
124 }
125
126 match self.interpolation_method {
127 #[cfg(feature = "options")]
128 InterpolationMethod::Tetrahedral => {
129 self.transform_impl(src, dst, |x, y, z, w| lut.tetra_vec3(x, y, z, w))?;
130 }
131 #[cfg(feature = "options")]
132 InterpolationMethod::Pyramid => {
133 self.transform_impl(src, dst, |x, y, z, w| lut.pyramid_vec3(x, y, z, w))?;
134 }
135 #[cfg(feature = "options")]
136 InterpolationMethod::Prism => {
137 self.transform_impl(src, dst, |x, y, z, w| lut.prism_vec3(x, y, z, w))?;
138 }
139 InterpolationMethod::Linear => {
140 self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w))?;
141 }
142 }
143 }
144
145 Ok(())
146 }
147}
148
149impl Stage for ACurves4x3AvxFmaOptimized<'_> {
150 fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
151 let lut = HypercubeAvx::new(self.clut, self.grid_size, 3);
152
153 assert!(std::arch::is_x86_feature_detected!("avx2"));
154 assert!(std::arch::is_x86_feature_detected!("fma"));
155
156 unsafe {
157 if self.pcs == DataColorSpace::Lab || self.pcs == DataColorSpace::Xyz {
159 return self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w));
160 }
161
162 match self.interpolation_method {
163 #[cfg(feature = "options")]
164 InterpolationMethod::Tetrahedral => {
165 self.transform_impl(src, dst, |x, y, z, w| lut.tetra_vec3(x, y, z, w))?;
166 }
167 #[cfg(feature = "options")]
168 InterpolationMethod::Pyramid => {
169 self.transform_impl(src, dst, |x, y, z, w| lut.pyramid_vec3(x, y, z, w))?;
170 }
171 #[cfg(feature = "options")]
172 InterpolationMethod::Prism => {
173 self.transform_impl(src, dst, |x, y, z, w| lut.prism_vec3(x, y, z, w))?;
174 }
175 InterpolationMethod::Linear => {
176 self.transform_impl(src, dst, |x, y, z, w| lut.quadlinear_vec3(x, y, z, w))?;
177 }
178 }
179 }
180 Ok(())
181 }
182}