moxcms/conversions/avx/
preheat_lut4x3.rs1use crate::conversions::avx::hypercube::HypercubeAvx;
30use crate::conversions::avx::interpolator::AvxVectorSse;
31use crate::trc::{lut_interp_linear_float, lut_interp_linear_float_clamped};
32use crate::{CmsError, DataColorSpace, InterpolationMethod, Stage};
33use std::arch::x86_64::*;
34
35#[derive(Default)]
36pub(crate) struct Lut4x3AvxFma {
37 pub(crate) linearization: [Vec<f32>; 4],
38 pub(crate) clut: Vec<f32>,
39 pub(crate) grid_size: u8,
40 pub(crate) output: [Vec<f32>; 3],
41 pub(crate) interpolation_method: InterpolationMethod,
42 pub(crate) pcs: DataColorSpace,
43}
44
45impl Lut4x3AvxFma {
46 #[allow(unused_unsafe)]
47 #[target_feature(enable = "avx2", enable = "fma")]
48 unsafe fn transform_impl<Fetch: Fn(f32, f32, f32, f32) -> AvxVectorSse>(
49 &self,
50 src: &[f32],
51 dst: &mut [f32],
52 fetch: Fetch,
53 ) -> Result<(), CmsError> {
54 let linearization_0 = &self.linearization[0];
55 let linearization_1 = &self.linearization[1];
56 let linearization_2 = &self.linearization[2];
57 let linearization_3 = &self.linearization[3];
58 unsafe {
59 let ones = _mm_set1_ps(1.);
60 for (dest, src) in dst.chunks_exact_mut(3).zip(src.chunks_exact(4)) {
61 debug_assert!(self.grid_size as i32 >= 1);
62 let linear_x = lut_interp_linear_float(src[0], linearization_0);
63 let linear_y = lut_interp_linear_float(src[1], linearization_1);
64 let linear_z = lut_interp_linear_float(src[2], linearization_2);
65 let linear_w = lut_interp_linear_float(src[3], linearization_3);
66
67 let mut v = fetch(linear_x, linear_y, linear_z, linear_w).v;
68 v = _mm_max_ps(v, _mm_setzero_ps());
69 v = _mm_min_ps(v, ones);
70
71 let pcs_x = lut_interp_linear_float_clamped(
72 f32::from_bits(_mm_extract_ps::<0>(v) as u32),
73 &self.output[0],
74 );
75 let pcs_y = lut_interp_linear_float_clamped(
76 f32::from_bits(_mm_extract_ps::<1>(v) as u32),
77 &self.output[1],
78 );
79 let pcs_z = lut_interp_linear_float_clamped(
80 f32::from_bits(_mm_extract_ps::<2>(v) as u32),
81 &self.output[2],
82 );
83 dest[0] = pcs_x;
84 dest[1] = pcs_y;
85 dest[2] = pcs_z;
86 }
87 }
88 Ok(())
89 }
90}
91
92impl Stage for Lut4x3AvxFma {
93 fn transform(&self, src: &[f32], dst: &mut [f32]) -> Result<(), CmsError> {
94 let l_tbl = HypercubeAvx::new(
95 &self.clut,
96 [
97 self.grid_size,
98 self.grid_size,
99 self.grid_size,
100 self.grid_size,
101 ],
102 3,
103 );
104
105 assert!(std::arch::is_x86_feature_detected!("avx2"));
106 assert!(std::arch::is_x86_feature_detected!("fma"));
107
108 unsafe {
109 if self.pcs == DataColorSpace::Lab {
111 return self
112 .transform_impl(src, dst, |x, y, z, w| l_tbl.quadlinear_vec3(x, y, z, w));
113 }
114
115 match self.interpolation_method {
116 #[cfg(feature = "options")]
117 InterpolationMethod::Tetrahedral => {
118 self.transform_impl(src, dst, |x, y, z, w| l_tbl.tetra_vec3(x, y, z, w))?;
119 }
120 #[cfg(feature = "options")]
121 InterpolationMethod::Pyramid => {
122 self.transform_impl(src, dst, |x, y, z, w| l_tbl.pyramid_vec3(x, y, z, w))?;
123 }
124 #[cfg(feature = "options")]
125 InterpolationMethod::Prism => {
126 self.transform_impl(src, dst, |x, y, z, w| l_tbl.prism_vec3(x, y, z, w))?
127 }
128 InterpolationMethod::Linear => {
129 self.transform_impl(src, dst, |x, y, z, w| l_tbl.quadlinear_vec3(x, y, z, w))?
130 }
131 }
132 }
133 Ok(())
134 }
135}