image/imageops/
filter_1d.rs

#![forbid(unsafe_code)]
use crate::error::{LimitError, LimitErrorKind};
use crate::math::multiply_accumulate;
use crate::ImageError;

use num_traits::{AsPrimitive, MulAdd};
use std::mem::size_of;
use std::ops::{Add, Mul};

pub(crate) trait SafeMul<S> {
    fn safe_mul(&self, rhs: S) -> Result<S, ImageError>;
}

pub(crate) trait SafeAdd<S> {
    fn safe_add(&self, rhs: S) -> Result<S, ImageError>;
}

impl SafeMul<usize> for usize {
    #[inline]
    fn safe_mul(&self, rhs: usize) -> Result<usize, ImageError> {
        if let Some(product) = self.checked_mul(rhs) {
            Ok(product)
        } else {
            Err(ImageError::Limits(LimitError::from_kind(
                LimitErrorKind::DimensionError,
            )))
        }
    }
}

impl SafeAdd<usize> for usize {
    #[inline]
    fn safe_add(&self, rhs: usize) -> Result<usize, ImageError> {
        if let Some(sum) = self.checked_add(rhs) {
            Ok(sum)
        } else {
            Err(ImageError::Limits(LimitError::from_kind(
                LimitErrorKind::DimensionError,
            )))
        }
    }
}

#[derive(Debug, Clone, Copy)]
struct KernelShape {
    width: usize,
    height: usize,
}

#[derive(Debug, Clone, Copy)]
pub(crate) struct FilterImageSize {
    pub(crate) width: usize,
    pub(crate) height: usize,
}

/// Pads an image row using the *clamp* strategy
///
/// This method copies the real row content into the center of the new buffer
/// and fills the leading and trailing padded parts with the clamped edge pixels.
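/// For example (illustrative), with `N = 1`, a source row `[a, b, c, d]` and a
/// kernel of width 5 (two padded pixels per side), the produced arena row is
/// `[a, a, a, b, c, d, d, d]`.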
fn make_arena_row<T, const N: usize>(
    image: &[T],
    row_buffer: &mut [T],
    source_y: usize,
    image_size: FilterImageSize,
    kernel_size: KernelShape,
) -> Result<(), ImageError>
where
    T: Default + Copy + Send + Sync + 'static,
    f64: AsPrimitive<T>,
{
    assert_eq!(image.len(), N * image_size.width * image_size.height);

    let pad_w = (kernel_size.width / 2).max(1);

    let arena_width = image_size
        .width
        .safe_mul(N)?
        .safe_add(pad_w.safe_mul(2 * N)?)?;

    let source_offset = source_y * image_size.width * N;
    assert_eq!(row_buffer.len(), arena_width);

    let row_dst = &mut row_buffer[pad_w * N..(pad_w * N + image_size.width * N)];

    let source_row = &image[source_offset..(source_offset + image_size.width * N)];

    for (dst, src) in row_dst.iter_mut().zip(source_row.iter()) {
        *dst = *src;
    }

    for (x, dst) in (0..pad_w).zip(row_buffer.chunks_exact_mut(N)) {
        let old_x = x.saturating_sub(pad_w).min(image_size.width - 1);
        let old_px = old_x * N;
        let src_iter = &source_row[old_px..(old_px + N)];
        for (dst, src) in dst.iter_mut().zip(src_iter.iter()) {
            *dst = *src;
        }
    }

    for (x, dst) in
        (image_size.width..(image_size.width + pad_w)).zip(row_buffer.chunks_exact_mut(N).rev())
    {
        let old_x = x.max(0).min(image_size.width - 1);
        let old_px = old_x * N;
        let src_iter = &source_row[old_px..(old_px + N)];
        for (dst, src) in dst.iter_mut().zip(src_iter.iter()) {
            *dst = *src;
        }
    }
    Ok(())
}

#[derive(Clone)]
struct ArenaColumns<T>
where
    T: Copy,
{
    top_pad: Vec<T>,
    bottom_pad: Vec<T>,
}

/// Pads image columns using the *clamp* strategy
///
/// This method performs *virtual* column padding.
/// It produces two arrays that represent the virtual top and bottom parts of the image.
fn make_columns_arenas<T, const N: usize>(
    image: &[T],
    image_size: FilterImageSize,
    kernel_size: KernelShape,
) -> ArenaColumns<T>
where
    T: Default + Copy + Send + Sync + 'static,
    f64: AsPrimitive<T>,
{
    assert_eq!(image.len(), N * image_size.width * image_size.height);
    let pad_h = kernel_size.height / 2;

    let mut top_pad = vec![T::default(); pad_h * image_size.width * N];
    let mut bottom_pad = vec![T::default(); pad_h * image_size.width * N];

    let top_pad_stride = image_size.width * N;

    for (ky, dst) in (0..pad_h).zip(top_pad.chunks_exact_mut(top_pad_stride)) {
        for (kx, dst) in (0..image_size.width).zip(dst.chunks_exact_mut(N)) {
            let y = ky.saturating_sub(pad_h).min(image_size.height - 1);
            let v_src = y * top_pad_stride + kx * N;

            let src_iter = &image[v_src..(v_src + N)];
            for (dst, src) in dst.iter_mut().zip(src_iter.iter()) {
                *dst = *src;
            }
        }
    }

    let bottom_iter_dst = bottom_pad.chunks_exact_mut(top_pad_stride);

    for (ky, dst) in (0..pad_h).zip(bottom_iter_dst) {
        for (kx, dst) in (0..image_size.width).zip(dst.chunks_exact_mut(N)) {
            let y = (ky + image_size.height).min(image_size.height - 1);
            let v_src = y * top_pad_stride + kx * N;
            let src_iter = &image[v_src..(v_src + N)];
            for (dst, src) in dst.iter_mut().zip(src_iter.iter()) {
                *dst = *src;
            }
        }
    }

    ArenaColumns {
        top_pad,
        bottom_pad,
    }
}

trait ToStorage<S> {
    fn to_(self) -> S;
}

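// Q0.15 fixed-point scale used by the integer intermediate path: a floating-point
// weight `w` is quantized to `w * 32768`, clamped to `[0, 32767]` (see
// `KernelTransformer` below), and `ToStorage` adds `1 << 14` (0.5 in Q0.15) before
// shifting right by 15, rounding the accumulator to nearest.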
const Q0_15: i32 = 15;

impl ToStorage<u8> for u32 {
    #[inline(always)]
    fn to_(self) -> u8 {
        ((self + (1 << (Q0_15 - 1))) >> Q0_15).min(255) as u8
    }
}

impl ToStorage<u16> for u32 {
    #[inline(always)]
    fn to_(self) -> u16 {
        ((self + (1 << (Q0_15 - 1))) >> Q0_15).min(u16::MAX as u32) as u16
    }
}

impl ToStorage<u16> for f32 {
    #[inline(always)]
    fn to_(self) -> u16 {
        self.round().min(u16::MAX as f32) as u16
    }
}

impl ToStorage<f32> for f32 {
    #[inline(always)]
    fn to_(self) -> f32 {
        self
    }
}

/// Performs the column convolution pass for a symmetric filter
///
/// The general convolution formula is O(x,y)=∑K(k)⋅I(x,y+k), where the sum runs over 0...R.
/// When the filter is symmetric we can halve the kernel reads by using
/// O(x,y)=(∑K(k)⋅(I(x,y+k) + I(x,y+(R-k)))) + K(R/2)⋅I(x,y+R/2), where the sum runs over 0...R/2.
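/// For example (illustrative), for a 3-tap kernel `[k0, k1, k0]` this reduces to
/// O(x,y) = k1⋅I(x,y+1) + k0⋅(I(x,y) + I(x,y+2)) over the padded rows,
/// i.e. two kernel reads instead of three.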
fn filter_symmetric_column<T, F, const N: usize>(
    arena_src: &[&[T]],
    dst_row: &mut [T],
    image_size: FilterImageSize,
    kernel: &[F],
) where
    T: Copy + AsPrimitive<F>,
    F: ToStorage<T>
        + Mul<F, Output = F>
        + MulAdd<F, Output = F>
        + Add<F, Output = F>
        + Default
        + Copy
        + 'static,
{
    let dst_stride = image_size.width * N;

    let length = kernel.len();
    let half_len = length / 2;

    let mut cx = 0usize;

    let coeff = kernel[half_len];

    let mut dst_rem = dst_row;

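    // For single-byte samples, process 32 outputs per iteration with two 16-lane
    // accumulator blocks (presumably to give the autovectorizer wider independent
    // chains) before falling back to the generic 16/4/1-wide paths below.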
    if size_of::<T>() == 1 {
        for chunk in dst_rem.chunks_exact_mut(32) {
            let mut store0: [F; 16] = [F::default(); 16];
            let mut store1: [F; 16] = [F::default(); 16];

            let v_src0 = &arena_src[half_len][cx..(cx + 16)];
            let v_src1 = &arena_src[half_len][(cx + 16)..(cx + 32)];

            for (dst, src) in store0.iter_mut().zip(v_src0) {
                *dst = src.as_().mul(coeff);
            }
            for (dst, src) in store1.iter_mut().zip(v_src1) {
                *dst = src.as_().mul(coeff);
            }

            for (i, &coeff) in kernel.iter().take(half_len).enumerate() {
                let other_side = length - i - 1;
                let fw_src = arena_src[i];
                let rw_src = arena_src[other_side];
                let fw0 = &fw_src[cx..(cx + 16)];
                let bw0 = &rw_src[cx..(cx + 16)];
                let fw1 = &fw_src[(cx + 16)..(cx + 32)];
                let bw1 = &rw_src[(cx + 16)..(cx + 32)];

                for ((dst, fw), bw) in store0.iter_mut().zip(fw0).zip(bw0) {
                    *dst = multiply_accumulate(*dst, fw.as_().add(bw.as_()), coeff);
                }

                for ((dst, fw), bw) in store1.iter_mut().zip(fw1).zip(bw1) {
                    *dst = multiply_accumulate(*dst, fw.as_().add(bw.as_()), coeff);
                }
            }

            let shaped_dst0 = &mut chunk[..16];

            for (src, dst) in store0.iter().zip(shaped_dst0.iter_mut()) {
                *dst = src.to_();
            }

            let shaped_dst1 = &mut chunk[16..32];

            for (src, dst) in store1.iter().zip(shaped_dst1.iter_mut()) {
                *dst = src.to_();
            }

            cx += 32;
        }

        dst_rem = dst_rem.chunks_exact_mut(32).into_remainder();
    }

    for chunk in dst_rem.chunks_exact_mut(16) {
        let mut store: [F; 16] = [F::default(); 16];

        let v_src = &arena_src[half_len][cx..(cx + 16)];

        for (dst, src) in store.iter_mut().zip(v_src) {
            *dst = src.as_().mul(coeff);
        }

        for (i, &coeff) in kernel.iter().take(half_len).enumerate() {
            let other_side = length - i - 1;
            let fw = &arena_src[i][cx..(cx + 16)];
            let bw = &arena_src[other_side][cx..(cx + 16)];

            for ((dst, fw), bw) in store.iter_mut().zip(fw).zip(bw) {
                *dst = multiply_accumulate(*dst, fw.as_().add(bw.as_()), coeff);
            }
        }

        for (src, dst) in store.iter().zip(chunk.iter_mut()) {
            *dst = src.to_();
        }

        cx += 16;
    }

    dst_rem = dst_rem.chunks_exact_mut(16).into_remainder();

    for chunk in dst_rem.chunks_exact_mut(4) {
        let v_src = &arena_src[half_len][cx..(cx + 4)];

        let mut k0 = v_src[0].as_().mul(coeff);
        let mut k1 = v_src[1].as_().mul(coeff);
        let mut k2 = v_src[2].as_().mul(coeff);
        let mut k3 = v_src[3].as_().mul(coeff);

        for (i, &coeff) in kernel.iter().take(half_len).enumerate() {
            let other_side = length - i - 1;
            let fw = &arena_src[i][cx..(cx + 4)];
            let bw = &arena_src[other_side][cx..(cx + 4)];
            k0 = multiply_accumulate(k0, fw[0].as_().add(bw[0].as_()), coeff);
            k1 = multiply_accumulate(k1, fw[1].as_().add(bw[1].as_()), coeff);
            k2 = multiply_accumulate(k2, fw[2].as_().add(bw[2].as_()), coeff);
            k3 = multiply_accumulate(k3, fw[3].as_().add(bw[3].as_()), coeff);
        }

        chunk[0] = k0.to_();
        chunk[1] = k1.to_();
        chunk[2] = k2.to_();
        chunk[3] = k3.to_();
        cx += 4;
    }

    dst_rem = dst_rem.chunks_exact_mut(4).into_remainder();

    for (chunk, x) in dst_rem.iter_mut().zip(cx..dst_stride) {
        let v_src = &arena_src[half_len][x..(x + 1)];

        let mut k0 = v_src[0].as_().mul(coeff);

        for (i, &coeff) in kernel.iter().take(half_len).enumerate() {
            let other_side = length - i - 1;
            let fw = &arena_src[i][x..(x + 1)];
            let bw = &arena_src[other_side][x..(x + 1)];
            k0 = multiply_accumulate(k0, fw[0].as_().add(bw[0].as_()), coeff);
        }

        *chunk = k0.to_();
    }
}

/// Performs horizontal convolution for a row
///
/// The general convolution formula is O(x,y)=∑K(k)⋅I(x+k,y), where the sum runs over 0...R.
/// When the filter is symmetric we can halve the kernel reads by using
/// O(x,y)=(∑K(k)⋅(I(x+k,y) + I(x+(R-k),y))) + K(R/2)⋅I(x+R/2,y), where the sum runs over 0...R/2.
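/// For example (illustrative), with a 3-tap kernel `[k0, k1, k0]` and `N = 1`, output
/// pixel `x` is computed from the padded arena as
/// k1⋅arena[x+1] + k0⋅(arena[x] + arena[x+2]).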
fn filter_symmetric_row<T, F, const N: usize>(arena: &[T], dst_row: &mut [T], scanned_kernel: &[F])
where
    T: Copy + AsPrimitive<F> + Default,
    F: ToStorage<T>
        + Mul<Output = F>
        + MulAdd<F, Output = F>
        + Default
        + Add<F, Output = F>
        + Copy
        + 'static,
    i32: AsPrimitive<F>,
{
    let src = arena;

    let length = scanned_kernel.len();
    let half_len = length / 2;

    let hc = scanned_kernel[half_len];

    for (x, dst) in dst_row.chunks_exact_mut(4).enumerate() {
        let v_cx = x * 4;
        let src = &src[v_cx..];

        let chunk = &src[half_len * N..half_len * N + 4];

        let mut k0 = chunk[0].as_() * hc;
        let mut k1 = chunk[1].as_() * hc;
        let mut k2 = chunk[2].as_() * hc;
        let mut k3 = chunk[3].as_() * hc;

        // Note on why there are no windowed iterators here:
        // https://github.com/image-rs/image/pull/2496#discussion_r2155171034
        for (i, &coeff) in scanned_kernel.iter().take(half_len).enumerate() {
            let other_side = length - i - 1;
            let fw = &src[(i * N)..(i * N) + 4];
            let bw = &src[(other_side * N)..(other_side * N) + 4];
            k0 = multiply_accumulate(k0, fw[0].as_() + bw[0].as_(), coeff);
            k1 = multiply_accumulate(k1, fw[1].as_() + bw[1].as_(), coeff);
            k2 = multiply_accumulate(k2, fw[2].as_() + bw[2].as_(), coeff);
            k3 = multiply_accumulate(k3, fw[3].as_() + bw[3].as_(), coeff);
        }

        dst[0] = k0.to_();
        dst[1] = k1.to_();
        dst[2] = k2.to_();
        dst[3] = k3.to_();
    }

    let dzx = dst_row.chunks_exact_mut(4).len() * 4;
    let remainder = dst_row.chunks_exact_mut(4).into_remainder();

    for (x, dst) in remainder.iter_mut().enumerate() {
        let v_cx = x + dzx;
        let src = &src[v_cx..];

        let mut k0 = src[half_len * N].as_() * hc;

        for (i, &coeff) in scanned_kernel.iter().take(half_len).enumerate() {
            let other_side = length - i - 1;
            let fw = &src[(i * N)..(i * N) + 1];
            let bw = &src[(other_side * N)..(other_side * N) + 1];
            k0 = multiply_accumulate(k0, fw[0].as_() + bw[0].as_(), coeff);
        }

        *dst = k0.to_();
    }
}

trait KernelTransformer<F, I> {
    fn transform(input: F) -> I;
}

impl KernelTransformer<f32, u32> for u8 {
    fn transform(input: f32) -> u32 {
        const SCALE: f32 = (1 << Q0_15) as f32;
        (input * SCALE).min(((1u32 << Q0_15) - 1) as f32).max(0.) as u32
    }
}

impl KernelTransformer<f32, u32> for u16 {
    fn transform(input: f32) -> u32 {
        const SCALE: f32 = (1 << Q0_15) as f32;
        (input * SCALE).min(((1u32 << Q0_15) - 1) as f32).max(0.) as u32
    }
}

impl KernelTransformer<f32, f32> for f32 {
    fn transform(input: f32) -> f32 {
        input
    }
}

impl KernelTransformer<f32, f32> for u16 {
    fn transform(input: f32) -> f32 {
        input
    }
}

/// Removes redundant zero taps from the kernel while preserving symmetry
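/// For example (illustrative), a scanned kernel `[0, 1, 4, 6, 4, 1, 0]` is trimmed to
/// `[1, 4, 6, 4, 1]`, which shortens both convolution passes without changing the result.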
fn prepare_symmetric_kernel<I: Copy + PartialEq + 'static>(kernel: &[I]) -> Vec<I>
where
    i32: AsPrimitive<I>,
{
    let zeros: I = 0i32.as_();
    let mut new_kernel = kernel.to_vec();
    while new_kernel.len() > 2
        && (new_kernel.last().unwrap().eq(&zeros) && new_kernel.first().unwrap().eq(&zeros))
    {
        new_kernel.remove(0);
        new_kernel.remove(new_kernel.len() - 1);
    }

    new_kernel
}

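// Column kernels shorter than this cutoff use the ring-queue path below; longer
// ones fall back to the full transient-image path in `filter_2d_separable`.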
const RING_QUEUE_CIRCULAR_CUTOFF: usize = 55;

/// This is a typical *divide and conquer* method with a not-so-typical *sliding window*:
/// instead of a transient image, the sliding window is kept in the form of a ring queue.
///
/// Let R(n) denote the row with index n: R0, R1, R2, R3 and so forth.
/// To convolve an image, a buffer representing the ring is created with size
/// `image_width * channels_count * vertical_kernel_length`, since the ring acts as a
/// columnar queue.
///
/// The first time it is completely filled, when convolving with a vertical kernel of size 5,
/// it holds R0, R1, R2, R3, R4, each already blurred horizontally.
/// Instead of removing/adding items, the ring keeps an iterator position,
/// and after the iterator is advanced the new row simply replaces the oldest one.
/// Therefore, at the moment of the first full fill we should actually blur *R2*,
/// the central item, while the iterator is already at *R4*; the *ring queue* is thus always
/// *ahead* of the current output row by `kernel_size/2`. To send rows to the columnar pass
/// in the correct order, the ring must first be reshuffled as
/// `(current_iterator_position+1) % vertical_kernel_size`, since the iterator is always at the
/// tail, advanced by `kernel_size/2` from the current output row and by `kernel_size` from the start.
///
/// After the columnar pass the iterator is advanced one position forward in ring fashion:
/// the next row is blurred horizontally and replaces the oldest row in the ring.
/// The flow after the first fill looks like:
/// - R0,R1,R2,R3,R4(iterator here, denoted as `I`)
/// - R5(I),R1,R2,R3,R4
/// - R5,R6(I),R2,R3,R4
/// - R5,R6,R7(I),R3,R4
/// - R5,R6,R7,R8(I),R4
/// - R5,R6,R7,R8,R9(I)
/// - R10(I),R6,R7,R8,R9
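/// For example (illustrative), in the state `R5,R6,R7,R8,R9(I)` the iterator sits at slot 4
/// having just produced R9, and the columnar pass for output row R7 reads the slots in order
/// `(4 + 1) % 5 = 0, 1, 2, 3, 4`, i.e. R5, R6, R7, R8, R9.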
///
/// This approach is significantly more efficient for small kernels,
/// and several times faster in multithreaded environments,
/// despite the presence of overlapping regions.
///
/// The algorithm consists of 3 stages, which differ between multithreaded and single-threaded
/// modes:
///
/// Single-threaded:
/// - Blur R0 horizontally and replicate it into the leading `kernel_size/2` ring slots,
///   because clamping makes the virtual rows above the image identical to the first row.
/// - Blur the next row and advance the iterator until the ring is filled for the first time.
/// - Blur the ring buffer vertically, store the row into the destination,
///   advance the iterator, blur the next row horizontally, and repeat until the end.
///
/// Multithreaded:
/// - Fill the ring buffer starting from the tile's Y coordinate until the first full ring fill.
/// - Blur the ring buffer vertically, store the row into the destination,
///   advance the iterator, blur the next row horizontally, and repeat until the end.
fn filter_2d_separable_ring_queue<T, I, const N: usize>(
    image: &[T],
    destination: &mut [T],
    image_size: FilterImageSize,
    row_kernel: &[I],
    column_kernel: &[I],
) -> Result<(), ImageError>
where
    T: Copy + Default + Send + Sync + AsPrimitive<I>,
    I: ToStorage<T>
        + Mul<I, Output = I>
        + Add<I, Output = I>
        + MulAdd<I, Output = I>
        + Send
        + Sync
        + PartialEq
        + Default
        + 'static
        + Copy,
    i32: AsPrimitive<I>,
    f64: AsPrimitive<T>,
{
    let pad_w = (row_kernel.len() / 2).max(1);

    let arena_width = image_size
        .width
        .safe_mul(N)?
        .safe_add(pad_w.safe_mul(2 * N)?)?;
    let mut row_buffer = vec![T::default(); arena_width];

    let full_width = image_size.width * N;

    // The flat ring queue: `column_kernel.len()` rows of row-filtered data.
    let mut buffer = vec![T::default(); (image_size.width * N).safe_mul(column_kernel.len())?];

    let column_kernel_len = column_kernel.len();

    let half_kernel = column_kernel_len / 2;

    make_arena_row::<T, N>(
        image,
        &mut row_buffer,
        0,
        image_size,
        KernelShape {
            width: row_kernel.len(),
            height: 0,
        },
    )?;

    // Blur R0 horizontally
    filter_symmetric_row::<T, I, N>(&row_buffer, &mut buffer[..full_width], row_kernel);

    // Replicate R0 into the next `half_kernel` ring slots (clamp padding above the image)
    let (src_row, rest) = buffer.split_at_mut(full_width);
    for dst in rest.chunks_exact_mut(full_width).take(half_kernel) {
        for (dst, src) in dst.iter_mut().zip(src_row.iter()) {
            *dst = *src;
        }
    }

    let mut start_ky = column_kernel_len / 2 + 1;
    // The modulo matters for very short (anisotropic) column kernels, e.g. when the
    // column kernel has collapsed to a single tap, so the slot index wraps within the ring.
    start_ky %= column_kernel_len;

    // The loop runs to `image_size.height + half_kernel` because, as described above,
    // the ring is always ahead of the output row by half of the vertical kernel.
    for y in 1..image_size.height + half_kernel {
        let new_y = if y < image_size.height {
            y
        } else {
            y.min(image_size.height - 1)
        };

        make_arena_row::<T, N>(
            image,
            &mut row_buffer,
            new_y,
            image_size,
            KernelShape {
                width: row_kernel.len(),
                height: 0,
            },
        )?;

        filter_symmetric_row::<T, I, N>(
            &row_buffer,
            &mut buffer[start_ky * full_width..(start_ky + 1) * full_width],
            row_kernel,
        );

        // Since the ring runs ahead by half of the vertical kernel, `y == half_kernel`
        // corresponds to R0, the first real output row.
        if y >= half_kernel {
            let mut brows = vec![image; column_kernel_len];

            for (i, brow) in brows.iter_mut().enumerate() {
                let ky = (i + start_ky + 1) % column_kernel_len;
                *brow = &buffer[ky * full_width..(ky + 1) * full_width];
            }

            let dy = y - half_kernel;

            let dst = &mut destination[dy * full_width..(dy + 1) * full_width];

            filter_symmetric_column::<T, I, N>(&brows, dst, image_size, column_kernel);
        }

        start_ky += 1;
        start_ky %= column_kernel_len;
    }

    Ok(())
}

/// Performs 2D separable convolution on the image
///
/// Currently implemented only for symmetric filters
///
/// # Arguments
///
/// * `image`: Single plane image
/// * `destination`: Destination image
/// * `image_size`: Image size, see [FilterImageSize]
/// * `row_kernel`: Row kernel, *size must be odd*!
/// * `column_kernel`: Column kernel, *size must be odd*!
///
/// If, after the kernel scan, both kernels turn out to be identities, a plain copy is performed.
fn filter_2d_separable<T, F, I, const N: usize>(
    image: &[T],
    destination: &mut [T],
    image_size: FilterImageSize,
    row_kernel: &[F],
    column_kernel: &[F],
) -> Result<(), ImageError>
where
    T: Copy + AsPrimitive<F> + Default + Send + Sync + KernelTransformer<F, I> + AsPrimitive<I>,
    F: Default + 'static + Copy,
    I: ToStorage<T>
        + Mul<I, Output = I>
        + Add<I, Output = I>
        + MulAdd<I, Output = I>
        + Send
        + Sync
        + PartialEq
        + Default
        + 'static
        + Copy,
    i32: AsPrimitive<F> + AsPrimitive<I>,
    f64: AsPrimitive<T>,
{
    if image.len() != image_size.width.safe_mul(image_size.height)?.safe_mul(N)? {
        return Err(ImageError::Limits(LimitError::from_kind(
            LimitErrorKind::DimensionError,
        )));
    }
    if destination.len() != image.len() {
        return Err(ImageError::Limits(LimitError::from_kind(
            LimitErrorKind::DimensionError,
        )));
    }

    assert_ne!(row_kernel.len() & 1, 0, "Row kernel length must be odd");
    assert_ne!(
        column_kernel.len() & 1,
        0,
        "Column kernel length must be odd"
    );

    let mut scanned_row_kernel = row_kernel
        .iter()
        .map(|&x| T::transform(x))
        .collect::<Vec<I>>();
    let mut scanned_column_kernel = column_kernel
        .iter()
        .map(|&x| T::transform(x))
        .collect::<Vec<I>>();

    scanned_row_kernel = prepare_symmetric_kernel(&scanned_row_kernel);
    scanned_column_kernel = prepare_symmetric_kernel(&scanned_column_kernel);

    if scanned_row_kernel.is_empty() && scanned_column_kernel.is_empty() {
        destination.copy_from_slice(image);
        return Ok(());
    }

    if column_kernel.len() < RING_QUEUE_CIRCULAR_CUTOFF {
        return filter_2d_separable_ring_queue::<T, I, N>(
            image,
            destination,
            image_size,
            &scanned_row_kernel,
            &scanned_column_kernel,
        );
    }

    let pad_w = (scanned_row_kernel.len() / 2).max(1);

    let arena_width = image_size.width * N + pad_w * 2 * N;
    let mut row_buffer = vec![T::default(); arena_width];

    let mut transient_image = vec![T::default(); image_size.width * image_size.height * N];

    for (y, dst) in transient_image
        .chunks_exact_mut(image_size.width * N)
        .enumerate()
    {
        // Padding is done before the convolution: this lets the inner loops run without
        // boundary branching and stays effective from small kernels up to reasonably
        // large ones. For small kernels on big images it is noticeably cheaper than
        // copying the whole image. If the image row is `asdfgh`, clamp padding produces
        // `aaa asdfgh hhh`, padded by exactly half the kernel size on each side.
        make_arena_row::<T, N>(
            image,
            &mut row_buffer,
            y,
            image_size,
            KernelShape {
                width: scanned_row_kernel.len(),
                height: 0,
            },
        )?;

        filter_symmetric_row::<T, I, N>(&row_buffer, dst, &scanned_row_kernel);
    }

    let column_kernel_shape = KernelShape {
        width: 0,
        height: scanned_column_kernel.len(),
    };

    // As above, padding is done before the convolution so that the inner loops avoid
    // boundary branching; this stays effective from small kernels up to reasonably
    // large ones and is cheaper than copying for small kernels on big images.
    // This is *virtual* padding: it produces two non-contiguous arrays that stand in
    // for image rows whenever the convolution kernel goes out of bounds on the Y coordinate.
    let column_arena_k =
        make_columns_arenas::<T, N>(transient_image.as_slice(), image_size, column_kernel_shape);

    let top_pad = column_arena_k.top_pad.as_slice();
    let bottom_pad = column_arena_k.bottom_pad.as_slice();

    let pad_h = column_kernel_shape.height / 2;

    let transient_image_slice = transient_image.as_slice();

    let src_stride = image_size.width * N;

    for (y, dst) in destination
        .chunks_exact_mut(image_size.width * N)
        .enumerate()
    {
        let mut brows: Vec<&[T]> = vec![&transient_image_slice[0..]; column_kernel_shape.height];

        for (k, row) in (0..column_kernel_shape.height).zip(brows.iter_mut()) {
            if (y as i64 - pad_h as i64 + k as i64) < 0 {
                *row = &top_pad[(pad_h - k - 1) * src_stride..];
            } else if (y as i64 - pad_h as i64 + k as i64) as usize >= image_size.height {
                *row = &bottom_pad[(k - pad_h - 1) * src_stride..];
            } else {
                let fy = (y as i64 + k as i64 - pad_h as i64) as usize;
                let start_offset = src_stride * fy;
                *row = &transient_image_slice[start_offset..(start_offset + src_stride)];
            }
        }

        let brows_slice = brows.as_slice();

        filter_symmetric_column::<T, I, N>(brows_slice, dst, image_size, &scanned_column_kernel);
    }

    Ok(())
}

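// The `filter_2d_sep_*` wrappers below instantiate `filter_2d_separable` for the
// supported sample types and channel counts. Illustrative usage sketch (not taken
// from this crate's tests): any normalized, odd-length 1-D kernel can be passed to
// both passes, e.g. for an RGB8 image:
//
//     let size = FilterImageSize { width, height };
//     let kernel = [0.25f32, 0.5, 0.25];
//     filter_2d_sep_rgb(&image, &mut destination, size, &kernel, &kernel)?;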
pub(crate) fn filter_2d_sep_plane(
    image: &[u8],
    destination: &mut [u8],
    image_size: FilterImageSize,
    row_kernel: &[f32],
    column_kernel: &[f32],
) -> Result<(), ImageError> {
    filter_2d_separable::<u8, f32, u32, 1>(
        image,
        destination,
        image_size,
        row_kernel,
        column_kernel,
    )
}

pub(crate) fn filter_2d_sep_la(
    image: &[u8],
    destination: &mut [u8],
    image_size: FilterImageSize,
    row_kernel: &[f32],
    column_kernel: &[f32],
) -> Result<(), ImageError> {
    filter_2d_separable::<u8, f32, u32, 2>(
        image,
        destination,
        image_size,
        row_kernel,
        column_kernel,
    )
}

pub(crate) fn filter_2d_sep_rgb(
    image: &[u8],
    destination: &mut [u8],
    image_size: FilterImageSize,
    row_kernel: &[f32],
    column_kernel: &[f32],
) -> Result<(), ImageError> {
    filter_2d_separable::<u8, f32, u32, 3>(
        image,
        destination,
        image_size,
        row_kernel,
        column_kernel,
    )
}

pub(crate) fn filter_2d_sep_rgba(
    image: &[u8],
    destination: &mut [u8],
    image_size: FilterImageSize,
    row_kernel: &[f32],
    column_kernel: &[f32],
) -> Result<(), ImageError> {
    filter_2d_separable::<u8, f32, u32, 4>(
        image,
        destination,
        image_size,
        row_kernel,
        column_kernel,
    )
}

pub(crate) fn filter_2d_sep_la_f32(
    image: &[f32],
    destination: &mut [f32],
    image_size: FilterImageSize,
    row_kernel: &[f32],
    column_kernel: &[f32],
) -> Result<(), ImageError> {
    filter_2d_separable::<f32, f32, f32, 2>(
        image,
        destination,
        image_size,
        row_kernel,
        column_kernel,
    )
}

pub(crate) fn filter_2d_sep_plane_f32(
    image: &[f32],
    destination: &mut [f32],
    image_size: FilterImageSize,
    row_kernel: &[f32],
    column_kernel: &[f32],
) -> Result<(), ImageError> {
    filter_2d_separable::<f32, f32, f32, 1>(
        image,
        destination,
        image_size,
        row_kernel,
        column_kernel,
    )
}

pub(crate) fn filter_2d_sep_rgb_f32(
    image: &[f32],
    destination: &mut [f32],
    image_size: FilterImageSize,
    row_kernel: &[f32],
    column_kernel: &[f32],
) -> Result<(), ImageError> {
    filter_2d_separable::<f32, f32, f32, 3>(
        image,
        destination,
        image_size,
        row_kernel,
        column_kernel,
    )
}

pub(crate) fn filter_2d_sep_rgba_f32(
    image: &[f32],
    destination: &mut [f32],
    image_size: FilterImageSize,
    row_kernel: &[f32],
    column_kernel: &[f32],
) -> Result<(), ImageError> {
    filter_2d_separable::<f32, f32, f32, 4>(
        image,
        destination,
        image_size,
        row_kernel,
        column_kernel,
    )
}

pub(crate) fn filter_2d_sep_rgb_u16(
    image: &[u16],
    destination: &mut [u16],
    image_size: FilterImageSize,
    row_kernel: &[f32],
    column_kernel: &[f32],
) -> Result<(), ImageError> {
    filter_2d_separable::<u16, f32, u32, 3>(
        image,
        destination,
        image_size,
        row_kernel,
        column_kernel,
    )
}

pub(crate) fn filter_2d_sep_rgba_u16(
    image: &[u16],
    destination: &mut [u16],
    image_size: FilterImageSize,
    row_kernel: &[f32],
    column_kernel: &[f32],
) -> Result<(), ImageError> {
    filter_2d_separable::<u16, f32, u32, 4>(
        image,
        destination,
        image_size,
        row_kernel,
        column_kernel,
    )
}

pub(crate) fn filter_2d_sep_la_u16(
    image: &[u16],
    destination: &mut [u16],
    image_size: FilterImageSize,
    row_kernel: &[f32],
    column_kernel: &[f32],
) -> Result<(), ImageError> {
    filter_2d_separable::<u16, f32, u32, 2>(
        image,
        destination,
        image_size,
        row_kernel,
        column_kernel,
    )
}

pub(crate) fn filter_2d_sep_plane_u16(
    image: &[u16],
    destination: &mut [u16],
    image_size: FilterImageSize,
    row_kernel: &[f32],
    column_kernel: &[f32],
) -> Result<(), ImageError> {
    filter_2d_separable::<u16, f32, u32, 1>(
        image,
        destination,
        image_size,
        row_kernel,
        column_kernel,
    )
}