wide/
u32x4_.rs

use super::*;

pick! {
  if #[cfg(target_feature="sse2")] {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(16))]
    pub struct u32x4 { pub(crate) sse: m128i }
  } else if #[cfg(target_feature="simd128")] {
    use core::arch::wasm32::*;

    #[derive(Clone, Copy)]
    #[repr(transparent)]
    pub struct u32x4 { pub(crate) simd: v128 }

    impl Default for u32x4 {
      fn default() -> Self {
        Self::splat(0)
      }
    }

    impl PartialEq for u32x4 {
      fn eq(&self, other: &Self) -> bool {
        u32x4_all_true(u32x4_eq(self.simd, other.simd))
      }
    }

    impl Eq for u32x4 { }
  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
    use core::arch::aarch64::*;
    #[repr(C)]
    #[derive(Copy, Clone)]
    pub struct u32x4 { pub(crate) neon : uint32x4_t }

    impl Default for u32x4 {
      #[inline]
      #[must_use]
      fn default() -> Self {
        Self::splat(0)
      }
    }

    impl PartialEq for u32x4 {
      #[inline]
      #[must_use]
      fn eq(&self, other: &Self) -> bool {
        unsafe { vminvq_u32(vceqq_u32(self.neon, other.neon))==u32::MAX }
      }
    }

    impl Eq for u32x4 { }
  } else {
    #[derive(Default, Clone, Copy, PartialEq, Eq)]
    #[repr(C, align(16))]
    pub struct u32x4 { arr: [u32;4] }
  }
}

int_uint_consts!(u32, 4, u32x4, 128);

unsafe impl Zeroable for u32x4 {}
unsafe impl Pod for u32x4 {}

impl Add for u32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: add_i32_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: u32x4_add(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe { Self { neon: vaddq_u32(self.neon, rhs.neon) } }
      } else {
        Self { arr: [
          self.arr[0].wrapping_add(rhs.arr[0]),
          self.arr[1].wrapping_add(rhs.arr[1]),
          self.arr[2].wrapping_add(rhs.arr[2]),
          self.arr[3].wrapping_add(rhs.arr[3]),
        ]}
      }
    }
  }
}

impl Sub for u32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: sub_i32_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: u32x4_sub(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vsubq_u32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].wrapping_sub(rhs.arr[0]),
          self.arr[1].wrapping_sub(rhs.arr[1]),
          self.arr[2].wrapping_sub(rhs.arr[2]),
          self.arr[3].wrapping_sub(rhs.arr[3]),
        ]}
      }
    }
  }
}

impl Mul for u32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: mul_32_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: u32x4_mul(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vmulq_u32(self.neon, rhs.neon) }}
      } else {
        let arr1: [u32; 4] = cast(self);
        let arr2: [u32; 4] = cast(rhs);
        cast([
          arr1[0].wrapping_mul(arr2[0]),
          arr1[1].wrapping_mul(arr2[1]),
          arr1[2].wrapping_mul(arr2[2]),
          arr1[3].wrapping_mul(arr2[3]),
        ])
      }
    }
  }
}

impl Add<u32> for u32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn add(self, rhs: u32) -> Self::Output {
    self.add(Self::splat(rhs))
  }
}

impl Sub<u32> for u32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn sub(self, rhs: u32) -> Self::Output {
    self.sub(Self::splat(rhs))
  }
}

impl Mul<u32> for u32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn mul(self, rhs: u32) -> Self::Output {
    self.mul(Self::splat(rhs))
  }
}

impl Add<u32x4> for u32 {
  type Output = u32x4;
  #[inline]
  #[must_use]
  fn add(self, rhs: u32x4) -> Self::Output {
    u32x4::splat(self).add(rhs)
  }
}

impl Sub<u32x4> for u32 {
  type Output = u32x4;
  #[inline]
  #[must_use]
  fn sub(self, rhs: u32x4) -> Self::Output {
    u32x4::splat(self).sub(rhs)
  }
}

impl Mul<u32x4> for u32 {
  type Output = u32x4;
  #[inline]
  #[must_use]
  fn mul(self, rhs: u32x4) -> Self::Output {
    u32x4::splat(self).mul(rhs)
  }
}

impl BitAnd for u32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitand(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitand_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_and(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vandq_u32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].bitand(rhs.arr[0]),
          self.arr[1].bitand(rhs.arr[1]),
          self.arr[2].bitand(rhs.arr[2]),
          self.arr[3].bitand(rhs.arr[3]),
        ]}
      }
    }
  }
}

impl BitOr for u32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitor_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_or(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vorrq_u32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].bitor(rhs.arr[0]),
          self.arr[1].bitor(rhs.arr[1]),
          self.arr[2].bitor(rhs.arr[2]),
          self.arr[3].bitor(rhs.arr[3]),
        ]}
      }
    }
  }
}

impl BitXor for u32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn bitxor(self, rhs: Self) -> Self::Output {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: bitxor_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_xor(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: veorq_u32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          self.arr[0].bitxor(rhs.arr[0]),
          self.arr[1].bitxor(rhs.arr[1]),
          self.arr[2].bitxor(rhs.arr[2]),
          self.arr[3].bitxor(rhs.arr[3]),
        ]}
      }
    }
  }
}

macro_rules! impl_shl_t_for_u32x4 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shl<$shift_type> for u32x4 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      #[must_use]
      fn shl(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="sse2")] {
            let shift = cast([rhs as u64, 0]);
            Self { sse: shl_all_u32_m128i(self.sse, shift) }
          } else if #[cfg(target_feature="simd128")] {
            Self { simd: u32x4_shl(self.simd, rhs as u32) }
          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
            unsafe {Self { neon: vshlq_u32(self.neon, vmovq_n_s32(rhs as i32)) }}
          } else {
            let u = rhs as u64;
            Self { arr: [
              self.arr[0] << u,
              self.arr[1] << u,
              self.arr[2] << u,
              self.arr[3] << u,
            ]}
          }
        }
      }
    })+
  };
}
impl_shl_t_for_u32x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);

macro_rules! impl_shr_t_for_u32x4 {
  ($($shift_type:ty),+ $(,)?) => {
    $(impl Shr<$shift_type> for u32x4 {
      type Output = Self;
      /// Shifts all lanes by the value given.
      #[inline]
      #[must_use]
      fn shr(self, rhs: $shift_type) -> Self::Output {
        pick! {
          if #[cfg(target_feature="sse2")] {
            let shift = cast([rhs as u64, 0]);
            Self { sse: shr_all_u32_m128i(self.sse, shift) }
          } else if #[cfg(target_feature="simd128")] {
            Self { simd: u32x4_shr(self.simd, rhs as u32) }
          } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
            unsafe {Self { neon: vshlq_u32(self.neon, vmovq_n_s32( -(rhs as i32))) }}
          } else {
            let u = rhs as u64;
            Self { arr: [
              self.arr[0] >> u,
              self.arr[1] >> u,
              self.arr[2] >> u,
              self.arr[3] >> u,
            ]}
          }
        }
      }
    })+
  };
}
impl_shr_t_for_u32x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128);

/// Shifts lanes by the corresponding lane.
///
/// Bitwise shift-right; yields `self >> mask(rhs)`, where mask removes any
/// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
/// of the type. (same as `wrapping_shr`)
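///
/// A usage sketch (assuming this type is used through the `wide` crate):
///
/// ```
/// # use wide::*;
/// let a = u32x4::new([1, 2, 4, 8]);
/// // shift counts 33 and 40 are masked to 1 and 8 before shifting
/// let shift = u32x4::new([1, 33, 2, 40]);
/// assert_eq!((a >> shift).to_array(), [0, 1, 1, 0]);
/// ```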
impl Shr<u32x4> for u32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn shr(self, rhs: u32x4) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // mask the shift count to 31 to have same behavior on all platforms
        let shift_by = bitand_m128i(rhs.sse, set_splat_i32_m128i(31));
        Self { sse: shr_each_u32_m128i(self.sse, shift_by) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {
          // mask the shift count to 31 to have same behavior on all platforms
          // no right shift, have to pass negative value to left shift on neon
          let shift_by = vnegq_s32(vreinterpretq_s32_u32(vandq_u32(rhs.neon, vmovq_n_u32(31))));
          Self { neon: vshlq_u32(self.neon, shift_by) }
        }
      } else {
        let arr: [u32; 4] = cast(self);
        let rhs: [u32; 4] = cast(rhs);
        cast([
          arr[0].wrapping_shr(rhs[0]),
          arr[1].wrapping_shr(rhs[1]),
          arr[2].wrapping_shr(rhs[2]),
          arr[3].wrapping_shr(rhs[3]),
        ])
      }
    }
  }
}

/// Shifts lanes by the corresponding lane.
///
/// Bitwise shift-left; yields `self << mask(rhs)`, where mask removes any
/// high-order bits of `rhs` that would cause the shift to exceed the bitwidth
/// of the type. (same as `wrapping_shl`)
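///
/// A usage sketch (assuming this type is used through the `wide` crate):
///
/// ```
/// # use wide::*;
/// let a = u32x4::new([1, 2, 4, 8]);
/// // the shift count 33 is masked to 1 before shifting
/// let shift = u32x4::new([1, 33, 2, 3]);
/// assert_eq!((a << shift).to_array(), [2, 4, 16, 64]);
/// ```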
impl Shl<u32x4> for u32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn shl(self, rhs: u32x4) -> Self::Output {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // mask the shift count to 31 to have same behavior on all platforms
        let shift_by = bitand_m128i(rhs.sse, set_splat_i32_m128i(31));
        Self { sse: shl_each_u32_m128i(self.sse, shift_by) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {
          // mask the shift count to 31 to have same behavior on all platforms
          let shift_by = vreinterpretq_s32_u32(vandq_u32(rhs.neon, vmovq_n_u32(31)));
          Self { neon: vshlq_u32(self.neon, shift_by) }
        }
      } else {
        let arr: [u32; 4] = cast(self);
        let rhs: [u32; 4] = cast(rhs);
        cast([
          arr[0].wrapping_shl(rhs[0]),
          arr[1].wrapping_shl(rhs[1]),
          arr[2].wrapping_shl(rhs[2]),
          arr[3].wrapping_shl(rhs[3]),
        ])
      }
    }
  }
}

impl CmpEq for u32x4 {
  type Output = Self;
  #[inline]
  #[must_use]
  fn cmp_eq(self, rhs: Self) -> Self::Output {
    Self::cmp_eq(self, rhs)
  }
}

impl u32x4 {
  #[inline]
  #[must_use]
  pub const fn new(array: [u32; 4]) -> Self {
    unsafe { core::mem::transmute(array) }
  }
  #[inline]
  #[must_use]
  pub fn cmp_eq(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        Self { sse: cmp_eq_mask_i32_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: u32x4_eq(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vceqq_u32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          if self.arr[0] == rhs.arr[0] { u32::MAX } else { 0 },
          if self.arr[1] == rhs.arr[1] { u32::MAX } else { 0 },
          if self.arr[2] == rhs.arr[2] { u32::MAX } else { 0 },
          if self.arr[3] == rhs.arr[3] { u32::MAX } else { 0 },
        ]}
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn cmp_gt(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse2")] {
        // SSE2 has no unsigned compare; flipping the high bit makes the
        // signed compare give the correct unsigned result
        let h = u32x4::splat(1 << 31);
        Self { sse: cmp_gt_mask_i32_m128i((self ^ h).sse, (rhs ^ h).sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: u32x4_gt(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe {Self { neon: vcgtq_u32(self.neon, rhs.neon) }}
      } else {
        Self { arr: [
          if self.arr[0] > rhs.arr[0] { u32::MAX } else { 0 },
          if self.arr[1] > rhs.arr[1] { u32::MAX } else { 0 },
          if self.arr[2] > rhs.arr[2] { u32::MAX } else { 0 },
          if self.arr[3] > rhs.arr[3] { u32::MAX } else { 0 },
        ]}
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn cmp_lt(self, rhs: Self) -> Self {
    // lt is just gt the other way around
    rhs.cmp_gt(self)
  }

  /// Multiplies the 32-bit lanes into 64-bit results and keeps only the high
  /// 32 bits of each result. Useful for implementing division by a constant
  /// value (see the t_usefulness example).
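  ///
  /// A sketch of that use (assuming this type is used through the `wide`
  /// crate): divide each lane by 5 by multiplying with `0xCCCC_CCCD`
  /// (`ceil(2^34 / 5)`) and shifting the kept high half right by 2 more bits.
  ///
  /// ```
  /// # use wide::*;
  /// let n = u32x4::new([10, 7, 1234567, u32::MAX]);
  /// let q = n.mul_keep_high(u32x4::splat(0xCCCC_CCCD)) >> 2;
  /// assert_eq!(q.to_array(), [2, 1, 246913, 858993459]);
  /// ```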
  #[inline]
  #[must_use]
  pub fn mul_keep_high(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="avx2")] {
        let a = convert_to_i64_m256i_from_u32_m128i(self.sse);
        let b = convert_to_i64_m256i_from_u32_m128i(rhs.sse);
        let r = mul_u64_low_bits_m256i(a, b);

        // the compiler does a good job shuffling the lanes around
        let b : [u32;8] = cast(r);
        cast([b[1],b[3],b[5],b[7]])
      } else if #[cfg(target_feature="sse2")] {
        let evenp = mul_widen_u32_odd_m128i(self.sse, rhs.sse);

        let oddp = mul_widen_u32_odd_m128i(
          shr_imm_u64_m128i::<32>(self.sse),
          shr_imm_u64_m128i::<32>(rhs.sse));

        // the compiler does a good job shuffling the lanes around
        let a : [u32; 4] = cast(evenp);
        let b : [u32; 4] = cast(oddp);
        cast([a[1],b[1],a[3],b[3]])

      } else if #[cfg(target_feature="simd128")] {
        let low =  u64x2_extmul_low_u32x4(self.simd, rhs.simd);
        let high = u64x2_extmul_high_u32x4(self.simd, rhs.simd);

        Self { simd: u32x4_shuffle::<1, 3, 5, 7>(low, high) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe {
          let l = vmull_u32(vget_low_u32(self.neon), vget_low_u32(rhs.neon));
          let h = vmull_u32(vget_high_u32(self.neon), vget_high_u32(rhs.neon));
          u32x4 { neon: vcombine_u32(vshrn_n_u64(l,32), vshrn_n_u64(h,32)) }
        }
      } else {
        let a: [u32; 4] = cast(self);
        let b: [u32; 4] = cast(rhs);
        cast([
          ((u64::from(a[0]) * u64::from(b[0])) >> 32) as u32,
          ((u64::from(a[1]) * u64::from(b[1])) >> 32) as u32,
          ((u64::from(a[2]) * u64::from(b[2])) >> 32) as u32,
          ((u64::from(a[3]) * u64::from(b[3])) >> 32) as u32,
        ])
      }
    }
  }

  /// Multiplies the corresponding 32-bit lanes and returns the 64-bit results
  /// in the corresponding lanes.
  ///
  /// Effectively does two multiplies on 128-bit platforms, but is easier to
  /// use than calling mul_widen_u32_odd_m128i individually.
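  ///
  /// A usage sketch (assuming this type is used through the `wide` crate and
  /// that `u64x4` exposes the same `to_array` helper as the other vector
  /// types):
  ///
  /// ```
  /// # use wide::*;
  /// let a = u32x4::new([1, 2, 3, u32::MAX]);
  /// let b = u32x4::new([10, 20, 30, 2]);
  /// assert_eq!(a.mul_widen(b).to_array(), [10, 40, 90, 8589934590]);
  /// ```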
  #[inline]
  #[must_use]
  pub fn mul_widen(self, rhs: Self) -> u64x4 {
    pick! {
      if #[cfg(target_feature="avx2")] {
        // ok to sign extend since we are throwing away the high half of the result anyway
        let a = convert_to_i64_m256i_from_i32_m128i(self.sse);
        let b = convert_to_i64_m256i_from_i32_m128i(rhs.sse);
        cast(mul_u64_low_bits_m256i(a, b))
      } else if #[cfg(target_feature="sse2")] {
        let evenp = mul_widen_u32_odd_m128i(self.sse, rhs.sse);

        let oddp = mul_widen_u32_odd_m128i(
          shr_imm_u64_m128i::<32>(self.sse),
          shr_imm_u64_m128i::<32>(rhs.sse));

        u64x4 {
          a: u64x2 { sse: unpack_low_i64_m128i(evenp, oddp)},
          b: u64x2 { sse: unpack_high_i64_m128i(evenp, oddp)}
        }
      } else if #[cfg(target_feature="simd128")] {
        u64x4 {
          a: u64x2 { simd: u64x2_extmul_low_u32x4(self.simd, rhs.simd) },
          b: u64x2 { simd: u64x2_extmul_high_u32x4(self.simd, rhs.simd) },
        }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
        unsafe {
          u64x4 { a: u64x2 { neon: vmull_u32(vget_low_u32(self.neon), vget_low_u32(rhs.neon)) },
                  b: u64x2 { neon: vmull_u32(vget_high_u32(self.neon), vget_high_u32(rhs.neon)) } }
        }
      } else {
        let a: [u32; 4] = cast(self);
        let b: [u32; 4] = cast(rhs);
        cast([
          u64::from(a[0]) * u64::from(b[0]),
          u64::from(a[1]) * u64::from(b[1]),
          u64::from(a[2]) * u64::from(b[2]),
          u64::from(a[3]) * u64::from(b[3]),
        ])
      }
    }
  }

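  /// Lane-wise blend: for each lane, takes the lane of `t` where the
  /// corresponding lane of `self` is all ones and the lane of `f` where it is
  /// all zeros. `self` is expected to be a full boolean mask, such as the
  /// output of the `cmp_*` methods; other mask values give backend-dependent
  /// results.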
  #[inline]
  #[must_use]
  pub fn blend(self, t: Self, f: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vbslq_u32(self.neon, t.neon, f.neon) }}
      } else {
        generic_bit_blend(self, t, f)
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn max(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: max_u32_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: u32x4_max(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vmaxq_u32(self.neon, rhs.neon) }}
      } else {
        let arr: [u32; 4] = cast(self);
        let rhs: [u32; 4] = cast(rhs);
        cast([
          arr[0].max(rhs[0]),
          arr[1].max(rhs[1]),
          arr[2].max(rhs[2]),
          arr[3].max(rhs[3]),
        ])
      }
    }
  }
  #[inline]
  #[must_use]
  pub fn min(self, rhs: Self) -> Self {
    pick! {
      if #[cfg(target_feature="sse4.1")] {
        Self { sse: min_u32_m128i(self.sse, rhs.sse) }
      } else if #[cfg(target_feature="simd128")] {
        Self { simd: u32x4_min(self.simd, rhs.simd) }
      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
        unsafe {Self { neon: vminq_u32(self.neon, rhs.neon) }}
      } else {
        let arr: [u32; 4] = cast(self);
        let rhs: [u32; 4] = cast(rhs);
        cast([
          arr[0].min(rhs[0]),
          arr[1].min(rhs[1]),
          arr[2].min(rhs[2]),
          arr[3].min(rhs[3]),
        ])
      }
    }
  }

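  /// Returns `true` if any lane has its high bit set, i.e. if any lane of a
  /// comparison mask is "true".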
  #[inline]
  #[must_use]
  pub fn any(self) -> bool {
    pick! {
      if #[cfg(target_feature="sse2")] {
        (move_mask_i8_m128i(self.sse) & 0b1000100010001000) != 0
      } else if #[cfg(target_feature="simd128")] {
        u32x4_bitmask(self.simd) != 0
      } else {
        let v : [u64;2] = cast(self);
        ((v[0] | v[1]) & 0x8000000080000000) != 0
      }
    }
  }

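  /// Returns `true` if every lane has its high bit set, i.e. if every lane of
  /// a comparison mask is "true".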
  #[inline]
  #[must_use]
  pub fn all(self) -> bool {
    pick! {
      if #[cfg(target_feature="sse2")] {
        (move_mask_i8_m128i(self.sse) & 0b1000100010001000) == 0b1000100010001000
      } else if #[cfg(target_feature="simd128")] {
        u32x4_bitmask(self.simd) == 0b1111
      } else {
        let v : [u64;2] = cast(self);
        (v[0] & v[1] & 0x8000000080000000) == 0x8000000080000000
      }
    }
  }

  #[inline]
  #[must_use]
  pub fn none(self) -> bool {
    !self.any()
  }

  /// Transposes a 4x4 matrix of `u32` values. Currently only accelerated on
  /// SSE.
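  ///
  /// A usage sketch (assuming this type is used through the `wide` crate):
  ///
  /// ```
  /// # use wide::*;
  /// let rows = [
  ///   u32x4::new([ 1,  2,  3,  4]),
  ///   u32x4::new([ 5,  6,  7,  8]),
  ///   u32x4::new([ 9, 10, 11, 12]),
  ///   u32x4::new([13, 14, 15, 16]),
  /// ];
  /// let cols = u32x4::transpose(rows);
  /// assert_eq!(cols[0].to_array(), [1, 5, 9, 13]);
  /// assert_eq!(cols[3].to_array(), [4, 8, 12, 16]);
  /// ```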
  #[must_use]
  #[inline]
  pub fn transpose(data: [u32x4; 4]) -> [u32x4; 4] {
    pick! {
      if #[cfg(target_feature="sse")] {
        let mut e0 = data[0];
        let mut e1 = data[1];
        let mut e2 = data[2];
        let mut e3 = data[3];

        transpose_four_m128(
          cast_mut(&mut e0.sse),
          cast_mut(&mut e1.sse),
          cast_mut(&mut e2.sse),
          cast_mut(&mut e3.sse),
        );

        [e0, e1, e2, e3]
      } else {
        #[inline(always)]
        fn transpose_column(data: &[u32x4; 4], index: usize) -> u32x4 {
          u32x4::new([
            data[0].as_array_ref()[index],
            data[1].as_array_ref()[index],
            data[2].as_array_ref()[index],
            data[3].as_array_ref()[index],
          ])
        }

        [
          transpose_column(&data, 0),
          transpose_column(&data, 1),
          transpose_column(&data, 2),
          transpose_column(&data, 3),
        ]
      }
    }
  }

  #[inline]
  pub fn to_array(self) -> [u32; 4] {
    cast(self)
  }

  #[inline]
  pub fn as_array_ref(&self) -> &[u32; 4] {
    cast_ref(self)
  }

  #[inline]
  pub fn as_array_mut(&mut self) -> &mut [u32; 4] {
    cast_mut(self)
  }
}