diff --git a/fearless_simd/src/generated.rs b/fearless_simd/src/generated.rs index 9d342539a..0fe782230 100644 --- a/fearless_simd/src/generated.rs +++ b/fearless_simd/src/generated.rs @@ -6,6 +6,7 @@ clippy::cast_possible_truncation, clippy::unseparated_literal_suffix, clippy::use_self, + clippy::wrong_self_convention, reason = "TODO: https://github.com/linebender/fearless_simd/issues/40" )] #![cfg_attr( diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 27e3b8df1..030e7bfbf 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -895,6 +895,26 @@ impl Simd for Avx2 { unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16 { + unsafe { + { + let bit_bytes = _mm_cvtsi32_si128(bits as i32); + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), + ); + let bit_mask = + _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + } + .simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask8x16(self, a: mask8x16) -> u64 { + unsafe { _mm_movemask_epi8(a.into()) as u32 as u64 } + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -1386,6 +1406,26 @@ impl Simd for Avx2 { unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8 { + unsafe { + { + let bit_lanes = _mm_set1_epi16(bits as i16); + let bit_mask = _mm_setr_epi16(1, 2, 4, 8, 16, 32, 64, 128); + _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + .simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask16x8(self, a: mask16x8) -> u64 { + unsafe { + { + let packed = _mm_packs_epi16(a.into(), a.into()); + _mm_movemask_epi8(packed) as u8 as u64 + } + } + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -1887,6 +1927,21 @@ impl Simd for Avx2 { unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4 { + unsafe { + { + let bit_lanes = _mm_set1_epi32(bits as i32); + let bit_mask = _mm_setr_epi32(1, 2, 4, 8); + _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + .simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask32x4(self, a: mask32x4) -> u64 { + unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 } + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -2189,6 +2244,21 @@ impl Simd for Avx2 { unsafe { core::mem::transmute::<__m128i, [i64; 2usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { + unsafe { + { + let bit_lanes = _mm_set1_epi64x(bits.cast_signed()); + let bit_mask = _mm_set_epi64x(2, 1); + _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + .simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { + unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 } + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -3270,6 +3340,31 @@ impl Simd for Avx2 { unsafe { core::mem::transmute::<__m256i, [i8; 32usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { + unsafe { + { + let bit_bytes = _mm256_broadcastsi128_si256(_mm_cvtsi32_si128(bits as i32)); + let bit_bytes = _mm256_shuffle_epi8( + bit_bytes, + _mm256_setr_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, + 3, 3, 3, 3, 3, 3, 3, + ), + ); + let bit_mask = _mm256_setr_epi8( + 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, + 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, + ); + _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) + } + .simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { + unsafe { _mm256_movemask_epi8(a.into()) as u32 as u64 } + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } } @@ -3963,6 +4058,29 @@ impl Simd for Avx2 { unsafe { core::mem::transmute::<__m256i, [i16; 16usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { + unsafe { + { + let bit_lanes = _mm256_set1_epi16(bits as i16); + let bit_mask = _mm256_setr_epi16( + 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, -32768, + ); + _mm256_cmpeq_epi16(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + } + .simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { + unsafe { + { + let halves: [__m128i; 2usize] = core::mem::transmute(a.val.0); + let packed = _mm_packs_epi16(halves[0], halves[1]); + _mm_movemask_epi8(packed) as u32 as u64 + } + } + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } } @@ -4601,6 +4719,21 @@ impl Simd for Avx2 { unsafe { core::mem::transmute::<__m256i, [i32; 8usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { + unsafe { + { + let bit_lanes = _mm256_set1_epi32(bits as i32); + let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); + _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + } + .simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { + unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 as u64 } + } + #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } } @@ -4978,6 +5111,21 @@ impl Simd for Avx2 { unsafe { core::mem::transmute::<__m256i, [i64; 4usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { + unsafe { + { + let bit_lanes = _mm256_set1_epi64x(bits.cast_signed()); + let bit_mask = _mm256_set_epi64x(8, 4, 2, 1); + _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + } + .simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { + unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 as u64 } + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } } @@ -6076,6 +6224,50 @@ impl Simd for Avx2 { unsafe { core::mem::transmute::<[__m256i; 2usize], [i8; 64usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { + unsafe { + { + let bit_bytes = _mm256_set1_epi64x(bits.cast_signed()); + let bit_mask = _mm256_setr_epi8( + 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, + 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, + ); + mask8x64 { + val: crate::support::Aligned512([ + { + let bit_bytes = _mm256_shuffle_epi8( + bit_bytes, + _mm256_setr_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, + 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, + ), + ); + _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) + }, + { + let bit_bytes = _mm256_shuffle_epi8( + bit_bytes, + _mm256_setr_epi8( + 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, + 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, + ), + ); + _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) + }, + ]), + simd: self, + } + } + } + } + #[inline(always)] + fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { + let (lo, hi) = self.split_mask8x64(a); + let lo = self.to_bitmask_mask8x32(lo); + let hi = self.to_bitmask_mask8x32(hi); + lo | (hi << 32usize) + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -6814,6 +7006,24 @@ impl Simd for Avx2 { unsafe { core::mem::transmute::<[__m256i; 2usize], [i16; 32usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { + let lo = self.from_bitmask_mask16x16(bits); + let hi = self.from_bitmask_mask16x16(bits >> 16usize); + self.combine_mask16x16(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { + unsafe { + { + let lo = _mm256_movemask_epi8(a.val.0[0]) as u32; + let hi = _mm256_movemask_epi8(a.val.0[1]) as u32; + let lo = _pext_u32(lo, 0x5555_5555u32) as u64; + let hi = _pext_u32(hi, 0x5555_5555u32) as u64; + lo | (hi << 16usize) + } + } + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -7516,6 +7726,35 @@ impl Simd for Avx2 { unsafe { core::mem::transmute::<[__m256i; 2usize], [i32; 16usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { + unsafe { + { + let bit_lanes = _mm256_set1_epi32(bits as i32); + mask32x16 { + val: crate::support::Aligned512([ + { + let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128); + _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + }, + { + let bit_mask = + _mm256_setr_epi32(256, 512, 1024, 2048, 4096, 8192, 16384, 32768); + _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + }, + ]), + simd: self, + } + } + } + } + #[inline(always)] + fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { + let (lo, hi) = self.split_mask32x16(a); + let lo = self.to_bitmask_mask32x8(lo); + let hi = self.to_bitmask_mask32x8(hi); + lo | (hi << 8usize) + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -7929,6 +8168,34 @@ impl Simd for Avx2 { unsafe { core::mem::transmute::<[__m256i; 2usize], [i64; 8usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8 { + unsafe { + { + let bit_lanes = _mm256_set1_epi64x(bits.cast_signed()); + mask64x8 { + val: crate::support::Aligned512([ + { + let bit_mask = _mm256_set_epi64x(8, 4, 2, 1); + _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + }, + { + let bit_mask = _mm256_set_epi64x(128, 64, 32, 16); + _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + }, + ]), + simd: self, + } + } + } + } + #[inline(always)] + fn to_bitmask_mask64x8(self, a: mask64x8) -> u64 { + let (lo, hi) = self.split_mask64x8(a); + let lo = self.to_bitmask_mask64x4(lo); + let hi = self.to_bitmask_mask64x4(hi); + lo | (hi << 4usize) + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs index 43e8fd1ba..1dc4b5d96 100644 --- a/fearless_simd/src/generated/fallback.rs +++ b/fearless_simd/src/generated/fallback.rs @@ -1818,6 +1818,25 @@ impl Simd for Fallback { a.val.0 } #[inline(always)] + fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16 { + let lanes: [i8; 16usize] = + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }); + lanes.simd_into(self) + } + #[inline(always)] + fn to_bitmask_mask8x16(self, a: mask8x16) -> u64 { + let lanes = self.as_array_mask8x16(a); + let mut bits = 0u64; + let mut i = 0; + while i < 16usize { + if lanes[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + bits + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { [ i8::bitand(a.val.0[0usize], &b.val.0[0usize]), @@ -2964,6 +2983,25 @@ impl Simd for Fallback { a.val.0 } #[inline(always)] + fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8 { + let lanes: [i16; 8usize] = + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }); + lanes.simd_into(self) + } + #[inline(always)] + fn to_bitmask_mask16x8(self, a: mask16x8) -> u64 { + let lanes = self.as_array_mask16x8(a); + let mut bits = 0u64; + let mut i = 0; + while i < 8usize { + if lanes[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + bits + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { [ i16::bitand(a.val.0[0usize], &b.val.0[0usize]), @@ -3802,6 +3840,25 @@ impl Simd for Fallback { a.val.0 } #[inline(always)] + fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4 { + let lanes: [i32; 4usize] = + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }); + lanes.simd_into(self) + } + #[inline(always)] + fn to_bitmask_mask32x4(self, a: mask32x4) -> u64 { + let lanes = self.as_array_mask32x4(a); + let mut bits = 0u64; + let mut i = 0; + while i < 4usize { + if lanes[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + bits + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { [ i32::bitand(a.val.0[0usize], &b.val.0[0usize]), @@ -4196,6 +4253,25 @@ impl Simd for Fallback { a.val.0 } #[inline(always)] + fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { + let lanes: [i64; 2usize] = + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }); + lanes.simd_into(self) + } + #[inline(always)] + fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { + let lanes = self.as_array_mask64x2(a); + let mut bits = 0u64; + let mut i = 0; + while i < 2usize { + if lanes[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + bits + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { [ i64::bitand(a.val.0[0usize], &b.val.0[0usize]), @@ -5176,6 +5252,19 @@ impl Simd for Fallback { a.val.0 } #[inline(always)] + fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { + let lo = self.from_bitmask_mask8x16(bits); + let hi = self.from_bitmask_mask8x16(bits >> 16usize); + self.combine_mask8x16(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { + let (lo, hi) = self.split_mask8x32(a); + let lo = self.to_bitmask_mask8x16(lo); + let hi = self.to_bitmask_mask8x16(hi); + lo | (hi << 16usize) + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); @@ -5824,6 +5913,19 @@ impl Simd for Fallback { a.val.0 } #[inline(always)] + fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { + let lo = self.from_bitmask_mask16x8(bits); + let hi = self.from_bitmask_mask16x8(bits >> 8usize); + self.combine_mask16x8(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { + let (lo, hi) = self.split_mask16x16(a); + let lo = self.to_bitmask_mask16x8(lo); + let hi = self.to_bitmask_mask16x8(hi); + lo | (hi << 8usize) + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); @@ -6452,6 +6554,19 @@ impl Simd for Fallback { a.val.0 } #[inline(always)] + fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { + let lo = self.from_bitmask_mask32x4(bits); + let hi = self.from_bitmask_mask32x4(bits >> 4usize); + self.combine_mask32x4(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { + let (lo, hi) = self.split_mask32x8(a); + let lo = self.to_bitmask_mask32x4(lo); + let hi = self.to_bitmask_mask32x4(hi); + lo | (hi << 4usize) + } + #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); @@ -6853,6 +6968,19 @@ impl Simd for Fallback { a.val.0 } #[inline(always)] + fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { + let lo = self.from_bitmask_mask64x2(bits); + let hi = self.from_bitmask_mask64x2(bits >> 2usize); + self.combine_mask64x2(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { + let (lo, hi) = self.split_mask64x4(a); + let lo = self.to_bitmask_mask64x2(lo); + let hi = self.to_bitmask_mask64x2(hi); + lo | (hi << 2usize) + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); @@ -7921,6 +8049,19 @@ impl Simd for Fallback { a.val.0 } #[inline(always)] + fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { + let lo = self.from_bitmask_mask8x32(bits); + let hi = self.from_bitmask_mask8x32(bits >> 32usize); + self.combine_mask8x32(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { + let (lo, hi) = self.split_mask8x64(a); + let lo = self.to_bitmask_mask8x32(lo); + let hi = self.to_bitmask_mask8x32(hi); + lo | (hi << 32usize) + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -8597,6 +8738,19 @@ impl Simd for Fallback { a.val.0 } #[inline(always)] + fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { + let lo = self.from_bitmask_mask16x16(bits); + let hi = self.from_bitmask_mask16x16(bits >> 16usize); + self.combine_mask16x16(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { + let (lo, hi) = self.split_mask16x32(a); + let lo = self.to_bitmask_mask16x16(lo); + let hi = self.to_bitmask_mask16x16(hi); + lo | (hi << 16usize) + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -9237,6 +9391,19 @@ impl Simd for Fallback { a.val.0 } #[inline(always)] + fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { + let lo = self.from_bitmask_mask32x8(bits); + let hi = self.from_bitmask_mask32x8(bits >> 8usize); + self.combine_mask32x8(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { + let (lo, hi) = self.split_mask32x16(a); + let lo = self.to_bitmask_mask32x8(lo); + let hi = self.to_bitmask_mask32x8(hi); + lo | (hi << 8usize) + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -9624,6 +9791,19 @@ impl Simd for Fallback { a.val.0 } #[inline(always)] + fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8 { + let lo = self.from_bitmask_mask64x4(bits); + let hi = self.from_bitmask_mask64x4(bits >> 4usize); + self.combine_mask64x4(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask64x8(self, a: mask64x8) -> u64 { + let (lo, hi) = self.split_mask64x8(a); + let lo = self.to_bitmask_mask64x4(lo); + let hi = self.to_bitmask_mask64x4(hi); + lo | (hi << 4usize) + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index ad46f2f55..984082976 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -786,6 +786,32 @@ impl Simd for Neon { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16 { + unsafe { + let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr()); + let lo = vshlq_u16(vdupq_n_u16(bits as u16), shifts); + let hi = vshlq_u16(vdupq_n_u16((bits >> 8) as u16), shifts); + let lo = vcltq_s16(vreinterpretq_s16_u16(lo), vdupq_n_s16(0)); + let hi = vcltq_s16(vreinterpretq_s16_u16(hi), vdupq_n_s16(0)); + vcombine_s8( + vmovn_s16(vreinterpretq_s16_u16(lo)), + vmovn_s16(vreinterpretq_s16_u16(hi)), + ) + .simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask8x16(self, a: mask8x16) -> u64 { + unsafe { + let weights = + vld1q_u8([1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128].as_ptr()); + let bits = vandq_u8(vreinterpretq_u8_s8(a.into()), weights); + let lo = vaddv_u8(vget_low_u8(bits)) as u64; + let hi = vaddv_u8(vget_high_u8(bits)) as u64; + lo | (hi << 8) + } + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { vandq_s8(a.into(), b.into()).simd_into(self) } } @@ -1272,6 +1298,23 @@ impl Simd for Neon { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8 { + unsafe { + let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr()); + let shifted = vshlq_u16(vdupq_n_u16(bits as u16), shifts); + let mask = vcltq_s16(vreinterpretq_s16_u16(shifted), vdupq_n_s16(0)); + vreinterpretq_s16_u16(mask).simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask16x8(self, a: mask16x8) -> u64 { + unsafe { + let weights = vld1q_u16([1, 2, 4, 8, 16, 32, 64, 128].as_ptr()); + let bits = vandq_u16(vreinterpretq_u16_s16(a.into()), weights); + vaddvq_u16(bits) as u64 + } + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { vandq_s16(a.into(), b.into()).simd_into(self) } } @@ -1762,6 +1805,23 @@ impl Simd for Neon { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4 { + unsafe { + let shifts = vld1q_s32([31, 30, 29, 28].as_ptr()); + let shifted = vshlq_u32(vdupq_n_u32(bits as u32), shifts); + let mask = vcltq_s32(vreinterpretq_s32_u32(shifted), vdupq_n_s32(0)); + vreinterpretq_s32_u32(mask).simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask32x4(self, a: mask32x4) -> u64 { + unsafe { + let weights = vld1q_u32([1, 2, 4, 8].as_ptr()); + let bits = vandq_u32(vreinterpretq_u32_s32(a.into()), weights); + vaddvq_u32(bits) as u64 + } + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { vandq_s32(a.into(), b.into()).simd_into(self) } } @@ -2065,6 +2125,23 @@ impl Simd for Neon { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { + unsafe { + let shifts = vld1q_s64([63, 62].as_ptr()); + let shifted = vshlq_u64(vdupq_n_u64(bits), shifts); + let mask = vcltq_s64(vreinterpretq_s64_u64(shifted), vdupq_n_s64(0)); + vreinterpretq_s64_u64(mask).simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { + unsafe { + let weights = vld1q_u64([1, 2].as_ptr()); + let bits = vandq_u64(vreinterpretq_u64_s64(a.into()), weights); + vaddvq_u64(bits) + } + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { vandq_s64(a.into(), b.into()).simd_into(self) } } @@ -3146,6 +3223,19 @@ impl Simd for Neon { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { + let lo = self.from_bitmask_mask8x16(bits); + let hi = self.from_bitmask_mask8x16(bits >> 16usize); + self.combine_mask8x16(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { + let (lo, hi) = self.split_mask8x32(a); + let lo = self.to_bitmask_mask8x16(lo); + let hi = self.to_bitmask_mask8x16(hi); + lo | (hi << 16usize) + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); @@ -3874,6 +3964,19 @@ impl Simd for Neon { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { + let lo = self.from_bitmask_mask16x8(bits); + let hi = self.from_bitmask_mask16x8(bits >> 8usize); + self.combine_mask16x8(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { + let (lo, hi) = self.split_mask16x16(a); + let lo = self.to_bitmask_mask16x8(lo); + let hi = self.to_bitmask_mask16x8(hi); + lo | (hi << 8usize) + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); @@ -4595,6 +4698,19 @@ impl Simd for Neon { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { + let lo = self.from_bitmask_mask32x4(bits); + let hi = self.from_bitmask_mask32x4(bits >> 4usize); + self.combine_mask32x4(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { + let (lo, hi) = self.split_mask32x8(a); + let lo = self.to_bitmask_mask32x4(lo); + let hi = self.to_bitmask_mask32x4(hi); + lo | (hi << 4usize) + } + #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); @@ -5046,6 +5162,19 @@ impl Simd for Neon { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { + let lo = self.from_bitmask_mask64x2(bits); + let hi = self.from_bitmask_mask64x2(bits >> 2usize); + self.combine_mask64x2(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { + let (lo, hi) = self.split_mask64x4(a); + let lo = self.to_bitmask_mask64x2(lo); + let hi = self.to_bitmask_mask64x2(hi); + lo | (hi << 2usize) + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); @@ -6199,6 +6328,19 @@ impl Simd for Neon { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { + let lo = self.from_bitmask_mask8x32(bits); + let hi = self.from_bitmask_mask8x32(bits >> 32usize); + self.combine_mask8x32(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { + let (lo, hi) = self.split_mask8x64(a); + let lo = self.to_bitmask_mask8x32(lo); + let hi = self.to_bitmask_mask8x32(hi); + lo | (hi << 32usize) + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -6958,6 +7100,19 @@ impl Simd for Neon { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { + let lo = self.from_bitmask_mask16x16(bits); + let hi = self.from_bitmask_mask16x16(bits >> 16usize); + self.combine_mask16x16(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { + let (lo, hi) = self.split_mask16x32(a); + let lo = self.to_bitmask_mask16x16(lo); + let hi = self.to_bitmask_mask16x16(hi); + lo | (hi << 16usize) + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -7699,6 +7854,19 @@ impl Simd for Neon { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { + let lo = self.from_bitmask_mask32x8(bits); + let hi = self.from_bitmask_mask32x8(bits >> 8usize); + self.combine_mask32x8(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { + let (lo, hi) = self.split_mask32x16(a); + let lo = self.to_bitmask_mask32x8(lo); + let hi = self.to_bitmask_mask32x8(hi); + lo | (hi << 8usize) + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -8150,6 +8318,19 @@ impl Simd for Neon { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8 { + let lo = self.from_bitmask_mask64x4(bits); + let hi = self.from_bitmask_mask64x4(bits >> 4usize); + self.combine_mask64x4(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask64x8(self, a: mask64x8) -> u64 { + let (lo, hi) = self.split_mask64x8(a); + let lo = self.to_bitmask_mask64x4(lo); + let hi = self.to_bitmask_mask64x4(hi); + lo | (hi << 4usize) + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index 01a5ee0f9..47b3ce962 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -396,6 +396,10 @@ pub trait Simd: fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16; #[doc = "Convert a SIMD mask to signed integer mask lanes."] fn as_array_mask8x16(self, a: mask8x16) -> [i8; 16usize]; + #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."] + fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16; + #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] + fn to_bitmask_mask8x16(self, a: mask8x16) -> u64; #[doc = "Compute the logical AND of two masks."] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16; #[doc = "Compute the logical OR of two masks."] @@ -595,6 +599,10 @@ pub trait Simd: fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8; #[doc = "Convert a SIMD mask to signed integer mask lanes."] fn as_array_mask16x8(self, a: mask16x8) -> [i16; 8usize]; + #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."] + fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8; + #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] + fn to_bitmask_mask16x8(self, a: mask16x8) -> u64; #[doc = "Compute the logical AND of two masks."] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8; #[doc = "Compute the logical OR of two masks."] @@ -796,6 +804,10 @@ pub trait Simd: fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4; #[doc = "Convert a SIMD mask to signed integer mask lanes."] fn as_array_mask32x4(self, a: mask32x4) -> [i32; 4usize]; + #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."] + fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4; + #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] + fn to_bitmask_mask32x4(self, a: mask32x4) -> u64; #[doc = "Compute the logical AND of two masks."] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4; #[doc = "Compute the logical OR of two masks."] @@ -921,6 +933,10 @@ pub trait Simd: fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2; #[doc = "Convert a SIMD mask to signed integer mask lanes."] fn as_array_mask64x2(self, a: mask64x2) -> [i64; 2usize]; + #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."] + fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2; + #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] + fn to_bitmask_mask64x2(self, a: mask64x2) -> u64; #[doc = "Compute the logical AND of two masks."] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2; #[doc = "Compute the logical OR of two masks."] @@ -1232,6 +1248,10 @@ pub trait Simd: fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32; #[doc = "Convert a SIMD mask to signed integer mask lanes."] fn as_array_mask8x32(self, a: mask8x32) -> [i8; 32usize]; + #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."] + fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32; + #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] + fn to_bitmask_mask8x32(self, a: mask8x32) -> u64; #[doc = "Compute the logical AND of two masks."] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32; #[doc = "Compute the logical OR of two masks."] @@ -1439,6 +1459,10 @@ pub trait Simd: fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16; #[doc = "Convert a SIMD mask to signed integer mask lanes."] fn as_array_mask16x16(self, a: mask16x16) -> [i16; 16usize]; + #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."] + fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16; + #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] + fn to_bitmask_mask16x16(self, a: mask16x16) -> u64; #[doc = "Compute the logical AND of two masks."] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16; #[doc = "Compute the logical OR of two masks."] @@ -1646,6 +1670,10 @@ pub trait Simd: fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8; #[doc = "Convert a SIMD mask to signed integer mask lanes."] fn as_array_mask32x8(self, a: mask32x8) -> [i32; 8usize]; + #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."] + fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8; + #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] + fn to_bitmask_mask32x8(self, a: mask32x8) -> u64; #[doc = "Compute the logical AND of two masks."] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8; #[doc = "Compute the logical OR of two masks."] @@ -1775,6 +1803,10 @@ pub trait Simd: fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4; #[doc = "Convert a SIMD mask to signed integer mask lanes."] fn as_array_mask64x4(self, a: mask64x4) -> [i64; 4usize]; + #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."] + fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4; + #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] + fn to_bitmask_mask64x4(self, a: mask64x4) -> u64; #[doc = "Compute the logical AND of two masks."] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4; #[doc = "Compute the logical OR of two masks."] @@ -2088,6 +2120,10 @@ pub trait Simd: fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64; #[doc = "Convert a SIMD mask to signed integer mask lanes."] fn as_array_mask8x64(self, a: mask8x64) -> [i8; 64usize]; + #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."] + fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64; + #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] + fn to_bitmask_mask8x64(self, a: mask8x64) -> u64; #[doc = "Compute the logical AND of two masks."] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64; #[doc = "Compute the logical OR of two masks."] @@ -2293,6 +2329,10 @@ pub trait Simd: fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32; #[doc = "Convert a SIMD mask to signed integer mask lanes."] fn as_array_mask16x32(self, a: mask16x32) -> [i16; 32usize]; + #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."] + fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32; + #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] + fn to_bitmask_mask16x32(self, a: mask16x32) -> u64; #[doc = "Compute the logical AND of two masks."] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32; #[doc = "Compute the logical OR of two masks."] @@ -2498,6 +2538,10 @@ pub trait Simd: fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16; #[doc = "Convert a SIMD mask to signed integer mask lanes."] fn as_array_mask32x16(self, a: mask32x16) -> [i32; 16usize]; + #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."] + fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16; + #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] + fn to_bitmask_mask32x16(self, a: mask32x16) -> u64; #[doc = "Compute the logical AND of two masks."] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16; #[doc = "Compute the logical OR of two masks."] @@ -2623,6 +2667,10 @@ pub trait Simd: fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8; #[doc = "Convert a SIMD mask to signed integer mask lanes."] fn as_array_mask64x8(self, a: mask64x8) -> [i64; 8usize]; + #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."] + fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8; + #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."] + fn to_bitmask_mask64x8(self, a: mask64x8) -> u64; #[doc = "Compute the logical AND of two masks."] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8; #[doc = "Compute the logical OR of two masks."] @@ -2951,6 +2999,32 @@ pub trait SimdMask: fn witness(&self) -> S; #[doc = r" Create a SIMD mask with all lanes set to the given boolean value."] fn splat(simd: S, val: bool) -> Self; + #[doc = r" Create a mask from a compact bitmask."] + #[doc = r""] + #[doc = r" Bit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above"] + #[doc = r" [`Self::N`] are ignored."] + fn from_bitmask(simd: S, bits: u64) -> Self; + #[doc = r" Convert this mask to a compact bitmask."] + #[doc = r""] + #[doc = r" Bit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above"] + #[doc = r" [`Self::N`] are cleared."] + fn to_bitmask(self) -> u64; + #[doc = r" Test whether one logical lane is set."] + #[doc = r""] + #[doc = r" Panics if `index` is greater than or equal to the number of lanes in the mask."] + #[inline(always)] + fn test(&self, index: usize) -> bool { + assert!( + index < Self::N, + "mask lane index {index} is out of bounds for {} lanes", + Self::N + ); + (((*self).to_bitmask() >> index) & 1) != 0 + } + #[doc = r" Sets the value of one logical lane."] + #[doc = r""] + #[doc = r" Panics if `index` is greater than or equal to the number of lanes in the mask."] + fn set(&mut self, index: usize, value: bool); #[doc = r" Create a SIMD mask from signed integer mask lanes."] #[doc = r""] #[doc = r" The slice must be exactly the size of the SIMD mask."] diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs index ec0e074ff..a71b080e6 100644 --- a/fearless_simd/src/generated/simd_types.rs +++ b/fearless_simd/src/generated/simd_types.rs @@ -625,7 +625,7 @@ impl crate::SimdCombine for u8x16 { self.simd.combine_u8x16(self, rhs.simd_into(self.simd)) } } -#[doc = "A SIMD mask of 16 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] +#[doc = "A SIMD mask of 16 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] #[derive(Clone, Copy)] pub struct mask8x16 { pub(crate) val: S::mask8x16, @@ -674,6 +674,25 @@ impl crate::SimdMask for mask8x16 { simd.splat_mask8x16(val) } #[inline(always)] + fn from_bitmask(simd: S, bits: u64) -> Self { + simd.from_bitmask_mask8x16(bits) + } + #[inline(always)] + fn to_bitmask(self) -> u64 { + self.simd.to_bitmask_mask8x16(self) + } + #[inline(always)] + fn set(&mut self, index: usize, value: bool) { + assert!( + index < 16, + "mask lane index {index} is out of bounds for {} lanes", + 16 + ); + let mut lanes = self.simd.as_array_mask8x16(*self); + lanes[index] = if value { !0 } else { 0 }; + *self = self.simd.load_array_mask8x16(lanes); + } + #[inline(always)] fn from_slice(simd: S, slice: &[i8]) -> Self { let slice: &[i8; 16] = slice.try_into().unwrap(); simd.load_array_mask8x16(*slice) @@ -1074,7 +1093,7 @@ impl crate::SimdCombine for u16x8 { self.simd.combine_u16x8(self, rhs.simd_into(self.simd)) } } -#[doc = "A SIMD mask of 8 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] +#[doc = "A SIMD mask of 8 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] #[derive(Clone, Copy)] pub struct mask16x8 { pub(crate) val: S::mask16x8, @@ -1123,6 +1142,25 @@ impl crate::SimdMask for mask16x8 { simd.splat_mask16x8(val) } #[inline(always)] + fn from_bitmask(simd: S, bits: u64) -> Self { + simd.from_bitmask_mask16x8(bits) + } + #[inline(always)] + fn to_bitmask(self) -> u64 { + self.simd.to_bitmask_mask16x8(self) + } + #[inline(always)] + fn set(&mut self, index: usize, value: bool) { + assert!( + index < 8, + "mask lane index {index} is out of bounds for {} lanes", + 8 + ); + let mut lanes = self.simd.as_array_mask16x8(*self); + lanes[index] = if value { !0 } else { 0 }; + *self = self.simd.load_array_mask16x8(lanes); + } + #[inline(always)] fn from_slice(simd: S, slice: &[i16]) -> Self { let slice: &[i16; 8] = slice.try_into().unwrap(); simd.load_array_mask16x8(*slice) @@ -1547,7 +1585,7 @@ impl crate::SimdCombine for u32x4 { self.simd.combine_u32x4(self, rhs.simd_into(self.simd)) } } -#[doc = "A SIMD mask of 4 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] +#[doc = "A SIMD mask of 4 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] #[derive(Clone, Copy)] pub struct mask32x4 { pub(crate) val: S::mask32x4, @@ -1596,6 +1634,25 @@ impl crate::SimdMask for mask32x4 { simd.splat_mask32x4(val) } #[inline(always)] + fn from_bitmask(simd: S, bits: u64) -> Self { + simd.from_bitmask_mask32x4(bits) + } + #[inline(always)] + fn to_bitmask(self) -> u64 { + self.simd.to_bitmask_mask32x4(self) + } + #[inline(always)] + fn set(&mut self, index: usize, value: bool) { + assert!( + index < 4, + "mask lane index {index} is out of bounds for {} lanes", + 4 + ); + let mut lanes = self.simd.as_array_mask32x4(*self); + lanes[index] = if value { !0 } else { 0 }; + *self = self.simd.load_array_mask32x4(lanes); + } + #[inline(always)] fn from_slice(simd: S, slice: &[i32]) -> Self { let slice: &[i32; 4] = slice.try_into().unwrap(); simd.load_array_mask32x4(*slice) @@ -1861,7 +1918,7 @@ impl crate::SimdCombine for f64x2 { self.simd.combine_f64x2(self, rhs.simd_into(self.simd)) } } -#[doc = "A SIMD mask of 2 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] +#[doc = "A SIMD mask of 2 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] #[derive(Clone, Copy)] pub struct mask64x2 { pub(crate) val: S::mask64x2, @@ -1910,6 +1967,25 @@ impl crate::SimdMask for mask64x2 { simd.splat_mask64x2(val) } #[inline(always)] + fn from_bitmask(simd: S, bits: u64) -> Self { + simd.from_bitmask_mask64x2(bits) + } + #[inline(always)] + fn to_bitmask(self) -> u64 { + self.simd.to_bitmask_mask64x2(self) + } + #[inline(always)] + fn set(&mut self, index: usize, value: bool) { + assert!( + index < 2, + "mask lane index {index} is out of bounds for {} lanes", + 2 + ); + let mut lanes = self.simd.as_array_mask64x2(*self); + lanes[index] = if value { !0 } else { 0 }; + *self = self.simd.load_array_mask64x2(lanes); + } + #[inline(always)] fn from_slice(simd: S, slice: &[i64]) -> Self { let slice: &[i64; 2] = slice.try_into().unwrap(); simd.load_array_mask64x2(*slice) @@ -2580,7 +2656,7 @@ impl crate::SimdCombine for u8x32 { self.simd.combine_u8x32(self, rhs.simd_into(self.simd)) } } -#[doc = "A SIMD mask of 32 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] +#[doc = "A SIMD mask of 32 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] #[derive(Clone, Copy)] pub struct mask8x32 { pub(crate) val: S::mask8x32, @@ -2629,6 +2705,25 @@ impl crate::SimdMask for mask8x32 { simd.splat_mask8x32(val) } #[inline(always)] + fn from_bitmask(simd: S, bits: u64) -> Self { + simd.from_bitmask_mask8x32(bits) + } + #[inline(always)] + fn to_bitmask(self) -> u64 { + self.simd.to_bitmask_mask8x32(self) + } + #[inline(always)] + fn set(&mut self, index: usize, value: bool) { + assert!( + index < 32, + "mask lane index {index} is out of bounds for {} lanes", + 32 + ); + let mut lanes = self.simd.as_array_mask8x32(*self); + lanes[index] = if value { !0 } else { 0 }; + *self = self.simd.load_array_mask8x32(lanes); + } + #[inline(always)] fn from_slice(simd: S, slice: &[i8]) -> Self { let slice: &[i8; 32] = slice.try_into().unwrap(); simd.load_array_mask8x32(*slice) @@ -3055,7 +3150,7 @@ impl crate::SimdCombine for u16x16 { self.simd.combine_u16x16(self, rhs.simd_into(self.simd)) } } -#[doc = "A SIMD mask of 16 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] +#[doc = "A SIMD mask of 16 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] #[derive(Clone, Copy)] pub struct mask16x16 { pub(crate) val: S::mask16x16, @@ -3104,6 +3199,25 @@ impl crate::SimdMask for mask16x16 { simd.splat_mask16x16(val) } #[inline(always)] + fn from_bitmask(simd: S, bits: u64) -> Self { + simd.from_bitmask_mask16x16(bits) + } + #[inline(always)] + fn to_bitmask(self) -> u64 { + self.simd.to_bitmask_mask16x16(self) + } + #[inline(always)] + fn set(&mut self, index: usize, value: bool) { + assert!( + index < 16, + "mask lane index {index} is out of bounds for {} lanes", + 16 + ); + let mut lanes = self.simd.as_array_mask16x16(*self); + lanes[index] = if value { !0 } else { 0 }; + *self = self.simd.load_array_mask16x16(lanes); + } + #[inline(always)] fn from_slice(simd: S, slice: &[i16]) -> Self { let slice: &[i16; 16] = slice.try_into().unwrap(); simd.load_array_mask16x16(*slice) @@ -3542,7 +3656,7 @@ impl crate::SimdCombine for u32x8 { self.simd.combine_u32x8(self, rhs.simd_into(self.simd)) } } -#[doc = "A SIMD mask of 8 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] +#[doc = "A SIMD mask of 8 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] #[derive(Clone, Copy)] pub struct mask32x8 { pub(crate) val: S::mask32x8, @@ -3591,6 +3705,25 @@ impl crate::SimdMask for mask32x8 { simd.splat_mask32x8(val) } #[inline(always)] + fn from_bitmask(simd: S, bits: u64) -> Self { + simd.from_bitmask_mask32x8(bits) + } + #[inline(always)] + fn to_bitmask(self) -> u64 { + self.simd.to_bitmask_mask32x8(self) + } + #[inline(always)] + fn set(&mut self, index: usize, value: bool) { + assert!( + index < 8, + "mask lane index {index} is out of bounds for {} lanes", + 8 + ); + let mut lanes = self.simd.as_array_mask32x8(*self); + lanes[index] = if value { !0 } else { 0 }; + *self = self.simd.load_array_mask32x8(lanes); + } + #[inline(always)] fn from_slice(simd: S, slice: &[i32]) -> Self { let slice: &[i32; 8] = slice.try_into().unwrap(); simd.load_array_mask32x8(*slice) @@ -3863,7 +3996,7 @@ impl crate::SimdCombine for f64x4 { self.simd.combine_f64x4(self, rhs.simd_into(self.simd)) } } -#[doc = "A SIMD mask of 4 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] +#[doc = "A SIMD mask of 4 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] #[derive(Clone, Copy)] pub struct mask64x4 { pub(crate) val: S::mask64x4, @@ -3912,6 +4045,25 @@ impl crate::SimdMask for mask64x4 { simd.splat_mask64x4(val) } #[inline(always)] + fn from_bitmask(simd: S, bits: u64) -> Self { + simd.from_bitmask_mask64x4(bits) + } + #[inline(always)] + fn to_bitmask(self) -> u64 { + self.simd.to_bitmask_mask64x4(self) + } + #[inline(always)] + fn set(&mut self, index: usize, value: bool) { + assert!( + index < 4, + "mask lane index {index} is out of bounds for {} lanes", + 4 + ); + let mut lanes = self.simd.as_array_mask64x4(*self); + lanes[index] = if value { !0 } else { 0 }; + *self = self.simd.load_array_mask64x4(lanes); + } + #[inline(always)] fn from_slice(simd: S, slice: &[i64]) -> Self { let slice: &[i64; 4] = slice.try_into().unwrap(); simd.load_array_mask64x4(*slice) @@ -4570,7 +4722,7 @@ impl crate::SimdSplit for u8x64 { self.simd.split_u8x64(self) } } -#[doc = "A SIMD mask of 64 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] +#[doc = "A SIMD mask of 64 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] #[derive(Clone, Copy)] pub struct mask8x64 { pub(crate) val: S::mask8x64, @@ -4619,6 +4771,25 @@ impl crate::SimdMask for mask8x64 { simd.splat_mask8x64(val) } #[inline(always)] + fn from_bitmask(simd: S, bits: u64) -> Self { + simd.from_bitmask_mask8x64(bits) + } + #[inline(always)] + fn to_bitmask(self) -> u64 { + self.simd.to_bitmask_mask8x64(self) + } + #[inline(always)] + fn set(&mut self, index: usize, value: bool) { + assert!( + index < 64, + "mask lane index {index} is out of bounds for {} lanes", + 64 + ); + let mut lanes = self.simd.as_array_mask8x64(*self); + lanes[index] = if value { !0 } else { 0 }; + *self = self.simd.load_array_mask8x64(lanes); + } + #[inline(always)] fn from_slice(simd: S, slice: &[i8]) -> Self { let slice: &[i8; 64] = slice.try_into().unwrap(); simd.load_array_mask8x64(*slice) @@ -5033,7 +5204,7 @@ impl crate::SimdSplit for u16x32 { self.simd.split_u16x32(self) } } -#[doc = "A SIMD mask of 32 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] +#[doc = "A SIMD mask of 32 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] #[derive(Clone, Copy)] pub struct mask16x32 { pub(crate) val: S::mask16x32, @@ -5082,6 +5253,25 @@ impl crate::SimdMask for mask16x32 { simd.splat_mask16x32(val) } #[inline(always)] + fn from_bitmask(simd: S, bits: u64) -> Self { + simd.from_bitmask_mask16x32(bits) + } + #[inline(always)] + fn to_bitmask(self) -> u64 { + self.simd.to_bitmask_mask16x32(self) + } + #[inline(always)] + fn set(&mut self, index: usize, value: bool) { + assert!( + index < 32, + "mask lane index {index} is out of bounds for {} lanes", + 32 + ); + let mut lanes = self.simd.as_array_mask16x32(*self); + lanes[index] = if value { !0 } else { 0 }; + *self = self.simd.load_array_mask16x32(lanes); + } + #[inline(always)] fn from_slice(simd: S, slice: &[i16]) -> Self { let slice: &[i16; 32] = slice.try_into().unwrap(); simd.load_array_mask16x32(*slice) @@ -5520,7 +5710,7 @@ impl crate::SimdSplit for u32x16 { self.simd.split_u32x16(self) } } -#[doc = "A SIMD mask of 16 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] +#[doc = "A SIMD mask of 16 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] #[derive(Clone, Copy)] pub struct mask32x16 { pub(crate) val: S::mask32x16, @@ -5569,6 +5759,25 @@ impl crate::SimdMask for mask32x16 { simd.splat_mask32x16(val) } #[inline(always)] + fn from_bitmask(simd: S, bits: u64) -> Self { + simd.from_bitmask_mask32x16(bits) + } + #[inline(always)] + fn to_bitmask(self) -> u64 { + self.simd.to_bitmask_mask32x16(self) + } + #[inline(always)] + fn set(&mut self, index: usize, value: bool) { + assert!( + index < 16, + "mask lane index {index} is out of bounds for {} lanes", + 16 + ); + let mut lanes = self.simd.as_array_mask32x16(*self); + lanes[index] = if value { !0 } else { 0 }; + *self = self.simd.load_array_mask32x16(lanes); + } + #[inline(always)] fn from_slice(simd: S, slice: &[i32]) -> Self { let slice: &[i32; 16] = slice.try_into().unwrap(); simd.load_array_mask32x16(*slice) @@ -5835,7 +6044,7 @@ impl crate::SimdSplit for f64x8 { self.simd.split_f64x8(self) } } -#[doc = "A SIMD mask of 8 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] +#[doc = "A SIMD mask of 8 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."] #[derive(Clone, Copy)] pub struct mask64x8 { pub(crate) val: S::mask64x8, @@ -5884,6 +6093,25 @@ impl crate::SimdMask for mask64x8 { simd.splat_mask64x8(val) } #[inline(always)] + fn from_bitmask(simd: S, bits: u64) -> Self { + simd.from_bitmask_mask64x8(bits) + } + #[inline(always)] + fn to_bitmask(self) -> u64 { + self.simd.to_bitmask_mask64x8(self) + } + #[inline(always)] + fn set(&mut self, index: usize, value: bool) { + assert!( + index < 8, + "mask lane index {index} is out of bounds for {} lanes", + 8 + ); + let mut lanes = self.simd.as_array_mask64x8(*self); + lanes[index] = if value { !0 } else { 0 }; + *self = self.simd.load_array_mask64x8(lanes); + } + #[inline(always)] fn from_slice(simd: S, slice: &[i64]) -> Self { let slice: &[i64; 8] = slice.try_into().unwrap(); simd.load_array_mask64x8(*slice) diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index 6388d315c..746cbca55 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -935,6 +935,26 @@ impl Simd for Sse4_2 { unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16 { + unsafe { + { + let bit_bytes = _mm_cvtsi32_si128(bits as i32); + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), + ); + let bit_mask = + _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + } + .simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask8x16(self, a: mask8x16) -> u64 { + unsafe { _mm_movemask_epi8(a.into()) as u32 as u64 } + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -1435,6 +1455,26 @@ impl Simd for Sse4_2 { unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8 { + unsafe { + { + let bit_lanes = _mm_set1_epi16(bits as i16); + let bit_mask = _mm_setr_epi16(1, 2, 4, 8, 16, 32, 64, 128); + _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + .simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask16x8(self, a: mask16x8) -> u64 { + unsafe { + { + let packed = _mm_packs_epi16(a.into(), a.into()); + _mm_movemask_epi8(packed) as u8 as u64 + } + } + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -1945,6 +1985,21 @@ impl Simd for Sse4_2 { unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4 { + unsafe { + { + let bit_lanes = _mm_set1_epi32(bits as i32); + let bit_mask = _mm_setr_epi32(1, 2, 4, 8); + _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + .simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask32x4(self, a: mask32x4) -> u64 { + unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 } + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -2253,6 +2308,21 @@ impl Simd for Sse4_2 { unsafe { core::mem::transmute::<__m128i, [i64; 2usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { + unsafe { + { + let bit_lanes = _mm_set1_epi64x(bits.cast_signed()); + let bit_mask = _mm_set_epi64x(2, 1); + _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + .simd_into(self) + } + } + #[inline(always)] + fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { + unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 } + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -3268,6 +3338,19 @@ impl Simd for Sse4_2 { unsafe { core::mem::transmute::<[__m128i; 2usize], [i8; 32usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { + let lo = self.from_bitmask_mask8x16(bits); + let hi = self.from_bitmask_mask8x16(bits >> 16usize); + self.combine_mask8x16(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { + let (lo, hi) = self.split_mask8x32(a); + let lo = self.to_bitmask_mask8x16(lo); + let hi = self.to_bitmask_mask8x16(hi); + lo | (hi << 16usize) + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); @@ -3952,6 +4035,21 @@ impl Simd for Sse4_2 { unsafe { core::mem::transmute::<[__m128i; 2usize], [i16; 16usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { + let lo = self.from_bitmask_mask16x8(bits); + let hi = self.from_bitmask_mask16x8(bits >> 8usize); + self.combine_mask16x8(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { + unsafe { + { + let packed = _mm_packs_epi16(a.val.0[0], a.val.0[1]); + _mm_movemask_epi8(packed) as u32 as u64 + } + } + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); @@ -4627,6 +4725,19 @@ impl Simd for Sse4_2 { unsafe { core::mem::transmute::<[__m128i; 2usize], [i32; 8usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { + let lo = self.from_bitmask_mask32x4(bits); + let hi = self.from_bitmask_mask32x4(bits >> 4usize); + self.combine_mask32x4(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { + let (lo, hi) = self.split_mask32x8(a); + let lo = self.to_bitmask_mask32x4(lo); + let hi = self.to_bitmask_mask32x4(hi); + lo | (hi << 4usize) + } + #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); @@ -5054,6 +5165,19 @@ impl Simd for Sse4_2 { unsafe { core::mem::transmute::<[__m128i; 2usize], [i64; 4usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { + let lo = self.from_bitmask_mask64x2(bits); + let hi = self.from_bitmask_mask64x2(bits >> 2usize); + self.combine_mask64x2(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { + let (lo, hi) = self.split_mask64x4(a); + let lo = self.to_bitmask_mask64x2(lo); + let hi = self.to_bitmask_mask64x2(hi); + lo | (hi << 2usize) + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); @@ -6175,6 +6299,56 @@ impl Simd for Sse4_2 { unsafe { core::mem::transmute::<[__m128i; 4usize], [i8; 64usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { + unsafe { + { + let bit_bytes = _mm_set1_epi64x(bits.cast_signed()); + let bit_mask = + _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128); + mask8x64 { + val: crate::support::Aligned512([ + { + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1), + ); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + }, + { + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3), + ); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + }, + { + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5), + ); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + }, + { + let bit_bytes = _mm_shuffle_epi8( + bit_bytes, + _mm_setr_epi8(6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7), + ); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + }, + ]), + simd: self, + } + } + } + } + #[inline(always)] + fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { + let (lo, hi) = self.split_mask8x64(a); + let lo = self.to_bitmask_mask8x32(lo); + let hi = self.to_bitmask_mask8x32(hi); + lo | (hi << 32usize) + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -6905,6 +7079,24 @@ impl Simd for Sse4_2 { unsafe { core::mem::transmute::<[__m128i; 4usize], [i16; 32usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { + let lo = self.from_bitmask_mask16x16(bits); + let hi = self.from_bitmask_mask16x16(bits >> 16usize); + self.combine_mask16x16(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { + unsafe { + { + let lo = _mm_packs_epi16(a.val.0[0], a.val.0[1]); + let hi = _mm_packs_epi16(a.val.0[2], a.val.0[3]); + let lo = _mm_movemask_epi8(lo) as u32 as u64; + let hi = _mm_movemask_epi8(hi) as u32 as u64; + lo | (hi << 16usize) + } + } + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -7607,6 +7799,19 @@ impl Simd for Sse4_2 { unsafe { core::mem::transmute::<[__m128i; 4usize], [i32; 16usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { + let lo = self.from_bitmask_mask32x8(bits); + let hi = self.from_bitmask_mask32x8(bits >> 8usize); + self.combine_mask32x8(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { + let (lo, hi) = self.split_mask32x16(a); + let lo = self.to_bitmask_mask32x8(lo); + let hi = self.to_bitmask_mask32x8(hi); + lo | (hi << 8usize) + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -8020,6 +8225,19 @@ impl Simd for Sse4_2 { unsafe { core::mem::transmute::<[__m128i; 4usize], [i64; 8usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8 { + let lo = self.from_bitmask_mask64x4(bits); + let hi = self.from_bitmask_mask64x4(bits >> 4usize); + self.combine_mask64x4(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask64x8(self, a: mask64x8) -> u64 { + let (lo, hi) = self.split_mask64x8(a); + let lo = self.to_bitmask_mask64x4(lo); + let hi = self.to_bitmask_mask64x4(hi); + lo | (hi << 4usize) + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 4eb7671bb..da6b718dc 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -852,6 +852,19 @@ impl Simd for WasmSimd128 { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16 { + let lo = i8x16_splat(bits as i8); + let hi = i8x16_splat((bits >> 8) as i8); + let bytes = u8x16_shuffle::<0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16, 16>(lo, hi); + let powers = u8x16(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); + let selected = v128_and(bytes, powers); + i8x16_ne(selected, i8x16_splat(0)).simd_into(self) + } + #[inline(always)] + fn to_bitmask_mask8x16(self, a: mask8x16) -> u64 { + i8x16_bitmask(a.into()) as u64 + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { v128_and(a.into(), b.into()).simd_into(self) } @@ -1341,6 +1354,17 @@ impl Simd for WasmSimd128 { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8 { + let bitset = i16x8_splat(bits as i16); + let powers = u16x8(1, 2, 4, 8, 16, 32, 64, 128); + let selected = v128_and(bitset, powers); + i16x8_ne(selected, i16x8_splat(0)).simd_into(self) + } + #[inline(always)] + fn to_bitmask_mask16x8(self, a: mask16x8) -> u64 { + i16x8_bitmask(a.into()) as u64 + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { v128_and(a.into(), b.into()).simd_into(self) } @@ -1834,6 +1858,17 @@ impl Simd for WasmSimd128 { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4 { + let bitset = i32x4_splat(bits as i32); + let powers = u32x4(1, 2, 4, 8); + let selected = v128_and(bitset, powers); + i32x4_ne(selected, i32x4_splat(0)).simd_into(self) + } + #[inline(always)] + fn to_bitmask_mask32x4(self, a: mask32x4) -> u64 { + i32x4_bitmask(a.into()) as u64 + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { v128_and(a.into(), b.into()).simd_into(self) } @@ -2169,6 +2204,17 @@ impl Simd for WasmSimd128 { unsafe { core::mem::transmute::(a.val.0) } } #[inline(always)] + fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2 { + let bitset = i64x2_splat(bits as i64); + let powers = u64x2(1, 2); + let selected = v128_and(bitset, powers); + i64x2_ne(selected, i64x2_splat(0)).simd_into(self) + } + #[inline(always)] + fn to_bitmask_mask64x2(self, a: mask64x2) -> u64 { + i64x2_bitmask(a.into()) as u64 + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { v128_and(a.into(), b.into()).simd_into(self) } @@ -3191,6 +3237,19 @@ impl Simd for WasmSimd128 { unsafe { core::mem::transmute::<[v128; 2usize], [i8; 32usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32 { + let lo = self.from_bitmask_mask8x16(bits); + let hi = self.from_bitmask_mask8x16(bits >> 16usize); + self.combine_mask8x16(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask8x32(self, a: mask8x32) -> u64 { + let (lo, hi) = self.split_mask8x32(a); + let lo = self.to_bitmask_mask8x16(lo); + let hi = self.to_bitmask_mask8x16(hi); + lo | (hi << 16usize) + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); @@ -3873,6 +3932,19 @@ impl Simd for WasmSimd128 { unsafe { core::mem::transmute::<[v128; 2usize], [i16; 16usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16 { + let lo = self.from_bitmask_mask16x8(bits); + let hi = self.from_bitmask_mask16x8(bits >> 8usize); + self.combine_mask16x8(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask16x16(self, a: mask16x16) -> u64 { + let (lo, hi) = self.split_mask16x16(a); + let lo = self.to_bitmask_mask16x8(lo); + let hi = self.to_bitmask_mask16x8(hi); + lo | (hi << 8usize) + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); @@ -4548,6 +4620,19 @@ impl Simd for WasmSimd128 { unsafe { core::mem::transmute::<[v128; 2usize], [i32; 8usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8 { + let lo = self.from_bitmask_mask32x4(bits); + let hi = self.from_bitmask_mask32x4(bits >> 4usize); + self.combine_mask32x4(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask32x8(self, a: mask32x8) -> u64 { + let (lo, hi) = self.split_mask32x8(a); + let lo = self.to_bitmask_mask32x4(lo); + let hi = self.to_bitmask_mask32x4(hi); + lo | (hi << 4usize) + } + #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); @@ -4975,6 +5060,19 @@ impl Simd for WasmSimd128 { unsafe { core::mem::transmute::<[v128; 2usize], [i64; 4usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4 { + let lo = self.from_bitmask_mask64x2(bits); + let hi = self.from_bitmask_mask64x2(bits >> 2usize); + self.combine_mask64x2(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask64x4(self, a: mask64x4) -> u64 { + let (lo, hi) = self.split_mask64x4(a); + let lo = self.to_bitmask_mask64x2(lo); + let hi = self.to_bitmask_mask64x2(hi); + lo | (hi << 2usize) + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); @@ -6104,6 +6202,19 @@ impl Simd for WasmSimd128 { unsafe { core::mem::transmute::<[v128; 4usize], [i8; 64usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64 { + let lo = self.from_bitmask_mask8x32(bits); + let hi = self.from_bitmask_mask8x32(bits >> 32usize); + self.combine_mask8x32(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask8x64(self, a: mask8x64) -> u64 { + let (lo, hi) = self.split_mask8x64(a); + let lo = self.to_bitmask_mask8x32(lo); + let hi = self.to_bitmask_mask8x32(hi); + lo | (hi << 32usize) + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -6821,6 +6932,19 @@ impl Simd for WasmSimd128 { unsafe { core::mem::transmute::<[v128; 4usize], [i16; 32usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32 { + let lo = self.from_bitmask_mask16x16(bits); + let hi = self.from_bitmask_mask16x16(bits >> 16usize); + self.combine_mask16x16(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask16x32(self, a: mask16x32) -> u64 { + let (lo, hi) = self.split_mask16x32(a); + let lo = self.to_bitmask_mask16x16(lo); + let hi = self.to_bitmask_mask16x16(hi); + lo | (hi << 16usize) + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -7520,6 +7644,19 @@ impl Simd for WasmSimd128 { unsafe { core::mem::transmute::<[v128; 4usize], [i32; 16usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16 { + let lo = self.from_bitmask_mask32x8(bits); + let hi = self.from_bitmask_mask32x8(bits >> 8usize); + self.combine_mask32x8(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask32x16(self, a: mask32x16) -> u64 { + let (lo, hi) = self.split_mask32x16(a); + let lo = self.to_bitmask_mask32x8(lo); + let hi = self.to_bitmask_mask32x8(hi); + lo | (hi << 8usize) + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -7933,6 +8070,19 @@ impl Simd for WasmSimd128 { unsafe { core::mem::transmute::<[v128; 4usize], [i64; 8usize]>(a.val.0) } } #[inline(always)] + fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8 { + let lo = self.from_bitmask_mask64x4(bits); + let hi = self.from_bitmask_mask64x4(bits >> 4usize); + self.combine_mask64x4(lo, hi) + } + #[inline(always)] + fn to_bitmask_mask64x8(self, a: mask64x8) -> u64 { + let (lo, hi) = self.split_mask64x8(a); + let lo = self.to_bitmask_mask64x4(lo); + let hi = self.to_bitmask_mask64x4(hi); + lo | (hi << 4usize) + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs index fd390f613..e2d2cfeef 100644 --- a/fearless_simd_gen/src/generic.rs +++ b/fearless_simd_gen/src/generic.rs @@ -187,6 +187,27 @@ pub(crate) fn generic_op(op: &Op, ty: &VecType) -> TokenStream { } } } + OpSig::MaskFromBitmask => { + let half_len = half.len; + quote! { + #method_sig { + let lo = self.#do_half(bits); + let hi = self.#do_half(bits >> #half_len); + self.#combine(lo, hi) + } + } + } + OpSig::MaskToBitmask => { + let half_len = half.len; + quote! { + #method_sig { + let (lo, hi) = self.#split(a); + let lo = self.#do_half(lo); + let hi = self.#do_half(hi); + lo | (hi << #half_len) + } + } + } OpSig::LoadInterleaved { block_size, block_count, @@ -455,3 +476,36 @@ pub(crate) fn generic_from_bytes(method_sig: TokenStream, vec_ty: &VecType) -> T } } } + +pub(crate) fn generic_mask_from_bitmask(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { + let scalar = vec_ty.scalar.rust(vec_ty.scalar_bits); + let len = vec_ty.len; + + quote! { + #method_sig { + let lanes: [#scalar; #len] = + core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 }); + lanes.simd_into(self) + } + } +} + +pub(crate) fn generic_mask_to_bitmask(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { + let as_array = generic_op_name("as_array", vec_ty); + let len = vec_ty.len; + + quote! { + #method_sig { + let lanes = self.#as_array(a); + let mut bits = 0u64; + let mut i = 0; + while i < #len { + if lanes[i] != 0 { + bits |= 1u64 << i; + } + i += 1; + } + bits + } + } +} diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs index c4800698f..61ec20303 100644 --- a/fearless_simd_gen/src/level.rs +++ b/fearless_simd_gen/src/level.rs @@ -46,6 +46,11 @@ pub(crate) trait Level { /// Generate a single operation's method on the `Simd` implementation. fn make_method(&self, op: Op, vec_ty: &VecType) -> TokenStream; + /// Determine whether an operation should defer to the generic split/combine implementation. + fn should_use_generic_op(&self, op: &Op, vec_ty: &VecType) -> bool { + op.sig.should_use_generic_op(vec_ty, self.native_width()) + } + fn token(&self) -> Ident { Ident::new(self.name(), Span::call_site()) } @@ -91,7 +96,7 @@ pub(crate) trait Level { let mut methods = vec![]; for vec_ty in SIMD_TYPES { for op in ops_for_type(vec_ty) { - if op.sig.should_use_generic_op(vec_ty, native_width) { + if self.should_use_generic_op(&op, vec_ty) { methods.push(generic_op(&op, vec_ty)); continue; } diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs index 119850e27..269ef502a 100644 --- a/fearless_simd_gen/src/mk_fallback.rs +++ b/fearless_simd_gen/src/mk_fallback.rs @@ -3,7 +3,8 @@ use crate::arch::fallback; use crate::generic::{ - generic_from_bytes, generic_op_name, generic_to_bytes, integer_lane_mask_splat_arg, + generic_from_bytes, generic_mask_from_bitmask, generic_mask_to_bitmask, generic_op_name, + generic_to_bytes, integer_lane_mask_splat_arg, }; use crate::level::Level; use crate::ops::{Op, OpSig, RefKind, valid_reinterpret}; @@ -455,6 +456,8 @@ impl Level for Fallback { } } } + OpSig::MaskFromBitmask => generic_mask_from_bitmask(method_sig, vec_ty), + OpSig::MaskToBitmask => generic_mask_to_bitmask(method_sig, vec_ty), OpSig::LoadInterleaved { block_size, block_count, diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs index 7f74fb4e0..9765c06df 100644 --- a/fearless_simd_gen/src/mk_neon.rs +++ b/fearless_simd_gen/src/mk_neon.rs @@ -530,6 +530,8 @@ impl Level for Neon { } } } + OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(method_sig, vec_ty), + OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty), OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind), OpSig::AsArray { kind } => { generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| { @@ -561,6 +563,130 @@ impl Level for Neon { } } +impl Neon { + fn handle_mask_from_bitmask(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { + assert_eq!( + vec_ty.scalar, + ScalarType::Mask, + "mask bitmask conversion only operates on masks" + ); + assert_eq!( + vec_ty.n_bits(), + self.native_width(), + "wide masks should use the generic split implementation" + ); + + match vec_ty.scalar_bits { + 8 => quote! { + #method_sig { + unsafe { + let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr()); + let lo = vshlq_u16(vdupq_n_u16(bits as u16), shifts); + let hi = vshlq_u16(vdupq_n_u16((bits >> 8) as u16), shifts); + let lo = vcltq_s16(vreinterpretq_s16_u16(lo), vdupq_n_s16(0)); + let hi = vcltq_s16(vreinterpretq_s16_u16(hi), vdupq_n_s16(0)); + vcombine_s8( + vmovn_s16(vreinterpretq_s16_u16(lo)), + vmovn_s16(vreinterpretq_s16_u16(hi)), + ).simd_into(self) + } + } + }, + 16 => quote! { + #method_sig { + unsafe { + let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr()); + let shifted = vshlq_u16(vdupq_n_u16(bits as u16), shifts); + let mask = vcltq_s16(vreinterpretq_s16_u16(shifted), vdupq_n_s16(0)); + vreinterpretq_s16_u16(mask).simd_into(self) + } + } + }, + 32 => quote! { + #method_sig { + unsafe { + let shifts = vld1q_s32([31, 30, 29, 28].as_ptr()); + let shifted = vshlq_u32(vdupq_n_u32(bits as u32), shifts); + let mask = vcltq_s32(vreinterpretq_s32_u32(shifted), vdupq_n_s32(0)); + vreinterpretq_s32_u32(mask).simd_into(self) + } + } + }, + 64 => quote! { + #method_sig { + unsafe { + let shifts = vld1q_s64([63, 62].as_ptr()); + let shifted = vshlq_u64(vdupq_n_u64(bits), shifts); + let mask = vcltq_s64(vreinterpretq_s64_u64(shifted), vdupq_n_s64(0)); + vreinterpretq_s64_u64(mask).simd_into(self) + } + } + }, + _ => unimplemented!(), + } + } + + fn handle_mask_to_bitmask(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { + assert_eq!( + vec_ty.scalar, + ScalarType::Mask, + "mask bitmask conversion only operates on masks" + ); + assert_eq!( + vec_ty.n_bits(), + self.native_width(), + "wide masks should use the generic split implementation" + ); + + match vec_ty.scalar_bits { + 8 => quote! { + #method_sig { + unsafe { + let weights = vld1q_u8([ + 1, 2, 4, 8, 16, 32, 64, 128, + 1, 2, 4, 8, 16, 32, 64, 128, + ].as_ptr()); + let bits = vandq_u8(vreinterpretq_u8_s8(a.into()), weights); + let lo = vaddv_u8(vget_low_u8(bits)) as u64; + let hi = vaddv_u8(vget_high_u8(bits)) as u64; + lo | (hi << 8) + } + } + }, + 16 => quote! { + #method_sig { + unsafe { + let weights = vld1q_u16([ + 1, 2, 4, 8, 16, 32, 64, 128, + ].as_ptr()); + let bits = vandq_u16(vreinterpretq_u16_s16(a.into()), weights); + vaddvq_u16(bits) as u64 + } + } + }, + 32 => quote! { + #method_sig { + unsafe { + let weights = vld1q_u32([1, 2, 4, 8].as_ptr()); + let bits = vandq_u32(vreinterpretq_u32_s32(a.into()), weights); + vaddvq_u32(bits) as u64 + } + } + }, + 64 => quote! { + #method_sig { + unsafe { + let weights = vld1q_u64([1, 2].as_ptr()); + let bits = vandq_u64(vreinterpretq_u64_s64(a.into()), weights); + vaddvq_u64(bits) + } + } + }, + _ => unimplemented!(), + } + } +} + fn mk_slide_helpers() -> TokenStream { let shifts = (0_usize..16).map(|shift| { let shift_i32 = i32::try_from(shift).unwrap(); diff --git a/fearless_simd_gen/src/mk_simd_trait.rs b/fearless_simd_gen/src/mk_simd_trait.rs index a973c01bd..fb118cf49 100644 --- a/fearless_simd_gen/src/mk_simd_trait.rs +++ b/fearless_simd_gen/src/mk_simd_trait.rs @@ -308,6 +308,36 @@ fn mk_simd_mask() -> TokenStream { /// Create a SIMD mask with all lanes set to the given boolean value. fn splat(simd: S, val: bool) -> Self; + /// Create a mask from a compact bitmask. + /// + /// Bit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above + /// [`Self::N`] are ignored. + fn from_bitmask(simd: S, bits: u64) -> Self; + + /// Convert this mask to a compact bitmask. + /// + /// Bit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above + /// [`Self::N`] are cleared. + fn to_bitmask(self) -> u64; + + /// Test whether one logical lane is set. + /// + /// Panics if `index` is greater than or equal to the number of lanes in the mask. + #[inline(always)] + fn test(&self, index: usize) -> bool { + assert!( + index < Self::N, + "mask lane index {index} is out of bounds for {} lanes", + Self::N + ); + (((*self).to_bitmask() >> index) & 1) != 0 + } + + /// Sets the value of one logical lane. + /// + /// Panics if `index` is greater than or equal to the number of lanes in the mask. + fn set(&mut self, index: usize, value: bool); + /// Create a SIMD mask from signed integer mask lanes. /// /// The slice must be exactly the size of the SIMD mask. diff --git a/fearless_simd_gen/src/mk_simd_types.rs b/fearless_simd_gen/src/mk_simd_types.rs index 22d484178..73885ad79 100644 --- a/fearless_simd_gen/src/mk_simd_types.rs +++ b/fearless_simd_gen/src/mk_simd_types.rs @@ -296,6 +296,8 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream { let scalar = ty.scalar.rust(ty.scalar_bits); let len = Literal::usize_unsuffixed(ty.len); let splat = generic_op_name("splat", ty); + let from_bitmask_op = generic_op_name("from_bitmask", ty); + let to_bitmask_op = generic_op_name("to_bitmask", ty); let from_array_op = generic_op_name("load_array", ty); let as_array_op = generic_op_name("as_array", ty); let mut methods = vec![]; @@ -320,6 +322,9 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream { } } + // Current backends store masks as signed integer lanes, so `set` uses a generic + // spill/update/reload path. Future compact predicate backends such as AVX-512 can + // switch this implementation to `to_bitmask`/`from_bitmask`. quote! { impl crate::SimdMask for #name { type Element = #scalar; @@ -335,6 +340,28 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream { simd.#splat(val) } + #[inline(always)] + fn from_bitmask(simd: S, bits: u64) -> Self { + simd.#from_bitmask_op(bits) + } + + #[inline(always)] + fn to_bitmask(self) -> u64 { + self.simd.#to_bitmask_op(self) + } + + #[inline(always)] + fn set(&mut self, index: usize, value: bool) { + assert!( + index < #len, + "mask lane index {index} is out of bounds for {} lanes", + #len + ); + let mut lanes = self.simd.#as_array_op(*self); + lanes[index] = if value { !0 } else { 0 }; + *self = self.simd.#from_array_op(lanes); + } + #[inline(always)] fn from_slice(simd: S, slice: &[#scalar]) -> Self { let slice: &[#scalar; #len] = slice.try_into().unwrap(); diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index eb7a3e333..200e30ee5 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -21,6 +21,77 @@ use crate::{ #[derive(Clone, Copy)] pub(crate) struct WasmSimd128; +fn mask_from_bitmask(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { + assert_eq!( + vec_ty.scalar, + ScalarType::Mask, + "mask bitmask conversion only operates on masks" + ); + assert_eq!( + vec_ty.n_bits(), + 128, + "WASM SIMD mask bitmask lowering only handles one native vector" + ); + + let expr = match vec_ty.scalar_bits { + 8 => quote! { + let lo = i8x16_splat(bits as i8); + let hi = i8x16_splat((bits >> 8) as i8); + let bytes = + u8x16_shuffle::<0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16, 16>(lo, hi); + let powers = u8x16(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); + let selected = v128_and(bytes, powers); + i8x16_ne(selected, i8x16_splat(0)).simd_into(self) + }, + 16 => quote! { + let bitset = i16x8_splat(bits as i16); + let powers = u16x8(1, 2, 4, 8, 16, 32, 64, 128); + let selected = v128_and(bitset, powers); + i16x8_ne(selected, i16x8_splat(0)).simd_into(self) + }, + 32 => quote! { + let bitset = i32x4_splat(bits as i32); + let powers = u32x4(1, 2, 4, 8); + let selected = v128_and(bitset, powers); + i32x4_ne(selected, i32x4_splat(0)).simd_into(self) + }, + 64 => quote! { + let bitset = i64x2_splat(bits as i64); + let powers = u64x2(1, 2); + let selected = v128_and(bitset, powers); + i64x2_ne(selected, i64x2_splat(0)).simd_into(self) + }, + _ => unreachable!("WASM only supports mask lane widths of 8, 16, 32, and 64 bits"), + }; + + quote! { + #method_sig { + #expr + } + } +} + +fn mask_to_bitmask(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { + assert_eq!( + vec_ty.scalar, + ScalarType::Mask, + "mask bitmask conversion only operates on masks" + ); + assert_eq!( + vec_ty.n_bits(), + 128, + "WASM SIMD mask bitmask lowering only handles one native vector" + ); + + let intrinsic = format_ident!("i{}x{}_bitmask", vec_ty.scalar_bits, vec_ty.len); + + quote! { + #method_sig { + #intrinsic(a.into()) as u64 + } + } +} + impl Level for WasmSimd128 { fn name(&self) -> &'static str { "WasmSimd128" @@ -512,6 +583,8 @@ impl Level for WasmSimd128 { } } } + OpSig::MaskFromBitmask => mask_from_bitmask(method_sig, vec_ty), + OpSig::MaskToBitmask => mask_to_bitmask(method_sig, vec_ty), OpSig::LoadInterleaved { block_size, block_count, diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 7b139bdde..fa4639f0a 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -14,7 +14,7 @@ use crate::generic::{ use crate::level::Level; use crate::ops::{Op, OpSig, Quantifier, SlideGranularity, valid_reinterpret}; use crate::types::{ScalarType, VecType}; -use proc_macro2::{Ident, Span, TokenStream}; +use proc_macro2::{Ident, Literal, Span, TokenStream}; use quote::{ToTokens as _, format_ident, quote}; #[derive(Clone, Copy, PartialEq, Eq)] @@ -168,6 +168,19 @@ impl Level for X86 { } } + fn should_use_generic_op(&self, op: &Op, vec_ty: &VecType) -> bool { + let should_use_generic = op.sig.should_use_generic_op(vec_ty, self.native_width()); + if !should_use_generic { + return false; + } + + match op.sig { + OpSig::MaskFromBitmask => !self.has_specialized_mask_from_bitmask(vec_ty), + OpSig::MaskToBitmask => !self.has_specialized_mask_to_bitmask(vec_ty), + _ => true, + } + } + fn make_method(&self, op: Op, vec_ty: &VecType) -> TokenStream { let Op { sig, method, .. } = op; let method_sig = op.simd_trait_method_sig(vec_ty); @@ -201,6 +214,8 @@ impl Level for X86 { quantifier, condition, } => self.handle_mask_reduce(method_sig, vec_ty, quantifier, condition), + OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(method_sig, vec_ty), + OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty), OpSig::LoadInterleaved { block_size, block_count, @@ -224,6 +239,360 @@ impl Level for X86 { } } +fn mask_from_bitmask_bytes(vec_ty: &VecType) -> TokenStream { + let lane_count = vec_ty.len; + let bit_mask_128 = mask_bit_pattern_128(); + + if lane_count <= 8 { + return quote! { + { + let bit_bytes = _mm_set1_epi8(bits as i8); + let bit_mask = #bit_mask_128; + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + } + }; + } + + if lane_count <= 16 { + let shuffle = mask_byte_shuffle_128(lane_count); + return quote! { + { + let bit_bytes = _mm_cvtsi32_si128(bits as i32); + let bit_bytes = _mm_shuffle_epi8(bit_bytes, #shuffle); + let bit_mask = #bit_mask_128; + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + } + }; + } + + assert_eq!( + (vec_ty.n_bits(), vec_ty.scalar_bits, lane_count), + (256, 8, 32), + "only 32-lane masks need a 256-bit inverse movemask" + ); + + let shuffle = mask_byte_shuffle_256(); + let bit_mask = mask_bit_pattern_256(); + quote! { + { + let bit_bytes = _mm256_broadcastsi128_si256(_mm_cvtsi32_si128(bits as i32)); + let bit_bytes = _mm256_shuffle_epi8(bit_bytes, #shuffle); + let bit_mask = #bit_mask; + _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) + } + } +} + +fn mask_from_bitmask_lanes(vec_ty: &VecType) -> TokenStream { + let lane_count = vec_ty.len; + let scalar_bits = vec_ty.scalar_bits; + + match (vec_ty.n_bits(), scalar_bits) { + (128, 16) => { + let lanes = (0..lane_count).map(|i| { + let bit = 1_u16 << i; + signed_literal(bit.into(), 16) + }); + quote! { + { + let bit_lanes = _mm_set1_epi16(bits as i16); + let bit_mask = _mm_setr_epi16(#(#lanes),*); + _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + } + } + (256, 16) => { + let lanes = (0..lane_count).map(|i| { + let bit = 1_u16 << i; + signed_literal(bit.into(), 16) + }); + quote! { + { + let bit_lanes = _mm256_set1_epi16(bits as i16); + let bit_mask = _mm256_setr_epi16(#(#lanes),*); + _mm256_cmpeq_epi16(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + } + } + } + (128, 32) => { + let lanes = (0..lane_count).map(|i| { + let bit = 1_u32 << i; + signed_literal(bit.into(), 32) + }); + quote! { + { + let bit_lanes = _mm_set1_epi32(bits as i32); + let bit_mask = _mm_setr_epi32(#(#lanes),*); + _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + } + } + (256, 32) => { + let lanes = (0..lane_count).map(|i| { + let bit = 1_u32 << i; + signed_literal(bit.into(), 32) + }); + quote! { + { + let bit_lanes = _mm256_set1_epi32(bits as i32); + let bit_mask = _mm256_setr_epi32(#(#lanes),*); + _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + } + } + } + (128, 64) => { + assert_eq!(lane_count, 2, "128-bit 64-bit masks must have two lanes"); + quote! { + { + let bit_lanes = _mm_set1_epi64x(bits.cast_signed()); + let bit_mask = _mm_set_epi64x(2, 1); + _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask) + } + } + } + (256, 64) => { + assert_eq!(lane_count, 4, "256-bit 64-bit masks must have four lanes"); + quote! { + { + let bit_lanes = _mm256_set1_epi64x(bits.cast_signed()); + let bit_mask = _mm256_set_epi64x(8, 4, 2, 1); + _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + } + } + } + _ => unimplemented!(), + } +} + +fn mask_from_bitmask_wide_avx2(vec_ty: &VecType) -> TokenStream { + assert_eq!( + vec_ty.n_bits(), + 512, + "only 512-bit masks use direct wide AVX2 bitmask lowering" + ); + assert!( + matches!(vec_ty.scalar_bits, 32 | 64), + "only 32-bit and 64-bit AVX2 masks use direct wide lowering" + ); + + let ty = vec_ty.rust(); + let lanes_per_chunk = 256 / vec_ty.scalar_bits; + let chunks = (0..2).map(|chunk| { + let chunk_start = chunk * lanes_per_chunk; + match vec_ty.scalar_bits { + 32 => { + let lanes = (0..lanes_per_chunk).map(|i| { + let bit = 1_u32 << (chunk_start + i); + signed_literal(bit.into(), 32) + }); + quote! { + { + let bit_mask = _mm256_setr_epi32(#(#lanes),*); + _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + } + } + } + 64 => { + let lanes = (0..lanes_per_chunk).rev().map(|i| { + let bit = 1_u64 << (chunk_start + i); + signed_literal(bit, 64) + }); + quote! { + { + let bit_mask = _mm256_set_epi64x(#(#lanes),*); + _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask) + } + } + } + _ => unreachable!(), + } + }); + let set1 = match vec_ty.scalar_bits { + 32 => quote! { _mm256_set1_epi32(bits as i32) }, + 64 => quote! { _mm256_set1_epi64x(bits.cast_signed()) }, + _ => unreachable!(), + }; + + quote! { + { + let bit_lanes = #set1; + #ty { + val: crate::support::Aligned512([#(#chunks),*]), + simd: self, + } + } + } +} + +fn mask_from_bitmask_wide_bytes(native_width: usize, vec_ty: &VecType) -> TokenStream { + assert_eq!( + vec_ty.n_bits(), + 512, + "only 512-bit masks use direct wide byte-mask lowering" + ); + assert_eq!( + vec_ty.scalar_bits, 8, + "only mask8x64 uses direct wide byte-mask lowering" + ); + + let ty = vec_ty.rust(); + match native_width { + 128 => { + let bit_mask = mask_bit_pattern_128(); + let chunks = (0..4).map(|chunk| { + let shuffle = mask_byte_shuffle_128_offset(16, chunk * 2); + quote! { + { + let bit_bytes = _mm_shuffle_epi8(bit_bytes, #shuffle); + _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask) + } + } + }); + + quote! { + { + let bit_bytes = _mm_set1_epi64x(bits.cast_signed()); + let bit_mask = #bit_mask; + #ty { + val: crate::support::Aligned512([#(#chunks),*]), + simd: self, + } + } + } + } + 256 => { + let bit_mask = mask_bit_pattern_256(); + let chunks = (0..2).map(|chunk| { + let shuffle = mask_byte_shuffle_256_offset(chunk * 4); + quote! { + { + let bit_bytes = _mm256_shuffle_epi8(bit_bytes, #shuffle); + _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask) + } + } + }); + + quote! { + { + let bit_bytes = _mm256_set1_epi64x(bits.cast_signed()); + let bit_mask = #bit_mask; + #ty { + val: crate::support::Aligned512([#(#chunks),*]), + simd: self, + } + } + } + } + _ => unreachable!(), + } +} + +fn mask_to_bitmask_words(native_width: usize, vec_ty: &VecType) -> TokenStream { + assert_eq!( + vec_ty.scalar_bits, 16, + "only 16-bit masks use word packing to produce bitmasks" + ); + + match (native_width, vec_ty.n_bits()) { + (128 | 256, 128) => quote! { + { + let packed = _mm_packs_epi16(a.into(), a.into()); + _mm_movemask_epi8(packed) as u8 as u64 + } + }, + (128, 256) => quote! { + { + let packed = _mm_packs_epi16(a.val.0[0], a.val.0[1]); + _mm_movemask_epi8(packed) as u32 as u64 + } + }, + (128, 512) => quote! { + { + let lo = _mm_packs_epi16(a.val.0[0], a.val.0[1]); + let hi = _mm_packs_epi16(a.val.0[2], a.val.0[3]); + let lo = _mm_movemask_epi8(lo) as u32 as u64; + let hi = _mm_movemask_epi8(hi) as u32 as u64; + lo | (hi << 16usize) + } + }, + (256, 256) => quote! { + { + let halves: [__m128i; 2usize] = core::mem::transmute(a.val.0); + let packed = _mm_packs_epi16(halves[0], halves[1]); + _mm_movemask_epi8(packed) as u32 as u64 + } + }, + (256, 512) => quote! { + { + let lo = _mm256_movemask_epi8(a.val.0[0]) as u32; + let hi = _mm256_movemask_epi8(a.val.0[1]) as u32; + let lo = _pext_u32(lo, 0x5555_5555u32) as u64; + let hi = _pext_u32(hi, 0x5555_5555u32) as u64; + lo | (hi << 16usize) + } + }, + _ => unimplemented!(), + } +} + +fn mask_bit_pattern_128() -> TokenStream { + let lanes = (0..16).map(|i| { + let bit = 1_u8 << (i % 8); + signed_literal(bit.into(), 8) + }); + quote! { _mm_setr_epi8(#(#lanes),*) } +} + +fn mask_bit_pattern_256() -> TokenStream { + let lanes = (0..32).map(|i| { + let bit = 1_u8 << (i % 8); + signed_literal(bit.into(), 8) + }); + quote! { _mm256_setr_epi8(#(#lanes),*) } +} + +fn mask_byte_shuffle_128_offset(lane_count: usize, byte_offset: usize) -> TokenStream { + let lanes = (0..16).map(|i| { + let byte = u8::try_from(byte_offset + i.min(lane_count - 1) / 8) + .expect("SSE byte shuffle index must fit in u8"); + signed_literal(byte.into(), 8) + }); + quote! { _mm_setr_epi8(#(#lanes),*) } +} + +fn mask_byte_shuffle_128(lane_count: usize) -> TokenStream { + mask_byte_shuffle_128_offset(lane_count, 0) +} + +fn mask_byte_shuffle_256_offset(byte_offset: usize) -> TokenStream { + let lanes = (0..32).map(|i| { + let byte = + u8::try_from(byte_offset + i / 8).expect("AVX2 byte shuffle index must fit in u8"); + signed_literal(byte.into(), 8) + }); + quote! { _mm256_setr_epi8(#(#lanes),*) } +} + +fn mask_byte_shuffle_256() -> TokenStream { + mask_byte_shuffle_256_offset(0) +} + +fn signed_literal(value: u64, bits: u32) -> TokenStream { + assert!( + bits <= 64, + "signed literal width must fit in a primitive integer" + ); + let shift = 64 - bits; + let value = (value << shift).cast_signed() >> shift; + if value < 0 { + let magnitude = Literal::u64_unsuffixed(value.unsigned_abs()); + quote! { -#magnitude } + } else { + let value = Literal::u64_unsuffixed(value as u64); + quote! { #value } + } +} + impl X86 { pub(crate) fn handle_splat(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream { let intrinsic = set1_intrinsic(vec_ty); @@ -242,6 +611,138 @@ impl X86 { } } + fn has_specialized_mask_from_bitmask(&self, vec_ty: &VecType) -> bool { + self.has_wide_byte_mask_from_bitmask(vec_ty) || self.has_wide_avx2_mask_from_bitmask(vec_ty) + } + + fn has_wide_byte_mask_from_bitmask(&self, vec_ty: &VecType) -> bool { + // 512-bit byte masks can be constructed directly from one broadcast, avoiding the + // shift-and-rebroadcast shape from generic split/combine. + vec_ty.scalar == ScalarType::Mask && vec_ty.n_bits() == 512 && vec_ty.scalar_bits == 8 + } + + fn has_wide_avx2_mask_from_bitmask(&self, vec_ty: &VecType) -> bool { + // AVX2 can construct these 512-bit masks directly from one broadcast, avoiding the + // split/combine shape that shifts and broadcasts each half separately. + *self == Self::Avx2 + && vec_ty.scalar == ScalarType::Mask + && vec_ty.n_bits() == 512 + && matches!(vec_ty.scalar_bits, 32 | 64) + } + + fn has_specialized_mask_to_bitmask(&self, vec_ty: &VecType) -> bool { + vec_ty.scalar == ScalarType::Mask && vec_ty.scalar_bits == 16 + } + + pub(crate) fn handle_mask_from_bitmask( + &self, + method_sig: TokenStream, + vec_ty: &VecType, + ) -> TokenStream { + assert_eq!( + vec_ty.scalar, + ScalarType::Mask, + "mask bitmask conversion only operates on masks" + ); + + if self.has_wide_byte_mask_from_bitmask(vec_ty) { + let expr = mask_from_bitmask_wide_bytes(self.native_width(), vec_ty); + return quote! { + #method_sig { + unsafe { + #expr + } + } + }; + } + + if self.has_wide_avx2_mask_from_bitmask(vec_ty) { + let expr = mask_from_bitmask_wide_avx2(vec_ty); + return quote! { + #method_sig { + unsafe { + #expr + } + } + }; + } + + let expr = match vec_ty.scalar_bits { + 8 => { + let bytes = mask_from_bitmask_bytes(vec_ty); + quote! { + #bytes.simd_into(self) + } + } + 16 | 32 | 64 => { + let lanes = mask_from_bitmask_lanes(vec_ty); + quote! { + #lanes.simd_into(self) + } + } + _ => unreachable!(), + }; + + quote! { + #method_sig { + unsafe { + #expr + } + } + } + } + + pub(crate) fn handle_mask_to_bitmask( + &self, + method_sig: TokenStream, + vec_ty: &VecType, + ) -> TokenStream { + assert_eq!( + vec_ty.scalar, + ScalarType::Mask, + "mask bitmask conversion only operates on masks" + ); + + match vec_ty.scalar_bits { + 8 => { + let bits_ty = vec_ty.reinterpret(ScalarType::Int, 8); + let movemask = simple_intrinsic("movemask", &bits_ty); + quote! { + #method_sig { + unsafe { #movemask(a.into()) as u32 as u64 } + } + } + } + 16 => { + let bits = mask_to_bitmask_words(self.native_width(), vec_ty); + quote! { + #method_sig { + unsafe { + #bits + } + } + } + } + 32 | 64 => { + let float_ty = vec_ty.cast(ScalarType::Float); + let movemask = simple_intrinsic("movemask", &float_ty); + let cast = cast_ident( + ScalarType::Mask, + ScalarType::Float, + vec_ty.scalar_bits, + vec_ty.scalar_bits, + vec_ty.n_bits(), + ); + quote! { + #method_sig { + unsafe { #movemask(#cast(a.into())) as u32 as u64 } + } + } + } + _ => unreachable!(), + } + } + pub(crate) fn handle_compare( &self, method_sig: TokenStream, diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index 521b0b5d2..79fa35c3c 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -106,6 +106,10 @@ pub(crate) enum OpSig { quantifier: Quantifier, condition: bool, }, + /// Takes a compact bitmask and returns the corresponding mask vector type. + MaskFromBitmask, + /// Takes a mask vector type and returns its compact bitmask representation. + MaskToBitmask, /// Takes an argument of an array of a certain scalar type, with the length (`block_size` * `block_count`) / [scalar /// type's byte size]. Returns a vector type of that scalar type and length. /// @@ -265,6 +269,14 @@ impl Op { let arg0 = &arg_names[0]; quote! { (self, #arg0: #ty) -> bool } } + OpSig::MaskFromBitmask => { + let arg0 = &arg_names[0]; + quote! { (self, #arg0: u64) -> #ty } + } + OpSig::MaskToBitmask => { + let arg0 = &arg_names[0]; + quote! { (self, #arg0: #ty) -> u64 } + } OpSig::Shift => { let arg0 = &arg_names[0]; let arg1 = &arg_names[1]; @@ -341,6 +353,7 @@ impl Op { OpSig::LoadInterleaved { .. } | OpSig::StoreInterleaved { .. } | OpSig::StoreArray => { return None; } + OpSig::MaskFromBitmask | OpSig::MaskToBitmask => return None, OpSig::Unary | OpSig::Cvt { .. } | OpSig::Reinterpret { .. } @@ -558,6 +571,18 @@ const MASK_REPRESENTATION_OPS: &[Op] = &[ }, "Convert a SIMD mask to signed integer mask lanes.", ), + Op::new( + "from_bitmask", + OpKind::AssociatedOnly, + OpSig::MaskFromBitmask, + "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored.", + ), + Op::new( + "to_bitmask", + OpKind::AssociatedOnly, + OpSig::MaskToBitmask, + "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared.", + ), ]; const FLOAT_OPS: &[Op] = &[ @@ -1504,12 +1529,14 @@ impl OpSig { fn simd_trait_arg_names(&self) -> &'static [&'static str] { match self { Self::Splat | Self::FromArray { .. } => &["val"], + Self::MaskFromBitmask => &["bits"], Self::Unary | Self::Split { .. } | Self::Cvt { .. } | Self::Reinterpret { .. } | Self::WidenNarrow { .. } | Self::MaskReduce { .. } + | Self::MaskToBitmask | Self::AsArray { .. } | Self::FromBytes | Self::ToBytes => &["a"], @@ -1533,6 +1560,8 @@ impl OpSig { Self::LoadInterleaved { .. } | Self::StoreInterleaved { .. } | Self::FromArray { .. } + | Self::MaskFromBitmask + | Self::MaskToBitmask | Self::FromBytes { .. } | Self::StoreArray => &[], Self::Unary @@ -1593,6 +1622,8 @@ impl OpSig { | Self::Reinterpret { .. } | Self::WidenNarrow { .. } | Self::Shift + | Self::MaskFromBitmask + | Self::MaskToBitmask | Self::LoadInterleaved { .. } | Self::StoreInterleaved { .. } | Self::FromArray { .. } diff --git a/fearless_simd_gen/src/types.rs b/fearless_simd_gen/src/types.rs index 3b20e3104..ab1d7e829 100644 --- a/fearless_simd_gen/src/types.rs +++ b/fearless_simd_gen/src/types.rs @@ -179,7 +179,7 @@ impl VecType { let scalar_bits = self.scalar_bits; format!( "A SIMD mask of {len} logical lanes corresponding to {scalar_bits}-bit vector elements.\n\n\ - The storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1).", + The storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1).", ) } else { let scalar_name = self.scalar.rust_name(self.scalar_bits); diff --git a/fearless_simd_tests/tests/harness/lm_generated.rs b/fearless_simd_tests/tests/harness/lm_generated.rs index 2db66f9ca..789a8eb99 100644 --- a/fearless_simd_tests/tests/harness/lm_generated.rs +++ b/fearless_simd_tests/tests/harness/lm_generated.rs @@ -2,5 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT mod extended_512; +mod mask_methods; mod mod_256; mod mod_512; diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs new file mode 100644 index 000000000..15963b2a3 --- /dev/null +++ b/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs @@ -0,0 +1,205 @@ +// Copyright 2026 the Fearless_SIMD Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use fearless_simd::*; +use fearless_simd_dev_macros::simd_test; + +#[simd_test] +fn mask8x16_bitmask_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + let mask = mask8x16::from_bitmask(simd, bits); + assert_eq!(mask.to_bitmask(), bits); + } +} + +#[simd_test] +fn mask16x8_bitmask_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + let mask = mask16x8::from_bitmask(simd, bits); + assert_eq!(mask.to_bitmask(), bits & 0xff); + } +} + +#[simd_test] +fn mask32x4_bitmask_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + let mask = mask32x4::from_bitmask(simd, bits); + assert_eq!(mask.to_bitmask(), bits & 0xf); + } +} + +#[simd_test] +fn mask64x2_bitmask_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + let mask = mask64x2::from_bitmask(simd, bits); + assert_eq!(mask.to_bitmask(), bits & 0x3); + } +} + +#[simd_test] +fn mask16x16_bitmask_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + let mask = mask16x16::from_bitmask(simd, bits); + assert_eq!(mask.to_bitmask(), bits); + } +} + +#[simd_test] +fn mask32x8_bitmask_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + let mask = mask32x8::from_bitmask(simd, bits); + assert_eq!(mask.to_bitmask(), bits & 0xff); + } +} + +#[simd_test] +fn mask64x4_bitmask_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + let mask = mask64x4::from_bitmask(simd, bits); + assert_eq!(mask.to_bitmask(), bits & 0xf); + } +} + +#[simd_test] +fn mask32x16_bitmask_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + let mask = mask32x16::from_bitmask(simd, bits); + assert_eq!(mask.to_bitmask(), bits); + } +} + +#[simd_test] +fn mask64x8_bitmask_roundtrip(simd: S) { + for bits in 0..=0xffff_u64 { + let mask = mask64x8::from_bitmask(simd, bits); + assert_eq!(mask.to_bitmask(), bits & 0xff); + } +} + +#[simd_test] +#[ignore] // takes too long to run on CI +fn mask8x32_bitmask_roundtrip_exhaustive(simd: S) { + for bits in 0..=0xffff_ffff_u64 { + let mask = mask8x32::from_bitmask(simd, bits); + assert_eq!(mask.to_bitmask(), bits); + } +} + +#[simd_test] +#[ignore] // takes too long to run on CI +fn mask16x32_bitmask_roundtrip_exhaustive(simd: S) { + for bits in 0..=0xffff_ffff_u64 { + let mask = mask16x32::from_bitmask(simd, bits); + assert_eq!(mask.to_bitmask(), bits); + } +} + +// selected interesting bit patterns to test always +#[simd_test] +fn mask8x32_bitmask_roundtrip(simd: S) { + let mask = mask8x32::from_bitmask(simd, 0x0000_0000); + assert_eq!(mask.to_bitmask(), 0x0000_0000); + + let mask = mask8x32::from_bitmask(simd, 0x0000_0001); + assert_eq!(mask.to_bitmask(), 0x0000_0001); + + let mask = mask8x32::from_bitmask(simd, 0x8000_0000); + assert_eq!(mask.to_bitmask(), 0x8000_0000); + + let mask = mask8x32::from_bitmask(simd, 0x0000_ffff); + assert_eq!(mask.to_bitmask(), 0x0000_ffff); + + let mask = mask8x32::from_bitmask(simd, 0xffff_0000); + assert_eq!(mask.to_bitmask(), 0xffff_0000); + + let mask = mask8x32::from_bitmask(simd, 0x5555_5555); + assert_eq!(mask.to_bitmask(), 0x5555_5555); + + let mask = mask8x32::from_bitmask(simd, 0xaaaa_aaaa); + assert_eq!(mask.to_bitmask(), 0xaaaa_aaaa); + + let mask = mask8x32::from_bitmask(simd, 0x8000_aa55); + assert_eq!(mask.to_bitmask(), 0x8000_aa55); + + let mask = mask8x32::from_bitmask(simd, 0xffff_ffff); + assert_eq!(mask.to_bitmask(), 0xffff_ffff); + + let mask = mask8x32::from_bitmask(simd, 0xffff_ffff_0000_0000); + assert_eq!(mask.to_bitmask(), 0x0000_0000); + + let mask = mask8x32::from_bitmask(simd, 0xffff_ffff_8000_aa55); + assert_eq!(mask.to_bitmask(), 0x8000_aa55); + + let mask = mask8x32::from_bitmask(simd, 0xffff_ffff_ffff_ffff); + assert_eq!(mask.to_bitmask(), 0xffff_ffff); +} + +// selected interesting bit patterns to test always +#[simd_test] +fn mask16x32_bitmask_roundtrip(simd: S) { + let mask = mask16x32::from_bitmask(simd, 0x0000_0000); + assert_eq!(mask.to_bitmask(), 0x0000_0000); + + let mask = mask16x32::from_bitmask(simd, 0x0000_0001); + assert_eq!(mask.to_bitmask(), 0x0000_0001); + + let mask = mask16x32::from_bitmask(simd, 0x8000_0000); + assert_eq!(mask.to_bitmask(), 0x8000_0000); + + let mask = mask16x32::from_bitmask(simd, 0x0000_ffff); + assert_eq!(mask.to_bitmask(), 0x0000_ffff); + + let mask = mask16x32::from_bitmask(simd, 0xffff_0000); + assert_eq!(mask.to_bitmask(), 0xffff_0000); + + let mask = mask16x32::from_bitmask(simd, 0x5555_5555); + assert_eq!(mask.to_bitmask(), 0x5555_5555); + + let mask = mask16x32::from_bitmask(simd, 0xaaaa_aaaa); + assert_eq!(mask.to_bitmask(), 0xaaaa_aaaa); + + let mask = mask16x32::from_bitmask(simd, 0x8000_aa55); + assert_eq!(mask.to_bitmask(), 0x8000_aa55); + + let mask = mask16x32::from_bitmask(simd, 0xffff_ffff); + assert_eq!(mask.to_bitmask(), 0xffff_ffff); + + let mask = mask16x32::from_bitmask(simd, 0xffff_ffff_0000_0000); + assert_eq!(mask.to_bitmask(), 0x0000_0000); + + let mask = mask16x32::from_bitmask(simd, 0xffff_ffff_8000_aa55); + assert_eq!(mask.to_bitmask(), 0x8000_aa55); + + let mask = mask16x32::from_bitmask(simd, 0xffff_ffff_ffff_ffff); + assert_eq!(mask.to_bitmask(), 0xffff_ffff); +} + +#[simd_test] +fn mask8x64_bitmask_roundtrip(simd: S) { + let mask = mask8x64::from_bitmask(simd, 0x0000_0000_0000_0000); + assert_eq!(mask.to_bitmask(), 0x0000_0000_0000_0000); + + let mask = mask8x64::from_bitmask(simd, 0x0000_0000_0000_0001); + assert_eq!(mask.to_bitmask(), 0x0000_0000_0000_0001); + + let mask = mask8x64::from_bitmask(simd, 0x8000_0000_0000_0000); + assert_eq!(mask.to_bitmask(), 0x8000_0000_0000_0000); + + let mask = mask8x64::from_bitmask(simd, 0x0000_0000_ffff_ffff); + assert_eq!(mask.to_bitmask(), 0x0000_0000_ffff_ffff); + + let mask = mask8x64::from_bitmask(simd, 0xffff_ffff_0000_0000); + assert_eq!(mask.to_bitmask(), 0xffff_ffff_0000_0000); + + let mask = mask8x64::from_bitmask(simd, 0x5555_5555_5555_5555); + assert_eq!(mask.to_bitmask(), 0x5555_5555_5555_5555); + + let mask = mask8x64::from_bitmask(simd, 0xaaaa_aaaa_aaaa_aaaa); + assert_eq!(mask.to_bitmask(), 0xaaaa_aaaa_aaaa_aaaa); + + let mask = mask8x64::from_bitmask(simd, 0x8000_0001_5555_aaab); + assert_eq!(mask.to_bitmask(), 0x8000_0001_5555_aaab); + + let mask = mask8x64::from_bitmask(simd, 0xffff_ffff_ffff_ffff); + assert_eq!(mask.to_bitmask(), 0xffff_ffff_ffff_ffff); +} diff --git a/fearless_simd_tests/tests/soundness.rs b/fearless_simd_tests/tests/soundness.rs index 04cd04c1c..3c78f52b2 100644 --- a/fearless_simd_tests/tests/soundness.rs +++ b/fearless_simd_tests/tests/soundness.rs @@ -63,6 +63,23 @@ macro_rules! for_each_simd_type { }; } +macro_rules! for_each_mask_type { + ($test:ident, $simd:expr) => { + $test!($simd, mask8x16, 16); + $test!($simd, mask16x8, 8); + $test!($simd, mask32x4, 4); + $test!($simd, mask64x2, 2); + $test!($simd, mask8x32, 32); + $test!($simd, mask16x16, 16); + $test!($simd, mask32x8, 8); + $test!($simd, mask64x4, 4); + $test!($simd, mask8x64, 64); + $test!($simd, mask16x32, 32); + $test!($simd, mask32x16, 16); + $test!($simd, mask64x8, 8); + }; +} + macro_rules! check_from_slice_short { ($simd:expr, $vec:ident, $len:expr) => { assert_panics(stringify!($vec::from_slice), || { @@ -82,6 +99,26 @@ macro_rules! check_store_slice_short { }}; } +macro_rules! check_mask_test_oob { + ($simd:expr, $mask:ident, $len:expr) => {{ + let mask = $mask::splat($simd, false); + + assert_panics(stringify!($mask::test), || { + let _ = mask.test($len); + }); + }}; +} + +macro_rules! check_mask_set_oob { + ($simd:expr, $mask:ident, $len:expr) => {{ + let mut mask = $mask::splat($simd, false); + + assert_panics(stringify!($mask::set), || { + mask.set($len, true); + }); + }}; +} + #[simd_test] fn from_slice_rejects_short_slice(simd: S) { for_each_simd_type!(check_from_slice_short, simd); @@ -91,3 +128,13 @@ fn from_slice_rejects_short_slice(simd: S) { fn store_slice_rejects_short_slice(simd: S) { for_each_simd_type!(check_store_slice_short, simd); } + +#[simd_test] +fn mask_test_rejects_out_of_bounds(simd: S) { + for_each_mask_type!(check_mask_test_oob, simd); +} + +#[simd_test] +fn mask_set_rejects_out_of_bounds(simd: S) { + for_each_mask_type!(check_mask_set_oob, simd); +}