
Commit b972b53

Explicitly vectorize i32x8 to u8x8 conversion for storing into YCbCr
Remove unsafe transmute in AVX2 YCbCr conversion

Prior to this, the conversion was scalarized and used a bswap. Rewriting the code to avoid reversing the array resulted in worse codegen that extracted the bytes one at a time and manually re-inserted them into a SIMD register just to store 8 bytes at once. This commit instead narrows the eight i32 results to bytes explicitly with _mm256_shuffle_epi8 and _mm256_permutevar8x32_epi32.
1 parent 057934e commit b972b53
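
For context, each output byte in this file is computed in 16-bit fixed point: multiply, accumulate, add 0x7FFF to round, and keep bits 16..24 of the i32 sum. A scalar model of one Y lane (a minimal sketch; the coefficient values and the name y_scalar are illustrative assumptions, not taken from this crate):

fn y_scalar(r: i32, g: i32, b: i32) -> u8 {
    // Assumed 16-bit fixed-point luma weights for 0.299 / 0.587 / 0.114.
    const YR: i32 = 19595; // 0.299 * 65536, rounded
    const YG: i32 = 38470; // 0.587 * 65536, rounded
    const YB: i32 = 7471; // 0.114 * 65536, rounded
    let y = YR * r + YG * g + YB * b + 0x7FFF; // +0x7FFF rounds the upcoming >>16
    // Byte 2 of the little-endian i32 is exactly (y >> 16) as u8; this is the
    // byte the new shuffle mask [2, 6, 10, 14] selects in each SIMD word.
    (y >> 16) as u8
}

The diff below vectorizes eight of these at a time and replaces the shift-and-truncate tail with byte shuffles.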

File tree

1 file changed: +26 -27 lines changed


src/avx2/ycbcr.rs

Lines changed: 26 additions & 27 deletions
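One detail behind the load3 change: Intel's _mm256_set_epi32 takes its arguments highest lane first, while _mm256_setr_epi32 ("r" for reversed) takes them lowest lane first, so switching to setr keeps the pixels in memory order and makes the old reverse()/bswap unnecessary. A minimal illustration (an x86_64-only sketch; the variable names are ours):

#[cfg(target_arch = "x86_64")]
fn main() {
    use core::arch::x86_64::{_mm256_set_epi32, _mm256_setr_epi32};
    // Both calls build the identical register: lane 0 = 0, ..., lane 7 = 7.
    // set_epi32 lists lanes high-to-low; setr_epi32 lists them low-to-high.
    let _hi_first = unsafe { _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0) };
    let _lo_first = unsafe { _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7) };
}
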
@@ -1,13 +1,15 @@
 #[cfg(target_arch = "x86")]
 use core::arch::x86::{
-    __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set1_epi32, _mm256_set_epi32,
-    _mm256_srli_epi32, _mm256_sub_epi32,
+    __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_mullo_epi32,
+    _mm256_permutevar8x32_epi32, _mm256_set1_epi32, _mm256_setr_epi32, _mm256_shuffle_epi8,
+    _mm256_sub_epi32, _mm_cvtsi128_si64,
 };
 
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::{
-    __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set1_epi32, _mm256_set_epi32,
-    _mm256_srli_epi32, _mm256_sub_epi32,
+    __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_mullo_epi32,
+    _mm256_permutevar8x32_epi32, _mm256_set1_epi32, _mm256_setr_epi32, _mm256_shuffle_epi8,
+    _mm256_sub_epi32, _mm_cvtsi128_si64,
 };
 
 use alloc::vec::Vec;
@@ -27,7 +29,7 @@ macro_rules! ycbcr_image_avx2 {
         #[target_feature(enable = "avx2")]
         fn load3(data: &[u8]) -> __m256i {
             _ = data[7 * $num_colors]; // dummy indexing operation up front to avoid bounds checks later
-            _mm256_set_epi32(
+            _mm256_setr_epi32(
                 data[0] as i32,
                 data[1 * $num_colors] as i32,
                 data[2 * $num_colors] as i32,
@@ -41,13 +43,22 @@ macro_rules! ycbcr_image_avx2 {
 
         #[inline]
         #[target_feature(enable = "avx2")]
-        fn avx_as_i32_array(data: __m256i) -> [i32; 8] {
-            // Safety preconditions. Optimized away in release mode, no runtime cost.
-            assert!(core::mem::size_of::<__m256i>() == core::mem::size_of::<[i32; 8]>());
-            assert!(core::mem::align_of::<__m256i>() >= core::mem::align_of::<[i32; 8]>());
-            // SAFETY: size and alignment preconditions checked above.
-            // Both types are plain old data: no pointers, lifetimes, etc.
-            unsafe { core::mem::transmute(data) }
+        fn avx_as_u8_array(data: __m256i) -> [u8; 8] {
+            // Select the third byte of every i32 and pack the selected bytes
+            // into the low i32 of each 128-bit lane: idx = (4 * i) + 2
+            let u8_shuffle_mask = _mm256_set1_epi32(i32::from_le_bytes([2, 6, 10, 14]));
+            // Move the lowest i32 of the high 128-bit lane
+            // into the second i32
+            let u32_permute_mask = _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0);
+
+            // ABCD EFGH IJKL MNOP becomes CGKO CGKO CGKO CGKO, for both lanes
+            let a = _mm256_shuffle_epi8(data, u8_shuffle_mask);
+            // A... B... becomes AB.. ....
+            let a = _mm256_permutevar8x32_epi32(a, u32_permute_mask);
+
+            // No-op cast to __m128i, then extract the lower 64 bits
+            let out = _mm256_castsi256_si128(a);
+            _mm_cvtsi128_si64(out).to_le_bytes()
         }
 
         let [y_buffer, cb_buffer, cr_buffer, _] = buffers;
@@ -82,11 +93,7 @@ macro_rules! ycbcr_image_avx2 {
 
             let y = _mm256_add_epi32(_mm256_add_epi32(yr, yg), yb);
             let y = _mm256_add_epi32(y, _mm256_set1_epi32(0x7FFF));
-            let y = _mm256_srli_epi32(y, 16);
-            let y: [i32; 8] = avx_as_i32_array(y);
-            let mut y: [u8; 8] = y.map(|x| x as u8);
-            y.reverse();
-            y_buffer.extend_from_slice(&y);
+            y_buffer.extend(avx_as_u8_array(y));
 
             let cbr = _mm256_mullo_epi32(cbmulr, r);
             let cbg = _mm256_mullo_epi32(cbmulg, g);
@@ -95,11 +102,7 @@ macro_rules! ycbcr_image_avx2 {
             let cb = _mm256_add_epi32(_mm256_sub_epi32(cbr, cbg), cbb);
             let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(128 << 16));
             let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(0x7FFF));
-            let cb = _mm256_srli_epi32(cb, 16);
-            let cb: [i32; 8] = avx_as_i32_array(cb);
-            let mut cb: [u8; 8] = cb.map(|x| x as u8);
-            cb.reverse();
-            cb_buffer.extend_from_slice(&cb);
+            cb_buffer.extend(avx_as_u8_array(cb));
 
             let crr = _mm256_mullo_epi32(crmulr, r);
             let crg = _mm256_mullo_epi32(crmulg, g);
@@ -108,11 +111,7 @@ macro_rules! ycbcr_image_avx2 {
             let cr = _mm256_sub_epi32(_mm256_sub_epi32(crr, crg), crb);
             let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(128 << 16));
             let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(0x7FFF));
-            let cr = _mm256_srli_epi32(cr, 16);
-            let cr: [i32; 8] = avx_as_i32_array(cr);
-            let mut cr: [u8; 8] = cr.map(|x| x as u8);
-            cr.reverse();
-            cr_buffer.extend_from_slice(&cr);
+            cr_buffer.extend(avx_as_u8_array(cr));
         }
 
         for _ in 0..self.width() % 8 {
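
To sanity-check the new packing outside the crate, the same shuffle/permute sequence can be run against a scalar (x >> 16) as u8 reference (a self-contained sketch for x86_64; pack_third_bytes is a hypothetical name re-declaring the diff's avx_as_u8_array logic):

use core::arch::x86_64::{
    __m256i, _mm256_castsi256_si128, _mm256_loadu_si256, _mm256_permutevar8x32_epi32,
    _mm256_set1_epi32, _mm256_setr_epi32, _mm256_shuffle_epi8, _mm_cvtsi128_si64,
};

#[target_feature(enable = "avx2")]
unsafe fn pack_third_bytes(data: __m256i) -> [u8; 8] {
    // Same sequence as avx_as_u8_array in the diff above.
    let u8_shuffle_mask = _mm256_set1_epi32(i32::from_le_bytes([2, 6, 10, 14]));
    let u32_permute_mask = _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0);
    let a = _mm256_shuffle_epi8(data, u8_shuffle_mask);
    let a = _mm256_permutevar8x32_epi32(a, u32_permute_mask);
    _mm_cvtsi128_si64(_mm256_castsi256_si128(a)).to_le_bytes()
}

fn main() {
    let words: [i32; 8] = [0x0001_2345, 0x00FE_DCBA, -1, 0, 0x007F_FFFF, 42 << 16, 7, i32::MAX];
    if is_x86_feature_detected!("avx2") {
        // SAFETY: AVX2 availability checked above; loadu has no alignment requirement.
        let packed = unsafe { pack_third_bytes(_mm256_loadu_si256(words.as_ptr().cast())) };
        let reference: Vec<u8> = words.iter().map(|&w| (w >> 16) as u8).collect();
        assert_eq!(&packed[..], &reference[..]);
        println!("vectorized packing matches scalar >>16 truncation");
    }
}

Because byte 2 of a little-endian i32 is bits 16..23, selecting it reproduces the removed _mm256_srli_epi32(x, 16) plus u8 truncation in a single shuffle.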
