 #[cfg(target_arch = "x86")]
 use core::arch::x86::{
-    __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set1_epi32, _mm256_set_epi32,
-    _mm256_srli_epi32, _mm256_sub_epi32,
+    __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_mullo_epi32,
+    _mm256_permutevar8x32_epi32, _mm256_set1_epi32, _mm256_setr_epi32, _mm256_shuffle_epi8,
+    _mm256_sub_epi32, _mm_cvtsi128_si64,
 };

 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::{
-    __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set1_epi32, _mm256_set_epi32,
-    _mm256_srli_epi32, _mm256_sub_epi32,
+    __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_mullo_epi32,
+    _mm256_permutevar8x32_epi32, _mm256_set1_epi32, _mm256_setr_epi32, _mm256_shuffle_epi8,
+    _mm256_sub_epi32, _mm_cvtsi128_si64,
 };

 use alloc::vec::Vec;
@@ -27,7 +29,7 @@ macro_rules! ycbcr_image_avx2 {
         #[target_feature(enable = "avx2")]
         fn load3(data: &[u8]) -> __m256i {
             _ = data[7 * $num_colors]; // dummy indexing operation up front to avoid bounds checks later
-            _mm256_set_epi32(
+            _mm256_setr_epi32(
                 data[0] as i32,
                 data[1 * $num_colors] as i32,
                 data[2 * $num_colors] as i32,
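The constructor swap in this hunk is what lets the store path below drop its `reverse()` call: `_mm256_set_epi32` takes its arguments from the highest lane down, so `data[0]` used to land in lane 7. A minimal sketch of the relationship (illustration only, not part of the change; the function name is hypothetical):

```rust
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{_mm256_set_epi32, _mm256_setr_epi32};

// `setr` ("set reversed") is `set` with the argument order flipped,
// so element 0 ends up in the lowest lane.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
fn setr_matches_reversed_set() {
    let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    let b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    let (a, b): ([i32; 8], [i32; 8]) =
        unsafe { (core::mem::transmute(a), core::mem::transmute(b)) };
    assert_eq!(a, b); // both come out as [0, 1, 2, 3, 4, 5, 6, 7]
}
```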
@@ -41,13 +43,22 @@ macro_rules! ycbcr_image_avx2 {

         #[inline]
         #[target_feature(enable = "avx2")]
-        fn avx_as_i32_array(data: __m256i) -> [i32; 8] {
-            // Safety preconditions. Optimized away in release mode, no runtime cost.
-            assert!(core::mem::size_of::<__m256i>() == core::mem::size_of::<[i32; 8]>());
-            assert!(core::mem::align_of::<__m256i>() >= core::mem::align_of::<[i32; 8]>());
-            // SAFETY: size and alignment preconditions checked above.
-            // Both types are plain old data: no pointers, lifetimes, etc.
-            unsafe { core::mem::transmute(data) }
+        fn avx_as_u8_array(data: __m256i) -> [u8; 8] {
+            // Select the third byte of every i32 and pack those bytes into
+            // the low i32 of each 128-bit lane: idx = (4 * i) + 2
+            let u8_shuffle_mask = _mm256_set1_epi32(i32::from_le_bytes([2, 6, 10, 14]));
+            // Move the low i32 of the high 128-bit lane into the second i32 slot
+            let u32_permute_mask = _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0);
+
+            // ABCD EFGH IJKL MNOP becomes CGKO CGKO CGKO CGKO, in both lanes
+            let a = _mm256_shuffle_epi8(data, u8_shuffle_mask);
+            // A... B... becomes AB.. ....
+            let a = _mm256_permutevar8x32_epi32(a, u32_permute_mask);
+
+            // No-op cast to __m128i, then extract the low 64 bits
+            let out = _mm256_castsi256_si128(a);
+            _mm_cvtsi128_si64(out).to_le_bytes()
         }

         let [y_buffer, cb_buffer, cr_buffer, _] = buffers;
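The new helper keeps the narrowing step entirely in vector registers. Each 16.16 fixed-point result carries the byte we want at little-endian byte index 2 (bits 16..24), so extracting that byte computes the same value as the old shift-by-16-then-truncate sequence, without the round trip through a scalar array. A scalar model of what the two shuffles produce (illustration only; `scalar_model` is a hypothetical name):

```rust
// Scalar model of avx_as_u8_array: byte 2 of a little-endian i32 is
// bits 16..24, i.e. the same value as `(x >> 16) as u8`, for any x.
fn scalar_model(lanes: [i32; 8]) -> [u8; 8] {
    lanes.map(|x| x.to_le_bytes()[2])
}
```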
@@ -82,11 +93,7 @@ macro_rules! ycbcr_image_avx2 {

             let y = _mm256_add_epi32(_mm256_add_epi32(yr, yg), yb);
             let y = _mm256_add_epi32(y, _mm256_set1_epi32(0x7FFF));
-            let y = _mm256_srli_epi32(y, 16);
-            let y: [i32; 8] = avx_as_i32_array(y);
-            let mut y: [u8; 8] = y.map(|x| x as u8);
-            y.reverse();
-            y_buffer.extend_from_slice(&y);
+            y_buffer.extend(avx_as_u8_array(y));

             let cbr = _mm256_mullo_epi32(cbmulr, r);
             let cbg = _mm256_mullo_epi32(cbmulg, g);
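The `0x7FFF` added just before the store is the rounding bias for the 16.16 fixed-point sum: adding it and then dropping the low 16 bits rounds to nearest, with exact halves rounding down (the bias is one less than 0x8000). A worked sketch under those assumptions (`round_16_16` is a hypothetical name):

```rust
// Round a 16.16 fixed-point value to the nearest integer byte.
fn round_16_16(x: i32) -> u8 {
    ((x + 0x7FFF) >> 16) as u8
}

fn main() {
    // 2.4 rounds down, 2.6 rounds up:
    assert_eq!(round_16_16((2 << 16) + 26214), 2); // 0.4 * 65536 ≈ 26214
    assert_eq!(round_16_16((2 << 16) + 39321), 3); // 0.6 * 65536 ≈ 39321
}
```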
@@ -95,11 +102,7 @@ macro_rules! ycbcr_image_avx2 {
             let cb = _mm256_add_epi32(_mm256_sub_epi32(cbr, cbg), cbb);
             let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(128 << 16));
             let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(0x7FFF));
-            let cb = _mm256_srli_epi32(cb, 16);
-            let cb: [i32; 8] = avx_as_i32_array(cb);
-            let mut cb: [u8; 8] = cb.map(|x| x as u8);
-            cb.reverse();
-            cb_buffer.extend_from_slice(&cb);
+            cb_buffer.extend(avx_as_u8_array(cb));

             let crr = _mm256_mullo_epi32(crmulr, r);
             let crg = _mm256_mullo_epi32(crmulg, g);
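The `128 << 16` term in both chroma paths is the standard YCbCr level shift, pre-scaled into 16.16 fixed point: Cb and Cr come out of the multiply-accumulate as signed values centered on zero, and the output stores them offset by +128. A sketch of the combined bias-and-round step (illustration only; `bias_and_round_chroma` is a hypothetical name):

```rust
// Add the +128 chroma level shift (pre-scaled by 2^16), then round and
// narrow exactly as in the Y path.
fn bias_and_round_chroma(chroma_fixed: i32) -> u8 {
    ((chroma_fixed + (128 << 16) + 0x7FFF) >> 16) as u8
}

fn main() {
    assert_eq!(bias_and_round_chroma(0), 128); // zero chroma maps to the midpoint
}
```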
@@ -108,11 +111,7 @@ macro_rules! ycbcr_image_avx2 {
             let cr = _mm256_sub_epi32(_mm256_sub_epi32(crr, crg), crb);
             let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(128 << 16));
             let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(0x7FFF));
-            let cr = _mm256_srli_epi32(cr, 16);
-            let cr: [i32; 8] = avx_as_i32_array(cr);
-            let mut cr: [u8; 8] = cr.map(|x| x as u8);
-            cr.reverse();
-            cr_buffer.extend_from_slice(&cr);
+            cr_buffer.extend(avx_as_u8_array(cr));
         }

         for _ in 0..self.width() % 8 {