@@ -64,10 +64,11 @@ pub fn fdct_avx2(data: &mut [i16; 64]) {
6464}
6565
6666#[ target_feature( enable = "avx2" ) ]
67- unsafe fn fdct_avx2_internal ( data : & mut [ i16 ; 64 ] ) {
67+ fn fdct_avx2_internal ( data : & mut [ i16 ; 64 ] ) {
68+ #[ target_feature( enable = "avx2" ) ]
6869 #[ allow( non_snake_case) ]
69- #[ inline( always ) ]
70- unsafe fn PW_F130_F054_MF130_F054 ( ) -> __m256i {
70+ #[ inline]
71+ fn PW_F130_F054_MF130_F054 ( ) -> __m256i {
7172 _mm256_set_epi16 (
7273 F_0_541 ,
7374 F_0_541 - F_1_847 ,
@@ -88,9 +89,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
8889 )
8990 }
9091
92+ #[ target_feature( enable = "avx2" ) ]
9193 #[ allow( non_snake_case) ]
92- #[ inline( always ) ]
93- unsafe fn PW_MF078_F117_F078_F117 ( ) -> __m256i {
94+ #[ inline]
95+ fn PW_MF078_F117_F078_F117 ( ) -> __m256i {
9496 _mm256_set_epi16 (
9597 F_1_175 ,
9698 F_1_175 - F_0_390 ,
@@ -111,9 +113,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
111113 )
112114 }
113115
116+ #[ target_feature( enable = "avx2" ) ]
114117 #[ allow( non_snake_case) ]
115- #[ inline( always ) ]
116- unsafe fn PW_MF060_MF089_MF050_MF256 ( ) -> __m256i {
118+ #[ inline]
119+ fn PW_MF060_MF089_MF050_MF256 ( ) -> __m256i {
117120 _mm256_set_epi16 (
118121 -F_2_562 ,
119122 F_2_053 - F_2_562 ,
@@ -134,9 +137,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
134137 )
135138 }
136139
140+ #[ target_feature( enable = "avx2" ) ]
137141 #[ allow( non_snake_case) ]
138- #[ inline( always ) ]
139- unsafe fn PW_F050_MF256_F060_MF089 ( ) -> __m256i {
142+ #[ inline]
143+ fn PW_F050_MF256_F060_MF089 ( ) -> __m256i {
140144 _mm256_set_epi16 (
141145 -F_0_899 ,
142146 F_1_501 - F_0_899 ,
@@ -157,9 +161,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
157161 )
158162 }
159163
164+ #[ target_feature( enable = "avx2" ) ]
160165 #[ allow( non_snake_case) ]
161- #[ inline( always ) ]
162- unsafe fn PD_DESCALE_P ( first_pass : bool ) -> __m256i {
166+ #[ inline]
167+ fn PD_DESCALE_P ( first_pass : bool ) -> __m256i {
163168 if first_pass {
164169 _mm256_set_epi32 (
165170 1 << ( DESCALE_P1 - 1 ) ,
@@ -185,9 +190,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
185190 }
186191 }
187192
193+ #[ target_feature( enable = "avx2" ) ]
188194 #[ allow( non_snake_case) ]
189- #[ inline( always ) ]
190- unsafe fn PW_DESCALE_P2X ( ) -> __m256i {
195+ #[ inline]
196+ fn PW_DESCALE_P2X ( ) -> __m256i {
191197 _mm256_set_epi32 (
192198 1 << ( PASS1_BITS - 1 ) ,
193199 1 << ( PASS1_BITS - 1 ) ,
@@ -201,8 +207,9 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
201207 }
202208
203209 // In-place 8x8x16-bit matrix transpose using AVX2 instructions
204- #[ inline( always) ]
205- unsafe fn do_transpose (
210+ #[ target_feature( enable = "avx2" ) ]
211+ #[ inline]
212+ fn do_transpose (
206213 i1 : __m256i ,
207214 i2 : __m256i ,
208215 i3 : __m256i ,
@@ -244,8 +251,9 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
244251 }
245252
246253 // In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
247- #[ inline( always) ]
248- unsafe fn do_dct (
254+ #[ target_feature( enable = "avx2" ) ]
255+ #[ inline]
256+ fn do_dct (
249257 first_pass : bool ,
250258 i1 : __m256i ,
251259 i2 : __m256i ,
@@ -412,12 +420,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
412420 ( t1, t2, t3, t4)
413421 }
414422
415- let in_data = core:: mem:: transmute :: < * mut i16 , * mut __m256i > ( data. as_mut_ptr ( ) ) ;
416-
417- let ymm4 = _mm256_loadu_si256 ( in_data) ;
418- let ymm5 = _mm256_loadu_si256 ( in_data. add ( 1 ) ) ;
419- let ymm6 = _mm256_loadu_si256 ( in_data. add ( 2 ) ) ;
420- let ymm7 = _mm256_loadu_si256 ( in_data. add ( 3 ) ) ;
423+ let ymm4 = avx_load ( & data[ 0 ..16 ] ) ;
424+ let ymm5 = avx_load ( & data[ 16 ..32 ] ) ;
425+ let ymm6 = avx_load ( & data[ 32 ..48 ] ) ;
426+ let ymm7 = avx_load ( & data[ 48 ..64 ] ) ;
421427
422428 // ---- Pass 1: process rows.
423429 // ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
@@ -451,10 +457,28 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
451457 let ymm6 = _mm256_permute2x128_si256 ( ymm0, ymm4, 0x31 ) ; // ymm6=data4_5
452458 let ymm7 = _mm256_permute2x128_si256 ( ymm2, ymm4, 0x21 ) ; // ymm7=data6_7
453459
454- let out_data = core:: mem:: transmute :: < * mut i16 , * mut __m256i > ( data. as_mut_ptr ( ) ) ;
460+ avx_store ( ymm3, & mut data[ 0 ..16 ] ) ;
461+ avx_store ( ymm5, & mut data[ 16 ..32 ] ) ;
462+ avx_store ( ymm6, & mut data[ 32 ..48 ] ) ;
463+ avx_store ( ymm7, & mut data[ 48 ..64 ] ) ;
464+ }
455465
456- _mm256_storeu_si256 ( out_data, ymm3) ;
457- _mm256_storeu_si256 ( out_data. add ( 1 ) , ymm5) ;
458- _mm256_storeu_si256 ( out_data. add ( 2 ) , ymm6) ;
459- _mm256_storeu_si256 ( out_data. add ( 3 ) , ymm7) ;
466+ /// Safe wrapper for an unaligned AVX load
467+ #[ target_feature( enable = "avx2" ) ]
468+ #[ inline]
469+ fn avx_load ( input : & [ i16 ] ) -> __m256i {
470+ assert ! ( input. len( ) == 16 ) ;
471+ assert ! ( core:: mem:: size_of:: <[ i16 ; 16 ] >( ) == core:: mem:: size_of:: <__m256i>( ) ) ;
472+ // SAFETY: we've checked sizes above. The load is unaligned, so no alignment requirements.
473+ unsafe { _mm256_loadu_si256 ( input. as_ptr ( ) as * const __m256i ) }
460474}
475+
476+ /// Safe wrapper for an unaligned AVX store
477+ #[ target_feature( enable = "avx2" ) ]
478+ #[ inline]
479+ fn avx_store ( input : __m256i , output : & mut [ i16 ] ) {
480+ assert ! ( output. len( ) == 16 ) ;
481+ assert ! ( core:: mem:: size_of:: <[ i16 ; 16 ] >( ) == core:: mem:: size_of:: <__m256i>( ) ) ;
482+ // SAFETY: we've checked sizes above. The load is unaligned, so no alignment requirements.
483+ unsafe { _mm256_storeu_si256 ( output. as_mut_ptr ( ) as * mut __m256i , input) }
484+ }
0 commit comments