Skip to content

Commit f75f32b

Browse files
committed
Merge branch 'main' into safe-simd-ycbcr
2 parents 324b734 + f4ab0f2 commit f75f32b

File tree

3 files changed

+62
-36
lines changed

3 files changed

+62
-36
lines changed

src/avx2/fdct.rs

Lines changed: 52 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,11 @@ pub fn fdct_avx2(data: &mut [i16; 64]) {
6464
}
6565

6666
#[target_feature(enable = "avx2")]
67-
unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
67+
fn fdct_avx2_internal(data: &mut [i16; 64]) {
68+
#[target_feature(enable = "avx2")]
6869
#[allow(non_snake_case)]
69-
#[inline(always)]
70-
unsafe fn PW_F130_F054_MF130_F054() -> __m256i {
70+
#[inline]
71+
fn PW_F130_F054_MF130_F054() -> __m256i {
7172
_mm256_set_epi16(
7273
F_0_541,
7374
F_0_541 - F_1_847,
@@ -88,9 +89,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
8889
)
8990
}
9091

92+
#[target_feature(enable = "avx2")]
9193
#[allow(non_snake_case)]
92-
#[inline(always)]
93-
unsafe fn PW_MF078_F117_F078_F117() -> __m256i {
94+
#[inline]
95+
fn PW_MF078_F117_F078_F117() -> __m256i {
9496
_mm256_set_epi16(
9597
F_1_175,
9698
F_1_175 - F_0_390,
@@ -111,9 +113,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
111113
)
112114
}
113115

116+
#[target_feature(enable = "avx2")]
114117
#[allow(non_snake_case)]
115-
#[inline(always)]
116-
unsafe fn PW_MF060_MF089_MF050_MF256() -> __m256i {
118+
#[inline]
119+
fn PW_MF060_MF089_MF050_MF256() -> __m256i {
117120
_mm256_set_epi16(
118121
-F_2_562,
119122
F_2_053 - F_2_562,
@@ -134,9 +137,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
134137
)
135138
}
136139

140+
#[target_feature(enable = "avx2")]
137141
#[allow(non_snake_case)]
138-
#[inline(always)]
139-
unsafe fn PW_F050_MF256_F060_MF089() -> __m256i {
142+
#[inline]
143+
fn PW_F050_MF256_F060_MF089() -> __m256i {
140144
_mm256_set_epi16(
141145
-F_0_899,
142146
F_1_501 - F_0_899,
@@ -157,9 +161,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
157161
)
158162
}
159163

164+
#[target_feature(enable = "avx2")]
160165
#[allow(non_snake_case)]
161-
#[inline(always)]
162-
unsafe fn PD_DESCALE_P(first_pass: bool) -> __m256i {
166+
#[inline]
167+
fn PD_DESCALE_P(first_pass: bool) -> __m256i {
163168
if first_pass {
164169
_mm256_set_epi32(
165170
1 << (DESCALE_P1 - 1),
@@ -185,9 +190,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
185190
}
186191
}
187192

193+
#[target_feature(enable = "avx2")]
188194
#[allow(non_snake_case)]
189-
#[inline(always)]
190-
unsafe fn PW_DESCALE_P2X() -> __m256i {
195+
#[inline]
196+
fn PW_DESCALE_P2X() -> __m256i {
191197
_mm256_set_epi32(
192198
1 << (PASS1_BITS - 1),
193199
1 << (PASS1_BITS - 1),
@@ -201,8 +207,9 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
201207
}
202208

203209
// In-place 8x8x16-bit matrix transpose using AVX2 instructions
204-
#[inline(always)]
205-
unsafe fn do_transpose(
210+
#[target_feature(enable = "avx2")]
211+
#[inline]
212+
fn do_transpose(
206213
i1: __m256i,
207214
i2: __m256i,
208215
i3: __m256i,
@@ -244,8 +251,9 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
244251
}
245252

246253
// In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
247-
#[inline(always)]
248-
unsafe fn do_dct(
254+
#[target_feature(enable = "avx2")]
255+
#[inline]
256+
fn do_dct(
249257
first_pass: bool,
250258
i1: __m256i,
251259
i2: __m256i,
@@ -412,12 +420,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
412420
(t1, t2, t3, t4)
413421
}
414422

415-
let in_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.as_mut_ptr());
416-
417-
let ymm4 = _mm256_loadu_si256(in_data);
418-
let ymm5 = _mm256_loadu_si256(in_data.add(1));
419-
let ymm6 = _mm256_loadu_si256(in_data.add(2));
420-
let ymm7 = _mm256_loadu_si256(in_data.add(3));
423+
let ymm4 = avx_load(&data[0..16]);
424+
let ymm5 = avx_load(&data[16..32]);
425+
let ymm6 = avx_load(&data[32..48]);
426+
let ymm7 = avx_load(&data[48..64]);
421427

422428
// ---- Pass 1: process rows.
423429
// ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
@@ -451,10 +457,28 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
451457
let ymm6 = _mm256_permute2x128_si256(ymm0, ymm4, 0x31); // ymm6=data4_5
452458
let ymm7 = _mm256_permute2x128_si256(ymm2, ymm4, 0x21); // ymm7=data6_7
453459

454-
let out_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.as_mut_ptr());
460+
avx_store(ymm3, &mut data[0..16]);
461+
avx_store(ymm5, &mut data[16..32]);
462+
avx_store(ymm6, &mut data[32..48]);
463+
avx_store(ymm7, &mut data[48..64]);
464+
}
455465

456-
_mm256_storeu_si256(out_data, ymm3);
457-
_mm256_storeu_si256(out_data.add(1), ymm5);
458-
_mm256_storeu_si256(out_data.add(2), ymm6);
459-
_mm256_storeu_si256(out_data.add(3), ymm7);
466+
/// Safe wrapper for an unaligned AVX load
467+
#[target_feature(enable = "avx2")]
468+
#[inline]
469+
fn avx_load(input: &[i16]) -> __m256i {
470+
assert!(input.len() == 16);
471+
assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>());
472+
// SAFETY: we've checked sizes above. The load is unaligned, so no alignment requirements.
473+
unsafe { _mm256_loadu_si256(input.as_ptr() as *const __m256i) }
460474
}
475+
476+
/// Safe wrapper for an unaligned AVX store
477+
#[target_feature(enable = "avx2")]
478+
#[inline]
479+
fn avx_store(input: __m256i, output: &mut [i16]) {
480+
assert!(output.len() == 16);
481+
assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>());
482+
// SAFETY: we've checked sizes above. The load is unaligned, so no alignment requirements.
483+
unsafe { _mm256_storeu_si256(output.as_mut_ptr() as *mut __m256i, input) }
484+
}

src/encoder.rs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -353,13 +353,17 @@ impl<W: JfifWrite> Encoder<W> {
353353
/// # Errors
354354
///
355355
/// Returns an error if the segment number is invalid or data exceeds the allowed size
356-
pub fn add_app_segment(&mut self, segment_nr: u8, data: &[u8]) -> Result<(), EncodingError> {
356+
pub fn add_app_segment(
357+
&mut self,
358+
segment_nr: u8,
359+
data: Vec<u8>,
360+
) -> Result<(), EncodingError> {
357361
if segment_nr == 0 || segment_nr > 15 {
358362
Err(EncodingError::InvalidAppSegment(segment_nr))
359363
} else if data.len() > 65533 {
360364
Err(EncodingError::AppSegmentTooLarge(data.len()))
361365
} else {
362-
self.app_segments.push((segment_nr, data.to_vec()));
366+
self.app_segments.push((segment_nr, data));
363367
Ok(())
364368
}
365369
}
@@ -385,16 +389,14 @@ impl<W: JfifWrite> Encoder<W> {
385389
return Err(EncodingError::IccTooLarge(data.len()));
386390
}
387391

388-
let mut chunk_data = Vec::with_capacity(MAX_CHUNK_LENGTH);
389-
390392
for (i, data) in data.chunks(MAX_CHUNK_LENGTH).enumerate() {
391-
chunk_data.clear();
393+
let mut chunk_data = Vec::with_capacity(MAX_CHUNK_LENGTH);
392394
chunk_data.extend_from_slice(MARKER);
393395
chunk_data.push(i as u8 + 1);
394396
chunk_data.push(num_chunks as u8);
395397
chunk_data.extend_from_slice(data);
396398

397-
self.add_app_segment(2, &chunk_data)?;
399+
self.add_app_segment(2, chunk_data)?;
398400
}
399401

400402
Ok(())
@@ -415,7 +417,7 @@ impl<W: JfifWrite> Encoder<W> {
415417
let mut formatted = EXIF_HEADER.to_vec();
416418
formatted.extend_from_slice(data);
417419

418-
self.add_app_segment(1, &formatted)
420+
self.add_app_segment(1, formatted)
419421
}
420422

421423
/// Encode an image

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,7 @@ mod tests {
473473
let mut result = Vec::new();
474474
let mut encoder = Encoder::new(&mut result, 100);
475475

476-
encoder.add_app_segment(15, b"HOHOHO\0").unwrap();
476+
encoder.add_app_segment(15, b"HOHOHO\0".to_vec()).unwrap();
477477

478478
encoder
479479
.encode(&data, width, height, ColorType::Rgb)

0 commit comments

Comments
 (0)