From d221c8f9015a9edc6c97aeb211e28b9c03e5cb27 Mon Sep 17 00:00:00 2001
From: Richard Townsend
Date: Thu, 4 Sep 2025 01:02:00 +0000
Subject: [PATCH] perf: avg filter (4bpp)

Again, Cortex-A520 seems the big winner here, going from 434 MiB/s to
about 740 MiB/s (70% faster), X4 benefits less (about 13%).
---
 src/filter/mod.rs  |  5 +++++
 src/filter/simd.rs | 90 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+)

diff --git a/src/filter/mod.rs b/src/filter/mod.rs
index 8ebc0f80..c9192b4e 100644
--- a/src/filter/mod.rs
+++ b/src/filter/mod.rs
@@ -309,6 +309,11 @@ pub(crate) fn unfilter(
                 }
             }
             BytesPerPixel::Four => {
+                #[cfg(feature = "unstable")]
+                {
+                    simd::avg_unfilter_bpp4(current, previous);
+                    return;
+                }
                 let mut lprev = [0; 4];
                 for (chunk, above) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4)) {
                     let new_chunk = [
diff --git a/src/filter/simd.rs b/src/filter/simd.rs
index 374cc586..ba8b4d8d 100644
--- a/src/filter/simd.rs
+++ b/src/filter/simd.rs
@@ -314,3 +314,93 @@ pub fn paeth_unfilter_4bpp(row: &mut [u8], prev_row: &[u8]) {
         c_bpp = b_bpp.try_into().unwrap();
     }
 }
+
+/// Predictor for Avg filter: floor((left + above) / 2)
+#[inline(always)]
+fn avg_predictor_simd<const N: usize>(left: Simd<u8, N>, above: Simd<u8, N>) -> Simd<u8, N>
+where
+    LaneCount<N>: SupportedLaneCount,
+{
+    // Widen to u16 so the lane-wise sum cannot overflow, then halve.
+    ((left.cast::<u16>() + above.cast::<u16>()) >> Simd::splat(1)).cast::<u8>()
+}
+
+/// Processes a chunk of 16 pixels (64 bytes) Avg filter (bpp=4)
+#[inline(always)]
+fn process_avg_chunk_bpp4_s64(
+    mut current_a: Simd<u8, 4>, // Unfiltered left pixel from previous iteration/chunk
+    b_vec: &Simd<u8, 64>,       // Unfiltered above row chunk
+    x_out: &mut Simd<u8, 64>,   // Current row chunk (filtered -> unfiltered)
+) -> Simd<u8, 4> {
+    let x_in = *x_out;
+    let mut preds = [0u8; 64];
+
+    macro_rules! process_pixel {
+        ($shift:expr) => {
+            let pred = avg_predictor_simd(current_a, b_vec.extract::<$shift, 4>());
+            current_a = x_in.extract::<$shift, 4>() + pred;
+            preds[$shift..$shift + 4].copy_from_slice(pred.as_array());
+        };
+    }
+
+    process_pixel!(0);
+    process_pixel!(4);
+    process_pixel!(8);
+    process_pixel!(12);
+    process_pixel!(16);
+    process_pixel!(20);
+    process_pixel!(24);
+    process_pixel!(28);
+    process_pixel!(32);
+    process_pixel!(36);
+    process_pixel!(40);
+    process_pixel!(44);
+    process_pixel!(48);
+    process_pixel!(52);
+    process_pixel!(56);
+    process_pixel!(60);
+
+    *x_out += Simd::from_array(preds);
+    current_a
+}
+
+/// Unfilters a row of pixels (16 at a time) with the avg filter.
+pub fn avg_unfilter_bpp4(current: &mut [u8], previous: &[u8]) {
+    const BPP: usize = 4;
+    const STRIDE_BYTES: usize = 64; // 16 pixels * 4 bytes/pixel
+
+    let mut vlast_simd: Simd<u8, 4> = Default::default(); // Left pixel (unfiltered)
+
+    let chunks = current.len() / STRIDE_BYTES;
+
+    let (simd_current, remainder_current) = current.split_at_mut(chunks * STRIDE_BYTES);
+    let (simd_previous, remainder_prev_row) = previous.split_at(chunks * STRIDE_BYTES);
+
+    let current_iter = simd_current.chunks_exact_mut(STRIDE_BYTES);
+    let previous_iter = simd_previous.chunks_exact(STRIDE_BYTES);
+    let combined_iter = current_iter.zip(previous_iter);
+
+    for (current_chunk, previous_chunk) in combined_iter {
+        let mut x: Simd<u8, 64> = Simd::<u8, 64>::from_slice(current_chunk);
+        let b: Simd<u8, 64> = Simd::<u8, 64>::from_slice(previous_chunk);
+
+        vlast_simd = process_avg_chunk_bpp4_s64(vlast_simd, &b, &mut x);
+
+        x.copy_to_slice(current_chunk);
+    }
+
+    // Scalar remainder
+    let mut vlast_scalar = vlast_simd.to_array();
+    for (chunk, above) in remainder_current
+        .chunks_exact_mut(BPP)
+        .zip(remainder_prev_row.chunks_exact(BPP))
+    {
+        let new_chunk = [
+            chunk[0].wrapping_add(((above[0] as u16 + vlast_scalar[0] as u16) / 2) as u8),
+            chunk[1].wrapping_add(((above[1] as u16 + vlast_scalar[1] as u16) / 2) as u8),
+            chunk[2].wrapping_add(((above[2] as u16 + vlast_scalar[2] as u16) / 2) as u8),
+            chunk[3].wrapping_add(((above[3] as u16 + vlast_scalar[3] as u16) / 2) as u8),
+        ];
+        *TryInto::<&mut [u8; BPP]>::try_into(chunk).unwrap() = new_chunk;
+        vlast_scalar = new_chunk;
+    }
+}