Skip to content

Commit ac6b46c

Browse files
committed
perf: avg filter (4bpp)
Again, the Cortex-A520 seems to be the big winner here, going from 415 MiB/s to about 700 MiB/s (roughly 70% faster); the X4 benefits less.
1 parent 1b94193 commit ac6b46c

File tree

1 file changed

+98
-0
lines changed

1 file changed

+98
-0
lines changed

src/filter.rs

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,99 @@ mod simd {
292292
c_bpp = b_bpp.try_into().unwrap();
293293
}
294294
}
295+
296+
/// Predictor for Avg filter: floor((left + above) / 2)
297+
#[inline(always)]
298+
fn avg_predictor_simd<const BPP: usize>(
299+
left: Simd<u8, BPP>,
300+
above: Simd<u8, BPP>,
301+
) -> Simd<u8, BPP>
302+
where
303+
LaneCount<BPP>: SupportedLaneCount,
304+
{
305+
((left.cast::<u16>() + above.cast::<u16>()) >> Simd::splat(1)).cast::<u8>()
306+
}
307+
308+
/// Processes a chunk of 16 pixels (64 bytes) Avg filter (bpp=4)
309+
#[inline(always)]
310+
fn process_avg_chunk_bpp4_s64(
311+
mut current_a: Simd<u8, 4>, // Unfiltered left pixel from previous iteration/chunk
312+
b_vec: &Simd<u8, 64>, // Unfiltered above row chunk
313+
x_out: &mut Simd<u8, 64>, // Current row chunk (filtered -> unfiltered)
314+
) -> Simd<u8, 4> {
315+
let x_in = *x_out;
316+
let mut preds = [0u8; 64];
317+
318+
macro_rules! process_pixel {
319+
($shift:expr) => {
320+
let pred = avg_predictor_simd(current_a, b_vec.extract::<$shift, 4>());
321+
current_a = x_in.extract::<$shift, 4>() + pred;
322+
preds[$shift..$shift + 4].copy_from_slice(pred.as_array());
323+
};
324+
}
325+
326+
process_pixel!(0);
327+
process_pixel!(4);
328+
process_pixel!(8);
329+
process_pixel!(12);
330+
process_pixel!(16);
331+
process_pixel!(20);
332+
process_pixel!(24);
333+
process_pixel!(28);
334+
process_pixel!(32);
335+
process_pixel!(36);
336+
process_pixel!(40);
337+
process_pixel!(44);
338+
process_pixel!(48);
339+
process_pixel!(52);
340+
process_pixel!(56);
341+
process_pixel!(60);
342+
343+
*x_out += Simd::from_array(preds);
344+
current_a
345+
}
346+
347+
/// Unfilters a row of pixels (16 at a time) with the avg filter.
348+
pub fn avg_unfilter_bpp4(current: &mut [u8], previous: &[u8]) {
349+
const BPP: usize = 4;
350+
const STRIDE_BYTES: usize = 64; // 16 pixels * 4 bytes/pixel
351+
352+
let mut vlast_simd: Simd<u8, BPP> = Default::default(); // Left pixel (unfiltered)
353+
354+
let chunks = current.len() / STRIDE_BYTES;
355+
356+
let (simd_current, remainder_current) = current.split_at_mut(chunks * STRIDE_BYTES);
357+
let (simd_previous, remainder_prev_row) = previous.split_at(chunks * STRIDE_BYTES);
358+
359+
let current_iter = simd_current.chunks_exact_mut(STRIDE_BYTES);
360+
let previous_iter = simd_previous.chunks_exact(STRIDE_BYTES);
361+
let combined_iter = current_iter.zip(previous_iter);
362+
363+
for (current_chunk, previous_chunk) in combined_iter {
364+
let mut x: Simd<u8, STRIDE_BYTES> = Simd::<u8, STRIDE_BYTES>::from_slice(current_chunk);
365+
let b: Simd<u8, STRIDE_BYTES> = Simd::<u8, STRIDE_BYTES>::from_slice(previous_chunk);
366+
367+
vlast_simd = process_avg_chunk_bpp4_s64(vlast_simd, &b, &mut x);
368+
369+
x.copy_to_slice(current_chunk);
370+
}
371+
372+
// Scalar remainder
373+
let mut vlast_scalar = vlast_simd.to_array();
374+
for (chunk, above) in remainder_current
375+
.chunks_exact_mut(BPP)
376+
.zip(remainder_prev_row.chunks_exact(BPP))
377+
{
378+
let new_chunk = [
379+
chunk[0].wrapping_add(((above[0] as u16 + vlast_scalar[0] as u16) / 2) as u8),
380+
chunk[1].wrapping_add(((above[1] as u16 + vlast_scalar[1] as u16) / 2) as u8),
381+
chunk[2].wrapping_add(((above[2] as u16 + vlast_scalar[2] as u16) / 2) as u8),
382+
chunk[3].wrapping_add(((above[3] as u16 + vlast_scalar[3] as u16) / 2) as u8),
383+
];
384+
*TryInto::<&mut [u8; BPP]>::try_into(chunk).unwrap() = new_chunk;
385+
vlast_scalar = new_chunk;
386+
}
387+
}
295388
}
296389

297390
// This code path is used on non-x86_64 architectures but we allow dead code
@@ -691,6 +784,11 @@ pub(crate) fn unfilter(
691784
}
692785
}
693786
BytesPerPixel::Four => {
787+
#[cfg(feature = "unstable")]
788+
{
789+
simd::avg_unfilter_bpp4(current, previous);
790+
return;
791+
}
694792
let mut lprev = [0; 4];
695793
for (chunk, above) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4)) {
696794
let new_chunk = [

0 commit comments

Comments
 (0)