perf: avg filter (4bpp)

Sentimentron · Sentimentron · commit ac6b46cd342d · 2025-09-04T01:02:00.000Z
Again, Cortex-A520 seems the big winner here, going from 415 MiB/s to
about 700 MiB/s (70% faster), X4 benefits less.
diff --git a/src/filter.rs b/src/filter.rs
@@ -292,6 +292,99 @@ mod simd {
             c_bpp = b_bpp.try_into().unwrap();
         }
     }
+
+    /// Predictor for Avg filter: floor((left + above) / 2)
+    #[inline(always)]
+    fn avg_predictor_simd<const BPP: usize>(
+        left: Simd<u8, BPP>,
+        above: Simd<u8, BPP>,
+    ) -> Simd<u8, BPP>
+    where
+        LaneCount<BPP>: SupportedLaneCount,
+    {
+        ((left.cast::<u16>() + above.cast::<u16>()) >> Simd::splat(1)).cast::<u8>()
+    }
+
+    /// Processes a chunk of 16 pixels (64 bytes) Avg filter (bpp=4)
+    #[inline(always)]
+    fn process_avg_chunk_bpp4_s64(
+        mut current_a: Simd<u8, 4>, // Unfiltered left pixel from previous iteration/chunk
+        b_vec: &Simd<u8, 64>,       // Unfiltered above row chunk
+        x_out: &mut Simd<u8, 64>,   // Current row chunk (filtered -> unfiltered)
+    ) -> Simd<u8, 4> {
+        let x_in = *x_out;
+        let mut preds = [0u8; 64];
+
+        macro_rules! process_pixel {
+            ($shift:expr) => {
+                let pred = avg_predictor_simd(current_a, b_vec.extract::<$shift, 4>());
+                current_a = x_in.extract::<$shift, 4>() + pred;
+                preds[$shift..$shift + 4].copy_from_slice(pred.as_array());
+            };
+        }
+
+        process_pixel!(0);
+        process_pixel!(4);
+        process_pixel!(8);
+        process_pixel!(12);
+        process_pixel!(16);
+        process_pixel!(20);
+        process_pixel!(24);
+        process_pixel!(28);
+        process_pixel!(32);
+        process_pixel!(36);
+        process_pixel!(40);
+        process_pixel!(44);
+        process_pixel!(48);
+        process_pixel!(52);
+        process_pixel!(56);
+        process_pixel!(60);
+
+        *x_out += Simd::from_array(preds);
+        current_a
+    }
+
+    /// Unfilters a row of pixels (16 at a time) with the avg filter.
+    pub fn avg_unfilter_bpp4(current: &mut [u8], previous: &[u8]) {
+        const BPP: usize = 4;
+        const STRIDE_BYTES: usize = 64; // 16 pixels * 4 bytes/pixel
+
+        let mut vlast_simd: Simd<u8, BPP> = Default::default(); // Left pixel (unfiltered)
+
+        let chunks = current.len() / STRIDE_BYTES;
+
+        let (simd_current, remainder_current) = current.split_at_mut(chunks * STRIDE_BYTES);
+        let (simd_previous, remainder_prev_row) = previous.split_at(chunks * STRIDE_BYTES);
+
+        let current_iter = simd_current.chunks_exact_mut(STRIDE_BYTES);
+        let previous_iter = simd_previous.chunks_exact(STRIDE_BYTES);
+        let combined_iter = current_iter.zip(previous_iter);
+
+        for (current_chunk, previous_chunk) in combined_iter {
+            let mut x: Simd<u8, STRIDE_BYTES> = Simd::<u8, STRIDE_BYTES>::from_slice(current_chunk);
+            let b: Simd<u8, STRIDE_BYTES> = Simd::<u8, STRIDE_BYTES>::from_slice(previous_chunk);
+
+            vlast_simd = process_avg_chunk_bpp4_s64(vlast_simd, &b, &mut x);
+
+            x.copy_to_slice(current_chunk);
+        }
+
+        // Scalar remainder
+        let mut vlast_scalar = vlast_simd.to_array();
+        for (chunk, above) in remainder_current
+            .chunks_exact_mut(BPP)
+            .zip(remainder_prev_row.chunks_exact(BPP))
+        {
+            let new_chunk = [
+                chunk[0].wrapping_add(((above[0] as u16 + vlast_scalar[0] as u16) / 2) as u8),
+                chunk[1].wrapping_add(((above[1] as u16 + vlast_scalar[1] as u16) / 2) as u8),
+                chunk[2].wrapping_add(((above[2] as u16 + vlast_scalar[2] as u16) / 2) as u8),
+                chunk[3].wrapping_add(((above[3] as u16 + vlast_scalar[3] as u16) / 2) as u8),
+            ];
+            *TryInto::<&mut [u8; BPP]>::try_into(chunk).unwrap() = new_chunk;
+            vlast_scalar = new_chunk;
+        }
+    }
 }
 
 // This code path is used on non-x86_64 architectures but we allow dead code
@@ -691,6 +784,11 @@ pub(crate) fn unfilter(
                 }
             }
             BytesPerPixel::Four => {
+                #[cfg(feature = "unstable")]
+                {
+                    simd::avg_unfilter_bpp4(current, previous);
+                    return;
+                }
                 let mut lprev = [0; 4];
                 for (chunk, above) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4)) {
                     let new_chunk = [