@@ -292,6 +292,99 @@ mod simd {
292292 c_bpp = b_bpp. try_into ( ) . unwrap ( ) ;
293293 }
294294 }
295+
296+ /// Predictor for Avg filter: floor((left + above) / 2)
297+ #[ inline( always) ]
298+ fn avg_predictor_simd < const BPP : usize > (
299+ left : Simd < u8 , BPP > ,
300+ above : Simd < u8 , BPP > ,
301+ ) -> Simd < u8 , BPP >
302+ where
303+ LaneCount < BPP > : SupportedLaneCount ,
304+ {
305+ ( ( left. cast :: < u16 > ( ) + above. cast :: < u16 > ( ) ) >> Simd :: splat ( 1 ) ) . cast :: < u8 > ( )
306+ }
307+
308+ /// Processes a chunk of 16 pixels (64 bytes) Avg filter (bpp=4)
309+ #[ inline( always) ]
310+ fn process_avg_chunk_bpp4_s64 (
311+ mut current_a : Simd < u8 , 4 > , // Unfiltered left pixel from previous iteration/chunk
312+ b_vec : & Simd < u8 , 64 > , // Unfiltered above row chunk
313+ x_out : & mut Simd < u8 , 64 > , // Current row chunk (filtered -> unfiltered)
314+ ) -> Simd < u8 , 4 > {
315+ let x_in = * x_out;
316+ let mut preds = [ 0u8 ; 64 ] ;
317+
318+ macro_rules! process_pixel {
319+ ( $shift: expr) => {
320+ let pred = avg_predictor_simd( current_a, b_vec. extract:: <$shift, 4 >( ) ) ;
321+ current_a = x_in. extract:: <$shift, 4 >( ) + pred;
322+ preds[ $shift..$shift + 4 ] . copy_from_slice( pred. as_array( ) ) ;
323+ } ;
324+ }
325+
326+ process_pixel ! ( 0 ) ;
327+ process_pixel ! ( 4 ) ;
328+ process_pixel ! ( 8 ) ;
329+ process_pixel ! ( 12 ) ;
330+ process_pixel ! ( 16 ) ;
331+ process_pixel ! ( 20 ) ;
332+ process_pixel ! ( 24 ) ;
333+ process_pixel ! ( 28 ) ;
334+ process_pixel ! ( 32 ) ;
335+ process_pixel ! ( 36 ) ;
336+ process_pixel ! ( 40 ) ;
337+ process_pixel ! ( 44 ) ;
338+ process_pixel ! ( 48 ) ;
339+ process_pixel ! ( 52 ) ;
340+ process_pixel ! ( 56 ) ;
341+ process_pixel ! ( 60 ) ;
342+
343+ * x_out += Simd :: from_array ( preds) ;
344+ current_a
345+ }
346+
347+ /// Unfilters a row of pixels (16 at a time) with the avg filter.
348+ pub fn avg_unfilter_bpp4 ( current : & mut [ u8 ] , previous : & [ u8 ] ) {
349+ const BPP : usize = 4 ;
350+ const STRIDE_BYTES : usize = 64 ; // 16 pixels * 4 bytes/pixel
351+
352+ let mut vlast_simd: Simd < u8 , BPP > = Default :: default ( ) ; // Left pixel (unfiltered)
353+
354+ let chunks = current. len ( ) / STRIDE_BYTES ;
355+
356+ let ( simd_current, remainder_current) = current. split_at_mut ( chunks * STRIDE_BYTES ) ;
357+ let ( simd_previous, remainder_prev_row) = previous. split_at ( chunks * STRIDE_BYTES ) ;
358+
359+ let current_iter = simd_current. chunks_exact_mut ( STRIDE_BYTES ) ;
360+ let previous_iter = simd_previous. chunks_exact ( STRIDE_BYTES ) ;
361+ let combined_iter = current_iter. zip ( previous_iter) ;
362+
363+ for ( current_chunk, previous_chunk) in combined_iter {
364+ let mut x: Simd < u8 , STRIDE_BYTES > = Simd :: < u8 , STRIDE_BYTES > :: from_slice ( current_chunk) ;
365+ let b: Simd < u8 , STRIDE_BYTES > = Simd :: < u8 , STRIDE_BYTES > :: from_slice ( previous_chunk) ;
366+
367+ vlast_simd = process_avg_chunk_bpp4_s64 ( vlast_simd, & b, & mut x) ;
368+
369+ x. copy_to_slice ( current_chunk) ;
370+ }
371+
372+ // Scalar remainder
373+ let mut vlast_scalar = vlast_simd. to_array ( ) ;
374+ for ( chunk, above) in remainder_current
375+ . chunks_exact_mut ( BPP )
376+ . zip ( remainder_prev_row. chunks_exact ( BPP ) )
377+ {
378+ let new_chunk = [
379+ chunk[ 0 ] . wrapping_add ( ( ( above[ 0 ] as u16 + vlast_scalar[ 0 ] as u16 ) / 2 ) as u8 ) ,
380+ chunk[ 1 ] . wrapping_add ( ( ( above[ 1 ] as u16 + vlast_scalar[ 1 ] as u16 ) / 2 ) as u8 ) ,
381+ chunk[ 2 ] . wrapping_add ( ( ( above[ 2 ] as u16 + vlast_scalar[ 2 ] as u16 ) / 2 ) as u8 ) ,
382+ chunk[ 3 ] . wrapping_add ( ( ( above[ 3 ] as u16 + vlast_scalar[ 3 ] as u16 ) / 2 ) as u8 ) ,
383+ ] ;
384+ * TryInto :: < & mut [ u8 ; BPP ] > :: try_into ( chunk) . unwrap ( ) = new_chunk;
385+ vlast_scalar = new_chunk;
386+ }
387+ }
295388}
296389
297390// This code path is used on non-x86_64 architectures but we allow dead code
@@ -691,6 +784,11 @@ pub(crate) fn unfilter(
691784 }
692785 }
693786 BytesPerPixel :: Four => {
787+ #[ cfg( feature = "unstable" ) ]
788+ {
789+ simd:: avg_unfilter_bpp4 ( current, previous) ;
790+ return ;
791+ }
694792 let mut lprev = [ 0 ; 4 ] ;
695793 for ( chunk, above) in current. chunks_exact_mut ( 4 ) . zip ( previous. chunks_exact ( 4 ) ) {
696794 let new_chunk = [
0 commit comments