@@ -1646,8 +1646,26 @@ BENCHMARK_CAPTURE(spread_memory, scatter_avx512, spread_scatter_avx512, 64)->Ran
16461646#endif
16471647
16481648#if defined(__ARM_FEATURE_SVE) // Arm NEON has no gather/scatter instructions, but SVE does 🥳
1649+
1650+ /**
1651+ * Arm Scalable Vector Extension @b (SVE) is one of the weirdest current SIMD
1652+ * extensions. Unlike AVX2, AVX-512, or even RVV on RISC-V, it doesn't preset
1653+ * the register width at the ISA level! It's up to the physical implementation
1654+ * to choose any power of two between 128 and @b 2048 bits.
1655+ *
1656+ * In practice, the Fugaku supercomputer likely has the largest SVE implementation
1657+ * at 512-bits length. The Arm Neoverse N2 core has 256-bit SVE. It also
1658+ * handles masking differently from AVX-512! Definitely worth reading about!
1659+ *
1660+ * @see "ARM's Scalable Vector Extensions: A Critical Look at SVE2 For Integer
1661+ * Workloads" by @zingaburga:
1662+ * https://gist.github.com/zingaburga/805669eb891c820bd220418ee3f0d6bd
1663+ *
1664+ */
16491665#include <arm_sve.h>
16501666
1667+ constexpr std::size_t max_sve_size_k = 2048 / CHAR_BIT;
1668+
16511669void spread_gather_sve ( //
16521670 spread_data_t const *data, spread_index_t const *indices, spread_data_t *result, std::size_t size) {
16531671 for (std::size_t i = 0 ; i < size; i += svcntw ()) {
@@ -1668,14 +1686,22 @@ void spread_scatter_sve( //
16681686 }
16691687}
16701688
1671- BENCHMARK_CAPTURE (spread_memory, gather_sve, spread_gather_sve)->Range(1 << 10 , 1 << 20 );
1672- BENCHMARK_CAPTURE (spread_memory, scatter_sve, spread_scatter_sve)->Range(1 << 10 , 1 << 20 );
1689+ BENCHMARK_CAPTURE (spread_memory, gather_sve, spread_gather_sve, max_sve_size_k )->Range(1 << 10 , 1 << 20 )->MinTime( 5 );
1690+ BENCHMARK_CAPTURE (spread_memory, scatter_sve, spread_scatter_sve, max_sve_size_k )->Range(1 << 10 , 1 << 20 )->MinTime( 5 );
16731691
16741692/**
16751693 * @b Finally! This may just be the first place where SVE supersedes NEON
1676- * in functionality and has a bigger improvement over scalar code than AVX-512
1677- * on a similar-level x86 platform! Both gathers and scatters are consistently
1678- * @b 30% faster across small and large inputs!
1694+ * in functionality and may have a bigger improvement over scalar code than
1695+ * AVX-512 on a similar-level x86 platform!
1696+ *
1697+ * If you are very lucky with your input sizes, on small arrays under 65K
1698+ * on AWS Graviton, gathers can be up to 4x faster compared to serial code!
1699+ * On larger sizes, they again start losing to serial code. This makes
1700+ * their applicability very limited 😡
1701+ *
1702+ * Vectorized scatters are universally slower than serial code on Graviton
1703+ * for small inputs, but on larger ones over 1MB they start winning by up to 50%!
1704+ * Great way to get everyone confused 🤬
16791705 */
16801706#endif
16811707
@@ -2917,7 +2943,7 @@ inline std::byte *reallocate_from_arena( //
29172943 }
29182944 }
29192945
2920- // If we can’ t grow in place, do: allocate new + copy + free old
2946+ // If we can't grow in place, do: allocate new + copy + free old
29212947 std::byte *new_ptr = allocate_from_arena (arena, new_size);
29222948 if (!new_ptr) return nullptr ; // Out of memory
29232949
0 commit comments