Skip to content

Commit daa55f5

Browse files
committed
Improve: Timing SVE
1 parent 2230fa7 commit daa55f5

File tree

2 files changed

+34
-6
lines changed

2 files changed

+34
-6
lines changed

.vscode/settings.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"excerise",
2121
"fconcepts",
2222
"Fedor",
23+
"Fugaku",
2324
"Goodput",
2425
"grandkids",
2526
"Hana",
@@ -36,6 +37,7 @@
3637
"Meneide",
3738
"MSVC",
3839
"Müller",
40+
"Neoverse",
3941
"Niebler",
4042
"Niels",
4143
"nlohmann",

less_slow.cpp

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1646,8 +1646,26 @@ BENCHMARK_CAPTURE(spread_memory, scatter_avx512, spread_scatter_avx512, 64)->Ran
16461646
#endif
16471647

16481648
#if defined(__ARM_FEATURE_SVE) // Arm NEON has no gather/scatter instructions, but SVE does 🥳
1649+
1650+
/**
1651+
* Arm Scalable Vector Extension @b (SVE) is one of the weirdest current SIMD
1652+
* extensions. Unlike AVX2 or AVX-512 (but much like RVV on RISC-V), it doesn't preset
1653+
* the register width at the ISA level! It's up to the physical implementation
1654+
* to choose any power of two between 128 and @b 2048 bits.
1655+
*
1656+
* In practice, the Fugaku supercomputer likely has the largest SVE implementation
1657+
* at 512 bits. The Arm Neoverse V1 core has 256-bit SVE. SVE also
1658+
* handles masking differently from AVX-512! Definitely worth reading about!
1659+
*
1660+
* @see "ARM's Scalable Vector Extensions: A Critical Look at SVE2 For Integer
1661+
* Workloads" by @ zingaburga:
1662+
* https://gist.github.com/zingaburga/805669eb891c820bd220418ee3f0d6bd
1663+
*
1664+
*/
16491665
#include <arm_sve.h>
16501666

1667+
// Upper bound on the SVE register width: 2048 bits, expressed here in bytes.
constexpr std::size_t max_sve_size_k {2048 / CHAR_BIT};
1668+
16511669
void spread_gather_sve( //
16521670
spread_data_t const *data, spread_index_t const *indices, spread_data_t *result, std::size_t size) {
16531671
for (std::size_t i = 0; i < size; i += svcntw()) {
@@ -1668,14 +1686,22 @@ void spread_scatter_sve( //
16681686
}
16691687
}
16701688

1671-
BENCHMARK_CAPTURE(spread_memory, gather_sve, spread_gather_sve)->Range(1 << 10, 1 << 20);
1672-
BENCHMARK_CAPTURE(spread_memory, scatter_sve, spread_scatter_sve)->Range(1 << 10, 1 << 20);
1689+
// Registers the `spread_memory` benchmark under the case name `gather_sve`, capturing
// `spread_gather_sve` and `max_sve_size_k` as its extra arguments. The benchmark argument
// ranges from `1 << 10` to `1 << 20`, and each run is timed for at least 5 seconds.
// NOTE(review): `max_sve_size_k` presumably fills the same slot as the `64` passed by the
// AVX-512 variant - confirm its meaning against `spread_memory`.
BENCHMARK_CAPTURE(spread_memory, gather_sve, spread_gather_sve, max_sve_size_k)->Range(1 << 10, 1 << 20)->MinTime(5);
1690+
// Registers the `spread_memory` benchmark under the case name `scatter_sve`, capturing
// `spread_scatter_sve` and `max_sve_size_k` as its extra arguments. The benchmark argument
// ranges from `1 << 10` to `1 << 20`, and each run is timed for at least 5 seconds.
// NOTE(review): `max_sve_size_k` presumably fills the same slot as the `64` passed by the
// AVX-512 variant - confirm its meaning against `spread_memory`.
BENCHMARK_CAPTURE(spread_memory, scatter_sve, spread_scatter_sve, max_sve_size_k)->Range(1 << 10, 1 << 20)->MinTime(5);
16731691

16741692
/**
16751693
* @b Finally! This may just be the first place where SVE supersedes NEON
1676-
* in functionality and has a bigger improvement over scalar code than AVX-512
1677-
* on a similar-level x86 platform! Both gathers and scatters are consistently
1678-
* @b 30% faster across small and large inputs!
1694+
* in functionality and may have a bigger improvement over scalar code than
1695+
* AVX-512 on a similar-level x86 platform!
1696+
*
1697+
* If you are very lucky with your input sizes, on small arrays under 65K
1698+
* on AWS Graviton, gathers can be up to 4x faster compared to serial code!
1699+
* On larger sizes, they again start losing to serial code. This makes
1700+
* their applicability very limited 😡
1701+
*
1702+
* Vectorized scatters are universally slower than serial code on Graviton
1703+
* for small inputs, but on larger ones, over 1 MB, they start winning by up to 50%!
1704+
* Great way to get everyone confused 🤬
16791705
*/
16801706
#endif
16811707

@@ -2917,7 +2943,7 @@ inline std::byte *reallocate_from_arena( //
29172943
}
29182944
}
29192945

2920-
// If we cant grow in place, do: allocate new + copy + free old
2946+
// If we can't grow in place, do: allocate new + copy + free old
29212947
std::byte *new_ptr = allocate_from_arena(arena, new_size);
29222948
if (!new_ptr) return nullptr; // Out of memory
29232949

0 commit comments

Comments
 (0)