Skip to content

Commit 61b972b

Browse files
committed
simd-0332: reduce chacha rounds for turbine from 20 to 8
1 parent 3be8509 commit 61b972b

File tree

14 files changed

+919
-66
lines changed

14 files changed

+919
-66
lines changed

src/ballet/chacha/fd_chacha.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77

88
#define FD_CHACHA_BLOCK_SZ (64UL)
99

10-
/* FD_CHACHA20_KEY_SZ is the size of the ChaCha20 encryption key */
10+
/* FD_CHACHA_KEY_SZ is the size of the ChaCha20 encryption key */
1111

12-
#define FD_CHACHA20_KEY_SZ (32UL)
12+
#define FD_CHACHA_KEY_SZ (32UL)
1313

1414
FD_PROTOTYPES_BEGIN
1515

src/ballet/chacha/fd_chacha_rng.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,20 @@ fd_chacha_rng_delete( void * shrng ) {
5858
return shrng;
5959
}
6060

61+
fd_chacha_rng_t *
62+
fd_chacha8_rng_init( fd_chacha_rng_t * rng,
63+
void const * key ) {
64+
memcpy( rng->key, key, FD_CHACHA_KEY_SZ );
65+
rng->buf_off = 0UL;
66+
rng->buf_fill = 0UL;
67+
fd_chacha8_rng_private_refill( rng );
68+
return rng;
69+
}
70+
6171
fd_chacha_rng_t *
6272
fd_chacha20_rng_init( fd_chacha_rng_t * rng,
6373
void const * key ) {
64-
memcpy( rng->key, key, FD_CHACHA20_KEY_SZ );
74+
memcpy( rng->key, key, FD_CHACHA_KEY_SZ );
6575
rng->buf_off = 0UL;
6676
rng->buf_fill = 0UL;
6777
fd_chacha20_rng_private_refill( rng );

src/ballet/chacha/fd_chacha_rng.h

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,7 @@
66
fd_rng is a better choice in all other cases. */
77

88
#include "fd_chacha.h"
9-
#if !FD_HAS_INT128
109
#include "../../util/bits/fd_uwide.h"
11-
#endif
1210

1311
/* FD_CHACHA_RNG_DEBUG controls debug logging. 0 is off; 1 is on. */
1412

@@ -231,16 +229,31 @@ fd_chacha20_rng_ulong_roll( fd_chacha_rng_t * rng,
231229
(n << (63 - fd_ulong_find_msb( n ) )) - 1UL );
232230

233231
for( int i=0; 1; i++ ) {
234-
ulong v = fd_chacha20_rng_ulong( rng );
235-
#if FD_HAS_INT128
236-
/* Compiles to one mulx instruction */
237-
uint128 res = (uint128)v * (uint128)n;
238-
ulong hi = (ulong)(res>>64);
239-
ulong lo = (ulong) res;
240-
#else
232+
ulong v = fd_chacha20_rng_ulong( rng );
233+
ulong hi, lo;
234+
fd_uwide_mul( &hi, &lo, v, n );
235+
236+
# if FD_CHACHA_RNG_DEBUG
237+
FD_LOG_DEBUG(( "roll (attempt %d): n=%016lx zone: %016lx v=%016lx lo=%016lx hi=%016lx", i, n, zone, v, lo, hi ));
238+
# else
239+
(void)i;
240+
# endif /* FD_CHACHA_RNG_DEBUG */
241+
242+
if( FD_LIKELY( lo<=zone ) ) return hi;
243+
}
244+
}
245+
246+
static inline ulong
247+
fd_chacha8_rng_ulong_roll( fd_chacha_rng_t * rng,
248+
ulong n ) {
249+
ulong const zone = fd_ulong_if( rng->mode==FD_CHACHA_RNG_MODE_MOD,
250+
ULONG_MAX - (ULONG_MAX-n+1UL)%n,
251+
(n << (63 - fd_ulong_find_msb( n ) )) - 1UL );
252+
253+
for( int i=0; 1; i++ ) {
254+
ulong v = fd_chacha8_rng_ulong( rng );
241255
ulong hi, lo;
242256
fd_uwide_mul( &hi, &lo, v, n );
243-
#endif
244257

245258
# if FD_CHACHA_RNG_DEBUG
246259
FD_LOG_DEBUG(( "roll (attempt %d): n=%016lx zone: %016lx v=%016lx lo=%016lx hi=%016lx", i, n, zone, v, lo, hi ));

src/ballet/wsample/fd_wsample.c

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,8 @@ struct __attribute__((aligned(64UL))) fd_wsample_private {
127127
uint height;
128128
char restore_enabled;
129129
char poisoned_mode;
130-
/* Two bytes of padding here */
130+
uchar rng_algo;
131+
/* One byte of padding here */
131132

132133
fd_chacha_rng_t * rng;
133134

@@ -302,6 +303,7 @@ fd_wsample_new_init( void * shmem,
302303
sampler->height = (uint)height;
303304
sampler->restore_enabled = (char)!!restore_enabled;
304305
sampler->poisoned_mode = 0;
306+
sampler->rng_algo = 0;
305307
sampler->rng = rng;
306308

307309
fd_memset( sampler->tree, (char)0, internal_cnt*sizeof(tree_ele_t) );
@@ -389,18 +391,25 @@ fd_wsample_delete( void * shmem ) {
389391
return shmem;
390392
}
391393

392-
393-
394-
fd_chacha_rng_t * fd_wsample_get_rng( fd_wsample_t * sampler ) { return sampler->rng; }
395-
396-
397-
/* TODO: Should this function exist at all? */
398394
void
399-
fd_wsample_seed_rng( fd_chacha_rng_t * rng,
400-
uchar seed[static 32] ) {
401-
fd_chacha20_rng_init( rng, seed );
395+
fd_wsample_seed_rng( fd_wsample_t * sampler,
396+
uchar seed[ 32 ],
397+
int use_chacha8 ) {
398+
sampler->rng_algo = fd_uchar_if( use_chacha8, FD_WSAMPLE_RNG_CHACHA8, FD_WSAMPLE_RNG_CHACHA20 );
399+
if( FD_UNLIKELY( sampler->rng_algo==FD_WSAMPLE_RNG_CHACHA8 ) ) {
400+
fd_chacha8_rng_init( sampler->rng, seed );
401+
} else {
402+
fd_chacha20_rng_init( sampler->rng, seed );
403+
}
402404
}
403405

406+
ulong
407+
fd_wsample_rng_ulong_roll( fd_wsample_t * sampler, ulong n ) {
408+
if( FD_UNLIKELY( sampler->rng_algo==FD_WSAMPLE_RNG_CHACHA8 ) ) {
409+
return fd_chacha8_rng_ulong_roll( sampler->rng, n );
410+
}
411+
return fd_chacha20_rng_ulong_roll( sampler->rng, n );
412+
}
404413

405414
fd_wsample_t *
406415
fd_wsample_restore_all( fd_wsample_t * sampler ) {
@@ -597,7 +606,7 @@ fd_wsample_remove_idx( fd_wsample_t * sampler,
597606
but operations with mask registers are frustratingly slow. Instead,
598607
we first define x'=x+1, so then v0<=x is equivalent to v0-x'<0, or
599608
whether v0-x' has the high bit set. Because of
600-
fd_chacha20_rng_ulong_roll's contract, we know x<ULONG_MAX, so forming
609+
fd_wsample_rng_ulong_roll's contract, we know x<ULONG_MAX, so forming
601610
x' is safe. We then use a _mm512_permutexvar_epi8 to essentially
602611
broadcast the high byte from each of the ulongs (really we just need
603612
the high bit) to the whole vector. A popcnt gives us our base
@@ -734,7 +743,7 @@ fd_wsample_sample_and_remove_many( fd_wsample_t * sampler,
734743
for( ulong i=0UL; i<cnt; i++ ) {
735744
if( FD_UNLIKELY( !sampler->unremoved_weight ) ) { idxs[ i ] = FD_WSAMPLE_EMPTY; continue; }
736745
if( FD_UNLIKELY( sampler->poisoned_mode ) ) { idxs[ i ] = FD_WSAMPLE_INDETERMINATE; continue; }
737-
ulong unif = fd_chacha20_rng_ulong_roll( sampler->rng, sampler->unremoved_weight+sampler->poisoned_weight );
746+
ulong unif = fd_wsample_rng_ulong_roll( sampler, sampler->unremoved_weight+sampler->poisoned_weight );
738747
if( FD_UNLIKELY( unif>=sampler->unremoved_weight ) ) {
739748
idxs[ i ] = FD_WSAMPLE_INDETERMINATE;
740749
sampler->poisoned_mode = 1;
@@ -771,7 +780,7 @@ ulong
771780
fd_wsample_sample( fd_wsample_t * sampler ) {
772781
if( FD_UNLIKELY( !sampler->unremoved_weight ) ) return FD_WSAMPLE_EMPTY;
773782
if( FD_UNLIKELY( sampler->poisoned_mode ) ) return FD_WSAMPLE_INDETERMINATE;
774-
ulong unif = fd_chacha20_rng_ulong_roll( sampler->rng, sampler->unremoved_weight+sampler->poisoned_weight );
783+
ulong unif = fd_wsample_rng_ulong_roll( sampler, sampler->unremoved_weight+sampler->poisoned_weight );
775784
if( FD_UNLIKELY( unif>=sampler->unremoved_weight ) ) return FD_WSAMPLE_INDETERMINATE;
776785
return (ulong)fd_wsample_map_sample( sampler, unif );
777786
}
@@ -780,7 +789,7 @@ ulong
780789
fd_wsample_sample_and_remove( fd_wsample_t * sampler ) {
781790
if( FD_UNLIKELY( !sampler->unremoved_weight ) ) return FD_WSAMPLE_EMPTY;
782791
if( FD_UNLIKELY( sampler->poisoned_mode ) ) return FD_WSAMPLE_INDETERMINATE;
783-
ulong unif = fd_chacha20_rng_ulong_roll( sampler->rng, sampler->unremoved_weight+sampler->poisoned_weight );
792+
ulong unif = fd_wsample_rng_ulong_roll( sampler, sampler->unremoved_weight+sampler->poisoned_weight );
784793
if( FD_UNLIKELY( unif>=sampler->unremoved_weight ) ) {
785794
sampler->poisoned_mode = 1;
786795
return FD_WSAMPLE_INDETERMINATE;

src/ballet/wsample/fd_wsample.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
struct fd_wsample_private;
2121
typedef struct fd_wsample_private fd_wsample_t;
2222

23+
#define FD_WSAMPLE_RNG_CHACHA20 (0U)
24+
#define FD_WSAMPLE_RNG_CHACHA8 (1U)
25+
2326
#define FD_WSAMPLE_ALIGN (64UL)
2427
/* fd_leaders really wants a compile time-compatible footprint... The
2528
internal count is 1/8 * (9^ceil(log_9(ele_cnt)) - 1) */
@@ -135,10 +138,10 @@ void * fd_wsample_new_fini( void * shmem, ulong poisoned_weight );
135138
/* fd_wsample_get_rng returns the value provided for rng in new. */
136139
fd_chacha_rng_t * fd_wsample_get_rng( fd_wsample_t * sampler );
137140

138-
/* fd_wsample_seed_rng seeds the ChaCha20 rng with the provided seed in
141+
/* fd_wsample_seed_rng seeds the ChaCha rng with the provided seed in
139142
preparation for sampling. This function is compatible with Solana's
140143
ChaChaRng::from_seed. */
141-
void fd_wsample_seed_rng( fd_chacha_rng_t * rng, uchar seed[static 32] );
144+
void fd_wsample_seed_rng( fd_wsample_t * sampler, uchar seed[ 32 ], int use_chacha8 );
142145

143146
/* fd_wsample_sample{_and_remove}{,_many} produces one or cnt (in the
144147
_many case) weighted random samples from the sampler. If the
@@ -191,6 +194,9 @@ void fd_wsample_remove_idx( fd_wsample_t * sampler, ulong idx );
191194
in which case no elements are restored. */
192195
fd_wsample_t * fd_wsample_restore_all( fd_wsample_t * sampler );
193196

194-
197+
/* fd_wsample_rng_ulong_roll returns an uniform IID rand in [0,n)
198+
analogous to fd_rng_ulong_roll. Internally it uses chacha8 or
199+
chacha20, based on how the wsample was initialized. */
200+
ulong fd_wsample_rng_ulong_roll( fd_wsample_t * sampler, ulong n );
195201

196202
#endif /* HEADER_fd_src_ballet_wsample_fd_wsample_h */

src/ballet/wsample/test_wsample.c

Lines changed: 63 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ test_matches_solana( void ) {
191191

192192
void * partial = fd_wsample_new_init( _shmem, rng, 2UL, 0, FD_WSAMPLE_HINT_FLAT );
193193
fd_wsample_t * tree = fd_wsample_join( fd_wsample_new_fini( fd_wsample_new_add( fd_wsample_new_add( partial, 2UL ), 1UL ), 0UL ) );
194-
fd_wsample_seed_rng( fd_wsample_get_rng( tree ), zero_seed );
194+
fd_wsample_seed_rng( tree, zero_seed, 0 /* use_chacha8 */ );
195195

196196
FD_TEST( fd_wsample_sample( tree ) == 0UL );
197197
FD_TEST( fd_wsample_sample( tree ) == 0UL );
@@ -214,13 +214,10 @@ test_matches_solana( void ) {
214214
ulong weights2[18] = { 78, 70, 38, 27, 21, 82, 42, 21, 77, 77, 17, 4, 50, 96, 83, 33, 16, 72 };
215215

216216
memset( zero_seed, 48, 32UL );
217-
fd_chacha20_rng_init( rng, zero_seed );
218-
219217
partial = fd_wsample_new_init( _shmem, rng, 18UL, 0, FD_WSAMPLE_HINT_FLAT );
220218
for( ulong i=0UL; i<18UL; i++ ) partial = fd_wsample_new_add( partial, weights2[i] );
221219
tree = fd_wsample_join( fd_wsample_new_fini( partial, 0UL ) );
222-
fd_wsample_seed_rng( fd_wsample_get_rng( tree ), zero_seed );
223-
220+
fd_wsample_seed_rng( tree, zero_seed, 0 /* use_chacha8 */ );
224221

225222
FD_TEST( fd_wsample_sample_and_remove( tree ) == 9UL );
226223
FD_TEST( fd_wsample_sample_and_remove( tree ) == 3UL );
@@ -245,6 +242,66 @@ test_matches_solana( void ) {
245242
fd_chacha_rng_delete( fd_chacha_rng_leave( rng ) );
246243
}
247244

245+
static void
246+
test_matches_solana_chacha8( void ) {
247+
/* Adopted from test_repeated_leader_schedule_specific: */
248+
fd_chacha_rng_t _rng[1];
249+
fd_chacha_rng_t * rng = fd_chacha_rng_join( fd_chacha_rng_new( _rng, FD_CHACHA_RNG_MODE_MOD ) );
250+
uchar zero_seed[32] = {0};
251+
252+
void * partial = fd_wsample_new_init( _shmem, rng, 2UL, 0, FD_WSAMPLE_HINT_FLAT );
253+
fd_wsample_t * tree = fd_wsample_join( fd_wsample_new_fini( fd_wsample_new_add( fd_wsample_new_add( partial, 2UL ), 1UL ), 0UL ) );
254+
fd_wsample_seed_rng( tree, zero_seed, 1 /* use_chacha8 */ );
255+
256+
FD_TEST( fd_wsample_sample( tree ) == 1UL );
257+
FD_TEST( fd_wsample_sample( tree ) == 0UL );
258+
FD_TEST( fd_wsample_sample( tree ) == 0UL );
259+
FD_TEST( fd_wsample_sample( tree ) == 0UL );
260+
FD_TEST( fd_wsample_sample( tree ) == 0UL );
261+
FD_TEST( fd_wsample_sample( tree ) == 0UL );
262+
FD_TEST( fd_wsample_sample( tree ) == 0UL );
263+
FD_TEST( fd_wsample_sample( tree ) == 0UL );
264+
265+
fd_wsample_delete( fd_wsample_leave( tree ) );
266+
fd_chacha_rng_delete( fd_chacha_rng_leave( rng ) );
267+
268+
rng = fd_chacha_rng_join( fd_chacha_rng_new( _rng, FD_CHACHA_RNG_MODE_SHIFT ) );
269+
270+
/* Adopted from test_weighted_shuffle_hard_coded, except they handle
271+
the special case for 0 weights inside their WeightedShuffle object,
272+
and the test case initially used i32 as weights, which made their
273+
Chacha20 object generate i32s instead of u64s. */
274+
ulong weights2[18] = { 78, 70, 38, 27, 21, 82, 42, 21, 77, 77, 17, 4, 50, 96, 83, 33, 16, 72 };
275+
276+
memset( zero_seed, 48, 32UL );
277+
partial = fd_wsample_new_init( _shmem, rng, 18UL, 0, FD_WSAMPLE_HINT_FLAT );
278+
for( ulong i=0UL; i<18UL; i++ ) partial = fd_wsample_new_add( partial, weights2[i] );
279+
tree = fd_wsample_join( fd_wsample_new_fini( partial, 0UL ) );
280+
fd_wsample_seed_rng( tree, zero_seed, 1 /* use_chacha8 */ );
281+
282+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 13UL );
283+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 8UL );
284+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 6UL );
285+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 14UL );
286+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 0UL );
287+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 17UL );
288+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 1UL );
289+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 12UL );
290+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 3UL );
291+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 16UL );
292+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 5UL );
293+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 15UL );
294+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 9UL );
295+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 2UL );
296+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 4UL );
297+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 7UL );
298+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 10UL );
299+
FD_TEST( fd_wsample_sample_and_remove( tree ) == 11UL );
300+
301+
fd_wsample_delete( fd_wsample_leave( tree ) );
302+
fd_chacha_rng_delete( fd_chacha_rng_leave( rng ) );
303+
}
304+
248305
static void
249306
test_sharing( void ) {
250307
fd_chacha_rng_t _rng[1];
@@ -472,6 +529,7 @@ main( int argc,
472529
FD_TEST( fd_wsample_footprint( MAX, 1 )<MAX_FOOTPRINT );
473530

474531
test_matches_solana();
532+
test_matches_solana_chacha8();
475533
test_map();
476534
test_sharing();
477535
test_restore_disabled();

src/disco/shred/Local.mk

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@ ifdef FD_HAS_ALLOCA
66
$(call add-objs,fd_shred_tile,fd_disco)
77
endif
88
$(call make-unit-test,test_shred_dest,test_shred_dest,fd_disco fd_flamenco fd_ballet fd_util)
9+
$(call make-unit-test,test_shred_dest_conformance,test_shred_dest_conformance,fd_disco fd_flamenco fd_ballet fd_util)
910
$(call make-unit-test,test_fec_resolver,test_fec_resolver,fd_flamenco fd_disco fd_ballet fd_util fd_tango fd_reedsol)
1011
$(call make-unit-test,test_stake_ci,test_stake_ci,fd_disco fd_flamenco fd_ballet fd_util fd_tango fd_reedsol)
1112
$(call run-unit-test,test_shred_dest,)
13+
$(call run-unit-test,test_shred_dest_conformance,)
1214
$(call run-unit-test,test_fec_resolver,)
1315
$(call run-unit-test,test_stake_ci,)
1416
ifdef FD_HAS_HOSTED

0 commit comments

Comments
 (0)