Skip to content

Commit 1ce4731

Browse files
committed
simd-0332: reduce chacha rounds for turbine from 20 to 8
1 parent 943c7d8 commit 1ce4731

File tree

12 files changed

+118
-89
lines changed

12 files changed

+118
-89
lines changed

agave

src/ballet/chacha/fd_chacha.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77

88
#define FD_CHACHA_BLOCK_SZ (64UL)
99

10-
/* FD_CHACHA20_KEY_SZ is the size of the ChaCha20 encryption key */
10+
/* FD_CHACHA_KEY_SZ is the size of the ChaCha20 encryption key */
1111

12-
#define FD_CHACHA20_KEY_SZ (32UL)
12+
#define FD_CHACHA_KEY_SZ (32UL)
1313

1414
FD_PROTOTYPES_BEGIN
1515

src/ballet/chacha/fd_chacha_rng.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,20 @@ fd_chacha_rng_delete( void * shrng ) {
5858
return shrng;
5959
}
6060

61+
fd_chacha_rng_t *
62+
fd_chacha8_rng_init( fd_chacha_rng_t * rng,
63+
void const * key ) {
64+
memcpy( rng->key, key, FD_CHACHA_KEY_SZ );
65+
rng->buf_off = 0UL;
66+
rng->buf_fill = 0UL;
67+
fd_chacha8_rng_private_refill( rng );
68+
return rng;
69+
}
70+
6171
fd_chacha_rng_t *
6272
fd_chacha20_rng_init( fd_chacha_rng_t * rng,
6373
void const * key ) {
64-
memcpy( rng->key, key, FD_CHACHA20_KEY_SZ );
74+
memcpy( rng->key, key, FD_CHACHA_KEY_SZ );
6575
rng->buf_off = 0UL;
6676
rng->buf_fill = 0UL;
6777
fd_chacha20_rng_private_refill( rng );

src/ballet/chacha/fd_chacha_rng.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,35 @@ fd_chacha20_rng_ulong_roll( fd_chacha_rng_t * rng,
252252
}
253253
}
254254

255+
static inline ulong
256+
fd_chacha8_rng_ulong_roll( fd_chacha_rng_t * rng,
257+
ulong n ) {
258+
ulong const zone = fd_ulong_if( rng->mode==FD_CHACHA_RNG_MODE_MOD,
259+
ULONG_MAX - (ULONG_MAX-n+1UL)%n,
260+
(n << (63 - fd_ulong_find_msb( n ) )) - 1UL );
261+
262+
for( int i=0; 1; i++ ) {
263+
ulong v = fd_chacha8_rng_ulong( rng );
264+
#if FD_HAS_INT128
265+
/* Compiles to one mulx instruction */
266+
uint128 res = (uint128)v * (uint128)n;
267+
ulong hi = (ulong)(res>>64);
268+
ulong lo = (ulong) res;
269+
#else
270+
ulong hi, lo;
271+
fd_uwide_mul( &hi, &lo, v, n );
272+
#endif
273+
274+
# if FD_CHACHA_RNG_DEBUG
275+
FD_LOG_DEBUG(( "roll (attempt %d): n=%016lx zone: %016lx v=%016lx lo=%016lx hi=%016lx", i, n, zone, v, lo, hi ));
276+
# else
277+
(void)i;
278+
# endif /* FD_CHACHA_RNG_DEBUG */
279+
280+
if( FD_LIKELY( lo<=zone ) ) return hi;
281+
}
282+
}
283+
255284
FD_PROTOTYPES_END
256285

257286
#endif /* HEADER_fd_src_ballet_chacha20_fd_chacha20_rng_h */

src/ballet/wsample/fd_wsample.c

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,8 @@ struct __attribute__((aligned(64UL))) fd_wsample_private {
127127
uint height;
128128
char restore_enabled;
129129
char poisoned_mode;
130-
/* Two bytes of padding here */
130+
uchar rng_algo;
131+
/* One byte of padding here */
131132

132133
fd_chacha_rng_t * rng;
133134

@@ -302,6 +303,7 @@ fd_wsample_new_init( void * shmem,
302303
sampler->height = (uint)height;
303304
sampler->restore_enabled = (char)!!restore_enabled;
304305
sampler->poisoned_mode = 0;
306+
sampler->rng_algo = 0;
305307
sampler->rng = rng;
306308

307309
fd_memset( sampler->tree, (char)0, internal_cnt*sizeof(tree_ele_t) );
@@ -389,18 +391,25 @@ fd_wsample_delete( void * shmem ) {
389391
return shmem;
390392
}
391393

392-
393-
394-
fd_chacha_rng_t * fd_wsample_get_rng( fd_wsample_t * sampler ) { return sampler->rng; }
395-
396-
397-
/* TODO: Should this function exist at all? */
398394
void
399-
fd_wsample_seed_rng( fd_chacha_rng_t * rng,
400-
uchar seed[static 32] ) {
401-
fd_chacha20_rng_init( rng, seed );
395+
fd_wsample_seed_rng( fd_wsample_t * sampler,
396+
uchar seed[ 32 ],
397+
int use_chacha8 ) {
398+
sampler->rng_algo = fd_uchar_if( use_chacha8, FD_WSAMPLE_RNG_CHACHA8, FD_WSAMPLE_RNG_CHACHA20 );
399+
if( FD_UNLIKELY( sampler->rng_algo==FD_WSAMPLE_RNG_CHACHA8 ) ) {
400+
fd_chacha8_rng_init( sampler->rng, seed );
401+
} else {
402+
fd_chacha20_rng_init( sampler->rng, seed );
403+
}
402404
}
403405

406+
ulong
407+
fd_wsample_rng_ulong_roll( fd_wsample_t * sampler, ulong n ) {
408+
if( FD_UNLIKELY( sampler->rng_algo==FD_WSAMPLE_RNG_CHACHA8 ) ) {
409+
return fd_chacha8_rng_ulong_roll( sampler->rng, n );
410+
}
411+
return fd_chacha20_rng_ulong_roll( sampler->rng, n );
412+
}
404413

405414
fd_wsample_t *
406415
fd_wsample_restore_all( fd_wsample_t * sampler ) {
@@ -597,7 +606,7 @@ fd_wsample_remove_idx( fd_wsample_t * sampler,
597606
but operations with mask registers are frustratingly slow. Instead,
598607
we first define x'=x+1, so then v0<=x is equivalent to v0-x'<0, or
599608
whether v0-x' has the high bit set. Because of
600-
fd_chacha20_rng_ulong_roll's contract, we know x<ULONG_MAX, so forming
609+
fd_wsample_rng_ulong_roll's contract, we know x<ULONG_MAX, so forming
601610
x' is safe. We then use a _mm512_permutexvar_epi8 to essentially
602611
broadcast the high byte from each of the ulongs (really we just need
603612
the high bit) to the whole vector. A popcnt gives us our base
@@ -734,7 +743,7 @@ fd_wsample_sample_and_remove_many( fd_wsample_t * sampler,
734743
for( ulong i=0UL; i<cnt; i++ ) {
735744
if( FD_UNLIKELY( !sampler->unremoved_weight ) ) { idxs[ i ] = FD_WSAMPLE_EMPTY; continue; }
736745
if( FD_UNLIKELY( sampler->poisoned_mode ) ) { idxs[ i ] = FD_WSAMPLE_INDETERMINATE; continue; }
737-
ulong unif = fd_chacha20_rng_ulong_roll( sampler->rng, sampler->unremoved_weight+sampler->poisoned_weight );
746+
ulong unif = fd_wsample_rng_ulong_roll( sampler, sampler->unremoved_weight+sampler->poisoned_weight );
738747
if( FD_UNLIKELY( unif>=sampler->unremoved_weight ) ) {
739748
idxs[ i ] = FD_WSAMPLE_INDETERMINATE;
740749
sampler->poisoned_mode = 1;
@@ -771,7 +780,7 @@ ulong
771780
fd_wsample_sample( fd_wsample_t * sampler ) {
772781
if( FD_UNLIKELY( !sampler->unremoved_weight ) ) return FD_WSAMPLE_EMPTY;
773782
if( FD_UNLIKELY( sampler->poisoned_mode ) ) return FD_WSAMPLE_INDETERMINATE;
774-
ulong unif = fd_chacha20_rng_ulong_roll( sampler->rng, sampler->unremoved_weight+sampler->poisoned_weight );
783+
ulong unif = fd_wsample_rng_ulong_roll( sampler, sampler->unremoved_weight+sampler->poisoned_weight );
775784
if( FD_UNLIKELY( unif>=sampler->unremoved_weight ) ) return FD_WSAMPLE_INDETERMINATE;
776785
return (ulong)fd_wsample_map_sample( sampler, unif );
777786
}
@@ -780,7 +789,7 @@ ulong
780789
fd_wsample_sample_and_remove( fd_wsample_t * sampler ) {
781790
if( FD_UNLIKELY( !sampler->unremoved_weight ) ) return FD_WSAMPLE_EMPTY;
782791
if( FD_UNLIKELY( sampler->poisoned_mode ) ) return FD_WSAMPLE_INDETERMINATE;
783-
ulong unif = fd_chacha20_rng_ulong_roll( sampler->rng, sampler->unremoved_weight+sampler->poisoned_weight );
792+
ulong unif = fd_wsample_rng_ulong_roll( sampler, sampler->unremoved_weight+sampler->poisoned_weight );
784793
if( FD_UNLIKELY( unif>=sampler->unremoved_weight ) ) {
785794
sampler->poisoned_mode = 1;
786795
return FD_WSAMPLE_INDETERMINATE;

src/ballet/wsample/fd_wsample.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
struct fd_wsample_private;
2121
typedef struct fd_wsample_private fd_wsample_t;
2222

23+
#define FD_WSAMPLE_RNG_CHACHA20 (0U)
24+
#define FD_WSAMPLE_RNG_CHACHA8 (1U)
25+
2326
#define FD_WSAMPLE_ALIGN (64UL)
2427
/* fd_leaders really wants a compile time-compatible footprint... The
2528
internal count is 1/8 * (9^ceil(log_9(ele_cnt)) - 1) */
@@ -135,10 +138,10 @@ void * fd_wsample_new_fini( void * shmem, ulong poisoned_weight );
135138
/* fd_wsample_get_rng returns the value provided for rng in new. */
136139
fd_chacha_rng_t * fd_wsample_get_rng( fd_wsample_t * sampler );
137140

138-
/* fd_wsample_seed_rng seeds the ChaCha20 rng with the provided seed in
141+
/* fd_wsample_seed_rng seeds the ChaCha rng with the provided seed in
139142
preparation for sampling. This function is compatible with Solana's
140143
ChaChaRng::from_seed. */
141-
void fd_wsample_seed_rng( fd_chacha_rng_t * rng, uchar seed[static 32] );
144+
void fd_wsample_seed_rng( fd_wsample_t * sampler, uchar seed[ 32 ], int use_chacha8 );
142145

143146
/* fd_wsample_sample{_and_remove}{,_many} produces one or cnt (in the
144147
_many case) weighted random samples from the sampler. If the
@@ -191,6 +194,9 @@ void fd_wsample_remove_idx( fd_wsample_t * sampler, ulong idx );
191194
in which case no elements are restored. */
192195
fd_wsample_t * fd_wsample_restore_all( fd_wsample_t * sampler );
193196

194-
197+
/* fd_wsample_rng_ulong_roll returns an uniform IID rand in [0,n)
198+
analogous to fd_rng_ulong_roll. Internally it uses chacha8 or
199+
chacha20, based on how the wsample was initialized. */
200+
ulong fd_wsample_rng_ulong_roll( fd_wsample_t * sampler, ulong n );
195201

196202
#endif /* HEADER_fd_src_ballet_wsample_fd_wsample_h */

src/ballet/wsample/test_wsample.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ test_matches_solana( void ) {
191191

192192
void * partial = fd_wsample_new_init( _shmem, rng, 2UL, 0, FD_WSAMPLE_HINT_FLAT );
193193
fd_wsample_t * tree = fd_wsample_join( fd_wsample_new_fini( fd_wsample_new_add( fd_wsample_new_add( partial, 2UL ), 1UL ), 0UL ) );
194-
fd_wsample_seed_rng( fd_wsample_get_rng( tree ), zero_seed );
194+
fd_wsample_seed_rng( tree, zero_seed, 0 /* use_chacha8 */ );
195195

196196
FD_TEST( fd_wsample_sample( tree ) == 0UL );
197197
FD_TEST( fd_wsample_sample( tree ) == 0UL );
@@ -219,7 +219,7 @@ test_matches_solana( void ) {
219219
partial = fd_wsample_new_init( _shmem, rng, 18UL, 0, FD_WSAMPLE_HINT_FLAT );
220220
for( ulong i=0UL; i<18UL; i++ ) partial = fd_wsample_new_add( partial, weights2[i] );
221221
tree = fd_wsample_join( fd_wsample_new_fini( partial, 0UL ) );
222-
fd_wsample_seed_rng( fd_wsample_get_rng( tree ), zero_seed );
222+
fd_wsample_seed_rng( tree, zero_seed, 0 /* use_chacha8 */ );
223223

224224

225225
FD_TEST( fd_wsample_sample_and_remove( tree ) == 9UL );

src/disco/shred/fd_shred_dest.c

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ static const fd_pubkey_t null_pubkey = {{ 0 }};
2121
#include "../../util/tmpl/fd_map_dynamic.c"
2222

2323

24-
/* This 45 byte struct gets hashed to compute the seed for Chacha20 to
24+
/* This 45 byte struct gets hashed to compute the seed for ChaCha to
2525
compute the shred destinations. */
2626
struct __attribute__((packed)) shred_dest_input {
2727
ulong slot;
@@ -150,14 +150,19 @@ void * fd_shred_dest_delete( void * mem ) {
150150
return mem;
151151
}
152152

153+
static inline ulong
154+
fd_sdest_rng_ulong_roll( fd_shred_dest_t * sdest, ulong n ) {
155+
return fd_wsample_rng_ulong_roll( sdest->staked, n );
156+
}
157+
153158
/* sample_unstaked, sample_unstaked_noprepare, and
154159
prepare_unstaked_sampling are used to perform the specific form of
155160
unweighted random sampling that Solana uses for unstaked validators.
156161
In essence, you:
157162
1. construct a list of all the unstaked validators,
158163
2. delete the leader (if present)
159164
then repeatedly:
160-
3. choose the chacha20rng_roll( |unstaked| )th element.
165+
3. choose the chacha_rng_roll( |unstaked| )th element.
161166
4. swap the last element in unstaked with the chosen element
162167
5. return and remove the chosen element (which is now in the last
163168
position, so remove is O(1)).
@@ -191,7 +196,7 @@ sample_unstaked_noprepare( fd_shred_dest_t * sdest,
191196
ulong unstaked_cnt = sdest->unstaked_cnt - (ulong)remove_in_interval;
192197
if( FD_UNLIKELY( unstaked_cnt==0UL ) ) return FD_WSAMPLE_EMPTY;
193198

194-
ulong sample = sdest->staked_cnt + fd_chacha20_rng_ulong_roll( sdest->rng, unstaked_cnt );
199+
ulong sample = sdest->staked_cnt + fd_sdest_rng_ulong_roll( sdest, unstaked_cnt );
195200
return fd_ulong_if( (!remove_in_interval) | (sample<remove_idx), sample, sample+1UL );
196201
}
197202

@@ -219,7 +224,7 @@ static inline ulong
219224
sample_unstaked( fd_shred_dest_t * sdest ) {
220225
if( FD_UNLIKELY( sdest->unstaked_unremoved_cnt==0UL ) ) return FD_WSAMPLE_EMPTY;
221226

222-
ulong sample = fd_chacha20_rng_ulong_roll( sdest->rng, sdest->unstaked_unremoved_cnt );
227+
ulong sample = fd_sdest_rng_ulong_roll( sdest, sdest->unstaked_unremoved_cnt );
223228
ulong to_return = sdest->unstaked[sample];
224229
sdest->unstaked[sample] = sdest->unstaked[--sdest->unstaked_unremoved_cnt];
225230
return to_return;
@@ -261,7 +266,8 @@ fd_shred_dest_idx_t *
261266
fd_shred_dest_compute_first( fd_shred_dest_t * sdest,
262267
fd_shred_t const * const * input_shreds,
263268
ulong shred_cnt,
264-
fd_shred_dest_idx_t * out ) {
269+
fd_shred_dest_idx_t * out,
270+
int use_chacha8 ) {
265271

266272
if( FD_UNLIKELY( shred_cnt==0UL ) ) return out;
267273

@@ -290,7 +296,7 @@ fd_shred_dest_compute_first( fd_shred_dest_t * sdest,
290296

291297
int any_staked_candidates = sdest->staked_cnt > (ulong)source_validator_is_staked;
292298
for( ulong i=0UL; i<shred_cnt; i++ ) {
293-
fd_wsample_seed_rng( fd_wsample_get_rng( sdest->staked ), dest_hash_outputs[ i ] );
299+
fd_wsample_seed_rng( sdest->staked, dest_hash_outputs[ i ], use_chacha8 );
294300
/* Map FD_WSAMPLE_INDETERMINATE to FD_SHRED_DEST_NO_DEST */
295301
if( FD_LIKELY( any_staked_candidates ) ) out[i] = (fd_shred_dest_idx_t)fd_ulong_min( fd_wsample_sample( sdest->staked ), FD_SHRED_DEST_NO_DEST );
296302
else out[i] = (fd_shred_dest_idx_t)sample_unstaked_noprepare( sdest, sdest->source_validator_orig_idx );
@@ -308,7 +314,8 @@ fd_shred_dest_compute_children( fd_shred_dest_t * sdest,
308314
ulong out_stride,
309315
ulong fanout,
310316
ulong dest_cnt,
311-
ulong * opt_max_dest_cnt ) {
317+
ulong * opt_max_dest_cnt,
318+
int use_chacha8 ) {
312319

313320
/* The logic here is a little tricky since we are keeping track of
314321
staked and unstaked separately and only logically concatenating
@@ -358,7 +365,7 @@ fd_shred_dest_compute_children( fd_shred_dest_t * sdest,
358365
if( FD_LIKELY( query && leader_is_staked ) ) fd_wsample_remove_idx( sdest->staked, leader_idx );
359366

360367
ulong my_idx = 0UL;
361-
fd_wsample_seed_rng( fd_wsample_get_rng( sdest->staked ), dest_hash_outputs[ i ] ); /* Seeds both samplers since the rng is shared */
368+
fd_wsample_seed_rng( sdest->staked, dest_hash_outputs[ i ], use_chacha8 ); /* Seeds both samplers since the rng is shared */
362369

363370
if( FD_UNLIKELY( !i_am_staked ) ) {
364371
/* If there's excluded stake, we don't know about any unstaked

src/disco/shred/fd_shred_dest.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,8 @@ fd_shred_dest_idx_t *
161161
fd_shred_dest_compute_first( fd_shred_dest_t * sdest,
162162
fd_shred_t const * const * input_shreds,
163163
ulong shred_cnt,
164-
fd_shred_dest_idx_t * out );
164+
fd_shred_dest_idx_t * out,
165+
int use_chacha8 );
165166

166167
/* fd_shred_dest_compute_children computes the source validator's
167168
children in the Turbine tree for each of the provided shreds.
@@ -205,7 +206,8 @@ fd_shred_dest_compute_children( fd_shred_dest_t * sdest,
205206
ulong out_stride,
206207
ulong fanout,
207208
ulong dest_cnt,
208-
ulong * opt_max_dest_cnt );
209+
ulong * opt_max_dest_cnt,
210+
int use_chacha8 );
209211

210212
/* fd_shred_dest_idx_to_dest maps a destination index (as produced by
211213
fd_shred_dest_compute_children or fd_shred_dest_compute_first) to an

0 commit comments

Comments
 (0)