Skip to content

Commit 9273984

Browse files
ibrl: nontemporal memcpy on bundle path
1 parent c098c50 commit 9273984

File tree

1 file changed

+61
-63
lines changed

1 file changed

+61
-63
lines changed

src/disco/pack/fd_pack.c

Lines changed: 61 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1128,6 +1128,64 @@ delete_worst( fd_pack_t * pack,
11281128
return delete_transaction( pack, worst, 1, 1 );
11291129
}
11301130

1131+
static inline
1132+
void fd_pack_copy_txnp( fd_txn_p_t * out, fd_pack_ord_txn_t const * cur ) {
1133+
fd_txn_t const * txn = TXN(cur->txn);
1134+
if(
1135+
#if FD_HAS_AVX512 && FD_PACK_USE_NON_TEMPORAL_MEMCPY
1136+
FD_LIKELY( cur->txn->payload_sz>=1024UL )
1137+
#else
1138+
0
1139+
#endif
1140+
) {
1141+
#if FD_HAS_AVX512 && FD_PACK_USE_NON_TEMPORAL_MEMCPY
1142+
_mm512_stream_si512( (void*)(out->payload+ 0UL), _mm512_load_epi64( cur->txn->payload+ 0UL ) );
1143+
_mm512_stream_si512( (void*)(out->payload+ 64UL), _mm512_load_epi64( cur->txn->payload+ 64UL ) );
1144+
_mm512_stream_si512( (void*)(out->payload+ 128UL), _mm512_load_epi64( cur->txn->payload+ 128UL ) );
1145+
_mm512_stream_si512( (void*)(out->payload+ 192UL), _mm512_load_epi64( cur->txn->payload+ 192UL ) );
1146+
_mm512_stream_si512( (void*)(out->payload+ 256UL), _mm512_load_epi64( cur->txn->payload+ 256UL ) );
1147+
_mm512_stream_si512( (void*)(out->payload+ 320UL), _mm512_load_epi64( cur->txn->payload+ 320UL ) );
1148+
_mm512_stream_si512( (void*)(out->payload+ 384UL), _mm512_load_epi64( cur->txn->payload+ 384UL ) );
1149+
_mm512_stream_si512( (void*)(out->payload+ 448UL), _mm512_load_epi64( cur->txn->payload+ 448UL ) );
1150+
_mm512_stream_si512( (void*)(out->payload+ 512UL), _mm512_load_epi64( cur->txn->payload+ 512UL ) );
1151+
_mm512_stream_si512( (void*)(out->payload+ 576UL), _mm512_load_epi64( cur->txn->payload+ 576UL ) );
1152+
_mm512_stream_si512( (void*)(out->payload+ 640UL), _mm512_load_epi64( cur->txn->payload+ 640UL ) );
1153+
_mm512_stream_si512( (void*)(out->payload+ 704UL), _mm512_load_epi64( cur->txn->payload+ 704UL ) );
1154+
_mm512_stream_si512( (void*)(out->payload+ 768UL), _mm512_load_epi64( cur->txn->payload+ 768UL ) );
1155+
_mm512_stream_si512( (void*)(out->payload+ 832UL), _mm512_load_epi64( cur->txn->payload+ 832UL ) );
1156+
_mm512_stream_si512( (void*)(out->payload+ 896UL), _mm512_load_epi64( cur->txn->payload+ 896UL ) );
1157+
_mm512_stream_si512( (void*)(out->payload+ 960UL), _mm512_load_epi64( cur->txn->payload+ 960UL ) );
1158+
_mm512_stream_si512( (void*)(out->payload+1024UL), _mm512_load_epi64( cur->txn->payload+1024UL ) );
1159+
_mm512_stream_si512( (void*)(out->payload+1088UL), _mm512_load_epi64( cur->txn->payload+1088UL ) );
1160+
_mm512_stream_si512( (void*)(out->payload+1152UL), _mm512_load_epi64( cur->txn->payload+1152UL ) );
1161+
_mm512_stream_si512( (void*)(out->payload+1216UL), _mm512_load_epi64( cur->txn->payload+1216UL ) );
1162+
/* Copied out to 1280 bytes, which copies some other fields we needed to
1163+
copy anyway. */
1164+
FD_STATIC_ASSERT( offsetof(fd_txn_p_t, payload_sz )+sizeof(((fd_txn_p_t*)NULL)->payload_sz )<=1280UL, nt_memcpy );
1165+
FD_STATIC_ASSERT( offsetof(fd_txn_p_t, blockhash_slot )+sizeof(((fd_txn_p_t*)NULL)->blockhash_slot)<=1280UL, nt_memcpy );
1166+
FD_STATIC_ASSERT( offsetof(fd_txn_p_t, scheduler_arrival_time_nanos )+sizeof(((fd_txn_p_t*)NULL)->scheduler_arrival_time_nanos )<=1280UL, nt_memcpy );
1167+
FD_STATIC_ASSERT( offsetof(fd_txn_p_t, source_tpu )+sizeof(((fd_txn_p_t*)NULL)->source_tpu )<=1280UL, nt_memcpy );
1168+
FD_STATIC_ASSERT( offsetof(fd_txn_p_t, source_ipv4 )+sizeof(((fd_txn_p_t*)NULL)->source_ipv4 )<=1280UL, nt_memcpy );
1169+
FD_STATIC_ASSERT( offsetof(fd_txn_p_t, flags )+sizeof(((fd_txn_p_t*)NULL)->flags )<=1280UL, nt_memcpy );
1170+
FD_STATIC_ASSERT( offsetof(fd_txn_p_t, _ ) <=1280UL, nt_memcpy );
1171+
const ulong offset_into_txn = 1280UL - offsetof(fd_txn_p_t, _ );
1172+
fd_memcpy( offset_into_txn+(uchar *)TXN(out), offset_into_txn+(uchar const *)txn,
1173+
fd_ulong_max( offset_into_txn, fd_txn_footprint( txn->instr_cnt, txn->addr_table_lookup_cnt ) )-offset_into_txn );
1174+
#endif
1175+
} else {
1176+
fd_memcpy( out->payload, cur->txn->payload, cur->txn->payload_sz );
1177+
fd_memcpy( TXN(out), txn, fd_txn_footprint( txn->instr_cnt, txn->addr_table_lookup_cnt ) );
1178+
out->payload_sz = cur->txn->payload_sz;
1179+
out->pack_cu.requested_exec_plus_acct_data_cus = cur->txn->pack_cu.requested_exec_plus_acct_data_cus;
1180+
out->pack_cu.non_execution_cus = cur->txn->pack_cu.non_execution_cus;
1181+
out->scheduler_arrival_time_nanos = cur->txn->scheduler_arrival_time_nanos;
1182+
out->source_tpu = cur->txn->source_tpu;
1183+
out->source_ipv4 = cur->txn->source_ipv4;
1184+
out->flags = cur->txn->flags;
1185+
}
1186+
}
1187+
1188+
11311189
static inline int
11321190
validate_transaction( fd_pack_t * pack,
11331191
fd_pack_ord_txn_t const * ord,
@@ -1925,58 +1983,7 @@ fd_pack_schedule_impl( fd_pack_t * pack,
19251983
FD_PACK_BITSET_OR( bitset_rw_in_use, cur->rw_bitset );
19261984
FD_PACK_BITSET_OR( bitset_w_in_use, cur->w_bitset );
19271985

1928-
if(
1929-
#if FD_HAS_AVX512 && FD_PACK_USE_NON_TEMPORAL_MEMCPY
1930-
FD_LIKELY( cur->txn->payload_sz>=1024UL )
1931-
#else
1932-
0
1933-
#endif
1934-
) {
1935-
#if FD_HAS_AVX512 && FD_PACK_USE_NON_TEMPORAL_MEMCPY
1936-
_mm512_stream_si512( (void*)(out->payload+ 0UL), _mm512_load_epi64( cur->txn->payload+ 0UL ) );
1937-
_mm512_stream_si512( (void*)(out->payload+ 64UL), _mm512_load_epi64( cur->txn->payload+ 64UL ) );
1938-
_mm512_stream_si512( (void*)(out->payload+ 128UL), _mm512_load_epi64( cur->txn->payload+ 128UL ) );
1939-
_mm512_stream_si512( (void*)(out->payload+ 192UL), _mm512_load_epi64( cur->txn->payload+ 192UL ) );
1940-
_mm512_stream_si512( (void*)(out->payload+ 256UL), _mm512_load_epi64( cur->txn->payload+ 256UL ) );
1941-
_mm512_stream_si512( (void*)(out->payload+ 320UL), _mm512_load_epi64( cur->txn->payload+ 320UL ) );
1942-
_mm512_stream_si512( (void*)(out->payload+ 384UL), _mm512_load_epi64( cur->txn->payload+ 384UL ) );
1943-
_mm512_stream_si512( (void*)(out->payload+ 448UL), _mm512_load_epi64( cur->txn->payload+ 448UL ) );
1944-
_mm512_stream_si512( (void*)(out->payload+ 512UL), _mm512_load_epi64( cur->txn->payload+ 512UL ) );
1945-
_mm512_stream_si512( (void*)(out->payload+ 576UL), _mm512_load_epi64( cur->txn->payload+ 576UL ) );
1946-
_mm512_stream_si512( (void*)(out->payload+ 640UL), _mm512_load_epi64( cur->txn->payload+ 640UL ) );
1947-
_mm512_stream_si512( (void*)(out->payload+ 704UL), _mm512_load_epi64( cur->txn->payload+ 704UL ) );
1948-
_mm512_stream_si512( (void*)(out->payload+ 768UL), _mm512_load_epi64( cur->txn->payload+ 768UL ) );
1949-
_mm512_stream_si512( (void*)(out->payload+ 832UL), _mm512_load_epi64( cur->txn->payload+ 832UL ) );
1950-
_mm512_stream_si512( (void*)(out->payload+ 896UL), _mm512_load_epi64( cur->txn->payload+ 896UL ) );
1951-
_mm512_stream_si512( (void*)(out->payload+ 960UL), _mm512_load_epi64( cur->txn->payload+ 960UL ) );
1952-
_mm512_stream_si512( (void*)(out->payload+1024UL), _mm512_load_epi64( cur->txn->payload+1024UL ) );
1953-
_mm512_stream_si512( (void*)(out->payload+1088UL), _mm512_load_epi64( cur->txn->payload+1088UL ) );
1954-
_mm512_stream_si512( (void*)(out->payload+1152UL), _mm512_load_epi64( cur->txn->payload+1152UL ) );
1955-
_mm512_stream_si512( (void*)(out->payload+1216UL), _mm512_load_epi64( cur->txn->payload+1216UL ) );
1956-
/* Copied out to 1280 bytes, which copies some other fields we needed to
1957-
copy anyway. */
1958-
FD_STATIC_ASSERT( offsetof(fd_txn_p_t, payload_sz )+sizeof(((fd_txn_p_t*)NULL)->payload_sz )<=1280UL, nt_memcpy );
1959-
FD_STATIC_ASSERT( offsetof(fd_txn_p_t, blockhash_slot )+sizeof(((fd_txn_p_t*)NULL)->blockhash_slot)<=1280UL, nt_memcpy );
1960-
FD_STATIC_ASSERT( offsetof(fd_txn_p_t, scheduler_arrival_time_nanos )+sizeof(((fd_txn_p_t*)NULL)->scheduler_arrival_time_nanos )<=1280UL, nt_memcpy );
1961-
FD_STATIC_ASSERT( offsetof(fd_txn_p_t, source_tpu )+sizeof(((fd_txn_p_t*)NULL)->source_tpu )<=1280UL, nt_memcpy );
1962-
FD_STATIC_ASSERT( offsetof(fd_txn_p_t, source_ipv4 )+sizeof(((fd_txn_p_t*)NULL)->source_ipv4 )<=1280UL, nt_memcpy );
1963-
FD_STATIC_ASSERT( offsetof(fd_txn_p_t, flags )+sizeof(((fd_txn_p_t*)NULL)->flags )<=1280UL, nt_memcpy );
1964-
FD_STATIC_ASSERT( offsetof(fd_txn_p_t, _ ) <=1280UL, nt_memcpy );
1965-
const ulong offset_into_txn = 1280UL - offsetof(fd_txn_p_t, _ );
1966-
fd_memcpy( offset_into_txn+(uchar *)TXN(out), offset_into_txn+(uchar const *)txn,
1967-
fd_ulong_max( offset_into_txn, fd_txn_footprint( txn->instr_cnt, txn->addr_table_lookup_cnt ) )-offset_into_txn );
1968-
#endif
1969-
} else {
1970-
fd_memcpy( out->payload, cur->txn->payload, cur->txn->payload_sz );
1971-
fd_memcpy( TXN(out), txn, fd_txn_footprint( txn->instr_cnt, txn->addr_table_lookup_cnt ) );
1972-
out->payload_sz = cur->txn->payload_sz;
1973-
out->pack_cu.requested_exec_plus_acct_data_cus = cur->txn->pack_cu.requested_exec_plus_acct_data_cus;
1974-
out->pack_cu.non_execution_cus = cur->txn->pack_cu.non_execution_cus;
1975-
out->scheduler_arrival_time_nanos = cur->txn->scheduler_arrival_time_nanos;
1976-
out->source_tpu = cur->txn->source_tpu;
1977-
out->source_ipv4 = cur->txn->source_ipv4;
1978-
out->flags = cur->txn->flags;
1979-
}
1986+
fd_pack_copy_txnp( out, cur );
19801987
out++;
19811988

19821989
for( fd_txn_acct_iter_t iter=fd_txn_acct_iter_init( txn, FD_TXN_ACCT_CAT_WRITABLE );
@@ -2401,16 +2408,7 @@ fd_pack_try_schedule_bundle( fd_pack_t * pack,
24012408
_next = treap_rev_iter_next( _cur, pool );
24022409

24032410
fd_pack_ord_txn_t * cur = treap_rev_iter_ele( _cur, pool );
2404-
fd_txn_t const * txn = TXN(cur->txn);
2405-
fd_memcpy( out->payload, cur->txn->payload, cur->txn->payload_sz );
2406-
fd_memcpy( TXN(out), txn, fd_txn_footprint( txn->instr_cnt, txn->addr_table_lookup_cnt ) );
2407-
out->payload_sz = cur->txn->payload_sz;
2408-
out->pack_cu.requested_exec_plus_acct_data_cus = cur->txn->pack_cu.requested_exec_plus_acct_data_cus;
2409-
out->pack_cu.non_execution_cus = cur->txn->pack_cu.non_execution_cus;
2410-
out->scheduler_arrival_time_nanos = cur->txn->scheduler_arrival_time_nanos;
2411-
out->source_tpu = cur->txn->source_tpu;
2412-
out->source_ipv4 = cur->txn->source_ipv4;
2413-
out->flags = cur->txn->flags;
2411+
fd_pack_copy_txnp( out, cur );
24142412
out++;
24152413

24162414
pack->cumulative_block_cost += cur->compute_est;
@@ -3225,4 +3223,4 @@ fd_pack_verify( fd_pack_t * pack,
32253223
}
32263224

32273225
void * fd_pack_leave ( fd_pack_t * pack ) { FD_COMPILER_MFENCE(); return (void *)pack; }
3228-
void * fd_pack_delete( void * mem ) { FD_COMPILER_MFENCE(); return mem; }
3226+
void * fd_pack_delete( void * mem ) { FD_COMPILER_MFENCE(); return mem; }

0 commit comments

Comments
 (0)