@@ -3,18 +3,19 @@ package bft
33import (
44 "bytes"
55 "context"
6+ "crypto"
67 "encoding/hex"
78 "errors"
89 "fmt"
910 "math/big"
1011 "sync"
1112 "sync/atomic"
13+ "time"
1214
13- "github.com/libp2p/go-libp2p/core/peer"
1415 "github.com/unicitynetwork/bft-core/network"
1516 "github.com/unicitynetwork/bft-core/network/protocol/certification"
1617 "github.com/unicitynetwork/bft-core/network/protocol/handshake"
17- "github.com/unicitynetwork/bft-go-base/crypto"
18+ cryptobft "github.com/unicitynetwork/bft-go-base/crypto"
1819 "github.com/unicitynetwork/bft-go-base/types"
1920
2021 "github.com/unicitynetwork/aggregator-go/internal/config"
@@ -38,12 +39,11 @@ type (
3839 shardID types.ShardID
3940 logger * logger.Logger
4041
41- // mutex for peer, network, signer, rootNodes
42- mu sync.Mutex
43- peer * network.Peer
44- network * BftNetwork
45- rootNodes peer.IDSlice
46- signer crypto.Signer
42+ // mutex for peer, network, signer TODO: there are readers without mutex
43+ mu sync.Mutex
44+ peer * network.Peer
45+ network * BftNetwork
46+ signer cryptobft.Signer
4747
4848 // Latest UC this node has seen. Can be ahead of the committed UC during recovery.
4949 luc atomic.Pointer [types.UnicityCertificate ]
@@ -57,6 +57,11 @@ type (
5757 ucProcessingMutex sync.Mutex
5858
5959 msgLoopCancelFn context.CancelFunc
60+
61+ // timestamp when last UC was received
62+ lastCertResponseTime atomic.Int64
63+
64+ trustBaseStore TrustBaseStore
6065 }
6166
6267 BFTClient interface {
@@ -70,19 +75,26 @@ type (
7075 StartNewRound (ctx context.Context , roundNumber * api.BigInt ) error
7176 }
7277
78+ TrustBaseStore interface {
79+ GetByEpoch (ctx context.Context , epoch uint64 ) (types.RootTrustBase , error )
80+ }
81+
7382 status int
7483)
7584
76- func NewBFTClient (ctx context. Context , conf * config.BFTConfig , roundManager RoundManager , logger * logger.Logger ) (* BFTClientImpl , error ) {
85+ func NewBFTClient (conf * config.BFTConfig , roundManager RoundManager , trustBaseStore TrustBaseStore , luc * types. UnicityCertificate , logger * logger.Logger ) (* BFTClientImpl , error ) {
7786 logger .Info ("Creating BFT Client" )
7887 bftClient := & BFTClientImpl {
79- logger : logger ,
80- partitionID : conf .ShardConf .PartitionID ,
81- shardID : conf .ShardConf .ShardID ,
82- roundManager : roundManager ,
83- conf : conf ,
88+ logger : logger ,
89+ partitionID : conf .ShardConf .PartitionID ,
90+ shardID : conf .ShardConf .ShardID ,
91+ roundManager : roundManager ,
92+ trustBaseStore : trustBaseStore ,
93+ conf : conf ,
8494 }
8595 bftClient .status .Store (idle )
96+ bftClient .luc .Store (luc )
97+ bftClient .lastCertResponseTime .Store (time .Now ().UnixMilli ())
8698 return bftClient , nil
8799}
88100
@@ -119,10 +131,6 @@ func (c *BFTClientImpl) Start(ctx context.Context) error {
119131 if err != nil {
120132 return err
121133 }
122- rootNodes , err := c .conf .GetRootNodes ()
123- if err != nil {
124- return fmt .Errorf ("failed to get root nodes: %w" , err )
125- }
126134 signer , err := c .conf .KeyConf .Signer ()
127135 if err != nil {
128136 return fmt .Errorf ("failed to create signer: %w" , err )
@@ -136,19 +144,20 @@ func (c *BFTClientImpl) Start(ctx context.Context) error {
136144 c .peer = self
137145 c .network = networkP2P
138146 c .signer = signer
139- c .rootNodes = rootNodes
140147
141148 if err := c .peer .BootstrapConnect (ctx , c .logger .Logger ); err != nil {
142149 return fmt .Errorf ("failed to bootstrap peer: %w" , err )
143150 }
144151
145- if err := c .sendHandshake (ctx ); err != nil {
146- return fmt .Errorf ("failed to send handshake: %w" , err )
147- }
148-
149152 msgLoopCtx , cancelFn := context .WithCancel (ctx )
150153 c .msgLoopCancelFn = cancelFn
151- go c .loop (msgLoopCtx )
154+ go func () {
155+ if err := c .loop (msgLoopCtx ); err != nil {
156+ c .logger .Error ("BFT event loop thread exited with error" , "error" , err .Error ())
157+ } else {
158+ c .logger .Info ("BFT event loop thread finished" )
159+ }
160+ }()
152161
153162 return nil
154163}
@@ -170,16 +179,21 @@ func (c *BFTClientImpl) Stop() {
170179 c .peer = nil
171180 c .network = nil
172181 c .signer = nil
173- c .rootNodes = nil
174182 }
175183}
176184
177185func (c * BFTClientImpl ) sendHandshake (ctx context.Context ) error {
178186 c .logger .WithContext (ctx ).Debug ("sending handshake to root chain" )
187+
188+ // load trust base
189+ rootEpoch := c .luc .Load ().GetRootEpoch ()
190+ tb , err := c .trustBaseStore .GetByEpoch (ctx , rootEpoch )
191+ if err != nil {
192+ return fmt .Errorf ("failed to load trust base for epoch %d: %w" , rootEpoch , err )
193+ }
179194 // select some random root nodes
180- rootIDs , err := randomNodeSelector (c . rootNodes , defaultHandshakeNodes )
195+ rootIDs , err := randomNodeSelector (tb , defaultHandshakeNodes )
181196 if err != nil {
182- // error should only happen in case the root nodes are not initialized
183197 return fmt .Errorf ("failed to select root nodes for handshake: %w" , err )
184198 }
185199 if err = c .network .Send (ctx ,
@@ -195,6 +209,13 @@ func (c *BFTClientImpl) sendHandshake(ctx context.Context) error {
195209}
196210
197211func (c * BFTClientImpl ) loop (ctx context.Context ) error {
212+ if err := c .sendHandshake (ctx ); err != nil {
213+ return fmt .Errorf ("failed to send initial handshake: %w" , err )
214+ }
215+
216+ heartbeat := time .NewTicker (c .conf .HeartbeatInterval )
217+ defer heartbeat .Stop ()
218+
198219 for {
199220 select {
200221 case <- ctx .Done ():
@@ -205,6 +226,15 @@ func (c *BFTClientImpl) loop(ctx context.Context) error {
205226 }
206227 c .logger .WithContext (ctx ).Debug ("received message" , "type" , fmt .Sprintf ("%T" , m ))
207228 c .handleMessage (ctx , m )
229+ case <- heartbeat .C :
230+ lastCertMillis := c .lastCertResponseTime .Load ()
231+ lastCertTime := time .UnixMilli (lastCertMillis )
232+ if time .Since (lastCertTime ) > c .conf .InactivityTimeout {
233+ c .logger .Warn ("BFT client inactivity timeout exceeded, sending new handshake" )
234+ if err := c .sendHandshake (ctx ); err != nil {
235+ c .logger .Error ("failed to send handshake on inactivity timeout" , "error" , err .Error ())
236+ }
237+ }
208238 }
209239 }
210240}
@@ -213,16 +243,30 @@ func (c *BFTClientImpl) handleMessage(ctx context.Context, msg any) {
213243 switch mt := msg .(type ) {
214244 case * certification.CertificationResponse :
215245 c .logger .WithContext (ctx ).Info ("received CertificationResponse" )
216- c .handleCertificationResponse (ctx , mt )
246+ if err := c .handleCertificationResponse (ctx , mt ); err != nil {
247+ c .logger .WithContext (ctx ).Error ("error processing CertificationResponse message" , "error" , err .Error ())
248+ }
217249 default :
218250 c .logger .WithContext (ctx ).Info ("received unknown message" )
219251 }
220252}
221253
222254func (c * BFTClientImpl ) handleCertificationResponse (ctx context.Context , cr * certification.CertificationResponse ) error {
255+ c .lastCertResponseTime .Store (time .Now ().UnixMilli ())
256+
223257 if err := cr .IsValid (); err != nil {
224258 return fmt .Errorf ("invalid CertificationResponse: %w" , err )
225259 }
260+
261+ // verify UC
262+ tb , err := c .trustBaseStore .GetByEpoch (ctx , cr .UC .GetRootEpoch ())
263+ if err != nil {
264+ return fmt .Errorf ("failed to load trust base for epoch %d: %w" , cr .UC .GetRootEpoch (), err )
265+ }
266+ if err := cr .UC .Verify (tb , crypto .SHA256 , c .partitionID , c .shardID , nil ); err != nil {
267+ return fmt .Errorf ("failed to verify UC: %w" , err )
268+ }
269+
226270 c .logger .WithContext (ctx ).Info (fmt .Sprintf ("handleCertificationResponse: UC round %d, next round %d, next leader %s" ,
227271 cr .UC .GetRoundNumber (), cr .Technical .Round , cr .Technical .Leader ))
228272
@@ -472,7 +516,12 @@ func (c *BFTClientImpl) sendCertificationRequest(ctx context.Context, rootHash s
472516 }
473517 c .logger .WithContext (ctx ).Info (fmt .Sprintf ("Round %d sending block certification request to root chain, IR hash %X" ,
474518 req .InputRecord .RoundNumber , req .InputRecord .Hash ))
475- rootIDs , err := rootNodesSelector (luc , c .rootNodes , defaultNofRootNodes )
519+
520+ tb , err := c .trustBaseStore .GetByEpoch (ctx , luc .GetRootEpoch ())
521+ if err != nil {
522+ return fmt .Errorf ("failed to load trust base for epoch %d: %w" , luc .GetRootEpoch (), err )
523+ }
524+ rootIDs , err := rootNodesSelector (luc , tb , defaultNofRootNodes )
476525 if err != nil {
477526 return fmt .Errorf ("selecting root nodes: %w" , err )
478527 }
0 commit comments