diff --git a/base_layer/core/src/base_node/state_machine_service/state_machine.rs b/base_layer/core/src/base_node/state_machine_service/state_machine.rs index c6b05fa88e..e4de5dea53 100644 --- a/base_layer/core/src/base_node/state_machine_service/state_machine.rs +++ b/base_layer/core/src/base_node/state_machine_service/state_machine.rs @@ -19,12 +19,13 @@ // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE // USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -use std::{future::Future, sync::Arc}; +use std::{future::Future, sync::Arc, time::Duration}; use futures::{future, future::Either}; use log::*; use randomx_rs::RandomXFlag; use serde::{Deserialize, Serialize}; +use tari_common::configuration::serializers; use tari_comms::{connectivity::ConnectivityRequester, PeerManager}; use tari_shutdown::ShutdownSignal; use tokio::sync::{broadcast, watch}; @@ -55,6 +56,12 @@ pub struct BaseNodeStateMachineConfig { /// The amount of blocks this node can be behind a peer before considered to be lagging (to test the block /// propagation by delaying lagging) pub blocks_behind_before_considered_lagging: u64, + /// The amount of time this node can know about a stronger chain before considered to be lagging. + /// This is to give a node time to receive the block via propagation, which is usually less network + /// intensive. Be careful of setting this higher than the block time, which would potentially cause it + /// to always be behind the network + #[serde(with = "serializers::seconds")] + pub time_before_considered_lagging: Duration, } #[allow(clippy::derivable_impls)] @@ -62,7 +69,8 @@ impl Default for BaseNodeStateMachineConfig { fn default() -> Self { Self { blockchain_sync_config: Default::default(), - blocks_behind_before_considered_lagging: 0, + blocks_behind_before_considered_lagging: 1, + time_before_considered_lagging: Duration::from_secs(10), } } } diff --git a/base_layer/core/src/base_node/state_machine_service/states/events_and_states.rs b/base_layer/core/src/base_node/state_machine_service/states/events_and_states.rs index c2721da086..9ed15d5026 100644 --- a/base_layer/core/src/base_node/state_machine_service/states/events_and_states.rs +++ b/base_layer/core/src/base_node/state_machine_service/states/events_and_states.rs @@ -90,12 +90,18 @@ pub enum SyncStatus { network: ChainMetadata, sync_peers: Vec, }, + // There is a stronger chain but we are less the `blocks_before_considered_lagging` blocks behind it. + BehindButNotYetLagging { + local: ChainMetadata, + network: ChainMetadata, + sync_peers: Vec, + }, UpToDate, } impl SyncStatus { pub fn is_lagging(&self) -> bool { - !self.is_up_to_date() + matches!(self, SyncStatus::Lagging { .. }) } pub fn is_up_to_date(&self) -> bool { @@ -117,6 +123,7 @@ impl Display for SyncStatus { network.accumulated_difficulty(), ), UpToDate => f.write_str("UpToDate"), + SyncStatus::BehindButNotYetLagging { .. } => f.write_str("Behind but not yet lagging"), } } } diff --git a/base_layer/core/src/base_node/state_machine_service/states/listening.rs b/base_layer/core/src/base_node/state_machine_service/states/listening.rs index a45a04e101..b087bea20d 100644 --- a/base_layer/core/src/base_node/state_machine_service/states/listening.rs +++ b/base_layer/core/src/base_node/state_machine_service/states/listening.rs @@ -24,7 +24,7 @@ use std::{ convert::TryFrom, fmt::{Display, Formatter}, ops::Deref, - time::{Duration, Instant}, + time::Instant, }; use log::*; @@ -56,9 +56,6 @@ use crate::{ const LOG_TARGET: &str = "c::bn::state_machine_service::states::listening"; -/// The length of time to wait for a propagated block when one block behind before proceeding to sync -const ONE_BLOCK_BEHIND_WAIT_PERIOD: Duration = Duration::from_secs(20); - /// This struct contains the info of the peer, and is used to serialised and deserialised. #[derive(Serialize, Deserialize)] pub struct PeerMetadata { @@ -170,35 +167,8 @@ impl Listening { } }; - let local = match shared.db.get_chain_metadata().await { - Ok(m) => m, - Err(e) => { - return FatalError(format!("Could not get local blockchain metadata. {}", e)); - }, - }; log_mdc::extend(mdc.clone()); - // If this node is just one block behind, wait for block propagation before - // rushing to sync mode - if self.is_synced && - peer_metadata.claimed_chain_metadata().height_of_longest_chain() == - local.height_of_longest_chain() + 1 && - time_since_better_block - .map(|ts: Instant| ts.elapsed() < ONE_BLOCK_BEHIND_WAIT_PERIOD) - .unwrap_or(true) - { - if time_since_better_block.is_none() { - time_since_better_block = Some(Instant::now()); - } - debug!( - target: LOG_TARGET, - "This node is one block behind. Best network metadata is at height {}.", - peer_metadata.claimed_chain_metadata().height_of_longest_chain() - ); - continue; - } - time_since_better_block = None; - let local_metadata = match shared.db.get_chain_metadata().await { Ok(m) => m, Err(e) => { @@ -213,6 +183,34 @@ impl Listening { peer_metadata, ); + // Generally we will receive a block via incoming blocks, but something might have + // happened that we have not synced to them, e.g. our network could have been down. + // If we know about a stronger chain, but haven't synced to it, because we didn't get + // the blocks propagated to us, or we have a high `blocks_before_considered_lagging` + // then we will wait at least `time_before_considered_lagging` before we try to sync + // to that new chain. If you want to sync to a new chain immediately, then you can + // set this value to 1 second or lower. + if let SyncStatus::BehindButNotYetLagging { + local, + network, + sync_peers, + } = &sync_mode + { + if time_since_better_block.is_none() { + time_since_better_block = Some(Instant::now()); + } + if time_since_better_block + .map(|t| t.elapsed() > shared.config.time_before_considered_lagging) + .unwrap() + { + return StateEvent::FallenBehind(SyncStatus::Lagging { + local: local.clone(), + network: network.clone(), + sync_peers: sync_peers.clone(), + }); + } + } + if sync_mode.is_lagging() { return StateEvent::FallenBehind(sync_mode); } @@ -300,7 +298,11 @@ fn determine_sync_mode( waiting for the propagated blocks", blocks_behind_before_considered_lagging ); - return UpToDate; + return SyncStatus::BehindButNotYetLagging { + local: local.clone(), + network: network.claimed_chain_metadata().clone(), + sync_peers: vec![network.clone().into()], + }; }; debug!( diff --git a/base_layer/p2p/src/services/liveness/service.rs b/base_layer/p2p/src/services/liveness/service.rs index 35eec23de1..29724b946a 100644 --- a/base_layer/p2p/src/services/liveness/service.rs +++ b/base_layer/p2p/src/services/liveness/service.rs @@ -575,7 +575,7 @@ mod test { let msg = create_dummy_message(PingPongMessage::pong_with_metadata(123, metadata.clone())); state.add_inflight_ping( - msg.inner.as_ref().map(|i| i.nonce.clone()).unwrap(), + msg.inner.as_ref().map(|i| i.nonce).unwrap(), msg.source_peer.node_id.clone(), ); diff --git a/common/config/presets/c_base_node.toml b/common/config/presets/c_base_node.toml index 5724ba20cf..4a2cd65882 100644 --- a/common/config/presets/c_base_node.toml +++ b/common/config/presets/c_base_node.toml @@ -140,8 +140,17 @@ track_reorgs = true # The maximum amount of VMs that RandomX will be use (default = 0) #max_randomx_vms = 0 # The amount of blocks this node can be behind a peer before considered to be lagging (to test the block -# propagation by delaying lagging) (default = 0) -#blocks_behind_before_considered_lagging = 0 +# propagation by delaying lagging, but also to give it time to receive the block via propagation, which is more network +# efficient) +# Note that time_before_considered_lagging will override this setting if the node sees a stronger chain for longer than +# that configured time. +# (default = 1) +#blocks_behind_before_considered_lagging = 1 +# The amount of time this node can know about a stronger chain before considered to be lagging. +# This is to give a node time to receive the block via propagation, which is usually less network +# intensive. Be careful of setting this higher than the block time, which would potentially cause it +# to always be behind the network (default = 10) (in seconds) +#time_before_considered_lagging = 10 [base_node.p2p] # The node's publicly-accessible hostname. This is the host name that is advertised on the network so that