Skip to content

Commit

Permalink
fix: better timeout for lagging
Browse files Browse the repository at this point in the history
  • Loading branch information
stringhandler committed Aug 31, 2023
1 parent dc5cfce commit f909905
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
// USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
use std::{future::Future, sync::Arc};
use std::{future::Future, sync::Arc, time::Duration};

use futures::{future, future::Either};
use log::*;
use randomx_rs::RandomXFlag;
use serde::{Deserialize, Serialize};
use tari_common::configuration::serializers;
use tari_comms::{connectivity::ConnectivityRequester, PeerManager};
use tari_shutdown::ShutdownSignal;
use tokio::sync::{broadcast, watch};
Expand Down Expand Up @@ -55,14 +56,21 @@ pub struct BaseNodeStateMachineConfig {
/// The amount of blocks this node can be behind a peer before considered to be lagging (to test the block
/// propagation by delaying lagging)
pub blocks_behind_before_considered_lagging: u64,
/// The amount of time this node can know about a stronger chain before considered to be lagging.
/// This is to give a node time to receive the block via propagation, which is usually less network
/// intensive. Be careful of setting this higher than the block time, which would potentially cause it
/// to always be behind the network
#[serde(with = "serializers::seconds")]
pub time_before_considered_lagging: Duration,
}

#[allow(clippy::derivable_impls)]
impl Default for BaseNodeStateMachineConfig {
fn default() -> Self {
Self {
blockchain_sync_config: Default::default(),
blocks_behind_before_considered_lagging: 0,
blocks_behind_before_considered_lagging: 1,
time_before_considered_lagging: Duration::from_secs(10),
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,18 @@ pub enum SyncStatus {
network: ChainMetadata,
sync_peers: Vec<SyncPeer>,
},
// There is a stronger chain but we are less the `blocks_before_considered_lagging` blocks behind it.
BehindButNotYetLagging {
local: ChainMetadata,
network: ChainMetadata,
sync_peers: Vec<SyncPeer>,
},
UpToDate,
}

impl SyncStatus {
pub fn is_lagging(&self) -> bool {
!self.is_up_to_date()
matches!(self, SyncStatus::Lagging { .. })
}

pub fn is_up_to_date(&self) -> bool {
Expand All @@ -117,6 +123,7 @@ impl Display for SyncStatus {
network.accumulated_difficulty(),
),
UpToDate => f.write_str("UpToDate"),
SyncStatus::BehindButNotYetLagging { .. } => f.write_str("Behind but not yet lagging"),
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use std::{
convert::TryFrom,
fmt::{Display, Formatter},
ops::Deref,
time::{Duration, Instant},
time::Instant,
};

use log::*;
Expand Down Expand Up @@ -56,9 +56,6 @@ use crate::{

const LOG_TARGET: &str = "c::bn::state_machine_service::states::listening";

/// The length of time to wait for a propagated block when one block behind before proceeding to sync
const ONE_BLOCK_BEHIND_WAIT_PERIOD: Duration = Duration::from_secs(20);

/// This struct contains the info of the peer, and is used to serialised and deserialised.
#[derive(Serialize, Deserialize)]
pub struct PeerMetadata {
Expand Down Expand Up @@ -170,35 +167,8 @@ impl Listening {
}
};

let local = match shared.db.get_chain_metadata().await {
Ok(m) => m,
Err(e) => {
return FatalError(format!("Could not get local blockchain metadata. {}", e));
},
};
log_mdc::extend(mdc.clone());

// If this node is just one block behind, wait for block propagation before
// rushing to sync mode
if self.is_synced &&
peer_metadata.claimed_chain_metadata().height_of_longest_chain() ==
local.height_of_longest_chain() + 1 &&
time_since_better_block
.map(|ts: Instant| ts.elapsed() < ONE_BLOCK_BEHIND_WAIT_PERIOD)
.unwrap_or(true)
{
if time_since_better_block.is_none() {
time_since_better_block = Some(Instant::now());
}
debug!(
target: LOG_TARGET,
"This node is one block behind. Best network metadata is at height {}.",
peer_metadata.claimed_chain_metadata().height_of_longest_chain()
);
continue;
}
time_since_better_block = None;

let local_metadata = match shared.db.get_chain_metadata().await {
Ok(m) => m,
Err(e) => {
Expand All @@ -213,6 +183,34 @@ impl Listening {
peer_metadata,
);

// Generally we will receive a block via incoming blocks, but something might have
// happened that we have not synced to them, e.g. our network could have been down.
// If we know about a stronger chain, but haven't synced to it, because we didn't get
// the blocks propagated to us, or we have a high `blocks_before_considered_lagging`
// then we will wait at least `time_before_considered_lagging` before we try to sync
// to that new chain. If you want to sync to a new chain immediately, then you can
// set this value to 1 second or lower.
if let SyncStatus::BehindButNotYetLagging {
local,
network,
sync_peers,
} = &sync_mode
{
if time_since_better_block.is_none() {
time_since_better_block = Some(Instant::now());
}
if time_since_better_block
.map(|t| t.elapsed() > shared.config.time_before_considered_lagging)
.unwrap()
{
return StateEvent::FallenBehind(SyncStatus::Lagging {
local: local.clone(),
network: network.clone(),
sync_peers: sync_peers.clone(),
});
}
}

if sync_mode.is_lagging() {
return StateEvent::FallenBehind(sync_mode);
}
Expand Down Expand Up @@ -300,7 +298,11 @@ fn determine_sync_mode(
waiting for the propagated blocks",
blocks_behind_before_considered_lagging
);
return UpToDate;
return SyncStatus::BehindButNotYetLagging {
local: local.clone(),
network: network.claimed_chain_metadata().clone(),
sync_peers: vec![network.clone().into()],
};
};

debug!(
Expand Down
2 changes: 1 addition & 1 deletion base_layer/p2p/src/services/liveness/service.rs
Original file line number Diff line number Diff line change
Expand Up @@ -575,7 +575,7 @@ mod test {
let msg = create_dummy_message(PingPongMessage::pong_with_metadata(123, metadata.clone()));

state.add_inflight_ping(
msg.inner.as_ref().map(|i| i.nonce.clone()).unwrap(),
msg.inner.as_ref().map(|i| i.nonce).unwrap(),
msg.source_peer.node_id.clone(),
);

Expand Down
13 changes: 11 additions & 2 deletions common/config/presets/c_base_node.toml
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,17 @@ track_reorgs = true
# The maximum amount of VMs that RandomX will be use (default = 0)
#max_randomx_vms = 0
# The amount of blocks this node can be behind a peer before considered to be lagging (to test the block
# propagation by delaying lagging) (default = 0)
#blocks_behind_before_considered_lagging = 0
# propagation by delaying lagging, but also to give it time to receive the block via propagation, which is more network
# efficient)
# Note that time_before_considered_lagging will override this setting if the node sees a stronger chain for longer than
# that configured time.
# (default = 1)
#blocks_behind_before_considered_lagging = 1
# The amount of time this node can know about a stronger chain before considered to be lagging.
# This is to give a node time to receive the block via propagation, which is usually less network
# intensive. Be careful of setting this higher than the block time, which would potentially cause it
# to always be behind the network (default = 10) (in seconds)
#time_before_considered_lagging = 10

[base_node.p2p]
# The node's publicly-accessible hostname. This is the host name that is advertised on the network so that
Expand Down

0 comments on commit f909905

Please sign in to comment.