-
Notifications
You must be signed in to change notification settings - Fork 710
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Make approval-distribution aggression a bit more robust and less spammy #6696
base: alexggh/fix_possible_bug
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -318,7 +318,7 @@ impl Default for AggressionConfig { | |
fn default() -> Self { | ||
AggressionConfig { | ||
l1_threshold: Some(16), | ||
l2_threshold: Some(28), | ||
l2_threshold: Some(64), | ||
resend_unfinalized_period: Some(8), | ||
} | ||
} | ||
|
@@ -514,6 +514,8 @@ struct BlockEntry { | |
vrf_story: RelayVRFStory, | ||
/// The block slot. | ||
slot: Slot, | ||
/// Backing of from re-sending messages to peers. | ||
last_resent_at_block_number: Option<u32>, | ||
} | ||
|
||
impl BlockEntry { | ||
|
@@ -880,6 +882,7 @@ impl State { | |
candidates_metadata: meta.candidates, | ||
vrf_story: meta.vrf_story, | ||
slot: meta.slot, | ||
last_resent_at_block_number: None, | ||
}); | ||
|
||
self.topologies.inc_session_refs(meta.session); | ||
|
@@ -1319,6 +1322,33 @@ impl State { | |
self.enable_aggression(network_sender, Resend::No, metrics).await; | ||
} | ||
|
||
// When finality is lagging as a last resort nodes start sending the messages they have | ||
// multiples times. This means it is safe to accept duplicate messages without punishing the | ||
// peer and reduce the reputation and can end up banning the Peer, which in turn will create | ||
// more no-shows. | ||
fn accept_duplicates_from_validators( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would safer to accept only a bounded amount of duplicates over time. |
||
blocks_by_number: &BTreeMap<BlockNumber, Vec<Hash>>, | ||
topologies: &SessionGridTopologies, | ||
aggression_config: &AggressionConfig, | ||
entry: &BlockEntry, | ||
peer: PeerId, | ||
) -> bool { | ||
let topology = topologies.get_topology(entry.session); | ||
let min_age = blocks_by_number.iter().next().map(|(num, _)| num); | ||
let max_age = blocks_by_number.iter().rev().next().map(|(num, _)| num); | ||
|
||
// Return if we don't have at least 1 block. | ||
let (min_age, max_age) = match (min_age, max_age) { | ||
(Some(min), Some(max)) => (*min, *max), | ||
_ => return false, | ||
}; | ||
|
||
let age = max_age.saturating_sub(min_age); | ||
|
||
aggression_config.should_trigger_aggression(age) && | ||
topology.map(|topology| topology.is_validator(&peer)).unwrap_or(false) | ||
} | ||
|
||
async fn import_and_circulate_assignment<A, N, RA, R>( | ||
&mut self, | ||
approval_voting_sender: &mut A, | ||
|
@@ -1390,13 +1420,22 @@ impl State { | |
"Duplicate assignment", | ||
); | ||
|
||
modify_reputation( | ||
&mut self.reputation, | ||
network_sender, | ||
if !Self::accept_duplicates_from_validators( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. above log could show if duplicate is accepted. |
||
&self.blocks_by_number, | ||
&self.topologies, | ||
&self.aggression_config, | ||
entry, | ||
peer_id, | ||
COST_DUPLICATE_MESSAGE, | ||
) | ||
.await; | ||
) { | ||
modify_reputation( | ||
&mut self.reputation, | ||
network_sender, | ||
peer_id, | ||
COST_DUPLICATE_MESSAGE, | ||
) | ||
.await; | ||
} | ||
|
||
metrics.on_assignment_duplicate(); | ||
} else { | ||
gum::trace!( | ||
|
@@ -1712,6 +1751,9 @@ impl State { | |
assignments_knowledge_key: &Vec<(MessageSubject, MessageKind)>, | ||
approval_knowledge_key: &(MessageSubject, MessageKind), | ||
entry: &mut BlockEntry, | ||
blocks_by_number: &BTreeMap<BlockNumber, Vec<Hash>>, | ||
topologies: &SessionGridTopologies, | ||
aggression_config: &AggressionConfig, | ||
reputation: &mut ReputationAggregator, | ||
peer_id: PeerId, | ||
metrics: &Metrics, | ||
|
@@ -1746,14 +1788,21 @@ impl State { | |
?approval_knowledge_key, | ||
"Duplicate approval", | ||
); | ||
|
||
modify_reputation( | ||
reputation, | ||
network_sender, | ||
if !Self::accept_duplicates_from_validators( | ||
blocks_by_number, | ||
topologies, | ||
aggression_config, | ||
entry, | ||
peer_id, | ||
COST_DUPLICATE_MESSAGE, | ||
) | ||
.await; | ||
) { | ||
modify_reputation( | ||
reputation, | ||
network_sender, | ||
peer_id, | ||
COST_DUPLICATE_MESSAGE, | ||
) | ||
.await; | ||
} | ||
metrics.on_approval_duplicate(); | ||
} | ||
return false | ||
|
@@ -1845,6 +1894,9 @@ impl State { | |
&assignments_knowledge_keys, | ||
&approval_knwowledge_key, | ||
entry, | ||
&self.blocks_by_number, | ||
&self.topologies, | ||
&self.aggression_config, | ||
&mut self.reputation, | ||
peer_id, | ||
metrics, | ||
|
@@ -2255,18 +2307,42 @@ impl State { | |
&self.topologies, | ||
|block_entry| { | ||
let block_age = max_age - block_entry.number; | ||
// We want to resend only for blocks between min_age and min_age + | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe it should be sufficient that we trigger aggression only for oldest unfinalized block. From what I've seen in production and testing is that usuallt are a few unapproved candidates holding finality on a particular block. Resending for all unfinalized blocks doesn't really make sense, it's better to make small incremental progress than send large bursts of messages. So I would just remove the Am I missing anything ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't think you are, I was think about the same thing, but I did not know the initial reason for using |
||
// resend_unfinalized_period, there is no point in resending for blocks newer than | ||
// that, because we are just going to create load and not gain anything. | ||
let diff_from_min_age = block_entry.number - min_age; | ||
|
||
// We want to back-off on resending for blocks that have been resent recently, to | ||
// give time for nodes to process all the extra messages, if we still have not | ||
// finalized we are going to resend again after unfinalized_period * 3 since the | ||
// last resend. | ||
let blocks_since_last_sent = block_entry | ||
.last_resent_at_block_number | ||
.map(|last_resent_at_block_number| max_age - last_resent_at_block_number); | ||
|
||
let can_resend_at_this_age = blocks_since_last_sent | ||
.zip(config.resend_unfinalized_period) | ||
.map(|(blocks_since_last_sent, unfinalized_period)| { | ||
blocks_since_last_sent > unfinalized_period * 3 | ||
}) | ||
.unwrap_or(true); | ||
|
||
if resend == Resend::Yes && | ||
config | ||
.resend_unfinalized_period | ||
.as_ref() | ||
.map_or(false, |p| block_age > 0 && block_age % p == 0) | ||
{ | ||
config.resend_unfinalized_period.as_ref().map_or(false, |p| { | ||
block_age > 0 && | ||
block_age % p == 0 && diff_from_min_age < *p && | ||
can_resend_at_this_age | ||
}) { | ||
// Retry sending to all peers. | ||
for (_, knowledge) in block_entry.known_by.iter_mut() { | ||
knowledge.sent = Knowledge::default(); | ||
} | ||
|
||
block_entry.last_resent_at_block_number = Some(max_age); | ||
gum::debug!( | ||
target: LOG_TARGET, | ||
block_number = ?block_entry.number, | ||
"Aggression enabled with resend for block", | ||
); | ||
true | ||
} else { | ||
false | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.