Skip to content

Commit c53aa40

Browse files
authored
Fix missing committee blob handling. (#4978) (#4993)
Partial backport of #4978. ## Motivation The current `Committee` implementation is a bit complicated, and sometimes sets the quorum threshold higher than necessary. ## Proposal #4978 revealed an issue with the certificate handling logic in the worker: `committees_for` returns `ViewErrors` instead of `BlobsNotFound`/`EventsNotFound`, so the client fails to send the validators the admin chain if needed. This was fixed; I also kept minor cleanups I made while debugging this. I got a few stack overflows, so I doubled the client's Tokio thread stack size. ## Test Plan CI ## Release Plan - The fixes (but maybe not the quorum change) should be - released in a new SDK and - released in a validator hotfix. ## Links - PR to main: #4978 - [reviewer checklist](https://github.com/linera-io/linera-protocol/blob/main/CONTRIBUTING.md#reviewer-checklist)
1 parent a1f6a5e commit c53aa40

File tree

9 files changed

+165
-178
lines changed

9 files changed

+165
-178
lines changed

linera-core/src/chain_worker/state.rs

Lines changed: 29 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ use linera_base::{
1818
},
1919
ensure,
2020
hashed::Hashed,
21-
identifiers::{AccountOwner, ApplicationId, BlobId, BlobType, ChainId, EventId, StreamId},
21+
identifiers::{AccountOwner, ApplicationId, BlobId, BlobType, ChainId},
2222
};
2323
use linera_chain::{
2424
data_types::{
@@ -30,7 +30,7 @@ use linera_chain::{
3030
ChainError, ChainExecutionContext, ChainStateView, ExecutionResultExt as _,
3131
};
3232
use linera_execution::{
33-
system::EPOCH_STREAM_NAME, Committee, ExecutionStateView, Query, QueryContext, QueryOutcome,
33+
Committee, ExecutionRuntimeContext as _, ExecutionStateView, Query, QueryContext, QueryOutcome,
3434
ServiceRuntimeEndpoint,
3535
};
3636
use linera_storage::{Clock as _, ResultReadCertificates, Storage};
@@ -659,6 +659,14 @@ where
659659
// Check that the chain is active and ready for this validated block.
660660
// Verify the certificate. Returns a catch-all error to make client code more robust.
661661
self.initialize_and_save_if_needed().await?;
662+
let tip_state = self.chain.tip_state.get();
663+
ensure!(
664+
header.height == tip_state.next_block_height,
665+
ChainError::UnexpectedBlockHeight {
666+
expected_block_height: tip_state.next_block_height,
667+
found_block_height: header.height,
668+
}
669+
);
662670
let (epoch, committee) = self.chain.current_committee()?;
663671
check_block_epoch(epoch, header.chain_id, header.epoch)?;
664672
certificate.check(committee)?;
@@ -747,21 +755,23 @@ where
747755
{
748756
certificate.check(committee)?;
749757
} else {
750-
let committees = self.storage.committees_for(epoch..=epoch).await?;
751-
let Some(committee) = committees.get(&epoch) else {
752-
let net_description = self
753-
.storage
754-
.read_network_description()
755-
.await?
756-
.ok_or_else(|| WorkerError::MissingNetworkDescription)?;
757-
return Err(WorkerError::EventsNotFound(vec![EventId {
758-
chain_id: net_description.admin_chain_id,
759-
stream_id: StreamId::system(EPOCH_STREAM_NAME),
760-
index: epoch.0,
761-
}]));
762-
};
763-
// This line is duplicated, but this avoids cloning and a lifetimes error.
764-
certificate.check(committee)?;
758+
let committee = self
759+
.chain
760+
.execution_state
761+
.context()
762+
.extra()
763+
.get_committees(epoch..=epoch)
764+
.await
765+
.map_err(|error| {
766+
ChainError::ExecutionError(Box::new(error), ChainExecutionContext::Block)
767+
})?
768+
.remove(&epoch)
769+
.ok_or_else(|| {
770+
ChainError::InternalError(format!(
771+
"missing committee for epoch {epoch}; this is a bug"
772+
))
773+
})?;
774+
certificate.check(&committee)?;
765775
}
766776

767777
// Certificate check passed - which means the blobs the block requires are legitimate and
@@ -836,17 +846,6 @@ where
836846
.filter_map(|blob_id| blobs.remove(blob_id))
837847
.collect::<Vec<_>>();
838848

839-
// If height is zero, we haven't initialized the chain state or verified the epoch before -
840-
// do it now.
841-
// This will fail if the chain description blob is still missing - but that's alright,
842-
// because we already wrote the blob state above, so the client can now upload the
843-
// blob, which will get accepted, and retry.
844-
if height == BlockHeight::ZERO {
845-
self.initialize_and_save_if_needed().await?;
846-
let (epoch, _) = self.chain.current_committee()?;
847-
check_block_epoch(epoch, chain_id, block.header.epoch)?;
848-
}
849-
850849
// Execute the block and update inboxes.
851850
let local_time = self.storage.clock().current_time();
852851
let chain = &mut self.chain;
@@ -1382,6 +1381,8 @@ where
13821381
} = &proposal;
13831382
let block = &content.block;
13841383
let chain = &self.chain;
1384+
// Check if the chain is ready for this new block proposal.
1385+
chain.tip_state.get().verify_block_chaining(block)?;
13851386
// Check the epoch.
13861387
let (epoch, committee) = chain.current_committee()?;
13871388
check_block_epoch(epoch, block.chain_id, block.epoch)?;
@@ -1431,8 +1432,6 @@ where
14311432
original_proposal.check_signature()?;
14321433
}
14331434
}
1434-
// Check if the chain is ready for this new block proposal.
1435-
chain.tip_state.get().verify_block_chaining(block)?;
14361435
if chain.manager.check_proposed_block(&proposal)? == manager::Outcome::Skip {
14371436
// We already voted for this block.
14381437
return Ok((self.chain_info_response(), NetworkActions::default()));

linera-core/src/updater.rs

Lines changed: 19 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ where
263263
let mut sent_admin_chain = false;
264264
let mut sent_blobs = false;
265265
loop {
266-
result = match result {
266+
match result {
267267
Err(NodeError::EventsNotFound(event_ids))
268268
if !sent_admin_chain
269269
&& certificate.inner().chain_id() != self.admin_id
@@ -275,9 +275,6 @@ where
275275
// The validator doesn't have the committee that signed the certificate.
276276
self.update_admin_chain().await?;
277277
sent_admin_chain = true;
278-
self.remote_node
279-
.handle_confirmed_certificate(certificate.clone(), delivery)
280-
.await
281278
}
282279
Err(NodeError::BlobsNotFound(blob_ids)) if !sent_blobs => {
283280
// The validator is missing the blobs required by the certificate.
@@ -292,12 +289,13 @@ where
292289
let blobs = maybe_blobs.ok_or(NodeError::BlobsNotFound(blob_ids))?;
293290
self.remote_node.node.upload_blobs(blobs).await?;
294291
sent_blobs = true;
295-
self.remote_node
296-
.handle_confirmed_certificate(certificate.clone(), delivery)
297-
.await
298292
}
299293
result => return Ok(result?),
300-
};
294+
}
295+
result = self
296+
.remote_node
297+
.handle_confirmed_certificate(certificate.clone(), delivery)
298+
.await;
301299
}
302300
}
303301

@@ -312,7 +310,7 @@ where
312310
.await;
313311

314312
let chain_id = certificate.inner().chain_id();
315-
Ok(match &result {
313+
match &result {
316314
Err(original_err @ NodeError::BlobsNotFound(blob_ids)) => {
317315
self.remote_node
318316
.check_blobs_not_found(&certificate, blob_ids)?;
@@ -325,9 +323,6 @@ where
325323
.await?
326324
.ok_or_else(|| original_err.clone())?;
327325
self.remote_node.send_pending_blobs(chain_id, blobs).await?;
328-
self.remote_node
329-
.handle_validated_certificate(certificate)
330-
.await
331326
}
332327
Err(error) => {
333328
self.sync_if_needed(
@@ -337,12 +332,13 @@ where
337332
error,
338333
)
339334
.await?;
340-
self.remote_node
341-
.handle_validated_certificate(certificate)
342-
.await
343335
}
344-
_ => result,
345-
}?)
336+
_ => return Ok(result?),
337+
}
338+
Ok(self
339+
.remote_node
340+
.handle_validated_certificate(certificate)
341+
.await?)
346342
}
347343

348344
/// Requests a vote for a timeout certificate for the given round from the remote node.
@@ -658,22 +654,15 @@ where
658654
.ok_or_else(|| ChainClientError::MissingConfirmedBlock(hash))?
659655
};
660656
let info = match self.send_confirmed_certificate(certificate, delivery).await {
661-
Err(ChainClientError::RemoteNodeError(NodeError::EventsNotFound(event_ids)))
662-
if event_ids.iter().all(|event_id| {
663-
event_id.stream_id == StreamId::system(EPOCH_STREAM_NAME)
664-
&& event_id.chain_id == self.admin_id
665-
}) =>
666-
{
667-
if chain_id != self.admin_id {
668-
tracing::error!(
669-
"Missing epochs were not handled by send_confirmed_certificate."
670-
);
671-
}
657+
Ok(info) => info,
658+
Err(error) => {
659+
tracing::debug!(
660+
address = self.remote_node.address(), %error,
661+
"validator failed to handle confirmed certificate; sending whole chain",
662+
);
672663
let query = ChainInfoQuery::new(chain_id);
673664
self.remote_node.handle_chain_info_query(query).await?
674665
}
675-
Err(err) => return Err(err),
676-
Ok(info) => info,
677666
};
678667
// Obtain the missing blocks and the manager state from the local node.
679668
let heights = (info.next_block_height.0..target_block_height.0).map(BlockHeight);

linera-core/src/worker.rs

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,9 @@ pub enum WorkerError {
171171
#[error(transparent)]
172172
ChainError(#[from] Box<ChainError>),
173173

174+
#[error(transparent)]
175+
BcsError(#[from] bcs::Error),
176+
174177
// Chain access control
175178
#[error("Block was not signed by an authorized owner")]
176179
InvalidOwner,
@@ -278,7 +281,8 @@ impl WorkerError {
278281
| WorkerError::UnexpectedBlob
279282
| WorkerError::TooManyPublishedBlobs(_)
280283
| WorkerError::ViewError(ViewError::NotFound(_)) => false,
281-
WorkerError::InvalidCrossChainRequest
284+
WorkerError::BcsError(_)
285+
| WorkerError::InvalidCrossChainRequest
282286
| WorkerError::ViewError(_)
283287
| WorkerError::ConfirmedLogEntryNotFound { .. }
284288
| WorkerError::PreprocessedBlocksEntryNotFound { .. }
@@ -296,16 +300,14 @@ impl From<ChainError> for WorkerError {
296300
#[instrument(level = "trace", skip(chain_error))]
297301
fn from(chain_error: ChainError) -> Self {
298302
match chain_error {
299-
ChainError::ExecutionError(execution_error, context) => {
300-
if let ExecutionError::BlobsNotFound(blob_ids) = *execution_error {
301-
Self::BlobsNotFound(blob_ids)
302-
} else {
303-
Self::ChainError(Box::new(ChainError::ExecutionError(
304-
execution_error,
305-
context,
306-
)))
307-
}
308-
}
303+
ChainError::ExecutionError(execution_error, context) => match *execution_error {
304+
ExecutionError::BlobsNotFound(blob_ids) => Self::BlobsNotFound(blob_ids),
305+
ExecutionError::EventsNotFound(event_ids) => Self::EventsNotFound(event_ids),
306+
_ => Self::ChainError(Box::new(ChainError::ExecutionError(
307+
execution_error,
308+
context,
309+
))),
310+
},
309311
error => Self::ChainError(Box::new(error)),
310312
}
311313
}

linera-execution/src/committee.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@ pub struct Committee {
6868
total_votes: u64,
6969
/// The threshold to form a quorum.
7070
quorum_threshold: u64,
71-
/// The threshold to prove the validity of a statement.
71+
/// The threshold to prove the validity of a statement. I.e. the assumption is that strictly
72+
/// less than `validity_threshold` are faulty.
7273
validity_threshold: u64,
7374
/// The policy agreed on for this epoch.
7475
policy: ResourceControlPolicy,

0 commit comments

Comments
 (0)