Skip to content

Commit

Permalink
feat: add error type label to the near_num_invalid_blocks metric (#…
Browse files Browse the repository at this point in the history
…10164)

The metric `near_num_invalid_blocks` counts the number of invalid blocks
processed by neard. Up until now there was no information why the blocks
are invalid. Let's add a label that describes what kind of error caused
the block to be invalid. This will make it easier to diagnose what's
wrong when there are lots of invalid blocks.

The label is called "error", an example prometheus report looks like
this:
```
near_num_invalid_blocks{error="chunks_missing"} 1234
```

Fixes: #9661
  • Loading branch information
jancionear authored Nov 16, 2023
1 parent bfb3b58 commit 9a7b03a
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 3 deletions.
64 changes: 64 additions & 0 deletions chain/chain-primitives/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,70 @@ impl Error {
_ => false,
}
}

/// Returns the value of the `error` label used when this error is reported in prometheus
/// metrics.
///
/// Some blockchain errors are reported in the prometheus metrics. In such cases a report might
/// contain a label that specifies the type of error that has occurred. For example when the
/// node receives a block with an invalid signature this would be reported as:
/// `near_num_invalid_blocks{error="invalid_signature"}`.
///
/// Every variant maps to a fixed `&'static str`; payloads carried by a variant are ignored.
/// The match deliberately has no catch-all arm, so adding a new `Error` variant without
/// choosing a label for it is a compile error.
pub fn prometheus_label_value(&self) -> &'static str {
    match self {
        Error::BlockKnown(_) => "block_known",
        Error::TooManyProcessingBlocks => "too_many_processing_blocks",
        Error::Orphan => "orphan",
        Error::ChunkMissing(_) => "chunk_missing",
        Error::ChunksMissing(_) => "chunks_missing",
        Error::InvalidChunkHeight => "invalid_chunk_height",
        Error::IOErr(_) => "io_err",
        Error::Other(_) => "other",
        Error::ValidatorError(_) => "validator_error",
        Error::EpochOutOfBounds(_) => "epoch_out_of_bounds",
        Error::ChallengedBlockOnChain => "challenged_block_on_chain",
        Error::CannotBeFinalized => "cannot_be_finalized",
        Error::StorageError(_) => "storage_error",
        Error::GCError(_) => "gc_error",
        Error::DBNotFoundErr(_) => "db_not_found_err",
        Error::InvalidBlockPastTime(_, _) => "invalid_block_past_time",
        Error::InvalidBlockFutureTime(_) => "invalid_block_future_time",
        Error::InvalidBlockHeight(_) => "invalid_block_height",
        Error::InvalidBlockProposer => "invalid_block_proposer",
        Error::InvalidChunk => "invalid_chunk",
        Error::InvalidChunkProofs(_) => "invalid_chunk_proofs",
        Error::InvalidChunkState(_) => "invalid_chunk_state",
        Error::InvalidChunkMask => "invalid_chunk_mask",
        Error::InvalidStateRoot => "invalid_state_root",
        Error::InvalidTxRoot => "invalid_tx_root",
        Error::InvalidChunkReceiptsRoot => "invalid_chunk_receipts_root",
        Error::InvalidOutcomesProof => "invalid_outcomes_proof",
        Error::InvalidChunkHeadersRoot => "invalid_chunk_headers_root",
        Error::InvalidChunkTxRoot => "invalid_chunk_tx_root",
        Error::InvalidReceiptsProof => "invalid_receipts_proof",
        Error::InvalidStatePayload => "invalid_state_payload",
        Error::InvalidTransactions => "invalid_transactions",
        Error::InvalidChallenge => "invalid_challenge",
        Error::InvalidSplitShardsIds(_, _) => "invalid_split_shard_ids",
        Error::MaliciousChallenge => "malicious_challenge",
        Error::IncorrectNumberOfChunkHeaders => "incorrect_number_of_chunk_headers",
        Error::InvalidEpochHash => "invalid_epoch_hash",
        Error::InvalidNextBPHash => "invalid_next_bp_hash",
        Error::NotEnoughApprovals => "not_enough_approvals",
        Error::InvalidFinalityInfo => "invalid_finality_info",
        Error::InvalidValidatorProposals => "invalid_validator_proposals",
        Error::InvalidSignature => "invalid_signature",
        Error::InvalidApprovals => "invalid_approvals",
        Error::InvalidGasLimit => "invalid_gas_limit",
        Error::InvalidGasPrice => "invalid_gas_price",
        Error::InvalidGasUsed => "invalid_gas_used",
        Error::InvalidBalanceBurnt => "invalid_balance_burnt",
        Error::InvalidShardId(_) => "invalid_shard_id",
        Error::InvalidStateRequest(_) => "invalid_state_request",
        Error::InvalidRandomnessBeaconOutput => "invalid_randomness_beacon_output",
        // Spelled "merkle" to match the variant name; the label is what dashboards query on.
        Error::InvalidBlockMerkleRoot => "invalid_block_merkle_root",
        Error::InvalidProtocolVersion => "invalid_protocol_version",
        Error::NotAValidator => "not_a_validator",
        Error::InvalidChallengeRoot => "invalid_challenge_root",
    }
}
}

impl From<EpochError> for Error {
Expand Down
2 changes: 1 addition & 1 deletion chain/chain/src/chain.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1095,7 +1095,7 @@ impl Chain {
}

fn maybe_mark_block_invalid(&mut self, block_hash: CryptoHash, error: &Error) {
metrics::NUM_INVALID_BLOCKS.inc();
metrics::NUM_INVALID_BLOCKS.with_label_values(&[error.prometheus_label_value()]).inc();
// We only mark the block as invalid if the block has bad data (not for other errors that would
// not be the fault of the block itself), except when the block has a bad signature which means
// the block might not have been what the block producer originally produced. Either way, it's
Expand Down
5 changes: 3 additions & 2 deletions chain/chain/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,9 @@ pub static STATE_PART_ELAPSED: Lazy<HistogramVec> = Lazy::new(|| {
)
.unwrap()
});
pub static NUM_INVALID_BLOCKS: Lazy<IntGauge> = Lazy::new(|| {
try_create_int_gauge("near_num_invalid_blocks", "Number of invalid blocks").unwrap()
/// Gauge vector registered as `near_num_invalid_blocks` with the help text
/// "Number of invalid blocks", partitioned by a single `error` label.
///
/// `.unwrap()` is applied to the result of `try_create_int_gauge_vec`, so a failure to create
/// the metric panics instead of being propagated.
// NOTE(review): the only visible use of this metric is `.inc()`; if it is never decremented or
// set, a counter vector may describe it better than a gauge vector — confirm before changing.
pub static NUM_INVALID_BLOCKS: Lazy<IntGaugeVec> = Lazy::new(|| {
try_create_int_gauge_vec("near_num_invalid_blocks", "Number of invalid blocks", &["error"])
.unwrap()
});
pub(crate) static SCHEDULED_CATCHUP_BLOCK: Lazy<IntGauge> = Lazy::new(|| {
try_create_int_gauge(
Expand Down

0 comments on commit 9a7b03a

Please sign in to comment.