Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue tenure extend if incoming miner fails to mine #5805

Open
wants to merge 16 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@ and this project adheres to the versioning scheme outlined in the [README.md](RE
### Added

- Add miner configuration option `tenure_extend_cost_threshold` to specify the percentage of the tenure budget that must be spent before a time-based tenure extend is attempted
- Add miner configuration option `tenure_extend_wait_timeout_ms` to specify the time to wait to try to continue a tenure if a BlockFound is expected

### Changed

- Miner will include other transactions in blocks with tenure extend transactions (#5760)
- Add `block_rejection_timeout_steps` to miner configuration for defining rejections-based timeouts while waiting for signers response (#5705)
- Miner will not issue a tenure extend until at least half of the block budget has been spent (#5757)
- Miner will issue a tenure extend if the incoming miner has failed to produce a block (#5729)

### Fixed

Expand Down
1 change: 1 addition & 0 deletions stacks-signer/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ and this project adheres to the versioning scheme outlined in the [README.md](RE
`StackerDB` messages, it logs `INFO` messages. Other interactions with the `stacks-node`
behave normally (e.g., submitting validation requests, submitting finished blocks). A
dry run signer will error out if the supplied key is actually a registered signer.
- Reduce default value of `block_proposal_timeout_ms` to 120_000

## [3.1.0.0.4.0]

Expand Down
2 changes: 1 addition & 1 deletion stacks-signer/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ use stacks_common::util::hash::Hash160;
use crate::client::SignerSlotID;

const EVENT_TIMEOUT_MS: u64 = 5000;
const BLOCK_PROPOSAL_TIMEOUT_MS: u64 = 600_000;
const BLOCK_PROPOSAL_TIMEOUT_MS: u64 = 120_000;
const BLOCK_PROPOSAL_VALIDATION_TIMEOUT_MS: u64 = 120_000;
const DEFAULT_FIRST_PROPOSAL_BURN_BLOCK_TIMING_SECS: u64 = 60;
const DEFAULT_TENURE_LAST_BLOCK_PROPOSAL_TIMEOUT_SECS: u64 = 30;
Expand Down
15 changes: 11 additions & 4 deletions stackslib/src/config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ const DEFAULT_TENURE_COST_LIMIT_PER_BLOCK_PERCENTAGE: u8 = 25;
/// see if we need to extend the ongoing tenure (e.g. because the current
/// sortition is empty or invalid).
const DEFAULT_TENURE_EXTEND_POLL_SECS: u64 = 1;
/// Default number of millis to wait to try to continue a tenure if a BlockFound is expected
const DEFAULT_TENURE_EXTEND_WAIT_MS: u64 = 120_000;
/// Default duration to wait before attempting to issue a tenure extend.
/// This should be greater than the signers' timeout. This is used for issuing
/// fallback tenure extends
Expand Down Expand Up @@ -2177,9 +2179,11 @@ pub struct MinerConfig {
pub block_commit_delay: Duration,
/// The percentage of the remaining tenure cost limit to consume each block.
pub tenure_cost_limit_per_block_percentage: Option<u8>,
/// The number of seconds to wait in-between polling the sortition DB to see if we need to
/// Duration to wait in-between polling the sortition DB to see if we need to
/// extend the ongoing tenure (e.g. because the current sortition is empty or invalid).
pub tenure_extend_poll_secs: Duration,
pub tenure_extend_poll_timeout: Duration,
/// Duration to wait to try to continue a tenure if a BlockFound is expected
jferrant marked this conversation as resolved.
Show resolved Hide resolved
pub tenure_extend_wait_timeout: Duration,
/// Duration to wait before attempting to issue a tenure extend
pub tenure_timeout: Duration,
/// Percentage of block budget that must be used before attempting a time-based tenure extend
Expand Down Expand Up @@ -2222,7 +2226,8 @@ impl Default for MinerConfig {
tenure_cost_limit_per_block_percentage: Some(
DEFAULT_TENURE_COST_LIMIT_PER_BLOCK_PERCENTAGE,
),
tenure_extend_poll_secs: Duration::from_secs(DEFAULT_TENURE_EXTEND_POLL_SECS),
tenure_extend_poll_timeout: Duration::from_secs(DEFAULT_TENURE_EXTEND_POLL_SECS),
tenure_extend_wait_timeout: Duration::from_millis(DEFAULT_TENURE_EXTEND_WAIT_MS),
tenure_timeout: Duration::from_secs(DEFAULT_TENURE_TIMEOUT_SECS),
tenure_extend_cost_threshold: DEFAULT_TENURE_EXTEND_COST_THRESHOLD,

Expand Down Expand Up @@ -2629,6 +2634,7 @@ pub struct MinerConfigFile {
pub block_commit_delay_ms: Option<u64>,
pub tenure_cost_limit_per_block_percentage: Option<u8>,
pub tenure_extend_poll_secs: Option<u64>,
pub tenure_extend_wait_timeout_ms: Option<u64>,
pub tenure_timeout_secs: Option<u64>,
pub tenure_extend_cost_threshold: Option<u64>,
pub block_rejection_timeout_steps: Option<HashMap<String, u64>>,
Expand Down Expand Up @@ -2772,7 +2778,8 @@ impl MinerConfigFile {
subsequent_rejection_pause_ms: self.subsequent_rejection_pause_ms.unwrap_or(miner_default_config.subsequent_rejection_pause_ms),
block_commit_delay: self.block_commit_delay_ms.map(Duration::from_millis).unwrap_or(miner_default_config.block_commit_delay),
tenure_cost_limit_per_block_percentage,
tenure_extend_poll_secs: self.tenure_extend_poll_secs.map(Duration::from_secs).unwrap_or(miner_default_config.tenure_extend_poll_secs),
tenure_extend_poll_timeout: self.tenure_extend_poll_secs.map(Duration::from_secs).unwrap_or(miner_default_config.tenure_extend_poll_timeout),
tenure_extend_wait_timeout: self.tenure_extend_wait_timeout_ms.map(Duration::from_millis).unwrap_or(miner_default_config.tenure_extend_wait_timeout),
tenure_timeout: self.tenure_timeout_secs.map(Duration::from_secs).unwrap_or(miner_default_config.tenure_timeout),
tenure_extend_cost_threshold: self.tenure_extend_cost_threshold.unwrap_or(miner_default_config.tenure_extend_cost_threshold),

Expand Down
152 changes: 99 additions & 53 deletions testnet/stacks-node/src/nakamoto_node/relayer.rs
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think a lot of the new/changed functions in this file can be tested with unit tests, so we can at least verify that they make the right decisions on whether or not to allow a BlockFound, a late BlockFound, or an Extension. Can you give that a try?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will give it a go!

Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,48 @@ impl MinerStopHandle {
}
}

/// Information necessary to determine when to extend a tenure
pub struct TenureExtendTime {
/// The time at which we determined that we should tenure-extend
time: Instant,
/// The amount of time we should wait before tenure-extending
timeout: Duration,
}

impl TenureExtendTime {
/// Create a new `TenureExtendTime` with a delayed `timeout`
pub fn delayed(timeout: Duration) -> Self {
Self {
time: Instant::now(),
timeout,
}
}

/// Create a new `TenureExtendTime` with no `timeout`
pub fn immediate() -> Self {
Self {
time: Instant::now(),
timeout: Duration::from_secs(0),
}
}

/// Should we attempt to tenure-extend?
pub fn should_extend(&self) -> bool {
// We set the time, but have we waited long enough?
self.time.elapsed() > self.timeout
}

// Amount of time elapsed since we decided to tenure-extend
pub fn elapsed(&self) -> Duration {
self.time.elapsed()
}

// The timeout specified when we decided to tenure-extend
pub fn timeout(&self) -> Duration {
self.timeout
}
}

/// Relayer thread
/// * accepts network results and stores blocks and microblocks
/// * forwards new blocks, microblocks, and transactions to the p2p thread
Expand Down Expand Up @@ -319,8 +361,8 @@ pub struct RelayerThread {
last_committed: Option<LastCommit>,
/// Timeout for waiting for the first block in a tenure before submitting a block commit
new_tenure_timeout: Option<Instant>,
/// Timeout for waiting for a BlockFound in a subsequent tenure before trying to extend our own
tenure_extend_timeout: Option<Instant>,
/// Time to wait before attempting a tenure extend
tenure_extend_time: Option<TenureExtendTime>,
}

impl RelayerThread {
Expand Down Expand Up @@ -380,7 +422,7 @@ impl RelayerThread {
next_initiative: Instant::now() + Duration::from_millis(next_initiative_delay),
last_committed: None,
new_tenure_timeout: None,
tenure_extend_timeout: None,
tenure_extend_time: None,
}
}

Expand Down Expand Up @@ -505,7 +547,7 @@ impl RelayerThread {
SortitionDB::get_canonical_stacks_chain_tip_hash(self.sortdb.conn())
.expect("FATAL: failed to query sortition DB for stacks tip");

self.tenure_extend_timeout = None;
self.tenure_extend_time = None;

if sn.sortition {
// a sortition happened
Expand Down Expand Up @@ -535,11 +577,18 @@ impl RelayerThread {
sn.consensus_hash,
mining_pkh_opt,
) {
Ok(Some(_)) => {
Ok(Some((_, wait_for_miner))) => {
// we can continue our ongoing tenure, but we should give the new winning miner
// a chance to send their BlockFound first.
debug!("Relayer: Did not win sortition, but am mining the ongoing tenure. Allowing the new miner some time to come online before trying to continue.");
self.tenure_extend_timeout = Some(Instant::now());
if wait_for_miner {
debug!("Relayer: Did not win sortition, but am mining the ongoing tenure. Allowing the new miner some time to come online before trying to continue.");
self.tenure_extend_time = Some(TenureExtendTime::delayed(
self.config.miner.tenure_extend_wait_timeout,
));
} else {
debug!("Relayer: Did not win sortition, but am mining the ongoing tenure. Will try to continue tenure immediately.");
jferrant marked this conversation as resolved.
Show resolved Hide resolved
self.tenure_extend_time = Some(TenureExtendTime::immediate());
}
return Some(MinerDirective::StopTenure);
}
Ok(None) => {
Expand Down Expand Up @@ -611,7 +660,7 @@ impl RelayerThread {
"Relayer: ongoing tenure {} already represents last-winning snapshot",
&stacks_tip_sn.consensus_hash
);
self.tenure_extend_timeout = Some(Instant::now());
self.tenure_extend_time = Some(TenureExtendTime::immediate());
false
} else {
// stacks tip's snapshot may be an ancestor of the last-won sortition.
Expand Down Expand Up @@ -654,7 +703,7 @@ impl RelayerThread {
&last_winning_snapshot.consensus_hash
);
// prepare to extend after our BlockFound gets mined.
self.tenure_extend_timeout = Some(Instant::now());
self.tenure_extend_time = Some(TenureExtendTime::immediate());
return Some(MinerDirective::BeginTenure {
parent_tenure_start: StacksBlockId(
last_winning_snapshot.winning_stacks_block_hash.clone().0,
Expand All @@ -675,7 +724,9 @@ impl RelayerThread {
// by someone else -- there's a chance that this other miner will produce a
// BlockFound in the interim.
debug!("Relayer: Did not win last winning snapshot despite mining the ongoing tenure, so allowing the new miner some time to come online.");
self.tenure_extend_timeout = Some(Instant::now());
self.tenure_extend_time = Some(TenureExtendTime::delayed(
self.config.miner.tenure_extend_wait_timeout,
));
return None;
}
return Some(MinerDirective::ContinueTenure {
Expand Down Expand Up @@ -1348,10 +1399,10 @@ impl RelayerThread {
/// Assumes that the caller has already checked that the given miner has _not_ won the new
/// sortition.
///
/// Returns Ok(Some(stacks-tip-election-snapshot)) if the last-winning miner needs to extend.
/// For now, this only happens if the miner's election snapshot was the last-known valid and
/// non-empty snapshot. In the future, this function may return Ok(Some(..)) if the node
/// determines that a subsequent miner won sortition, but never came online.
/// Returns Ok(Some(stacks-tip-election-snapshot, wait-for-miner) if the last-winning miner should attempt to extend
/// This can happen for two seperate reasons:
/// - the miner's election snapshot was the last-known valid and non-empty snapshot and therefore should extend immediately
/// - the node determines that a subsequent miner won sortition, but has not yet produced a valid block and should wait-for-miner before extending
///
/// Returns OK(None) if the last-winning miner should not extend its tenure.
///
Expand All @@ -1361,7 +1412,7 @@ impl RelayerThread {
chain_state: &mut StacksChainState,
new_burn_view: ConsensusHash,
mining_key_opt: Option<Hash160>,
) -> Result<Option<BlockSnapshot>, NakamotoNodeError> {
) -> Result<Option<(BlockSnapshot, bool)>, NakamotoNodeError> {
let Some(mining_pkh) = mining_key_opt else {
return Ok(None);
};
Expand Down Expand Up @@ -1407,31 +1458,34 @@ impl RelayerThread {
return Ok(None);
}

// For now, only allow the miner to extend its tenure if won the highest valid sortition.
// There cannot be any higher sortitions that are valid (as defined above).
//
// In the future, the miner will be able to extend its tenure even if there are higher
// valid sortitions, but only if it determines that the miners of those sortitions are
// offline.
// Allow the miner to extend its tenure if won the highest valid sortition IFF
// it determines that the miners of the sortition fails to produce a block
// by the required timeout.
if let Some(highest_valid_sortition) = Self::find_highest_valid_sortition(
sortdb,
chain_state,
&sort_tip,
&canonical_stacks_snapshot.consensus_hash,
)? {
info!("Relayer: will not extend tenure -- we won sortition {}, but the highest valid sortition is {}", &canonical_stacks_snapshot.consensus_hash, &highest_valid_sortition.consensus_hash);
return Ok(None);
// TODO: I don't understand why this works? HELP???
if sort_tip.consensus_hash != highest_valid_sortition.consensus_hash {
info!("Relayer: will not extend tenure -- we won sortition {}, but the highest valid sortition is {}", &canonical_stacks_snapshot.consensus_hash, &highest_valid_sortition.consensus_hash);
return Ok(None);
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, so I think what's happening here is the following:

  1. canonical_stacks_snapshot is the bitcoin block that elected the current highest stacks block (which could be the prior tenure, or maybe an earlier tenure if the prior tenure wasn't active)
  2. At this point in the control flow, canonical_stacks_snapshot was also mined by this miner.
  3. find_highest_valid_sortition will only return a sortition strictly after canonical_stacks_snapshot.consensus_hash which tries to build on top of it. This sortition can't have blocks in it yet (because of the first point above).
  4. So if you're in this if let Some() case, that means that there's a new tenure, that new tenure doesn't have blocks yet, and its trying to build off of this miner's view of the stacks tip.
  5. sort_tip is the actual last processed bitcoin block. if the highest valid sortition isn't the last processed bitcoin block, it means that someone had an empty tenure (maybe due to a flash block), but it also means that the miner is unlikely to be able to even communicate over .miners anyways (because they are "2 miners behind").

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I, personally, find all that reasoning above very confusing (and it may not be accurate), so I think I'd be supportive of some factoring of this function if there's some clearer way to do this.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah will take a look through it again. I had a really hard time following our current tenure_extend logic but Brice helped me reason it out to a point that maybe I can refactor now.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

K I got a first pass that I think works...CI pending...but locally the tests that were causing me issues before are resolved. Looking into cleanup phase now.

info!("Relayer: MAY extend tenure -- we won sortition {}, but must give miner time to produce a valid block for the highest valid sortition {}", &canonical_stacks_snapshot.consensus_hash, &highest_valid_sortition.consensus_hash);
return Ok(Some((canonical_stacks_snapshot, true)));
}

Ok(Some(canonical_stacks_snapshot))
// There cannot be any higher sortitions that are valid (as defined above).
Ok(Some((canonical_stacks_snapshot, false)))
}

/// Attempt to continue a miner's tenure into the next burn block.
/// This is allowed if the miner won the last good sortition -- that is, the sortition which
/// elected the local view of the canonical Stacks fork's ongoing tenure.
///
/// This function assumes that the caller has checked that the sortition referred to by
/// `new_burn_view` does not have a sortition winner.
/// `new_burn_view` does not have a sortition winner or that the winner has not produced a
/// valid block yet.
fn continue_tenure(&mut self, new_burn_view: ConsensusHash) -> Result<(), NakamotoNodeError> {
if let Err(e) = self.stop_tenure() {
error!("Relayer: Failed to stop tenure: {e:?}");
Expand All @@ -1440,7 +1494,7 @@ impl RelayerThread {
debug!("Relayer: successfully stopped tenure; will try to continue.");

let mining_pkh_opt = self.get_mining_key_pkh();
let Some(canonical_stacks_tip_election_snapshot) = Self::can_continue_tenure(
let Some((canonical_stacks_tip_election_snapshot, _)) = Self::can_continue_tenure(
&self.sortdb,
&mut self.chainstate,
new_burn_view.clone(),
Expand Down Expand Up @@ -1758,38 +1812,30 @@ impl RelayerThread {
}

/// Try to start up a tenure-extend.
/// Only do this if the miner won the highest valid sortition but the burn view has changed.
/// In the future, the miner will also try to extend its tenure if a subsequent miner appears
/// to be offline.
/// Only do this if:
/// - the miner won the highest valid sortition but the burn view has changed.
/// - the subsequent miner appears to be offline.
fn try_continue_tenure(&mut self) {
jferrant marked this conversation as resolved.
Show resolved Hide resolved
if self.tenure_extend_timeout.is_none() {
return;
}

// time to poll to see if we should begin a tenure-extend?
let deadline_passed = self
.tenure_extend_timeout
.map(|tenure_extend_timeout| {
let deadline_passed =
tenure_extend_timeout.elapsed() > self.config.miner.tenure_extend_poll_secs;
if !deadline_passed {
test_debug!(
"Relayer: will not try to tenure-extend yet ({} <= {})",
tenure_extend_timeout.elapsed().as_secs(),
self.config.miner.tenure_extend_poll_secs.as_secs()
);
}
deadline_passed
})
.unwrap_or(false);

if !deadline_passed {
// Should begin a tenure-extend?
if let Some(tenure_extend_time) = &self.tenure_extend_time {
if !tenure_extend_time.should_extend() {
test_debug!(
"Relayer: will not try to tenure-extend yet ({} <= {})",
tenure_extend_time.elapsed().as_secs(),
tenure_extend_time.timeout().as_secs()
);
return;
}
} else {
// No tenure extend time set, so nothing to do.
return;
}

// reset timer so we can try again if for some reason a miner was already running (e.g. a
// blockfound from earlier).
self.tenure_extend_timeout = Some(Instant::now());
self.tenure_extend_time = Some(TenureExtendTime::delayed(
self.config.miner.tenure_extend_poll_timeout,
));

// try to extend, but only if we aren't already running a thread for the current or newer
// burnchain view
Expand Down
3 changes: 2 additions & 1 deletion testnet/stacks-node/src/tests/nakamoto_integrations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10897,14 +10897,15 @@ fn test_tenure_extend_from_flashblocks() {
assert_ne!(sort_tip.consensus_hash, election_tip.consensus_hash);

// we can, however, continue the tenure
let canonical_stacks_tip = RelayerThread::can_continue_tenure(
let (canonical_stacks_tip, wait) = RelayerThread::can_continue_tenure(
&sortdb,
&mut chainstate,
sort_tip.consensus_hash.clone(),
Some(mining_key_pkh.clone()),
)
.unwrap()
.unwrap();
assert!(!wait);
assert_eq!(canonical_stacks_tip, election_tip);

// if we didn't win the last block -- tantamount to the sortition winner miner key being
Expand Down
Loading
Loading