Skip to content

Commit 4521487

Browse files
authored
Add metrics tracking time to restore a database from disk (#3285)
# Description of Changes Inspired by a question @Shubham8287 asked me: how long does it take to replay real production databases, e.g. a BitCraft region module? I did not have a good answer, and didn't know how I could get an answer. Add 5 new metrics: - `replay_total_time_seconds(db: Identity)`, total time to read and restore snapshot and replay commitlog. - `replay_snapshot_read_time_seconds(db: Identity)`, time to read snapshot from disk. - `replay_snapshot_restore_time_seconds(db: Identity)`, time to restore snapshot once it's already in memory. - `replay_commitlog_time_seconds(db: Identity)`, time to replay commitlog once snapshot is restored. - `replay_commitlog_num_commits(db: Identity)`, number of commits visited during commitlog replay, i.e. the length of the suffix after the most recent usable snapshot. All of these are `Gauge`s (or `IntGauge` for the `num_commits` metric), as we expect observations to be very infrequent (approx. once per label value per SpacetimeDB process). # API and ABI breaking changes N/a # Expected complexity level and risk 1 # Testing I remain unsure how to test metrics. This won't break anything, though, so I'm not worried.
1 parent 75f043a commit 4521487

File tree

2 files changed

+71
-8
lines changed

2 files changed

+71
-8
lines changed

crates/core/src/db/relational_db.rs

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ use crate::error::{DBError, DatabaseError, RestoreSnapshotError};
33
use crate::messages::control_db::HostType;
44
use crate::subscription::ExecutionCounters;
55
use crate::util::{asyncify, spawn_rayon};
6+
use crate::worker_metrics::WORKER_METRICS;
67
use anyhow::{anyhow, Context};
78
use enum_map::EnumMap;
89
use fs2::FileExt;
@@ -378,6 +379,9 @@ impl RelationalDB {
378379
let (min_commitlog_offset, _) = history.tx_range_hint();
379380

380381
log::info!("[{database_identity}] DATABASE: durable_tx_offset is {durable_tx_offset:?}");
382+
383+
let start_time = std::time::Instant::now();
384+
381385
let inner = Self::restore_from_snapshot_or_bootstrap(
382386
database_identity,
383387
snapshot_repo.as_deref(),
@@ -387,6 +391,13 @@ impl RelationalDB {
387391
)?;
388392

389393
apply_history(&inner, database_identity, history)?;
394+
395+
let elapsed_time = start_time.elapsed();
396+
WORKER_METRICS
397+
.replay_total_time_seconds
398+
.with_label_values(&database_identity)
399+
.set(elapsed_time.as_secs_f64());
400+
390401
let db = Self::new(
391402
lock,
392403
database_identity,
@@ -532,13 +543,20 @@ impl RelationalDB {
532543
) -> Result<ReconstructedSnapshot, Box<SnapshotError>> {
533544
log::info!("[{database_identity}] DATABASE: restoring snapshot of tx_offset {snapshot_offset}");
534545
let start = std::time::Instant::now();
546+
535547
let snapshot = snapshot_repo
536548
.read_snapshot(snapshot_offset, page_pool)
537549
.map_err(Box::new)?;
550+
551+
let elapsed_time = start.elapsed();
552+
553+
WORKER_METRICS
554+
.replay_snapshot_read_time_seconds
555+
.with_label_values(database_identity)
556+
.set(elapsed_time.as_secs_f64());
557+
538558
log::info!(
539-
"[{database_identity}] DATABASE: read snapshot of tx_offset {} in {:?}",
540-
snapshot_offset,
541-
start.elapsed(),
559+
"[{database_identity}] DATABASE: read snapshot of tx_offset {snapshot_offset} in {elapsed_time:?}",
542560
);
543561

544562
Ok(snapshot)
@@ -554,10 +572,12 @@ impl RelationalDB {
554572
let snapshot_offset = snapshot.tx_offset;
555573
Locking::restore_from_snapshot(snapshot, page_pool)
556574
.inspect(|_| {
575+
let elapsed_time = start.elapsed();
576+
577+
WORKER_METRICS.replay_snapshot_restore_time_seconds.with_label_values(database_identity).set(elapsed_time.as_secs_f64());
578+
557579
log::info!(
558-
"[{database_identity}] DATABASE: restored from snapshot of tx_offset {} in {:?}",
559-
snapshot_offset,
560-
start.elapsed(),
580+
"[{database_identity}] DATABASE: restored from snapshot of tx_offset {snapshot_offset} in {elapsed_time:?}",
561581
)
562582
})
563583
.inspect_err(|e| {
@@ -1602,11 +1622,26 @@ where
16021622
}
16031623
};
16041624

1625+
let time_before = std::time::Instant::now();
1626+
16051627
let mut replay = datastore.replay(progress);
1606-
let start = replay.next_tx_offset();
1628+
let start_tx_offset = replay.next_tx_offset();
16071629
history
1608-
.fold_transactions_from(start, &mut replay)
1630+
.fold_transactions_from(start_tx_offset, &mut replay)
16091631
.map_err(anyhow::Error::from)?;
1632+
1633+
let time_elapsed = time_before.elapsed();
1634+
WORKER_METRICS
1635+
.replay_commitlog_time_seconds
1636+
.with_label_values(&database_identity)
1637+
.set(time_elapsed.as_secs_f64());
1638+
1639+
let end_tx_offset = replay.next_tx_offset();
1640+
WORKER_METRICS
1641+
.replay_commitlog_num_commits
1642+
.with_label_values(&database_identity)
1643+
.set((end_tx_offset - start_tx_offset) as _);
1644+
16101645
log::info!("[{database_identity}] DATABASE: applied transaction history");
16111646
datastore.rebuild_state_after_replay()?;
16121647
log::info!("[{database_identity}] DATABASE: rebuilt state after replay");

crates/core/src/worker_metrics/mod.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,6 +284,34 @@ metrics_group!(
284284
#[help = "The number of server -> client WebSocket messages waiting in any client's outgoing queue"]
285285
#[labels(db: Identity)]
286286
pub total_outgoing_queue_length: IntGaugeVec,
287+
288+
#[name = spacetime_replay_total_time_seconds]
289+
#[help = "Total time spent replaying a database upon restart, including snapshot read, snapshot restore and commitlog replay"]
290+
#[labels(db: Identity)]
291+
// We expect a small number of observations per label
292+
// (exactly one, for non-replicated databases, and one per leader change for replicated databases)
293+
// so we'll just store a `Gauge` with the most recent observation for each database.
294+
pub replay_total_time_seconds: GaugeVec,
295+
296+
#[name = spacetime_replay_snapshot_read_time_seconds]
297+
#[help = "Time spent reading a snapshot from disk before restoring the snapshot upon restart"]
298+
#[labels(db: Identity)]
299+
pub replay_snapshot_read_time_seconds: GaugeVec,
300+
301+
#[name = spacetime_replay_snapshot_restore_time_seconds]
302+
#[help = "Time spent restoring a database from a snapshot after reading the snapshot and before commitlog replay upon restart"]
303+
#[labels(db: Identity)]
304+
pub replay_snapshot_restore_time_seconds: GaugeVec,
305+
306+
#[name = spacetime_replay_commitlog_time_seconds]
307+
#[help = "Time spent replaying the commitlog after restoring from a snapshot upon restart"]
308+
#[labels(db: Identity)]
309+
pub replay_commitlog_time_seconds: GaugeVec,
310+
311+
#[name = spacetime_replay_commitlog_num_commits]
312+
#[help = "Number of commits replayed after restoring from a snapshot upon restart"]
313+
#[labels(db: Identity)]
314+
pub replay_commitlog_num_commits: IntGaugeVec,
287315
}
288316
);
289317

0 commit comments

Comments
 (0)