Skip to content

Commit 9b662ea

Browse files
authored
[nexus] Use retryable transactions more extensively (#7212)
This PR finds spots where we use `transaction_async` and makes them use `transaction_retry_wrapper` instead. This means that under contention, we'll avoid wasting work, and can make use of CockroachDB's automated retry mechanisms. Additionally, this PR adds a clippy lint to help future usage avoid the "non-retryable" transaction variant. There are some use cases where avoiding retries is still reasonable: 1. Test-only code 2. Transactions which have truly minimal contention, or which can fail with serialization errors without issue 3. Nested transactions
1 parent e73a30e commit 9b662ea

File tree

16 files changed

+478
-384
lines changed

16 files changed

+478
-384
lines changed

clippy.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,10 @@ disallowed-methods = [
1010
# `IncompleteOnConflictExt::as_partial_index` in `nexus-db-queries`.
1111
# See the documentation of that method for more.
1212
"diesel::upsert::DecoratableTarget::filter_target",
13+
14+
# This form of transaction is susceptible to serialization failures,
15+
# and can fail spuriously.
16+
# Instead, the "transaction_retry_wrapper" should be preferred, as it
17+
# automatically retries transactions experiencing contention.
18+
{ path = "async_bb8_diesel::AsyncConnection::transaction_async", reason = "Prefer to use transaction_retry_wrapper, if possible. Feel free to override this for tests and nested transactions." },
1319
]

dev-tools/omdb/src/bin/omdb/db.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
1515
// NOTE: emanates from Tabled macros
1616
#![allow(clippy::useless_vec)]
17+
// NOTE: allowing "transaction_async" without retry
18+
#![allow(clippy::disallowed_methods)]
1719

1820
use crate::check_allow_destructive::DestructiveOperationToken;
1921
use crate::helpers::const_max_len;

nexus/db-queries/src/db/datastore/deployment.rs

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,11 @@ impl DataStore {
335335
// batch rather than making a bunch of round-trips to the database.
336336
// We'd do that if we had an interface for doing that with bound
337337
// parameters, etc. See oxidecomputer/omicron#973.
338+
339+
// The risk of a serialization error is possible here, but low,
340+
// as most of the operations should be insertions rather than in-place
341+
// modifications of existing tables.
342+
#[allow(clippy::disallowed_methods)]
338343
conn.transaction_async(|conn| async move {
339344
// Insert the row for the blueprint.
340345
{
@@ -1087,6 +1092,7 @@ impl DataStore {
10871092
// start removing it and we'd also need to make sure we didn't leak a
10881093
// collection if we crash while deleting it.
10891094
let conn = self.pool_connection_authorized(opctx).await?;
1095+
let err = OptionalError::new();
10901096

10911097
let (
10921098
nblueprints,
@@ -1101,19 +1107,23 @@ impl DataStore {
11011107
nclickhouse_cluster_configs,
11021108
nclickhouse_keepers,
11031109
nclickhouse_servers,
1104-
) = conn
1105-
.transaction_async(|conn| async move {
1110+
) = self.transaction_retry_wrapper("blueprint_delete")
1111+
.transaction(&conn, |conn| {
1112+
let err = err.clone();
1113+
async move {
11061114
// Ensure that blueprint we're about to delete is not the
11071115
// current target.
1108-
let current_target =
1109-
Self::blueprint_current_target_only(&conn).await?;
1116+
let current_target = Self::blueprint_current_target_only(&conn)
1117+
.await
1118+
.map_err(|txn_err| txn_err.into_diesel(&err))?;
1119+
11101120
if current_target.target_id == blueprint_id {
1111-
return Err(TransactionError::CustomError(
1121+
return Err(err.bail(TransactionError::CustomError(
11121122
Error::conflict(format!(
11131123
"blueprint {blueprint_id} is the \
11141124
current target and cannot be deleted",
11151125
)),
1116-
));
1126+
)));
11171127
}
11181128

11191129
// Remove the record describing the blueprint itself.
@@ -1130,9 +1140,9 @@ impl DataStore {
11301140
// references to it in any of the remaining tables either, since
11311141
// deletion always goes through this transaction.
11321142
if nblueprints == 0 {
1133-
return Err(TransactionError::CustomError(
1143+
return Err(err.bail(TransactionError::CustomError(
11341144
authz_blueprint.not_found(),
1135-
));
1145+
)));
11361146
}
11371147

11381148
// Remove rows associated with sled states.
@@ -1259,13 +1269,12 @@ impl DataStore {
12591269
nclickhouse_keepers,
12601270
nclickhouse_servers,
12611271
))
1272+
}
12621273
})
12631274
.await
1264-
.map_err(|error| match error {
1265-
TransactionError::CustomError(e) => e,
1266-
TransactionError::Database(e) => {
1267-
public_error_from_diesel(e, ErrorHandler::Server)
1268-
}
1275+
.map_err(|e| match err.take() {
1276+
Some(err) => err.into(),
1277+
None => public_error_from_diesel(e, ErrorHandler::Server),
12691278
})?;
12701279

12711280
info!(&opctx.log, "removed blueprint";

nexus/db-queries/src/db/datastore/dns.rs

Lines changed: 47 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use crate::db::pagination::paginated;
1919
use crate::db::pagination::Paginator;
2020
use crate::db::pool::DbConnection;
2121
use crate::db::TransactionError;
22+
use crate::transaction_retry::OptionalError;
2223
use async_bb8_diesel::AsyncConnection;
2324
use async_bb8_diesel::AsyncRunQueryDsl;
2425
use diesel::prelude::*;
@@ -363,40 +364,49 @@ impl DataStore {
363364
) -> Result<(), Error> {
364365
opctx.authorize(authz::Action::Modify, &authz::DNS_CONFIG).await?;
365366
let conn = self.pool_connection_authorized(opctx).await?;
366-
conn.transaction_async(|c| async move {
367-
let zones = self
368-
.dns_zones_list_all_on_connection(opctx, &c, update.dns_group)
369-
.await?;
370-
// This looks like a time-of-check-to-time-of-use race, but this
371-
// approach works because we're inside a transaction and the
372-
// isolation level is SERIALIZABLE.
373-
let version = self
374-
.dns_group_latest_version_conn(opctx, &c, update.dns_group)
375-
.await?;
376-
if version.version != old_version {
377-
return Err(TransactionError::CustomError(Error::conflict(
378-
format!(
379-
"expected current DNS version to be {}, found {}",
380-
*old_version, *version.version,
381-
),
382-
)));
383-
}
384367

385-
self.dns_write_version_internal(
386-
&c,
387-
update,
388-
zones,
389-
Generation(old_version.next()),
390-
)
368+
let err = OptionalError::new();
369+
370+
self.transaction_retry_wrapper("dns_update_from_version")
371+
.transaction(&conn, |c| {
372+
let err = err.clone();
373+
let update = update.clone();
374+
async move {
375+
let zones = self
376+
.dns_zones_list_all_on_connection(opctx, &c, update.dns_group)
377+
.await
378+
.map_err(|txn_error| txn_error.into_diesel(&err))?;
379+
// This looks like a time-of-check-to-time-of-use race, but this
380+
// approach works because we're inside a transaction and the
381+
// isolation level is SERIALIZABLE.
382+
let version = self
383+
.dns_group_latest_version_conn(opctx, &c, update.dns_group)
384+
.await
385+
.map_err(|txn_error| txn_error.into_diesel(&err))?;
386+
if version.version != old_version {
387+
return Err(err.bail(TransactionError::CustomError(Error::conflict(
388+
format!(
389+
"expected current DNS version to be {}, found {}",
390+
*old_version, *version.version,
391+
),
392+
))));
393+
}
394+
395+
self.dns_write_version_internal(
396+
&c,
397+
update,
398+
zones,
399+
Generation(old_version.next()),
400+
)
401+
.await
402+
.map_err(|txn_error| txn_error.into_diesel(&err))
403+
}
404+
})
391405
.await
392-
})
393-
.await
394-
.map_err(|e| match e {
395-
TransactionError::CustomError(e) => e,
396-
TransactionError::Database(e) => {
397-
public_error_from_diesel(e, ErrorHandler::Server)
398-
}
399-
})
406+
.map_err(|e| match err.take() {
407+
Some(err) => err.into(),
408+
None => public_error_from_diesel(e, ErrorHandler::Server),
409+
})
400410
}
401411

402412
/// Update the configuration of a DNS zone as specified in `update`
@@ -441,6 +451,9 @@ impl DataStore {
441451
.dns_zones_list_all_on_connection(opctx, conn, update.dns_group)
442452
.await?;
443453

454+
// This method is used in nested transactions, which are not supported
455+
// with retryable transactions.
456+
#[allow(clippy::disallowed_methods)]
444457
conn.transaction_async(|c| async move {
445458
let version = self
446459
.dns_group_latest_version_conn(opctx, conn, update.dns_group)
@@ -1724,6 +1737,8 @@ mod test {
17241737

17251738
let cds = datastore.clone();
17261739
let copctx = opctx.child(std::collections::BTreeMap::new());
1740+
1741+
#[allow(clippy::disallowed_methods)]
17271742
let mut fut = conn1
17281743
.transaction_async(|c1| async move {
17291744
cds.dns_update_incremental(&copctx, &c1, update1)

0 commit comments

Comments
 (0)