persist-txn: compaction

danhhz · danhhz · commit cffe68c038b1 · 2023-10-24T08:16:11.000-07:00
Compaction of data shards is initially delegated to the txns user (the
storage controller). Because txn writes intentionally never read data
shards and in no way depend on the sinces, the since of a data shard is
free to be arbitrarily far ahead of or behind the txns upper. Data shard
reads, when run through the above process, then follow the usual rules
(can read at times beyond the since but not beyond the upper).

Compaction of the txns shard relies on the following invariant that is
carefully maintained: every write less than the since of the txns shard
has been applied. Mechanically, this is accomplished by a critical since
capability held internally by the txns system. Any txn writer is free to
advance it to a time once it has proven that all writes before that time
have been applied.

It is advantageous to compact the txns shard aggressively so that
applied writes are promptly consolidated out, minimizing the size. For a
snapshot read at `as_of`, we need to be able to distinguish when the
latest write `<= as_of` has been applied. The above invariant enables
this as follows:

- If `as_of <= txns_shard.since()`, then the invariant guarantees that
  all writes `<= as_of` have been applied, so we're free to read as
  described in the section above.
- Otherwise, we haven't compacted `as_of` in the txns shard yet, and
  still have perfect information about which writes happened when. We
  can look at the data shard upper to determine which have been applied.
diff --git a/doc/developer/design/20230705_v2_txn_management.md b/doc/developer/design/20230705_v2_txn_management.md
@@ -365,6 +365,32 @@ maintenance or a CRDB write, but this is also true for registering a reader. On
 the balance, I think this is a _much_ better set of tradeoffs than the original
 plan.
 
+### Compaction
+
+Compaction of data shards is initially delegated to the txns user (the storage
+controller). Because txn writes intentionally never read data shards and in no
+way depend on the sinces, the since of a data shard is free to be arbitrarily
+far ahead of or behind the txns upper. Data shard reads, when run through the
+above process, then follow the usual rules (can read at times beyond the since
+but not beyond the upper).
+
+Compaction of the txns shard relies on the following invariant that is carefully
+maintained: every write less than the since of the txns shard has been applied.
+Mechanically, this is accomplished by a critical since capability held
+internally by the txns system. Any txn writer is free to advance it to a time
+once it has proven that all writes before that time have been applied.
+
+It is advantageous to compact the txns shard aggressively so that applied writes
+are promptly consolidated out, minimizing the size. For a snapshot read at
+`as_of`, we need to be able to distinguish when the latest write `<= as_of` has
+been applied. The above invariant enables this as follows:
+
+- If `as_of <= txns_shard.since()`, then the invariant guarantees that all
+  writes `<= as_of` have been applied, so we're free to read as described in the
+  section above.
+- Otherwise, we haven't compacted `as_of` in the txns shard yet, and still have
+  perfect information about which writes happened when. We can look at the data shard upper to determine which have been applied.
+
 ### Forget
 
 A data shard is removed from the txns set using a `forget` operation that writes
diff --git a/src/persist-cli/src/maelstrom/txn_list_append_multi.rs b/src/persist-cli/src/maelstrom/txn_list_append_multi.rs
@@ -116,6 +116,11 @@ impl Transactor {
                 txn.tidy(std::mem::take(&mut self.tidy));
                 match txn.commit_at(&mut self.txns, write_ts).await {
                     Ok(maintenance) => {
+                        // Aggressively allow the txns shard to compact. To
+                        // exercise more edge cases, do it before we apply the
+                        // newly committed txn.
+                        self.txns.compact_to(write_ts).await;
+
                         debug!("req committed at read_ts={} write_ts={}", read_ts, write_ts);
                         let tidy = maintenance.apply(&mut self.txns).await;
                         self.tidy.merge(tidy);
diff --git a/src/persist-txn/src/lib.rs b/src/persist-txn/src/lib.rs
@@ -287,11 +287,6 @@ pub mod txn_read;
 pub mod txn_write;
 pub mod txns;
 
-// TODO(txn):
-// - Closing/deleting data shards.
-// - Hold a critical since capability for each registered shard?
-// - Figure out the compaction story for both txn and data shard.
-
 /// The in-mem representation of an update in the txns shard.
 #[derive(Debug)]
 pub enum TxnsEntry {
diff --git a/src/persist-txn/src/operator.rs b/src/persist-txn/src/operator.rs
@@ -32,7 +32,7 @@ use timely::progress::{Antichain, Timestamp};
 use timely::scheduling::Scheduler;
 use timely::worker::Worker;
 use timely::{Data, WorkerConfig};
-use tracing::debug;
+use tracing::{debug, trace};
 
 use crate::txn_read::{DataListenNext, TxnsCache};
 use crate::{TxnsCodec, TxnsCodecDefault};
@@ -132,12 +132,16 @@ where
             )
             .await
             .expect("schema shouldn't change");
-        let () = snap.unblock_read(data_write).await;
+        let empty_to = snap.unblock_read(data_write).await;
+        debug!(
+            "txns_progress({}) {} starting as_of={:?} empty_to={:?}",
+            name, data_id, as_of, empty_to
+        );
 
         // We've ensured that the data shard's physical upper is past as_of, so
         // start by passing through data and frontier updates from the input
         // until it is past the as_of.
-        let mut read_data_to = as_of.step_forward();
+        let mut read_data_to = empty_to;
         let mut output_progress_exclusive = T::minimum();
         loop {
             // TODO(txn): Do something more specific when the input returns None
@@ -151,6 +155,7 @@ where
                     // disconnected.
                     Event::Data(_data_cap, data) => {
                         for data in data.drain(..) {
+                            trace!("txns_progress({}) emitting data {:?}", name, data);
                             passthrough_output.give(&cap, data).await;
                         }
                     }
@@ -166,9 +171,14 @@ where
                         // frontier updates too.
                         if &output_progress_exclusive < input_progress_exclusive {
                             output_progress_exclusive.clone_from(input_progress_exclusive);
+                            trace!(
+                                "txns_progress({}) downgrading cap to {:?}",
+                                name,
+                                output_progress_exclusive
+                            );
                             cap.downgrade(&output_progress_exclusive);
                         }
-                        if read_data_to <= output_progress_exclusive {
+                        if read_data_to.less_than(&output_progress_exclusive) {
                             break;
                         }
                     }
@@ -180,15 +190,9 @@ where
             // find out what to do next given our current progress.
             loop {
                 txns_cache.update_ge(&output_progress_exclusive).await;
+                txns_cache.compact_to(&output_progress_exclusive);
                 let data_listen_next =
                     txns_cache.data_listen_next(&data_id, output_progress_exclusive.clone());
-                debug!(
-                    "txns_progress({}): data_listen_next {:.9} at {:?}: {:?}",
-                    name,
-                    data_id.to_string(),
-                    output_progress_exclusive,
-                    data_listen_next
-                );
                 match data_listen_next {
                     // We've caught up to the txns upper and we have to wait for
                     // it to advance before asking again.
@@ -204,7 +208,7 @@ where
                     // The data shard got a write! Loop back above and pass
                     // through data until we see it.
                     DataListenNext::ReadDataTo(new_target) => {
-                        read_data_to = new_target;
+                        read_data_to = Antichain::from_elem(new_target);
                         break;
                     }
                     // We know there are no writes in
@@ -213,9 +217,19 @@ where
                     DataListenNext::EmitLogicalProgress(new_progress) => {
                         assert!(output_progress_exclusive < new_progress);
                         output_progress_exclusive = new_progress;
+                        debug!(
+                            "txns_progress({}) downgrading cap to {:?}",
+                            name, output_progress_exclusive
+                        );
                         cap.downgrade(&output_progress_exclusive);
                         continue;
                     }
+                    DataListenNext::CompactedTo(since_ts) => {
+                        unreachable!(
+                            "internal logic error: {} unexpectedly compacted past {:?} to {:?}",
+                            data_id, output_progress_exclusive, since_ts
+                        )
+                    }
                 }
             }
         }
diff --git a/src/persist-txn/src/txn_read.rs b/src/persist-txn/src/txn_read.rs
diff --git a/src/persist-txn/src/txns.rs b/src/persist-txn/src/txns.rs