diff --git a/.github/workflows/tests_proc.yml b/.github/workflows/tests_proc.yml index e8fbde3f..dbd7195e 100644 --- a/.github/workflows/tests_proc.yml +++ b/.github/workflows/tests_proc.yml @@ -16,5 +16,7 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Run proc tests - run: python3 scripts/workflow_test.py + - name: Run proc tests (MultiPaxos) + run: python3 scripts/workflow_test.py -p MultiPaxos + - name: Run proc tests (Raft) + run: python3 scripts/workflow_test.py -p Raft diff --git a/.github/workflows/tests_unit.yml b/.github/workflows/tests_unit.yml index 0a1fd8d6..57aa8fb3 100644 --- a/.github/workflows/tests_unit.yml +++ b/.github/workflows/tests_unit.yml @@ -16,5 +16,5 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Run unit tests + - name: Run all unit tests run: cargo test --workspace --verbose diff --git a/Cargo.lock b/Cargo.lock index 48d1c203..3a18786a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -39,9 +39,9 @@ dependencies = [ [[package]] name = "anstream" -version = "0.5.0" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f58811cfac344940f1a400b6e6231ce35171f614f26439e80f8c1465c5cc0c" +checksum = "2ab91ebe16eb252986481c5b62f6098f3b698a45e34b5b98200cf20dd2484a44" dependencies = [ "anstyle", "anstyle-parse", @@ -53,15 +53,15 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b84bf0a05bbb2a83e5eb6fa36bb6e87baa08193c35ff52bbf6b38d8af2890e46" +checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" [[package]] name = "anstyle-parse" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" +checksum = "317b9a89c1868f5ea6ff1d9539a69f45dffc21ce321ac1fd1160dfa48c8e2140" dependencies = [ "utf8parse", ] @@ -77,9 +77,9 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "2.1.0" +version = "3.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58f54d10c6dfa51283a066ceab3ec1ab78d13fae00aa49243a45e4571fb79dfd" +checksum = "f0699d10d2f4d628a98ee7b57b289abbc98ff3bad977cb3152709d4bf2330628" dependencies = [ "anstyle", "windows-sys", @@ -93,7 +93,7 @@ checksum = "bc00ceb34980c03614e35a3a4e218276a0a824e911d07651cd0d858a51e8c0f0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -105,7 +105,7 @@ dependencies = [ "attribute-derive-macro", "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -121,7 +121,7 @@ dependencies = [ "proc-macro2", "quote", "quote-use", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -159,9 +159,9 @@ checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635" [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" @@ -189,9 +189,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.4.4" +version = "4.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1d7b8d5ec32af0fadc644bf1fd509a688c2103b185644bb1e29d164e0703136" +checksum = 
"d04704f56c2cde07f43e8e2c154b43f216dc5c92fc98ada720177362f953b956" dependencies = [ "clap_builder", "clap_derive", @@ -199,9 +199,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.4.4" +version = "4.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5179bb514e4d7c2051749d8fcefa2ed6d06a9f4e6d69faf3805f5d80b8cf8d56" +checksum = "0e231faeaca65ebd1ea3c737966bf858971cd38c3849107aa3ea7de90a804e45" dependencies = [ "anstream", "anstyle", @@ -218,7 +218,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -280,7 +280,7 @@ checksum = "146398d62142a0f35248a608f17edf0dde57338354966d6e41d0eb2d16980ccb" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -324,25 +324,14 @@ checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" [[package]] name = "errno" -version = "0.3.3" +version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "136526188508e25c6fef639d7927dfb3e0e3084488bf202267829cf7fc23dbdd" +checksum = "ac3e13f66a2f95e32a39eaa81f6b95d42878ca0e1db0c7543723dfe12557e860" dependencies = [ - "errno-dragonfly", "libc", "windows-sys", ] -[[package]] -name = "errno-dragonfly" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" -dependencies = [ - "cc", - "libc", -] - [[package]] name = "fixedbitset" version = "0.4.2" @@ -423,7 +412,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -486,7 +475,7 @@ checksum = "13a1bcfb855c1f340d5913ab542e36f25a1c56f57de79022928297632435dec2" dependencies = [ "attribute-derive", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -517,9 +506,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.0" +version = "0.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +checksum = "7dfda62a12f55daeae5015f81b0baea145391cb4520f86c248fc615d72640d12" [[package]] name = "heck" @@ -541,12 +530,12 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "indexmap" -version = "2.0.0" +version = "2.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" +checksum = "8adf3ddd720272c6ea8bf59463c04e0f93d0bbf7c5439b691bca2987e0270897" dependencies = [ "equivalent", - "hashbrown 0.14.0", + "hashbrown 0.14.1", ] [[package]] @@ -589,21 +578,21 @@ checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.148" +version = "0.2.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" +checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" [[package]] name = "libm" -version = "0.2.7" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "linux-raw-sys" -version = "0.4.7" +version = "0.4.9" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a9bad9f94746442c783ca431b22403b519cd7fbeed0533fdd6328b2f2212128" +checksum = "45786cec4d5e54a224b15cb9f06751883103a27c19c93eda09b0b4f5f08fefac" [[package]] name = "lock_api" @@ -656,9 +645,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.6.3" +version = "2.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c" +checksum = "f665ee40bc4a3c5590afb1e9677db74a508659dfd71e126420da8274909a0167" [[package]] name = "minimal-lexical" @@ -729,9 +718,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f30b0abd723be7e2ffca1272140fac1a2f084c77ec3e123c192b66af1ee9e6c2" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" dependencies = [ "autocfg", ] @@ -914,9 +903,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.67" +version = "1.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" +checksum = "5b1106fec09662ec6dd98ccac0f81cef56984d0b49f75c92d8cbad76e20c005c" dependencies = [ "unicode-ident", ] @@ -938,7 +927,7 @@ checksum = "a7b5abe3fe82fdeeb93f44d66a7b444dedf2e4827defb0a8e69c437b2de2ef94" dependencies = [ "quote", "quote-use-macros", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -950,7 +939,7 @@ dependencies = [ "derive-where", "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -1029,13 +1018,13 @@ dependencies = [ [[package]] name = "regex" -version = "1.9.5" +version = "1.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47" +checksum = "ebee201405406dbf528b8b672104ae6d6d63e6d118cb10e4d51abbc7b58044ff" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.3.8", + "regex-automata 0.3.9", "regex-syntax 0.7.5", ] @@ -1050,9 +1039,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795" +checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9" dependencies = [ "aho-corasick", "memchr", @@ -1101,9 +1090,9 @@ checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" [[package]] name = "rustix" -version = "0.38.14" +version = "0.38.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "747c788e9ce8e92b12cd485c49ddf90723550b654b32508f979b71a7b1ecda4f" +checksum = "f25469e9ae0f3d0047ca8b93fc56843f38e6774f0914a107ff8b41be8be8e0b7" dependencies = [ "bitflags 2.4.0", "errno", @@ -1153,7 +1142,7 @@ checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -1178,9 +1167,9 @@ dependencies = [ [[package]] name = "sharded-slab" -version = "0.1.4" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" dependencies = [ "lazy_static", ] @@ -1313,9 +1302,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.37" 
+version = "2.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8" +checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" dependencies = [ "proc-macro2", "quote", @@ -1346,22 +1335,22 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.48" +version = "1.0.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d6d7a740b8a666a7e828dd00da9c0dc290dff53154ea77ac109281de90589b7" +checksum = "1177e8c6d7ede7afde3585fd2513e611227efd6481bd78d2e82ba1ce16557ed4" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.48" +version = "1.0.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49922ecae66cc8a249b77e68d1d0623c1b2c514f0060c27cdc68bd62a1219d35" +checksum = "10712f02019e9288794769fba95cd6847df9874d49d871d062172f9dd41bc4cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -1401,7 +1390,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -1458,7 +1447,7 @@ checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -1638,9 +1627,9 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "winnow" -version = "0.5.15" +version = "0.5.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c2e3184b9c4e92ad5167ca73039d0c42476302ab603e2fec4487511f38ccefc" +checksum = "037711d82167854aff2018dfd193aa0fef5370f456732f0d5a0c59b0f1b4b907" dependencies = [ "memchr", ] diff --git a/README.md b/README.md index 58d3a50a..b83a7a64 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ [![Proc tests status](https://github.com/josehu07/summerset/actions/workflows/tests_proc.yml/badge.svg)](https://github.com/josehu07/summerset/actions?query=josehu07%3Atests_proc) [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT) -Summerset is a distributed key-value store supporting a wide range of state machine replication (SMR) protocols for research purposes. More protocols are actively being added. +Summerset is a distributed, replicated, protocol-generic key-value store supporting a wide range of state machine replication (SMR) protocols for research purposes. More protocols are actively being added.

@@ -21,6 +21,7 @@ Summerset is a distributed key-value store supporting a wide range of state mach | `SimplePush` | Pushing to peers w/o any consistency guarantees | | `MultiPaxos` | Classic [MultiPaxos](https://www.microsoft.com/en-us/research/uploads/prod/2016/12/paxos-simple-Copy.pdf) protocol | | `RS-Paxos` | MultiPaxos w/ Reed-Solomon erasure code sharding | +| `Raft` | [Raft](https://raft.github.io/raft.pdf) on explicit log and strong leadership | Formal TLA+ specification of some protocols are provided in `tla+/`. @@ -32,6 +33,7 @@ Formal TLA+ specification of some protocols are provided in `tla+/`. - **Async Rust**: Summerset is written in Rust and demonstrates canonical usage of async programming structures backed by the [`tokio`](https://tokio.rs/) framework; - **Event-based**: Summerset adopts a channel-oriented, event-based system architecture; each replication protocol is basically just a set of event handlers plus a `tokio::select!` loop; - **Modularized**: Common components of a distributed KV store, e.g. network transport and durable logger, are cleanly separated from each other and connected through channels. +- **Protocol-generic**: With the above two points combined, Summerset is able to support a set of different replication protocols in one codebase, each being just a single file, with common functionalities abstracted out. These design choices make protocol implementation in Summerset surprisingly straight-forward and **understandable**, without any sacrifice on performance. Comments / issues / PRs are always welcome! @@ -118,12 +120,15 @@ Complete cluster management and benchmarking scripts are available in another re - [ ] specialize read-only commands? - [ ] separate commit vs. exec responses? - [ ] membership discovery & view changes? -- [ ] implementation of Raft +- [x] implementation of Raft + - [x] state persistence & restart check + - [x] snapshotting & garbage collection + - [ ] membership discovery & view changes? 
- [x] client-side utilities - [x] REPL-style client - [x] random benchmarking client - [x] testing client - - [ ] YCSB-driven benchmarking + - [ ] YCSB-driven client - [ ] better README & documentation --- diff --git a/scripts/local_cluster.py b/scripts/local_cluster.py index 72234864..ae356fa0 100644 --- a/scripts/local_cluster.py +++ b/scripts/local_cluster.py @@ -44,11 +44,13 @@ def kill_all_matching(name, force=False): "RepNothing": lambda r: f"backer_path='/tmp/summerset.rep_nothing.{r}.wal'", "SimplePush": lambda r: f"backer_path='/tmp/summerset.simple_push.{r}.wal'", "MultiPaxos": lambda r: f"backer_path='/tmp/summerset.multipaxos.{r}.wal'", + "Raft": lambda r: f"backer_path='/tmp/summerset.raft.{r}.wal'", "RSPaxos": lambda r: f"backer_path='/tmp/summerset.rs_paxos.{r}.wal'", } PROTOCOL_SNAPSHOT_PATH = { "MultiPaxos": lambda r: f"snapshot_path='/tmp/summerset.multipaxos.{r}.snap'", + "Raft": lambda r: f"snapshot_path='/tmp/summerset.raft.{r}.snap'", "RSPaxos": lambda r: f"snapshot_path='/tmp/summerset.rs_paxos.{r}.snap'", } @@ -70,19 +72,6 @@ def config_with_file_paths(protocol, config, replica): return result_config -def config_with_backer_path(protocol, config, replica): - result_config = PROTOCOL_BACKER_PATH[protocol](replica) - - if config is not None and len(config) > 0: - if "backer_path" in config: - result_config = config # use user-supplied path - else: - result_config += "+" - result_config += config - - return result_config - - def compose_manager_cmd(protocol, srv_port, cli_port, num_replicas, release): cmd = [f"./target/{'release' if release else 'debug'}/summerset_manager"] cmd += [ diff --git a/scripts/workflow_test.py b/scripts/workflow_test.py index 33484aca..eb176a7f 100644 --- a/scripts/workflow_test.py +++ b/scripts/workflow_test.py @@ -1,5 +1,6 @@ import sys import os +import argparse import subprocess @@ -76,6 +77,12 @@ def run_tester_client(protocol, test_name): if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-p", "--protocol", type=str, required=True, help="protocol name" + ) + args = parser.parse_args() + do_cargo_build() kill_all_matching("local_client.py", force=True) @@ -85,6 +92,13 @@ def run_tester_client(protocol, test_name): kill_all_matching("summerset_manager", force=True) PROTOCOL = "MultiPaxos" + if args.protocol == "MultiPaxos": + pass + elif args.protocol == "Raft": + PROTOCOL = "Raft" + else: + raise ValueError(f"unrecognized protocol {args.protocol} to run workflow test") + NUM_REPLICAS = 3 TEST_NAME = "primitive_ops" TIMEOUT = 300 diff --git a/src/lib.rs b/src/lib.rs index 9e044072..feb47c90 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,4 +35,5 @@ pub use crate::protocols::SmrProtocol; pub use crate::protocols::{ReplicaConfigRepNothing, ClientConfigRepNothing}; pub use crate::protocols::{ReplicaConfigSimplePush, ClientConfigSimplePush}; pub use crate::protocols::{ReplicaConfigMultiPaxos, ClientConfigMultiPaxos}; +pub use crate::protocols::{ReplicaConfigRaft, ClientConfigRaft}; pub use crate::protocols::{ReplicaConfigRSPaxos, ClientConfigRSPaxos}; diff --git a/src/manager/clusman.rs b/src/manager/clusman.rs index a21ef9c7..daefadf0 100644 --- a/src/manager/clusman.rs +++ b/src/manager/clusman.rs @@ -186,19 +186,12 @@ impl ClusterManager { protocol); } - // tell it to connect to all existing known servers + // gather the list of all existing known servers let to_peers: HashMap = self .server_info .iter() .map(|(&server, info)| (server, info.p2p_addr)) .collect(); - self.server_reigner.send_ctrl( - 
CtrlMsg::ConnectToPeers { - population: self.population, - to_peers, - }, - server, - )?; // save new server's info self.server_info.insert( @@ -211,6 +204,16 @@ impl ClusterManager { start_slot: 0, }, ); + + // tell it to connect to all other existing known servers + self.server_reigner.send_ctrl( + CtrlMsg::ConnectToPeers { + population: self.population, + to_peers, + }, + server, + )?; + Ok(()) } @@ -406,9 +409,13 @@ impl ClusterManager { self.server_info.get_mut(&s).unwrap().is_paused = true; // wait for dummy reply - let (_, reply) = self.server_reigner.recv_ctrl().await?; - if reply != CtrlMsg::PauseReply { - return logged_err!("m"; "unexpected reply type received"); + loop { + let (server, reply) = self.server_reigner.recv_ctrl().await?; + if server != s || reply != CtrlMsg::PauseReply { + self.handle_ctrl_msg(server, reply).await?; + } else { + break; + } } pause_done.insert(s); @@ -442,9 +449,13 @@ impl ClusterManager { self.server_reigner.send_ctrl(CtrlMsg::Resume, s)?; // wait for dummy reply - let (_, reply) = self.server_reigner.recv_ctrl().await?; - if reply != CtrlMsg::ResumeReply { - return logged_err!("m"; "unexpected reply type received"); + loop { + let (server, reply) = self.server_reigner.recv_ctrl().await?; + if server != s || reply != CtrlMsg::ResumeReply { + self.handle_ctrl_msg(server, reply).await?; + } else { + break; + } } // clear the is_paused flag @@ -482,22 +493,27 @@ impl ClusterManager { self.server_reigner.send_ctrl(CtrlMsg::TakeSnapshot, s)?; // wait for reply - let (_, reply) = self.server_reigner.recv_ctrl().await?; - if let CtrlMsg::SnapshotUpTo { new_start } = reply { - // update the log start index info - assert!(self.server_info.contains_key(&s)); - if new_start < self.server_info[&s].start_slot { - return logged_err!("m"; "server {} snapshot up to {} < {}", - s, new_start, - self.server_info[&s].start_slot); - } else { - self.server_info.get_mut(&s).unwrap().start_slot = - new_start; - } + loop { + let (server, reply) = self.server_reigner.recv_ctrl().await?; + match reply { + CtrlMsg::SnapshotUpTo { new_start } if server == s => { + // update the log start index info + assert!(self.server_info.contains_key(&s)); + if new_start < self.server_info[&s].start_slot { + return logged_err!("m"; "server {} snapshot up to {} < {}", + s, new_start, + self.server_info[&s].start_slot); + } else { + self.server_info.get_mut(&s).unwrap().start_slot = + new_start; + } + + snapshot_up_to.insert(s, new_start); + break; + } - snapshot_up_to.insert(s, new_start); - } else { - return logged_err!("m"; "unexpected reply type received"); + _ => self.handle_ctrl_msg(server, reply).await?, + } } } diff --git a/src/manager/reigner.rs b/src/manager/reigner.rs index 41ae38ec..3be28cde 100644 --- a/src/manager/reigner.rs +++ b/src/manager/reigner.rs @@ -21,7 +21,7 @@ use tokio::task::JoinHandle; /// Control message from/to servers. Control traffic could be bidirectional: /// some initiated by the manager and some by servers. -// TODO: later add leader change, membership change, etc. +// TODO: later add membership/view change, link drop, etc. 
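// The cluster-manager changes above replace each single `recv_ctrl()` with a loop:
// while waiting for one server's PauseReply / ResumeReply / SnapshotUpTo, control
// messages that happen to arrive from other servers are dispatched to the normal
// handler instead of being treated as errors. A minimal standalone sketch of that
// wait-for-specific-reply pattern follows; `Msg` and `wait_for_pause_reply` are
// hypothetical illustrations, not this crate's API.
#[derive(Debug, PartialEq)]
enum Msg {
    PauseReply,
    Other(u32),
}

fn wait_for_pause_reply(
    s: u8,
    mut recv: impl FnMut() -> (u8, Msg),
    mut handle_other: impl FnMut(u8, Msg),
) {
    loop {
        let (server, reply) = recv();
        if server == s && reply == Msg::PauseReply {
            break; // the reply we were actually waiting for
        }
        handle_other(server, reply); // interleaved message: dispatch normally
    }
}

fn main() {
    let mut inbox = vec![(2u8, Msg::Other(7)), (1u8, Msg::PauseReply)].into_iter();
    let mut handled = Vec::new();
    wait_for_pause_reply(1, || inbox.next().unwrap(), |srv, m| handled.push((srv, m)));
    assert_eq!(handled, vec![(2u8, Msg::Other(7))]);
}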
#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)] pub enum CtrlMsg { /// Server -> Manager: new server up, requesting a list of peers' addresses diff --git a/src/protocols/mod.rs b/src/protocols/mod.rs index b7aaaf4f..a0d861e8 100644 --- a/src/protocols/mod.rs +++ b/src/protocols/mod.rs @@ -22,17 +22,27 @@ mod multipaxos; use multipaxos::{MultiPaxosReplica, MultiPaxosClient}; pub use multipaxos::{ReplicaConfigMultiPaxos, ClientConfigMultiPaxos}; +mod raft; +use raft::{RaftReplica, RaftClient}; +pub use raft::{ReplicaConfigRaft, ClientConfigRaft}; + mod rs_paxos; use rs_paxos::{RSPaxosReplica, RSPaxosClient}; pub use rs_paxos::{ReplicaConfigRSPaxos, ClientConfigRSPaxos}; +mod crossword; +use crossword::{CrosswordReplica, CrosswordClient}; +pub use crossword::{ReplicaConfigCrossword, ClientConfigCrossword}; + /// Enum of supported replication protocol types. #[derive(Debug, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] pub enum SmrProtocol { RepNothing, SimplePush, MultiPaxos, + Raft, RSPaxos, + Crossword, } /// Helper macro for saving boilder-plate `Box` mapping in @@ -51,7 +61,9 @@ impl SmrProtocol { "RepNothing" => Some(Self::RepNothing), "SimplePush" => Some(Self::SimplePush), "MultiPaxos" => Some(Self::MultiPaxos), + "Raft" => Some(Self::Raft), "RSPaxos" => Some(Self::RSPaxos), + "Crossword" => Some(Self::Crossword), _ => None, } } @@ -100,6 +112,14 @@ impl SmrProtocol { .await ) } + Self::Raft => { + box_if_ok!( + RaftReplica::new_and_setup( + api_addr, p2p_addr, manager, config_str + ) + .await + ) + } Self::RSPaxos => { box_if_ok!( RSPaxosReplica::new_and_setup( @@ -108,6 +128,14 @@ impl SmrProtocol { .await ) } + Self::Crossword => { + box_if_ok!( + CrosswordReplica::new_and_setup( + api_addr, p2p_addr, manager, config_str + ) + .await + ) + } } } @@ -133,6 +161,9 @@ impl SmrProtocol { MultiPaxosClient::new_and_setup(manager, config_str).await ) } + Self::Raft => { + box_if_ok!(RaftClient::new_and_setup(manager, config_str).await) + } Self::RSPaxos => { box_if_ok!( RSPaxosClient::new_and_setup(manager, config_str).await @@ -166,7 +197,9 @@ mod protocols_name_tests { valid_name_test!(RepNothing); valid_name_test!(SimplePush); valid_name_test!(MultiPaxos); + valid_name_test!(Raft); valid_name_test!(RSPaxos); + valid_name_test!(Crossword); } #[test] diff --git a/src/protocols/multipaxos.rs b/src/protocols/multipaxos.rs index 00e5f964..dfad698a 100644 --- a/src/protocols/multipaxos.rs +++ b/src/protocols/multipaxos.rs @@ -7,6 +7,7 @@ //! - //! - +use std::cmp; use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; @@ -35,8 +36,8 @@ use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] pub struct ReplicaConfigMultiPaxos { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -49,7 +50,6 @@ pub struct ReplicaConfigMultiPaxos { /// Min timeout of not hearing any heartbeat from leader in millisecs. pub hb_hear_timeout_min: u64, - /// Max timeout of not hearing any heartbeat from leader in millisecs. 
pub hb_hear_timeout_max: u64, @@ -74,7 +74,7 @@ pub struct ReplicaConfigMultiPaxos { impl Default for ReplicaConfigMultiPaxos { fn default() -> Self { ReplicaConfigMultiPaxos { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.multipaxos.wal".into(), logger_sync: false, @@ -154,12 +154,12 @@ struct Instance { external: bool, /// Offset of first durable WAL log entry related to this instance. - log_offset: usize, + wal_offset: usize, } -/// Stable storage log entry type. +/// Stable storage WAL log entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -enum LogEntry { +enum WalEntry { /// Records an update to the largest prepare ballot seen. PrepareBal { slot: usize, ballot: Ballot }, @@ -175,11 +175,20 @@ enum LogEntry { } /// Snapshot file entry type. +/// +/// NOTE: the current implementation simply appends a squashed log at the +/// end of the snapshot file for simplicity. In production, the snapshot +/// file should be a bounded-sized backend, e.g., an LSM-tree. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum SnapEntry { - /// First entry at the start of file: number of log instances covered by - /// this snapshot file == the start slot index of in-mem log. - StartSlot { slot: usize }, + /// Necessary slot indices to remember. + SlotInfo { + /// First entry at the start of file: number of log instances covered + /// by this snapshot file == the start slot index of in-mem log. + start_slot: usize, + /// Index of the first non-committed slot. + commit_bar: usize, + }, /// Set of key-value pairs to apply to the state. KVPairSet { pairs: HashMap }, @@ -189,7 +198,13 @@ enum SnapEntry { #[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { /// Prepare message from leader to replicas. - Prepare { slot: usize, ballot: Ballot }, + Prepare { + /// Slot index in Prepare message is the triggering slot of this + /// Prepare. Once prepared, it means that all slots in the range + /// [slot, +infinity) are prepared under this ballot number. + slot: usize, + ballot: Ballot, + }, /// Prepare reply from replica to leader. PrepareReply { @@ -213,8 +228,18 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + /// Request by a lagging replica to leader asking to re-send Accepts for + /// missing holes + FillHoles { slots: Vec }, + /// Leader activity heartbeat. - Heartbeat { ballot: Ballot, exec_bar: usize }, + Heartbeat { + ballot: Ballot, + /// For leader step-up as well as conservative snapshotting purpose. + exec_bar: usize, + /// For conservative snapshotting purpose. + snap_bar: usize, + }, } /// MultiPaxos server replica module. @@ -247,7 +272,10 @@ pub struct MultiPaxosReplica { state_machine: StateMachine, /// StorageHub module. - storage_hub: StorageHub, + storage_hub: StorageHub, + + /// StorageHub module for the snapshot file. + snapshot_hub: StorageHub, /// StorageHub module for the snapshot file. snapshot_hub: StorageHub, @@ -255,14 +283,21 @@ pub struct MultiPaxosReplica { /// TransportHub module. transport_hub: TransportHub, + /// Who do I think is the effective leader of the cluster right now? + leader: Option, + /// Timer for hearing heartbeat from leader. hb_hear_timer: Timer, /// Interval for sending heartbeat to followers. hb_send_interval: Interval, - /// Do I think I am the leader? - is_leader: bool, + /// Heartbeat reply counters for approximate detection of follower health. 
+ /// Tuple of (#hb_replied, #hb_replied seen at last send, repetition). + hb_reply_cnts: HashMap, + + /// Approximate health status tracking of peer replicas. + peer_alive: Bitmap, /// In-memory log of instances. insts: Vec, @@ -289,15 +324,33 @@ pub struct MultiPaxosReplica { /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, - /// Current durable log file offset. - log_offset: usize, + /// Map from peer ID -> its latest exec_bar I know; this is for conservative + /// snapshotting purpose. + peer_exec_bar: HashMap, + + /// Slot index before which it is safe to take snapshot. + /// NOTE: we are taking a conservative approach here that a snapshot + /// covering an entry can be taken only when all servers have durably + /// committed (and executed) that entry. + snap_bar: usize, + + /// Current durable WAL log file offset. + wal_offset: usize, /// Current durable snapshot file offset. snap_offset: usize, } +// MultiPaxosReplica common helpers impl MultiPaxosReplica { + /// Do I think I am the current effective leader? + #[inline] + fn is_leader(&self) -> bool { + self.leader == Some(self.id) + } + /// Create an empty null instance. + #[inline] fn null_instance(&self) -> Instance { Instance { bal: 0, @@ -307,22 +360,36 @@ impl MultiPaxosReplica { leader_bk: None, replica_bk: None, external: false, - log_offset: 0, + wal_offset: 0, } } + /// Locate the first null slot or append a null instance if no holes exist. + fn first_null_slot(&mut self) -> usize { + for s in self.commit_bar..(self.start_slot + self.insts.len()) { + if self.insts[s - self.start_slot].status == Status::Null { + return s; + } + } + self.insts.push(self.null_instance()); + self.start_slot + self.insts.len() - 1 + } + /// Compose a unique ballot number from base. + #[inline] fn make_unique_ballot(&self, base: u64) -> Ballot { ((base << 8) | ((self.id + 1) as u64)) as Ballot } /// Compose a unique ballot number greater than the given one. + #[inline] fn make_greater_ballot(&self, bal: Ballot) -> Ballot { self.make_unique_ballot((bal >> 8) + 1) } /// Compose LogActionId from slot index & entry type. /// Uses the `Status` enum type to represent differnet entry types. + #[inline] fn make_log_action_id(slot: usize, entry_type: Status) -> LogActionId { let type_num = match entry_type { Status::Preparing => 1, @@ -334,6 +401,7 @@ impl MultiPaxosReplica { } /// Decompose LogActionId into slot index & entry type. + #[inline] fn split_log_action_id(log_action_id: LogActionId) -> (usize, Status) { let slot = (log_action_id >> 2) as usize; let type_num = log_action_id & ((1 << 2) - 1); @@ -347,6 +415,7 @@ impl MultiPaxosReplica { } /// Compose CommandId from slot index & command index within. + #[inline] fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { assert!(slot <= (u32::MAX as usize)); assert!(cmd_idx <= (u32::MAX as usize)); @@ -354,12 +423,16 @@ impl MultiPaxosReplica { } /// Decompose CommandId into slot index & command index within. + #[inline] fn split_command_id(command_id: CommandId) -> (usize, usize) { let slot = (command_id >> 32) as usize; let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; (slot, cmd_idx) } +} +// MultiPaxosReplica client requests entrance +impl MultiPaxosReplica { /// Handler of client request batch chan recv. 
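// A standalone sketch of the ballot bit-packing used by make_unique_ballot /
// make_greater_ballot above, assuming Ballot is a u64 alias and at most 255
// replicas (an assumption of this sketch, not a statement about the crate):
// the round number lives in the high bits and the proposer ID + 1 in the low
// byte, so ballots from different replicas never collide and compare first by
// round, then by proposer.
type Ballot = u64;

fn make_unique_ballot(id: u8, base: u64) -> Ballot {
    (base << 8) | ((id + 1) as u64)
}

fn make_greater_ballot(id: u8, bal: Ballot) -> Ballot {
    make_unique_ballot(id, (bal >> 8) + 1)
}

fn main() {
    let b0 = make_unique_ballot(0, 7);   // round 7 proposed by replica 0
    let b2 = make_greater_ballot(2, b0); // replica 2 outbids with round 8
    assert!(b2 > b0);
    assert_eq!(b2 >> 8, 8);   // round number in the high bits
    assert_eq!(b2 & 0xff, 3); // proposer id + 1 in the low byte
    println!("b0 = {:#x}, b2 = {:#x}", b0, b2);
}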
fn handle_req_batch( &mut self, @@ -370,52 +443,44 @@ impl MultiPaxosReplica { pf_debug!(self.id; "got request batch of size {}", batch_size); // if I'm not a leader, ignore client requests - if !self.is_leader { + if !self.is_leader() { for (client, req) in req_batch { if let ApiRequest::Req { id: req_id, .. } = req { - // tell the client to try on the next replica - let next_replica = (self.id + 1) % self.population; + // tell the client to try on known leader or just the + // next ID replica + let target = if let Some(peer) = self.leader { + peer + } else { + (self.id + 1) % self.population + }; self.external_api.send_reply( ApiReply::Reply { id: req_id, result: None, - redirect: Some(next_replica), + redirect: Some(target), }, client, )?; pf_trace!(self.id; "redirected client {} to replica {}", - client, next_replica); + client, target); } } return Ok(()); } // create a new instance in the first null slot (or append a new one - // at the end if no holes exist) - let mut slot = self.start_slot + self.insts.len(); - for s in self.commit_bar..(self.start_slot + self.insts.len()) { - let old_inst = &mut self.insts[s - self.start_slot]; - if old_inst.status == Status::Null { - old_inst.reqs = req_batch.clone(); - old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), - }); - slot = s; - break; - } - } - if slot == self.start_slot + self.insts.len() { - let mut new_inst = self.null_instance(); - new_inst.reqs = req_batch.clone(); - new_inst.leader_bk = Some(LeaderBookkeeping { + // at the end if no holes exist); fill it up with incoming data + let slot = self.first_null_slot(); + { + let inst = &mut self.insts[slot - self.start_slot]; + assert_eq!(inst.status, Status::Null); + inst.reqs = req_batch.clone(); + inst.leader_bk = Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, accept_acks: Bitmap::new(self.population, false), }); - new_inst.external = true; - self.insts.push(new_inst); + inst.external = true; } // decide whether we can enter fast path for this instance @@ -439,7 +504,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -472,7 +537,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot: inst.bal, reqs: req_batch.clone(), @@ -498,7 +563,10 @@ impl MultiPaxosReplica { Ok(()) } +} +// MultiPaxosReplica durable WAL logging +impl MultiPaxosReplica { /// Handler of PrepareBal logging result chan recv. 
fn handle_logged_prepare_bal( &mut self, @@ -516,7 +584,7 @@ impl MultiPaxosReplica { None }; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of a PrepareBal entry // is equivalent to receiving a Prepare reply from myself // (as an acceptor role) @@ -553,7 +621,7 @@ impl MultiPaxosReplica { slot, self.insts[slot - self.start_slot].bal); let inst = &self.insts[slot - self.start_slot]; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of an AcceptData entry // is equivalent to receiving an Accept reply from myself // (as an acceptor role) @@ -619,6 +687,21 @@ impl MultiPaxosReplica { } } + // if there are hole(s) between current commit_bar and newly committed + // slot, ask the leader to re-send Accept messages for those slots + if slot > self.commit_bar && !self.is_leader() { + if let Some(leader) = self.leader { + let holes: Vec = (self.commit_bar..slot).collect(); + self.transport_hub.send_msg( + PeerMsg::FillHoles { + slots: holes.clone(), + }, + leader, + )?; + pf_trace!(self.id; "sent FillHoles -> {} slots {:?}", leader, holes); + } + } + Ok(()) } @@ -626,7 +709,7 @@ impl MultiPaxosReplica { fn handle_log_result( &mut self, action_id: LogActionId, - log_result: LogResult, + log_result: LogResult, ) -> Result<(), SummersetError> { let (slot, entry_type) = Self::split_log_action_id(action_id); if slot < self.start_slot { @@ -635,15 +718,15 @@ impl MultiPaxosReplica { assert!(slot < self.start_slot + self.insts.len()); if let LogResult::Append { now_size } = log_result { - assert!(now_size >= self.log_offset); - // update first log_offset of slot + assert!(now_size >= self.wal_offset); + // update first wal_offset of slot let inst = &mut self.insts[slot - self.start_slot]; - if inst.log_offset == 0 || inst.log_offset > self.log_offset { - inst.log_offset = self.log_offset; + if inst.wal_offset == 0 || inst.wal_offset > self.wal_offset { + inst.wal_offset = self.wal_offset; } - assert!(inst.log_offset <= self.log_offset); - // then update self.log_offset - self.log_offset = now_size; + assert!(inst.wal_offset <= self.wal_offset); + // then update self.wal_offset + self.wal_offset = now_size; } else { return logged_err!(self.id; "unexpected log result type: {:?}", log_result); } @@ -657,7 +740,10 @@ impl MultiPaxosReplica { } } } +} +// MultiPaxosReplica peer-peer messages handling +impl MultiPaxosReplica { /// Handler of Prepare message from leader. 
fn handle_msg_prepare( &mut self, @@ -691,7 +777,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { slot, ballot }, + entry: WalEntry::PrepareBal { slot, ballot }, sync: self.config.logger_sync, }, )?; @@ -719,10 +805,11 @@ impl MultiPaxosReplica { // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Preparing) || (ballot < inst.bal) { @@ -761,7 +848,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot, reqs: inst.reqs.clone(), @@ -825,7 +912,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { slot, ballot, reqs }, + entry: WalEntry::AcceptData { slot, ballot, reqs }, sync: self.config.logger_sync, }, )?; @@ -852,10 +939,11 @@ impl MultiPaxosReplica { // if ballot is what I'm currently waiting on for Accept replies: if ballot == self.bal_prepared { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Accepting) || (ballot < inst.bal) { @@ -882,7 +970,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { slot }, sync: self.config.logger_sync, }, )?; @@ -931,7 +1019,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { slot }, sync: self.config.logger_sync, }, )?; @@ -941,6 +1029,43 @@ impl MultiPaxosReplica { Ok(()) } + /// Handler of FillHoles message from a lagging peer. + fn handle_msg_fill_holes( + &mut self, + peer: ReplicaId, + slots: Vec, + ) -> Result<(), SummersetError> { + if !self.is_leader() { + return Ok(()); + } + pf_trace!(self.id; "received FillHoles <- {} for slots {:?}", peer, slots); + + for slot in slots { + if slot < self.start_slot { + continue; + } else if slot >= self.start_slot + self.insts.len() { + break; + } + let inst = &self.insts[slot - self.start_slot]; + + if inst.status >= Status::Committed { + // re-send Accept message for this slot + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot: self.bal_prepared, + reqs: inst.reqs.clone(), + }, + peer, + )?; + pf_trace!(self.id; "sent Accept -> {} for slot {} bal {}", + peer, slot, self.bal_prepared); + } + } + + Ok(()) + } + /// Synthesized handler of receiving message from peer. 
fn handle_msg_recv( &mut self, @@ -963,12 +1088,20 @@ impl MultiPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Heartbeat { ballot, exec_bar } => { - self.heard_heartbeat(peer, ballot, exec_bar) + PeerMsg::FillHoles { slots } => { + self.handle_msg_fill_holes(peer, slots) } + PeerMsg::Heartbeat { + ballot, + exec_bar, + snap_bar, + } => self.heard_heartbeat(peer, ballot, exec_bar, snap_bar), } } +} +// MultiPaxosReplica state machine execution +impl MultiPaxosReplica { /// Handler of state machine exec result chan recv. fn handle_cmd_result( &mut self, @@ -1026,22 +1159,39 @@ impl MultiPaxosReplica { Ok(()) } +} +// MultiPaxosReplica leadership related logic +impl MultiPaxosReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { - if self.is_leader { + if self.is_leader() { return Ok(()); + } else if let Some(peer) = self.leader { + // mark old leader as dead + if self.peer_alive.get(peer)? { + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } } - self.is_leader = true; // this starts broadcasting heartbeats + self.leader = Some(self.id); // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; pf_info!(self.id; "becoming a leader..."); - // broadcast a heartbeat right now + // clear peers' heartbeat reply counters, and broadcast a heartbeat now + for cnts in self.hb_reply_cnts.values_mut() { + *cnts = (1, 0, 0); + } self.bcast_heartbeats()?; + // re-initialize peer_exec_bar information + for slot in self.peer_exec_bar.values_mut() { + *slot = 0; + } + // make a greater ballot number and invalidate all in-progress instances self.bal_prepared = 0; self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); @@ -1069,7 +1219,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -1091,7 +1241,6 @@ impl MultiPaxosReplica { slot, inst.bal); } } - Ok(()) } @@ -1101,10 +1250,43 @@ impl MultiPaxosReplica { PeerMsg::Heartbeat { ballot: self.bal_prep_sent, exec_bar: self.exec_bar, + snap_bar: self.snap_bar, }, None, )?; - self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; + + // update max heartbeat reply counters and their repetitions seen + for (&peer, cnts) in self.hb_reply_cnts.iter_mut() { + if cnts.0 > cnts.1 { + // more hb replies have been received from this peer; it is + // probably alive + cnts.1 = cnts.0; + cnts.2 = 0; + } else { + // did not receive hb reply from this peer at least for the + // last sent hb from me; increment repetition count + cnts.2 += 1; + let repeat_threshold = (self.config.hb_hear_timeout_min + / self.config.hb_send_interval_ms) + as u8; + if cnts.2 > repeat_threshold { + // did not receive hb reply from this peer for too many + // past hbs sent from me; this peer is probably dead + if self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + } + } + + // I also heard this heartbeat from myself + self.heard_heartbeat( + self.id, + self.bal_prep_sent, + self.exec_bar, + self.snap_bar, + )?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); Ok(()) @@ -1113,6 +1295,8 @@ impl MultiPaxosReplica { /// Chooses a random hb_hear_timeout from the min-max range and kicks off /// the hb_hear_timer. fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + self.hb_hear_timer.cancel()?; + let timeout_ms = thread_rng().gen_range( self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, ); @@ -1128,10 +1312,19 @@ impl MultiPaxosReplica { /// leader status if I currently think I'm a leader. fn heard_heartbeat( &mut self, - _peer: ReplicaId, + peer: ReplicaId, ballot: Ballot, exec_bar: usize, + snap_bar: usize, ) -> Result<(), SummersetError> { + if peer != self.id { + self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; + if !self.peer_alive.get(peer)? { + self.peer_alive.set(peer, true)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + // ignore outdated heartbeats and those from peers with exec_bar < mine if ballot < self.bal_max_seen || exec_bar < self.exec_bar { return Ok(()); @@ -1140,18 +1333,61 @@ impl MultiPaxosReplica { // reset hearing timer self.kickoff_hb_hear_timer()?; - // clear my leader status if it carries a higher ballot number - if self.is_leader && ballot > self.bal_max_seen { - self.is_leader = false; - self.control_hub - .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; - pf_info!(self.id; "no longer a leader..."); + if peer != self.id { + // reply back with a Heartbeat message + self.transport_hub.send_msg( + PeerMsg::Heartbeat { + ballot, + exec_bar: self.exec_bar, + snap_bar: self.snap_bar, + }, + peer, + )?; + + // update peer_exec_bar if larger then known; if all servers' + // exec_bar (including myself) have passed a slot, that slot + // is definitely safe to be snapshotted + if exec_bar > self.peer_exec_bar[&peer] { + *self.peer_exec_bar.get_mut(&peer).unwrap() = exec_bar; + let passed_cnt = 1 + self + .peer_exec_bar + .values() + .filter(|&&e| e >= exec_bar) + .count() as u8; + if passed_cnt == self.population { + // all servers have executed up to exec_bar + self.snap_bar = exec_bar; + } + } + + // if the peer has made a higher ballot number + if ballot > self.bal_max_seen { + self.bal_max_seen = ballot; + + // clear my leader status if I was one + if self.is_leader() { + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; + pf_info!(self.id; "no longer a leader..."); + } + + // set this peer to be the believed leader + self.leader = Some(peer); + } + } + + // if snap_bar is larger than mine, update snap_bar + if snap_bar > self.snap_bar { + self.snap_bar = snap_bar; } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); Ok(()) } +} +// MultiPaxosReplica control messages handling +impl MultiPaxosReplica { /// Handler of ResetState control message. async fn handle_ctrl_reset_state( &mut self, @@ -1263,14 +1499,20 @@ impl MultiPaxosReplica { _ => Ok(None), // ignore all other types } } +} +// MultiPaxosReplica recovery from WAL log +impl MultiPaxosReplica { /// Apply a durable storage log entry for recovery. 
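// A minimal standalone sketch (hypothetical helper, not the crate's API) of the
// hb_reply_cnts bookkeeping introduced above: each heartbeat broadcast checks
// whether new replies arrived from a peer since the previous broadcast, and a
// peer that stays silent for more consecutive rounds than the hearing timeout
// allows is presumed dead.
fn still_alive(cnts: &mut (u64, u64, u8), repeat_threshold: u8) -> bool {
    if cnts.0 > cnts.1 {
        cnts.1 = cnts.0; // new replies seen since the last send: looks alive
        cnts.2 = 0;
        true
    } else {
        cnts.2 += 1; // yet another send without any new reply
        cnts.2 <= repeat_threshold
    }
}

fn main() {
    // e.g. hb_hear_timeout_min = 600 ms, hb_send_interval_ms = 50 ms
    let repeat_threshold = (600u64 / 50) as u8;
    let mut cnts = (1u64, 0u64, 0u8); // (#replied, #replied at last send, repeats)
    assert!(still_alive(&mut cnts, repeat_threshold)); // replied once: alive
    let mut alive = true;
    for _ in 0..=repeat_threshold {
        alive = still_alive(&mut cnts, repeat_threshold); // silence from now on
    }
    assert!(!alive); // missed too many rounds in a row: presumed dead
}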
async fn recover_apply_entry( &mut self, - entry: LogEntry, + entry: WalEntry, ) -> Result<(), SummersetError> { match entry { - LogEntry::PrepareBal { slot, ballot } => { + WalEntry::PrepareBal { slot, ballot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); @@ -1289,7 +1531,10 @@ impl MultiPaxosReplica { self.bal_prepared = 0; } - LogEntry::AcceptData { slot, ballot, reqs } => { + WalEntry::AcceptData { slot, ballot, reqs } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()); @@ -1315,9 +1560,12 @@ impl MultiPaxosReplica { assert!(self.bal_prepared <= self.bal_prep_sent); } - LogEntry::CommitSlot { slot } => { + WalEntry::CommitSlot { slot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } assert!(slot < self.start_slot + self.insts.len()); - // update instance state + // update instance status self.insts[slot - self.start_slot].status = Status::Committed; // submit commands in contiguously committed instance to the // state machine @@ -1337,9 +1585,10 @@ impl MultiPaxosReplica { let _ = self.state_machine.get_result().await?; } } - // update commit_bar and exec_bar + // update instance status, commit_bar and exec_bar self.commit_bar += 1; self.exec_bar += 1; + inst.status = Status::Executed; } } } @@ -1348,15 +1597,15 @@ impl MultiPaxosReplica { Ok(()) } - /// Recover state from durable storage log. - async fn recover_from_log(&mut self) -> Result<(), SummersetError> { - assert_eq!(self.log_offset, 0); + /// Recover state from durable storage WAL log. + async fn recover_from_wal(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.wal_offset, 0); loop { // using 0 as a special log action ID self.storage_hub.submit_action( 0, LogAction::Read { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -1368,7 +1617,7 @@ impl MultiPaxosReplica { } => { self.recover_apply_entry(entry).await?; // update log offset - self.log_offset = end_offset; + self.wal_offset = end_offset; } LogResult::Read { entry: None, .. } => { // end of log reached @@ -1384,7 +1633,7 @@ impl MultiPaxosReplica { self.storage_hub.submit_action( 0, LogAction::Truncate { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -1392,17 +1641,27 @@ impl MultiPaxosReplica { offset_ok: true, .. } = log_result { + if self.wal_offset > 0 { + pf_info!(self.id; "recovered from wal log: commit {} exec {}", + self.commit_bar, self.exec_bar); + } Ok(()) } else { logged_err!(self.id; "unexpected log result type or failed truncate") } } +} - /// Dump a new key-value pair to snapshot file. - async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { +// MultiPaxosReplica snapshotting & GC logic +impl MultiPaxosReplica { + /// Dump new key-value pairs to snapshot file. 
+ async fn snapshot_dump_kv_pairs( + &mut self, + new_start_slot: usize, + ) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); - for slot in self.start_slot..self.exec_bar { + for slot in self.start_slot..new_start_slot { let inst = &self.insts[slot - self.start_slot]; for (_, req) in inst.reqs.clone() { if let ApiRequest::Req { @@ -1438,15 +1697,20 @@ impl MultiPaxosReplica { /// Discard everything older than start_slot in durable WAL log. async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { let cut_offset = if !self.insts.is_empty() { - self.insts[0].log_offset + self.insts[0].wal_offset } else { - self.log_offset + self.wal_offset }; // discard the log before cut_offset if cut_offset > 0 { - self.storage_hub - .submit_action(0, LogAction::Discard { offset: cut_offset })?; + self.storage_hub.submit_action( + 0, + LogAction::Discard { + offset: cut_offset, + keep: 0, + }, + )?; loop { let (action_id, log_result) = self.storage_hub.get_result().await?; @@ -1459,8 +1723,8 @@ impl MultiPaxosReplica { now_size, } = log_result { - assert_eq!(self.log_offset - cut_offset, now_size); - self.log_offset = now_size; + assert_eq!(self.wal_offset - cut_offset, now_size); + self.wal_offset = now_size; } else { return logged_err!( self.id; @@ -1472,43 +1736,74 @@ impl MultiPaxosReplica { } } - // update inst.log_offset for all remaining in-mem instances + // update inst.wal_offset for all remaining in-mem instances for inst in &mut self.insts { - if inst.log_offset > 0 { - assert!(inst.log_offset >= cut_offset); - inst.log_offset -= cut_offset; + if inst.wal_offset > 0 { + assert!(inst.wal_offset >= cut_offset); + inst.wal_offset -= cut_offset; } } Ok(()) } - /// Take a snapshot up to current exec_idx, then discard the in-mem log up + /// Take a snapshot up to current exec_bar, then discard the in-mem log up /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the - /// middle of taking a snapshot. + /// middle of taking a snapshot. Production quality implementations should + /// make the snapshotting action "atomic". + /// + /// NOTE: the current implementation does not take care of InstallSnapshot + /// messages (which is needed when some lagging follower has some slot + /// which all other peers have snapshotted); we assume here that failed + /// Accept messages will be retried indefinitely until success before its + /// associated data gets discarded from leader's memory. 
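// A minimal standalone sketch (hypothetical helper, not the crate's API) of the
// conservative snapshot bound maintained via peer_exec_bar / snap_bar above: an
// entry may be snapshotted and garbage-collected only once every replica's
// exec_bar has passed it, so the safe bound is the minimum exec_bar across the
// cluster. take_new_snapshot below additionally caps the new start slot at
// min(snap_bar, exec_bar).
use std::collections::HashMap;

fn safe_snap_bar(my_exec_bar: usize, peer_exec_bar: &HashMap<u8, usize>) -> usize {
    peer_exec_bar
        .values()
        .copied()
        .chain(std::iter::once(my_exec_bar))
        .min()
        .unwrap_or(my_exec_bar)
}

fn main() {
    let peers = HashMap::from([(1u8, 5usize), (2u8, 3usize)]);
    assert_eq!(safe_snap_bar(7, &peers), 3); // slots below 3 are safe to snapshot
}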
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { - pf_debug!(self.id; "taking new snapshot: start {} exec {}", - self.start_slot, self.exec_bar); + pf_debug!(self.id; "taking new snapshot: start {} exec {} snap {}", + self.start_slot, self.exec_bar, self.snap_bar); assert!(self.exec_bar >= self.start_slot); - if self.exec_bar == self.start_slot { + + let new_start_slot = cmp::min(self.snap_bar, self.exec_bar); + if new_start_slot == self.start_slot { return Ok(()); } // collect and dump all Puts in executed instances - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } - self.snapshot_dump_kv_pairs().await?; + self.snapshot_dump_kv_pairs(new_start_slot).await?; + + // write new slot info entry to the head of snapshot + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::SlotInfo { + start_slot: new_start_slot, + commit_bar: self.commit_bar, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + match log_result { + LogResult::Write { + offset_ok: true, .. + } => {} + _ => { + return logged_err!(self.id; "unexpected log result type or failed write"); + } + } // update start_slot and discard all in-memory log instances up to exec_bar - self.insts.drain(0..(self.exec_bar - self.start_slot)); - self.start_slot = self.exec_bar; + self.insts.drain(0..(new_start_slot - self.start_slot)); + self.start_slot = new_start_slot; // discarding everything older than start_slot in WAL log - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } @@ -1533,11 +1828,20 @@ impl MultiPaxosReplica { match log_result { LogResult::Read { - entry: Some(SnapEntry::StartSlot { slot }), + entry: + Some(SnapEntry::SlotInfo { + start_slot, + commit_bar, + }), end_offset, } => { self.snap_offset = end_offset; - self.start_slot = slot; // get start slot index of in-mem log + + // recover necessary slot indices info + self.start_slot = start_slot; + self.commit_bar = commit_bar; + self.exec_bar = start_slot; + self.snap_bar = start_slot; // repeatedly apply key-value pairs loop { @@ -1580,6 +1884,11 @@ impl MultiPaxosReplica { self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { new_start: self.start_slot, })?; + + if self.start_slot > 0 { + pf_info!(self.id; "recovered from snapshot: start {} commit {} exec {}", + self.start_slot, self.commit_bar, self.exec_bar); + } Ok(()) } @@ -1588,7 +1897,10 @@ impl MultiPaxosReplica { self.snapshot_hub.submit_action( 0, LogAction::Write { - entry: SnapEntry::StartSlot { slot: 0 }, + entry: SnapEntry::SlotInfo { + start_slot: 0, + commit_bar: 0, + }, offset: 0, sync: self.config.logger_sync, }, @@ -1602,7 +1914,7 @@ impl MultiPaxosReplica { self.snap_offset = now_size; Ok(()) } else { - logged_err!(self.id; "unexpected log result type or failed truncate") + logged_err!(self.id; "unexpected log result type or failed write") } } @@ -1628,18 +1940,39 @@ impl GenericReplica for MultiPaxosReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigMultiPaxos; - batch_interval_us, max_batch_size, + batch_interval_ms, max_batch_size, backer_path, logger_sync, hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, snapshot_path, snapshot_interval_s, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; - if config.batch_interval_us == 0 { + if config.batch_interval_ms == 
0 { + return logged_err!( + id; + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms + ); + } + if config.hb_hear_timeout_min < 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_min '{}'", + config.hb_hear_timeout_min + ); + } + if config.hb_hear_timeout_max < config.hb_hear_timeout_min + 100 { return logged_err!( id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us + "invalid config.hb_hear_timeout_max '{}'", + config.hb_hear_timeout_max + ); + } + if config.hb_send_interval_ms == 0 { + return logged_err!( + id; + "invalid config.hb_send_interval_ms '{}'", + config.hb_send_interval_ms ); } if config.hb_hear_timeout_min < 100 { @@ -1728,7 +2061,7 @@ impl GenericReplica for MultiPaxosReplica { let external_api = ExternalApi::new_and_setup( id, api_addr, - Duration::from_micros(config.batch_interval_us), + Duration::from_millis(config.batch_interval_ms), config.max_batch_size, ) .await?; @@ -1746,6 +2079,10 @@ impl GenericReplica for MultiPaxosReplica { )); snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let hb_reply_cnts = (0..population) + .filter_map(|p| if p == id { None } else { Some((p, (1, 0, 0))) }) + .collect(); + Ok(MultiPaxosReplica { id, population, @@ -1759,9 +2096,11 @@ impl GenericReplica for MultiPaxosReplica { storage_hub, snapshot_hub, transport_hub, + leader: None, hb_hear_timer: Timer::new(), hb_send_interval, - is_leader: false, + hb_reply_cnts, + peer_alive: Bitmap::new(population, true), insts: vec![], start_slot: 0, snapshot_interval, @@ -1770,7 +2109,11 @@ impl GenericReplica for MultiPaxosReplica { bal_max_seen: 0, commit_bar: 0, exec_bar: 0, - log_offset: 0, + peer_exec_bar: (0..population) + .filter_map(|s| if s == id { None } else { Some((s, 0)) }) + .collect(), + snap_bar: 0, + wal_offset: 0, snap_offset: 0, }) } @@ -1782,8 +2125,8 @@ impl GenericReplica for MultiPaxosReplica { // recover state from durable snapshot file self.recover_from_snapshot().await?; - // recover the tail-piece memory log & state from durable storage log - self.recover_from_log().await?; + // recover the tail-piece memory log & state from durable WAL log + self.recover_from_wal().await?; // kick off leader activity hearing timer self.kickoff_hb_hear_timer()?; @@ -1849,7 +2192,7 @@ impl GenericReplica for MultiPaxosReplica { }, // leader sending heartbeat - _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { + _ = self.hb_send_interval.tick(), if !paused && self.is_leader() => { if let Err(e) = self.bcast_heartbeats() { pf_error!(self.id; "error broadcasting heartbeats: {}", e); } @@ -1857,7 +2200,7 @@ impl GenericReplica for MultiPaxosReplica { // autonomous snapshot taking timeout _ = self.snapshot_interval.tick(), if !paused - && self.config.snapshot_interval_s > 0 => { + && self.config.snapshot_interval_s > 0 => { if let Err(e) = self.take_new_snapshot().await { pf_error!(self.id; "error taking a new snapshot: {}", e); } else { diff --git a/src/protocols/raft.rs b/src/protocols/raft.rs new file mode 100644 index 00000000..4ffc04f5 --- /dev/null +++ b/src/protocols/raft.rs @@ -0,0 +1,2222 @@ +//! Replication protocol: Raft. +//! +//! ATC '14 version of Raft. References: +//! - +//! - +//! 
- + +use std::cmp; +use std::collections::{HashMap, HashSet}; +use std::path::Path; +use std::net::SocketAddr; + +use crate::utils::{SummersetError, Bitmap, Timer}; +use crate::manager::{CtrlMsg, CtrlRequest, CtrlReply}; +use crate::server::{ + ReplicaId, ControlHub, StateMachine, Command, CommandResult, CommandId, + ExternalApi, ApiRequest, ApiReply, StorageHub, LogAction, LogResult, + LogActionId, TransportHub, GenericReplica, +}; +use crate::client::{ClientId, ClientApiStub, ClientCtrlStub, GenericEndpoint}; +use crate::protocols::SmrProtocol; + +use rand::prelude::*; + +use async_trait::async_trait; + +use get_size::GetSize; + +use serde::{Serialize, Deserialize}; + +use tokio::time::{self, Duration, Interval, MissedTickBehavior}; +use tokio::sync::watch; + +/// Configuration parameters struct. +#[derive(Debug, Deserialize)] +pub struct ReplicaConfigRaft { + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, + + /// Client request batching maximum batch size. + pub max_batch_size: usize, + + /// Path to backing log file. + pub backer_path: String, + + /// Whether to call `fsync()`/`fdatasync()` on logger. + pub logger_sync: bool, + + /// Min timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_min: u64, + /// Max timeout of not hearing any heartbeat from leader in millisecs. + pub hb_hear_timeout_max: u64, + + /// Interval of leader sending AppendEntries heartbeats to followers. + pub hb_send_interval_ms: u64, + + /// Path to snapshot file. + pub snapshot_path: String, + + /// Snapshot self-triggering interval in secs. 0 means never trigger + /// snapshotting autonomously. + pub snapshot_interval_s: u64, + + // Performance simulation params (all zeros means no perf simulation): + pub perf_storage_a: u64, + pub perf_storage_b: u64, + pub perf_network_a: u64, + pub perf_network_b: u64, +} + +#[allow(clippy::derivable_impls)] +impl Default for ReplicaConfigRaft { + fn default() -> Self { + ReplicaConfigRaft { + batch_interval_ms: 10, + max_batch_size: 5000, + backer_path: "/tmp/summerset.raft.wal".into(), + logger_sync: false, + hb_hear_timeout_min: 600, + hb_hear_timeout_max: 900, + hb_send_interval_ms: 50, + snapshot_path: "/tmp/summerset.raft.snap".into(), + snapshot_interval_s: 0, + perf_storage_a: 0, + perf_storage_b: 0, + perf_network_a: 0, + perf_network_b: 0, + } + } +} + +/// Term number type, defined for better code readability. +type Term = u64; + +/// Request batch type (i.e., the "command" in an entry). +/// +/// NOTE: the originally presented Raft algorithm does not explicitly mention +/// batching, but instead hides it with the heartbeats: every AppendEntries RPC +/// from the leader basically batches all commands it has received since the +/// last sent heartbeat. Here, to make this implementation more comparable to +/// MultiPaxos, we trigger batching also explicitly. +type ReqBatch = Vec<(ClientId, ApiRequest)>; + +/// In-mem + persistent entry of log, containing a term and a commands batch. +#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] +struct LogEntry { + /// Term number. + term: Term, + + /// Batch of client requests. + reqs: ReqBatch, + + /// True if from external client, else false. + external: bool, + + /// Offset in durable log file of this entry. This field is not maintained + /// in durable storage itself, where it is typically 0. It is maintained + /// only in the in-memory log. + log_offset: usize, +} + +/// Stable storage log entry type. 
+///
+/// NOTE: Raft makes the persistent log exactly mirror the in-memory log, so
+/// the backer file is not a WAL log in runtime operation; it might get
+/// overwritten, etc.
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)]
+enum DurEntry {
+    /// Durable metadata.
+    Metadata {
+        curr_term: Term,
+        voted_for: Option<ReplicaId>,
+    },
+
+    /// Log entry mirroring in-mem log.
+    LogEntry { entry: LogEntry },
+}
+
+/// Snapshot file entry type.
+#[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)]
+enum SnapEntry {
+    /// Necessary slot indices to remember.
+    SlotInfo {
+        /// First entry at the start of file: number of log entries covered
+        /// by this snapshot file == the start slot index of remaining log.
+        start_slot: usize,
+    },
+
+    /// Set of key-value pairs to apply to the state.
+    KVPairSet { pairs: HashMap<String, String> },
+}
+
+/// Peer-peer message type.
+#[derive(Debug, Clone, Serialize, Deserialize, GetSize)]
+enum PeerMsg {
+    /// AppendEntries from leader to followers.
+    AppendEntries {
+        term: Term,
+        prev_slot: usize,
+        prev_term: Term,
+        entries: Vec<LogEntry>,
+        leader_commit: usize,
+        /// For conservative snapshotting purpose.
+        last_snap: usize,
+    },
+
+    /// AppendEntries reply from follower to leader.
+    AppendEntriesReply {
+        term: Term,
+        /// For correct tracking of which AppendEntries this reply is for.
+        end_slot: usize,
+        success: bool,
+    },
+
+    /// RequestVote from candidate to peers.
+    RequestVote {
+        term: Term,
+        last_slot: usize,
+        last_term: Term,
+    },
+
+    /// RequestVote reply from peer to candidate.
+    RequestVoteReply { term: Term, granted: bool },
+}
+
+/// Replica role type.
+#[derive(
+    Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Serialize, Deserialize,
+)]
+enum Role {
+    Follower,
+    Candidate,
+    Leader,
+}
+
+/// Raft server replica module.
+pub struct RaftReplica {
+    /// Replica ID in cluster.
+    id: ReplicaId,
+
+    /// Total number of replicas in cluster.
+    population: u8,
+
+    /// Majority quorum size.
+    quorum_cnt: u8,
+
+    /// Configuration parameters struct.
+    config: ReplicaConfigRaft,
+
+    /// Address string for client requests API.
+    _api_addr: SocketAddr,
+
+    /// Address string for internal peer-peer communication.
+    _p2p_addr: SocketAddr,
+
+    /// ControlHub module.
+    control_hub: ControlHub,
+
+    /// ExternalApi module.
+    external_api: ExternalApi,
+
+    /// StateMachine module.
+    state_machine: StateMachine,
+
+    /// StorageHub module.
+    storage_hub: StorageHub<DurEntry>,
+
+    /// StorageHub module for the snapshot file.
+    snapshot_hub: StorageHub<SnapEntry>,
+
+    /// TransportHub module.
+    transport_hub: TransportHub<PeerMsg>,
+
+    /// Which role am I in right now?
+    role: Role,
+
+    /// Who do I think is the effective leader of the cluster right now?
+    leader: Option<ReplicaId>,
+
+    /// Timer for hearing heartbeat from leader.
+    hb_hear_timer: Timer,
+
+    /// Interval for sending heartbeat to followers.
+    hb_send_interval: Interval,
+
+    /// Heartbeat reply counters for approximate detection of follower health.
+    /// Tuple of (#hb_replied, #hb_replied seen at last send, repetition).
+    hb_reply_cnts: HashMap<ReplicaId, (u64, u64, u8)>,
+
+    /// Approximate health status tracking of peer replicas.
+    peer_alive: Bitmap,
+
+    /// Latest term seen.
+    curr_term: Term,
+
+    /// Candidate ID that I voted for in current term.
+    voted_for: Option<ReplicaId>,
+
+    /// Replica IDs that voted for me in current election.
+    votes_granted: HashSet<ReplicaId>,
+
+    /// In-memory log of entries. Slot 0 is a dummy entry to make indexing happy.
+    log: Vec<LogEntry>,
+
+    /// Start slot index of in-mem log after latest snapshot.
+ start_slot: usize, + + /// Timer for taking a new autonomous snapshot. + snapshot_interval: Interval, + + /// Slot index of highest log entry known to be committed. + last_commit: usize, + + /// Slot index of highest log entry applied to state machine. + last_exec: usize, + + /// For each server, index of the next log entry to send. + next_slot: HashMap, + + /// For each server, index of the highest log entry known to be replicated. + match_slot: HashMap, + + /// Slot index up to which it is safe to take snapshot. + /// NOTE: we are taking a conservative approach here that a snapshot + /// covering an entry can be taken only when all servers have durably + /// committed that entry. + last_snap: usize, + + /// Current durable log file end offset. + log_offset: usize, + + /// Current durable log end of offset of metadata. + log_meta_end: usize, + + /// Current durable snapshot file offset. + snap_offset: usize, +} + +// RaftReplica common helpers +impl RaftReplica { + /// Compose LogActionId from (slot, end_slot) pair & entry type. + /// Uses the `Role` enum type to represent differnet entry types. + #[inline] + fn make_log_action_id( + slot: usize, + slot_e: usize, + entry_type: Role, + ) -> LogActionId { + let type_num = match entry_type { + Role::Follower => 1, + Role::Leader => 2, + _ => panic!("unknown log entry type {:?}", entry_type), + }; + ((slot << 33) | (slot_e << 2) | type_num) as LogActionId + } + + /// Decompose LogActionId into (slot, end_slot) pair & entry type. + #[inline] + fn split_log_action_id(log_action_id: LogActionId) -> (usize, usize, Role) { + let slot = (log_action_id >> 33) as usize; + let slot_e = ((log_action_id & ((1 << 33) - 1)) >> 2) as usize; + let type_num = log_action_id & ((1 << 2) - 1); + let entry_type = match type_num { + 1 => Role::Follower, + 2 => Role::Leader, + _ => panic!("unknown log entry type num {}", type_num), + }; + (slot, slot_e, entry_type) + } + + /// Compose CommandId from slot index & command index within. + #[inline] + fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { + assert!(slot <= (u32::MAX as usize)); + assert!(cmd_idx <= (u32::MAX as usize)); + ((slot << 32) | cmd_idx) as CommandId + } + + /// Decompose CommandId into slot index & command index within. + #[inline] + fn split_command_id(command_id: CommandId) -> (usize, usize) { + let slot = (command_id >> 32) as usize; + let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; + (slot, cmd_idx) + } + + /// Check if the given term is larger than mine. If so, convert my role + /// back to follower. Returns true if my role was not follower but now + /// converted to follower, and false otherwise. + #[inline] + fn check_term( + &mut self, + peer: ReplicaId, + term: Term, + ) -> Result { + if term > self.curr_term { + self.curr_term = term; + self.heard_heartbeat(peer, term)?; // refresh election timer + if self.role != Role::Follower { + self.role = Role::Follower; + pf_trace!(self.id; "converted back to follower"); + Ok(true) + } else { + Ok(false) + } + } else { + Ok(false) + } + } +} + +// RaftReplica client requests entrance +impl RaftReplica { + /// Handler of client request batch chan recv. 
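// Aside, not part of this diff: a standalone sketch of the (slot, end_slot,
// entry-type) bit-packing used by make_log_action_id / split_log_action_id
// above, assuming LogActionId is a 64-bit integer as elsewhere in the
// codebase. The low 2 bits carry the entry type, bits 2..33 carry end_slot,
// and the remaining high bits carry slot.
fn pack(slot: u64, slot_e: u64, type_num: u64) -> u64 {
    debug_assert!(type_num < 4 && slot_e < (1 << 31));
    (slot << 33) | (slot_e << 2) | type_num
}

fn unpack(id: u64) -> (u64, u64, u64) {
    let slot = id >> 33;
    let slot_e = (id & ((1 << 33) - 1)) >> 2;
    let type_num = id & 0b11;
    (slot, slot_e, type_num)
}

fn main() {
    // e.g. a leader append covering exactly slot 42 (type_num 2 == Leader)
    let id = pack(42, 42, 2);
    assert_eq!(unpack(id), (42, 42, 2));
    println!("action id = {:#x}", id);
}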
+ fn handle_req_batch( + &mut self, + req_batch: ReqBatch, + ) -> Result<(), SummersetError> { + let batch_size = req_batch.len(); + assert!(batch_size > 0); + pf_debug!(self.id; "got request batch of size {}", batch_size); + + // if I'm not a leader, ignore client requests + if self.role != Role::Leader { + for (client, req) in req_batch { + if let ApiRequest::Req { id: req_id, .. } = req { + // tell the client to try on known leader or just the + // next ID replica + let target = if let Some(peer) = self.leader { + peer + } else { + (self.id + 1) % self.population + }; + self.external_api.send_reply( + ApiReply::Reply { + id: req_id, + result: None, + redirect: Some(target), + }, + client, + )?; + pf_trace!(self.id; "redirected client {} to replica {}", + client, target); + } + } + return Ok(()); + } + + // append an entry to in-memory log + let entry = LogEntry { + term: self.curr_term, + reqs: req_batch, + external: true, + log_offset: 0, + }; + let slot = self.start_slot + self.log.len(); + self.log.push(entry.clone()); + + // submit logger action to make this log entry durable + self.storage_hub.submit_action( + Self::make_log_action_id(slot, slot, Role::Leader), + LogAction::Append { + entry: DurEntry::LogEntry { entry }, + sync: self.config.logger_sync, + }, + )?; + pf_trace!(self.id; "submitted leader append log action for slot {}", slot); + + Ok(()) + } +} + +// RaftReplica durable logging +impl RaftReplica { + /// Handler of leader append logging result chan recv. + fn handle_logged_leader_append( + &mut self, + slot: usize, + slot_e: usize, + ) -> Result<(), SummersetError> { + if slot < self.start_slot || self.role != Role::Leader { + return Ok(()); // ignore if outdated + } + pf_trace!(self.id; "finished leader append logging for slot {} <= {}", + slot, slot_e); + assert_eq!(slot, slot_e); + + // broadcast AppendEntries messages to followers + for peer in 0..self.population { + if peer == self.id || self.next_slot[&peer] < 1 { + continue; + } + + let prev_slot = self.next_slot[&peer] - 1; + if prev_slot < self.start_slot { + return logged_err!(self.id; "snapshotted slot {} queried", prev_slot); + } + let prev_term = self.log[prev_slot - self.start_slot].term; + let entries = self + .log + .iter() + .skip(self.next_slot[&peer] - self.start_slot) + .cloned() + .collect(); + + if slot >= self.next_slot[&peer] { + self.transport_hub.send_msg( + PeerMsg::AppendEntries { + term: self.curr_term, + prev_slot, + prev_term, + entries, + leader_commit: self.last_commit, + last_snap: self.last_snap, + }, + peer, + )?; + pf_trace!(self.id; "sent AppendEntries -> {} with slots {} - {}", + peer, self.next_slot[&peer], + self.start_slot + self.log.len() - 1); + } + } + + // I also heard my own heartbeat + self.heard_heartbeat(self.id, self.curr_term)?; + + Ok(()) + } + + /// Handler of follower append logging result chan recv. 
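// Aside, not part of this diff: a minimal sketch of the redirect-target rule
// used in handle_req_batch above -- a non-leader points the client at the
// leader it currently believes in, falling back to the next replica ID in
// round-robin order when no leader is known.
type ReplicaId = u8;

fn redirect_target(me: ReplicaId, leader: Option<ReplicaId>, population: u8) -> ReplicaId {
    match leader {
        Some(peer) => peer,
        None => (me + 1) % population,
    }
}

fn main() {
    assert_eq!(redirect_target(1, Some(0), 3), 0); // known leader wins
    assert_eq!(redirect_target(2, None, 3), 0);    // otherwise wrap around
}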
+ fn handle_logged_follower_append( + &mut self, + slot: usize, + slot_e: usize, + ) -> Result<(), SummersetError> { + if slot < self.start_slot || self.role != Role::Follower { + return Ok(()); // ignore if outdated + } + pf_trace!(self.id; "finished follower append logging for slot {} <= {}", + slot, slot_e); + assert!(slot <= slot_e); + + // if all consecutive entries are made durable, reply AppendEntries + // success back to leader + if slot == slot_e { + if let Some(leader) = self.leader { + self.transport_hub.send_msg( + PeerMsg::AppendEntriesReply { + term: self.curr_term, + end_slot: slot_e, + success: true, + }, + leader, + )?; + pf_trace!(self.id; "sent AppendEntriesReply -> {} up to slot {}", + leader, slot_e); + } + } + + Ok(()) + } + + /// Synthesized handler of durable logging result chan recv. + fn handle_log_result( + &mut self, + action_id: LogActionId, + log_result: LogResult, + ) -> Result<(), SummersetError> { + let (slot, slot_e, entry_type) = Self::split_log_action_id(action_id); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + assert!(slot_e < self.start_slot + self.log.len()); + + if let LogResult::Append { now_size } = log_result { + let entry = &mut self.log[slot - self.start_slot]; + if entry.log_offset != self.log_offset { + // entry has incorrect log_offset bookkept; update it + entry.log_offset = self.log_offset; + } + assert!(now_size > self.log_offset); + self.log_offset = now_size; + } else { + return logged_err!(self.id; "unexpected log result type: {:?}", log_result); + } + + match entry_type { + Role::Follower => self.handle_logged_follower_append(slot, slot_e), + Role::Leader => self.handle_logged_leader_append(slot, slot_e), + _ => { + logged_err!(self.id; "unexpected log entry type: {:?}", entry_type) + } + } + } +} + +// RaftReplica peer-peer messages handling +impl RaftReplica { + /// Handler of AppendEntries message from leader. + #[allow(clippy::too_many_arguments)] + async fn handle_msg_append_entries( + &mut self, + leader: ReplicaId, + term: Term, + prev_slot: usize, + prev_term: Term, + mut entries: Vec, + leader_commit: usize, + last_snap: usize, + ) -> Result<(), SummersetError> { + if !entries.is_empty() { + pf_trace!(self.id; "received AcceptEntries <- {} for slots {} - {} term {}", + leader, prev_slot + 1, prev_slot + entries.len(), term); + } + if self.check_term(leader, term)? || self.role != Role::Follower { + return Ok(()); + } + + // reply false if term smaller than mine, or if my log does not + // contain an entry at prev_slot matching prev_term + if term < self.curr_term + || prev_slot < self.start_slot + || prev_slot >= self.start_slot + self.log.len() + || self.log[prev_slot - self.start_slot].term != prev_term + { + self.transport_hub.send_msg( + PeerMsg::AppendEntriesReply { + term: self.curr_term, + end_slot: prev_slot, + success: false, + }, + leader, + )?; + pf_trace!(self.id; "sent AcceptEntriesReply -> {} term {} end_slot {} fail", + leader, self.curr_term, prev_slot); + + if term >= self.curr_term { + // also refresh heartbeat timer here since the "decrementing" + // procedure for a lagging follower might take long + self.heard_heartbeat(leader, term)?; + } + return Ok(()); + } + + // update my knowledge of who's the current leader, and reset election + // timeout timer + self.leader = Some(leader); + self.heard_heartbeat(leader, term)?; + + // check if any existing entry conflicts with a new one in `entries`. 
+ // If so, truncate everything at and after that entry + let mut first_new = prev_slot + 1; + for (slot, new_entry) in entries + .iter() + .enumerate() + .map(|(s, e)| (s + prev_slot + 1, e)) + { + if slot >= self.start_slot + self.log.len() { + first_new = slot; + break; + } else if self.log[slot - self.start_slot].term != new_entry.term { + let cut_offset = self.log[slot - self.start_slot].log_offset; + // do this truncation in-place for simplicity + self.storage_hub.submit_action( + 0, + LogAction::Truncate { offset: cut_offset }, + )?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + if let LogResult::Truncate { + offset_ok: true, + now_size, + } = log_result + { + assert_eq!(now_size, cut_offset); + self.log_offset = cut_offset; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed truncate" + ); + } + break; + } + } + // truncate in-mem log as well + self.log.truncate(slot - self.start_slot); + first_new = slot; + break; + } + } + + // append new entries into my log, and submit logger actions to make + // new entries durable + let (num_entries, mut num_appended) = (entries.len(), 0); + for (slot, mut entry) in entries + .drain((first_new - prev_slot - 1)..entries.len()) + .enumerate() + .map(|(s, e)| (s + first_new, e)) + { + entry.log_offset = 0; + + self.log.push(entry.clone()); + self.storage_hub.submit_action( + Self::make_log_action_id( + slot, + prev_slot + num_entries, + Role::Follower, + ), + LogAction::Append { + entry: DurEntry::LogEntry { entry }, + sync: self.config.logger_sync, + }, + )?; + + num_appended += 1; + } + + // even if no entries appended, also send back AppendEntriesReply + // as a follower-to-leader reverse heardbeat for peer health + // tracking purposes + if num_appended == 0 { + self.transport_hub.send_msg( + PeerMsg::AppendEntriesReply { + term: self.curr_term, + end_slot: first_new - 1, + success: true, + }, + leader, + )?; + } + + // if leader_commit is larger than my last_commit, update last_commit + if leader_commit > self.last_commit { + let new_commit = cmp::min(leader_commit, prev_slot + entries.len()); + + // submit newly committed entries for state machine execution + for slot in (self.last_commit + 1)..=new_commit { + let entry = &self.log[slot - self.start_slot]; + for (cmd_idx, (_, req)) in entry.reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. } = req { + self.state_machine.submit_cmd( + Self::make_command_id(slot, cmd_idx), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } + } + } + + self.last_commit = new_commit; + } + + // if last_snap is larger than mine, update last_snap + if last_snap > self.last_snap { + self.last_snap = last_snap; + } + + Ok(()) + } + + /// Handler of AppendEntries reply from follower. + fn handle_msg_append_entries_reply( + &mut self, + peer: ReplicaId, + term: Term, + end_slot: usize, + success: bool, + ) -> Result<(), SummersetError> { + if !success || self.match_slot[&peer] != end_slot { + pf_trace!(self.id; "received AcceptEntriesReply <- {} for term {} {}", + peer, term, if success { "ok" } else { "fail" }); + } + if self.check_term(peer, term)? 
|| self.role != Role::Leader { + return Ok(()); + } + self.heard_heartbeat(peer, term)?; + + if success { + // success: update next_slot and match_slot for follower + *self.next_slot.get_mut(&peer).unwrap() = end_slot + 1; + *self.match_slot.get_mut(&peer).unwrap() = end_slot; + + // since we updated some match_slot here, check if any additional + // entries are now considered committed + let mut new_commit = self.last_commit; + for slot in + (self.last_commit + 1)..(self.start_slot + self.log.len()) + { + let entry = &self.log[slot - self.start_slot]; + if entry.term != self.curr_term { + continue; // cannot decide commit using non-latest term + } + + let match_cnt = 1 + self + .match_slot + .values() + .filter(|&&s| s >= slot) + .count() as u8; + if match_cnt >= self.quorum_cnt { + // quorum size reached, set new_commit to here + new_commit = slot; + } + } + + // submit newly committed commands, if any, for execution + for slot in (self.last_commit + 1)..=new_commit { + let entry = &self.log[slot - self.start_slot]; + for (cmd_idx, (_, req)) in entry.reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. } = req { + self.state_machine.submit_cmd( + Self::make_command_id(slot, cmd_idx), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } + } + } + + self.last_commit = new_commit; + + // also check if any additional entries are safe to snapshot + for slot in (self.last_snap + 1)..=end_slot { + let match_cnt = 1 + self + .match_slot + .values() + .filter(|&&s| s >= slot) + .count() as u8; + if match_cnt == self.population { + // all servers have durably stored this entry + self.last_snap = slot; + } + } + } else { + // failed: decrement next_slot for follower and retry + // NOTE: the optimization of fast-backward bypassing (instead of + // always decrementing by 1) not implemented + if self.next_slot[&peer] == 1 { + return Ok(()); // cannot move backward any more + } + *self.next_slot.get_mut(&peer).unwrap() -= 1; + + let prev_slot = self.next_slot[&peer] - 1; + if prev_slot < self.start_slot { + *self.next_slot.get_mut(&peer).unwrap() += 1; + return logged_err!(self.id; "snapshotted slot {} queried", prev_slot); + } + let prev_term = self.log[prev_slot - self.start_slot].term; + let entries = self + .log + .iter() + .skip(self.next_slot[&peer] - self.start_slot) + .cloned() + .collect(); + + self.transport_hub.send_msg( + PeerMsg::AppendEntries { + term: self.curr_term, + prev_slot, + prev_term, + entries, + leader_commit: self.last_commit, + last_snap: self.last_snap, + }, + peer, + )?; + pf_trace!(self.id; "sent AppendEntries -> {} with slots {} - {}", + peer, self.next_slot[&peer], + self.start_slot + self.log.len() - 1); + } + + Ok(()) + } + + /// Handler of RequestVote message from candidate. 
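// Aside, not part of this diff: the commit rule applied in
// handle_msg_append_entries_reply above, reduced to a pure function with
// illustrative parameter names. A slot becomes committed once a majority
// (leader included) has replicated it, and only if the entry carries the
// leader's current term -- older-term entries get committed indirectly,
// following the Raft paper's Figure 8 restriction.
fn advance_commit(
    last_commit: usize,
    start_slot: usize,
    entry_terms: &[u64],  // terms of in-mem entries; index 0 == start_slot
    match_slot: &[usize], // highest replicated slot per follower
    curr_term: u64,
    quorum_cnt: usize,
) -> usize {
    let mut new_commit = last_commit;
    for slot in (last_commit + 1)..(start_slot + entry_terms.len()) {
        if entry_terms[slot - start_slot] != curr_term {
            continue; // cannot decide commit using non-latest term
        }
        let cnt = 1 + match_slot.iter().filter(|&&s| s >= slot).count();
        if cnt >= quorum_cnt {
            new_commit = slot;
        }
    }
    new_commit
}

fn main() {
    // 5 replicas, quorum 3; followers have replicated up to slots 4, 4, 2, 1
    let terms = [0, 3, 3, 3, 3]; // slot 0 is the dummy entry
    assert_eq!(advance_commit(0, 0, &terms, &[4, 4, 2, 1], 3, 3), 4);
}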
+ fn handle_msg_request_vote( + &mut self, + candidate: ReplicaId, + term: Term, + last_slot: usize, + last_term: Term, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received RequestVote <- {} with term {} last {} term {}", + candidate, term, last_slot, last_term); + self.check_term(candidate, term)?; + + // if the given term is smaller than mine, reply false + if term < self.curr_term { + self.transport_hub.send_msg( + PeerMsg::RequestVoteReply { + term: self.curr_term, + granted: false, + }, + candidate, + )?; + pf_trace!(self.id; "sent RequestVoteReply -> {} term {} false", + candidate, self.curr_term); + return Ok(()); + } + + // if I did not vote for anyone else in my current term and that the + // candidate's log is as up-to-date as mine, grant vote + #[allow(clippy::collapsible_if)] + if self.voted_for.is_none() || (self.voted_for.unwrap() == candidate) { + if last_term >= self.log.last().unwrap().term + || (last_term == self.curr_term + && last_slot + 1 >= self.start_slot + self.log.len()) + { + self.transport_hub.send_msg( + PeerMsg::RequestVoteReply { + term: self.curr_term, + granted: true, + }, + candidate, + )?; + pf_trace!(self.id; "sent RequestVoteReply -> {} term {} granted", + candidate, self.curr_term); + + // hear a heartbeat here to prevent me from starting an + // election soon + self.heard_heartbeat(candidate, term)?; + } + } + + Ok(()) + } + + /// Handler of RequestVote reply from peer. + fn handle_msg_request_vote_reply( + &mut self, + peer: ReplicaId, + term: Term, + granted: bool, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received RequestVoteReply <- {} with term {} {}", + peer, term, if granted { "granted" } else { "false" }); + if self.check_term(peer, term)? || self.role != Role::Candidate { + return Ok(()); + } + + // bookkeep this vote + self.votes_granted.insert(peer); + + // if a majority of servers have voted for me, become the leader + if self.votes_granted.len() as u8 >= self.quorum_cnt { + self.become_the_leader()?; + } + + Ok(()) + } + + /// Synthesized handler of receiving message from peer. + async fn handle_msg_recv( + &mut self, + peer: ReplicaId, + msg: PeerMsg, + ) -> Result<(), SummersetError> { + match msg { + PeerMsg::AppendEntries { + term, + prev_slot, + prev_term, + entries, + leader_commit, + last_snap, + } => { + self.handle_msg_append_entries( + peer, + term, + prev_slot, + prev_term, + entries, + leader_commit, + last_snap, + ) + .await + } + PeerMsg::AppendEntriesReply { + term, + end_slot, + success, + } => self + .handle_msg_append_entries_reply(peer, term, end_slot, success), + PeerMsg::RequestVote { + term, + last_slot, + last_term, + } => self.handle_msg_request_vote(peer, term, last_slot, last_term), + PeerMsg::RequestVoteReply { term, granted } => { + self.handle_msg_request_vote_reply(peer, term, granted) + } + } + } +} + +// RaftReplica state machine execution +impl RaftReplica { + /// Handler of state machine exec result chan recv. 
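// Aside, not part of this diff: for reference, the log up-to-dateness
// comparison as stated in the Raft paper (section 5.4.1). The vote-granting
// check above is this diff's own variant of the same idea: a vote may only
// go to a candidate whose log is at least as up-to-date as the voter's.
fn candidate_log_up_to_date(
    cand_last_term: u64,
    cand_last_slot: usize,
    my_last_term: u64,
    my_last_slot: usize,
) -> bool {
    // compare terms of the last entries first, then log lengths
    (cand_last_term, cand_last_slot) >= (my_last_term, my_last_slot)
}

fn main() {
    assert!(candidate_log_up_to_date(5, 10, 4, 30));  // higher last term wins
    assert!(!candidate_log_up_to_date(4, 10, 4, 30)); // same term, shorter log loses
}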
+ fn handle_cmd_result( + &mut self, + cmd_id: CommandId, + cmd_result: CommandResult, + ) -> Result<(), SummersetError> { + let (slot, cmd_idx) = Self::split_command_id(cmd_id); + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } + assert!(slot < self.start_slot + self.log.len()); + pf_trace!(self.id; "executed cmd in entry at slot {} idx {}", + slot, cmd_idx); + + let entry = &mut self.log[slot - self.start_slot]; + assert!(cmd_idx < entry.reqs.len()); + let (client, ref req) = entry.reqs[cmd_idx]; + + // reply command result back to client + if let ApiRequest::Req { id: req_id, .. } = req { + if entry.external && self.external_api.has_client(client) { + self.external_api.send_reply( + ApiReply::Reply { + id: *req_id, + result: Some(cmd_result), + redirect: None, + }, + client, + )?; + pf_trace!(self.id; "replied -> client {} for slot {} idx {}", + client, slot, cmd_idx); + } + } else { + return logged_err!(self.id; "unexpected API request type"); + } + + // if all commands in this entry have been executed, update last_exec + if cmd_idx == entry.reqs.len() - 1 { + pf_debug!(self.id; "executed all cmds in entry at slot {}", slot); + self.last_exec = slot; + } + + Ok(()) + } +} + +// RaftReplica leader election timeout logic +impl RaftReplica { + /// Becomes a candidate and starts the election procedure. + async fn become_a_candidate(&mut self) -> Result<(), SummersetError> { + if self.role != Role::Follower { + return Ok(()); + } else if let Some(peer) = self.leader { + // mark old leader as dead + if self.peer_alive.get(peer)? { + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + + self.role = Role::Candidate; + + // increment current term and vote for myself + self.curr_term += 1; + self.voted_for = Some(self.id); + self.votes_granted = HashSet::from([self.id]); + pf_info!(self.id; "starting election with term {}...", self.curr_term); + + // also make the two critical fields durable, synchronously + self.storage_hub.submit_action( + 0, + LogAction::Write { + entry: DurEntry::Metadata { + curr_term: self.curr_term, + voted_for: self.voted_for, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + loop { + let (action_id, log_result) = self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + if let LogResult::Write { + offset_ok: true, .. + } = log_result + { + } else { + return logged_err!(self.id; "unexpected log result type or failed write"); + } + break; + } + } + + // reset election timeout timer + self.heard_heartbeat(self.id, self.curr_term)?; + + // send RequestVote messages to all other peers + let last_slot = self.start_slot + self.log.len() - 1; + assert!(last_slot >= self.start_slot); + let last_term = self.log[last_slot - self.start_slot].term; + self.transport_hub.bcast_msg( + PeerMsg::RequestVote { + term: self.curr_term, + last_slot, + last_term, + }, + None, + )?; + pf_trace!(self.id; "broadcast RequestVote with term {} last {} term {}", + self.curr_term, last_slot, last_term); + + Ok(()) + } + + /// Becomes the leader after enough votes granted for me. 
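// Aside, not part of this diff: a minimal sketch of the candidate-side vote
// bookkeeping that become_a_candidate above and handle_msg_request_vote_reply
// earlier implement; the names here are illustrative, not the diff's own types.
use std::collections::HashSet;

struct Candidate {
    id: u8,
    curr_term: u64,
    votes_granted: HashSet<u8>,
}

impl Candidate {
    fn start_election(id: u8, prev_term: u64) -> Self {
        // bump the term and always vote for myself first
        Candidate {
            id,
            curr_term: prev_term + 1,
            votes_granted: HashSet::from([id]),
        }
    }

    fn record_vote(&mut self, peer: u8, quorum_cnt: usize) -> bool {
        self.votes_granted.insert(peer);
        self.votes_granted.len() >= quorum_cnt // true => become the leader
    }
}

fn main() {
    let mut c = Candidate::start_election(1, 4);
    assert_eq!(c.curr_term, 5);
    assert!(!c.record_vote(0, 3)); // 2 of 3 votes so far
    assert!(c.record_vote(2, 3));  // majority reached
}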
+ fn become_the_leader(&mut self) -> Result<(), SummersetError> { + pf_info!(self.id; "elected to be leader with term {}", self.curr_term); + self.role = Role::Leader; + + // clear peers' heartbeat reply counters, and broadcast a heartbeat now + for cnts in self.hb_reply_cnts.values_mut() { + *cnts = (1, 0, 0); + } + self.bcast_heartbeats()?; + + // re-initialize next_slot and match_slot information + for slot in self.next_slot.values_mut() { + *slot = self.start_slot + self.log.len(); + } + for slot in self.match_slot.values_mut() { + *slot = 0; + } + + Ok(()) + } + + /// Broadcasts empty AppendEntries messages as heartbeats to all peers. + fn bcast_heartbeats(&mut self) -> Result<(), SummersetError> { + let prev_slot = self.start_slot + self.log.len() - 1; + assert!(prev_slot >= self.start_slot); + let prev_term = self.log[prev_slot - self.start_slot].term; + self.transport_hub.bcast_msg( + PeerMsg::AppendEntries { + term: self.curr_term, + prev_slot, + prev_term, + entries: vec![], + leader_commit: self.last_commit, + last_snap: self.last_snap, + }, + None, + )?; + + // update max heartbeat reply counters and their repetitions seen + for (&peer, cnts) in self.hb_reply_cnts.iter_mut() { + if cnts.0 > cnts.1 { + // more hb replies have been received from this peer; it is + // probably alive + cnts.1 = cnts.0; + cnts.2 = 0; + } else { + // did not receive hb reply from this peer at least for the + // last sent hb from me; increment repetition count + cnts.2 += 1; + let repeat_threshold = (self.config.hb_hear_timeout_min + / self.config.hb_send_interval_ms) + as u8; + if cnts.2 > repeat_threshold { + // did not receive hb reply from this peer for too many + // past hbs sent from me; this peer is probably dead + if self.peer_alive.get(peer)? { + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + } + } + + // I also heard this heartbeat from myself + self.heard_heartbeat(self.id, self.curr_term)?; + + // pf_trace!(self.id; "broadcast heartbeats term {}", self.curr_term); + Ok(()) + } + + /// Chooses a random hb_hear_timeout from the min-max range and kicks off + /// the hb_hear_timer. + fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + self.hb_hear_timer.cancel()?; + + let timeout_ms = thread_rng().gen_range( + self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, + ); + + // pf_trace!(self.id; "kickoff hb_hear_timer @ {} ms", timeout_ms); + self.hb_hear_timer + .kickoff(Duration::from_millis(timeout_ms))?; + Ok(()) + } + + /// Heard a heartbeat from some other replica. Resets election timer. + fn heard_heartbeat( + &mut self, + peer: ReplicaId, + _term: Term, + ) -> Result<(), SummersetError> { + if peer != self.id { + self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; + if !self.peer_alive.get(peer)? { + self.peer_alive.set(peer, true)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + + // reset hearing timer + self.kickoff_hb_hear_timer()?; + + // pf_trace!(self.id; "heard heartbeat <- {} term {}", peer, term); + Ok(()) + } +} + +// RaftReplica control messages handling +impl RaftReplica { + /// Handler of ResetState control message. 
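// Aside, not part of this diff: the follower-health heuristic that
// bcast_heartbeats / heard_heartbeat above maintain, played out for one
// silent peer. The tuple is (replies seen, replies seen at last send,
// repetitions with no progress); a peer is declared dead after roughly one
// minimum election timeout's worth of heartbeat rounds without any reply.
fn main() {
    let (hb_hear_timeout_min, hb_send_interval_ms) = (600u64, 50u64);
    let repeat_threshold = (hb_hear_timeout_min / hb_send_interval_ms) as u32; // 12 rounds

    let mut cnts = (1u64, 0u64, 0u32); // as initialized in new_and_setup
    let mut alive = true;

    for round in 1..=14 {
        // pretend the peer never replies after the very first heartbeat
        if cnts.0 > cnts.1 {
            cnts.1 = cnts.0;
            cnts.2 = 0;
        } else {
            cnts.2 += 1;
            if cnts.2 > repeat_threshold {
                alive = false;
            }
        }
        println!("round {round}: cnts {:?}, alive {alive}", cnts);
    }
    assert!(!alive); // marked dead after more than 12 silent rounds
}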
+ async fn handle_ctrl_reset_state( + &mut self, + durable: bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got restart req"); + + // send leave notification to peers and wait for their replies + self.transport_hub.leave().await?; + + // send leave notification to manager and wait for its reply + self.control_hub.send_ctrl(CtrlMsg::Leave)?; + while self.control_hub.recv_ctrl().await? != CtrlMsg::LeaveReply {} + + // if `durable` is false, truncate backer file + if !durable { + // use 0 as a special log action ID here + self.storage_hub + .submit_action(0, LogAction::Truncate { offset: 0 })?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id == 0 { + if log_result + != (LogResult::Truncate { + offset_ok: true, + now_size: 0, + }) + { + return logged_err!(self.id; "failed to truncate log to 0"); + } else { + return Ok(()); + } + } + } + } + + Ok(()) + } + + /// Handler of Pause control message. + fn handle_ctrl_pause( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got pause req"); + *paused = true; + self.control_hub.send_ctrl(CtrlMsg::PauseReply)?; + Ok(()) + } + + /// Handler of Resume control message. + fn handle_ctrl_resume( + &mut self, + paused: &mut bool, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server got resume req"); + + // reset leader heartbeat timer + self.kickoff_hb_hear_timer()?; + + *paused = false; + self.control_hub.send_ctrl(CtrlMsg::ResumeReply)?; + Ok(()) + } + + /// Handler of TakeSnapshot control message. + async fn handle_ctrl_take_snapshot( + &mut self, + ) -> Result<(), SummersetError> { + pf_warn!(self.id; "server told to take snapshot"); + self.take_new_snapshot().await?; + + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + Ok(()) + } + + /// Synthesized handler of manager control messages. If ok, returns + /// `Some(true)` if decides to terminate and reboot, `Some(false)` if + /// decides to shutdown completely, and `None` if not terminating. + async fn handle_ctrl_msg( + &mut self, + msg: CtrlMsg, + paused: &mut bool, + ) -> Result, SummersetError> { + match msg { + CtrlMsg::ResetState { durable } => { + self.handle_ctrl_reset_state(durable).await?; + Ok(Some(true)) + } + + CtrlMsg::Pause => { + self.handle_ctrl_pause(paused)?; + Ok(None) + } + + CtrlMsg::Resume => { + self.handle_ctrl_resume(paused)?; + Ok(None) + } + + CtrlMsg::TakeSnapshot => { + self.handle_ctrl_take_snapshot().await?; + Ok(None) + } + + _ => Ok(None), // ignore all other types + } + } +} + +// RaftReplica recovery from durable log +impl RaftReplica { + /// Recover state from durable storage log. 
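// Aside, not part of this diff: the randomized election timeout picked by
// kickoff_hb_hear_timer above, isolated. Assumes the `rand` crate (already a
// dependency of this codebase); spreading timeouts over [min, max] is what
// keeps two followers from repeatedly starting elections in lockstep.
use rand::prelude::*;
use std::time::Duration;

fn pick_timeout(min_ms: u64, max_ms: u64) -> Duration {
    let ms = thread_rng().gen_range(min_ms..=max_ms);
    Duration::from_millis(ms)
}

fn main() {
    for _ in 0..3 {
        let t = pick_timeout(600, 900); // the config defaults above
        assert!((600..=900).contains(&(t.as_millis() as u64)));
        println!("next election timeout: {:?}", t);
    }
}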
+ async fn recover_from_log(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.log_offset, 0); + + // first, try to read the first several bytes, which should record + // necessary durable metadata + self.storage_hub + .submit_action(0, LogAction::Read { offset: 0 })?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: + Some(DurEntry::Metadata { + curr_term, + voted_for, + }), + end_offset, + } => { + self.log_offset = end_offset; + self.log_meta_end = end_offset; + + // recover necessary metadata info + self.curr_term = curr_term; + self.voted_for = voted_for; + + // read out and push all log entries into memory log + loop { + // using 0 as a special log action ID + self.storage_hub.submit_action( + 0, + LogAction::Read { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(DurEntry::LogEntry { mut entry }), + end_offset, + } => { + entry.log_offset = self.log_offset; + entry.external = false; // no re-replying to clients + self.log.push(entry); + self.log_offset = end_offset; // update log offset + } + LogResult::Read { entry: None, .. } => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + } + + LogResult::Read { entry: None, .. } => { + // log file is empty, write initial metadata + self.storage_hub.submit_action( + 0, + LogAction::Write { + entry: DurEntry::Metadata { + curr_term: 0, + voted_for: None, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.log_offset = now_size; + self.log_meta_end = now_size; + } else { + return logged_err!(self.id; "unexpected log result type or failed write"); + } + // ... and push a 0-th dummy entry into in-mem log + self.log.push(LogEntry { + term: 0, + reqs: vec![], + external: false, + log_offset: 0, + }); + // ... and write the 0-th dummy entry durably + self.storage_hub.submit_action( + 0, + LogAction::Write { + entry: DurEntry::LogEntry { + entry: LogEntry { + term: 0, + reqs: vec![], + external: false, + log_offset: self.log_offset, + }, + }, + offset: self.log_offset, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.log[0].log_offset = self.log_offset; + self.log_offset = now_size; + } else { + return logged_err!(self.id; "unexpected log result type or failed write"); + } + } + + _ => return logged_err!(self.id; "unexpected log result type"), + } + + // do an extra Truncate to remove paritial entry at the end if any + assert!(self.log_offset >= self.log_meta_end); + self.storage_hub.submit_action( + 0, + LogAction::Truncate { + offset: self.log_offset, + }, + )?; + let (_, log_result) = self.storage_hub.get_result().await?; + if let LogResult::Truncate { + offset_ok: true, .. + } = log_result + { + if self.log_offset > self.log_meta_end { + pf_info!(self.id; "recovered from wal log: term {} voted {:?} |log| {}", + self.curr_term, self.voted_for, self.log.len()); + } + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type or failed truncate") + } + } +} + +// RaftReplica snapshotting & GC logic +impl RaftReplica { + /// Dump new key-value pairs to snapshot file. 
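// Aside, not part of this diff: a reduced sketch of what
// snapshot_dump_kv_pairs below collects -- every Put executed in slots
// [start_slot, new_start_slot) folded into one key-value map, with later
// writes overwriting earlier ones, so the snapshot stores only final values.
use std::collections::HashMap;

fn collect_puts(entries: &[Vec<(String, String)>]) -> HashMap<String, String> {
    let mut pairs = HashMap::new();
    for reqs in entries {
        for (key, value) in reqs {
            pairs.insert(key.clone(), value.clone()); // last write wins
        }
    }
    pairs
}

fn main() {
    let entries: Vec<Vec<(String, String)>> = vec![
        vec![("x".into(), "1".into())],
        vec![("y".into(), "2".into()), ("x".into(), "3".into())],
    ];
    let pairs = collect_puts(&entries);
    assert_eq!(pairs["x"], "3"); // only the final value of "x" survives
    assert_eq!(pairs.len(), 2);
}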
+ async fn snapshot_dump_kv_pairs( + &mut self, + new_start_slot: usize, + ) -> Result<(), SummersetError> { + // collect all key-value pairs put up to exec_bar + let mut pairs = HashMap::new(); + for slot in self.start_slot..new_start_slot { + let entry = &self.log[slot - self.start_slot]; + for (_, req) in entry.reqs.clone() { + if let ApiRequest::Req { + cmd: Command::Put { key, value }, + .. + } = req + { + pairs.insert(key, value); + } + } + } + + // write the collection to snapshot file + self.snapshot_hub.submit_action( + 0, // using 0 as dummy log action ID + LogAction::Append { + entry: SnapEntry::KVPairSet { pairs }, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Append { now_size } = log_result { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!( + self.id; + "unexpected log result type" + ) + } + } + + /// Discard everything lower than start_slot in durable log. + async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { + // drain things currently in storage_hub's recv chan if head of log's + // durable file offset has not been set yet + assert!(!self.log.is_empty()); + while self.log[0].log_offset == 0 { + let (action_id, log_result) = self.storage_hub.get_result().await?; + self.handle_log_result(action_id, log_result)?; + } + let cut_offset = self.log[0].log_offset; + + // discard the log after meta_end and before cut_offset + if cut_offset > 0 { + assert!(self.log_meta_end > 0); + assert!(self.log_meta_end <= cut_offset); + self.storage_hub.submit_action( + 0, + LogAction::Discard { + offset: cut_offset, + keep: self.log_meta_end, + }, + )?; + loop { + let (action_id, log_result) = + self.storage_hub.get_result().await?; + if action_id != 0 { + // normal log action previously in queue; process it + self.handle_log_result(action_id, log_result)?; + } else { + if let LogResult::Discard { + offset_ok: true, + now_size, + } = log_result + { + assert_eq!( + self.log_offset - cut_offset + self.log_meta_end, + now_size + ); + self.log_offset = now_size; + } else { + return logged_err!( + self.id; + "unexpected log result type or failed discard" + ); + } + break; + } + } + } + + // update entry.log_offset for all remaining in-mem entries + for entry in &mut self.log { + if entry.log_offset > 0 { + assert!(entry.log_offset >= cut_offset); + entry.log_offset -= cut_offset - self.log_meta_end; + } + } + + Ok(()) + } + + /// Take a snapshot up to current last_exec, then discard the in-mem log up + /// to that index as well as their data in the durable log file. + /// + /// NOTE: the current implementation does not guard against crashes in the + /// middle of taking a snapshot. Production quality implementations should + /// make the snapshotting action "atomic". + /// + /// NOTE: the current implementation does not take care of InstallSnapshot + /// messages (which is needed when some lagging follower has some slot + /// which all other peers have snapshotted); we take the conservative + /// approach that a snapshot is only taken when data has been durably + /// committed on all servers. 
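// Aside, not part of this diff: the offset arithmetic behind
// snapshot_discard_log above, with made-up numbers. Discarding the byte
// range [log_meta_end, cut_offset) from the backer file shifts every
// surviving entry left by (cut_offset - log_meta_end) bytes, and the new
// file size is old_size - cut_offset + log_meta_end.
fn main() {
    let (log_meta_end, cut_offset, old_size) = (64usize, 4096usize, 10_000usize);
    let shift = cut_offset - log_meta_end;

    let new_size = old_size - cut_offset + log_meta_end;
    assert_eq!(new_size, old_size - shift); // 5_968 bytes remain

    // an entry that used to start at byte 6000 now starts at:
    let old_entry_offset = 6000usize;
    let new_entry_offset = old_entry_offset - shift;
    assert_eq!(new_entry_offset, 1968);
    println!("entry moved from {old_entry_offset} to {new_entry_offset}");
}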
+ async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { + pf_debug!(self.id; "taking new snapshot: start {} exec {} snap {}", + self.start_slot, self.last_exec, self.last_snap); + assert!(self.last_exec + 1 >= self.start_slot); + + // always keep at least one entry in log to make indexing happy + let new_start_slot = cmp::min(self.last_snap, self.last_exec); + assert!(new_start_slot < self.start_slot + self.log.len()); + if new_start_slot < self.start_slot + 1 { + return Ok(()); + } + + // collect and dump all Puts in executed entries + if self.role == Role::Leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_dump_kv_pairs(new_start_slot).await?; + + // write new slot info entry to the head of snapshot + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::SlotInfo { + start_slot: new_start_slot, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + match log_result { + LogResult::Write { + offset_ok: true, .. + } => {} + _ => { + return logged_err!(self.id; "unexpected log result type or failed write"); + } + } + + // update start_slot and discard all in-mem log entries up to + // new_start_slot + self.log.drain(0..(new_start_slot - self.start_slot)); + self.start_slot = new_start_slot; + + // discarding everything lower than start_slot in durable log + if self.role == Role::Leader { + // NOTE: broadcast heartbeats here to appease followers + self.bcast_heartbeats()?; + } + self.snapshot_discard_log().await?; + + // reset the leader heartbeat hear timer + self.kickoff_hb_hear_timer()?; + + pf_info!(self.id; "took snapshot up to: start {}", self.start_slot); + Ok(()) + } + + /// Recover initial state from durable storage snapshot file. + async fn recover_from_snapshot(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.snap_offset, 0); + + // first, try to read the first several bytes, which should record the + // start_slot index + self.snapshot_hub + .submit_action(0, LogAction::Read { offset: 0 })?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::SlotInfo { start_slot }), + end_offset, + } => { + self.snap_offset = end_offset; + + // recover start_slot info + self.start_slot = start_slot; + if start_slot > 0 { + self.last_commit = start_slot - 1; + self.last_exec = start_slot - 1; + self.last_snap = start_slot - 1; + } + + // repeatedly apply key-value pairs + loop { + self.snapshot_hub.submit_action( + 0, + LogAction::Read { + offset: self.snap_offset, + }, + )?; + let (_, log_result) = + self.snapshot_hub.get_result().await?; + + match log_result { + LogResult::Read { + entry: Some(SnapEntry::KVPairSet { pairs }), + end_offset, + } => { + // execute Put commands on state machine + for (key, value) in pairs { + self.state_machine.submit_cmd( + 0, + Command::Put { key, value }, + )?; + let _ = self.state_machine.get_result().await?; + } + // update snapshot file offset + self.snap_offset = end_offset; + } + LogResult::Read { entry: None, .. 
} => { + // end of log reached + break; + } + _ => { + return logged_err!(self.id; "unexpected log result type"); + } + } + } + + // tell manager about my start_slot index + self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { + new_start: self.start_slot, + })?; + + if self.start_slot > 0 { + pf_info!(self.id; "recovered from snapshot: start {}", + self.start_slot); + } + Ok(()) + } + + LogResult::Read { entry: None, .. } => { + // snapshot file is empty. Write a 0 as start_slot and return + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::SlotInfo { start_slot: 0 }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + if let LogResult::Write { + offset_ok: true, + now_size, + } = log_result + { + self.snap_offset = now_size; + Ok(()) + } else { + logged_err!(self.id; "unexpected log result type or failed write") + } + } + + _ => { + logged_err!(self.id; "unexpected log result type") + } + } + } +} + +#[async_trait] +impl GenericReplica for RaftReplica { + async fn new_and_setup( + api_addr: SocketAddr, + p2p_addr: SocketAddr, + manager: SocketAddr, + config_str: Option<&str>, + ) -> Result { + // connect to the cluster manager and get assigned a server ID + let mut control_hub = ControlHub::new_and_setup(manager).await?; + let id = control_hub.me; + let population = control_hub.population; + + // parse protocol-specific configs + let config = parsed_config!(config_str => ReplicaConfigRaft; + batch_interval_ms, max_batch_size, + backer_path, logger_sync, + hb_hear_timeout_min, hb_hear_timeout_max, + hb_send_interval_ms, + snapshot_path, snapshot_interval_s, + perf_storage_a, perf_storage_b, + perf_network_a, perf_network_b)?; + if config.batch_interval_ms == 0 { + return logged_err!( + id; + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms + ); + } + if config.hb_hear_timeout_min < 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_min '{}'", + config.hb_hear_timeout_min + ); + } + if config.hb_hear_timeout_max < config.hb_hear_timeout_min + 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_max '{}'", + config.hb_hear_timeout_max + ); + } + if config.hb_send_interval_ms == 0 { + return logged_err!( + id; + "invalid config.hb_send_interval_ms '{}'", + config.hb_send_interval_ms + ); + } + + // setup state machine module + let state_machine = StateMachine::new_and_setup(id).await?; + + // setup storage hub module + let storage_hub = StorageHub::new_and_setup( + id, + Path::new(&config.backer_path), + if config.perf_storage_a == 0 && config.perf_storage_b == 0 { + None + } else { + Some((config.perf_storage_a, config.perf_storage_b)) + }, + ) + .await?; + + // setup transport hub module + let mut transport_hub = TransportHub::new_and_setup( + id, + population, + p2p_addr, + if config.perf_network_a == 0 && config.perf_network_b == 0 { + None + } else { + Some((config.perf_network_a, config.perf_network_b)) + }, + ) + .await?; + + // ask for the list of peers to proactively connect to. Do this after + // transport hub has been set up, so that I will be able to accept + // later peer connections + control_hub.send_ctrl(CtrlMsg::NewServerJoin { + id, + protocol: SmrProtocol::Raft, + api_addr, + p2p_addr, + })?; + let to_peers = if let CtrlMsg::ConnectToPeers { to_peers, .. } = + control_hub.recv_ctrl().await? 
+ { + to_peers + } else { + return logged_err!(id; "unexpected ctrl msg type received"); + }; + + // proactively connect to some peers, then wait for all population + // have been connected with me + for (peer, addr) in to_peers { + transport_hub.connect_to_peer(peer, addr).await?; + } + transport_hub.wait_for_group(population).await?; + + // setup snapshot hub module + let snapshot_hub = StorageHub::new_and_setup( + id, + Path::new(&config.snapshot_path), + None, + ) + .await?; + + // setup external API module, ready to take in client requests + let external_api = ExternalApi::new_and_setup( + id, + api_addr, + Duration::from_millis(config.batch_interval_ms), + config.max_batch_size, + ) + .await?; + + let mut hb_send_interval = + time::interval(Duration::from_millis(config.hb_send_interval_ms)); + hb_send_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + + let mut snapshot_interval = time::interval(Duration::from_secs( + if config.snapshot_interval_s > 0 { + config.snapshot_interval_s + } else { + 60 // dummy non-zero value to make `time::interval` happy + }, + )); + snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + + let hb_reply_cnts = (0..population) + .filter_map(|p| if p == id { None } else { Some((p, (1, 0, 0))) }) + .collect(); + + Ok(RaftReplica { + id, + population, + quorum_cnt: (population / 2) + 1, + config, + _api_addr: api_addr, + _p2p_addr: p2p_addr, + control_hub, + external_api, + state_machine, + storage_hub, + snapshot_hub, + transport_hub, + role: Role::Follower, + leader: None, + hb_hear_timer: Timer::new(), + hb_send_interval, + hb_reply_cnts, + peer_alive: Bitmap::new(population, true), + curr_term: 0, + voted_for: None, + votes_granted: HashSet::new(), + log: vec![], + start_slot: 0, + snapshot_interval, + last_commit: 0, + last_exec: 0, + next_slot: (0..population) + .filter_map(|s| if s == id { None } else { Some((s, 1)) }) + .collect(), + match_slot: (0..population) + .filter_map(|s| if s == id { None } else { Some((s, 0)) }) + .collect(), + last_snap: 0, + log_offset: 0, + log_meta_end: 0, + snap_offset: 0, + }) + } + + async fn run( + &mut self, + mut rx_term: watch::Receiver, + ) -> Result { + // recover state from durable snapshot file + self.recover_from_snapshot().await?; + + // recover the tail-piece memory log & state from remaining durable log + self.recover_from_log().await?; + + // kick off leader activity hearing timer + self.kickoff_hb_hear_timer()?; + + // main event loop + let mut paused = false; + loop { + tokio::select! 
{ + // client request batch + req_batch = self.external_api.get_req_batch(), if !paused => { + if let Err(e) = req_batch { + pf_error!(self.id; "error getting req batch: {}", e); + continue; + } + let req_batch = req_batch.unwrap(); + if let Err(e) = self.handle_req_batch(req_batch) { + pf_error!(self.id; "error handling req batch: {}", e); + } + }, + + // durable logging result + log_result = self.storage_hub.get_result(), if !paused => { + if let Err(e) = log_result { + pf_error!(self.id; "error getting log result: {}", e); + continue; + } + let (action_id, log_result) = log_result.unwrap(); + if let Err(e) = self.handle_log_result(action_id, log_result) { + pf_error!(self.id; "error handling log result {}: {}", + action_id, e); + } + }, + + // message from peer + msg = self.transport_hub.recv_msg(), if !paused => { + if let Err(e) = msg { + pf_error!(self.id; "error receiving peer msg: {}", e); + continue; + } + let (peer, msg) = msg.unwrap(); + if let Err(e) = self.handle_msg_recv(peer, msg).await { + pf_error!(self.id; "error handling msg recv <- {}: {}", peer, e); + } + }, + + // state machine execution result + cmd_result = self.state_machine.get_result(), if !paused => { + if let Err(e) = cmd_result { + pf_error!(self.id; "error getting cmd result: {}", e); + continue; + } + let (cmd_id, cmd_result) = cmd_result.unwrap(); + if let Err(e) = self.handle_cmd_result(cmd_id, cmd_result) { + pf_error!(self.id; "error handling cmd result {}: {}", cmd_id, e); + } + }, + + // leader inactivity timeout + _ = self.hb_hear_timer.timeout(), if !paused => { + if let Err(e) = self.become_a_candidate().await { + pf_error!(self.id; "error becoming a candidate: {}", e); + } + }, + + // leader sending heartbeat + _ = self.hb_send_interval.tick(), if !paused + && self.role == Role::Leader => { + if let Err(e) = self.bcast_heartbeats() { + pf_error!(self.id; "error broadcasting heartbeats: {}", e); + } + }, + + // autonomous snapshot taking timeout + _ = self.snapshot_interval.tick(), if !paused + && self.config.snapshot_interval_s > 0 => { + if let Err(e) = self.take_new_snapshot().await { + pf_error!(self.id; "error taking a new snapshot: {}", e); + } else { + self.control_hub.send_ctrl( + CtrlMsg::SnapshotUpTo { new_start: self.start_slot } + )?; + } + }, + + // manager control message + ctrl_msg = self.control_hub.recv_ctrl() => { + if let Err(e) = ctrl_msg { + pf_error!(self.id; "error getting ctrl msg: {}", e); + continue; + } + let ctrl_msg = ctrl_msg.unwrap(); + match self.handle_ctrl_msg(ctrl_msg, &mut paused).await { + Ok(terminate) => { + if let Some(restart) = terminate { + return Ok(restart); + } + }, + Err(e) => { + pf_error!(self.id; "error handling ctrl msg: {}", e); + } + } + }, + + // receiving termination signal + _ = rx_term.changed() => { + pf_warn!(self.id; "server caught termination signal"); + return Ok(false); + } + } + } + } + + fn id(&self) -> ReplicaId { + self.id + } +} + +/// Configuration parameters struct. +#[derive(Debug, Deserialize)] +pub struct ClientConfigRaft { + /// Which server to pick initially. + pub init_server_id: ReplicaId, +} + +#[allow(clippy::derivable_impls)] +impl Default for ClientConfigRaft { + fn default() -> Self { + ClientConfigRaft { init_server_id: 0 } + } +} + +/// Raft client-side module. +pub struct RaftClient { + /// Client ID. + id: ClientId, + + /// Configuration parameters struct. + _config: ClientConfigRaft, + + /// List of active servers information. + servers: HashMap, + + /// Current server ID to talk to. 
+ server_id: ReplicaId, + + /// Control API stub to the cluster manager. + ctrl_stub: ClientCtrlStub, + + /// API stubs for communicating with servers. + api_stubs: HashMap, +} + +#[async_trait] +impl GenericEndpoint for RaftClient { + async fn new_and_setup( + manager: SocketAddr, + config_str: Option<&str>, + ) -> Result { + // connect to the cluster manager and get assigned a client ID + pf_info!("c"; "connecting to manager '{}'...", manager); + let ctrl_stub = ClientCtrlStub::new_by_connect(manager).await?; + let id = ctrl_stub.id; + + // parse protocol-specific configs + let config = parsed_config!(config_str => ClientConfigRaft; + init_server_id)?; + let init_server_id = config.init_server_id; + + Ok(RaftClient { + id, + _config: config, + servers: HashMap::new(), + server_id: init_server_id, + ctrl_stub, + api_stubs: HashMap::new(), + }) + } + + async fn connect(&mut self) -> Result<(), SummersetError> { + // disallow reconnection without leaving + if !self.api_stubs.is_empty() { + return logged_err!(self.id; "reconnecting without leaving"); + } + + // ask the manager about the list of active servers + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::QueryInfo))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; + } + + let reply = self.ctrl_stub.recv_reply().await?; + match reply { + CtrlReply::QueryInfo { + population, + servers, + } => { + // shift to a new server_id if current one not active + assert!(!servers.is_empty()); + while !servers.contains_key(&self.server_id) { + self.server_id = (self.server_id + 1) % population; + } + // establish connection to all servers + self.servers = servers + .into_iter() + .map(|(id, info)| (id, info.0)) + .collect(); + for (&id, &server) in &self.servers { + pf_info!(self.id; "connecting to server {} '{}'...", id, server); + let api_stub = + ClientApiStub::new_by_connect(self.id, server).await?; + self.api_stubs.insert(id, api_stub); + } + Ok(()) + } + _ => logged_err!(self.id; "unexpected reply type received"), + } + } + + async fn leave(&mut self, permanent: bool) -> Result<(), SummersetError> { + // send leave notification to all servers + for (id, mut api_stub) in self.api_stubs.drain() { + let mut sent = api_stub.send_req(Some(&ApiRequest::Leave))?; + while !sent { + sent = api_stub.send_req(None)?; + } + + while api_stub.recv_reply().await? != ApiReply::Leave {} + pf_info!(self.id; "left server connection {}", id); + api_stub.forget(); + } + + // if permanently leaving, send leave notification to the manager + if permanent { + let mut sent = + self.ctrl_stub.send_req(Some(&CtrlRequest::Leave))?; + while !sent { + sent = self.ctrl_stub.send_req(None)?; + } + + while self.ctrl_stub.recv_reply().await? != CtrlReply::Leave {} + pf_info!(self.id; "left manager connection"); + } + + Ok(()) + } + + fn send_req( + &mut self, + req: Option<&ApiRequest>, + ) -> Result { + if self.api_stubs.contains_key(&self.server_id) { + self.api_stubs + .get_mut(&self.server_id) + .unwrap() + .send_req(req) + } else { + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) + } + } + + async fn recv_reply(&mut self) -> Result { + if self.api_stubs.contains_key(&self.server_id) { + let reply = self + .api_stubs + .get_mut(&self.server_id) + .unwrap() + .recv_reply() + .await?; + + if let ApiReply::Reply { + ref result, + ref redirect, + .. 
+ } = reply + { + // if the current server redirects me to a different server + if result.is_none() && redirect.is_some() { + let redirect_id = redirect.unwrap(); + assert!(self.servers.contains_key(&redirect_id)); + self.server_id = redirect_id; + pf_debug!(self.id; "redirected to replica {} '{}'", + redirect_id, self.servers[&redirect_id]); + } + } + + Ok(reply) + } else { + Err(SummersetError(format!( + "server_id {} not in api_stubs", + self.server_id + ))) + } + } + + fn id(&self) -> ClientId { + self.id + } + + fn ctrl_stub(&mut self) -> &mut ClientCtrlStub { + &mut self.ctrl_stub + } +} diff --git a/src/protocols/rep_nothing.rs b/src/protocols/rep_nothing.rs index e5c6b0dd..a14af95b 100644 --- a/src/protocols/rep_nothing.rs +++ b/src/protocols/rep_nothing.rs @@ -28,8 +28,8 @@ use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] pub struct ReplicaConfigRepNothing { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -49,7 +49,7 @@ pub struct ReplicaConfigRepNothing { impl Default for ReplicaConfigRepNothing { fn default() -> Self { ReplicaConfigRepNothing { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.rep_nothing.wal".into(), logger_sync: false, @@ -59,9 +59,9 @@ impl Default for ReplicaConfigRepNothing { } } -/// Log entry type. +/// WAL log entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -struct LogEntry { +struct WalEntry { reqs: Vec<(ClientId, ApiRequest)>, } @@ -97,17 +97,19 @@ pub struct RepNothingReplica { state_machine: StateMachine, /// StorageHub module. - storage_hub: StorageHub, + storage_hub: StorageHub, /// In-memory log of instances. insts: Vec, - /// Current durable log file offset. - log_offset: usize, + /// Current durable WAL log file offset. + wal_offset: usize, } +// RepNothingReplica common helpers impl RepNothingReplica { /// Compose CommandId from instance index & command index within. + #[inline] fn make_command_id(inst_idx: usize, cmd_idx: usize) -> CommandId { assert!(inst_idx <= (u32::MAX as usize)); assert!(cmd_idx <= (u32::MAX as usize)); @@ -115,12 +117,16 @@ impl RepNothingReplica { } /// Decompose CommandId into instance index & command index within. + #[inline] fn split_command_id(command_id: CommandId) -> (usize, usize) { let inst_idx = (command_id >> 32) as usize; let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; (inst_idx, cmd_idx) } +} +// RepNothingReplica client requests entrance +impl RepNothingReplica { /// Handler of client request batch chan recv. fn handle_req_batch( &mut self, @@ -138,23 +144,26 @@ impl RepNothingReplica { self.insts.push(inst); // submit log action to make this instance durable - let log_entry = LogEntry { reqs: req_batch }; + let wal_entry = WalEntry { reqs: req_batch }; self.storage_hub.submit_action( inst_idx as LogActionId, LogAction::Append { - entry: log_entry, + entry: wal_entry, sync: self.config.logger_sync, }, )?; Ok(()) } +} +// RepNothingReplica durable WAL logging +impl RepNothingReplica { /// Handler of durable logging result chan recv. 
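The Raft client above hops between servers purely on redirect hints: a reply carrying no result but a `redirect` field makes it switch `server_id` to the suggested replica before retrying. A minimal sketch of that client-side rule; `Reply` and `apply_redirect` are simplified stand-ins (not the actual ApiReply/stub types), and the addresses are made up:

use std::collections::HashMap;

type ReplicaId = u8;

// simplified stand-in for a reply that may carry a redirect hint
struct Reply {
    result: Option<String>,
    redirect: Option<ReplicaId>,
}

fn apply_redirect(
    current: &mut ReplicaId,
    servers: &HashMap<ReplicaId, String>,
    reply: &Reply,
) {
    // only follow the hint when the request was not actually served here
    if reply.result.is_none() {
        if let Some(target) = reply.redirect {
            // trust the hint only if the target is a known server
            if servers.contains_key(&target) {
                *current = target;
            }
        }
    }
}

fn main() {
    let servers = HashMap::from([
        (0u8, "10.0.0.1:50077".to_string()),
        (1u8, "10.0.0.2:50077".to_string()),
    ]);
    let mut current: ReplicaId = 0;
    let reply = Reply { result: None, redirect: Some(1) };
    apply_redirect(&mut current, &servers, &reply);
    assert_eq!(current, 1);
}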
fn handle_log_result( &mut self, action_id: LogActionId, - log_result: LogResult, + log_result: LogResult, ) -> Result<(), SummersetError> { let inst_idx = action_id as usize; if inst_idx >= self.insts.len() { @@ -163,8 +172,8 @@ impl RepNothingReplica { match log_result { LogResult::Append { now_size } => { - assert!(now_size >= self.log_offset); - self.log_offset = now_size; + assert!(now_size >= self.wal_offset); + self.wal_offset = now_size; } _ => { return logged_err!(self.id; "unexpected log result type for {}: {:?}", inst_idx, log_result); @@ -190,7 +199,10 @@ impl RepNothingReplica { Ok(()) } +} +// RepNothingReplica state machine execution +impl RepNothingReplica { /// Handler of state machine exec result chan recv. fn handle_cmd_result( &mut self, @@ -236,7 +248,10 @@ impl RepNothingReplica { Ok(()) } +} +// RepNothingReplica control messages handling +impl RepNothingReplica { /// Handler of ResetState control message. async fn handle_ctrl_reset_state( &mut self, @@ -321,16 +336,19 @@ impl RepNothingReplica { _ => Ok(None), // ignore all other types } } +} - /// Recover state from durable storage log. - async fn recover_from_log(&mut self) -> Result<(), SummersetError> { - assert_eq!(self.log_offset, 0); +// RepNothingReplica recovery from WAL log +impl RepNothingReplica { + /// Recover state from durable storage WAL log. + async fn recover_from_wal(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.wal_offset, 0); loop { // using 0 as a special log action ID self.storage_hub.submit_action( 0, LogAction::Read { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -356,7 +374,7 @@ impl RepNothingReplica { execed: vec![true; num_reqs], }); // update log offset - self.log_offset = end_offset; + self.wal_offset = end_offset; } LogResult::Read { entry: None, .. 
} => { // end of log reached @@ -372,7 +390,7 @@ impl RepNothingReplica { self.storage_hub.submit_action( 0, LogAction::Truncate { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -401,14 +419,14 @@ impl GenericReplica for RepNothingReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigRepNothing; - batch_interval_us, max_batch_size, + batch_interval_ms, max_batch_size, backer_path, logger_sync, perf_storage_a, perf_storage_b)?; - if config.batch_interval_us == 0 { + if config.batch_interval_ms == 0 { return logged_err!( id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms ); } @@ -442,7 +460,7 @@ impl GenericReplica for RepNothingReplica { let external_api = ExternalApi::new_and_setup( id, api_addr, - Duration::from_micros(config.batch_interval_us), + Duration::from_millis(config.batch_interval_ms), config.max_batch_size, ) .await?; @@ -457,7 +475,7 @@ impl GenericReplica for RepNothingReplica { state_machine, storage_hub, insts: vec![], - log_offset: 0, + wal_offset: 0, }) } @@ -465,8 +483,8 @@ impl GenericReplica for RepNothingReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { - // recover state from durable storage log - self.recover_from_log().await?; + // recover state from durable storage WAL log + self.recover_from_wal().await?; // main event loop let mut paused = false; diff --git a/src/protocols/rs_paxos.rs b/src/protocols/rs_paxos.rs index 82990870..98ef6af2 100644 --- a/src/protocols/rs_paxos.rs +++ b/src/protocols/rs_paxos.rs @@ -3,6 +3,7 @@ //! MultiPaxos with Reed-Solomon erasure coding. References: //! - +use std::cmp; use std::collections::HashMap; use std::path::Path; use std::net::SocketAddr; @@ -33,8 +34,8 @@ use reed_solomon_erasure::galois_8::ReedSolomon; /// Configuration parameters struct. #[derive(Debug, Deserialize)] pub struct ReplicaConfigRSPaxos { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -47,7 +48,6 @@ pub struct ReplicaConfigRSPaxos { /// Min timeout of not hearing any heartbeat from leader in millisecs. pub hb_hear_timeout_min: u64, - /// Max timeout of not hearing any heartbeat from leader in millisecs. pub hb_hear_timeout_max: u64, @@ -64,6 +64,9 @@ pub struct ReplicaConfigRSPaxos { /// Fault-tolerance level. pub fault_tolerance: u8, + /// Maximum chunk size of a ReconstructRead message. + pub recon_chunk_size: usize, + // Performance simulation params (all zeros means no perf simulation): pub perf_storage_a: u64, pub perf_storage_b: u64, @@ -75,7 +78,7 @@ pub struct ReplicaConfigRSPaxos { impl Default for ReplicaConfigRSPaxos { fn default() -> Self { ReplicaConfigRSPaxos { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.rs_paxos.wal".into(), logger_sync: false, @@ -85,6 +88,7 @@ impl Default for ReplicaConfigRSPaxos { snapshot_path: "/tmp/summerset.rs_paxos.snap".into(), snapshot_interval_s: 0, fault_tolerance: 0, + recon_chunk_size: 1000, perf_storage_a: 0, perf_storage_b: 0, perf_network_a: 0, @@ -156,12 +160,12 @@ struct Instance { external: bool, /// Offset of first durable WAL log entry related to this instance. 
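The recover_from_wal routine above replays `LogAction::Read` results one entry at a time, advancing wal_offset to each entry's end_offset, and finally truncates the file at the last good offset so a torn tail write cannot survive a restart. A self-contained toy sketch of that replay-then-truncate flow; `ToyWal` and its methods are invented for illustration and are not the StorageHub API:

// toy append-only log used only to illustrate the recovery flow
struct ToyWal {
    entries: Vec<(usize, String)>, // (end_offset, payload)
    size: usize,
}

impl ToyWal {
    fn read_from(&self, offset: usize) -> Option<(usize, &String)> {
        self.entries
            .iter()
            .find(|(end, _)| *end > offset)
            .map(|(end, payload)| (*end, payload))
    }
    fn truncate(&mut self, offset: usize) {
        self.entries.retain(|(end, _)| *end <= offset);
        self.size = offset;
    }
}

fn main() {
    let mut wal = ToyWal {
        entries: vec![(8, "e1".into()), (20, "e2".into())],
        size: 25, // trailing 5 bytes stand in for a torn write
    };
    let mut wal_offset = 0;
    // replay every complete entry, remembering the last good end offset
    while let Some((end_offset, payload)) = wal.read_from(wal_offset) {
        println!("replaying {payload}");
        wal_offset = end_offset;
    }
    // drop any partial tail so new appends start at a clean offset
    wal.truncate(wal_offset);
    assert_eq!(wal.size, 20);
}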
- log_offset: usize, + wal_offset: usize, } -/// Stable storage log entry type. +/// Stable storage WAL log entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -enum LogEntry { +enum WalEntry { /// Records an update to the largest prepare ballot seen. PrepareBal { slot: usize, ballot: Ballot }, @@ -177,11 +181,20 @@ enum LogEntry { } /// Snapshot file entry type. +/// +/// NOTE: the current implementation simply appends a squashed log at the +/// end of the snapshot file for simplicity. In production, the snapshot +/// file should be a bounded-sized backend, e.g., an LSM-tree. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] enum SnapEntry { - /// First entry at the start of file: number of log instances covered by - /// this snapshot file == the start slot index of in-mem log. - StartSlot { slot: usize }, + /// Necessary slot indices to remember. + SlotInfo { + /// First entry at the start of file: number of log instances covered + /// by this snapshot file == the start slot index of in-mem log. + start_slot: usize, + /// Index of the first non-committed slot. + commit_bar: usize, + }, /// Set of key-value pairs to apply to the state. KVPairSet { pairs: HashMap }, @@ -191,7 +204,13 @@ enum SnapEntry { #[derive(Debug, Clone, Serialize, Deserialize, GetSize)] enum PeerMsg { /// Prepare message from leader to replicas. - Prepare { slot: usize, ballot: Ballot }, + Prepare { + /// Slot index in Prepare message is the triggering slot of this + /// Prepare. Once prepared, it means that all slots in the range + /// [slot, +infinity) are prepared under this ballot number. + slot: usize, + ballot: Ballot, + }, /// Prepare reply from replica to leader. PrepareReply { @@ -215,18 +234,27 @@ enum PeerMsg { /// Commit notification from leader to replicas. Commit { slot: usize }, + /// Request by a lagging replica to leader asking to re-send Accepts for + /// missing holes + FillHoles { slots: Vec }, + /// Reconstruction read from new leader to replicas. - Reconstruct { slot: usize }, + Reconstruct { slots: Vec }, /// Reconstruction read reply from replica to leader. ReconstructReply { - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, + /// Map from slot -> (ballot, peer shards). + slots_data: HashMap)>, }, /// Leader activity heartbeat. - Heartbeat { ballot: Ballot, exec_bar: usize }, + Heartbeat { + ballot: Ballot, + /// For leader step-up as well as conservative snapshotting purpose. + exec_bar: usize, + /// For conservative snapshotting purpose. + snap_bar: usize, + }, } /// RSPaxos server replica module. @@ -238,7 +266,7 @@ pub struct RSPaxosReplica { population: u8, /// Majority quorum size. - quorum_cnt: u8, + majority: u8, /// Configuration parameters struct. config: ReplicaConfigRSPaxos, @@ -259,7 +287,10 @@ state_machine: StateMachine, /// StorageHub module. - storage_hub: StorageHub, + storage_hub: StorageHub, /// StorageHub module for the snapshot file. snapshot_hub: StorageHub, @@ -267,14 +298,21 @@ pub struct RSPaxosReplica { /// TransportHub module. transport_hub: TransportHub, + /// Who do I think is the effective leader of the cluster right now? + leader: Option, + /// Timer for hearing heartbeat from leader. hb_hear_timer: Timer, /// Interval for sending heartbeat to followers. hb_send_interval: Interval, - /// Do I think I am the leader?
- is_leader: bool, + /// Heartbeat reply counters for approximate detection of follower health. + /// Tuple of (#hb_replied, #hb_replied seen at last send, repetition). + hb_reply_cnts: HashMap, + + /// Approximate health status tracking of peer replicas. + peer_alive: Bitmap, /// In-memory log of instances. insts: Vec, @@ -301,8 +339,21 @@ pub struct RSPaxosReplica { /// It is always true that exec_bar <= commit_bar <= start_slot + insts.len() exec_bar: usize, - /// Current durable log file offset. - log_offset: usize, + /// Map from peer ID -> its latest exec_bar I know; this is for conservative + /// snapshotting purpose. + peer_exec_bar: HashMap, + + /// Slot index before which it is safe to take snapshot. + /// NOTE: we are taking a conservative approach here that a snapshot + /// covering an entry can be taken only when all servers have durably + /// committed (and executed) that entry. + snap_bar: usize, + + /// Current durable WAL log file offset. + wal_offset: usize, /// Current durable snapshot file offset. snap_offset: usize, @@ -311,42 +362,64 @@ pub struct RSPaxosReplica { rs_coder: ReedSolomon, } +// RSPaxosReplica common helpers impl RSPaxosReplica { + /// Do I think I am the current effective leader? + #[inline] + fn is_leader(&self) -> bool { + self.leader == Some(self.id) + } + /// Create an empty null instance. + #[inline] fn null_instance(&self) -> Result { Ok(Instance { bal: 0, status: Status::Null, reqs_cw: RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + self.population - self.majority, )?, voted: ( 0, RSCodeword::::from_null( - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + self.population - self.majority, )?, ), leader_bk: None, replica_bk: None, external: false, - log_offset: 0, + wal_offset: 0, }) } + /// Locate the first null slot or append a null instance if no holes exist. + fn first_null_slot(&mut self) -> Result { + for s in self.commit_bar..(self.start_slot + self.insts.len()) { + if self.insts[s - self.start_slot].status == Status::Null { + return Ok(s); + } + } + self.insts.push(self.null_instance()?); + Ok(self.start_slot + self.insts.len() - 1) + } + /// Compose a unique ballot number from base. + #[inline] fn make_unique_ballot(&self, base: u64) -> Ballot { ((base << 8) | ((self.id + 1) as u64)) as Ballot } /// Compose a unique ballot number greater than the given one. + #[inline] fn make_greater_ballot(&self, bal: Ballot) -> Ballot { self.make_unique_ballot((bal >> 8) + 1) } /// Compose LogActionId from slot index & entry type. /// Uses the `Status` enum type to represent different entry types. + #[inline] fn make_log_action_id(slot: usize, entry_type: Status) -> LogActionId { let type_num = match entry_type { Status::Preparing => 1, @@ -358,6 +431,7 @@ impl RSPaxosReplica { } /// Decompose LogActionId into slot index & entry type. + #[inline] fn split_log_action_id(log_action_id: LogActionId) -> (usize, Status) { let slot = (log_action_id >> 2) as usize; let type_num = log_action_id & ((1 << 2) - 1); @@ -371,6 +445,7 @@ impl RSPaxosReplica { } /// Compose CommandId from slot index & command index within. + #[inline] fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId { assert!(slot <= (u32::MAX as usize)); assert!(cmd_idx <= (u32::MAX as usize)); @@ -378,12 +453,16 @@ impl RSPaxosReplica { } /// Decompose CommandId into slot index & command index within.
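make_unique_ballot above reserves the low 8 bits of a ballot for the proposer's ID (plus one, so ballot 0 stays the null value), which keeps ballots from different replicas distinct and lets make_greater_ballot outbid any observed ballot; make_command_id plays the same trick with 32-bit halves for the slot and the in-batch command index. A standalone sketch of both packings, written as free functions mirroring the methods above:

type Ballot = u64;
type ReplicaId = u8;
type CommandId = u64;

// ballot = (round << 8) | (replica id + 1); +1 keeps 0 reserved as the null ballot
fn make_unique_ballot(id: ReplicaId, base: u64) -> Ballot {
    (base << 8) | ((id + 1) as u64)
}

// strictly greater than `bal`, no matter which replica produced `bal`
fn make_greater_ballot(id: ReplicaId, bal: Ballot) -> Ballot {
    make_unique_ballot(id, (bal >> 8) + 1)
}

// slot index in the high 32 bits, command index within the batch in the low 32
fn make_command_id(slot: usize, cmd_idx: usize) -> CommandId {
    assert!(slot <= u32::MAX as usize && cmd_idx <= u32::MAX as usize);
    ((slot << 32) | cmd_idx) as CommandId
}

fn split_command_id(command_id: CommandId) -> (usize, usize) {
    ((command_id >> 32) as usize, (command_id & ((1 << 32) - 1)) as usize)
}

fn main() {
    let b1 = make_unique_ballot(2, 7); // replica 2, round 7
    let b2 = make_greater_ballot(0, b1); // replica 0 outbids it
    assert!(b2 > b1);
    assert_eq!(split_command_id(make_command_id(5, 3)), (5, 3));
}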
+ #[inline] fn split_command_id(command_id: CommandId) -> (usize, usize) { let slot = (command_id >> 32) as usize; let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; (slot, cmd_idx) } +} +// RSPaxosReplica client requests entrance +impl RSPaxosReplica { /// Handler of client request batch chan recv. fn handle_req_batch( &mut self, @@ -394,21 +473,26 @@ impl RSPaxosReplica { pf_debug!(self.id; "got request batch of size {}", batch_size); // if I'm not a leader, ignore client requests - if !self.is_leader { + if !self.is_leader() { for (client, req) in req_batch { if let ApiRequest::Req { id: req_id, .. } = req { - // tell the client to try on the next replica - let next_replica = (self.id + 1) % self.population; + // tell the client to try on known leader or just the + // next ID replica + let target = if let Some(peer) = self.leader { + peer + } else { + (self.id + 1) % self.population + }; self.external_api.send_reply( ApiReply::Reply { id: req_id, result: None, - redirect: Some(next_replica), + redirect: Some(target), }, client, )?; pf_trace!(self.id; "redirected client {} to replica {}", - client, next_replica); + client, target); } } return Ok(()); @@ -417,39 +501,24 @@ impl RSPaxosReplica { // compute the complete Reed-Solomon codeword for the batch data let mut reqs_cw = RSCodeword::from_data( req_batch, - self.quorum_cnt, - self.population - self.quorum_cnt, + self.majority, + self.population - self.majority, )?; reqs_cw.compute_parity(Some(&self.rs_coder))?; // create a new instance in the first null slot (or append a new one - // at the end if no holes exist) - let mut slot = self.start_slot + self.insts.len(); - for s in self.commit_bar..(self.start_slot + self.insts.len()) { - if self.insts[s - self.start_slot].status == Status::Null { - slot = s; - break; - } - } - if slot < self.start_slot + self.insts.len() { - let old_inst = &mut self.insts[slot - self.start_slot]; - assert_eq!(old_inst.status, Status::Null); - old_inst.reqs_cw = reqs_cw; - old_inst.leader_bk = Some(LeaderBookkeeping { - prepare_acks: Bitmap::new(self.population, false), - prepare_max_bal: 0, - accept_acks: Bitmap::new(self.population, false), - }); - } else { - let mut new_inst = self.null_instance()?; - new_inst.reqs_cw = reqs_cw; - new_inst.leader_bk = Some(LeaderBookkeeping { + // at the end if no holes exist); fill it up with incoming data + let slot = self.first_null_slot()?; + { + let inst = &mut self.insts[slot - self.start_slot]; + assert_eq!(inst.status, Status::Null); + inst.reqs_cw = reqs_cw; + inst.leader_bk = Some(LeaderBookkeeping { prepare_acks: Bitmap::new(self.population, false), prepare_max_bal: 0, accept_acks: Bitmap::new(self.population, false), }); - new_inst.external = true; - self.insts.push(new_inst); + inst.external = true; } // decide whether we can enter fast path for this instance @@ -473,7 +542,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -510,7 +579,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot: inst.bal, // persist only one shard on myself @@ -545,7 +614,10 @@ impl RSPaxosReplica { Ok(()) } +} +// RSPaxosReplica durable WAL logging +impl RSPaxosReplica { /// Handler of PrepareBal logging result chan recv. 
fn handle_logged_prepare_bal( &mut self, @@ -563,7 +635,7 @@ impl RSPaxosReplica { None }; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of a PrepareBal entry // is equivalent to receiving a Prepare reply from myself // (as an acceptor role) @@ -600,7 +672,7 @@ impl RSPaxosReplica { slot, self.insts[slot - self.start_slot].bal); let inst = &self.insts[slot - self.start_slot]; - if self.is_leader { + if self.is_leader() { // on leader, finishing the logging of an AcceptData entry // is equivalent to receiving an Accept reply from myself // (as an acceptor role) @@ -642,13 +714,15 @@ impl RSPaxosReplica { if inst.status < Status::Committed { break; } + let now_slot = self.commit_bar; + self.commit_bar += 1; - if inst.reqs_cw.avail_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_shards() < self.majority { // can't execute if I don't have the complete request batch pf_debug!(self.id; "postponing execution for slot {} (shards {}/{})", - slot, inst.reqs_cw.avail_shards(), self.quorum_cnt); + slot, inst.reqs_cw.avail_shards(), self.majority); break; - } else if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + } else if inst.reqs_cw.avail_data_shards() < self.majority { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -662,7 +736,7 @@ impl RSPaxosReplica { for (cmd_idx, (_, req)) in reqs.iter().enumerate() { if let ApiRequest::Req { cmd, .. } = req { self.state_machine.submit_cmd( - Self::make_command_id(self.commit_bar, cmd_idx), + Self::make_command_id(now_slot, cmd_idx), cmd.clone(), )?; } else { @@ -670,10 +744,23 @@ impl RSPaxosReplica { } } pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), self.commit_bar); + reqs.len(), now_slot); } + } + } - self.commit_bar += 1; + // if there are hole(s) between current commit_bar and newly committed + // slot, ask the leader to re-send Accept messages for those slots + if slot > self.commit_bar && !self.is_leader() { + if let Some(leader) = self.leader { + let holes: Vec = (self.commit_bar..slot).collect(); + self.transport_hub.send_msg( + PeerMsg::FillHoles { + slots: holes.clone(), + }, + leader, + )?; + pf_trace!(self.id; "sent FillHoles -> {} slots {:?}", leader, holes); } } @@ -684,7 +771,7 @@ impl RSPaxosReplica { fn handle_log_result( &mut self, action_id: LogActionId, - log_result: LogResult, + log_result: LogResult, ) -> Result<(), SummersetError> { let (slot, entry_type) = Self::split_log_action_id(action_id); if slot < self.start_slot { @@ -693,15 +780,15 @@ impl RSPaxosReplica { assert!(slot < self.start_slot + self.insts.len()); if let LogResult::Append { now_size } = log_result { - assert!(now_size >= self.log_offset); - // update first log_offset of slot + assert!(now_size >= self.wal_offset); + // update first wal_offset of slot let inst = &mut self.insts[slot - self.start_slot]; - if inst.log_offset == 0 || inst.log_offset > self.log_offset { - inst.log_offset = self.log_offset; + if inst.wal_offset == 0 || inst.wal_offset > self.wal_offset { + inst.wal_offset = self.wal_offset; } - assert!(inst.log_offset <= self.log_offset); - // then update self.log_offset - self.log_offset = now_size; + assert!(inst.wal_offset <= self.wal_offset); + // then update self.wal_offset + self.wal_offset = now_size; } else { return logged_err!(self.id; "unexpected log result type: {:?}", log_result); } @@ -715,7 +802,10 @@ impl RSPaxosReplica { } } } +} +// RSPaxosReplica peer-peer messages handling +impl RSPaxosReplica { /// 
Handler of Prepare message from leader. fn handle_msg_prepare( &mut self, @@ -749,7 +839,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { slot, ballot }, + entry: WalEntry::PrepareBal { slot, ballot }, sync: self.config.logger_sync, }, )?; @@ -778,10 +868,11 @@ impl RSPaxosReplica { // if ballot is what I'm currently waiting on for Prepare replies: if ballot == self.bal_prep_sent { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Preparing) || (ballot < inst.bal) { @@ -815,10 +906,10 @@ impl RSPaxosReplica { // reconstruct the original data, enter Accept phase for this // instance using the request batch value constructed using shards // with the highest ballot number in quorum - if leader_bk.prepare_acks.count() >= self.quorum_cnt - && inst.reqs_cw.avail_shards() >= self.quorum_cnt + if leader_bk.prepare_acks.count() >= self.majority + && inst.reqs_cw.avail_shards() >= self.majority { - if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_data_shards() < self.majority { // have enough shards but need reconstruction inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; } @@ -845,7 +936,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot, reqs_cw: subset_copy, @@ -917,7 +1008,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Accepting), LogAction::Append { - entry: LogEntry::AcceptData { + entry: WalEntry::AcceptData { slot, ballot, reqs_cw: inst.reqs_cw.clone(), @@ -948,10 +1039,11 @@ impl RSPaxosReplica { // if ballot is what I'm currently waiting on for Accept replies: if ballot == self.bal_prepared { assert!(slot < self.start_slot + self.insts.len()); + let is_leader = self.is_leader(); let inst = &mut self.insts[slot - self.start_slot]; // ignore spurious duplications and outdated replies - if !self.is_leader + if !is_leader || (inst.status != Status::Accepting) || (ballot < inst.bal) { @@ -970,9 +1062,9 @@ impl RSPaxosReplica { // if quorum size reached AND enough number of shards are // remembered, mark this instance as committed; in RS-Paxos, this - // means accept_acks.count() >= self.quorum_cnt + fault_tolerance + // means accept_acks.count() >= self.majority + fault_tolerance if leader_bk.accept_acks.count() - >= self.quorum_cnt + self.config.fault_tolerance + >= self.majority + self.config.fault_tolerance { inst.status = Status::Committed; pf_debug!(self.id; "committed instance at slot {} bal {}", @@ -982,7 +1074,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { slot }, sync: self.config.logger_sync, }, )?; @@ -1031,7 +1123,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Committed), LogAction::Append { - entry: LogEntry::CommitSlot { slot }, + entry: WalEntry::CommitSlot { slot }, sync: self.config.logger_sync, }, )?; @@ -1041,40 +1133,83 @@ impl RSPaxosReplica { Ok(()) } - /// Handler of Reconstruct message from leader. 
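Under the accept rule above, an RS-Paxos instance commits once majority + fault_tolerance acceptors hold their shard, with the Reed-Solomon coder sized elsewhere in this diff as majority data shards and population - majority parity shards. A quick numeric sketch of those relations; rs_paxos_sizing is illustrative arithmetic only, not the ReedSolomon API:

fn rs_paxos_sizing(population: u8, fault_tolerance: u8) -> Option<(u8, u8, u8)> {
    let majority = population / 2 + 1; // number of data shards
    let parity = population - majority; // number of parity shards
    if fault_tolerance > parity {
        return None; // not enough parity shards to mask that many failures
    }
    let commit_quorum = majority + fault_tolerance;
    Some((majority, parity, commit_quorum))
}

fn main() {
    // population 5: 3 data shards, 2 parity shards;
    // with fault_tolerance = 1 an instance commits after 4 shard-holding acks
    assert_eq!(rs_paxos_sizing(5, 1), Some((3, 2, 4)));
    // asking to tolerate 3 failures out of 5 is rejected
    assert_eq!(rs_paxos_sizing(5, 3), None);
}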
- fn handle_msg_reconstruct( + /// Handler of FillHoles message from a lagging peer. + fn handle_msg_fill_holes( &mut self, peer: ReplicaId, - slot: usize, + slots: Vec, ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated + if !self.is_leader() { + return Ok(()); } - pf_trace!(self.id; "received Reconstruct <- {} for slot {}", peer, slot); + pf_trace!(self.id; "received FillHoles <- {} for slots {:?}", peer, slots); - // locate instance in memory, filling in null instances if needed - while self.start_slot + self.insts.len() <= slot { - self.insts.push(self.null_instance()?); - } - let inst = &mut self.insts[slot - self.start_slot]; + for slot in slots { + if slot < self.start_slot { + continue; + } else if slot >= self.start_slot + self.insts.len() { + break; + } + let inst = &self.insts[slot - self.start_slot]; - // ignore spurious duplications; also ignore if I have nothing to send back - if inst.status < Status::Accepting || inst.reqs_cw.avail_shards() == 0 { - return Ok(()); + if inst.status >= Status::Committed { + // re-send Accept message for this slot + self.transport_hub.send_msg( + PeerMsg::Accept { + slot, + ballot: self.bal_prepared, + reqs_cw: inst.reqs_cw.subset_copy( + Bitmap::from(self.population, vec![peer]), + false, + )?, + }, + peer, + )?; + pf_trace!(self.id; "sent Accept -> {} for slot {} bal {}", + peer, slot, self.bal_prepared); + } } - // send back my ballot for this slot and the available shards - self.transport_hub.send_msg( - PeerMsg::ReconstructReply { - slot, - ballot: inst.bal, - reqs_cw: inst.reqs_cw.clone(), - }, - peer, - )?; - pf_trace!(self.id; "sent ReconstructReply message for slot {} bal {}", - slot, inst.bal); + Ok(()) + } + + /// Handler of Reconstruct message from leader. 
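The FillHoles handler above is the leader half of a simple gap-repair exchange: a follower that learns of a commit beyond its commit_bar lists the missing slots, and the leader re-sends an Accept (carrying the follower's shard) for each listed slot it has already committed. A small sketch of the follower-side hole computation; hole_slots is a hypothetical helper name:

/// Slots strictly between the follower's commit frontier and a newly
/// committed slot are the holes it asks the leader to re-fill.
fn hole_slots(commit_bar: usize, committed_slot: usize) -> Vec<usize> {
    if committed_slot > commit_bar {
        (commit_bar..committed_slot).collect()
    } else {
        Vec::new()
    }
}

fn main() {
    // commit_bar == 4, but a Commit for slot 7 just arrived:
    // slots 4, 5, 6 are missing and get requested from the leader
    assert_eq!(hole_slots(4, 7), vec![4, 5, 6]);
    assert!(hole_slots(4, 4).is_empty());
}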
+ fn handle_msg_reconstruct( + &mut self, + peer: ReplicaId, + slots: Vec, + ) -> Result<(), SummersetError> { + pf_trace!(self.id; "received Reconstruct <- {} for slots {:?}", peer, slots); + let mut slots_data = HashMap::new(); + + for slot in slots { + if slot < self.start_slot { + continue; // ignore if slot index outdated + } + + // locate instance in memory, filling in null instances if needed + while self.start_slot + self.insts.len() <= slot { + self.insts.push(self.null_instance()?); + } + let inst = &mut self.insts[slot - self.start_slot]; + + // ignore spurious duplications; also ignore if I have nothing to send back + if inst.status < Status::Accepting + || inst.reqs_cw.avail_shards() == 0 + { + continue; + } + + // send back my ballot for this slot and the available shards + slots_data.insert(slot, (inst.bal, inst.reqs_cw.clone())); + } + if !slots_data.is_empty() { + let num_slots = slots_data.len(); + self.transport_hub + .send_msg(PeerMsg::ReconstructReply { slots_data }, peer)?; + pf_trace!(self.id; "sent ReconstructReply message for {} slots", num_slots); + } Ok(()) } @@ -1082,65 +1217,66 @@ impl RSPaxosReplica { fn handle_msg_reconstruct_reply( &mut self, peer: ReplicaId, - slot: usize, - ballot: Ballot, - reqs_cw: RSCodeword, + slots_data: HashMap)>, ) -> Result<(), SummersetError> { - if slot < self.start_slot { - return Ok(()); // ignore if slot index outdated - } - pf_trace!(self.id; "received ReconstructReply <- {} for slot {} bal {} shards {:?}", - peer, slot, ballot, reqs_cw.avail_shards_map()); - assert!(slot < self.start_slot + self.insts.len()); - assert!(self.insts[slot - self.start_slot].status >= Status::Committed); - let num_insts = self.start_slot + self.insts.len(); - let inst = &mut self.insts[slot - self.start_slot]; + for (slot, (ballot, reqs_cw)) in slots_data { + if slot < self.start_slot { + continue; // ignore if slot index outdated + } + pf_trace!(self.id; "in ReconstructReply <- {} for slot {} bal {} shards {:?}", + peer, slot, ballot, reqs_cw.avail_shards_map()); + assert!(slot < self.start_slot + self.insts.len()); + assert!( + self.insts[slot - self.start_slot].status >= Status::Committed + ); + let inst = &mut self.insts[slot - self.start_slot]; - // if reply not outdated and ballot is up-to-date - if inst.status < Status::Executed && ballot >= inst.bal { - // absorb the shards from this replica - inst.reqs_cw.absorb_other(reqs_cw)?; - - // if enough shards have been gathered, can push execution forward - if slot == self.commit_bar { - while self.commit_bar < num_insts { - let inst = - &mut self.insts[self.commit_bar - self.start_slot]; - if inst.status < Status::Committed - || inst.reqs_cw.avail_shards() < self.quorum_cnt - { - break; - } + // if reply not outdated and ballot is up-to-date + if inst.status < Status::Executed && ballot >= inst.bal { + // absorb the shards from this replica + inst.reqs_cw.absorb_other(reqs_cw)?; + + // if enough shards have been gathered, can push execution forward + if slot == self.exec_bar { + let mut now_slot = self.exec_bar; + while now_slot < self.start_slot + self.insts.len() { + let inst = &mut self.insts[now_slot - self.start_slot]; + if inst.status < Status::Committed + || inst.reqs_cw.avail_shards() < self.majority + { + break; + } - if inst.reqs_cw.avail_data_shards() < self.quorum_cnt { - // have enough shards but need reconstruction - inst.reqs_cw.reconstruct_data(Some(&self.rs_coder))?; - } - let reqs = inst.reqs_cw.get_data()?; + if inst.reqs_cw.avail_data_shards() < self.majority { + // have 
enough shards but need reconstruction + inst.reqs_cw + .reconstruct_data(Some(&self.rs_coder))?; + } + let reqs = inst.reqs_cw.get_data()?; - // submit commands in committed instance to the state machine - // for execution - if reqs.is_empty() { - inst.status = Status::Executed; - } else { - for (cmd_idx, (_, req)) in reqs.iter().enumerate() { - if let ApiRequest::Req { cmd, .. } = req { - self.state_machine.submit_cmd( - Self::make_command_id( - self.commit_bar, - cmd_idx, - ), - cmd.clone(), - )?; - } else { - continue; // ignore other types of requests + // submit commands in committed instance to the state machine + // for execution + if reqs.is_empty() { + inst.status = Status::Executed; + } else { + for (cmd_idx, (_, req)) in reqs.iter().enumerate() { + if let ApiRequest::Req { cmd, .. } = req { + self.state_machine.submit_cmd( + Self::make_command_id( + now_slot, cmd_idx, + ), + cmd.clone(), + )?; + } else { + continue; // ignore other types of requests + } } + pf_trace!(self.id; "submitted {} exec commands for slot {}", + reqs.len(), now_slot); } - pf_trace!(self.id; "submitted {} exec commands for slot {}", - reqs.len(), self.commit_bar); - } - self.commit_bar += 1; + now_slot += 1; + } } } } @@ -1172,20 +1308,26 @@ impl RSPaxosReplica { self.handle_msg_accept_reply(peer, slot, ballot) } PeerMsg::Commit { slot } => self.handle_msg_commit(peer, slot), - PeerMsg::Reconstruct { slot } => { - self.handle_msg_reconstruct(peer, slot) + PeerMsg::FillHoles { slots } => { + self.handle_msg_fill_holes(peer, slots) } - PeerMsg::ReconstructReply { - slot, - ballot, - reqs_cw, - } => self.handle_msg_reconstruct_reply(peer, slot, ballot, reqs_cw), - PeerMsg::Heartbeat { ballot, exec_bar } => { - self.heard_heartbeat(peer, ballot, exec_bar) + PeerMsg::Reconstruct { slots } => { + self.handle_msg_reconstruct(peer, slots) } + PeerMsg::ReconstructReply { slots_data } => { + self.handle_msg_reconstruct_reply(peer, slots_data) + } + PeerMsg::Heartbeat { + ballot, + exec_bar, + snap_bar, + } => self.heard_heartbeat(peer, ballot, exec_bar, snap_bar), } } +} +// RSPaxosReplica state machine execution +impl RSPaxosReplica { /// Handler of state machine exec result chan recv. fn handle_cmd_result( &mut self, @@ -1244,27 +1386,45 @@ impl RSPaxosReplica { Ok(()) } +} +// RSPaxosReplica leadership related logic +impl RSPaxosReplica { /// Becomes a leader, sends self-initiated Prepare messages to followers /// for all in-progress instances, and starts broadcasting heartbeats. fn become_a_leader(&mut self) -> Result<(), SummersetError> { - if self.is_leader { + if self.is_leader() { return Ok(()); + } else if let Some(peer) = self.leader { + // mark old leader as dead + if self.peer_alive.get(peer)? 
{ + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } } - self.is_leader = true; // this starts broadcasting heartbeats + self.leader = Some(self.id); // this starts broadcasting heartbeats self.control_hub .send_ctrl(CtrlMsg::LeaderStatus { step_up: true })?; pf_info!(self.id; "becoming a leader..."); - // broadcast a heartbeat right now + // clear peers' heartbeat reply counters, and broadcast a heartbeat now + for cnts in self.hb_reply_cnts.values_mut() { + *cnts = (1, 0, 0); + } self.bcast_heartbeats()?; + // re-initialize peer_exec_bar information + for slot in self.peer_exec_bar.values_mut() { + *slot = 0; + } + // make a greater ballot number and invalidate all in-progress instances self.bal_prepared = 0; self.bal_prep_sent = self.make_greater_ballot(self.bal_max_seen); self.bal_max_seen = self.bal_prep_sent; + let mut recon_slots = Vec::new(); for (slot, inst) in self .insts .iter_mut() @@ -1287,7 +1447,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( Self::make_log_action_id(slot, Status::Preparing), LogAction::Append { - entry: LogEntry::PrepareBal { + entry: WalEntry::PrepareBal { slot, ballot: self.bal_prep_sent, }, @@ -1312,15 +1472,20 @@ impl RSPaxosReplica { // do reconstruction reads for all committed instances that do not // hold enough available shards for reconstruction if inst.status == Status::Committed - && inst.reqs_cw.avail_shards() < self.quorum_cnt + && inst.reqs_cw.avail_shards() < self.majority { - self.transport_hub - .bcast_msg(PeerMsg::Reconstruct { slot }, None)?; - pf_trace!(self.id; "broadcast Reconstruct messages for slot {} bal {} shards {:?}", - slot, inst.bal, inst.reqs_cw.avail_shards_map()); + recon_slots.push(slot); } } + // send reconstruction read messages in chunks + for chunk in recon_slots.chunks(self.config.recon_chunk_size) { + let slots = chunk.to_vec(); + let num_slots = slots.len(); + self.transport_hub + .bcast_msg(PeerMsg::Reconstruct { slots }, None)?; + pf_trace!(self.id; "broadcast Reconstruct messages for {} slots", num_slots); + } Ok(()) } @@ -1330,10 +1495,43 @@ impl RSPaxosReplica { PeerMsg::Heartbeat { ballot: self.bal_prep_sent, exec_bar: self.exec_bar, + snap_bar: self.snap_bar, }, None, )?; - self.heard_heartbeat(self.id, self.bal_prep_sent, self.exec_bar)?; + + // update max heartbeat reply counters and their repetitions seen + for (&peer, cnts) in self.hb_reply_cnts.iter_mut() { + if cnts.0 > cnts.1 { + // more hb replies have been received from this peer; it is + // probably alive + cnts.1 = cnts.0; + cnts.2 = 0; + } else { + // did not receive hb reply from this peer at least for the + // last sent hb from me; increment repetition count + cnts.2 += 1; + let repeat_threshold = (self.config.hb_hear_timeout_min + / self.config.hb_send_interval_ms) + as u8; + if cnts.2 > repeat_threshold { + // did not receive hb reply from this peer for too many + // past hbs sent from me; this peer is probably dead + if self.peer_alive.get(peer)? { + self.peer_alive.set(peer, false)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + } + } + + // I also heard this heartbeat from myself + self.heard_heartbeat( + self.id, + self.bal_prep_sent, + self.exec_bar, + self.snap_bar, + )?; // pf_trace!(self.id; "broadcast heartbeats bal {}", self.bal_prep_sent); Ok(()) @@ -1342,6 +1540,11 @@ impl RSPaxosReplica { /// Chooses a random hb_hear_timeout from the min-max range and kicks off /// the hb_hear_timer. 
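The hb_reply_cnts bookkeeping above declares a follower suspected-dead once it stays silent for more consecutive heartbeat sends than hb_hear_timeout_min / hb_send_interval_ms allows. A compact sketch of that counting rule; HbCounter is a stand-in for the per-peer (replies seen, replies seen at last send, repetitions) tuple, not the replica's actual fields:

/// (replies seen, replies seen at the last send, sends since progress)
struct HbCounter(u64, u64, u8);

impl HbCounter {
    /// Called once per outgoing heartbeat; returns true when the peer
    /// should now be considered dead.
    fn on_send(&mut self, repeat_threshold: u8) -> bool {
        if self.0 > self.1 {
            // a reply arrived since the last send: peer looks alive
            self.1 = self.0;
            self.2 = 0;
            false
        } else {
            // no reply since the last send: bump the repetition count
            self.2 += 1;
            self.2 > repeat_threshold
        }
    }

    /// Called whenever a heartbeat reply from the peer is received.
    fn on_reply(&mut self) {
        self.0 += 1;
    }
}

fn main() {
    // e.g. hb_hear_timeout_min = 300 ms, hb_send_interval_ms = 50 ms
    let repeat_threshold = (300u64 / 50) as u8;
    let mut cnt = HbCounter(1, 0, 0);
    let mut dead = false;
    for _ in 0..=repeat_threshold + 1 {
        dead = cnt.on_send(repeat_threshold);
    }
    assert!(dead); // silent for too many sends -> suspected dead
}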
fn kickoff_hb_hear_timer(&mut self) -> Result<(), SummersetError> { + self.hb_hear_timer.cancel()?; + let timeout_ms = thread_rng().gen_range( self.config.hb_hear_timeout_min..=self.config.hb_hear_timeout_max, ); @@ -1357,10 +1560,19 @@ impl RSPaxosReplica { /// leader status if I currently think I'm a leader. fn heard_heartbeat( &mut self, - _peer: ReplicaId, + peer: ReplicaId, ballot: Ballot, exec_bar: usize, + snap_bar: usize, ) -> Result<(), SummersetError> { + if peer != self.id { + self.hb_reply_cnts.get_mut(&peer).unwrap().0 += 1; + if !self.peer_alive.get(peer)? { + self.peer_alive.set(peer, true)?; + pf_debug!(self.id; "peer_alive updated: {:?}", self.peer_alive); + } + } + // ignore outdated heartbeats and those from peers with exec_bar < mine if ballot < self.bal_max_seen || exec_bar < self.exec_bar { return Ok(()); @@ -1369,18 +1581,61 @@ impl RSPaxosReplica { // reset hearing timer self.kickoff_hb_hear_timer()?; - // clear my leader status if it carries a higher ballot number - if self.is_leader && ballot > self.bal_max_seen { - self.is_leader = false; - self.control_hub - .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; - pf_info!(self.id; "no longer a leader..."); + if peer != self.id { + // reply back with a Heartbeat message + self.transport_hub.send_msg( + PeerMsg::Heartbeat { + ballot, + exec_bar: self.exec_bar, + snap_bar: self.snap_bar, + }, + peer, + )?; + + // update peer_exec_bar if larger than known; if all servers' + // exec_bar (including myself) have passed a slot, that slot + // is definitely safe to be snapshotted + if exec_bar > self.peer_exec_bar[&peer] { + *self.peer_exec_bar.get_mut(&peer).unwrap() = exec_bar; + let passed_cnt = 1 + self + .peer_exec_bar + .values() + .filter(|&&e| e >= exec_bar) + .count() as u8; + if passed_cnt == self.population { + // all servers have executed up to exec_bar + self.snap_bar = exec_bar; + } + } + + // if the peer has made a higher ballot number + if ballot > self.bal_max_seen { + self.bal_max_seen = ballot; + + // clear my leader status if I was one + if self.is_leader() { + self.control_hub + .send_ctrl(CtrlMsg::LeaderStatus { step_up: false })?; + pf_info!(self.id; "no longer a leader..."); + } + + // set this peer to be the believed leader + self.leader = Some(peer); + } + } + + // if snap_bar is larger than mine, update snap_bar + if snap_bar > self.snap_bar { + self.snap_bar = snap_bar; } // pf_trace!(self.id; "heard heartbeat <- {} bal {}", peer, ballot); Ok(()) } +} +// RSPaxosReplica control messages handling +impl RSPaxosReplica { /// Handler of ResetState control message. async fn handle_ctrl_reset_state( &mut self, @@ -1492,14 +1747,20 @@ impl RSPaxosReplica { _ => Ok(None), // ignore all other types } } +} +// RSPaxosReplica recovery from WAL log +impl RSPaxosReplica { /// Apply a durable storage log entry for recovery.
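heard_heartbeat above advances snap_bar only once every replica, itself included, has reported executing past a slot; an equivalent way to state the invariant is that the safe snapshot frontier is the minimum exec_bar across the cluster. A minimal sketch of that check; safe_snap_bar is a hypothetical helper, whereas the real code updates the frontier incrementally per heartbeat:

use std::collections::HashMap;

type ReplicaId = u8;

/// A slot index is safe to snapshot once my own exec_bar and every
/// peer's reported exec_bar have moved past it.
fn safe_snap_bar(
    my_exec_bar: usize,
    peer_exec_bar: &HashMap<ReplicaId, usize>,
) -> usize {
    peer_exec_bar
        .values()
        .copied()
        .chain(std::iter::once(my_exec_bar))
        .min()
        .unwrap_or(0)
}

fn main() {
    let peers: HashMap<ReplicaId, usize> = HashMap::from([(1u8, 10), (2u8, 7)]);
    // I have executed up to 12, but replica 2 only up to 7:
    // nothing at or beyond slot 7 may be snapshotted away yet
    assert_eq!(safe_snap_bar(12, &peers), 7);
}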
async fn recover_apply_entry( &mut self, - entry: LogEntry, + entry: WalEntry, ) -> Result<(), SummersetError> { match entry { - LogEntry::PrepareBal { slot, ballot } => { + WalEntry::PrepareBal { slot, ballot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); @@ -1518,11 +1779,14 @@ impl RSPaxosReplica { self.bal_prepared = 0; } - LogEntry::AcceptData { + WalEntry::AcceptData { slot, ballot, reqs_cw, } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } // locate instance in memory, filling in null instances if needed while self.start_slot + self.insts.len() <= slot { self.insts.push(self.null_instance()?); @@ -1548,9 +1812,12 @@ impl RSPaxosReplica { assert!(self.bal_prepared <= self.bal_prep_sent); } - LogEntry::CommitSlot { slot } => { + WalEntry::CommitSlot { slot } => { + if slot < self.start_slot { + return Ok(()); // ignore if slot index outdated + } assert!(slot < self.start_slot + self.insts.len()); - // update instance state + // update instance status self.insts[slot - self.start_slot].status = Status::Committed; // submit commands in contiguously committed instance to the // state machine @@ -1561,12 +1828,14 @@ impl RSPaxosReplica { if inst.status < Status::Committed { break; } + // update commit_bar + self.commit_bar += 1; // check number of available shards - if inst.reqs_cw.avail_shards() < self.quorum_cnt { + if inst.reqs_cw.avail_shards() < self.majority { // can't execute if I don't have the complete request batch break; } else if inst.reqs_cw.avail_data_shards() - < self.quorum_cnt + < self.majority { // have enough shards but need reconstruction inst.reqs_cw @@ -1581,9 +1850,9 @@ impl RSPaxosReplica { let _ = self.state_machine.get_result().await?; } } - // update commit_bar and exec_bar - self.commit_bar += 1; + // update instance status and exec_bar self.exec_bar += 1; + inst.status = Status::Executed; } } } @@ -1592,15 +1861,15 @@ impl RSPaxosReplica { Ok(()) } - /// Recover state from durable storage log. - async fn recover_from_log(&mut self) -> Result<(), SummersetError> { - assert_eq!(self.log_offset, 0); + /// Recover state from durable storage WAL log. + async fn recover_from_wal(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.wal_offset, 0); loop { // using 0 as a special log action ID self.storage_hub.submit_action( 0, LogAction::Read { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -1612,7 +1881,7 @@ impl RSPaxosReplica { } => { self.recover_apply_entry(entry).await?; // update log offset - self.log_offset = end_offset; + self.wal_offset = end_offset; } LogResult::Read { entry: None, .. } => { // end of log reached @@ -1628,7 +1897,7 @@ impl RSPaxosReplica { self.storage_hub.submit_action( 0, LogAction::Truncate { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -1636,19 +1905,29 @@ impl RSPaxosReplica { offset_ok: true, .. } = log_result { + if self.wal_offset > 0 { + pf_info!(self.id; "recovered from wal log: commit {} exec {}", + self.commit_bar, self.exec_bar); + } Ok(()) } else { logged_err!(self.id; "unexpected log result type or failed truncate") } } +} - /// Dump a new key-value pair to snapshot file. 
- async fn snapshot_dump_kv_pairs(&mut self) -> Result<(), SummersetError> { +// RSPaxosReplica snapshotting & GC logic +impl RSPaxosReplica { + /// Dump new key-value pairs to snapshot file. + async fn snapshot_dump_kv_pairs( + &mut self, + new_start_slot: usize, + ) -> Result<(), SummersetError> { // collect all key-value pairs put up to exec_bar let mut pairs = HashMap::new(); - for slot in self.start_slot..self.exec_bar { + for slot in self.start_slot..new_start_slot { let inst = &mut self.insts[slot - self.start_slot]; - assert!(inst.reqs_cw.avail_data_shards() >= self.quorum_cnt); + assert!(inst.reqs_cw.avail_data_shards() >= self.majority); for (_, req) in inst.reqs_cw.get_data()?.clone() { if let ApiRequest::Req { cmd: Command::Put { key, value }, @@ -1683,15 +1962,20 @@ impl RSPaxosReplica { /// Discard everything older than start_slot in durable WAL log. async fn snapshot_discard_log(&mut self) -> Result<(), SummersetError> { let cut_offset = if !self.insts.is_empty() { - self.insts[0].log_offset + self.insts[0].wal_offset } else { - self.log_offset + self.wal_offset }; // discard the log before cut_offset if cut_offset > 0 { - self.storage_hub - .submit_action(0, LogAction::Discard { offset: cut_offset })?; + self.storage_hub.submit_action( + 0, + LogAction::Discard { + offset: cut_offset, + keep: 0, + }, + )?; loop { let (action_id, log_result) = self.storage_hub.get_result().await?; @@ -1704,8 +1988,8 @@ impl RSPaxosReplica { now_size, } = log_result { - assert_eq!(self.log_offset - cut_offset, now_size); - self.log_offset = now_size; + assert_eq!(self.wal_offset - cut_offset, now_size); + self.wal_offset = now_size; } else { return logged_err!( self.id; @@ -1717,43 +2001,74 @@ impl RSPaxosReplica { } } - // update inst.log_offset for all remaining in-mem instances + // update inst.wal_offset for all remaining in-mem instances for inst in &mut self.insts { - if inst.log_offset > 0 { - assert!(inst.log_offset >= cut_offset); - inst.log_offset -= cut_offset; + if inst.wal_offset > 0 { + assert!(inst.wal_offset >= cut_offset); + inst.wal_offset -= cut_offset; } } Ok(()) } - /// Take a snapshot up to current exec_idx, then discard the in-mem log up + /// Take a snapshot up to current exec_bar, then discard the in-mem log up /// to that index as well as outdate entries in the durable WAL log file. /// /// NOTE: the current implementation does not guard against crashes in the - /// middle of taking a snapshot. + /// middle of taking a snapshot. Production quality implementations should + /// make the snapshotting action "atomic". + /// + /// NOTE: the current implementation does not take care of InstallSnapshot + /// messages (which is needed when some lagging follower has some slot + /// which all other peers have snapshotted); we assume here that failed + /// Accept messages will be retried indefinitely until success before its + /// associated data gets discarded from leader's memory. 
async fn take_new_snapshot(&mut self) -> Result<(), SummersetError> { - pf_debug!(self.id; "taking new snapshot: start {} exec {}", - self.start_slot, self.exec_bar); + pf_debug!(self.id; "taking new snapshot: start {} exec {} snap {}", + self.start_slot, self.exec_bar, self.snap_bar); assert!(self.exec_bar >= self.start_slot); - if self.exec_bar == self.start_slot { + + let new_start_slot = cmp::min(self.snap_bar, self.exec_bar); + if new_start_slot == self.start_slot { return Ok(()); } // collect and dump all Puts in executed instances - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } - self.snapshot_dump_kv_pairs().await?; + self.snapshot_dump_kv_pairs(new_start_slot).await?; + + // write new slot info entry to the head of snapshot + self.snapshot_hub.submit_action( + 0, + LogAction::Write { + entry: SnapEntry::SlotInfo { + start_slot: new_start_slot, + commit_bar: self.commit_bar, + }, + offset: 0, + sync: self.config.logger_sync, + }, + )?; + let (_, log_result) = self.snapshot_hub.get_result().await?; + match log_result { + LogResult::Write { + offset_ok: true, .. + } => {} + _ => { + return logged_err!(self.id; "unexpected log result type or failed write"); + } + } // update start_slot and discard all in-memory log instances up to exec_bar - self.insts.drain(0..(self.exec_bar - self.start_slot)); - self.start_slot = self.exec_bar; + self.insts.drain(0..(new_start_slot - self.start_slot)); + self.start_slot = new_start_slot; // discarding everything older than start_slot in WAL log - if self.is_leader { + if self.is_leader() { // NOTE: broadcast heartbeats here to appease followers self.bcast_heartbeats()?; } @@ -1778,11 +2093,20 @@ impl RSPaxosReplica { match log_result { LogResult::Read { - entry: Some(SnapEntry::StartSlot { slot }), + entry: + Some(SnapEntry::SlotInfo { + start_slot, + commit_bar, + }), end_offset, } => { self.snap_offset = end_offset; - self.start_slot = slot; // get start slot index of in-mem log + + // recover necessary slot indices info + self.start_slot = start_slot; + self.commit_bar = commit_bar; + self.exec_bar = start_slot; + self.snap_bar = start_slot; // repeatedly apply key-value pairs loop { @@ -1825,6 +2149,11 @@ impl RSPaxosReplica { self.control_hub.send_ctrl(CtrlMsg::SnapshotUpTo { new_start: self.start_slot, })?; + + if self.start_slot > 0 { + pf_info!(self.id; "recovered from snapshot: start {} commit {} exec {}", + self.start_slot, self.commit_bar, self.exec_bar); + } Ok(()) } @@ -1833,7 +2162,10 @@ impl RSPaxosReplica { self.snapshot_hub.submit_action( 0, LogAction::Write { - entry: SnapEntry::StartSlot { slot: 0 }, + entry: SnapEntry::SlotInfo { + start_slot: 0, + commit_bar: 0, + }, offset: 0, sync: self.config.logger_sync, }, @@ -1847,7 +2179,7 @@ impl RSPaxosReplica { self.snap_offset = now_size; Ok(()) } else { - logged_err!(self.id; "unexpected log result type or failed truncate") + logged_err!(self.id; "unexpected log result type or failed write") } } @@ -1873,19 +2205,47 @@ impl GenericReplica for RSPaxosReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigRSPaxos; - batch_interval_us, max_batch_size, + batch_interval_ms, max_batch_size, backer_path, logger_sync, hb_hear_timeout_min, hb_hear_timeout_max, hb_send_interval_ms, snapshot_path, snapshot_interval_s, - fault_tolerance, + fault_tolerance, recon_chunk_size, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; - if config.batch_interval_us 
== 0 { + if config.batch_interval_ms == 0 { + return logged_err!( + id; + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms + ); + } + if config.hb_hear_timeout_min < 100 { + return logged_err!( + id; + "invalid config.hb_hear_timeout_min '{}'", + config.hb_hear_timeout_min + ); + } + if config.hb_hear_timeout_max < config.hb_hear_timeout_min + 100 { return logged_err!( id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us + "invalid config.hb_hear_timeout_max '{}'", + config.hb_hear_timeout_max + ); + } + if config.hb_send_interval_ms == 0 { + return logged_err!( + id; + "invalid config.hb_send_interval_ms '{}'", + config.hb_send_interval_ms + ); + } + if config.recon_chunk_size == 0 { + return logged_err!( + id; + "invalid config.recon_chunk_size '{}'", + config.recon_chunk_size ); } if config.hb_hear_timeout_min < 100 { @@ -1957,14 +2317,14 @@ impl GenericReplica for RSPaxosReplica { // create a Reed-Solomon coder with num_data_shards == quorum size and // num_parity shards == population - quorum - let quorum_cnt = (population / 2) + 1; - if config.fault_tolerance > (population - quorum_cnt) { + let majority = (population / 2) + 1; + if config.fault_tolerance > (population - majority) { return logged_err!(id; "invalid config.fault_tolerance '{}'", config.fault_tolerance); } let rs_coder = ReedSolomon::new( - quorum_cnt as usize, - (population - quorum_cnt) as usize, + majority as usize, + (population - majority) as usize, )?; // proactively connect to some peers, then wait for all population @@ -1986,7 +2346,7 @@ impl GenericReplica for RSPaxosReplica { let external_api = ExternalApi::new_and_setup( id, api_addr, - Duration::from_micros(config.batch_interval_us), + Duration::from_millis(config.batch_interval_ms), config.max_batch_size, ) .await?; @@ -2004,10 +2364,14 @@ impl GenericReplica for RSPaxosReplica { )); snapshot_interval.set_missed_tick_behavior(MissedTickBehavior::Skip); + let hb_reply_cnts = (0..population) + .filter_map(|p| if p == id { None } else { Some((p, (1, 0, 0))) }) + .collect(); + Ok(RSPaxosReplica { id, population, - quorum_cnt, + majority, config, _api_addr: api_addr, _p2p_addr: p2p_addr, @@ -2017,9 +2381,11 @@ impl GenericReplica for RSPaxosReplica { storage_hub, snapshot_hub, transport_hub, + leader: None, hb_hear_timer: Timer::new(), hb_send_interval, - is_leader: false, + hb_reply_cnts, + peer_alive: Bitmap::new(population, true), insts: vec![], start_slot: 0, snapshot_interval, @@ -2028,7 +2394,11 @@ impl GenericReplica for RSPaxosReplica { bal_max_seen: 0, commit_bar: 0, exec_bar: 0, - log_offset: 0, + peer_exec_bar: (0..population) + .filter_map(|s| if s == id { None } else { Some((s, 0)) }) + .collect(), + snap_bar: 0, + wal_offset: 0, snap_offset: 0, rs_coder, }) @@ -2041,8 +2411,8 @@ impl GenericReplica for RSPaxosReplica { // recover state from durable snapshot file self.recover_from_snapshot().await?; - // recover the tail-piece memory log & state from durable storage log - self.recover_from_log().await?; + // recover the tail-piece memory log & state from durable WAL log + self.recover_from_wal().await?; // kick off leader activity hearing timer self.kickoff_hb_hear_timer()?; @@ -2108,7 +2478,7 @@ impl GenericReplica for RSPaxosReplica { }, // leader sending heartbeat - _ = self.hb_send_interval.tick(), if !paused && self.is_leader => { + _ = self.hb_send_interval.tick(), if !paused && self.is_leader() => { if let Err(e) = self.bcast_heartbeats() { pf_error!(self.id; "error broadcasting heartbeats: {}", e); } @@ 
-2116,7 +2486,7 @@ impl GenericReplica for RSPaxosReplica { // autonomous snapshot taking timeout _ = self.snapshot_interval.tick(), if !paused - && self.config.snapshot_interval_s > 0 => { + && self.config.snapshot_interval_s > 0 => { if let Err(e) = self.take_new_snapshot().await { pf_error!(self.id; "error taking a new snapshot: {}", e); } else { diff --git a/src/protocols/simple_push.rs b/src/protocols/simple_push.rs index ce89c7d1..93baeb0c 100644 --- a/src/protocols/simple_push.rs +++ b/src/protocols/simple_push.rs @@ -29,8 +29,8 @@ use tokio::sync::watch; /// Configuration parameters struct. #[derive(Debug, Deserialize)] pub struct ReplicaConfigSimplePush { - /// Client request batching interval in microsecs. - pub batch_interval_us: u64, + /// Client request batching interval in millisecs. + pub batch_interval_ms: u64, /// Client request batching maximum batch size. pub max_batch_size: usize, @@ -52,7 +52,7 @@ pub struct ReplicaConfigSimplePush { impl Default for ReplicaConfigSimplePush { fn default() -> Self { ReplicaConfigSimplePush { - batch_interval_us: 1000, + batch_interval_ms: 10, max_batch_size: 5000, backer_path: "/tmp/summerset.simple_push.wal".into(), rep_degree: 2, @@ -64,9 +64,9 @@ impl Default for ReplicaConfigSimplePush { } } -/// Log entry type. +/// WAL log entry type. #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, GetSize)] -enum LogEntry { +enum WalEntry { FromClient { reqs: Vec<(ClientId, ApiRequest)>, }, @@ -126,7 +126,7 @@ pub struct SimplePushReplica { state_machine: StateMachine, /// StorageHub module. - storage_hub: StorageHub, + storage_hub: StorageHub, /// TransportHub module. transport_hub: TransportHub, @@ -134,12 +134,14 @@ pub struct SimplePushReplica { /// In-memory log of instances. insts: Vec, - /// Current durable log file offset. - log_offset: usize, + /// Current durable WAL log file offset. + wal_offset: usize, } +// SimplePushReplica common helpers impl SimplePushReplica { /// Compose CommandId from instance index & command index within. + #[inline] fn make_command_id(inst_idx: usize, cmd_idx: usize) -> CommandId { assert!(inst_idx <= (u32::MAX as usize)); assert!(cmd_idx <= (u32::MAX as usize)); @@ -147,12 +149,16 @@ impl SimplePushReplica { } /// Decompose CommandId into instance index & command index within. + #[inline] fn split_command_id(command_id: CommandId) -> (usize, usize) { let inst_idx = (command_id >> 32) as usize; let cmd_idx = (command_id & ((1 << 32) - 1)) as usize; (inst_idx, cmd_idx) } +} +// SimplePushReplica client requests entrance +impl SimplePushReplica { /// Handler of client request batch chan recv. fn handle_req_batch( &mut self, @@ -186,13 +192,13 @@ impl SimplePushReplica { self.insts.push(inst); // submit log action to make this instance durable - let log_entry = LogEntry::FromClient { + let wal_entry = WalEntry::FromClient { reqs: req_batch.clone(), }; self.storage_hub.submit_action( inst_idx as LogActionId, LogAction::Append { - entry: log_entry, + entry: wal_entry, sync: true, }, )?; @@ -208,12 +214,15 @@ impl SimplePushReplica { Ok(()) } +} +// SimplePushReplica durable WAL logging +impl SimplePushReplica { /// Handler of durable logging result chan recv. 
fn handle_log_result( &mut self, action_id: LogActionId, - log_result: LogResult, + log_result: LogResult, ) -> Result<(), SummersetError> { let inst_idx = action_id as usize; if inst_idx >= self.insts.len() { @@ -222,8 +231,8 @@ impl SimplePushReplica { match log_result { LogResult::Append { now_size } => { - assert!(now_size >= self.log_offset); - self.log_offset = now_size; + assert!(now_size >= self.wal_offset); + self.wal_offset = now_size; } _ => { return logged_err!(self.id; "unexpected log result type for {}: {:?}", inst_idx, log_result); @@ -265,7 +274,10 @@ impl SimplePushReplica { Ok(()) } +} +// SimplePushReplica peer-peer messages handling +impl SimplePushReplica { /// Handler of push message from peer. fn handle_push_msg( &mut self, @@ -284,7 +296,7 @@ impl SimplePushReplica { self.insts.push(inst); // submit log action to make this instance durable - let log_entry = LogEntry::PeerPushed { + let wal_entry = WalEntry::PeerPushed { peer, src_inst_idx, reqs: req_batch.clone(), @@ -292,7 +304,7 @@ impl SimplePushReplica { self.storage_hub.submit_action( inst_idx as LogActionId, LogAction::Append { - entry: log_entry, + entry: wal_entry, sync: true, }, )?; @@ -346,7 +358,10 @@ impl SimplePushReplica { Ok(()) } +} +// SimplePushReplica state machine execution +impl SimplePushReplica { /// Handler of state machine exec result chan recv. fn handle_cmd_result( &mut self, @@ -398,7 +413,10 @@ impl SimplePushReplica { Ok(()) } +} +// SimplePushReplica control messages handling +impl SimplePushReplica { /// Handler of ResetState control message. async fn handle_ctrl_reset_state( &mut self, @@ -486,16 +504,19 @@ impl SimplePushReplica { _ => Ok(None), // ignore all other types } } +} - /// Recover state from durable storage log. - async fn recover_from_log(&mut self) -> Result<(), SummersetError> { - assert_eq!(self.log_offset, 0); +// SimplePushReplica recovery from WAL log +impl SimplePushReplica { + /// Recover state from durable storage WAL log. + async fn recover_from_wal(&mut self) -> Result<(), SummersetError> { + assert_eq!(self.wal_offset, 0); loop { // using 0 as a special log action ID self.storage_hub.submit_action( 0, LogAction::Read { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -506,8 +527,8 @@ impl SimplePushReplica { end_offset, } => { let (from_peer, reqs) = match entry { - LogEntry::FromClient { reqs } => (None, reqs), - LogEntry::PeerPushed { + WalEntry::FromClient { reqs } => (None, reqs), + WalEntry::PeerPushed { peer, src_inst_idx, reqs, @@ -531,7 +552,7 @@ impl SimplePushReplica { from_peer, }); // update log offset - self.log_offset = end_offset; + self.wal_offset = end_offset; } LogResult::Read { entry: None, .. 
} => { // end of log reached @@ -547,7 +568,7 @@ impl SimplePushReplica { self.storage_hub.submit_action( 0, LogAction::Truncate { - offset: self.log_offset, + offset: self.wal_offset, }, )?; let (_, log_result) = self.storage_hub.get_result().await?; @@ -577,15 +598,15 @@ impl GenericReplica for SimplePushReplica { // parse protocol-specific configs let config = parsed_config!(config_str => ReplicaConfigSimplePush; - batch_interval_us, max_batch_size, + batch_interval_ms, max_batch_size, backer_path, rep_degree, perf_storage_a, perf_storage_b, perf_network_a, perf_network_b)?; - if config.batch_interval_us == 0 { + if config.batch_interval_ms == 0 { return logged_err!( id; - "invalid config.batch_interval_us '{}'", - config.batch_interval_us + "invalid config.batch_interval_ms '{}'", + config.batch_interval_ms ); } @@ -645,7 +666,7 @@ impl GenericReplica for SimplePushReplica { let external_api = ExternalApi::new_and_setup( id, api_addr, - Duration::from_micros(config.batch_interval_us), + Duration::from_millis(config.batch_interval_ms), config.max_batch_size, ) .await?; @@ -662,7 +683,7 @@ impl GenericReplica for SimplePushReplica { storage_hub, transport_hub, insts: vec![], - log_offset: 0, + wal_offset: 0, }) } @@ -670,8 +691,8 @@ impl GenericReplica for SimplePushReplica { &mut self, mut rx_term: watch::Receiver, ) -> Result { - // recover state from durable storage log - self.recover_from_log().await?; + // recover state from durable storage WAL log + self.recover_from_wal().await?; // main event loop let mut paused = false; diff --git a/src/server/external.rs b/src/server/external.rs index cc820c00..a0728861 100644 --- a/src/server/external.rs +++ b/src/server/external.rs @@ -21,7 +21,7 @@ use tokio::io::AsyncReadExt; use tokio::sync::{mpsc, Notify}; use tokio::sync::mpsc::error::TryRecvError; use tokio::task::JoinHandle; -use tokio::time::{self, Duration}; +use tokio::time::{self, Duration, MissedTickBehavior}; /// External API request ID type. pub type RequestId = u64; @@ -490,6 +490,7 @@ impl ExternalApi { batch_notify: Arc, ) { let mut interval = time::interval(batch_interval); + interval.set_missed_tick_behavior(MissedTickBehavior::Skip); loop { interval.tick().await; diff --git a/src/server/storage.rs b/src/server/storage.rs index a11d6ba6..06bc0430 100644 --- a/src/server/storage.rs +++ b/src/server/storage.rs @@ -44,8 +44,9 @@ pub enum LogAction { /// Truncate the log at given offset, keeping the head part. Truncate { offset: usize }, - /// Discard the log before given offset, keeping the tail part. - Discard { offset: usize }, + /// Discard the log before given offset, keeping the tail part (and + /// optionally a head part). + Discard { offset: usize, keep: usize }, } /// Action result returned by the logger. @@ -337,12 +338,14 @@ where } } - /// Discard the file before given index, keeping the tail part. + /// Discard the file before given index, keeping the tail part (and + /// optionally a head part). 
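// [Illustrative sketch, not part of the patch] Size arithmetic expected from
// the new `LogAction::Discard { offset, keep }` semantics documented above:
// bytes in [keep, offset) are dropped, while the head [0, keep) and the tail
// [offset, file_size) survive. The concrete numbers here are hypothetical.
fn discard_size_sketch() {
    let (file_size, offset, keep): (usize, usize, usize) = (100, 60, 20);
    assert!(keep < offset && offset <= file_size);
    let tail_size = file_size - offset; // 40 tail bytes moved up to `keep`
    let now_size = keep + tail_size; // matches the (true, keep + tail_size) result
    assert_eq!(now_size, 60);
}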
async fn discard_log( me: ReplicaId, backer: &mut File, file_size: usize, offset: usize, + keep: usize, ) -> Result<(bool, usize), SummersetError> { if offset > file_size { pf_warn!( @@ -352,25 +355,32 @@ where file_size ); Ok((false, file_size)) + } else if keep >= offset { + pf_warn!( + me; + "discard keeping {} while offset is {}", + keep, offset + ); + Ok((false, file_size)) } else { let tail_size = file_size - offset; if tail_size > 0 { // due to the limited interfaces provided by `tokio::fs`, we - // read out the tail part and write it back to offset 0 to + // read out the tail part and write it back to offset keep to // achieve the effect of discarding let mut tail_buf: Vec = vec![0; tail_size]; backer.seek(SeekFrom::Start(offset as u64)).await?; backer.read_exact(&mut tail_buf[..]).await?; - backer.seek(SeekFrom::Start(0)).await?; + backer.seek(SeekFrom::Start(keep as u64)).await?; backer.write_all(&tail_buf[..]).await?; } - backer.set_len(tail_size as u64).await?; + backer.set_len((keep + tail_size) as u64).await?; backer.seek(SeekFrom::End(0)).await?; // recover cursor to EOF backer.sync_all().await?; - Ok((true, tail_size)) + Ok((true, keep + tail_size)) } } @@ -422,16 +432,16 @@ where } }) } - LogAction::Discard { offset } => { - Self::discard_log(me, backer, *file_size, offset).await.map( - |(offset_ok, now_size)| { + LogAction::Discard { offset, keep } => { + Self::discard_log(me, backer, *file_size, offset, keep) + .await + .map(|(offset_ok, now_size)| { *file_size = now_size; LogResult::Discard { offset_ok, now_size, } - }, - ) + }) } } } @@ -658,24 +668,55 @@ mod storage_tests { let mut backer_file = prepare_test_file("/tmp/test-backer-4.log").await?; let entry = TestEntry("test-entry-dummy-string".into()); - let mid_offset = + let mid1_offset = StorageHub::append_entry(0, &mut backer_file, 0, &entry, false) .await?; + let mid2_offset = StorageHub::append_entry( + 0, + &mut backer_file, + mid1_offset, + &entry, + false, + ) + .await?; let end_offset = StorageHub::append_entry( 0, &mut backer_file, - mid_offset, + mid2_offset, &entry, true, ) .await?; - let tail_size = end_offset - mid_offset; + let tail_size = end_offset - mid2_offset; assert_eq!( StorageHub::::discard_log( 0, &mut backer_file, end_offset, - mid_offset + mid2_offset, + mid1_offset, + ) + .await?, + (true, 2 * tail_size) + ); + assert_eq!( + StorageHub::::discard_log( + 0, + &mut backer_file, + 2 * tail_size, + mid1_offset, + end_offset, + ) + .await?, + (false, 2 * tail_size) + ); + assert_eq!( + StorageHub::::discard_log( + 0, + &mut backer_file, + 2 * tail_size, + mid1_offset, + 0, ) .await?, (true, tail_size) @@ -685,7 +726,8 @@ mod storage_tests { 0, &mut backer_file, tail_size, - end_offset + end_offset, + 0 ) .await?, (false, tail_size) @@ -695,7 +737,8 @@ mod storage_tests { 0, &mut backer_file, tail_size, - tail_size + tail_size, + 0 ) .await?, (true, 0) diff --git a/src/server/transport.rs b/src/server/transport.rs index ba0e1e8b..a91c44f4 100644 --- a/src/server/transport.rs +++ b/src/server/transport.rs @@ -1,9 +1,10 @@ //! Summerset server internal TCP transport module implementation. //! -//! In concept, all messages are sent through unstable communication channels, -//! and are retried if the sender did not receive an ACK in a timely manner. -//! Here, we use TCP as the communication protocol to get the same effect of -//! "every message a sender wants to send will eventually be delivered". +//! NOTE: In concept, all messages are sent through unstable communication +//! 
channels, and are retried if the sender did not receive an ACK in a timely +//! manner. Here, we use TCP as the communication protocol to get the same +//! effect of "every message a sender wants to send will be retried until +//! eventually delivered". use std::fmt; use std::net::SocketAddr; @@ -227,11 +228,12 @@ where .map_err(|e| SummersetError(e.to_string()))?; } None => { - pf_error!( - self.me; - "peer ID {} not found among connected ones", - peer - ); + // NOTE: commented out to avoid spurious error messages + // pf_error!( + // self.me; + // "peer ID {} not found among connected ones", + // peer + // ); } } diff --git a/src/utils/bitmap.rs b/src/utils/bitmap.rs index d5fe9c8e..5211e9f9 100644 --- a/src/utils/bitmap.rs +++ b/src/utils/bitmap.rs @@ -71,11 +71,26 @@ impl Bitmap { self.0.count_ones(..) as u8 } + /// Flips all flags in the bitmap. + #[inline] + pub fn flip(&mut self) { + self.0.toggle_range(..) + } + /// Allows `for (id, bit) in map.iter()`. #[inline] pub fn iter(&self) -> BitmapIter { BitmapIter { map: self, idx: 0 } } + + /// Convenience method for converting the bitmap to a vec of indexes where + /// the flag is true. + #[inline] + pub fn to_vec(&self) -> Vec { + self.iter() + .filter_map(|(idx, flag)| if flag { Some(idx) } else { None }) + .collect() + } } /// Iterator over `Bitmap`, yielding `(id, bit)` pairs. @@ -143,6 +158,14 @@ mod bitmap_tests { assert!(map.get(7).is_err()); } + #[test] + fn bitmap_flip() { + let mut map = Bitmap::new(5, false); + assert!(map.set(1, true).is_ok()); + map.flip(); + assert_eq!(map, Bitmap::from(5, vec![0, 2, 3, 4])); + } + #[test] fn bitmap_count() { let mut map = Bitmap::new(7, false); @@ -161,5 +184,6 @@ mod bitmap_tests { for (id, flag) in map.iter() { assert_eq!(ref_map[id as usize], flag); } + assert_eq!(map.to_vec(), [0, 1, 3, 4]); } } diff --git a/src/utils/error.rs b/src/utils/error.rs index 0e73dccb..6c0907a0 100644 --- a/src/utils/error.rs +++ b/src/utils/error.rs @@ -3,6 +3,7 @@ use std::fmt; use std::io; use std::net; +use std::num; use crate::server::ReplicaId; @@ -30,6 +31,7 @@ macro_rules! impl_from_error { } impl_from_error!(io::Error); +impl_from_error!(num::ParseIntError); impl_from_error!(net::AddrParseError); impl_from_error!(rmp_serde::encode::Error); impl_from_error!(rmp_serde::decode::Error); diff --git a/src/utils/rscoding.rs b/src/utils/rscoding.rs index c8461c26..35fdc97c 100644 --- a/src/utils/rscoding.rs +++ b/src/utils/rscoding.rs @@ -305,15 +305,20 @@ where self.shards.iter().filter(|s| s.is_some()).count() as u8 } - /// Gets a bitmap of available shard indexes set true. + /// Gets a vec of available shard indexes. #[inline] - pub fn avail_shards_map(&self) -> Bitmap { - let ones: Vec = self - .shards + pub fn avail_shards_vec(&self) -> Vec { + self.shards .iter() .enumerate() .filter_map(|(i, s)| if s.is_some() { Some(i as u8) } else { None }) - .collect(); + .collect() + } + + /// Gets a bitmap of available shard indexes set true. + #[inline] + pub fn avail_shards_map(&self) -> Bitmap { + let ones = self.avail_shards_vec(); Bitmap::from(self.num_shards(), ones) } diff --git a/summerset_client/src/clients/repl.rs b/summerset_client/src/clients/repl.rs index 09e4f330..88e0cfbb 100644 --- a/summerset_client/src/clients/repl.rs +++ b/summerset_client/src/clients/repl.rs @@ -1,6 +1,8 @@ //! Interactive REPL-style command-line interface client. 
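// [Illustrative sketch, not part of the patch] How the new Bitmap::flip() and
// Bitmap::to_vec() helpers added in src/utils/bitmap.rs above compose; this
// mirrors the bitmap_flip test, with a hypothetical variable name.
fn bitmap_helpers_sketch() -> Result<(), SummersetError> {
    let mut peer_alive = Bitmap::new(5, false);
    peer_alive.set(1, true)?;
    peer_alive.flip(); // every flag toggled: now true everywhere except index 1
    assert_eq!(peer_alive.to_vec(), vec![0, 2, 3, 4]);
    Ok(())
}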
+use std::collections::HashSet; use std::io::{self, Write}; +use std::str::SplitWhitespace; use crate::drivers::{DriverReply, DriverClosedLoop}; @@ -8,7 +10,9 @@ use color_print::{cprint, cprintln}; use tokio::time::Duration; -use summerset::{GenericEndpoint, Command, SummersetError}; +use summerset::{ + ReplicaId, GenericEndpoint, Command, CtrlRequest, CtrlReply, SummersetError, +}; /// Prompt string at the start of line. const PROMPT: &str = ">>>>> "; @@ -24,6 +28,9 @@ enum ReplCommand { /// Print help message. PrintHelp, + /// Control request to the manager. + Control(CtrlRequest), + /// Client exit. Exit, @@ -54,28 +61,61 @@ impl ClientRepl { } /// Prints the prompt string. - fn print_prompt(&mut self) { + #[inline] + fn print_prompt() { cprint!("{}", PROMPT); io::stdout().flush().unwrap(); } /// Prints (optionally) an error message and the help message. - fn print_help(&mut self, err: Option<&SummersetError>) { + fn print_help(err: Option<&SummersetError>) { if let Some(e) = err { cprintln!("✗ {}", e); } - println!("HELP: Supported commands are:"); - println!(" get "); - println!(" put "); - println!(" reconnect"); - println!(" help"); - println!(" exit"); + println!("HELP: Supported normal commands are:"); + println!(" get "); + println!(" put "); + println!(" help"); + println!(" exit"); + println!(" Commands for control/testing:"); + println!(" reconnect"); + println!(" reset [servers]"); + println!(" pause [servers]"); + println!(" resume [servers]"); + println!(" snapshot [servers]"); println!( " Keys and values currently cannot contain any whitespaces" ); io::stdout().flush().unwrap(); } + /// Expect to get the next segment string from parsed segs. + #[inline] + fn expect_next_seg<'s>( + segs: &mut SplitWhitespace<'s>, + ) -> Result<&'s str, SummersetError> { + if let Some(seg) = segs.next() { + Ok(seg) + } else { + let err = SummersetError("not enough args".into()); + Self::print_help(Some(&err)); + Err(err) + } + } + + /// Drain all of the remaining segments into a hash set and interpret as + /// replica IDs. + #[inline] + fn drain_server_ids( + segs: &mut SplitWhitespace, + ) -> Result, SummersetError> { + let mut servers = HashSet::new(); + for seg in segs { + servers.insert(seg.parse::()?); + } + Ok(servers) + } + /// Reads in user input and parses into a command. fn read_command(&mut self) -> Result { self.input_buf.clear(); @@ -98,36 +138,18 @@ impl ClientRepl { match &cmd_type.unwrap().to_lowercase()[..] 
{ "get" => { - let key = segs.next(); - if key.is_none() { - let err = SummersetError("not enough args".into()); - self.print_help(Some(&err)); - return Err(err); - } - - // keys and values are kept as-is, no case conversions - Ok(ReplCommand::Normal(Command::Get { - key: key.unwrap().into(), - })) + // keys are kept as-is, no case conversions + let key = Self::expect_next_seg(&mut segs)?; + Ok(ReplCommand::Normal(Command::Get { key: key.into() })) } "put" => { - let key = segs.next(); - if key.is_none() { - let err = SummersetError("not enough args".into()); - self.print_help(Some(&err)); - return Err(err); - } - let value = segs.next(); - if value.is_none() { - let err = SummersetError("not enough args".into()); - self.print_help(Some(&err)); - return Err(err); - } - + // keys and values are kept as-is, no case conversions + let key = Self::expect_next_seg(&mut segs)?; + let value = Self::expect_next_seg(&mut segs)?; Ok(ReplCommand::Normal(Command::Put { - key: key.unwrap().into(), - value: value.unwrap().into(), + key: key.into(), + value: value.into(), })) } @@ -135,6 +157,29 @@ impl ClientRepl { "reconnect" => Ok(ReplCommand::Reconnect), + "reset" => { + let servers = Self::drain_server_ids(&mut segs)?; + Ok(ReplCommand::Control(CtrlRequest::ResetServers { + servers, + durable: true, + })) + } + + "pause" => { + let servers = Self::drain_server_ids(&mut segs)?; + Ok(ReplCommand::Control(CtrlRequest::PauseServers { servers })) + } + + "resume" => { + let servers = Self::drain_server_ids(&mut segs)?; + Ok(ReplCommand::Control(CtrlRequest::ResumeServers { servers })) + } + + "snapshot" => { + let servers = Self::drain_server_ids(&mut segs)?; + Ok(ReplCommand::Control(CtrlRequest::TakeSnapshot { servers })) + } + "exit" => Ok(ReplCommand::Exit), _ => { @@ -142,7 +187,7 @@ impl ClientRepl { "unrecognized command: {}", cmd_type.unwrap() )); - self.print_help(Some(&err)); + Self::print_help(Some(&err)); Err(err) } } @@ -200,9 +245,49 @@ impl ClientRepl { io::stdout().flush().unwrap(); } + /// Makes a control request to the manager and wait for the reply. + async fn make_ctrl_req( + &mut self, + req: CtrlRequest, + ) -> Result { + let mut sent = self.driver.ctrl_stub().send_req(Some(&req))?; + while !sent { + sent = self.driver.ctrl_stub().send_req(None)?; + } + self.driver.ctrl_stub().recv_reply().await + } + + /// Prints control request reply. + fn print_ctrl_reply(&mut self, reply: CtrlReply) { + match reply { + CtrlReply::ResetServers { servers } => { + cprintln!("# reset servers {:?}", servers); + } + + CtrlReply::PauseServers { servers } => { + cprintln!("# paused servers {:?}", servers); + } + + CtrlReply::ResumeServers { servers } => { + cprintln!("# resumed servers {:?}", servers); + } + + CtrlReply::TakeSnapshot { snapshot_up_to } => { + cprintln!( + "# servers snapshot up to {:?}", + snapshot_up_to + ); + } + + _ => { + cprintln!("✗ unexpected ctrl reply type"); + } + } + } + /// One iteration of the REPL loop. 
async fn iter(&mut self) -> Result { - self.print_prompt(); + Self::print_prompt(); let cmd = self.read_command()?; match cmd { @@ -221,7 +306,7 @@ impl ClientRepl { } ReplCommand::PrintHelp => { - self.print_help(None); + Self::print_help(None); Ok(true) } @@ -230,6 +315,12 @@ impl ClientRepl { self.print_result(result); Ok(true) } + + ReplCommand::Control(req) => { + let reply = self.make_ctrl_req(req).await?; + self.print_ctrl_reply(reply); + Ok(true) + } } } diff --git a/summerset_client/src/clients/tester.rs b/summerset_client/src/clients/tester.rs index 378256b7..4fb021e0 100644 --- a/summerset_client/src/clients/tester.rs +++ b/summerset_client/src/clients/tester.rs @@ -467,6 +467,7 @@ impl ClientTester { self.checked_put("Jose", &v, Some(None), 0).await?; for (s, is_leader) in self.query_servers().await? { if !is_leader { + // picked a non-leader replica self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; time::sleep(Duration::from_secs(1)).await; @@ -484,6 +485,7 @@ impl ClientTester { self.checked_put("Jose", &v, Some(None), 0).await?; for (s, is_leader) in self.query_servers().await? { if is_leader { + // picked a leader replica self.driver.leave(false).await?; self.reset_servers(HashSet::from([s]), true).await?; time::sleep(Duration::from_secs(1)).await; @@ -515,6 +517,7 @@ impl ClientTester { } } if resets.len() == 2 { + // picked two replicas, one leader and one non-leader self.driver.leave(false).await?; self.reset_servers(resets, true).await?; time::sleep(Duration::from_secs(1)).await; @@ -543,6 +546,7 @@ impl ClientTester { time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? { if !is_leader { + // picked a non-leader replica self.driver.leave(false).await?; self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; @@ -563,6 +567,7 @@ impl ClientTester { time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? { if is_leader { + // picked a leader replica self.driver.leave(false).await?; self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; @@ -583,24 +588,28 @@ impl ClientTester { time::sleep(Duration::from_millis(500)).await; for (s, is_leader) in self.query_servers().await? 
{ if is_leader { + // picked a leader replica self.driver.leave(false).await?; self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; let v1 = Self::gen_rand_string(8); self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; + // resuming old leader replica self.driver.leave(false).await?; self.resume_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; let v2 = Self::gen_rand_string(8); self.checked_put("Jose", &v2, Some(Some(&v1)), 1).await?; + // pausing that replica again self.driver.leave(false).await?; self.pause_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; self.driver.connect().await?; let v3 = Self::gen_rand_string(8); self.checked_put("Jose", &v3, Some(Some(&v2)), 0).await?; + // resuming that replica again self.driver.leave(false).await?; self.resume_servers(HashSet::from([s])).await?; time::sleep(Duration::from_secs(1)).await; @@ -619,9 +628,21 @@ impl ClientTester { self.checked_put("Jose", &v0, Some(None), 0).await?; let v1 = Self::gen_rand_string(8); self.checked_put("Shawn", &v1, Some(None), 0).await?; + // forcing all nodes to take snapshot time::sleep(Duration::from_millis(500)).await; self.force_snapshot(HashSet::new()).await?; self.checked_put("Jose", &v1, Some(Some(&v0)), 0).await?; + // resetting all nodes and seeing if things are there + self.driver.leave(false).await?; + self.reset_servers(HashSet::new(), true).await?; + time::sleep(Duration::from_secs(1)).await; + self.driver.connect().await?; + self.checked_get("Shawn", Some(Some(&v1)), 0).await?; + self.checked_get("Jose", Some(Some(&v1)), 0).await?; + // forcing all nodes to take snapshot again + time::sleep(Duration::from_millis(500)).await; + self.force_snapshot(HashSet::new()).await?; + // resetting all nodes again and checking again self.driver.leave(false).await?; self.reset_servers(HashSet::new(), true).await?; time::sleep(Duration::from_secs(1)).await; diff --git a/summerset_client/src/drivers/closed_loop.rs b/summerset_client/src/drivers/closed_loop.rs index a0a96e87..06e218df 100644 --- a/summerset_client/src/drivers/closed_loop.rs +++ b/summerset_client/src/drivers/closed_loop.rs @@ -99,46 +99,55 @@ impl DriverClosedLoop { })?; let issue_ts = Instant::now(); - let reply = self.recv_reply_with_timeout().await?; - match reply { - Some(ApiReply::Reply { - id: reply_id, - result: cmd_result, - redirect, - }) => { - if reply_id != req_id { - logged_err!(self.id; "request ID mismatch: expected {}, replied {}", - req_id, reply_id) - } else { - match cmd_result { - None => { - if let Some(server) = redirect { - Ok(DriverReply::Redirect { server }) - } else { - Ok(DriverReply::Failure) + loop { + let reply = self.recv_reply_with_timeout().await?; + match reply { + Some(ApiReply::Reply { + id: reply_id, + result: cmd_result, + redirect, + }) => { + if reply_id != req_id { + // logged_err!(self.id; "request ID mismatch: expected {}, replied {}", + // req_id, reply_id) + continue; + } else { + match cmd_result { + None => { + if let Some(server) = redirect { + return Ok(DriverReply::Redirect { + server, + }); + } else { + return Ok(DriverReply::Failure); + } } - } - Some(CommandResult::Get { value }) => { - let latency = - Instant::now().duration_since(issue_ts); - Ok(DriverReply::Success { - req_id, - cmd_result: CommandResult::Get { value }, - latency, - }) - } + Some(CommandResult::Get { value }) => { + let latency = + Instant::now().duration_since(issue_ts);
return Ok(DriverReply::Success { + req_id, + cmd_result: CommandResult::Get { value }, + latency, + }); + } - _ => { - logged_err!(self.id; "command type mismatch: expected Get") + _ => { + return logged_err!(self.id; "command type mismatch: expected Get"); + } } } } - } - None => Ok(DriverReply::Timeout), + None => { + return Ok(DriverReply::Timeout); + } - _ => logged_err!(self.id; "unexpected reply type received"), + _ => { + return logged_err!(self.id; "unexpected reply type received"); + } + } } } @@ -160,46 +169,57 @@ impl DriverClosedLoop { })?; let issue_ts = Instant::now(); - let reply = self.recv_reply_with_timeout().await?; - match reply { - Some(ApiReply::Reply { - id: reply_id, - result: cmd_result, - redirect, - }) => { - if reply_id != req_id { - logged_err!(self.id; "request ID mismatch: expected {}, replied {}", - req_id, reply_id) - } else { - match cmd_result { - None => { - if let Some(server) = redirect { - Ok(DriverReply::Redirect { server }) - } else { - Ok(DriverReply::Failure) + loop { + let reply = self.recv_reply_with_timeout().await?; + match reply { + Some(ApiReply::Reply { + id: reply_id, + result: cmd_result, + redirect, + }) => { + if reply_id != req_id { + // logged_err!(self.id; "request ID mismatch: expected {}, replied {}", + // req_id, reply_id) + continue; + } else { + match cmd_result { + None => { + if let Some(server) = redirect { + return Ok(DriverReply::Redirect { + server, + }); + } else { + return Ok(DriverReply::Failure); + } } - } - Some(CommandResult::Put { old_value }) => { - let latency = - Instant::now().duration_since(issue_ts); - Ok(DriverReply::Success { - req_id, - cmd_result: CommandResult::Put { old_value }, - latency, - }) - } + Some(CommandResult::Put { old_value }) => { + let latency = + Instant::now().duration_since(issue_ts); + return Ok(DriverReply::Success { + req_id, + cmd_result: CommandResult::Put { + old_value, + }, + latency, + }); + } - _ => { - logged_err!(self.id; "command type mismatch: expected Put") + _ => { + return logged_err!(self.id; "command type mismatch: expected Put"); + } } } } - } - None => Ok(DriverReply::Timeout), + None => { + return Ok(DriverReply::Timeout); + } - _ => logged_err!(self.id; "unexpected reply type received"), + _ => { + return logged_err!(self.id; "unexpected reply type received"); + } + } } } diff --git a/summerset_client/src/drivers/open_loop.rs b/summerset_client/src/drivers/open_loop.rs index 8e49c107..37a902d5 100644 --- a/summerset_client/src/drivers/open_loop.rs +++ b/summerset_client/src/drivers/open_loop.rs @@ -168,37 +168,45 @@ impl DriverOpenLoop { /// Waits for the next reply. 
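// [Illustrative sketch, not part of the patch] Standalone model of the
// reply-matching loops that do_get / do_put / wait_reply now use above: a
// reply carrying an unexpected request ID is treated as a stale leftover and
// skipped instead of being reported as an error; a matching reply or a
// timeout (None) ends the loop. Types are simplified stand-ins.
fn first_matching_reply(
    replies: impl IntoIterator<Item = Option<u64>>, // Some(reply_id) or timeout
    req_id: u64,
) -> Option<u64> {
    for reply in replies {
        match reply {
            Some(id) if id != req_id => continue, // stale reply: skip silently
            other => return other,                // matching reply or timeout
        }
    }
    None
}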
pub async fn wait_reply(&mut self) -> Result { - let reply = self.recv_reply_with_timeout().await?; - match reply { - Some(ApiReply::Reply { - id: reply_id, - result: cmd_result, - redirect, - }) => { - if !self.pending_reqs.contains_key(&reply_id) { - logged_err!(self.id; "request ID {} not in pending set", - reply_id) - } else { - let issue_ts = self.pending_reqs.remove(&reply_id).unwrap(); - let latency = Instant::now().duration_since(issue_ts); - - if let Some(res) = cmd_result { - Ok(DriverReply::Success { - req_id: reply_id, - cmd_result: res, - latency, - }) - } else if let Some(server) = redirect { - Ok(DriverReply::Redirect { server }) + loop { + let reply = self.recv_reply_with_timeout().await?; + match reply { + Some(ApiReply::Reply { + id: reply_id, + result: cmd_result, + redirect, + }) => { + if !self.pending_reqs.contains_key(&reply_id) { + // logged_err!(self.id; "request ID {} not in pending set", + // reply_id) + continue; } else { - Ok(DriverReply::Failure) + let issue_ts = + self.pending_reqs.remove(&reply_id).unwrap(); + let latency = Instant::now().duration_since(issue_ts); + + if let Some(res) = cmd_result { + return Ok(DriverReply::Success { + req_id: reply_id, + cmd_result: res, + latency, + }); + } else if let Some(server) = redirect { + return Ok(DriverReply::Redirect { server }); + } else { + return Ok(DriverReply::Failure); + } } } - } - None => Ok(DriverReply::Timeout), + None => { + return Ok(DriverReply::Timeout); + } - _ => logged_err!(self.id; "unexpected reply type received"), + _ => { + return logged_err!(self.id; "unexpected reply type received"); + } + } } }
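// [Illustrative sketch, not part of the patch] Standalone model of the
// open-loop driver's bookkeeping shown above: issue timestamps live in a map
// keyed by request ID, a reply for an ID that is not pending is ignored by
// the surrounding loop, and a pending ID is settled into a latency sample.
// Types are simplified stand-ins.
use std::collections::HashMap;
use std::time::{Duration, Instant};

fn settle_pending_reply(
    pending_reqs: &mut HashMap<u64, Instant>,
    reply_id: u64,
) -> Option<Duration> {
    // unknown (stale) IDs yield None and are skipped by the caller; known IDs
    // are removed from the pending set and turned into a latency sample
    pending_reqs
        .remove(&reply_id)
        .map(|issue_ts| Instant::now().duration_since(issue_ts))
}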