diff --git a/Cargo.lock b/Cargo.lock index f52842e..05f9a13 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -200,6 +200,7 @@ dependencies = [ "maplit", "moka", "nom", + "num-traits", "ordered-float", "ouroboros", "parking_lot", @@ -414,6 +415,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "convert_case" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec182b0ca2f35d8fc196cf3404988fd8b8c739a4d270ff118a398feb0cbec1ca" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "criterion" version = "0.5.1" @@ -558,9 +568,11 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" dependencies = [ + "convert_case", "proc-macro2", "quote", "syn 2.0.69", + "unicode-xid", ] [[package]] @@ -2070,6 +2082,12 @@ version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" +[[package]] +name = "unicode-xid" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" + [[package]] name = "utf8parse" version = "0.2.2" @@ -2247,6 +2265,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-targets" version = "0.48.5" @@ -2262,15 +2289,6 @@ dependencies = [ "windows_x86_64_msvc 0.48.5", ] -[[package]] -name = "windows-sys" -version = "0.59.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" -dependencies = [ - "windows-targets", -] - 
[[package]] name = "windows-targets" version = "0.52.6" diff --git a/Cargo.toml b/Cargo.toml index 673f7c4..b8347fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,8 +46,9 @@ tokio-util = "0.7.11" futures-concurrency = "7.6.1" ordered-float = "4.2.2" getset = "0.1.2" -derive_more = "1.0.0" +derive_more = { version = "1.0.0", features = ["full"] } tracing-futures = { version = "0.2.5", features = ["futures-03"] } +num-traits = "0.2.19" [dev-dependencies] tempfile = "3" @@ -56,7 +57,6 @@ criterion = { version = "0.5.1", features = ["async_tokio"] } maplit = "1.0.2" [lints.rust] -unused = "allow" unsafe_code = "forbid" [[bench]] diff --git a/benches/ycsb.rs b/benches/ycsb.rs index 57cf7be..8ace7dc 100644 --- a/benches/ycsb.rs +++ b/benches/ycsb.rs @@ -1,24 +1,28 @@ -use better_mini_lsm::fibonacci; -use better_mini_lsm::persistent::LocalFs; -use better_mini_lsm::sst::SstOptions; -use better_mini_lsm::state::{LsmStorageState, Map}; -use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use itertools::Itertools; -use maplit::hashmap; use std::collections::HashMap; use std::sync::Arc; + +use criterion::{criterion_group, criterion_main, Criterion}; +use itertools::Itertools; +use maplit::hashmap; use tempfile::tempdir; use ycsb::db::DB; use ycsb::properties::Properties; use ycsb::workload::CoreWorkload; +use better_mini_lsm::persistent::LocalFs; +use better_mini_lsm::sst::SstOptions; +use better_mini_lsm::state::{LsmStorageState, Map}; + #[derive(Clone)] struct LsmStorageStateBench(Arc>); impl IsSend for LsmStorageStateBench {} impl IsSync for LsmStorageStateBench {} +#[allow(dead_code)] trait IsSend: Send {} + +#[allow(dead_code)] trait IsSync: Sync {} impl DB for LsmStorageStateBench { @@ -60,6 +64,7 @@ fn ycsb_bench(c: &mut Criterion) { .num_memtable_limit(1000) .compaction_option(Default::default()) .enable_wal(false) + .enable_mvcc(true) .build(); let runtime = tokio::runtime::Runtime::new().unwrap(); let state = diff --git a/rust-toolchain.toml 
b/rust-toolchain.toml index eb84621..eee1312 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,2 +1,2 @@ [toolchain] -channel = "nightly-2024-02-09" +channel = "nightly-2024-08-11" diff --git a/src/block/blocks.rs b/src/block/blocks.rs index 3945b46..f1fdc8a 100644 --- a/src/block/blocks.rs +++ b/src/block/blocks.rs @@ -1,7 +1,7 @@ -use crate::key::KeyBytes; -use bytes::Bytes; +use bytes::{Buf, Bytes}; -use crate::entry::Entry; +use crate::entry::InnerEntry; +use crate::key::{KeyBytes, KeySlice}; /// A block is the smallest unit of read and caching in LSM tree. It is a collection of sorted key-value pairs. #[derive(Debug)] @@ -51,7 +51,7 @@ impl Block { self.offsets.len() } - pub fn get_entry_ref(&self, index: usize) -> (&[u8], &[u8]) { + pub fn get_entry_ref(&self, index: usize) -> (KeySlice, &[u8]) { // get key let (data, key) = self.parse_key_ref(index); @@ -67,28 +67,34 @@ impl Block { (data, key, value) } - fn parse_key_ref(&self, index: usize) -> (&[u8], &[u8]) { + fn parse_key_ref(&self, index: usize) -> (&[u8], KeySlice) { let data = &self.data[self.offsets[index] as usize..]; if index == 0 { Self::get_uncompressed_key_ref(data) } else { let first_key = self.first_key_ref(); - Self::get_compressed_key_ref(first_key, data) + Self::get_compressed_key_ref(first_key.raw_ref(), data) } } - fn get_uncompressed_key_ref(data: &[u8]) -> (&[u8], &[u8]) { - get_value(data) + fn get_uncompressed_key_ref(data: &[u8]) -> (&[u8], KeySlice) { + let (data, raw_key) = get_value(data); + let (data, timestamp) = get_u64(data); + let output = KeySlice::new(raw_key, timestamp); + (data, output) } - fn get_compressed_key_ref<'b>(first_key: &[u8], data: &'b [u8]) -> (&'b [u8], &'b [u8]) { + // structure: [common_prefix_len, postfix_len, postfix, timestamp] + fn get_compressed_key_ref<'b>(first_key: &[u8], data: &'b [u8]) -> (&'b [u8], KeySlice<'b>) { let (data, common_prefix_len) = get_u16(data); let prefix = &first_key[..common_prefix_len]; let (data, postfix_len) 
= get_u16(data); let (data, postfix) = get_data_by_len(data, postfix_len); + let (data, timestamp) = get_u64(data); + // todo: 这里需要能把 (prefix: &[u8], postfix: &[u8]) 当作 &[u8] 的相关数据结构 (tuple of slices) let key = prefix .iter() @@ -97,29 +103,30 @@ impl Block { .collect::>() .leak(); + let key = KeySlice::new(key, timestamp); + (data, key) } - pub fn get_entry(&self, index: usize) -> Entry { + pub fn get_entry(&self, index: usize) -> InnerEntry { let (key, value) = self.get_entry_ref(index); - let key = Bytes::copy_from_slice(key); + let key = key.copy_to_key_bytes(); let value = Bytes::copy_from_slice(value); - Entry { key, value } + InnerEntry { key, value } } pub fn first_key(&self) -> KeyBytes { - let key = self.first_key_ref(); - KeyBytes::from_bytes(Bytes::copy_from_slice(key)) + self.first_key_ref().copy_to_key_bytes() } - fn first_key_ref(&self) -> &[u8] { + fn first_key_ref(&self) -> KeySlice { let (_, key) = self.parse_key_ref(0); key } pub fn last_key(&self) -> KeyBytes { let (_, key) = self.parse_key_ref(self.offsets.len() - 1); - KeyBytes::from_bytes(Bytes::copy_from_slice(key)) + key.copy_to_key_bytes() } } @@ -135,6 +142,12 @@ fn get_u16(data: &[u8]) -> (&[u8], usize) { (new_data, value) } +fn get_u64(data: &[u8]) -> (&[u8], u64) { + let new_data = &data[8..]; + let value = (&data[..8]).get_u64(); + (new_data, value) +} + fn get_data_by_len(data: &[u8], len: usize) -> (&[u8], &[u8]) { (&data[len..], &data[..len]) } diff --git a/src/block/builder.rs b/src/block/builder.rs index 7e0c342..4086ddc 100644 --- a/src/block/builder.rs +++ b/src/block/builder.rs @@ -1,7 +1,9 @@ -use crate::key::{KeySlice, KeyVec}; -use bytes::BufMut; use std::iter; +use bytes::BufMut; + +use crate::key::{KeySlice, KeyVec}; + use super::Block; /// Builds a block. 
@@ -74,8 +76,7 @@ impl BlockBuilder { compress_key(first_key, key, &mut self.data); } else { // first key - self.data.extend((key.len() as u16).to_be_bytes()); - self.data.extend(key.raw_ref()); + encode_key(key, &mut self.data); } self.data.extend((value.len() as u16).to_be_bytes()); @@ -91,6 +92,7 @@ impl BlockBuilder { fn compress_key(first_key: &KeyVec, key: KeySlice, buffer: &mut Vec) { let first_key = first_key.raw_ref(); + let timestamp = key.timestamp(); let key = key.raw_ref(); let common_prefix = iter::zip(first_key.iter(), key.iter()) @@ -102,15 +104,25 @@ fn compress_key(first_key: &KeyVec, key: KeySlice, buffer: &mut Vec) { if postfix > 0 { buffer.extend_from_slice(&key[common_prefix..]); } + buffer.put_u64(timestamp); +} + +// todo: 太多的 encoding 方法了,需要统一 +fn encode_key(key: KeySlice, buffer: &mut Vec) { + buffer.put_u16(key.len() as u16); + buffer.extend(key.raw_ref()); + buffer.put_u64(key.timestamp()); } #[cfg(test)] mod tests { - use crate::block::{Block, BlockBuilder, BlockIterator}; - use crate::key::{KeySlice, KeyVec}; + use std::sync::Arc; + use bytes::Bytes; use nom::AsBytes; - use std::sync::Arc; + + use crate::block::{Block, BlockBuilder, BlockIterator}; + use crate::key::{KeySlice, KeyVec}; #[test] fn test_block_build_single_key() { @@ -199,7 +211,7 @@ mod tests { for _ in 0..5 { let mut iter = BlockIterator::create_and_seek_to_first(block.clone()); for i in 0..num_of_keys() { - let entry = iter.next().unwrap().unwrap(); + let entry = iter.next().unwrap().unwrap().prune_ts(); let key = entry.key.as_bytes(); let value = entry.value.as_bytes(); assert_eq!( @@ -226,7 +238,7 @@ mod tests { let mut iter = BlockIterator::create_and_seek_to_key(block, key_of(0).as_key_slice()); for offset in 1..=5 { for i in 0..num_of_keys() { - let entry = iter.next().unwrap().unwrap(); + let entry = iter.next().unwrap().unwrap().prune_ts(); let key = entry.key.as_bytes(); let value = entry.value.as_bytes(); assert_eq!( diff --git a/src/block/iterator.rs 
b/src/block/iterator.rs index a9fa724..f69400e 100644 --- a/src/block/iterator.rs +++ b/src/block/iterator.rs @@ -1,8 +1,7 @@ use crate::block::blocks::Block; -use crate::entry::Entry; -use crate::key::{Key, KeySlice}; +use crate::entry::InnerEntry; +use crate::key::KeySlice; use std::sync::Arc; -use tracing::info; // Iterates on a block. pub struct BlockIterator { @@ -36,7 +35,6 @@ impl BlockIterator { let mut current = self.block.len(); for index in 0..self.block.len() { let (this_key, _) = self.block.get_entry_ref(index); - let this_key = Key::from_slice(this_key); if this_key >= key { current = index; break; @@ -47,7 +45,7 @@ impl BlockIterator { } impl Iterator for BlockIterator { - type Item = anyhow::Result; + type Item = anyhow::Result; fn next(&mut self) -> Option { if self.idx >= self.block.len() { diff --git a/src/bound.rs b/src/bound.rs index 4a9c297..8c116a6 100644 --- a/src/bound.rs +++ b/src/bound.rs @@ -1,38 +1,21 @@ -use bytes::Bytes; -use nom::AsBytes; use std::collections::Bound; use std::ops::RangeBounds; -/// Create a bound of `Bytes` from a bound of `&[u8]`. 
-pub(crate) fn map_bound_own(bound: Bound<&[u8]>) -> Bound { - map_bound(bound, Bytes::copy_from_slice) -} - -pub(crate) fn map_bound_ref(bound: Bound<&Bytes>) -> Bound<&[u8]> { - map_bound(bound, |b| b.as_bytes()) -} +use crate::key::KeyBytes; -/// todo: 用 Bound::map 替代 -pub(crate) fn map_bound U>(bound: Bound, f: F) -> Bound { - use Bound::{Excluded, Included, Unbounded}; - match bound { - Unbounded => Unbounded, - Included(x) => Included(f(x)), - Excluded(x) => Excluded(f(x)), - } -} +pub type BoundRange = (Bound, Bound); -pub struct BytesBound<'a> { - pub start: Bound<&'a [u8]>, - pub end: Bound<&'a [u8]>, +pub struct BytesBound { + pub start: Bound, + pub end: Bound, } -impl<'a> RangeBounds<[u8]> for BytesBound<'a> { - fn start_bound(&self) -> Bound<&[u8]> { - self.start +impl RangeBounds for BytesBound { + fn start_bound(&self) -> Bound<&KeyBytes> { + self.start.as_ref() } - fn end_bound(&self) -> Bound<&[u8]> { - self.end + fn end_bound(&self) -> Bound<&KeyBytes> { + self.end.as_ref() } } diff --git a/src/entry.rs b/src/entry.rs index 442a60c..bc4750c 100644 --- a/src/entry.rs +++ b/src/entry.rs @@ -1,42 +1,70 @@ +use std::cmp::Ordering; + +use crate::key::{Key, KeyBytes}; use bytes::Bytes; -use nom::AsBytes; -use std::cmp; +use derive_new::new; + +pub type Entry = Keyed; +pub type InnerEntry = Keyed; -#[derive(Debug, Eq)] -pub struct Entry { - pub key: Bytes, - pub value: Bytes, +#[derive(Debug, new)] +pub struct Keyed { + pub key: K, + pub value: V, } -impl PartialEq for Entry { +// todo: use Derivative for auto deriving +impl Eq for Keyed {} + +impl PartialEq for Keyed { fn eq(&self, other: &Self) -> bool { - self.cmp(other) == cmp::Ordering::Equal + PartialEq::eq(&self.key, &other.key) } } -impl PartialOrd for Entry { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) +impl PartialOrd for Keyed { + fn partial_cmp(&self, other: &Self) -> Option { + self.key.partial_cmp(&other.key) } } -impl Ord for Entry { - fn cmp(&self, other: 
&Self) -> cmp::Ordering { +impl Ord for Keyed { + fn cmp(&self, other: &Self) -> Ordering { self.key.cmp(&other.key) } } +impl Keyed { + pub fn into_tuple(self) -> (K, V) { + let Self { key, value } = self; + (key, value) + } +} + +impl Keyed, V> { + pub fn into_timed_tuple(self) -> (Keyed, u64) { + let Self { key, value } = self; + let (key, timestamp) = key.into_tuple(); + (Keyed { key, value }, timestamp) + } +} + #[cfg(test)] -impl Entry { +impl Keyed { pub fn from_slice(key: &[u8], value: &[u8]) -> Self { Self { key: Bytes::copy_from_slice(key), value: Bytes::copy_from_slice(value), } } +} - pub fn into_tuple(self) -> (Bytes, Bytes) { - let Self { key, value } = self; - (key, value) +#[cfg(test)] +impl InnerEntry { + pub fn prune_ts(self) -> Entry { + Entry { + key: self.key.into_inner(), + value: self.value, + } } } diff --git a/src/iterators/inspect.rs b/src/iterators/inspect.rs new file mode 100644 index 0000000..d2647a4 --- /dev/null +++ b/src/iterators/inspect.rs @@ -0,0 +1,24 @@ +use futures::{Stream, StreamExt}; + +pub struct InspectIterImpl; + +pub trait InspectIter { + type Stream, F: FnMut(&Item)>: Stream; + + fn inspect_stream(s: S, f: F) -> Self::Stream + where + S: Stream, + F: FnMut(&Item); +} + +impl InspectIter for InspectIterImpl { + type Stream, F: FnMut(&Item)> = impl Stream; + + fn inspect_stream(s: S, mut f: F) -> Self::Stream + where + S: Stream, + F: FnMut(&Item), + { + s.inspect(move |item| f(item)) + } +} diff --git a/src/iterators/lsm.rs b/src/iterators/lsm.rs index e69b348..1cade1c 100644 --- a/src/iterators/lsm.rs +++ b/src/iterators/lsm.rs @@ -1,25 +1,29 @@ +use bytes::Bytes; use std::collections::Bound; + use std::iter; use std::ops::Deref; use std::sync::Arc; use derive_new::new; -use futures::{stream, StreamExt}; +use futures::{stream, Stream, StreamExt}; use tracing::error; -use crate::entry::Entry; +use crate::entry::{Entry, InnerEntry, Keyed}; use crate::iterators::no_deleted::new_no_deleted_iter; use crate::iterators::{ - 
create_merge_iter_from_non_empty_iters, create_two_merge_iter, MergeIterator, - NoDeletedIterator, TwoMergeIterator, + create_merge_iter_from_non_empty_iters, create_two_merge_iter, MergeIterator, TwoMergeIterator, }; +use crate::key::Key; use crate::memtable::MemTableIterator; +use crate::mvcc::iterator::{build_time_dedup_iter, transform_bound}; use crate::persistent::Persistent; use crate::sst::iterator::MergedSstIterator; use crate::state::LsmStorageStateInner; -pub type LsmIterator<'a, File> = NoDeletedIterator, anyhow::Error>; +pub type LsmIterator<'a> = Box> + Unpin + Send + 'a>; +#[allow(dead_code)] type LsmIteratorInner<'a, File> = TwoMergeIterator< Entry, MergeIterator>, @@ -28,43 +32,76 @@ type LsmIteratorInner<'a, File> = TwoMergeIterator< #[derive(new)] pub struct LockedLsmIter<'a, P: Persistent> { - state: arc_swap::Guard>>, - lower: Bound<&'a [u8]>, - upper: Bound<&'a [u8]>, + state: Arc>, + pub(crate) lower: Bound<&'a [u8]>, + pub(crate) upper: Bound<&'a [u8]>, + timestamp: u64, } +fn assert_raw_stream(_s: &impl Stream>) {} + +fn assert_tuple_stream(_s: &impl Stream, u64)>>) {} + +fn assert_result_stream(_s: &impl Stream>>) {} + impl<'a, P> LockedLsmIter<'a, P> where P: Persistent, { - pub async fn iter(&'a self) -> anyhow::Result> { + pub async fn iter(&'a self) -> anyhow::Result> { + let time_dedup = self.iter_with_delete().await?; + let iter = new_no_deleted_iter(time_dedup); + let iter = Box::new(iter) as _; + Ok(iter) + } + + pub async fn iter_with_delete( + &self, + ) -> anyhow::Result> + Unpin + Send + '_> { let a = self.build_memtable_iter().await; + assert_raw_stream(&a); let b = self.build_sst_iter().await?; + assert_raw_stream(&b); let merge = create_two_merge_iter(a, b).await?; - let iter = new_no_deleted_iter(merge); - Ok(iter) + assert_raw_stream(&merge); + let merge = merge.map(|entry| entry.map(Keyed::into_timed_tuple)); + assert_tuple_stream(&merge); + let time_dedup = build_time_dedup_iter(merge, self.timestamp); + 
assert_result_stream(&time_dedup); + Ok(time_dedup) } - pub async fn build_memtable_iter(&self) -> MergeIterator { + pub async fn build_memtable_iter(&self) -> MergeIterator { + let (lower, upper) = transform_bound(self.lower, self.upper, self.timestamp); + let lower = lower.map(Key::from); + let upper = upper.map(Key::from); + let memtable = self.state.memtable().deref().as_immutable_ref(); let imm_memtables = self.state.imm_memtables().as_slice(); let imm_memtables = imm_memtables.iter().map(Arc::as_ref); let tables = iter::once(memtable).chain(imm_memtables); - let iters = stream::iter(tables).filter_map(move |table| async { - table - .scan(self.lower, self.upper) - .await - .inspect_err(|e| error!(error = ?e)) - .ok() - .flatten() + let iters = stream::iter(tables).filter_map(move |table| { + // todo: 这里不用每个 loop 都 copy,可以放在外面? + let lower = lower.map(|ks| ks.map(Bytes::copy_from_slice)); + let upper = upper.map(|ks| ks.map(Bytes::copy_from_slice)); + + async { + table + .scan_with_ts(lower, upper) + .await + .inspect_err(|e| error!(error = ?e)) + .ok() + .flatten() + } }); create_merge_iter_from_non_empty_iters(iters).await } pub async fn build_sst_iter(&self) -> anyhow::Result> { - self.state - .sstables_state() - .scan_sst(self.lower, self.upper) - .await + let (lower, upper) = transform_bound(self.lower, self.upper, self.timestamp); + let lower = lower.map(Key::from); + let upper = upper.map(Key::from); + + self.state.sstables_state().scan_sst(lower, upper).await } } diff --git a/src/iterators/maybe_empty.rs b/src/iterators/maybe_empty.rs index d32bc46..7a0aeb5 100644 --- a/src/iterators/maybe_empty.rs +++ b/src/iterators/maybe_empty.rs @@ -1,5 +1,5 @@ use futures::{Stream, StreamExt}; -use std::future::Future; + use tracing::Instrument; pub type MaybeEmptyStream = Option>; diff --git a/src/iterators/merge.rs b/src/iterators/merge.rs index 36d0250..ad8eb35 100644 --- a/src/iterators/merge.rs +++ b/src/iterators/merge.rs @@ -1,15 +1,14 @@ use 
std::collections::BinaryHeap; use std::fmt::Debug; use std::future::ready; -use std::future::Future; -use std::marker::PhantomData; + use std::pin::Pin; use std::task::{Context, Poll}; use futures::stream::unfold; use futures::{pin_mut, FutureExt, Stream, StreamExt}; use pin_project::pin_project; -use tracing::{error, info, Instrument}; +use tracing::error; use crate::iterators::maybe_empty::NonEmptyStream; use crate::iterators::merge::heap::HeapWrapper; @@ -45,7 +44,7 @@ where Item: Ord + Debug, I: Stream> + Unpin, { - iters: Pin>>, + iters: Pin::HeapStream>>, } impl MergeIteratorInner @@ -68,7 +67,7 @@ where .collect() .await; Self { - iters: Box::pin(build_heap_stream(iters)), + iters: Box::pin(HeapStreamBuilderImpl::build_heap_stream(iters)), } } } @@ -88,17 +87,47 @@ where } } -type HeapStream> + Unpin> = - impl Stream>; +pub trait HeapStreamBuilder { + type HeapStream> + Unpin>: Stream< + Item = anyhow::Result, + >; -fn build_heap_stream(heap: BinaryHeap>) -> HeapStream -where - I: Stream> + Unpin, - Item: Ord + Debug, -{ - unfold(heap, unfold_fn) + fn build_heap_stream( + heap: BinaryHeap>, + ) -> Self::HeapStream + where + I: Stream> + Unpin, + Item: Ord + Debug; } +pub struct HeapStreamBuilderImpl; + +impl HeapStreamBuilder for HeapStreamBuilderImpl { + type HeapStream> + Unpin> = + impl Stream>; + + fn build_heap_stream( + heap: BinaryHeap>, + ) -> Self::HeapStream + where + I: Stream> + Unpin, + Item: Ord + Debug, + { + unfold(heap, unfold_fn) + } +} + +// type HeapStream> + Unpin> = +// impl Stream>; +// +// fn build_heap_stream(heap: BinaryHeap>) -> HeapStream +// where +// I: Stream> + Unpin, +// Item: Ord + Debug, +// { +// unfold(heap, unfold_fn) +// } + async fn unfold_fn( mut heap: BinaryHeap>, ) -> Option<(anyhow::Result, BinaryHeap>)> @@ -133,7 +162,7 @@ mod test { use crate::entry::Entry; use crate::iterators::create_merge_iter; use crate::iterators::merge::MergeIteratorInner; - use crate::iterators::utils::{assert_stream_eq, build_stream, 
build_tuple_stream}; + use crate::iterators::utils::test_utils::{assert_stream_eq, build_stream, build_tuple_stream}; #[tokio::test] async fn test_empty() { diff --git a/src/iterators/merge/heap.rs b/src/iterators/merge/heap.rs index 492ce9d..a016844 100644 --- a/src/iterators/merge/heap.rs +++ b/src/iterators/merge/heap.rs @@ -1,7 +1,7 @@ use crate::iterators::NonEmptyStream; use std::cmp; -pub(super) struct HeapWrapper { +pub struct HeapWrapper { pub index: usize, pub iter: NonEmptyStream, } diff --git a/src/iterators/mod.rs b/src/iterators/mod.rs index 1a0976d..d25dd8a 100644 --- a/src/iterators/mod.rs +++ b/src/iterators/mod.rs @@ -1,4 +1,5 @@ -mod lsm; +pub mod inspect; +pub mod lsm; mod maybe_empty; pub mod merge; pub mod no_deleted; @@ -10,9 +11,9 @@ pub mod utils; pub use lsm::LockedLsmIter; pub use maybe_empty::{MaybeEmptyStream, NonEmptyStream}; pub use merge::{create_merge_iter, create_merge_iter_from_non_empty_iters, MergeIterator}; -pub use no_deleted::NoDeletedIterator; + pub use ok_iter::OkIter; pub use two_merge::{create_two_merge_iter, TwoMergeIterator}; pub use utils::iter_fut_iter_to_stream; pub use utils::split_first; -pub use utils::{eq, iter_fut_to_stream, transpose_try_iter}; +pub use utils::transpose_try_iter; diff --git a/src/iterators/two_merge.rs b/src/iterators/two_merge.rs index 1c23007..a0c6768 100644 --- a/src/iterators/two_merge.rs +++ b/src/iterators/two_merge.rs @@ -1,44 +1,71 @@ +use futures::stream::unfold; +use futures::Stream; use std::fmt::Debug; use std::future::Future; -use futures::stream::unfold; -use futures::{Stream, StreamExt}; - use crate::iterators::no_duplication::{new_no_duplication, NoDuplication}; use crate::iterators::{MaybeEmptyStream, NonEmptyStream}; // Merges two iterators of different types into one. If the two iterators have the same key, only /// produce the key once and prefer the entry from A. 
-pub type TwoMergeIterator = NoDuplication>; +pub type TwoMergeIterator = + NoDuplication<::MyTwoMergeIterInner>; pub async fn create_two_merge_iter( a: A, b: B, -) -> anyhow::Result>> +) -> anyhow::Result< + NoDuplication<::MyTwoMergeIterInner>, +> where Item: Ord + Debug + Unpin, A: Stream> + Unpin, B: Stream> + Unpin, { - let inner = create_inner(a, b).await?; + let inner = TwoMergeIterImpl::create_inner(a, b).await?; Ok(new_no_duplication(inner)) } -pub type TwoMergeIterInner< - Item: Ord + Debug + Unpin, - A: Stream> + Unpin, - B: Stream> + Unpin, -> = impl Stream> + Unpin; -pub async fn create_inner(a: A, b: B) -> anyhow::Result> -where - Item: Ord + Debug + Unpin, - A: Stream> + Unpin, - B: Stream> + Unpin, -{ - let a = NonEmptyStream::try_new(a).await?; - let b = NonEmptyStream::try_new(b).await?; - let x = unfold((a, b), unfold_fn); - Ok(Box::pin(x)) +pub trait TwoMergeIter { + type MyTwoMergeIterInner< + Item: Ord + Debug + Unpin, + A: Stream> + Unpin, + B: Stream> + Unpin, + >: Stream> + Unpin; + + fn create_inner( + a: A, + b: B, + ) -> impl Future>> + where + Item: Ord + Debug + Unpin, + A: Stream> + Unpin, + B: Stream> + Unpin; +} + +pub struct TwoMergeIterImpl; + +impl TwoMergeIter for TwoMergeIterImpl { + type MyTwoMergeIterInner< + Item: Ord + Debug + Unpin, + A: Stream> + Unpin, + B: Stream> + Unpin, + > = impl Stream> + Unpin; + + async fn create_inner( + a: A, + b: B, + ) -> anyhow::Result> + where + Item: Ord + Debug + Unpin, + A: Stream> + Unpin, + B: Stream> + Unpin, + { + let a = NonEmptyStream::try_new(a).await?; + let b = NonEmptyStream::try_new(b).await?; + let x = unfold((a, b), unfold_fn); + Ok(Box::pin(x)) + } } async fn unfold_fn( diff --git a/src/iterators/utils.rs b/src/iterators/utils.rs index 24692e8..657699b 100644 --- a/src/iterators/utils.rs +++ b/src/iterators/utils.rs @@ -1,16 +1,13 @@ -use bytes::Bytes; -use std::fmt::Debug; use std::future::Future; use std::iter::Map; use std::iter::Once; -use std::pin::pin; -use 
std::{iter, vec}; -use crate::entry::Entry; +use std::iter; + use either::Either; use futures::future::IntoStream; use futures::stream::{FlatMap, Flatten, Iter}; -use futures::{stream, FutureExt, Stream, StreamExt}; +use futures::{stream, FutureExt, StreamExt}; pub fn transpose_try_iter(iterator: Result) -> Either>> where @@ -58,72 +55,79 @@ where stream::iter(iterator.map(FutureExt::into_stream as fn(_) -> _)).flatten() } -pub async fn assert_stream_eq(s1: S1, s2: S2) -where - S1: Stream, - S2: Stream, - S1::Item: PartialEq + Debug, - S2::Item: Debug, -{ - let s1: Vec<_> = s1.collect().await; - let s2: Vec<_> = s2.collect().await; - assert_eq!(s1, s2); -} +#[cfg(test)] +pub mod test_utils { + use crate::entry::Entry; + use bytes::Bytes; + use futures::stream::Iter; + use futures::{stream, Stream, StreamExt}; + use std::fmt::Debug; + use std::pin::pin; + use std::vec; -pub async fn eq(s1: S1, s2: S2) -> bool -where - S1: Stream, - S2: Stream, - S1::Item: PartialEq + Debug, - S2::Item: Debug, -{ - let mut s1 = pin!(s1); - let mut s2 = pin!(s2); - loop { - match (s1.next().await, s2.next().await) { - (Some(x1), Some(x2)) => { - if x1 != x2 { - dbg!((x1, x2)); - return false; - } - } - (Some(_), None) | (None, Some(_)) => return false, - (None, None) => return true, - } + pub type EntryStream = Iter>; + + pub fn build_stream<'a>(source: impl IntoIterator) -> EntryStream { + let s: Vec<_> = source + .into_iter() + .map(|(key, value)| Entry::from_slice(key.as_bytes(), value.as_bytes())) + .collect(); + stream::iter(s) } -} -#[cfg(test)] -pub type EntryStream = Iter>; + pub fn build_tuple_stream<'a>( + source: impl IntoIterator, + ) -> impl Stream { + let s: Vec<_> = source + .into_iter() + .map(|(key, value)| { + ( + Bytes::copy_from_slice(key.as_bytes()), + Bytes::copy_from_slice(value.as_bytes()), + ) + }) + .collect(); + stream::iter(s) + } -#[cfg(test)] -pub fn build_stream<'a>(source: impl IntoIterator) -> EntryStream { - let s: Vec<_> = source - .into_iter() - 
.map(|(key, value)| Entry::from_slice(key.as_bytes(), value.as_bytes())) - .collect(); - stream::iter(s) -} + pub async fn assert_stream_eq(s1: S1, s2: S2) + where + S1: Stream, + S2: Stream, + S1::Item: PartialEq + Debug, + S2::Item: Debug, + { + let s1: Vec<_> = s1.collect().await; + let s2: Vec<_> = s2.collect().await; + assert_eq!(s1, s2); + } -#[cfg(test)] -pub fn build_tuple_stream<'a>( - source: impl IntoIterator, -) -> impl Stream { - let s: Vec<_> = source - .into_iter() - .map(|(key, value)| { - ( - Bytes::copy_from_slice(key.as_bytes()), - Bytes::copy_from_slice(value.as_bytes()), - ) - }) - .collect(); - stream::iter(s) + pub async fn eq(s1: S1, s2: S2) -> bool + where + S1: Stream, + S2: Stream, + S1::Item: PartialEq + Debug, + S2::Item: Debug, + { + let mut s1 = pin!(s1); + let mut s2 = pin!(s2); + loop { + match (s1.next().await, s2.next().await) { + (Some(x1), Some(x2)) => { + if x1 != x2 { + return false; + } + } + (Some(_), None) | (None, Some(_)) => return false, + (None, None) => return true, + } + } + } } #[cfg(test)] mod tests { - use super::eq; + use crate::iterators::utils::test_utils::eq; use futures::stream; use std::fmt::Debug; diff --git a/src/key.rs b/src/key.rs index cfe761c..bbe28db 100644 --- a/src/key.rs +++ b/src/key.rs @@ -1,166 +1,177 @@ -use bytes::Bytes; +use bytes::{Buf, Bytes}; +use derive_new::new; +use nom::AsBytes; + +use std::cmp::Ordering; use std::fmt::Debug; -use std::ops::Bound; -#[derive(PartialEq)] -pub struct Key>(T); +#[derive(PartialEq, Eq, Debug, new, Default, Clone, Copy)] +pub struct Key { + pub key: T, + pub timestamp: u64, +} pub type KeySlice<'a> = Key<&'a [u8]>; pub type KeyVec = Key>; pub type KeyBytes = Key; -impl> Key { - pub fn into_inner(self) -> T { - self.0 +impl Key { + pub fn map(self, f: impl FnOnce(T) -> U) -> Key { + Key::new(f(self.key), self.timestamp) } - pub fn len(&self) -> usize { - self.0.as_ref().len() + pub fn as_ref(&self) -> Key<&T> { + Key::new(&self.key, self.timestamp) } - pub fn 
is_empty(&self) -> bool { - self.0.as_ref().is_empty() + pub fn timestamp(&self) -> u64 { + self.timestamp } - pub fn for_testing_ts(self) -> u64 { - 0 + pub fn into_tuple(self) -> (T, u64) { + (self.key, self.timestamp) } } -impl Key> { - pub fn new() -> Self { - Self(Vec::new()) +impl From<(T, u64)> for Key { + fn from(pair: (T, u64)) -> Self { + Self { + key: pair.0, + timestamp: pair.1, + } } +} - /// Create a `KeyVec` from a `Vec`. Will be removed in week 3. - pub fn from_vec(key: Vec) -> Self { - Self(key) +#[cfg(test)] +impl Key { + pub fn new_for_test(key: &[u8], ts: u64) -> Self { + Self::new(Bytes::copy_from_slice(key), ts) } +} - /// Clears the key and set ts to 0. - pub fn clear(&mut self) { - self.0.clear() +impl> Key { + pub fn into_inner(self) -> T { + self.key } - /// Append a slice to the end of the key - pub fn append(&mut self, data: &[u8]) { - self.0.extend(data) + pub fn len(&self) -> usize { + self.key.as_ref().len() } - /// Set the key from a slice without re-allocating. The signature will change in week 3. - pub fn set_from_slice(&mut self, key_slice: KeySlice) { - self.0.clear(); - self.0.extend(key_slice.0); + pub fn is_empty(&self) -> bool { + self.key.as_ref().is_empty() + } + + pub fn copy_to_key_bytes(self) -> KeyBytes { + self.map(|slice| Bytes::copy_from_slice(slice.as_ref())) + } + + pub fn to_byte_iter(&self) -> impl Iterator + '_ { + let key = self.key.as_ref(); + let key_len = (key.len() as u16).to_be_bytes().into_iter(); + let key = key.iter().copied(); + let timestamp = self.timestamp.to_be_bytes().into_iter(); + + key_len.chain(key).chain(timestamp) } +} +impl Key> { pub fn as_key_slice(&self) -> KeySlice { - Key(self.0.as_slice()) + self.as_ref().map(Vec::as_slice) } pub fn into_key_bytes(self) -> KeyBytes { - Key(self.0.into()) + self.map(Into::into) } /// Always use `raw_ref` to access the key in week 1 + 2. This function will be removed in week 3. 
pub fn raw_ref(&self) -> &[u8] { - self.0.as_ref() + self.key.as_ref() } pub fn for_testing_key_ref(&self) -> &[u8] { - self.0.as_ref() + self.key.as_ref() } pub fn for_testing_from_vec_no_ts(key: Vec) -> Self { - Self(key) + Self::new(key, 0) } } impl Key { pub fn as_key_slice(&self) -> KeySlice { - Key(&self.0) + self.as_ref().map(|b| b.as_bytes()) } /// Create a `KeyBytes` from a `Bytes`. Will be removed in week 3. - pub fn from_bytes(bytes: Bytes) -> KeyBytes { - Key(bytes) + pub fn from_bytes(_bytes: Bytes) -> KeyBytes { + todo!() } /// Always use `raw_ref` to access the key in week 1 + 2. This function will be removed in week 3. pub fn raw_ref(&self) -> &[u8] { - self.0.as_ref() + self.key.as_ref() } - pub fn for_testing_from_bytes_no_ts(bytes: Bytes) -> KeyBytes { - Key(bytes) + pub fn for_testing_from_bytes_no_ts(_bytes: Bytes) -> KeyBytes { + todo!() + // Key(bytes) } pub fn for_testing_key_ref(&self) -> &[u8] { - self.0.as_ref() + todo!() + // self.0.as_ref() + } + + pub fn decode(buf: &mut impl Buf) -> KeyBytes { + let key_len = buf.get_u16() as usize; + let key = buf.copy_to_bytes(key_len); + let timestamp = buf.get_u64(); + Key::new(key, timestamp) } } impl<'a> Key<&'a [u8]> { pub fn to_key_vec(self) -> KeyVec { - Key(self.0.to_vec()) - } - - /// Create a key slice from a slice. Will be removed in week 3. - pub fn from_slice(slice: &'a [u8]) -> Self { - Self(slice) + self.map(|key| key.to_vec()) } /// Always use `raw_ref` to access the key in week 1 + 2. This function will be removed in week 3. 
pub fn raw_ref(self) -> &'a [u8] { - self.0 + self.key } pub fn for_testing_key_ref(self) -> &'a [u8] { - self.0 + todo!() } pub fn for_testing_from_slice_no_ts(slice: &'a [u8]) -> Self { - Self(slice) + Self::new(slice, 123) } - pub fn for_testing_from_slice_with_ts(slice: &'a [u8], _ts: u64) -> Self { - Self(slice) + pub fn for_testing_from_slice_with_ts(slice: &'a [u8], ts: u64) -> Self { + Self::new(slice, ts) } } -impl + Debug> Debug for Key { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.0.fmt(f) +impl PartialOrd for Key { + fn partial_cmp(&self, other: &Self) -> Option { + let key_order = self.key.partial_cmp(&other.key)?; + match key_order { + Ordering::Equal => self + .timestamp + .partial_cmp(&other.timestamp) + .map(Ordering::reverse), + _ => Some(key_order), + } } } -impl + Default> Default for Key { - fn default() -> Self { - Self(T::default()) - } -} - -impl + Eq> Eq for Key {} - -impl + Clone> Clone for Key { - fn clone(&self) -> Self { - Self(self.0.clone()) +impl Ord for Key { + fn cmp(&self, other: &Self) -> Ordering { + self.key + .cmp(&other.key) + .then_with(|| self.timestamp.cmp(&other.timestamp).reverse()) } } - -impl + Copy> Copy for Key {} - -impl + PartialOrd> PartialOrd for Key { - fn partial_cmp(&self, other: &Self) -> Option { - self.0.partial_cmp(&other.0) - } -} - -impl + Ord> Ord for Key { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.0.cmp(&other.0) - } -} - -pub fn bound_bytes_as_ref(b: &Bound) -> Bound<&[u8]> { - b.as_ref().map(Bytes::as_ref) -} diff --git a/src/lib.rs b/src/lib.rs index 9283cc0..763f64e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -#![feature(type_alias_impl_trait)] +#![feature(impl_trait_in_assoc_type)] mod block; mod bound; @@ -11,24 +11,9 @@ pub mod sst; pub mod state; mod wal; -mod lsm; +pub mod lsm; mod manifest; +pub mod mvcc; mod test_utils; +pub mod time; mod utils; - -pub async fn fibonacci(n: u64) -> u64 { - let mut a = 0; - let mut b = 1; - - 
match n { - 0 => b, - _ => { - for _ in 0..n { - let c = a + b; - a = b; - b = c; - } - b - } - } -} diff --git a/src/lsm/core.rs b/src/lsm/core.rs index 2567152..b3afb35 100644 --- a/src/lsm/core.rs +++ b/src/lsm/core.rs @@ -1,30 +1,27 @@ use std::future::{ready, Future}; use std::sync::Arc; -use std::thread; -use std::thread::sleep; + use std::time::Duration; use bytes::Bytes; -use futures::executor::block_on; -use futures::{ready, FutureExt, StreamExt}; + +use crate::persistent::Persistent; +use crate::sst::SstOptions; +use crate::state::{LsmStorageState, Map}; +use futures::{FutureExt, StreamExt}; use futures_concurrency::stream::Merge; -use tokio::sync::MutexGuard; -use tokio::task::{block_in_place, JoinHandle}; +use tokio::runtime::Handle; +use tokio::task::JoinHandle; use tokio::time::interval; use tokio_stream::wrappers::IntervalStream; use tokio_util::sync::CancellationToken; use tracing::error; -use crate::persistent::Persistent; -use crate::sst::SstOptions; -use crate::state::{LsmStorageState, Map}; -use crate::utils::func::do_nothing; - pub struct Lsm { state: Arc>, cancel_token: CancellationToken, flush_handle: Option>, - compaction_handle: Option>, + spawn_handle: Option>, } impl Lsm

{ @@ -32,19 +29,18 @@ impl Lsm

{ let state = Arc::new(LsmStorageState::new(options, persistent).await?); let cancel_token = CancellationToken::new(); let flush_handle = Self::spawn_flush(state.clone(), cancel_token.clone()); - let compaction_handle = Self::spawn_compaction(state.clone(), cancel_token.clone()); + let spawn_handle = Self::spawn_compaction(state.clone(), cancel_token.clone()); let this = Self { state, cancel_token, flush_handle: Some(flush_handle), - compaction_handle: Some(compaction_handle), + spawn_handle: Some(spawn_handle), }; Ok(this) } pub async fn sync(&self) -> anyhow::Result<()> { - // todo - Ok(()) + self.state.sync_wal().await } fn spawn_flush( @@ -82,7 +78,6 @@ impl Lsm

{ .take_while(|signal| ready(matches!(signal, Trigger))) .for_each(|_| async { let lock = state.state_lock().lock().await; - // println!("trigger compaction"); state .force_compact(&lock) .await @@ -123,6 +118,19 @@ where impl Drop for Lsm

{ fn drop(&mut self) { self.cancel_token.cancel(); + let flush_handle = self.flush_handle.take(); + let spawn_handle = self.spawn_handle.take(); + + tokio::task::block_in_place(|| { + Handle::current().block_on(async { + if let Some(flush_handle) = flush_handle { + flush_handle.await.inspect_err(|e| error!(error = ?e)).ok(); + } + if let Some(spawn_handle) = spawn_handle { + spawn_handle.await.inspect_err(|e| error!(error = ?e)).ok(); + } + }) + }); } } @@ -146,13 +154,11 @@ enum Signal { #[cfg(test)] mod tests { - use std::time::Duration; - use nom::AsBytes; + + use std::time::Duration; use tempfile::{tempdir, TempDir}; use tokio::time::sleep; - use tracing_futures::Instrument; - use tracing_subscriber::fmt::format::FmtSpan; use crate::lsm::core::Lsm; use crate::persistent::{LocalFs, Persistent}; @@ -192,6 +198,7 @@ mod tests { // .is_empty()); // } + #[allow(dead_code)] async fn build_lsm(dir: &TempDir) -> anyhow::Result> { let persistent = LocalFs::new(dir.path().to_path_buf()); let options = SstOptions::builder() @@ -204,7 +211,7 @@ mod tests { Lsm::new(options, persistent).await } - #[tokio::test] + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] async fn test_auto_compaction() { let dir = tempdir().unwrap(); let persistent = LocalFs::new(dir.path().to_path_buf()); @@ -219,6 +226,7 @@ mod tests { .num_memtable_limit(1000) .compaction_option(CompactionOptions::Leveled(compaction_options)) .enable_wal(false) + .enable_mvcc(true) .build(); let lsm = Lsm::new(options, persistent).await.unwrap(); for i in 0..10 { @@ -226,7 +234,6 @@ mod tests { insert_sst(&lsm, begin..begin + 100).await.unwrap(); } sleep(Duration::from_secs(2)).await; - dbg!(&lsm.state); for i in 0..10 { let begin = i * 100; @@ -240,7 +247,7 @@ mod tests { } } - #[tokio::test] + #[tokio::test(flavor = "multi_thread", worker_threads = 1)] async fn test_wal_integration() { let compaction_options = LeveledCompactionOptions::builder() .max_levels(3) @@ -253,6 +260,7 @@ mod tests { 
.num_memtable_limit(10) .compaction_option(CompactionOptions::Leveled(compaction_options)) .enable_wal(true) + .enable_mvcc(true) .build(); let dir = tempdir().unwrap(); let persistent = LocalFs::new(dir.path().to_path_buf()); @@ -271,12 +279,12 @@ mod tests { let persistent = LocalFs::new(dir.path().to_path_buf()); let lsm = Lsm::new(options, persistent).await.unwrap(); assert_eq!( - &lsm.get(b"key-0").await.unwrap().unwrap()[..], - b"value-1024".as_slice() + std::str::from_utf8(&lsm.get(b"key-0").await.unwrap().unwrap()[..]).unwrap(), + "value-1024", ); assert_eq!( - &lsm.get(b"key-1").await.unwrap().unwrap()[..], - b"value-1024".as_slice() + std::str::from_utf8(&lsm.get(b"key-1").await.unwrap().unwrap()[..]).unwrap(), + "value-1024", ); assert_eq!(lsm.get(b"key-2").await.unwrap(), None); } diff --git a/src/lsm/mod.rs b/src/lsm/mod.rs index 9816037..5a7ca06 100644 --- a/src/lsm/mod.rs +++ b/src/lsm/mod.rs @@ -1 +1 @@ -mod core; +pub mod core; diff --git a/src/manifest.rs b/src/manifest.rs index 8ca1ecc..9e8d170 100644 --- a/src/manifest.rs +++ b/src/manifest.rs @@ -1,5 +1,4 @@ use std::future::Future; -use std::io::{Read, Write}; use std::sync::Arc; use crate::persistent::interface::ManifestHandle; @@ -8,7 +7,7 @@ use derive_more::From; use serde::{Deserialize, Serialize}; use serde_json::Deserializer; use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tokio::sync::{Mutex, MutexGuard}; +use tokio::sync::Mutex; use crate::persistent::Persistent; use crate::sst::compact::common::CompactionTask; @@ -80,7 +79,7 @@ impl Manifest { mod tests { use crate::manifest::{Compaction, Flush, Manifest, ManifestRecord, NewMemtable}; use crate::persistent::LocalFs; - use crate::sst::compact::common::CompactionTask; + use crate::sst::compact::common::{CompactionTask, SourceIndex}; use tempfile::tempdir; #[tokio::test] @@ -93,7 +92,10 @@ mod tests { { let manifest = Manifest::create(&persistent).await.unwrap(); - let record = Compaction(CompactionTask::new(1, 2, 3), vec![1, 2, 3]); 
+ let record = Compaction( + CompactionTask::new(1, SourceIndex::Index { index: 2 }, 3), + vec![1, 2, 3], + ); manifest .add_record_when_init(R::Compaction(record)) .await @@ -109,9 +111,12 @@ mod tests { } { - let (manifest, records) = Manifest::recover(&persistent).await.unwrap(); + let (_manifest, records) = Manifest::recover(&persistent).await.unwrap(); - let record = Compaction(CompactionTask::new(1, 2, 3), vec![1, 2, 3]); + let record = Compaction( + CompactionTask::new(1, SourceIndex::Index { index: 2 }, 3), + vec![1, 2, 3], + ); assert_eq!( records, diff --git a/src/memtable/immutable.rs b/src/memtable/immutable.rs index e6498e8..dd3a789 100644 --- a/src/memtable/immutable.rs +++ b/src/memtable/immutable.rs @@ -1,11 +1,12 @@ use std::collections::Bound; use std::fmt::{Debug, Formatter}; +use crate::key::{KeyBytes, KeySlice}; use bytemuck::TransparentWrapper; use bytes::Bytes; use crossbeam_skiplist::map; -use deref_ext::DerefExt; use derive_new::new; + use ref_cast::RefCast; use crate::memtable::iterator::MaybeEmptyMemTableIterRef; @@ -35,6 +36,7 @@ impl ImmutableMemTable { } } +// todo: remove it impl ImmutableMemTable { pub async fn scan<'a>( &'a self, @@ -48,11 +50,25 @@ impl ImmutableMemTable { self.0.get(key) } - pub fn iter(&self) -> impl Iterator> { + pub fn iter(&self) -> impl Iterator> { self.0.map().iter() } } +impl ImmutableMemTable { + pub fn get_with_ts(&self, key: KeySlice) -> Option { + self.0.get_with_ts(key) + } + + pub async fn scan_with_ts( + &self, + lower: Bound, + upper: Bound, + ) -> anyhow::Result> { + self.0.scan_with_ts(lower, upper).await + } +} + impl From> for ImmutableMemTable { fn from(table: MemTable) -> Self { Self::new(table) diff --git a/src/memtable/iterator.rs b/src/memtable/iterator.rs index c90c858..d664e32 100644 --- a/src/memtable/iterator.rs +++ b/src/memtable/iterator.rs @@ -1,4 +1,3 @@ -use std::collections::Bound; use std::iter; use bytes::Bytes; @@ -6,46 +5,45 @@ use crossbeam_skiplist::map; use futures::stream; 
use crate::bound::BytesBound; -use crate::entry::Entry; +use crate::entry::InnerEntry; use crate::iterators::{MaybeEmptyStream, NonEmptyStream, OkIter}; +use crate::key::KeyBytes; pub type MemTableIterator<'a> = stream::Iter>>; type ClonedSkipMapRangeIter<'a> = - iter::Map, for<'b> fn(SkipMapRangeEntry<'b>) -> Entry>; + iter::Map, for<'b> fn(map::Entry<'b, KeyBytes, Bytes>) -> InnerEntry>; pub fn new_memtable_iter(iter: SkipMapRangeIter) -> MemTableIterator { - let iter = iter.map(convert_entry as for<'a> fn(map::Entry<'a, Bytes, Bytes>) -> _); + let iter = iter.map(convert_entry as for<'a> fn(map::Entry<'a, KeyBytes, Bytes>) -> _); stream::iter(OkIter::new(iter)) } -fn convert_entry(x: map::Entry<'_, Bytes, Bytes>) -> Entry { - Entry { +fn convert_entry(x: map::Entry<'_, KeyBytes, Bytes>) -> InnerEntry { + InnerEntry { key: x.key().clone(), value: x.value().clone(), } } -type SkipMapRangeIter<'a> = map::Range<'a, [u8], BytesBound<'a>, Bytes, Bytes>; +type SkipMapRangeIter<'a> = map::Range<'a, KeyBytes, BytesBound, KeyBytes, Bytes>; -type SkipMapRangeEntry<'a> = map::Entry<'a, Bytes, Bytes>; - -pub type NonEmptyMemTableIterRef<'a> = NonEmptyStream>; -pub type MaybeEmptyMemTableIterRef<'a> = MaybeEmptyStream>; +#[allow(dead_code)] +pub type NonEmptyMemTableIterRef<'a> = NonEmptyStream>; +pub type MaybeEmptyMemTableIterRef<'a> = MaybeEmptyStream>; #[cfg(test)] mod test { + use std::collections::Bound; + + use futures::{stream, Stream, StreamExt}; + use nom::AsBytes; + use crate::entry::Entry; - use crate::iterators::{ - create_merge_iter, create_merge_iter_from_non_empty_iters, eq, MergeIterator, - }; + use crate::iterators::create_merge_iter_from_non_empty_iters; use crate::memtable::MemTable; use crate::persistent::interface::WalHandle; use crate::persistent::wal_handle::WalFile; - use bytes::Bytes; - use futures::{stream, Stream, StreamExt}; - use nom::AsBytes; - use std::collections::Bound; - use std::vec; + use crate::test_utils::iterator::unwrap_ts_stream; 
#[tokio::test] async fn test_task1_memtable_iter() { @@ -143,6 +141,7 @@ mod test { ) -> impl Stream> + Send + 'a { let iter = memtable.for_testing_scan_slice(lower, upper).await.unwrap(); - create_merge_iter_from_non_empty_iters(stream::iter(iter.into_iter())).await + let x = create_merge_iter_from_non_empty_iters(stream::iter(iter.into_iter())).await; + unwrap_ts_stream(x) } } diff --git a/src/memtable/mutable.rs b/src/memtable/mutable.rs index cecbc35..75aa3e1 100644 --- a/src/memtable/mutable.rs +++ b/src/memtable/mutable.rs @@ -1,7 +1,7 @@ use std::fmt::{Debug, Formatter}; -use std::future::Future; -use std::ops::{Bound, RangeBounds}; -use std::path::{Path, PathBuf}; + +use std::ops::Bound; +use std::slice; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; @@ -9,18 +9,18 @@ use bytemuck::TransparentWrapperAlloc; use bytes::Bytes; use crossbeam_skiplist::SkipMap; use derive_getters::Getters; -use nom::AsBytes; use ref_cast::RefCast; use tracing_futures::Instrument; use crate::bound::BytesBound; +use crate::entry::{Entry, Keyed}; use crate::iterators::NonEmptyStream; +use crate::key::{KeyBytes, KeySlice}; use crate::manifest::{Manifest, ManifestRecord, NewMemtable}; use crate::memtable::immutable::ImmutableMemTable; use crate::memtable::iterator::{new_memtable_iter, MaybeEmptyMemTableIterRef}; -use crate::persistent::interface::{ManifestHandle, WalHandle}; +use crate::persistent::interface::WalHandle; use crate::persistent::Persistent; -use crate::state::Map; use crate::wal::Wal; /// A basic mem-table based on crossbeam-skiplist. 
@@ -30,7 +30,7 @@ use crate::wal::Wal; /// todo: MemTable 本质是 Map,可以抽象为 trait #[derive(Getters)] pub struct MemTable { - pub(self) map: SkipMap, + pub(self) map: SkipMap, wal: Option>, id: usize, @@ -61,7 +61,7 @@ impl MemTable { Self::new(id, SkipMap::new(), None) } - fn new(id: usize, map: SkipMap, wal: impl Into>>) -> Self { + fn new(id: usize, map: SkipMap, wal: impl Into>>) -> Self { Self { map, wal: wal.into(), @@ -107,25 +107,9 @@ impl MemTable { } /// Get a value by key. + /// todo: remove this method pub fn get(&self, key: &[u8]) -> Option { - self.map.get(key).map(|x| x.value().clone()) - } - - /// Put a key-value pair into the mem-table. - /// - /// In week 1, day 1, simply put the key-value pair into the skipmap. - /// In week 2, day 6, also flush the data to WAL. - pub async fn put(&self, key: Bytes, value: Bytes) -> anyhow::Result<()> { - let size = key.len() + value.len(); - if let Some(wal) = self.wal.as_ref() { - wal.put(key.as_bytes(), value.as_bytes()) - .instrument(tracing::info_span!("wal_put")) - .await? - } - self.map.insert(key, value); - self.approximate_size.fetch_add(size, Ordering::Release); - - Ok(()) + self.get_with_ts(KeySlice::new(key, 0)) } pub async fn sync_wal(&self) -> anyhow::Result<()> { @@ -141,11 +125,65 @@ impl MemTable { } /// Get an iterator over a range of keys. + /// todo: remove this method pub async fn scan<'a>( &'a self, lower: Bound<&'a [u8]>, upper: Bound<&'a [u8]>, ) -> anyhow::Result> { + self.scan_with_ts( + lower.map(|k| KeyBytes::new(Bytes::copy_from_slice(k), 0)), + upper.map(|k| KeyBytes::new(Bytes::copy_from_slice(k), 0)), + ) + .await + } +} + +// with transaction +impl MemTable { + pub fn get_with_ts(&self, key: KeySlice) -> Option { + // todo: 因为 rust 的 Borrow trait 不够 general,这里需要复制,如何避免?
+ let key = key.map(Bytes::copy_from_slice); + self.map.get(&key).map(|x| x.value().clone()) + } + + pub async fn put_with_ts(&self, key: KeyBytes, value: Bytes) -> anyhow::Result<()> { + let KeyBytes { key, timestamp } = key; + let entry = Entry::new(key, value); + self.put_batch(slice::from_ref(&entry), timestamp).await + } + + pub async fn put_batch(&self, entries: &[Entry], timestamp: u64) -> anyhow::Result<()> { + // todo: entries 可以改成 iterator + if let Some(wal) = self.wal.as_ref() { + let entries = entries + .iter() + .map(|e| Keyed::new(e.key.as_ref(), e.value.as_ref())); + wal.put_batch(entries, timestamp) + .instrument(tracing::info_span!("wal_put")) + .await?; + } + let mut size = 0; + for entry in entries { + let Entry { key, value } = entry; + let key = KeyBytes::new(key.clone(), timestamp); + let value = value.clone(); + + size += key.len() + value.len(); + self.map.insert(key, value); + } + self.approximate_size.fetch_add(size, Ordering::Release); + + Ok(()) + } + + pub async fn scan_with_ts( + &self, + lower: Bound, + upper: Bound, + ) -> anyhow::Result> { + // todo: 由于 rust 的 Borrow trait 的限制,这里只能 copy + let iter = self.map.range(BytesBound { start: lower, end: upper, @@ -155,10 +193,6 @@ impl MemTable { } } -fn build_path(dir: impl AsRef, id: usize) -> PathBuf { - dir.as_ref().join(format!("{}.wal", id)) -} - #[cfg(test)] impl MemTable { pub async fn for_testing_put_slice(&self, key: &[u8], value: &[u8]) -> anyhow::Result<()> { @@ -175,7 +209,20 @@ impl MemTable { lower: Bound<&'a [u8]>, upper: Bound<&'a [u8]>, ) -> anyhow::Result> { - self.scan(lower, upper).await + self.scan_with_ts( + lower.map(|k| KeyBytes::new(Bytes::copy_from_slice(k), 0)), + upper.map(|k| KeyBytes::new(Bytes::copy_from_slice(k), 0)), + ) + .await + } + + /// Put a key-value pair into the mem-table. + /// + /// In week 1, day 1, simply put the key-value pair into the skipmap. + /// In week 2, day 6, also flush the data to WAL. 
+ /// todo: remove this method + pub async fn put(&self, key: Bytes, value: Bytes) -> anyhow::Result<()> { + self.put_with_ts(KeyBytes::new(key, 0), value).await } } @@ -195,11 +242,15 @@ impl MemTable { #[cfg(test)] mod test { - use tempfile::tempdir; - + use crate::key::Key; use crate::manifest::Manifest; use crate::memtable::mutable::MemTable; + use crate::mvcc::iterator::transform_bound; use crate::persistent::LocalFs; + use crate::time::{TimeIncrement, TimeProvider}; + use bytes::Bytes; + use std::ops::Bound::Included; + use tempfile::tempdir; #[tokio::test] async fn test_task1_memtable_get_wal() { @@ -300,4 +351,39 @@ mod test { ); } } + + #[tokio::test] + async fn test_memtable_mvcc() { + let dir = tempdir().unwrap(); + let persistent = LocalFs::new(dir.path().to_path_buf()); + let manifest = Manifest::create(&persistent).await.unwrap(); + let id = 123; + + let memtable = MemTable::create_with_wal(id, &persistent, &manifest) + .await + .unwrap(); + let time_provider = Box::::default(); + + memtable + .put_with_ts( + Key::new(Bytes::copy_from_slice(b"key1"), time_provider.now()), + Bytes::copy_from_slice(b"value1"), + ) + .await + .unwrap(); + + { + let now = time_provider.now(); + let (lower, upper) = transform_bound(Included(b"key1"), Included(b"key1"), now); + let lower = lower.map(Key::from); + let upper = upper.map(Key::from); + let lower = lower.map(|ks| ks.map(|b| Bytes::copy_from_slice(b))); + let upper = upper.map(|ks| ks.map(|b| Bytes::copy_from_slice(b))); + let iter = memtable.scan_with_ts(lower, upper).await.unwrap(); + + let (new_iter, _) = iter.unwrap().next().await; + let new_iter = new_iter.unwrap(); + assert!(new_iter.is_none()); + } + } } diff --git a/src/mvcc/core.rs b/src/mvcc/core.rs new file mode 100644 index 0000000..646bf1e --- /dev/null +++ b/src/mvcc/core.rs @@ -0,0 +1,59 @@ +use std::collections::{BTreeMap, HashSet}; +use std::sync::Arc; + +use parking_lot::Mutex; + +use crate::mvcc::transaction::Transaction; +use
crate::mvcc::watermark::Watermark; +use crate::persistent::Persistent; +use crate::state::LsmStorageState; +use crate::time::TimeProvider; + +#[derive(Debug)] +pub struct CommittedTxnData { + pub key_hashes: HashSet, + pub read_ts: u64, + pub commit_ts: u64, +} + +pub type TimeProviderWrapper = Box; + +pub struct LsmMvccInner { + pub ts: Arc>, + pub committed_txns: Arc>>, +} + +impl LsmMvccInner { + pub fn new(initial_ts: u64) -> Self { + Self { + ts: Arc::new(Mutex::new((initial_ts, Watermark::new()))), + committed_txns: Arc::new(tokio::sync::Mutex::new(BTreeMap::new())), + } + } + + pub fn latest_commit_ts(&self) -> u64 { + self.ts.lock().0 + } + + pub fn update_commit_ts(&self, ts: u64) { + self.ts.lock().0 = ts; + } + + /// All ts (strictly) below this ts can be garbage collected. + pub fn watermark(&self) -> u64 { + let ts = self.ts.lock(); + ts.1.watermark().unwrap_or(ts.0) + } + + pub fn new_txn<'a, P: Persistent>( + self: &Arc, + state: &'a LsmStorageState

, + serializable: bool, + ) -> Transaction<'a, P> { + let ts = { + let guard = self.ts.lock(); + guard.0 + }; + Transaction::new(ts, state, serializable, self.clone()) + } +} diff --git a/src/mvcc/iterator.rs b/src/mvcc/iterator.rs new file mode 100644 index 0000000..e555a45 --- /dev/null +++ b/src/mvcc/iterator.rs @@ -0,0 +1,319 @@ +use crate::bound::BoundRange; +use crate::entry::{Entry, InnerEntry, Keyed}; +use crate::iterators::inspect::{InspectIter, InspectIterImpl}; +use crate::iterators::lsm::LsmIterator; +use crate::iterators::no_deleted::new_no_deleted_iter; +use crate::iterators::{create_two_merge_iter, LockedLsmIter}; +use crate::key::KeyBytes; +use crate::mvcc::transaction::{RWSet, Transaction}; +use crate::persistent::Persistent; +use crate::utils::scoped::ScopedMutex; +use async_iter_ext::StreamTools; +use bytes::Bytes; +use crossbeam_skiplist::SkipMap; +use derive_new::new; +use futures::{stream, Stream, StreamExt, TryStreamExt}; +use num_traits::Bounded; +use ouroboros::self_referencing; +use std::future::ready; +use std::ops::Bound; + +struct TxnWithBound<'a, P: Persistent> { + txn: Transaction<'a, P>, + lower: Bound<&'a [u8]>, + upper: Bound<&'a [u8]>, +} + +#[self_referencing] +pub struct LockedTxnIterWithTxn<'a, P: Persistent> { + txn: TxnWithBound<'a, P>, + + #[borrows(txn)] + #[covariant] + iter: LockedTxnIter<'this, P>, +} + +impl<'a, P: Persistent> LockedTxnIterWithTxn<'a, P> { + pub fn new_(txn: Transaction<'a, P>, lower: Bound<&'a [u8]>, upper: Bound<&'a [u8]>) -> Self { + Self::new(TxnWithBound { txn, lower, upper }, |txn| { + txn.txn.scan(txn.lower, txn.upper) + }) + } + + pub async fn iter(&'a self) -> anyhow::Result> { + self.with_iter(|iter| iter.iter()).await + } +} + +#[derive(new)] +pub struct LockedTxnIter<'a, P: Persistent> { + local_storage: &'a SkipMap, + lsm_iter: LockedLsmIter<'a, P>, + key_hashes: Option<&'a ScopedMutex>, +} + +impl<'a, P: Persistent> LockedTxnIter<'a, P> { + pub async fn iter(&'a self) -> anyhow::Result> { 
+ let lsm_iter = self.lsm_iter.iter_with_delete().await?; + let local_iter = txn_local_iterator( + self.local_storage, + self.lsm_iter.lower.map(Bytes::copy_from_slice), + self.lsm_iter.upper.map(Bytes::copy_from_slice), + ); + let merged = create_two_merge_iter(local_iter, lsm_iter).await?; + let iter = new_no_deleted_iter(merged); + let iter = InspectIterImpl::inspect_stream(iter, |entry| { + let Ok(entry) = entry else { return }; + let Some(key_hashes) = self.key_hashes else { + return; + }; + let key = entry.key.as_ref(); + key_hashes.lock_with(|mut set| set.add_read_key(key)); + }); + let iter = Box::new(iter) as _; + Ok(iter) + } +} + +pub fn txn_local_iterator( + map: &SkipMap, + lower: Bound, + upper: Bound, +) -> impl Stream> + Unpin + Send + '_ { + let it = map.range((lower, upper)).map(|entry| { + let key = entry.key().clone(); + let value = entry.value().clone(); + let pair = Keyed::new(key, value); + Ok::<_, anyhow::Error>(pair) + }); + stream::iter(it) +} + +pub fn build_time_dedup_iter( + s: S, + timestamp_upper: T, +) -> impl Stream> + Unpin + Send +where + S: Stream> + Unpin + Send, + A: PartialEq + Send, + E: Send, + T: PartialOrd + Copy + Send + Sync, +{ + s.try_filter(move |(_, timestamp)| { + // todo: use binary search? 
+ let condition = timestamp.le(×tamp_upper); + ready(condition) + }) + .dedup_by(|left, right| match (left, right) { + (Ok((left, _)), Ok((right, _))) => left.eq(right), + _ => false, + }) + .map(|entry| entry.map(|pair| pair.0)) +} + +pub trait WatermarkGcIter { + type Stream> + Send + Unpin>: Stream> + + Send + + Unpin; + + fn build_watermark_gc_iter(s: S, watermark: u64) -> Self::Stream + where + S: Stream> + Send + Unpin; +} + +pub struct WatermarkGcIterImpl; + +impl WatermarkGcIter for WatermarkGcIterImpl { + type Stream> + Send + Unpin> = + impl Stream> + Send + Unpin; + + fn build_watermark_gc_iter(s: S, watermark: u64) -> Self::Stream + where + S: Stream> + Send + Unpin, + { + let result = s + .scan(None, move |state: &mut Option, entry| { + let item = Some(build_item(state, entry, watermark)); + ready(item) + }) + .filter_map(|entry| async { entry }); + + // todo: remove Box::pin? + Box::pin(result) + } +} + +fn build_item( + state: &mut Option, + entry: anyhow::Result, + watermark: u64, +) -> Option> { + match entry { + Err(e) => Some(Err(e)), + Ok(entry) => { + if let Some(prev_key) = state { + if prev_key.key == entry.key.key { + return None; + } + } + if entry.key.timestamp > watermark { + Some(Ok(entry)) + } else { + *state = Some(entry.key.clone()); + if entry.value.is_empty() { + // is delete + None + } else { + Some(Ok(entry)) + } + } + } + } +} + +pub fn transform_bound(lower: Bound, upper: Bound, timestamp: T) -> BoundRange<(A, T)> +where + T: Bounded, +{ + ( + transform_lower_bound(lower, timestamp), + transform_upper_bound(upper), + ) +} + +fn transform_lower_bound(lower: Bound, timestamp: T) -> Bound<(A, T)> +where + T: Bounded, +{ + use Bound::{Excluded, Included, Unbounded}; + match lower { + Included(a) => Included((a, timestamp)), + Excluded(a) => Excluded((a, T::min_value())), + Unbounded => Unbounded, + } +} + +fn transform_upper_bound(upper: Bound) -> Bound<(A, T)> +where + T: Bounded, +{ + use Bound::{Excluded, Included, Unbounded}; + 
match upper { + Included(a) => Included((a, T::min_value())), + Excluded(a) => Excluded((a, T::max_value())), + Unbounded => Unbounded, + } +} + +#[cfg(test)] +mod tests { + use crate::entry::{InnerEntry, Keyed}; + use crate::iterators::utils::test_utils::assert_stream_eq; + use crate::key::KeyBytes; + use crate::mvcc::iterator::{build_time_dedup_iter, WatermarkGcIter, WatermarkGcIterImpl}; + use bytes::Bytes; + use futures::{stream, Stream, StreamExt, TryStreamExt}; + + #[tokio::test] + async fn test_build_time_dedup_iter() { + test_time_dedup_iter_helper([(Keyed::new("a", "a1"), 1)], 3, [Keyed::new("a", "a1")]).await; + test_time_dedup_iter_helper( + [ + (Keyed::new("a", "a1"), 1), + (Keyed::new("a", "a2"), 2), + (Keyed::new("b", "b3"), 3), + ], + 2, + [Keyed::new("a", "a2")], + ) + .await; + test_time_dedup_iter_helper( + [ + (Keyed::new("a", "a1"), 1), + (Keyed::new("a", "a2"), 2), + (Keyed::new("a", "a3"), 3), + (Keyed::new("b", "b2"), 2), + (Keyed::new("c", "c1"), 1), + (Keyed::new("c", "c3"), 3), + ], + 2, + [ + Keyed::new("a", "a2"), + Keyed::new("b", "b2"), + Keyed::new("c", "c1"), + ], + ) + .await; + } + + async fn test_time_dedup_iter_helper(s: I, timestamp_upper: u64, expected: S) + where + I: IntoIterator, u64)>, + I::IntoIter: Send, + S: IntoIterator>, + { + let s = stream::iter(s).map(Ok::<_, ()>); + let result: Vec<_> = build_time_dedup_iter(s, timestamp_upper) + .try_collect() + .await + .unwrap(); + let expected: Vec<_> = expected.into_iter().collect(); + assert_eq!(result, expected); + } + + #[tokio::test] + async fn test_watermark_gc() { + test_watermark_gc_helper( + [("a", "a", 0), ("b", "b", 0)], + 5, + [("a", "a", 0), ("b", "b", 0)], + ) + .await; + test_watermark_gc_helper( + [("a", "a", 3), ("a", "a", 2), ("b", "b", 0)], + 2, + [("a", "a", 3), ("a", "a", 2), ("b", "b", 0)], + ) + .await; + test_watermark_gc_helper( + [("a", "a", 3), ("a", "a", 2), ("b", "b", 5)], + 2, + [("a", "a", 3), ("a", "a", 2), ("b", "b", 5)], + ) + .await; + 
test_watermark_gc_helper( + [("a", "a", 2), ("a", "a", 1), ("b", "b", 5)], + 3, + [("a", "a", 2), ("b", "b", 5)], + ) + .await; + test_watermark_gc_helper([("a", "", 2), ("a", "a", 1)], 2, []).await; + } + + async fn test_watermark_gc_helper(input: S1, watermark: u64, expected_output: S2) + where + S1: IntoIterator, + S1::IntoIter: Send, + S2: IntoIterator, + S2::IntoIter: Send, + { + fn transform( + iter: impl IntoIterator, + ) -> impl Stream> { + let s = iter.into_iter().map(|(key, value, timestamp)| { + Ok(InnerEntry::new( + KeyBytes::new(Bytes::from(key), timestamp), + Bytes::from(value), + )) + }); + stream::iter(s) + } + + assert_stream_eq( + WatermarkGcIterImpl::build_watermark_gc_iter(transform(input), watermark) + .map(Result::unwrap), + transform(expected_output).map(Result::unwrap), + ) + .await; + } +} diff --git a/src/mvcc/mod.rs b/src/mvcc/mod.rs new file mode 100644 index 0000000..ffa40cd --- /dev/null +++ b/src/mvcc/mod.rs @@ -0,0 +1,4 @@ +pub mod core; +pub mod iterator; +pub mod transaction; +mod watermark; diff --git a/src/mvcc/transaction.rs b/src/mvcc/transaction.rs new file mode 100644 index 0000000..407621e --- /dev/null +++ b/src/mvcc/transaction.rs @@ -0,0 +1,199 @@ +use anyhow::anyhow; +use bytes::Bytes; +use crossbeam_skiplist::SkipMap; +use std::collections::{Bound, HashSet}; +use std::ops::Bound::Excluded; +use std::slice; +use std::sync::Arc; +use tokio_stream::StreamExt; + +use crate::entry::Entry; +use crate::iterators::LockedLsmIter; +use crate::mvcc::core::{CommittedTxnData, LsmMvccInner}; +use crate::mvcc::iterator::LockedTxnIter; +use crate::persistent::Persistent; +use crate::state::write_batch::WriteBatchRecord; +use crate::state::{LsmStorageState, Map}; +use crate::utils::scoped::ScopedMutex; + +#[derive(Debug, Default)] +pub struct RWSet { + read_set: HashSet, + write_set: HashSet, +} + +impl RWSet { + pub fn add_read_key(&mut self, key: &[u8]) { + self.read_set.insert(farmhash::hash32(key)); + } + + pub fn add_write_key(&mut 
self, key: &[u8]) { + self.write_set.insert(farmhash::hash32(key)); + } +} + +pub struct Transaction<'a, P: Persistent> { + pub(crate) read_ts: u64, + pub(crate) state: &'a LsmStorageState

, + + // todo: need Arc<...> ? + pub(crate) local_storage: Arc>, + + /// Write set and read set + /// todo: check deadlock? + pub(crate) key_hashes: Option>, + + mvcc: Arc, +} + +// todo: no need for async +impl<'a, P: Persistent> Map for Transaction<'a, P> { + type Error = anyhow::Error; + + async fn get(&self, key: &[u8]) -> Result, Self::Error> { + let guard = self.scan(Bound::Included(key), Bound::Included(key)); + let output = guard + .iter() + .await? + .next() + .await + .transpose()? + .map(|entry| entry.value); + if let Some(key_hashes) = self.key_hashes.as_ref() { + key_hashes.lock_with(|mut set| set.add_read_key(key)); + } + Ok(output) + } + + async fn put( + &self, + key: impl Into + Send, + value: impl Into + Send, + ) -> Result<(), Self::Error> { + let key = key.into(); + let value = value.into(); + let record = WriteBatchRecord::Put(key, value); + self.write_batch(slice::from_ref(&record)); + Ok(()) + } + + async fn delete(&self, key: impl Into + Send) -> Result<(), Self::Error> { + let key = key.into(); + if let Some(key_hashes) = self.key_hashes.as_ref() { + key_hashes.lock_with(|mut set| set.add_write_key(key.as_ref())); + } + self.put(key, Bytes::new()).await + } +} + +impl<'a, P: Persistent> Transaction<'a, P> { + pub fn new( + read_ts: u64, + state: &'a LsmStorageState

, + serializable: bool, + mvcc: Arc, + ) -> Self { + { + let mut guard = mvcc.ts.lock(); + guard.1.add_reader(read_ts); + } + Self { + read_ts, + state, + local_storage: Arc::default(), + key_hashes: serializable.then(ScopedMutex::default), + mvcc, + } + } + + pub fn write_batch(&self, batch: &[WriteBatchRecord]) { + if let Some(key_hashes) = self.key_hashes.as_ref() { + key_hashes.lock_with(|mut set| { + for record in batch { + set.add_write_key(record.get_key().as_ref()); + } + }); + } + + for record in batch { + let pair = record.clone().into_keyed(); + self.local_storage.insert(pair.key, pair.value); + } + } + + // todo: no need for Result? + pub fn scan(&'a self, lower: Bound<&'a [u8]>, upper: Bound<&'a [u8]>) -> LockedTxnIter<'a, P> { + let inner = self.state.inner.load_full(); + let inner_iter = LockedLsmIter::new(inner, lower, upper, self.read_ts); + let guard = LockedTxnIter::new(&self.local_storage, inner_iter, self.key_hashes.as_ref()); + guard + } + + // todo: 区分 snapshot isolation vs serializable isolation + pub async fn commit(mut self) -> anyhow::Result<()> { + let mut commit_guard = self.mvcc.committed_txns.lock().await; + + let expected_commit_ts = { + // todo: 这里的锁可以去掉? 
+ let guard = self.mvcc.ts.lock(); + guard.0 + 1 + }; + let key_hashes = self.key_hashes.take().map(ScopedMutex::into_inner); + let conflict = if let Some(key_hashes) = key_hashes.as_ref() { + let range = (Excluded(self.read_ts), Excluded(expected_commit_ts)); + let read_set = &key_hashes.read_set; + commit_guard + .range(range) + .any(|(_, data)| !data.key_hashes.is_disjoint(read_set)) + } else { + false + }; + if conflict { + return Err(anyhow!("commit conflict")); + } + + // todo: avoid collecting + let entries: Vec<_> = self + .local_storage + .iter() + .map(|e| Entry::new(e.key().clone(), e.value().clone())) + .collect(); + // todo: 如果 write_batch 失败怎么保证 atomicity + self.state.write_batch(&entries, expected_commit_ts).await?; + self.mvcc.update_commit_ts(expected_commit_ts); + + if let Some(key_hashes) = key_hashes { + let committed_data = CommittedTxnData { + key_hashes: key_hashes.write_set, + read_ts: self.read_ts, + commit_ts: expected_commit_ts, + }; + commit_guard.insert(expected_commit_ts, committed_data); + } + + Ok(()) + } +} + +impl<'a, P: Persistent> Drop for Transaction<'a, P> { + fn drop(&mut self) { + let mut guard = self.mvcc.ts.lock(); + guard.1.remove_reader(self.read_ts); + } +} + +#[cfg(test)] +impl<'a, P: Persistent> Transaction<'a, P> { + pub async fn get_for_test(&self, key: &[u8]) -> anyhow::Result> { + self.get(key).await + } + + pub async fn put_for_test(&self, key: &[u8], value: &[u8]) -> anyhow::Result<()> { + self.put(Bytes::copy_from_slice(key), Bytes::copy_from_slice(value)) + .await + } + + pub async fn delete_for_test(&self, key: &[u8]) -> anyhow::Result<()> { + self.delete(Bytes::copy_from_slice(key)).await + } +} diff --git a/src/mvcc/watermark.rs b/src/mvcc/watermark.rs new file mode 100644 index 0000000..fa6d491 --- /dev/null +++ b/src/mvcc/watermark.rs @@ -0,0 +1,89 @@ +use std::collections::BTreeMap; + +pub struct Watermark { + readers: BTreeMap, +} + +impl Default for Watermark { + fn default() -> Self { + Self::new() + } 
+} + +impl Watermark { + pub fn new() -> Self { + Self { + readers: BTreeMap::new(), + } + } + + pub fn add_reader(&mut self, ts: u64) { + self.readers + .entry(ts) + .and_modify(|count| *count += 1) + .or_insert(1); + } + + pub fn remove_reader(&mut self, ts: u64) { + let count = self.readers.get_mut(&ts).unwrap(); + if *count == 1 { + self.readers.remove(&ts); + } else { + *count -= 1; + } + } + + pub fn watermark(&self) -> Option { + self.readers.keys().copied().next() + } +} + +#[cfg(test)] +impl Watermark { + fn num_retained_snapshots(&self) -> usize { + self.readers.len() + } +} + +#[cfg(test)] +mod tests { + use crate::mvcc::watermark::Watermark; + + #[test] + fn test_task1_watermark() { + let mut watermark = Watermark::new(); + watermark.add_reader(0); + for i in 1..=1000 { + watermark.add_reader(i); + assert_eq!(watermark.watermark(), Some(0)); + assert_eq!(watermark.num_retained_snapshots(), i as usize + 1); + } + let mut cnt = 1001; + for i in 0..500 { + watermark.remove_reader(i); + assert_eq!(watermark.watermark(), Some(i + 1)); + cnt -= 1; + assert_eq!(watermark.num_retained_snapshots(), cnt); + } + for i in (501..=1000).rev() { + watermark.remove_reader(i); + assert_eq!(watermark.watermark(), Some(500)); + cnt -= 1; + assert_eq!(watermark.num_retained_snapshots(), cnt); + } + watermark.remove_reader(500); + assert_eq!(watermark.watermark(), None); + assert_eq!(watermark.num_retained_snapshots(), 0); + watermark.add_reader(2000); + watermark.add_reader(2000); + watermark.add_reader(2001); + assert_eq!(watermark.num_retained_snapshots(), 2); + assert_eq!(watermark.watermark(), Some(2000)); + watermark.remove_reader(2000); + assert_eq!(watermark.num_retained_snapshots(), 2); + assert_eq!(watermark.watermark(), Some(2000)); + watermark.remove_reader(2000); + assert_eq!(watermark.num_retained_snapshots(), 1); + assert_eq!(watermark.watermark(), Some(2001)); + } +} diff --git a/src/persistent/file_object.rs b/src/persistent/file_object.rs index 
1eacee5..bcfd3a2 100644 --- a/src/persistent/file_object.rs +++ b/src/persistent/file_object.rs @@ -1,20 +1,18 @@ use anyhow::Context; -use bytes::Bytes; -use std::fs::{File, OpenOptions}; -use std::future::Future; -use std::io::Write; + +use std::fs::File; + use std::os::unix::fs::FileExt; use std::path::PathBuf; use std::sync::Arc; use derive_new::new; -use nom::AsBytes; + use tokio::io::BufWriter; -use tokio::spawn; + use tokio::task::spawn_blocking; use tracing::Instrument; -use crate::persistent::interface::WalHandle; use crate::persistent::manifest_handle::ManifestFile; use crate::persistent::wal_handle::WalFile; use crate::persistent::{Persistent, SstHandle}; @@ -45,7 +43,6 @@ impl Persistent for LocalFs { /// Create a new file object (day 2) and write the file to the disk (day 4). async fn create_sst(&self, id: usize, data: Vec) -> anyhow::Result { - println!("create sst {}", id); let size = data.len().try_into()?; let path = self.build_sst_path(id); let file = spawn_blocking(move || { @@ -77,7 +74,6 @@ impl Persistent for LocalFs { } async fn open_wal_handle(&self, id: usize) -> anyhow::Result { - println!("open wal {}", id); let path = self.build_wal_path(id); let file = tokio::fs::OpenOptions::new() .create(true) diff --git a/src/persistent/wal_handle.rs b/src/persistent/wal_handle.rs index f05f171..59d03d1 100644 --- a/src/persistent/wal_handle.rs +++ b/src/persistent/wal_handle.rs @@ -1,5 +1,5 @@ use derive_new::new; -use std::future::Future; + use std::io::Error; use std::pin::{pin, Pin}; use std::task::{Context, Poll}; diff --git a/src/sst/block_meta.rs b/src/sst/block_meta.rs index 360457f..2ee14bd 100644 --- a/src/sst/block_meta.rs +++ b/src/sst/block_meta.rs @@ -1,4 +1,4 @@ -use crate::key::KeyBytes; +use crate::key::{KeyBytes, KeySlice}; use bytes::Buf; use derive_getters::Getters; @@ -7,36 +7,32 @@ use derive_getters::Getters; pub struct BlockMeta { /// Offset of this data block. pub offset: usize, + /// The first key of the data block. 
+ #[getter(skip)] pub first_key: KeyBytes, + /// The last key of the data block. + #[getter(skip)] pub last_key: KeyBytes, } impl BlockMeta { pub fn encode(&self) -> impl Iterator + '_ { let offset = (self.offset as u16).to_be_bytes().into_iter(); - let first_key_len = (self.first_key.len() as u16).to_be_bytes().into_iter(); - let first_key = self.first_key.raw_ref().iter().copied(); - let last_key_len = (self.last_key.len() as u16).to_be_bytes().into_iter(); - let last_key = self.last_key.raw_ref().iter().copied(); offset - .chain(first_key_len) - .chain(first_key) - .chain(last_key_len) - .chain(last_key) + .chain(self.first_key.to_byte_iter()) + .chain(self.last_key.to_byte_iter()) } pub fn decode(mut data: impl Buf) -> Self { let offset = data.get_u16() as usize; - let first_key_len = data.get_u16() as usize; - let first_key = data.copy_to_bytes(first_key_len); - let last_key_len = data.get_u16() as usize; - let last_key = data.copy_to_bytes(last_key_len); + let first_key = KeyBytes::decode(&mut data); + let last_key = KeyBytes::decode(&mut data); Self { offset, - first_key: KeyBytes::from_bytes(first_key), - last_key: KeyBytes::from_bytes(last_key), + first_key, + last_key, } } @@ -47,4 +43,12 @@ impl BlockMeta { } result } + + pub fn first_key(&self) -> KeySlice { + KeySlice::new(self.first_key.raw_ref(), self.first_key.timestamp()) + } + + pub fn last_key(&self) -> KeySlice { + KeySlice::new(self.last_key.raw_ref(), self.last_key.timestamp()) + } } diff --git a/src/sst/bloom.rs b/src/sst/bloom.rs index 322740a..fe45976 100644 --- a/src/sst/bloom.rs +++ b/src/sst/bloom.rs @@ -71,7 +71,7 @@ impl Bloom { /// Build bloom filter from key hashes pub fn build_from_key_hashes(keys: &[u32], bits_per_key: usize) -> Self { let k = (bits_per_key as f64 * 0.69) as u32; - let k = k.min(30).max(1); + let k = k.clamp(1, 30); let nbits = (keys.len() * bits_per_key).max(64); let nbytes = (nbits + 7) / 8; let nbits = nbytes * 8; @@ -103,7 +103,7 @@ impl Bloom { } fn 
compute_index(value: u32, num_hash: u8, nbits: usize) -> impl Iterator { - let delta = (value >> 17) | (value << 15); + let delta = value.rotate_left(15); (0..num_hash).scan(value, move |h, _| { let new_h = (*h).wrapping_add(delta); @@ -122,7 +122,7 @@ pub fn may_contain(bloom: Option<&Bloom>, key: &[u8]) -> bool { #[cfg(test)] mod tests { use crate::sst::bloom::Bloom; - use crate::sst::builder::{key_of, num_of_keys}; + use crate::sst::builder::test_util::{key_of, num_of_keys}; #[test] fn test_task1_bloom_filter() { diff --git a/src/sst/builder.rs b/src/sst/builder.rs index c37aa68..460f3a7 100644 --- a/src/sst/builder.rs +++ b/src/sst/builder.rs @@ -1,19 +1,15 @@ use std::mem; -use std::ops::Bound::Unbounded; use std::sync::Arc; use anyhow::Result; use bytes::BufMut; use nom::AsBytes; -#[cfg(test)] -use tempfile::TempDir; use crate::block::{BlockBuilder, BlockCache}; -use crate::key::{KeySlice, KeyVec}; +use crate::key::KeySlice; use crate::memtable::ImmutableMemTable; -use crate::persistent::file_object::FileObject; use crate::persistent::interface::WalHandle; -use crate::persistent::{LocalFs, Persistent}; +use crate::persistent::Persistent; use crate::sst::bloom::Bloom; use crate::sst::{BlockMeta, SsTable}; @@ -24,6 +20,7 @@ pub struct SsTableBuilder { pub(crate) meta: Vec, block_size: usize, key_hashes: Vec, + max_ts: u64, // todo: use Option } impl SsTableBuilder { @@ -35,6 +32,7 @@ impl SsTableBuilder { meta: Vec::new(), block_size, key_hashes: Vec::new(), + max_ts: 0, } } @@ -49,6 +47,7 @@ impl SsTableBuilder { self.key_hashes.push(farmhash::fingerprint32(key.raw_ref())); return; } + self.max_ts = self.max_ts.max(key.timestamp()); let old_builder = mem::replace(&mut self.builder, BlockBuilder::new(self.block_size)); Self::add_block(&mut self.data, &mut self.meta, old_builder); self.add(key, value); @@ -94,6 +93,7 @@ impl SsTableBuilder { mut meta, block_size: _, key_hashes, + max_ts, } = self; // add last block @@ -102,6 +102,7 @@ impl SsTableBuilder { } // 
first/last key + // todo: unwrap 能不能去掉? let first_key = meta.first().unwrap().first_key.clone(); let last_key = meta.last().unwrap().last_key.clone(); @@ -121,6 +122,9 @@ impl SsTableBuilder { bloom.encode(&mut data); data.put_u32(bloom_offset); + // encode max_ts + data.put_u64(max_ts); + let file = persistent.create_sst(id, data).await?; let table = SsTable::builder() @@ -132,7 +136,7 @@ impl SsTableBuilder { .first_key(first_key) .last_key(last_key) .bloom(Some(bloom)) - .max_ts(0) + .max_ts(max_ts) .build(); Ok(table) @@ -144,56 +148,69 @@ impl SsTableBuilder { pub fn flush(&mut self, memtable: &ImmutableMemTable) { for entry in memtable.iter() { - self.add( - KeySlice::from_slice(entry.key().as_bytes()), - entry.value().as_bytes(), - ); + self.add(entry.key().as_key_slice(), entry.value().as_bytes()); } } } #[cfg(test)] -impl SsTableBuilder { - async fn build_for_test(self, dir: &TempDir, id: usize) -> anyhow::Result> { - let persistent = LocalFs::new(dir.path().to_path_buf()); - self.build(id, None, &persistent).await +pub mod test_util { + use tempfile::TempDir; + + use crate::key::KeyVec; + use crate::persistent::file_object::FileObject; + use crate::persistent::LocalFs; + use crate::sst::{SsTable, SsTableBuilder}; + + impl SsTableBuilder { + pub(crate) async fn build_for_test( + self, + dir: &TempDir, + id: usize, + ) -> anyhow::Result> { + let persistent = LocalFs::new(dir.path().to_path_buf()); + self.build(id, None, &persistent).await + } } -} -#[cfg(test)] -pub fn key_of(idx: usize) -> KeyVec { - KeyVec::for_testing_from_vec_no_ts(format!("key_{:03}", idx * 5).into_bytes()) -} + pub fn key_of(idx: usize) -> KeyVec { + KeyVec::for_testing_from_vec_no_ts(format!("key_{:03}", idx * 5).into_bytes()) + } -#[cfg(test)] -pub fn value_of(idx: usize) -> Vec { - format!("value_{:010}", idx).into_bytes() -} + pub fn value_of(idx: usize) -> Vec { + format!("value_{:010}", idx).into_bytes() + } -#[cfg(test)] -pub fn num_of_keys() -> usize { - 100 -} + pub fn 
num_of_keys() -> usize { + 100 + } -#[cfg(test)] -pub async fn generate_sst(dir: &TempDir) -> SsTable { - let mut builder = SsTableBuilder::new(128); - for idx in 0..num_of_keys() { - let key = key_of(idx); - let value = value_of(idx); - builder.add(key.as_key_slice(), &value[..]); + pub async fn generate_sst(dir: &TempDir) -> SsTable { + let mut builder = SsTableBuilder::new(128); + for idx in 0..num_of_keys() { + let key = key_of(idx); + let value = value_of(idx); + builder.add(key.as_key_slice(), &value[..]); + } + builder.build_for_test(dir, 1).await.unwrap() } - builder.build_for_test(dir, 1).await.unwrap() } #[cfg(test)] mod tests { + use std::sync::Arc; + + use bytes::Bytes; use tempfile::tempdir; - use crate::key::KeySlice; - use crate::persistent::LocalFs; - use crate::sst::builder::{key_of, num_of_keys, value_of}; + use crate::block::BlockCache; + + use crate::key::{Key, KeySlice}; + use crate::persistent::{LocalFs, Persistent}; + use crate::sst::builder::test_util::{key_of, num_of_keys, value_of}; + use crate::sst::iterator::SsTableIterator; use crate::sst::{SsTable, SsTableBuilder}; + use futures::StreamExt; #[tokio::test] async fn test_sst_build_single_key() { @@ -237,12 +254,90 @@ mod tests { let dir = tempdir().unwrap(); let sst = builder.build_for_test(&dir, 1).await.unwrap(); assert!( - sst.block_meta.len() <= 25, + sst.block_meta.len() <= 40, "you have {} blocks, expect 25", sst.block_meta.len() ); } + #[tokio::test] + async fn test_sst_build_multi_version_simple() { + let mut builder = SsTableBuilder::new(16); + builder.add( + KeySlice::for_testing_from_slice_with_ts(b"233", 233), + b"233333", + ); + builder.add( + KeySlice::for_testing_from_slice_with_ts(b"233", 0), + b"2333333", + ); + let dir = tempdir().unwrap(); + builder.build_for_test(&dir, 1).await.unwrap(); + } + + // todo: add test + #[tokio::test] + async fn test_sst_build_multi_version_hard() { + let dir = tempdir().unwrap(); + let persistent = LocalFs::new(dir.path().to_path_buf()); 
+ let data = generate_test_data(); + let _ = generate_sst_with_ts(1, &persistent, data.clone(), None).await; + let sst = SsTable::open(1, None, &persistent).await.unwrap(); + let result: Vec<_> = SsTableIterator::create_and_seek_to_first(&sst) + .map(|entry| { + let entry = entry.unwrap(); + let Key { key, timestamp } = entry.key; + let value = entry.value; + ((key, timestamp), value) + }) + .collect() + .await; + assert_eq!(data, result); + } + + #[tokio::test] + async fn test_task3_sst_ts() { + let mut builder = SsTableBuilder::new(16); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"11", 1), b"11"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"22", 2), b"22"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"33", 3), b"11"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"44", 4), b"22"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"55", 5), b"11"); + builder.add(KeySlice::for_testing_from_slice_with_ts(b"66", 6), b"22"); + let dir = tempdir().unwrap(); + let sst = builder.build_for_test(&dir, 1).await.unwrap(); + assert_eq!(*sst.max_ts(), 6); + } + + #[allow(dead_code)] + pub async fn generate_sst_with_ts( + id: usize, + persistent: &P, + data: Vec<((Bytes, u64), Bytes)>, + block_cache: Option>, + ) -> SsTable { + let mut builder = SsTableBuilder::new(128); + for ((key, ts), value) in data { + builder.add( + KeySlice::for_testing_from_slice_with_ts(&key[..], ts), + &value[..], + ); + } + builder.build(id, block_cache, persistent).await.unwrap() + } + + #[allow(dead_code)] + fn generate_test_data() -> Vec<((Bytes, u64), Bytes)> { + (0..100) + .map(|id| { + ( + (Bytes::from(format!("key{:05}", id / 5)), 5 - (id % 5)), + Bytes::from(format!("value{:05}", id)), + ) + }) + .collect() + } + fn get_builder() -> SsTableBuilder { let mut builder = SsTableBuilder::new(128); for idx in 0..num_of_keys() { diff --git a/src/sst/compact/common.rs b/src/sst/compact/common.rs index 30e9fd2..7332a44 100644 --- 
a/src/sst/compact/common.rs +++ b/src/sst/compact/common.rs @@ -1,31 +1,45 @@ -use crate::entry::Entry; +use crate::entry::InnerEntry; +use crate::iterators::create_two_merge_iter; use crate::iterators::merge::MergeIteratorInner; -use crate::iterators::{ - create_merge_iter_from_non_empty_iters, create_two_merge_iter, iter_fut_to_stream, - MergeIterator, NonEmptyStream, -}; -use crate::key::KeySlice; + use derive_new::new; use futures::{stream, Stream, StreamExt}; use getset::CopyGetters; use serde::{Deserialize, Serialize}; -use std::future::{ready, Future}; -use std::ops::{Range, RangeBounds}; -use std::sync::Arc; -use tracing::error; +use crate::mvcc::iterator::{WatermarkGcIter, WatermarkGcIterImpl}; use crate::persistent::{Persistent, SstHandle}; use crate::sst::iterator::{create_sst_concat_and_seek_to_first, SsTableIterator}; use crate::sst::{SsTable, SsTableBuilder, SstOptions, Sstables}; +use crate::utils::send::assert_send; +use futures::future::Either; +use std::ops::Range; +use std::sync::Arc; +use tracing::error; #[derive(Serialize, Deserialize, new, CopyGetters, PartialEq, Debug)] #[getset(get_copy = "pub")] pub struct CompactionTask { source: usize, - source_index: usize, + source_index: SourceIndex, destination: usize, } +#[derive(Serialize, Deserialize, PartialEq, Debug, Copy, Clone)] +pub enum SourceIndex { + Index { index: usize }, + Full { len: usize }, +} + +impl SourceIndex { + pub fn build_range(self) -> Range { + match self { + SourceIndex::Index { index } => index..index + 1, + SourceIndex::Full { len } => 0..len, + } + } +} + pub fn apply_compaction( sstables: &mut Sstables, source: Range, @@ -69,6 +83,7 @@ pub async fn compact_generate_new_sst<'a, P: Persistent, U, L>( next_sst_id: impl Fn() -> usize + Send + 'a + Sync, options: &'a SstOptions, persistent: &'a P, + watermark: Option, ) -> anyhow::Result>>> where U: IntoIterator> + Send + 'a, @@ -89,7 +104,14 @@ where let tables = lower_sstables.into_iter().collect(); 
create_sst_concat_and_seek_to_first(tables) }?; - let iter = assert_send(create_two_merge_iter(l0, l1)).await?; + let merged_iter = assert_send(create_two_merge_iter(l0, l1)).await?; + let iter = match watermark { + Some(watermark) => Either::Left(WatermarkGcIterImpl::build_watermark_gc_iter( + merged_iter, + watermark, + )), + None => Either::Right(merged_iter), + }; let s: Vec<_> = assert_send( stream::unfold(iter, |mut iter| async { let id = next_sst_id(); @@ -109,10 +131,6 @@ where Ok(s) } -fn assert_send(x: T) -> T { - x -} - async fn batch( iter: &mut I, sst_id: usize, @@ -122,7 +140,7 @@ async fn batch( ) -> Option>> where P: Persistent, - I: Stream> + Unpin, + I: Stream> + Unpin, { let mut builder = SsTableBuilder::new(block_size); @@ -133,11 +151,11 @@ where }; // 被删除的 entry 不再添加 - if entry.value.is_empty() { - continue; - } + // if entry.value.is_empty() { + // continue; + // } - let key = KeySlice::from_slice(entry.key.as_ref()); + let key = entry.key.as_key_slice(); let value = entry.value.as_ref(); builder.add(key, value); } diff --git a/src/sst/compact/full.rs b/src/sst/compact/full.rs new file mode 100644 index 0000000..764b317 --- /dev/null +++ b/src/sst/compact/full.rs @@ -0,0 +1,14 @@ +use crate::persistent::SstHandle; +use crate::sst::compact::common::{CompactionTask, SourceIndex}; +use crate::sst::Sstables; + +#[derive(Debug, Clone, Copy)] +pub struct LeveledCompactionOptions; + +pub fn generate_full_compaction_task( + sstables: &Sstables, +) -> Option { + let len = sstables.l0_sstables.len(); + let task = CompactionTask::new(0, SourceIndex::Full { len }, 1); + Some(task) +} diff --git a/src/sst/compact/leveled.rs b/src/sst/compact/leveled.rs index 42b53b6..db9e02f 100644 --- a/src/sst/compact/leveled.rs +++ b/src/sst/compact/leveled.rs @@ -1,21 +1,19 @@ use std::cmp::max; -use std::future::{ready, Future}; use std::iter; -use std::sync::Arc; use crate::manifest::{Compaction, Manifest, ManifestRecord}; use derive_new::new; use 
getset::CopyGetters; -use ordered_float::NotNan; -use tokio::sync::MutexGuard; -use tracing::{info, trace}; + use typed_builder::TypedBuilder; use crate::persistent::{Persistent, SstHandle}; -use crate::sst::compact::common::{apply_compaction, compact_generate_new_sst, CompactionTask}; -use crate::sst::compact::CompactionOptions::Leveled; -use crate::sst::{SsTable, SstOptions, Sstables}; +use crate::sst::compact::common::{compact_generate_new_sst, CompactionTask, SourceIndex}; +use crate::sst::compact::full::generate_full_compaction_task; +use crate::sst::compact::CompactionOptions::{Full, Leveled, NoCompaction}; +use crate::sst::{SstOptions, Sstables}; use crate::utils::num::power_of_2; +use crate::utils::send::assert_send; #[derive(Debug, Clone, new, TypedBuilder, CopyGetters)] #[getset(get_copy = "pub")] @@ -44,18 +42,33 @@ impl LeveledCompactionOptions { } } +// todo: move this function out of leveled.rs pub async fn force_compact( sstables: &mut Sstables, next_sst_id: impl Fn() -> usize + Send + Sync, options: &SstOptions, persistent: &P, manifest: Option<&Manifest>, + watermark: Option, ) -> anyhow::Result<()> { - let Some(task) = generate_task(sstables, options) else { + // todo: 这个可以提到外面,就不用 clone state 了 + let Some(task) = (match options.compaction_option() { + Leveled(options) => generate_task(options, sstables), + Full => generate_full_compaction_task(sstables), + NoCompaction => None, + }) else { return Ok(()); }; - let new_sst_ids = compact_with_task(sstables, next_sst_id, options, persistent, &task).await?; + let new_sst_ids = assert_send(compact_with_task( + sstables, + next_sst_id, + options, + persistent, + &task, + watermark, + )) + .await?; if let Some(manifest) = manifest { let record = ManifestRecord::Compaction(Compaction(task, new_sst_ids)); @@ -71,31 +84,38 @@ pub async fn compact_with_task( options: &SstOptions, persistent: &P, task: &CompactionTask, + watermark: Option, ) -> anyhow::Result> { let source = task.source(); - let source_index 
= task.source_index(); - let source_id = *sstables.table_ids(source).get(source_index).unwrap(); - let source_level = sstables.sstables.get(&source_id).unwrap().as_ref(); + let source_level: Vec<_> = match task.source_index() { + SourceIndex::Index { index } => { + let source_id = *sstables.table_ids(source).get(index).unwrap(); + let source_level = sstables.sstables.get(&source_id).unwrap().as_ref(); + let source = iter::once(source_level); + source.collect() + } + SourceIndex::Full { .. } => { + let source = sstables.tables(source); + source.collect() + } + }; + let destination = task.destination(); - let new_sst = compact_generate_new_sst( - iter::once(source_level), + let new_sst = assert_send(compact_generate_new_sst( + source_level, sstables.tables(destination), next_sst_id, options, persistent, - ) + watermark, + )) .await?; let new_sst_ids: Vec<_> = new_sst.iter().map(|table| table.id()).copied().collect(); - apply_compaction( - sstables, - source_index..source_index + 1, - source, - destination, - new_sst, - ); + sstables.apply_compaction_sst(new_sst, task); + sstables.apply_compaction_sst_ids(task, new_sst_ids.clone()); Ok(new_sst_ids) } @@ -118,7 +138,6 @@ fn select_level_source( *level_size as f64 / denominator as f64 }) .collect(); - // println!("max_bytes_for_level_base={}, scores={:?}", max_bytes_for_level_base, scores); let source = scores .iter() .enumerate() @@ -178,14 +197,9 @@ fn compute_target_sizes(last_level_size: u64, options: &LeveledCompactionOptions } pub fn generate_task( + compact_options: &LeveledCompactionOptions, sstables: &Sstables, - options: &SstOptions, ) -> Option { - let Leveled(compact_options) = options.compaction_option() else { - trace!("skip force compaction"); - return None; - }; - let level_sizes = compute_level_sizes(sstables); let target_sizes = compute_target_sizes(*level_sizes.last().unwrap(), compact_options); @@ -204,7 +218,13 @@ pub fn generate_task( .enumerate() .min_by(|(_, left_id), (_, right_id)| 
left_id.cmp(right_id))?; - let task = CompactionTask::new(source, source_index, destination); + let task = CompactionTask::new( + source, + SourceIndex::Index { + index: source_index, + }, + destination, + ); Some(task) } @@ -219,7 +239,7 @@ mod tests { use crate::persistent::file_object::FileObject; use crate::persistent::LocalFs; - use crate::sst::compact::common::CompactionTask; + use crate::sst::compact::common::{CompactionTask, SourceIndex}; use crate::sst::compact::leveled::{ compact_with_task, force_compact, select_level_destination, select_level_source, }; @@ -277,7 +297,8 @@ mod tests { build_next_sst_id(&state.sst_id), &state.options, &state.persistent, - &CompactionTask::new(0, 4, 1), + &CompactionTask::new(0, SourceIndex::Index { index: 4 }, 1), + None, ) .await .unwrap(); @@ -293,15 +314,16 @@ mod tests { build_next_sst_id(&state.sst_id), &state.options, &state.persistent, - &CompactionTask::new(0, 3, 1), + &CompactionTask::new(0, SourceIndex::Index { index: 3 }, 1), + None, ) .await .unwrap(); { assert_eq!(sstables.l0_sstables, [4, 3, 2]); - assert_eq!(sstables.levels, vec![vec![12, 13, 14], vec![], vec![]]); - assert_eq!(sstables.sstables.len(), 6); + assert_eq!(sstables.levels, vec![vec![12, 13, 14, 15], vec![], vec![]]); + assert_eq!(sstables.sstables.len(), 7); } compact_with_task( @@ -309,15 +331,16 @@ mod tests { build_next_sst_id(&state.sst_id), &state.options, &state.persistent, - &CompactionTask::new(1, 0, 2), + &CompactionTask::new(1, SourceIndex::Index { index: 0 }, 2), + None, ) .await .unwrap(); { assert_eq!(sstables.l0_sstables, [4, 3, 2]); - assert_eq!(sstables.levels, vec![vec![13, 14], vec![16], vec![]]); - assert_eq!(sstables.sstables.len(), 6); + assert_eq!(sstables.levels, vec![vec![13, 14, 15], vec![17], vec![]]); + assert_eq!(sstables.sstables.len(), 7); } } @@ -331,6 +354,7 @@ mod tests { &state.options, &state.persistent, None, + None, ) .await .unwrap(); @@ -365,9 +389,10 @@ mod tests { .num_memtable_limit(1000) 
.compaction_option(CompactionOptions::Leveled(compaction_options)) .enable_wal(false) + .enable_mvcc(true) .build(); - let mut state = LsmStorageState::new(options, persistent).await.unwrap(); - let next_sst_id = AtomicUsize::default(); + let state = LsmStorageState::new(options, persistent).await.unwrap(); + let _next_sst_id = AtomicUsize::default(); let state_lock = Mutex::default(); for i in 0..5 { diff --git a/src/sst/compact/mod.rs b/src/sst/compact/mod.rs index bef8f31..1229046 100644 --- a/src/sst/compact/mod.rs +++ b/src/sst/compact/mod.rs @@ -1,8 +1,8 @@ pub mod common; +pub mod full; pub mod leveled; mod option; mod simple_leveled; -mod tiered; pub use leveled::LeveledCompactionOptions; pub use option::CompactionOptions; diff --git a/src/sst/compact/option.rs b/src/sst/compact/option.rs index ecf2769..874096b 100644 --- a/src/sst/compact/option.rs +++ b/src/sst/compact/option.rs @@ -5,6 +5,7 @@ pub enum CompactionOptions { /// Leveled compaction with partial compaction + dynamic level support (= RocksDB's Leveled /// Compaction) Leveled(LeveledCompactionOptions), + Full, /// In no compaction mode (week 1), always flush to L0 #[default] NoCompaction, diff --git a/src/sst/compact/simple_leveled.rs b/src/sst/compact/simple_leveled.rs index d8efd87..3686034 100644 --- a/src/sst/compact/simple_leveled.rs +++ b/src/sst/compact/simple_leveled.rs @@ -1,6 +1,3 @@ -use crate::sst::Sstables; -use itertools::Itertools; - #[derive(Debug, Clone)] pub struct SimpleLeveledCompactionOptions { pub size_ratio_percent: usize, diff --git a/src/sst/compact/tiered.rs b/src/sst/compact/tiered.rs deleted file mode 100644 index faf3595..0000000 --- a/src/sst/compact/tiered.rs +++ /dev/null @@ -1,7 +0,0 @@ -#[derive(Debug, Clone)] -pub struct TieredCompactionOptions { - pub num_tiers: usize, - pub max_size_amplification_percent: usize, - pub size_ratio: usize, - pub min_merge_width: usize, -} diff --git a/src/sst/iterator/concat.rs b/src/sst/iterator/concat.rs index 
d234308..4e7b0dd 100644 --- a/src/sst/iterator/concat.rs +++ b/src/sst/iterator/concat.rs @@ -1,10 +1,10 @@ use std::ops::Bound; use anyhow::Result; -use bytes::Bytes; + use futures::{stream, Stream, StreamExt}; -use crate::entry::Entry; +use crate::entry::InnerEntry; use crate::key::KeySlice; use crate::persistent::SstHandle; use crate::sst::iterator::iter::SsTableIterator; @@ -14,7 +14,7 @@ use crate::sst::SsTable; /// iterators when initializing this iterator to reduce the overhead of seeking. // todo: 这里应该用 type alias impl trait 去除 Box -pub type SstConcatIterator<'a> = Box> + Send + Unpin + 'a>; +pub type SstConcatIterator<'a> = Box> + Send + Unpin + 'a>; pub fn create_sst_concat_and_seek_to_first( sstables: Vec<&SsTable>, @@ -42,8 +42,8 @@ where pub fn scan_sst_concat<'a, File, I>( sstables: I, - lower: Bound<&'a [u8]>, - upper: Bound<&'a [u8]>, + lower: Bound>, + upper: Bound>, ) -> Result> where File: SstHandle + 'a, diff --git a/src/sst/iterator/iter.rs b/src/sst/iterator/iter.rs index c5b6e4e..1a387b7 100644 --- a/src/sst/iterator/iter.rs +++ b/src/sst/iterator/iter.rs @@ -7,43 +7,40 @@ use std::task::{Context, Poll}; use futures::{future, FutureExt}; use futures::{stream, Stream, StreamExt}; use pin_project::pin_project; -use tracing::info; use crate::block::BlockIterator; -use crate::entry::Entry; +use crate::entry::{Entry, InnerEntry}; use crate::iterators::{iter_fut_iter_to_stream, split_first, MergeIterator, TwoMergeIterator}; -use crate::key::{KeyBytes, KeySlice}; +use crate::key::KeySlice; use crate::persistent::SstHandle; use crate::sst::bloom::Bloom; use crate::sst::iterator::concat::SstConcatIterator; -use crate::sst::{bloom, BlockMeta, SsTable}; +use crate::sst::{BlockMeta, SsTable}; // 暂时用 box,目前 rust 不能够方便地在 struct 中存 closure -type InnerIter<'a> = Pin> + Send + 'a>>; +type InnerIter<'a> = Pin> + Send + 'a>>; fn build_iter<'a, File>( table: &'a SsTable, - lower: Bound<&'a [u8]>, - upper: Bound<&'a [u8]>, -) -> impl Stream> + Send + 'a + lower: 
Bound>, + upper: Bound>, +) -> impl Stream> + Send + 'a where File: SstHandle, { - let iter = match lower { - Bound::Included(key) => future::Either::Left(future::Either::Left(build_bounded_iter( - table, - KeySlice::from_slice(key), - upper, - |meta: &BlockMeta, key| meta.last_key.raw_ref() < key, - ))), - Bound::Excluded(key) => future::Either::Left(future::Either::Right(build_bounded_iter( - table, - KeySlice::from_slice(key), - upper, - |meta, key| meta.last_key.raw_ref() <= key, - ))), - Bound::Unbounded => future::Either::Right(build_unbounded_iter(table)), - }; + let iter = + match lower { + Bound::Included(key) => future::Either::Left(future::Either::Left(build_bounded_iter( + table, + key, + upper, + |meta: &BlockMeta, key| meta.last_key() < key, + ))), + Bound::Excluded(key) => future::Either::Left(future::Either::Right( + build_bounded_iter(table, key, upper, |meta, key| meta.last_key() <= key), + )), + Bound::Unbounded => future::Either::Right(build_unbounded_iter(table)), + }; match upper { Bound::Included(upper) => future::Either::Left(future::Either::Left(transform_stop_iter( iter, @@ -60,37 +57,34 @@ where } fn transform_stop_iter<'a>( - iter: impl Stream> + 'a, - upper: &'a [u8], - f: for<'b> fn(&'b [u8], &'b [u8]) -> bool, -) -> impl Stream> + 'a { + iter: impl Stream> + 'a, + upper: KeySlice<'a>, + f: for<'b> fn(KeySlice<'b>, KeySlice<'b>) -> bool, +) -> impl Stream> + 'a { iter.take_while(move |entry| { - let condition = entry + let x = entry .as_ref() - .map(|entry| f(&entry.key, upper)) + .map(|entry| f(entry.key.as_key_slice(), upper)) .unwrap_or(true); - ready(condition) + ready(x) }) } fn build_bounded_iter<'a, File>( table: &'a SsTable, low: KeySlice<'a>, - upper: Bound<&'a [u8]>, - partition: impl for<'c> Fn(&'c BlockMeta, &'c [u8]) -> bool, -) -> impl Stream> + 'a + upper: Bound>, + partition: impl for<'c> Fn(&'c BlockMeta, KeySlice<'c>) -> bool, +) -> impl Stream> + 'a where File: SstHandle, { let index = table .block_meta .as_slice() - 
.partition_point(|meta| partition(meta, low.raw_ref())); + .partition_point(|meta| partition(meta, low)); - let metas = table.block_meta[index..] - .iter() - .map(BlockMeta::first_key) - .map(KeyBytes::raw_ref); + let metas = table.block_meta[index..].iter().map(BlockMeta::first_key); let metas = (index..).zip(metas); let Some(((head_index, _), tail)) = split_first(metas) else { @@ -116,7 +110,7 @@ where fn build_unbounded_iter( table: &SsTable, -) -> impl Stream> + '_ +) -> impl Stream> + '_ where File: SstHandle, { @@ -133,8 +127,10 @@ pub struct SsTableIterator<'a, File> { } impl<'a, File> SsTableIterator<'a, File> { - pub fn may_contain(&self, key: &[u8]) -> bool { - bloom::may_contain(self.bloom, key) + pub fn may_contain(&self, _key: &[u8]) -> bool { + true + // todo + // bloom::may_contain(self.bloom, key) } } @@ -145,18 +141,17 @@ where pub fn create_and_seek_to_first(table: &'a SsTable) -> Self { Self::scan(table, Bound::Unbounded, Bound::Unbounded) } - - // todo: 能不能删除 - pub fn create_and_seek_to_key(table: &'a SsTable, key: &'a [u8]) -> Self { - Self::scan(table, Bound::Included(key), Bound::Unbounded) - } } impl<'a, File> SsTableIterator<'a, File> where File: SstHandle, { - pub fn scan(table: &'a SsTable, lower: Bound<&'a [u8]>, upper: Bound<&'a [u8]>) -> Self { + pub fn scan( + table: &'a SsTable, + lower: Bound>, + upper: Bound>, + ) -> Self { let iter = build_iter(table, lower, upper); let this = Self { table, @@ -169,7 +164,7 @@ where // todo: 感觉没必要 impl Stream,使用 (Bloom, InnerIter) 比较好? 
impl<'a, File> Stream for SsTableIterator<'a, File> { - type Item = anyhow::Result; + type Item = anyhow::Result; fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { let this = self.project(); @@ -179,7 +174,7 @@ impl<'a, File> Stream for SsTableIterator<'a, File> { } } -pub type BlockFallibleIter = either::Either>>; +pub type BlockFallibleIter = either::Either>>; pub type MergedSstIterator<'a, File> = TwoMergeIterator< Entry, @@ -196,7 +191,7 @@ mod tests { use nom::AsBytes; use tempfile::tempdir; - use crate::sst::builder::{generate_sst, key_of, num_of_keys, value_of}; + use crate::sst::builder::test_util::{generate_sst, key_of, num_of_keys, value_of}; use crate::sst::iterator::SsTableIterator; #[tokio::test] @@ -209,7 +204,8 @@ mod tests { .next() .await .unwrap() - .unwrap_or_else(|_| panic!("panic on {}", i)); + .unwrap_or_else(|_| panic!("panic on {}", i)) + .prune_ts(); let key = entry.key.as_bytes(); let value = entry.value.as_bytes(); assert_eq!( diff --git a/src/sst/iterator/merged.rs b/src/sst/iterator/merged.rs index 00d54c7..337fd6a 100644 --- a/src/sst/iterator/merged.rs +++ b/src/sst/iterator/merged.rs @@ -1,11 +1,11 @@ -use crate::entry::Entry; +use crate::entry::InnerEntry; use crate::iterators::{MergeIterator, TwoMergeIterator}; use crate::sst::iterator::concat::SstConcatIterator; use crate::sst::iterator::iter::SsTableIterator; // todo: 用 MergeIterator vs MergeIteratorInner pub type MergedSstIterator<'a, File> = TwoMergeIterator< - Entry, - MergeIterator>, - MergeIterator>, + InnerEntry, + MergeIterator>, + MergeIterator>, >; diff --git a/src/sst/option.rs b/src/sst/option.rs index 7b7c664..101fa0f 100644 --- a/src/sst/option.rs +++ b/src/sst/option.rs @@ -12,4 +12,10 @@ pub struct SstOptions { num_memtable_limit: usize, compaction_option: CompactionOptions, enable_wal: bool, + + #[builder(default)] + enable_mvcc: bool, + + #[builder(default)] + serializable: bool, } diff --git a/src/sst/sstables.rs b/src/sst/sstables.rs index 
7bb1a31..99b9184 100644 --- a/src/sst/sstables.rs +++ b/src/sst/sstables.rs @@ -1,42 +1,30 @@ use anyhow::anyhow; -use deref_ext::DerefExt; -use std::cmp::max; + use std::collections::{Bound, HashMap}; use std::fmt::{Debug, Formatter}; -use std::future::ready; + use std::iter::repeat; -use std::pin::Pin; + +use futures::stream; use std::sync::atomic::AtomicUsize; use std::sync::atomic::Ordering::Relaxed; use std::sync::Arc; -use std::{iter, mem}; -use futures::{pin_mut, stream, FutureExt, Stream, StreamExt}; -use itertools::Itertools; -use ordered_float::NotNan; -use tokio::sync::RwLock; use tracing::error; -use crate::entry::Entry; -use crate::iterators::merge::MergeIteratorInner; -use crate::iterators::{ - create_merge_iter, create_merge_iter_from_non_empty_iters, create_two_merge_iter, - iter_fut_to_stream, MergeIterator, NonEmptyStream, -}; +use crate::entry::InnerEntry; + +use crate::iterators::{create_merge_iter, create_two_merge_iter, MergeIterator}; use crate::key::KeySlice; -use crate::manifest::{Compaction, Flush, ManifestRecord}; +use crate::manifest::Flush; use crate::memtable::ImmutableMemTable; -use crate::persistent::{Persistent, SstHandle}; -use crate::sst::compact::{ - CompactionOptions, LeveledCompactionOptions, SimpleLeveledCompactionOptions, -}; +use crate::persistent::SstHandle; +use crate::sst::compact::common::CompactionTask; +use crate::sst::compact::CompactionOptions; use crate::sst::iterator::concat::SstConcatIterator; -use crate::sst::iterator::{ - create_sst_concat_and_seek_to_first, scan_sst_concat, MergedSstIterator, SsTableIterator, -}; +use crate::sst::iterator::{scan_sst_concat, MergedSstIterator, SsTableIterator}; use crate::sst::option::SstOptions; -use crate::sst::{bloom, SsTable, SsTableBuilder}; -use crate::state::LsmStorageStateInner; +use crate::sst::SsTable; #[derive(Default)] pub struct Sstables { @@ -89,14 +77,14 @@ impl Sstables { pub fn levels(&self) -> &[Vec] { &self.levels } +} +// only for test +impl Sstables { pub fn 
sstables(&self) -> &HashMap>> { &self.sstables } -} -// only for test -impl Sstables { // todo: delete it pub fn sstables_mut(&mut self) -> &mut HashMap>> { &mut self.sstables @@ -108,6 +96,7 @@ impl Sstables { repeat(Vec::new()).take(opt.max_levels() - 1).collect() } CompactionOptions::NoCompaction => Vec::new(), + CompactionOptions::Full => repeat(Vec::new()).take(1).collect(), }; Self { l0_sstables: Vec::new(), @@ -128,8 +117,8 @@ where pub async fn scan_sst<'a>( &'a self, - lower: Bound<&'a [u8]>, - upper: Bound<&'a [u8]>, + lower: Bound>, + upper: Bound>, ) -> anyhow::Result> { let l0 = self.scan_l0(lower, upper).await; let levels = self.scan_levels(lower, upper).await; @@ -139,34 +128,25 @@ where pub async fn scan_l0<'a>( &'a self, - lower: Bound<&'a [u8]>, - upper: Bound<&'a [u8]>, - ) -> MergeIterator> { + lower: Bound>, + upper: Bound>, + ) -> MergeIterator> { let iters = self.build_l0_iter(lower, upper); let iters = stream::iter(iters); create_merge_iter(iters).await } - async fn scan_l02<'a>( - &'a self, - lower: Bound<&'a [u8]>, - upper: Bound<&'a [u8]>, - ) -> MergeIteratorInner> { - let iters = self.build_l0_iter(lower, upper); - let iters = stream::iter(iters); - MergeIteratorInner::create(iters).await - } - fn build_l0_iter<'a>( &'a self, - lower: Bound<&'a [u8]>, - upper: Bound<&'a [u8]>, + lower: Bound>, + upper: Bound>, ) -> impl Iterator> + 'a { let iters = self .l0_sstables .iter() .map(|id| self.sstables.get(id).unwrap()) .filter_map(move |table| { + // todo: scan not use bloom? 
if !filter_sst_by_bloom(table, lower, upper) { None } else { @@ -178,9 +158,9 @@ where async fn scan_levels<'a>( &'a self, - lower: Bound<&'a [u8]>, - upper: Bound<&'a [u8]>, - ) -> MergeIterator> { + lower: Bound>, + upper: Bound>, + ) -> MergeIterator> { let iters = self.levels.iter().filter_map(move |ids| { let tables = ids.iter().map(|id| self.sstables.get(id).unwrap().as_ref()); scan_sst_concat(tables, lower, upper) @@ -191,15 +171,6 @@ where create_merge_iter(iters).await } - fn level_size(&self, level: usize) -> usize { - if level == 0 { - self.l0_sstables.len() - } else { - let ids = self.levels.get(level - 1).unwrap(); - ids.len() - } - } - pub(super) fn table_ids_mut(&mut self, level: usize) -> &mut Vec { if level == 0 { &mut self.l0_sstables @@ -228,36 +199,51 @@ where todo!() } - fn debug_level(&self, level: usize) -> DebugLevel { - let tables = self.tables(level); - let (size, count) = tables.fold((0, 0), |(size, count), table| { - (size + table.table_size(), count + 1) - }); - let ids = self.table_ids(level); - DebugLevel { - ids: ids.clone(), - size, - count, - } + pub fn apply_compaction_sst_ids(&mut self, task: &CompactionTask, new_sst_ids: Vec) { + let source_level = task.source(); + let source_range = task.source_index().build_range(); + self.table_ids_mut(source_level).splice(source_range, []); + + let destination_level = task.destination(); + self.table_ids_mut(destination_level) + .splice(.., new_sst_ids); } - pub fn fold_compaction_manifest(&mut self, Compaction(task, result_ids): Compaction) { - let source = self.table_ids_mut(task.source()); - source.remove(task.source_index()); - let destination = self.table_ids_mut(task.destination()); - let _ = mem::replace(destination, result_ids); + pub fn apply_compaction_sst( + &mut self, + new_sst: Vec>>, + task: &CompactionTask, + ) { + let source_level = task.source(); + let source_range = task.source_index().build_range(); + let source_ids = self.table_ids(source_level).clone(); + let source_ids 
= &source_ids[source_range]; + for id in source_ids { + self.sstables.remove(id); + } + + let destination_level = task.destination(); + let destination_ids = self.table_ids(destination_level).clone(); + for id in &destination_ids { + self.sstables.remove(id); + } + + for table in new_sst { + self.sstables.insert(*table.id(), table); + } } } fn filter_sst_by_bloom( - table: &SsTable, - lower: Bound<&[u8]>, - upper: Bound<&[u8]>, + _table: &SsTable, + lower: Bound, + upper: Bound, ) -> bool { use Bound::Included; if let (Included(lower), Included(upper)) = (lower, upper) { if lower == upper { - return bloom::may_contain(table.bloom.as_ref(), lower); + return true; + // return bloom::may_contain(table.bloom.as_ref(), lower); } } true @@ -282,63 +268,5 @@ pub fn fold_flush_manifest( Ok(()) } -struct DebugLevel { - ids: Vec, - size: u64, - count: usize, -} - #[cfg(test)] -mod tests { - use std::ops::Bound::Unbounded; - use std::path::PathBuf; - use std::sync::Arc; - use std::time::Duration; - - use crate::iterators::{create_two_merge_iter, NonEmptyStream}; - use crate::key::KeySlice; - use futures::StreamExt; - use tempfile::{tempdir, TempDir}; - use tokio::time::timeout; - use tracing::{info, Instrument}; - use tracing_subscriber::fmt::format::FmtSpan; - - use crate::persistent::file_object::FileObject; - use crate::persistent::LocalFs; - use crate::sst::iterator::SsTableIterator; - use crate::sst::{SsTable, SsTableBuilder, SstOptions, Sstables}; - - #[tokio::test] - async fn test() { - // tracing_subscriber::fmt::fmt() - // .with_span_events(FmtSpan::EXIT | FmtSpan::ENTER | FmtSpan::CLOSE) - // .with_target(false) - // .with_level(false) - // .init(); - let options = SstOptions::builder() - .target_sst_size(1024) - .block_size(4096) - .num_memtable_limit(1000) - .compaction_option(Default::default()) - .enable_wal(false) - .build(); - let dir = TempDir::new().unwrap(); - let path = dir.as_ref(); - let persistent = LocalFs::new(path.to_path_buf()); - let mut sst = 
Sstables::::new(&options); - let table = { - let mut builder = SsTableBuilder::new(16); - builder.add(KeySlice::for_testing_from_slice_no_ts(b"11"), b"11"); - builder.add(KeySlice::for_testing_from_slice_no_ts(b"22"), b"22"); - builder.add(KeySlice::for_testing_from_slice_no_ts(b"33"), b"11"); - builder.add(KeySlice::for_testing_from_slice_no_ts(b"44"), b"22"); - builder.add(KeySlice::for_testing_from_slice_no_ts(b"55"), b"11"); - builder.add(KeySlice::for_testing_from_slice_no_ts(b"66"), b"22"); - builder.build(0, None, &persistent).await.unwrap() - }; - sst.insert_sst(Arc::new(table)); - - let iter = sst.scan_sst(Unbounded, Unbounded).await.unwrap(); - // assert_eq!() - } -} +mod tests {} diff --git a/src/sst/tables.rs b/src/sst/tables.rs index a091aee..f4acac3 100644 --- a/src/sst/tables.rs +++ b/src/sst/tables.rs @@ -4,7 +4,6 @@ use std::sync::Arc; use anyhow::{anyhow, Result}; use bytes::Buf; use derive_getters::Getters; -use futures::StreamExt; use typed_builder::TypedBuilder; use crate::block::{Block, BlockCache, BlockIterator}; @@ -30,7 +29,7 @@ pub struct SsTable { last_key: KeyBytes, pub(crate) bloom: Option, /// The maximum timestamp stored in this SST, implemented in week 3. - max_ts: u64, + pub max_ts: u64, // todo: use Option? } impl Debug for SsTable { @@ -55,6 +54,13 @@ impl SsTable { ) -> Result { let file = persistent.open_sst(id).await?; let mut end = file.size(); + + let max_ts = { + let data = file.read(end - 8, 8).await?; + end -= 8; + u64::from_be_bytes(data.as_slice().try_into()?) 
+ }; + let bloom = { let bloom_offset_begin = end - 4; let bloom_offset = file.read(bloom_offset_begin, 4).await?.as_slice().get_u32() as u64; @@ -93,7 +99,7 @@ impl SsTable { first_key, last_key, bloom: Some(bloom), - max_ts: 0, + max_ts, }; Ok(table) } diff --git a/src/state/inner.rs b/src/state/inner.rs index 08838b1..14a6310 100644 --- a/src/state/inner.rs +++ b/src/state/inner.rs @@ -1,20 +1,25 @@ -use crossbeam_skiplist::SkipMap; -use std::cmp::max; -use std::fmt::{Debug, Formatter}; -use std::sync::Arc; - +use bytes::Bytes; use derive_getters::Getters; use futures::stream; use futures::stream::{StreamExt, TryStreamExt}; +use std::cmp::max; +use std::fmt::{Debug, Formatter}; +use std::sync::Arc; use typed_builder::TypedBuilder; use crate::block::BlockCache; -use crate::manifest::{Manifest, ManifestRecord, NewMemtable}; +use crate::key::KeyBytes; +use crate::manifest::{Compaction, Manifest, ManifestRecord, NewMemtable}; use crate::memtable::{ImmutableMemTable, MemTable}; use crate::persistent::Persistent; use crate::sst::sstables::fold_flush_manifest; use crate::sst::{SsTable, SstOptions, Sstables}; -use crate::wal::Wal; + +pub struct RecoveredState { + pub state: LsmStorageStateInner

, + pub next_sst_id: usize, + pub initial_ts: u64, +} #[derive(Getters, TypedBuilder)] pub struct LsmStorageStateInner { @@ -24,13 +29,21 @@ pub struct LsmStorageStateInner { } impl LsmStorageStateInner

{ + pub async fn put(&self, key: KeyBytes, value: impl Into + Send) -> anyhow::Result<()> { + self.memtable().put_with_ts(key, value.into()).await?; + // self.try_freeze_memtable(&snapshot) + // .await?; + // todo + Ok(()) + } + pub async fn recover( options: &SstOptions, manifest: &Manifest, manifest_records: Vec, persistent: &P, block_cache: Option>, - ) -> anyhow::Result<(Self, usize)> { + ) -> anyhow::Result> { let (imm_memtables, mut sstables_state) = build_state(options, manifest_records, persistent).await?; // todo: split sst_ids & sst hashmap @@ -66,12 +79,31 @@ impl LsmStorageStateInner

{ let memtable = MemTable::create_with_wal(next_sst_id, persistent, manifest).await?; + let max_ts = { + let memtable_max_ts = imm_memtables + .iter() + .flat_map(|table| table.iter().map(|entry| entry.key().timestamp())); + let sst_max_ts = sstables_state.sstables().values().map(|sst| sst.max_ts); + memtable_max_ts.chain(sst_max_ts).reduce(max).unwrap_or(0) + }; + let this = Self { memtable: Arc::new(memtable), imm_memtables, sstables_state: Arc::new(sstables_state), }; - Ok((this, next_sst_id + 1)) + + let recovered = RecoveredState { + state: this, + next_sst_id: next_sst_id + 1, + initial_ts: max_ts, + }; + + Ok(recovered) + } + + pub async fn sync_wal(&self) -> anyhow::Result<()> { + self.memtable.sync_wal().await } } @@ -121,18 +153,15 @@ async fn build_state( |(mut imm_memtables, mut sstables), manifest| async { match manifest { ManifestRecord::Flush(record) => { - let flush = &record; fold_flush_manifest(&mut imm_memtables, &mut sstables, record)?; Ok((imm_memtables, sstables)) } ManifestRecord::NewMemtable(record) => { - let new_mem = &record; fold_new_imm_memtable(&mut imm_memtables, persistent, record).await?; Ok((imm_memtables, sstables)) } - ManifestRecord::Compaction(record) => { - let compact = &record; - sstables.fold_compaction_manifest(record); + ManifestRecord::Compaction(Compaction(task, new_sst_ids)) => { + sstables.apply_compaction_sst_ids(&task, new_sst_ids); Ok((imm_memtables, sstables)) } } @@ -145,7 +174,7 @@ async fn build_state( mod tests { use crate::manifest::{Compaction, Flush, NewMemtable}; use crate::persistent::LocalFs; - use crate::sst::compact::common::CompactionTask; + use crate::sst::compact::common::{CompactionTask, SourceIndex}; use crate::sst::compact::{CompactionOptions, LeveledCompactionOptions}; use crate::sst::SstOptions; use crate::state::inner::build_state; @@ -178,11 +207,19 @@ mod tests { Flush(1).into(), NewMemtable(5).into(), Flush(2).into(), - Compaction(CompactionTask::new(0, 2, 1), vec![6]).into(), + Compaction( 
+ CompactionTask::new(0, SourceIndex::Index { index: 2 }, 1), + vec![6], + ) + .into(), NewMemtable(7).into(), NewMemtable(8).into(), Flush(3).into(), - Compaction(CompactionTask::new(0, 2, 1), vec![9, 10]).into(), + Compaction( + CompactionTask::new(0, SourceIndex::Index { index: 2 }, 1), + vec![9, 10], + ) + .into(), ]; let (imm, ssts) = build_state(&options, manifest_records, &persistent) diff --git a/src/state/map.rs b/src/state/map.rs index 28c8a10..85a9814 100644 --- a/src/state/map.rs +++ b/src/state/map.rs @@ -1,6 +1,7 @@ -use bytes::Bytes; use std::future::Future; +use bytes::Bytes; + pub trait Map { type Error; diff --git a/src/state/mod.rs b/src/state/mod.rs index 7e4cc5a..019cafd 100644 --- a/src/state/mod.rs +++ b/src/state/mod.rs @@ -1,6 +1,8 @@ -mod inner; +pub mod inner; mod map; +mod mut_op; mod states; +pub mod write_batch; pub use inner::LsmStorageStateInner; pub use map::Map; diff --git a/src/state/mut_op.rs b/src/state/mut_op.rs new file mode 100644 index 0000000..1b41e41 --- /dev/null +++ b/src/state/mut_op.rs @@ -0,0 +1,6 @@ +#[allow(dead_code)] +#[derive(Debug)] +pub enum Op { + Put { key: T, value: T }, + Del(T), +} diff --git a/src/state/states.rs b/src/state/states.rs index 1b32eed..c314de1 100644 --- a/src/state/states.rs +++ b/src/state/states.rs @@ -1,26 +1,26 @@ +use anyhow::anyhow; +use arc_swap::ArcSwap; +use bytes::Bytes; +use derive_getters::Getters; use std::collections::Bound; use std::fmt::{Debug, Formatter}; -use std::future::Future; use std::ops::Deref; use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; - -use arc_swap::ArcSwap; -use bytes::Bytes; -use deref_ext::DerefExt; -use derive_getters::Getters; -use futures::StreamExt; use tokio::sync::{Mutex, MutexGuard}; -use tracing_futures::Instrument; use crate::block::BlockCache; -use crate::iterators::LockedLsmIter; +use crate::entry::Entry; use crate::manifest::{Flush, Manifest, ManifestRecord}; use crate::memtable::MemTable; +use 
crate::mvcc::core::LsmMvccInner; +use crate::mvcc::iterator::LockedTxnIterWithTxn; +use crate::mvcc::transaction::Transaction; use crate::persistent::Persistent; use crate::sst::compact::leveled::force_compact; use crate::sst::{SsTableBuilder, SstOptions}; -use crate::state::inner::LsmStorageStateInner; +use crate::state::inner::{LsmStorageStateInner, RecoveredState}; +use crate::state::write_batch::WriteBatchRecord; use crate::state::Map; use crate::utils::vec::pop; @@ -30,9 +30,11 @@ pub struct LsmStorageState { block_cache: Arc, manifest: Manifest, pub(crate) state_lock: Mutex<()>, + write_lock: Mutex<()>, pub(crate) persistent: P, pub(crate) options: SstOptions, pub(crate) sst_id: AtomicUsize, + mvcc: Option>, } impl

Debug for LsmStorageState

@@ -56,7 +58,11 @@ where pub async fn new(options: SstOptions, persistent: P) -> anyhow::Result { let (manifest, manifest_records) = Manifest::recover(&persistent).await?; let block_cache = Arc::new(BlockCache::new(1024)); - let (inner, next_sst_id) = LsmStorageStateInner::recover( + let RecoveredState { + state: inner, + next_sst_id, + initial_ts, + } = LsmStorageStateInner::recover( &options, &manifest, manifest_records, @@ -66,17 +72,36 @@ where .await?; let sst_id = AtomicUsize::new(next_sst_id); + let mvcc = if *options.enable_mvcc() { + Some(Arc::new(LsmMvccInner::new(initial_ts))) + } else { + None + }; + let this = Self { inner: ArcSwap::new(Arc::new(inner)), block_cache: Arc::new(BlockCache::new(1024)), manifest, + write_lock: Mutex::default(), state_lock: Mutex::default(), persistent, options, sst_id, + mvcc, }; Ok(this) } + + pub fn new_txn(&self) -> anyhow::Result> { + // todo: avoid clone? + let mvcc = self.mvcc.as_ref().ok_or(anyhow!("no mvcc"))?; + let tx = mvcc.new_txn(self, *self.options.serializable()); + Ok(tx) + } + + pub async fn sync_wal(&self) -> anyhow::Result<()> { + self.inner.load().sync_wal().await + } } // KV store @@ -87,15 +112,8 @@ where type Error = anyhow::Error; async fn get(&self, key: &[u8]) -> anyhow::Result> { - let guard = self.scan(Bound::Included(key), Bound::Included(key)); - let value = guard - .iter() - .await? - .next() - .await - .transpose()? 
- .map(|entry| entry.value); - Ok(value) + let txn = self.new_txn()?; + txn.get(key).await } async fn put( @@ -103,23 +121,17 @@ where key: impl Into + Send, value: impl Into + Send, ) -> anyhow::Result<()> { - let snapshot = self.inner.load(); - snapshot - .memtable() - .put(key.into(), value.into()) - .instrument(tracing::info_span!("memtable_put")) - .await?; - self.try_freeze_memtable(&snapshot) - .instrument(tracing::info_span!("try_freeze_memtable")) - .await?; - Ok(()) + // let _guard = + // todo: check options.serializable + let txn = self.new_txn()?; + txn.put(key, value).await?; + txn.commit().await } async fn delete(&self, key: impl Into + Send) -> anyhow::Result<()> { - let snapshot = self.inner.load(); - snapshot.memtable().put(key.into(), Bytes::new()).await?; - self.try_freeze_memtable(&snapshot).await?; - Ok(()) + let txn = self.new_txn()?; + txn.delete(key).await?; + txn.commit().await } } @@ -127,13 +139,31 @@ impl

LsmStorageState

where P: Persistent, { + pub async fn put_batch(&self, batch: &[WriteBatchRecord]) -> anyhow::Result<()> { + let txn = self.new_txn()?; + txn.write_batch(batch); + txn.commit().await + } + + pub async fn write_batch(&self, entries: &[Entry], timestamp: u64) -> anyhow::Result<()> { + let guard = self.inner.load(); + guard.memtable.put_batch(entries, timestamp).await?; + self.try_freeze_memtable(guard.as_ref()).await?; + Ok(()) + } + pub(crate) fn next_sst_id(&self) -> usize { self.sst_id().fetch_add(1, Ordering::Relaxed) } - fn scan<'a>(&self, lower: Bound<&'a [u8]>, upper: Bound<&'a [u8]>) -> LockedLsmIter<'a, P> { - let snapshot = self.inner.load(); - LockedLsmIter::new(snapshot, lower, upper) + pub fn scan<'a>( + &'a self, + lower: Bound<&'a [u8]>, + upper: Bound<&'a [u8]>, + ) -> LockedTxnIterWithTxn<'a, P> { + // todo: remove unwrap + let txn = self.new_txn().unwrap(); + LockedTxnIterWithTxn::new_(txn, lower, upper) } } @@ -156,6 +186,7 @@ where memtable.deref().approximate_size() > *self.options.target_sst_size() } + // todo: 这个函数用到的 snapshot 不用 load?直接从 caller 传过来? 
pub(crate) async fn force_freeze_memtable( &self, _guard: &MutexGuard<'_, ()>, @@ -209,6 +240,7 @@ where let new = { let mut new = Clone::clone(self.inner.load().as_ref()); let mut new_sstables = Clone::clone(new.sstables_state().as_ref()); + let watermark = self.mvcc.as_ref().map(|mvcc| mvcc.watermark()); force_compact( &mut new_sstables, @@ -216,6 +248,7 @@ where self.options(), self.persistent(), Some(&self.manifest), + watermark, ) .await?; @@ -312,26 +345,36 @@ where async fn delete_for_test(&self, key: &[u8]) -> anyhow::Result<()> { self.delete(Bytes::copy_from_slice(key)).await } + + // async fn write_batch_for_test( + // &self, + // records: impl IntoIterator>, + // ) -> anyhow::Result<()> { + // todo!() + // } } #[cfg(test)] mod test { - use std::collections::Bound; - use std::ops::Bound::{Included, Unbounded}; - use bytes::Bytes; - use futures::StreamExt; + use futures::{stream, Stream, StreamExt}; + use std::collections::Bound; + use std::ops::Bound::{Excluded, Included, Unbounded}; use tempfile::{tempdir, TempDir}; - use crate::entry::Entry; - use crate::iterators::no_deleted::new_no_deleted_iter; - use crate::iterators::two_merge::create_inner; - use crate::iterators::utils::{assert_stream_eq, build_stream, build_tuple_stream}; - use crate::iterators::{create_two_merge_iter, eq}; + use crate::entry::{Entry, InnerEntry}; + use crate::iterators::merge::MergeIteratorInner; + use crate::iterators::utils::test_utils::{ + assert_stream_eq, build_stream, build_tuple_stream, eq, + }; + use crate::mvcc::transaction::Transaction; use crate::persistent::file_object::LocalFs; use crate::persistent::Persistent; + use crate::sst::compact::CompactionOptions; + use crate::sst::iterator::SsTableIterator; use crate::sst::SstOptions; use crate::state::states::LsmStorageState; + use crate::state::LsmStorageStateInner; #[tokio::test] async fn test_task2_storage_integration() { @@ -410,7 +453,6 @@ mod test { storage.put_for_test(b"1", b"2333").await.unwrap(); } let 
num_imm_memtables = storage.inner.load().imm_memtables().len(); - println!("num_imm_memtables: {}", num_imm_memtables); assert!(num_imm_memtables >= 1, "no memtable frozen?"); for _ in 0..1000 { storage.delete_for_test(b"1").await.unwrap(); @@ -661,93 +703,779 @@ mod test { storage.get_for_test(b"0").await.unwrap(), Some(Bytes::from_static(b"2333333")) ); + assert_eq!( + storage.get_for_test(b"00").await.unwrap(), + Some(Bytes::from_static(b"2333")) + ); + assert_eq!( + storage.get_for_test(b"2").await.unwrap(), + Some(Bytes::from_static(b"2333")) + ); + assert_eq!( + storage.get_for_test(b"3").await.unwrap(), + Some(Bytes::from_static(b"23333")) + ); + assert_eq!(storage.get_for_test(b"4").await.unwrap(), None); + assert_eq!(storage.get_for_test(b"--").await.unwrap(), None); + assert_eq!(storage.get_for_test(b"555").await.unwrap(), None); + } + + async fn build_storage(dir: &TempDir) -> anyhow::Result> { + let persistent = LocalFs::new(dir.path().to_path_buf()); + let options = SstOptions::builder() + .target_sst_size(1024) + .block_size(4096) + .num_memtable_limit(1000) + .compaction_option(Default::default()) + .enable_wal(false) + .enable_mvcc(true) + .build(); + LsmStorageState::new(options, persistent).await + } + + #[tokio::test] + async fn test_task2_memtable_mvcc() { + test_task2_memtable_mvcc_helper(false).await; + } + + #[tokio::test] + async fn test_task2_lsm_iterator_mvcc() { + test_task2_memtable_mvcc_helper(true).await; + } + + async fn test_task2_memtable_mvcc_helper(flush: bool) { + let dir = tempdir().unwrap(); + let persistent = LocalFs::new(dir.path().to_path_buf()); + let options = SstOptions::builder() + .target_sst_size(1024) + .block_size(4096) + .num_memtable_limit(1000) + .compaction_option(Default::default()) + .enable_wal(true) + .enable_mvcc(true) + .build(); + let storage = LsmStorageState::new(options, persistent).await.unwrap(); + + storage.put_for_test(b"a", b"1").await.unwrap(); + storage.put_for_test(b"b", b"1").await.unwrap(); + 
+ assert_eq!( + storage.get_for_test(b"a").await.unwrap(), + Some(Bytes::from_static(b"1")) + ); + + let snapshot1 = storage.new_txn().unwrap(); + + assert_eq!( + snapshot1.get_for_test(b"a").await.unwrap(), + Some(Bytes::from_static(b"1")) + ); + storage.put_for_test(b"a", b"2").await.unwrap(); + let snapshot2 = storage.new_txn().unwrap(); + storage.delete_for_test(b"b").await.unwrap(); + storage.put_for_test(b"c", b"1").await.unwrap(); + let snapshot3 = storage.new_txn().unwrap(); + assert_eq!( + snapshot1.get_for_test(b"a").await.unwrap(), + Some(Bytes::from_static(b"1")) + ); + assert_eq!( + snapshot1.get_for_test(b"b").await.unwrap(), + Some(Bytes::from_static(b"1")) + ); + assert_eq!(snapshot1.get_for_test(b"c").await.unwrap(), None); + + assert_scan_iter(&snapshot1, Unbounded, Unbounded, [("a", "1"), ("b", "1")]).await; + + assert_eq!( + snapshot2.get_for_test(b"a").await.unwrap(), + Some(Bytes::from_static(b"2")) + ); + assert_eq!( + snapshot2.get_for_test(b"b").await.unwrap(), + Some(Bytes::from_static(b"1")) + ); + assert_eq!(snapshot2.get_for_test(b"c").await.unwrap(), None); + + assert_scan_iter(&snapshot2, Unbounded, Unbounded, [("a", "2"), ("b", "1")]).await; + + assert_eq!( + snapshot3.get_for_test(b"a").await.unwrap(), + Some(Bytes::from_static(b"2")) + ); + assert_eq!(snapshot3.get_for_test(b"b").await.unwrap(), None); + assert_eq!( + snapshot3.get_for_test(b"c").await.unwrap(), + Some(Bytes::from_static(b"1")) + ); + + assert_scan_iter(&snapshot3, Unbounded, Unbounded, [("a", "2"), ("c", "1")]).await; + + if !flush { + let guard = storage.state_lock.lock().await; + storage.force_freeze_memtable(&guard).await.unwrap(); + } + + storage.put_for_test(b"a", b"3").await.unwrap(); + storage.put_for_test(b"b", b"3").await.unwrap(); + let snapshot4 = storage.new_txn().unwrap(); + storage.put_for_test(b"a", b"4").await.unwrap(); + let snapshot5 = storage.new_txn().unwrap(); + storage.delete_for_test(b"b").await.unwrap(); + storage.put_for_test(b"c", 
b"5").await.unwrap(); + let snapshot6 = storage.new_txn().unwrap(); + + if flush { + let guard = storage.state_lock.lock().await; + storage.force_flush_imm_memtable(&guard).await.unwrap(); + } + + assert_eq!( + snapshot1.get_for_test(b"a").await.unwrap(), + Some(Bytes::from_static(b"1")) + ); + assert_eq!( + snapshot1.get_for_test(b"b").await.unwrap(), + Some(Bytes::from_static(b"1")) + ); + assert_eq!(snapshot1.get_for_test(b"c").await.unwrap(), None); + + assert_scan_iter(&snapshot1, Unbounded, Unbounded, [("a", "1"), ("b", "1")]).await; + + assert_eq!( + snapshot2.get_for_test(b"a").await.unwrap(), + Some(Bytes::from_static(b"2")) + ); + assert_eq!( + snapshot2.get_for_test(b"b").await.unwrap(), + Some(Bytes::from_static(b"1")) + ); + assert_eq!(snapshot2.get_for_test(b"c").await.unwrap(), None); + + assert_scan_iter(&snapshot2, Unbounded, Unbounded, [("a", "2"), ("b", "1")]).await; + + assert_eq!( + snapshot3.get_for_test(b"a").await.unwrap(), + Some(Bytes::from_static(b"2")) + ); + assert_eq!(snapshot3.get_for_test(b"b").await.unwrap(), None); + assert_eq!( + snapshot3.get_for_test(b"c").await.unwrap(), + Some(Bytes::from_static(b"1")) + ); + + assert_scan_iter(&snapshot3, Unbounded, Unbounded, [("a", "2"), ("c", "1")]).await; + + assert_eq!( + snapshot4.get_for_test(b"a").await.unwrap(), + Some(Bytes::from_static(b"3")) + ); + assert_eq!( + snapshot4.get_for_test(b"b").await.unwrap(), + Some(Bytes::from_static(b"3")) + ); + assert_eq!( + snapshot4.get_for_test(b"c").await.unwrap(), + Some(Bytes::from_static(b"1")) + ); + + assert_scan_iter( + &snapshot4, + Unbounded, + Unbounded, + [("a", "3"), ("b", "3"), ("c", "1")], + ) + .await; + + assert_eq!( + snapshot5.get_for_test(b"a").await.unwrap(), + Some(Bytes::from_static(b"4")) + ); + assert_eq!( + snapshot5.get_for_test(b"b").await.unwrap(), + Some(Bytes::from_static(b"3")) + ); + assert_eq!( + snapshot5.get_for_test(b"c").await.unwrap(), + Some(Bytes::from_static(b"1")) + ); + + assert_scan_iter( + 
&snapshot5, + Unbounded, + Unbounded, + [("a", "4"), ("b", "3"), ("c", "1")], + ) + .await; + + assert_eq!( + snapshot6.get_for_test(b"a").await.unwrap(), + Some(Bytes::from_static(b"4")) + ); + assert_eq!(snapshot6.get_for_test(b"b").await.unwrap(), None); + assert_eq!( + snapshot6.get_for_test(b"c").await.unwrap(), + Some(Bytes::from_static(b"5")) + ); + + assert_scan_iter(&snapshot6, Unbounded, Unbounded, [("a", "4"), ("c", "5")]).await; + + if flush { + assert_scan_iter(&snapshot6, Included(b"a"), Included(b"a"), [("a", "4")]).await; + assert_scan_iter(&snapshot6, Excluded(b"a"), Excluded(b"c"), []).await; + } + } + + #[tokio::test] + async fn test_task2_snapshot_watermark() { + let dir = tempdir().unwrap(); + let persistent = LocalFs::new(dir.path().to_path_buf()); + let options = SstOptions::builder() + .target_sst_size(1024) + .block_size(4096) + .num_memtable_limit(1000) + .compaction_option(Default::default()) + .enable_wal(true) + .enable_mvcc(true) + .build(); + let storage = LsmStorageState::new(options, persistent).await.unwrap(); + + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + storage.put_for_test(b"233", b"23333").await.unwrap(); + let txn3 = storage.new_txn().unwrap(); + assert_eq!(storage.mvcc().as_ref().unwrap().watermark(), txn1.read_ts); + drop(txn1); + assert_eq!(storage.mvcc().as_ref().unwrap().watermark(), txn2.read_ts); + drop(txn2); + assert_eq!(storage.mvcc().as_ref().unwrap().watermark(), txn3.read_ts); + drop(txn3); + assert_eq!( + storage.mvcc().as_ref().unwrap().watermark(), + storage.mvcc().as_ref().unwrap().latest_commit_ts() + ); + } + + // todo: add test + #[tokio::test] + async fn test_task3_mvcc_compaction() { + use crate::state::write_batch::WriteBatchRecord::{Del, Put}; + + let dir = tempdir().unwrap(); + let persistent = LocalFs::new(dir.path().to_path_buf()); + let compaction_option = CompactionOptions::Full; + let options = SstOptions::builder() + .target_sst_size(1024) + 
.block_size(4096) + .num_memtable_limit(1000) + .compaction_option(compaction_option) + .enable_wal(true) + .enable_mvcc(true) + .build(); + let storage = LsmStorageState::new(options, persistent).await.unwrap(); + + /* + a b c d + ----------- 0 + 1 1 + ----------- 1 + 2 2 + ----------- 2 + 3 - + ----------- 3 + - 4 + */ + + let snapshot0 = storage.new_txn().unwrap(); + storage + .put_batch(&[ + Put(Bytes::copy_from_slice(b"a"), Bytes::copy_from_slice(b"1")), + Put(Bytes::copy_from_slice(b"b"), Bytes::copy_from_slice(b"1")), + ]) + .await + .unwrap(); + + let snapshot1 = storage.new_txn().unwrap(); + storage + .put_batch(&[ + Put(Bytes::copy_from_slice(b"a"), Bytes::copy_from_slice(b"2")), + Put(Bytes::copy_from_slice(b"d"), Bytes::copy_from_slice(b"2")), + ]) + .await + .unwrap(); + + let snapshot2 = storage.new_txn().unwrap(); + storage + .put_batch(&[ + Put(Bytes::copy_from_slice(b"a"), Bytes::copy_from_slice(b"3")), + Del(Bytes::copy_from_slice(b"d")), + ]) + .await + .unwrap(); + + let snapshot3 = storage.new_txn().unwrap(); + storage + .put_batch(&[ + Put(Bytes::copy_from_slice(b"c"), Bytes::copy_from_slice(b"4")), + Del(Bytes::copy_from_slice(b"a")), + ]) + .await + .unwrap(); + { - let guard = storage.scan(Included(b"00"), Included(b"00")); - let mut iter = guard.build_memtable_iter().await; - assert_stream_eq( - iter.map(Result::unwrap).map(Entry::into_tuple), - build_tuple_stream([("00", "2333")]), - ) - .await; + let guard = storage.state_lock.lock().await; + storage.force_freeze_memtable(&guard).await.unwrap(); + storage.force_flush_imm_memtable(&guard).await.unwrap(); + storage.force_compact(&guard).await.unwrap(); } + { - let guard = storage.scan(Included(b"00"), Included(b"00")); - let mut iter = guard.build_sst_iter().await.unwrap(); - assert_stream_eq( - iter.map(Result::unwrap).map(Entry::into_tuple), - build_tuple_stream([("00", "2333333")]), + let inner = storage.inner.load(); + let iter = 
construct_test_mvcc_compaction_iter(inner.as_ref()).await; + assert_mvcc_compaction_iter( + iter, + [ + ("a", ""), + ("a", "3"), + ("a", "2"), + ("a", "1"), + ("b", "1"), + ("c", "4"), + ("d", ""), + ("d", "2"), + ], ) .await; } + + drop(snapshot0); { - let guard = storage.scan(Included(b"00"), Included(b"00")); - let a = guard.build_memtable_iter().await; - let b = guard.build_sst_iter().await.unwrap(); - let iter = create_inner(a, b).await.unwrap(); - assert_stream_eq( - iter.map(Result::unwrap).map(Entry::into_tuple), - build_tuple_stream([("00", "2333"), ("00", "2333333")]), - ) - .await; + let guard = storage.state_lock.lock().await; + storage.force_compact(&guard).await.unwrap(); } + { - let guard = storage.scan(Included(b"00"), Included(b"00")); - let a = guard.build_memtable_iter().await; - let b = guard.build_sst_iter().await.unwrap(); - let iter = create_two_merge_iter(a, b).await.unwrap(); - assert_stream_eq( - iter.map(Result::unwrap).map(Entry::into_tuple), - build_tuple_stream([("00", "2333")]), + let inner = storage.inner.load(); + let iter = construct_test_mvcc_compaction_iter(inner.as_ref()).await; + assert_mvcc_compaction_iter( + iter, + [ + ("a", ""), + ("a", "3"), + ("a", "2"), + ("a", "1"), + ("b", "1"), + ("c", "4"), + ("d", ""), + ("d", "2"), + ], ) .await; } + + drop(snapshot1); { - let guard = storage.scan(Included(b"00"), Included(b"00")); - let a = guard.build_memtable_iter().await; - let b = guard.build_sst_iter().await.unwrap(); - let iter = create_two_merge_iter(a, b).await.unwrap(); - let iter = new_no_deleted_iter(iter); - assert_stream_eq( - iter.map(Result::unwrap).map(Entry::into_tuple), - build_tuple_stream([("00", "2333")]), + let guard = storage.state_lock.lock().await; + storage.force_compact(&guard).await.unwrap(); + } + + { + let inner = storage.inner.load(); + let iter = construct_test_mvcc_compaction_iter(inner.as_ref()).await; + assert_mvcc_compaction_iter( + iter, + [ + ("a", ""), + ("a", "3"), + ("a", "2"), + ("b", 
"1"), + ("c", "4"), + ("d", ""), + ("d", "2"), + ], ) .await; } + + drop(snapshot2); + { + let guard = storage.state_lock.lock().await; + storage.force_compact(&guard).await.unwrap(); + } + + { + let inner = storage.inner.load(); + let iter = construct_test_mvcc_compaction_iter(inner.as_ref()).await; + assert_mvcc_compaction_iter(iter, [("a", ""), ("a", "3"), ("b", "1"), ("c", "4")]) + .await; + } + + drop(snapshot3); + { + let guard = storage.state_lock.lock().await; + storage.force_compact(&guard).await.unwrap(); + } + { - let guard = storage.scan(Included(b"00"), Included(b"00")); + let inner = storage.inner.load(); + let iter = construct_test_mvcc_compaction_iter(inner.as_ref()).await; + assert_mvcc_compaction_iter(iter, [("b", "1"), ("c", "4")]).await; + } + } + + async fn construct_test_mvcc_compaction_iter( + storage: &LsmStorageStateInner

, + ) -> impl Stream> + '_ { + let l0 = storage.sstables_state.l0_sstables().iter(); + let level_other = storage + .sstables_state + .levels() + .iter() + .flat_map(|v| v.iter()); + let iter = l0 + .chain(level_other) + .map(|id| storage.sstables_state.sstables().get(id).unwrap()) + .map(|sst| SsTableIterator::scan(sst.as_ref(), Unbounded, Unbounded)); + let iter = stream::iter(iter); + + MergeIteratorInner::create(iter).await + } + + #[tokio::test] + async fn test_txn_integration() { + let dir = tempdir().unwrap(); + let persistent = LocalFs::new(dir.path().to_path_buf()); + let options = SstOptions::builder() + .target_sst_size(1024) + .block_size(4096) + .num_memtable_limit(1000) + .compaction_option(Default::default()) + .enable_wal(true) + .enable_mvcc(true) + .build(); + let storage = LsmStorageState::new(options, persistent).await.unwrap(); + + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn1.put_for_test(b"test1", b"233").await.unwrap(); + txn2.put_for_test(b"test2", b"233").await.unwrap(); + + assert_scan_iter(&txn1, Unbounded, Unbounded, [("test1", "233")]).await; + assert_scan_iter(&txn2, Unbounded, Unbounded, [("test2", "233")]).await; + + let txn3 = storage.new_txn().unwrap(); + + assert_scan_iter(&txn3, Unbounded, Unbounded, []).await; + + txn1.commit().await.unwrap(); + txn2.commit().await.unwrap(); + + assert_scan_iter(&txn3, Unbounded, Unbounded, []).await; + + drop(txn3); + + { + let guard = storage.scan(Unbounded, Unbounded); let iter = guard.iter().await.unwrap(); assert_stream_eq( iter.map(Result::unwrap).map(Entry::into_tuple), - build_tuple_stream([("00", "2333")]), + build_tuple_stream([("test1", "233"), ("test2", "233")]), ) .await; } + + let txn4 = storage.new_txn().unwrap(); + assert_eq!( - storage.get_for_test(b"00").await.unwrap(), - Some(Bytes::from_static(b"2333")) + txn4.get_for_test(b"test1").await.unwrap(), + Some(Bytes::from("233")) ); assert_eq!( - storage.get_for_test(b"2").await.unwrap(), - 
Some(Bytes::from_static(b"2333")) + txn4.get_for_test(b"test2").await.unwrap(), + Some(Bytes::from("233")) ); + + assert_scan_iter( + &txn4, + Unbounded, + Unbounded, + [("test1", "233"), ("test2", "233")], + ) + .await; + + txn4.put_for_test(b"test2", b"2333").await.unwrap(); assert_eq!( - storage.get_for_test(b"3").await.unwrap(), - Some(Bytes::from_static(b"23333")) + txn4.get_for_test(b"test1").await.unwrap(), + Some(Bytes::from("233")) ); - assert_eq!(storage.get_for_test(b"4").await.unwrap(), None); - assert_eq!(storage.get_for_test(b"--").await.unwrap(), None); - assert_eq!(storage.get_for_test(b"555").await.unwrap(), None); + assert_eq!( + txn4.get_for_test(b"test2").await.unwrap(), + Some(Bytes::from("2333")) + ); + + assert_scan_iter( + &txn4, + Unbounded, + Unbounded, + [("test1", "233"), ("test2", "2333")], + ) + .await; + + txn4.delete_for_test(b"test2").await.unwrap(); + + assert_eq!( + txn4.get_for_test(b"test1").await.unwrap(), + Some(Bytes::from("233")) + ); + assert_eq!(txn4.get_for_test(b"test2").await.unwrap(), None); + + assert_scan_iter(&txn4, Unbounded, Unbounded, [("test1", "233")]).await; } - async fn build_storage(dir: &TempDir) -> anyhow::Result> { + async fn assert_scan_iter<'a, P: Persistent>( + snapshot: &'a Transaction<'a, P>, + lower: Bound<&[u8]>, + upper: Bound<&[u8]>, + expected: impl IntoIterator, + ) { + let guard = snapshot.scan(lower, upper); + let iter = guard.iter().await.unwrap(); + assert_stream_eq( + iter.map(Result::unwrap).map(Entry::into_tuple), + build_tuple_stream(expected), + ) + .await; + } + + async fn assert_mvcc_compaction_iter( + iter: impl Stream>, + expected: impl IntoIterator, + ) { + let iter = iter + .map(Result::unwrap) + .map(|entry| (entry.key.key, entry.value)); + let expected = expected + .into_iter() + .map(|(key, value)| (Bytes::from(key), Bytes::from(value))); + let expected = stream::iter(expected); + assert_stream_eq(iter, expected).await; + } + + #[tokio::test] + async fn test_serializable_1() { 
+ let dir = tempdir().unwrap(); + let storage = build_serializable_lsm(&dir).await; + + storage.put_for_test(b"key1", b"1").await.unwrap(); + storage.put_for_test(b"key2", b"2").await.unwrap(); + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn1.put_for_test(b"key1", &txn1.get_for_test(b"key2").await.unwrap().unwrap()) + .await + .unwrap(); + txn2.put_for_test(b"key2", &txn2.get_for_test(b"key1").await.unwrap().unwrap()) + .await + .unwrap(); + txn1.commit().await.unwrap(); + assert!(txn2.commit().await.is_err()); + assert_eq!( + storage.get_for_test(b"key1").await.unwrap(), + Some(Bytes::from("2")) + ); + assert_eq!( + storage.get_for_test(b"key2").await.unwrap(), + Some(Bytes::from("2")) + ); + } + + #[tokio::test] + async fn test_serializable_2() { + let dir = tempdir().unwrap(); + let storage = build_serializable_lsm(&dir).await; + + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn1.put_for_test(b"key1", b"1").await.unwrap(); + txn2.put_for_test(b"key1", b"2").await.unwrap(); + txn1.commit().await.unwrap(); + txn2.commit().await.unwrap(); + assert_eq!( + storage.get_for_test(b"key1").await.unwrap(), + Some(Bytes::from("2")) + ); + } + + #[tokio::test] + async fn test_serializable_3_ts_range() { + let dir = tempdir().unwrap(); + let storage = build_serializable_lsm(&dir).await; + + storage.put_for_test(b"key1", b"1").await.unwrap(); + storage.put_for_test(b"key2", b"2").await.unwrap(); + let txn1 = storage.new_txn().unwrap(); + txn1.put_for_test(b"key1", &txn1.get_for_test(b"key2").await.unwrap().unwrap()) + .await + .unwrap(); + txn1.commit().await.unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn2.put_for_test(b"key2", &txn2.get_for_test(b"key1").await.unwrap().unwrap()) + .await + .unwrap(); + txn2.commit().await.unwrap(); + assert_eq!( + storage.get_for_test(b"key1").await.unwrap(), + Some(Bytes::from("2")) + ); + assert_eq!( + storage.get_for_test(b"key2").await.unwrap(), + 
Some(Bytes::from("2")) + ); + } + + #[tokio::test] + async fn test_serializable_4_scan() { + let dir = tempdir().unwrap(); + let storage = build_serializable_lsm(&dir).await; + + storage.put_for_test(b"key1", b"1").await.unwrap(); + storage.put_for_test(b"key2", b"2").await.unwrap(); + let txn1 = storage.new_txn().unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn1.put_for_test(b"key1", &txn1.get_for_test(b"key2").await.unwrap().unwrap()) + .await + .unwrap(); + txn1.commit().await.unwrap(); + + { + let guard = txn2.scan(Unbounded, Unbounded); + let mut iter = guard.iter().await.unwrap(); + while let Some(entry) = iter.next().await { + // todo: check entry + let _entry = entry.unwrap(); + } + } + + txn2.put_for_test(b"key2", b"1").await.unwrap(); + assert!(txn2.commit().await.is_err()); + assert_eq!( + storage.get_for_test(b"key1").await.unwrap(), + Some(Bytes::from("2")) + ); + assert_eq!( + storage.get_for_test(b"key2").await.unwrap(), + Some(Bytes::from("2")) + ); + } + + #[tokio::test] + async fn test_serializable_5_read_only() { + let dir = tempdir().unwrap(); + let storage = build_serializable_lsm(&dir).await; + + storage.put_for_test(b"key1", b"1").await.unwrap(); + storage.put_for_test(b"key2", b"2").await.unwrap(); + let txn1 = storage.new_txn().unwrap(); + txn1.put_for_test(b"key1", &txn1.get_for_test(b"key2").await.unwrap().unwrap()) + .await + .unwrap(); + txn1.commit().await.unwrap(); + let txn2 = storage.new_txn().unwrap(); + txn2.get_for_test(b"key1").await.unwrap().unwrap(); + + { + let guard = txn2.scan(Unbounded, Unbounded); + let mut iter = guard.iter().await.unwrap(); + while let Some(entry) = iter.next().await { + // todo: check entry + let _entry = entry.unwrap(); + } + } + + txn2.commit().await.unwrap(); + assert_eq!( + storage.get_for_test(b"key1").await.unwrap(), + Some(Bytes::from("2")) + ); + assert_eq!( + storage.get_for_test(b"key2").await.unwrap(), + Some(Bytes::from("2")) + ); + } + + async fn build_serializable_lsm(dir: 
&TempDir) -> LsmStorageState { let persistent = LocalFs::new(dir.path().to_path_buf()); let options = SstOptions::builder() .target_sst_size(1024) .block_size(4096) .num_memtable_limit(1000) .compaction_option(Default::default()) - .enable_wal(false) + .enable_wal(true) + .enable_mvcc(true) + .serializable(true) .build(); - LsmStorageState::new(options, persistent).await + + LsmStorageState::new(options, persistent).await.unwrap() } + + // todo: week 3, day 7 test + // #[test] + // fn test_task3_mvcc_compaction() { + // let dir = tempdir().unwrap(); + // let options = LsmStorageOptions::default_for_week2_test(CompactionOptions::NoCompaction); + // let storage = MiniLsm::open(&dir, options.clone()).unwrap(); + // storage + // .write_batch(&[ + // WriteBatchRecord::Put("table1_a", "1"), + // WriteBatchRecord::Put("table1_b", "1"), + // WriteBatchRecord::Put("table1_c", "1"), + // WriteBatchRecord::Put("table2_a", "1"), + // WriteBatchRecord::Put("table2_b", "1"), + // WriteBatchRecord::Put("table2_c", "1"), + // ]) + // .unwrap(); + // storage.force_flush().unwrap(); + // let snapshot0 = storage.new_txn().unwrap(); + // storage + // .write_batch(&[ + // WriteBatchRecord::Put("table1_a", "2"), + // WriteBatchRecord::Del("table1_b"), + // WriteBatchRecord::Put("table1_c", "2"), + // WriteBatchRecord::Put("table2_a", "2"), + // WriteBatchRecord::Del("table2_b"), + // WriteBatchRecord::Put("table2_c", "2"), + // ]) + // .unwrap(); + // storage.force_flush().unwrap(); + // storage.add_compaction_filter(CompactionFilter::Prefix(Bytes::from("table2_"))); + // storage.force_full_compaction().unwrap(); + // + // let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + // check_iter_result_by_key( + // &mut iter, + // vec![ + // (Bytes::from("table1_a"), Bytes::from("2")), + // (Bytes::from("table1_a"), Bytes::from("1")), + // (Bytes::from("table1_b"), Bytes::new()), + // (Bytes::from("table1_b"), Bytes::from("1")), + // (Bytes::from("table1_c"), 
Bytes::from("2")), + // (Bytes::from("table1_c"), Bytes::from("1")), + // (Bytes::from("table2_a"), Bytes::from("2")), + // (Bytes::from("table2_b"), Bytes::new()), + // (Bytes::from("table2_c"), Bytes::from("2")), + // ], + // ); + // + // drop(snapshot0); + // + // storage.force_full_compaction().unwrap(); + // + // let mut iter = construct_merge_iterator_over_storage(&storage.inner.state.read()); + // check_iter_result_by_key( + // &mut iter, + // vec![ + // (Bytes::from("table1_a"), Bytes::from("2")), + // (Bytes::from("table1_c"), Bytes::from("2")), + // ], + // ); + // } } diff --git a/src/state/write_batch.rs b/src/state/write_batch.rs new file mode 100644 index 0000000..ebf25e4 --- /dev/null +++ b/src/state/write_batch.rs @@ -0,0 +1,24 @@ +use crate::entry::Keyed; +use bytes::Bytes; + +#[derive(Debug, Clone)] +pub enum WriteBatchRecord { + Put(Bytes, Bytes), + Del(Bytes), +} + +impl WriteBatchRecord { + pub fn get_key(&self) -> &Bytes { + match self { + WriteBatchRecord::Put(key, _) => key, + WriteBatchRecord::Del(key) => key, + } + } + + pub fn into_keyed(self) -> Keyed { + match self { + WriteBatchRecord::Put(key, value) => Keyed::new(key, value), + WriteBatchRecord::Del(key) => Keyed::new(key, Bytes::new()), + } + } +} diff --git a/src/test_utils/command.rs b/src/test_utils/command.rs deleted file mode 100644 index 96fcb12..0000000 --- a/src/test_utils/command.rs +++ /dev/null @@ -1,5 +0,0 @@ -pub enum Command { - Get { key: Vec }, - Put { key: Vec, value: Vec }, - Delete { key: Vec }, -} diff --git a/src/test_utils/iterator.rs b/src/test_utils/iterator.rs new file mode 100644 index 0000000..a92dd49 --- /dev/null +++ b/src/test_utils/iterator.rs @@ -0,0 +1,14 @@ +use crate::entry::{Entry, InnerEntry}; +use futures::{Stream, StreamExt}; + +pub fn unwrap_ts_stream(s: S) -> impl Stream> +where + S: Stream>, +{ + s.map(|item| { + item.map(|entry| Entry { + key: entry.key.into_inner(), + value: entry.value, + }) + }) +} diff --git a/src/test_utils/map.rs 
b/src/test_utils/map.rs deleted file mode 100644 index 014c2fa..0000000 --- a/src/test_utils/map.rs +++ /dev/null @@ -1 +0,0 @@ -pub struct Map {} diff --git a/src/test_utils/mod.rs b/src/test_utils/mod.rs index a119e18..da5dd00 100644 --- a/src/test_utils/mod.rs +++ b/src/test_utils/mod.rs @@ -1,12 +1,13 @@ -use std::error::Error; +#[cfg(test)] use std::ops::Range; -use crate::persistent::Persistent; +#[cfg(test)] use crate::state::Map; -mod command; -mod map; +#[cfg(test)] +pub mod iterator; +#[cfg(test)] pub async fn insert_sst>( state: &M, range: Range, diff --git a/src/time/mod.rs b/src/time/mod.rs new file mode 100644 index 0000000..8130392 --- /dev/null +++ b/src/time/mod.rs @@ -0,0 +1,28 @@ +use std::sync::atomic::{AtomicU64, Ordering}; +use std::time::UNIX_EPOCH; + +pub trait TimeProvider: Send + Sync + 'static { + fn now(&self) -> u64; +} + +#[derive(Default)] +pub struct TimeIncrement(AtomicU64); + +impl TimeProvider for TimeIncrement { + fn now(&self) -> u64 { + self.0.fetch_add(1, Ordering::Relaxed) + } +} + +#[derive(Default)] +pub struct SystemTime; + +impl TimeProvider for SystemTime { + fn now(&self) -> u64 { + // todo: remove unwrap? 
+ std::time::SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs() + } +} diff --git a/src/utils/func.rs b/src/utils/func.rs deleted file mode 100644 index 10a9f6a..0000000 --- a/src/utils/func.rs +++ /dev/null @@ -1 +0,0 @@ -pub fn do_nothing(_: T) {} diff --git a/src/utils/mod.rs b/src/utils/mod.rs index ae9e6a4..0c7cfa1 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,3 +1,4 @@ -pub mod func; pub mod num; +pub mod scoped; +pub mod send; pub mod vec; diff --git a/src/utils/scoped.rs b/src/utils/scoped.rs new file mode 100644 index 0000000..d8e8516 --- /dev/null +++ b/src/utils/scoped.rs @@ -0,0 +1,26 @@ +use parking_lot::{Mutex, MutexGuard}; + +#[derive(Debug, Default)] +pub struct ScopedMutex { + inner: Mutex, +} + +impl ScopedMutex { + pub fn new(t: T) -> Self { + Self { + inner: Mutex::new(t), + } + } + + pub fn lock_with(&self, f: F) -> B + where + F: FnOnce(MutexGuard) -> B, + { + let guard = self.inner.lock(); + f(guard) + } + + pub fn into_inner(self) -> T { + self.inner.into_inner() + } +} diff --git a/src/utils/send.rs b/src/utils/send.rs new file mode 100644 index 0000000..8f7380e --- /dev/null +++ b/src/utils/send.rs @@ -0,0 +1,3 @@ +pub fn assert_send(x: T) -> T { + x +} diff --git a/src/wal.rs b/src/wal.rs index 49cd7c1..8205d6a 100644 --- a/src/wal.rs +++ b/src/wal.rs @@ -1,18 +1,15 @@ -use std::future::Future; -use std::io::Cursor; -use std::ops::DerefMut; -use std::path::Path; -use std::sync::Arc; - +use crate::key::KeyBytes; use bytes::{Buf, Bytes}; use crossbeam_skiplist::SkipMap; -use tokio::fs::{File, OpenOptions}; -use tokio::io::{AsyncReadExt, AsyncWriteExt}; -use tokio::sync::Mutex; -use tracing_futures::Instrument; +use std::io::Cursor; +use std::iter; +use std::sync::Arc; +use crate::entry::Keyed; use crate::persistent::interface::WalHandle; use crate::persistent::Persistent; +use tokio::io::{AsyncReadExt, AsyncWriteExt}; +use tokio::sync::Mutex; pub struct Wal { file: Arc>, @@ -33,7 +30,7 @@ impl Wal { pub 
async fn recover>( id: usize, persistent: &P, - ) -> anyhow::Result<(Self, SkipMap)> { + ) -> anyhow::Result<(Self, SkipMap)> { let mut file = persistent.open_wal_handle(id).await?; let data = { let mut data = Vec::new(); @@ -45,8 +42,12 @@ impl Wal { while data.has_remaining() { let key_len = data.get_u32(); let key = data.copy_to_bytes(key_len as usize); + let timestamp = data.get_u64(); + let key = KeyBytes::new(key, timestamp); + let value_len = data.get_u32(); let value = data.copy_to_bytes(value_len as usize); + map.insert(key, value); } let wal = Wal { @@ -55,50 +56,52 @@ impl Wal { Ok((wal, map)) } - pub async fn put<'a>(&'a self, key: &'a [u8], value: &'a [u8]) -> anyhow::Result<()> { + pub async fn put_batch( + &self, + entries: impl Iterator> + Send, + timestamp: u64, + ) -> anyhow::Result<()> { + // todo: atomic wal let mut guard = self.file.lock().await; - guard - .write_u32(key.len() as u32) - .instrument(tracing::info_span!("wal_put_write_key_len")) - .await?; - guard - .write_all(key) - .instrument(tracing::info_span!("wal_put_write_all_key")) - .await?; - guard - .write_u32(value.len() as u32) - .instrument(tracing::info_span!("wal_put_write_value_len")) - .await?; - guard - .write_all(value) - .instrument(tracing::info_span!("wal_put_write_all_value")) - .await?; - guard - .flush() - .instrument(tracing::info_span!("wal_put_flush")) - .await?; + for entry in entries { + let key = entry.key; + guard.write_u32(key.len() as u32).await?; + guard.write_all(key).await?; + guard.write_u64(timestamp).await?; + + let value = entry.value; + guard.write_u32(value.len() as u32).await?; + guard.write_all(value).await?; + } + guard.flush().await?; + guard.sync_all().await?; + Ok(()) } pub async fn sync(&self) -> anyhow::Result<()> { - let mut guard = self.file.lock().await; + let guard = self.file.lock().await; guard.sync_all().await?; Ok(()) } } -async fn get_file(path: impl AsRef) -> anyhow::Result { - let file = OpenOptions::new() - .create(true) - 
.append(true) - .open(path) - .await?; - Ok(file) +impl Wal { + pub async fn put_for_test<'a>( + &'a self, + key: &'a [u8], + ts: u64, + value: &'a [u8], + ) -> anyhow::Result<()> { + let entry = Keyed::new(key, value); + self.put_batch(iter::once(entry), ts).await + } } #[cfg(test)] mod tests { - use bytes::Bytes; + use crate::key::KeyBytes; + use tempfile::tempdir; use crate::persistent::LocalFs; @@ -112,21 +115,46 @@ mod tests { { let wal = Wal::create(id, &persistent).await.unwrap(); - wal.put("111".as_bytes(), "a".as_bytes()).await.unwrap(); - wal.put("222".as_bytes(), "bb".as_bytes()).await.unwrap(); - wal.put("333".as_bytes(), "ccc".as_bytes()).await.unwrap(); - wal.put("4".as_bytes(), "".as_bytes()).await.unwrap(); + wal.put_for_test("111".as_bytes(), 123, "a".as_bytes()) + .await + .unwrap(); + wal.put_for_test("222".as_bytes(), 234, "bb".as_bytes()) + .await + .unwrap(); + wal.put_for_test("333".as_bytes(), 345, "ccc".as_bytes()) + .await + .unwrap(); + wal.put_for_test("4".as_bytes(), 456, "".as_bytes()) + .await + .unwrap(); wal.sync().await.unwrap(); } { - let (wal, map) = Wal::recover(id, &persistent).await.unwrap(); - - assert_eq!(map.get(&Bytes::from("111")).unwrap().value(), "a"); - assert_eq!(map.get(&Bytes::from("222")).unwrap().value(), "bb"); - assert_eq!(map.get(&Bytes::from("333")).unwrap().value(), "ccc"); - assert_eq!(map.get(&Bytes::from("4")).unwrap().value(), ""); - assert!(map.get(&Bytes::from("555")).is_none()); + let (_wal, map) = Wal::recover(id, &persistent).await.unwrap(); + assert_eq!( + map.get(&KeyBytes::new_for_test(b"111", 123)) + .unwrap() + .value(), + "a" + ); + assert_eq!( + map.get(&KeyBytes::new_for_test(b"222", 234)) + .unwrap() + .value(), + "bb" + ); + assert_eq!( + map.get(&KeyBytes::new_for_test(b"333", 345)) + .unwrap() + .value(), + "ccc" + ); + assert_eq!( + map.get(&KeyBytes::new_for_test(b"4", 456)).unwrap().value(), + "" + ); + assert!(map.get(&KeyBytes::new_for_test(b"555", 0)).is_none()); } } }