Skip to content

Commit

Permalink
separate week 1 solution
Browse files Browse the repository at this point in the history
Signed-off-by: Alex Chi Z <[email protected]>
  • Loading branch information
skyzh committed Jan 16, 2024
1 parent a5c8a06 commit 327f6ba
Show file tree
Hide file tree
Showing 26 changed files with 2,176 additions and 0 deletions.
15 changes: 15 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[workspace]
members = [
"mini-lsm",
"mini-lsm-week-1",
"xtask",
"mini-lsm-starter",
]
Expand Down
22 changes: 22 additions & 0 deletions mini-lsm-week-1/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[package]
name = "mini-lsm-week-1"
version = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
keywords = { workspace = true }
license = { workspace = true }
repository = { workspace = true }
description = "A tutorial for building an LSM tree storage engine in a week."

[dependencies]
anyhow = "1"
arc-swap = "1"
bytes = "1"
crossbeam-epoch = "0.9"
crossbeam-skiplist = "0.1"
parking_lot = "0.12"
ouroboros = "0.15"
moka = "0.9"

[dev-dependencies]
tempfile = "3"
1 change: 1 addition & 0 deletions mini-lsm-week-1/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# mini-lsm week-1 solution
46 changes: 46 additions & 0 deletions mini-lsm-week-1/src/block.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
mod builder;
mod iterator;

pub use builder::BlockBuilder;
use bytes::{Buf, BufMut, Bytes};
pub use iterator::BlockIterator;

/// Size in bytes of a `u16`, the integer width used for the entry offsets and the trailing
/// element count in an encoded block.
pub const SIZEOF_U16: usize = std::mem::size_of::<u16>();

/// A block is the smallest unit of read and caching in an LSM tree. It is a collection of sorted
/// key-value pairs.
///
/// `encode` serializes a block as the entry data, followed by the offset array, followed by the
/// number of offsets; `decode` reverses that layout.
pub struct Block {
/// Serialized key-value entries, stored back to back.
data: Vec<u8>,
/// Starting position of each entry inside `data`, one element per entry.
offsets: Vec<u16>,
}

impl Block {
pub fn encode(&self) -> Bytes {
let mut buf = self.data.clone();
let offsets_len = self.offsets.len();
for offset in &self.offsets {
buf.put_u16(*offset);
}
// Adds number of elements at the end of the block
buf.put_u16(offsets_len as u16);
buf.into()
}

pub fn decode(data: &[u8]) -> Self {
// get number of elements in the block
let entry_offsets_len = (&data[data.len() - SIZEOF_U16..]).get_u16() as usize;
let data_end = data.len() - SIZEOF_U16 - entry_offsets_len * SIZEOF_U16;
let offsets_raw = &data[data_end..data.len() - SIZEOF_U16];
// get offset array
let offsets = offsets_raw
.chunks(SIZEOF_U16)
.map(|mut x| x.get_u16())
.collect();
// retrieve data
let data = data[0..data_end].to_vec();
Self { data, offsets }
}
}

#[cfg(test)]
mod tests;
67 changes: 67 additions & 0 deletions mini-lsm-week-1/src/block/builder.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
use bytes::BufMut;

use super::{Block, SIZEOF_U16};

/// Builds a block by appending key-value pairs one at a time.
pub struct BlockBuilder {
/// Offset of each key-value entry within `data`.
offsets: Vec<u16>,
/// All serialized key-value pairs in the block.
data: Vec<u8>,
/// The expected block size in bytes; `add` rejects an entry that would push a non-empty
/// block past this size.
block_size: usize,
}

impl BlockBuilder {
    /// Creates a new block builder that aims for blocks of about `block_size` bytes.
    pub fn new(block_size: usize) -> Self {
        Self {
            offsets: Vec::new(),
            data: Vec::new(),
            block_size,
        }
    }

    /// Returns the encoded size, in bytes, of the block built so far.
    fn estimated_size(&self) -> usize {
        SIZEOF_U16 /* number of key-value pairs in the block */
            + self.offsets.len() * SIZEOF_U16 /* offsets */
            + self.data.len() /* key-value pairs */
    }

    /// Adds a key-value pair to the block. Returns false when the block is full.
    ///
    /// The first pair is always accepted, even if it alone exceeds the expected block size.
    /// The block also counts as full when the next entry's offset can no longer be stored in
    /// a `u16`.
    ///
    /// # Panics
    ///
    /// Panics if `key` is empty, or if `key` or `value` is longer than `u16::MAX` bytes,
    /// because their lengths are encoded as `u16`.
    #[must_use]
    pub fn add(&mut self, key: &[u8], value: &[u8]) -> bool {
        const MAX_U16: usize = u16::MAX as usize;

        assert!(!key.is_empty(), "key must not be empty");
        // Each entry costs its payload plus three u16 fields: key_len, value_len and offset.
        let entry_size = key.len() + value.len() + SIZEOF_U16 * 3;
        if self.estimated_size() + entry_size > self.block_size && !self.is_empty() {
            return false;
        }
        // Offsets are stored as u16. If the data section has already grown past that range,
        // the block cannot address another entry, so report it as full instead of silently
        // truncating the offset. The block is necessarily non-empty in this case.
        if self.data.len() > MAX_U16 {
            return false;
        }
        // Lengths are stored as u16 as well; a silent truncation here would corrupt the block.
        assert!(key.len() <= MAX_U16, "key length must fit in u16");
        assert!(value.len() <= MAX_U16, "value length must fit in u16");

        // Add the offset of the data into the offset array.
        self.offsets.push(self.data.len() as u16);
        // Encode key length.
        self.data.put_u16(key.len() as u16);
        // Encode key content.
        self.data.put(key);
        // Encode value length.
        self.data.put_u16(value.len() as u16);
        // Encode value content.
        self.data.put(value);
        true
    }

    /// Check if there are no key-value pairs in the block.
    pub fn is_empty(&self) -> bool {
        self.offsets.is_empty()
    }

    /// Finalize the block.
    ///
    /// # Panics
    ///
    /// Panics if no key-value pair has been added.
    pub fn build(self) -> Block {
        assert!(!self.is_empty(), "block should not be empty");
        Block {
            data: self.data,
            offsets: self.offsets,
        }
    }
}
117 changes: 117 additions & 0 deletions mini-lsm-week-1/src/block/iterator.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
use std::sync::Arc;

use bytes::Buf;

use super::Block;

/// Iterates over the key-value entries of a block.
pub struct BlockIterator {
/// Shared reference to the block being iterated.
block: Arc<Block>,
/// Copy of the key at the current iterator position; empty when the iterator is invalid.
key: Vec<u8>,
/// Copy of the value at the current iterator position.
value: Vec<u8>,
/// Index of the current entry in the block's offset array.
idx: usize,
}

impl BlockIterator {
    /// Builds an iterator that is not yet positioned on any entry.
    fn new(block: Arc<Block>) -> Self {
        Self {
            block,
            key: Vec::new(),
            value: Vec::new(),
            idx: 0,
        }
    }

    /// Creates a block iterator positioned on the first entry.
    pub fn create_and_seek_to_first(block: Arc<Block>) -> Self {
        let mut it = Self::new(block);
        it.seek_to_first();
        it
    }

    /// Creates a block iterator positioned on the first key that is >= `key`.
    pub fn create_and_seek_to_key(block: Arc<Block>, key: &[u8]) -> Self {
        let mut it = Self::new(block);
        it.seek_to_key(key);
        it
    }

    /// Returns the key of the current entry.
    pub fn key(&self) -> &[u8] {
        debug_assert!(!self.key.is_empty(), "invalid iterator");
        self.key.as_slice()
    }

    /// Returns the value of the current entry.
    pub fn value(&self) -> &[u8] {
        debug_assert!(!self.key.is_empty(), "invalid iterator");
        self.value.as_slice()
    }

    /// Returns true if the iterator is valid, i.e. it currently holds a non-empty key.
    pub fn is_valid(&self) -> bool {
        !self.key.is_empty()
    }

    /// Seeks to the first key in the block.
    pub fn seek_to_first(&mut self) {
        self.seek_to(0);
    }

    /// Seeks to the idx-th key in the block.
    ///
    /// An out-of-range `idx` invalidates the iterator by clearing the cached key and value;
    /// the stored index is left untouched in that case.
    fn seek_to(&mut self, idx: usize) {
        match self.block.offsets.get(idx).copied() {
            Some(offset) => {
                self.seek_to_offset(offset as usize);
                self.idx = idx;
            }
            None => {
                self.key.clear();
                self.value.clear();
            }
        }
    }

    /// Moves to the next key in the block.
    pub fn next(&mut self) {
        let following = self.idx + 1;
        self.idx = following;
        self.seek_to(following);
    }

    /// Decodes the entry starting at `offset` in the block data and caches its key and value.
    /// Updating the index is left to the caller.
    fn seek_to_offset(&mut self, offset: usize) {
        let mut cursor = &self.block.data[offset..];

        // `get_u16()` consumes the two length bytes, leaving `cursor` at the key content.
        let key_len = cursor.get_u16() as usize;
        let key_bytes = &cursor[..key_len];
        self.key.clear();
        self.key.extend_from_slice(key_bytes);
        cursor.advance(key_len);

        // The value follows the key using the same length-prefixed layout.
        let value_len = cursor.get_u16() as usize;
        let value_bytes = &cursor[..value_len];
        self.value.clear();
        self.value.extend_from_slice(value_bytes);
    }

    /// Seeks to the first key that is >= `key`, using binary search over the entries.
    /// The iterator becomes invalid when every key in the block is smaller than `key`.
    pub fn seek_to_key(&mut self, key: &[u8]) {
        use std::cmp::Ordering;

        let (mut lo, mut hi) = (0, self.block.offsets.len());
        while lo < hi {
            let mid = lo + (hi - lo) / 2;
            self.seek_to(mid);
            assert!(self.is_valid());
            match self.key().cmp(key) {
                // Exact hit: the iterator is already positioned on it.
                Ordering::Equal => return,
                Ordering::Less => lo = mid + 1,
                Ordering::Greater => hi = mid,
            }
        }
        // `lo` is now the index of the first entry whose key is greater than `key`.
        self.seek_to(lo);
    }
}
Loading

0 comments on commit 327f6ba

Please sign in to comment.