From 4ee60acb5ac716427654c22c18419a401f7b13c6 Mon Sep 17 00:00:00 2001 From: HeartLinked Date: Tue, 24 Sep 2024 16:35:52 +0800 Subject: [PATCH] [week1] 1.4 task1 --- mini-lsm-starter/src/block.rs | 28 +++++++-- mini-lsm-starter/src/block/builder.rs | 1 - mini-lsm-starter/src/block/iterator.rs | 2 +- mini-lsm-starter/src/lsm_iterator.rs | 3 - mini-lsm-starter/src/lsm_storage.rs | 7 +-- mini-lsm-starter/src/mem_table.rs | 2 - mini-lsm-starter/src/table.rs | 80 ++++++++++++++++++++++-- mini-lsm-starter/src/table/builder.rs | 85 ++++++++++++++++++++++---- mini-lsm-starter/src/tests.rs | 1 + 9 files changed, 178 insertions(+), 31 deletions(-) diff --git a/mini-lsm-starter/src/block.rs b/mini-lsm-starter/src/block.rs index 14704d7..9089bb5 100644 --- a/mini-lsm-starter/src/block.rs +++ b/mini-lsm-starter/src/block.rs @@ -1,13 +1,9 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - mod builder; mod iterator; pub use builder::BlockBuilder; use bytes::{Buf, BufMut, Bytes, BytesMut}; pub use iterator::BlockIterator; -use std::mem::size_of; /// A block is the smallest unit of read and caching in LSM tree. It is a collection of sorted key-value pairs. pub struct Block { @@ -52,4 +48,28 @@ impl Block { offsets, } } + + pub fn first_key(&self) -> Option { + let offset = match self.offsets.first() { + Some(offset) => *offset as usize, + None => return None, + }; + + let mut data = &self.data[offset..]; + let len = data.get_u16() as usize; + + Some(Bytes::copy_from_slice(&data[..len])) + } + + pub fn last_key(&self) -> Option { + let offset = match self.offsets.last() { + Some(offset) => *offset as usize, + None => return None, + }; + + let mut data = &self.data[offset..]; + let len = data.get_u16() as usize; + + Some(Bytes::copy_from_slice(&data[..len])) + } } diff --git a/mini-lsm-starter/src/block/builder.rs b/mini-lsm-starter/src/block/builder.rs index 5b5a786..ba01bcf 100644 --- a/mini-lsm-starter/src/block/builder.rs +++ b/mini-lsm-starter/src/block/builder.rs @@ -38,7 +38,6 @@ impl BlockBuilder { if self.offsets.len() * 2 + self.data.len() + size_add + NUM_OF_ELEMENTS_LEN > self.block_size { - println!("---------OVERFLOW!--------------"); return false; } } diff --git a/mini-lsm-starter/src/block/iterator.rs b/mini-lsm-starter/src/block/iterator.rs index d8b9520..06090a2 100644 --- a/mini-lsm-starter/src/block/iterator.rs +++ b/mini-lsm-starter/src/block/iterator.rs @@ -21,7 +21,7 @@ pub struct BlockIterator { } impl BlockIterator { - fn new(block: Arc) -> Self { + pub fn new(block: Arc) -> Self { let mut iter = Self { block, key: KeyVec::new(), diff --git a/mini-lsm-starter/src/lsm_iterator.rs b/mini-lsm-starter/src/lsm_iterator.rs index 8c3b16a..1eea6c1 100644 --- a/mini-lsm-starter/src/lsm_iterator.rs +++ b/mini-lsm-starter/src/lsm_iterator.rs @@ -1,6 +1,3 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - use anyhow::{anyhow, Result}; use crate::{ diff --git a/mini-lsm-starter/src/lsm_storage.rs b/mini-lsm-starter/src/lsm_storage.rs index 2b5c062..6757753 100644 --- a/mini-lsm-starter/src/lsm_storage.rs +++ b/mini-lsm-starter/src/lsm_storage.rs @@ -1,5 +1,3 @@ -#![allow(dead_code)] // REMOVE THIS LINE after fully implementing this functionality - use std::collections::HashMap; use std::ops::Bound; use std::path::{Path, PathBuf}; @@ -16,10 +14,9 @@ use crate::compact::{ SimpleLeveledCompactionController, SimpleLeveledCompactionOptions, TieredCompactionController, }; use crate::iterators::merge_iterator::MergeIterator; -use crate::iterators::StorageIterator; use crate::lsm_iterator::{FusedIterator, LsmIterator}; use crate::manifest::Manifest; -use crate::mem_table::{MemTable, MemTableIterator}; +use crate::mem_table::MemTable; use crate::mvcc::LsmMvccInner; use crate::table::SsTable; @@ -439,7 +436,7 @@ impl LsmStorageInner { } // 用 vec 创建 let lsm_iterator_inner = MergeIterator::create(iters); - let mut iter = LsmIterator::new(lsm_iterator_inner)?; + let iter = LsmIterator::new(lsm_iterator_inner)?; Ok(FusedIterator::new(iter)) } } diff --git a/mini-lsm-starter/src/mem_table.rs b/mini-lsm-starter/src/mem_table.rs index 951d61d..de79162 100644 --- a/mini-lsm-starter/src/mem_table.rs +++ b/mini-lsm-starter/src/mem_table.rs @@ -1,5 +1,3 @@ -#![allow(dead_code)] // REMOVE THIS LINE after fully implementing this functionality - use std::ops::Bound; use std::path::Path; use std::sync::atomic::AtomicUsize; diff --git a/mini-lsm-starter/src/table.rs b/mini-lsm-starter/src/table.rs index 02baaa2..f06dd11 100644 --- a/mini-lsm-starter/src/table.rs +++ b/mini-lsm-starter/src/table.rs @@ -5,11 +5,13 @@ pub(crate) mod bloom; mod builder; mod iterator; +use bytes::Bytes; use std::fs::File; +use std::io::{Read, Seek}; use std::path::Path; use std::sync::Arc; -use anyhow::Result; +use anyhow::{anyhow, Result}; pub use builder::SsTableBuilder; use bytes::Buf; pub use iterator::SsTableIterator; @@ -21,6 +23,8 @@ use crate::lsm_storage::BlockCache; use self::bloom::Bloom; #[derive(Clone, Debug, PartialEq, Eq)] +/// you will need to maintain block metadata BlockMeta, +/// which includes the first/last keys in each block and the offsets of each block. pub struct BlockMeta { /// Offset of this data block. pub offset: usize, @@ -39,12 +43,47 @@ impl BlockMeta { #[allow(clippy::ptr_arg)] // remove this allow after you finish buf: &mut Vec, ) { - unimplemented!() + for meta in block_meta { + // 编码 offset,使用 8 字节(u64)来存储 + buf.extend_from_slice(&(meta.offset as u64).to_le_bytes()); + // 编码 first_key 的长度和内容 + let first_key_len = meta.first_key.raw_ref().len() as u64; + buf.extend_from_slice(&first_key_len.to_le_bytes()); + buf.extend_from_slice(meta.first_key.raw_ref()); + // 编码 last_key 的长度和内容 + let last_key_len = meta.last_key.raw_ref().len() as u64; + buf.extend_from_slice(&last_key_len.to_le_bytes()); + buf.extend_from_slice(meta.last_key.raw_ref()); + } } /// Decode block meta from a buffer. pub fn decode_block_meta(buf: impl Buf) -> Vec { - unimplemented!() + let mut block_metas = Vec::new(); + let mut buf = buf; // 不需要将 `buf` 声明为可变,因为 Buf trait 是内部可变的 + while buf.remaining() > 0 { + // 解码 offset(8 字节) + let offset = buf.get_u64_le() as usize; + + // 解码 first_key 的长度(8 字节),然后读取 first_key + let first_key_len = buf.get_u64_le() as usize; + let mut first_key = vec![0; first_key_len]; + buf.copy_to_slice(&mut first_key); + + // 解码 last_key 的长度(8 字节),然后读取 last_key + let last_key_len = buf.get_u64_le() as usize; + let mut last_key = vec![0; last_key_len]; + buf.copy_to_slice(&mut last_key); + + // 构建 BlockMeta 并推入列表 + block_metas.push(BlockMeta { + offset, + first_key: KeyBytes::from_bytes(Bytes::from(first_key)), + last_key: KeyBytes::from_bytes(Bytes::from(last_key)), + }); + } + + block_metas } } @@ -108,7 +147,40 @@ impl SsTable { /// Open SSTable from a file. pub fn open(id: usize, block_cache: Option>, file: FileObject) -> Result { - unimplemented!() + let (mut file, file_size) = match file.0 { + Some(f) => (f, file.1), + None => return Err(anyhow!("file not exists")), + }; + + let mut bytes = vec![0; file_size as usize]; + file.read_exact(&mut bytes)?; + + const U32_SIZE: usize = size_of::(); + let block_meta_offset = (&bytes[bytes.len() - U32_SIZE..]).get_u32() as usize; + + let block_meta = &bytes[block_meta_offset..bytes.len() - U32_SIZE]; + let block_meta = BlockMeta::decode_block_meta(block_meta); + let first_key = block_meta + .first() + .map(|meta| meta.first_key.clone()) + .unwrap_or_default(); + let last_key = block_meta + .last() + .map(|meta| meta.last_key.clone()) + .unwrap_or_default(); + + file.rewind()?; + Ok(Self { + file: FileObject(Some(file), file_size), + block_meta, + block_meta_offset, + id, + block_cache, + first_key, + last_key, + bloom: None, + max_ts: 0, + }) } /// Create a mock SST with only first key + last key metadata diff --git a/mini-lsm-starter/src/table/builder.rs b/mini-lsm-starter/src/table/builder.rs index cea3d08..449189a 100644 --- a/mini-lsm-starter/src/table/builder.rs +++ b/mini-lsm-starter/src/table/builder.rs @@ -1,36 +1,66 @@ -#![allow(unused_variables)] // TODO(you): remove this lint after implementing this mod -#![allow(dead_code)] // TODO(you): remove this lint after implementing this mod - use std::path::Path; use std::sync::Arc; use anyhow::Result; +use bytes::BufMut; -use super::{BlockMeta, SsTable}; +use super::{BlockMeta, FileObject, SsTable}; +use crate::key::KeyBytes; use crate::{block::BlockBuilder, key::KeySlice, lsm_storage::BlockCache}; /// Builds an SSTable from key-value pairs. + pub struct SsTableBuilder { builder: BlockBuilder, first_key: Vec, last_key: Vec, - data: Vec, - pub(crate) meta: Vec, + data: Vec, // 目前打算只在块满和 build 时将 data block 写入该字段 + pub(crate) meta: Vec, // 每个块的首尾键(first/last keys)和每个块的偏移量(offsets) block_size: usize, } impl SsTableBuilder { /// Create a builder based on target block size. pub fn new(block_size: usize) -> Self { - unimplemented!() + Self { + builder: BlockBuilder::new(block_size), + first_key: Vec::new(), + last_key: Vec::new(), + data: Vec::new(), + meta: Vec::new(), + block_size, + } } /// Adds a key-value pair to SSTable. - /// /// Note: You should split a new block when the current block is full.(`std::mem::replace` may /// be helpful here) pub fn add(&mut self, key: KeySlice, value: &[u8]) { - unimplemented!() + if self.first_key.is_empty() { + self.first_key = key.raw_ref().to_vec(); + } + self.last_key = key.raw_ref().to_vec(); + if self.builder.add(key, value) == false { + // block 满,添加失败,创建新 block + self.split_new_block(); + } + let _ = self.builder.add(key, value); + } + + fn split_new_block(&mut self) { + let new_builder = BlockBuilder::new(self.block_size); + let old_builder = std::mem::replace(&mut self.builder, new_builder); + let old_block = old_builder.build(); + // 将 old block 的数据作为一个新的 data block 写入 SsTableBuilder 的 data 字段 + // 更新 SsTableBuilder 的 meta 字段,即我们需要创建一个新的 BlockMeta + let block_meta = BlockMeta { + offset: self.data.len(), + first_key: KeyBytes::from_bytes(old_block.first_key().unwrap_or_default()), + last_key: KeyBytes::from_bytes(old_block.last_key().unwrap_or_default()), + }; + // old_block.encode() 返回 block 的 Bytes,AsRef 转换为 &[u8] slice + self.meta.push(block_meta); + self.data.extend_from_slice(&old_block.encode()); } /// Get the estimated size of the SSTable. @@ -38,7 +68,7 @@ impl SsTableBuilder { /// Since the data blocks contain much more data than meta blocks, just return the size of data /// blocks here. pub fn estimated_size(&self) -> usize { - unimplemented!() + self.data.len() } /// Builds the SSTable and writes it to the given path. Use the `FileObject` structure to manipulate the disk objects. @@ -48,7 +78,40 @@ impl SsTableBuilder { block_cache: Option>, path: impl AsRef, ) -> Result { - unimplemented!() + let mut sst_builder = self; + sst_builder.split_new_block(); + + let mut bytes = vec![]; + bytes.extend_from_slice(&sst_builder.data); + + let block_meta_offset = bytes.len(); + let first_key = sst_builder + .meta + .first() + .map(|meta| meta.first_key.clone()) + .unwrap_or_default(); + let last_key = sst_builder + .meta + .last() + .map(|meta| meta.last_key.clone()) + .unwrap_or_default(); + BlockMeta::encode_block_meta(&sst_builder.meta, &mut bytes); + + bytes.put_u32(block_meta_offset as u32); + + let file = FileObject::create(path.as_ref(), bytes)?; + + Ok(SsTable { + file, + block_meta: sst_builder.meta, + block_meta_offset, + id, + block_cache, + first_key, + last_key, + bloom: None, + max_ts: 0, + }) } #[cfg(test)] diff --git a/mini-lsm-starter/src/tests.rs b/mini-lsm-starter/src/tests.rs index 69007ca..590f76e 100644 --- a/mini-lsm-starter/src/tests.rs +++ b/mini-lsm-starter/src/tests.rs @@ -5,3 +5,4 @@ mod harness; mod week1_day1; mod week1_day2; mod week1_day3; +mod week1_day4;