feat(storage): add Bitcask storage migration, binary WAL, and data compaction support
This commit is contained in:
39
rust/Cargo.lock
generated
39
rust/Cargo.lock
generated
@@ -275,6 +275,12 @@ dependencies = [
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a043dc74da1e37d6afe657061213aa6f425f855399a11d3463c6ecccc4dfda1f"
|
||||
|
||||
[[package]]
|
||||
name = "find-msvc-tools"
|
||||
version = "0.1.9"
|
||||
@@ -477,6 +483,12 @@ dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.14"
|
||||
@@ -802,6 +814,7 @@ dependencies = [
|
||||
"dashmap",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
"tracing",
|
||||
@@ -835,6 +848,19 @@ dependencies = [
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "1.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustversion"
|
||||
version = "1.0.22"
|
||||
@@ -977,6 +1003,19 @@ version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.27.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd"
|
||||
dependencies = [
|
||||
"fastrand",
|
||||
"getrandom 0.4.2",
|
||||
"once_cell",
|
||||
"rustix",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "2.0.18"
|
||||
|
||||
@@ -66,6 +66,9 @@ uuid = { version = "1", features = ["v4", "serde"] }
|
||||
# Async traits
|
||||
async-trait = "0.1"
|
||||
|
||||
# Test utilities
|
||||
tempfile = "3"
|
||||
|
||||
# Internal crates
|
||||
rustdb-config = { path = "crates/rustdb-config" }
|
||||
rustdb-wire = { path = "crates/rustdb-wire" }
|
||||
|
||||
@@ -17,3 +17,6 @@ tracing = { workspace = true }
|
||||
crc32fast = { workspace = true }
|
||||
uuid = { workspace = true }
|
||||
async-trait = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = { workspace = true }
|
||||
|
||||
499
rust/crates/rustdb-storage/src/binary_wal.rs
Normal file
499
rust/crates/rustdb-storage/src/binary_wal.rs
Normal file
@@ -0,0 +1,499 @@
|
||||
//! Binary Write-Ahead Log for crash recovery.
|
||||
//!
|
||||
//! # Protocol
|
||||
//!
|
||||
//! Every mutation follows this sequence:
|
||||
//! 1. Append WAL record → fsync
|
||||
//! 2. Perform the actual data write
|
||||
//! 3. Append WAL commit marker → fsync
|
||||
//!
|
||||
//! On recovery, uncommitted entries (those without a matching commit marker)
|
||||
//! are replayed or verified.
|
||||
//!
|
||||
//! # Record format
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬────────────┐
|
||||
//! │ magic │ seq │ op │ key_len │ val_len │ crc32 │ payload │
|
||||
//! │ u16 LE │ u64 LE │ u8 │ u32 LE │ u32 LE │ u32 LE │ [key][val] │
|
||||
//! │ 0xAA01 │ │ │ │ │ │ │
|
||||
//! └──────────┴──────────┴──────────┴──────────┴──────────┴──────────┴────────────┘
|
||||
//! ```
|
||||
//!
|
||||
//! # Commit marker
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌──────────┬──────────┬──────────┐
|
||||
//! │ magic │ seq │ crc32 │
|
||||
//! │ u16 LE │ u64 LE │ u32 LE │
|
||||
//! │ 0xCC01 │ │ │
|
||||
//! └──────────┴──────────┴──────────┘
|
||||
//! ```
|
||||
|
||||
use std::io::{self, BufReader, Read, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use crate::error::{StorageError, StorageResult};
|
||||
use crate::record::{FileHeader, FileType, FILE_HEADER_SIZE};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Constants
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Magic prefix identifying a WAL data record (little-endian on disk).
const WAL_RECORD_MAGIC: u16 = 0xAA01;
/// Magic prefix identifying a WAL commit marker (little-endian on disk).
const WAL_COMMIT_MAGIC: u16 = 0xCC01;

/// WAL record header: magic(2) + seq(8) + op(1) + key_len(4) + val_len(4) + crc(4) = 23
const WAL_RECORD_HEADER: usize = 23;

/// Commit marker size: magic(2) + seq(8) + crc(4) = 14
const WAL_COMMIT_SIZE: usize = 14;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// WAL operation type
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Kind of mutation recorded in the WAL.
///
/// The discriminant values are the on-disk `op` byte in the record header;
/// they must never be renumbered once data files exist.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u8)]
pub enum WalOpType {
    /// New document written.
    Insert = 1,
    /// Existing document overwritten.
    Update = 2,
    /// Document removed (value payload is empty by convention — see tests).
    Delete = 3,
}
|
||||
|
||||
impl WalOpType {
|
||||
fn from_u8(v: u8) -> StorageResult<Self> {
|
||||
match v {
|
||||
1 => Ok(WalOpType::Insert),
|
||||
2 => Ok(WalOpType::Update),
|
||||
3 => Ok(WalOpType::Delete),
|
||||
_ => Err(StorageError::WalError(format!("unknown WAL op: {v}"))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// WAL entry (parsed from file)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// A single decoded WAL record, as returned by recovery.
#[derive(Debug, Clone)]
pub struct WalEntry {
    /// Monotonically increasing sequence number assigned at append time.
    pub seq: u64,
    /// Which mutation this record describes.
    pub op: WalOpType,
    /// Document key bytes.
    pub key: Vec<u8>,
    /// Document value bytes (empty for deletes — see tests).
    pub value: Vec<u8>,
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Internal: what we read from the WAL file
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// One framed item decoded from the WAL file: either a full data record
/// or a bare commit marker.
#[derive(Debug)]
enum WalItem {
    Record(WalEntry),
    Commit(u64), // seq that was committed
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// BinaryWal
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Binary write-ahead log backed by a single file.
pub struct BinaryWal {
    /// Location of the WAL file on disk.
    path: PathBuf,
    /// Next sequence number to hand out; recovered from the file on
    /// `initialize()` and never reset (not even by `truncate()`).
    next_seq: AtomicU64,
}
|
||||
|
||||
impl BinaryWal {
|
||||
/// Create a new WAL. Does not touch the filesystem until `initialize()`.
|
||||
pub fn new(path: PathBuf) -> Self {
|
||||
Self {
|
||||
path,
|
||||
next_seq: AtomicU64::new(1),
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize: create parent dirs, recover sequence counter from existing file.
|
||||
pub fn initialize(&self) -> StorageResult<()> {
|
||||
if let Some(parent) = self.path.parent() {
|
||||
std::fs::create_dir_all(parent)?;
|
||||
}
|
||||
|
||||
if self.path.exists() {
|
||||
// Scan to find highest seq
|
||||
let items = self.read_all_items()?;
|
||||
let max_seq = items
|
||||
.iter()
|
||||
.map(|item| match item {
|
||||
WalItem::Record(e) => e.seq,
|
||||
WalItem::Commit(s) => *s,
|
||||
})
|
||||
.max()
|
||||
.unwrap_or(0);
|
||||
self.next_seq.store(max_seq + 1, Ordering::SeqCst);
|
||||
} else {
|
||||
// Create the file with a header
|
||||
let mut f = std::fs::File::create(&self.path)?;
|
||||
let hdr = FileHeader::new(FileType::Wal);
|
||||
f.write_all(&hdr.encode())?;
|
||||
f.flush()?;
|
||||
f.sync_all()?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Append a WAL record. Returns the sequence number. Fsyncs.
|
||||
pub fn append(
|
||||
&self,
|
||||
op: WalOpType,
|
||||
key: &[u8],
|
||||
value: &[u8],
|
||||
) -> StorageResult<u64> {
|
||||
let seq = self.next_seq.fetch_add(1, Ordering::SeqCst);
|
||||
let key_len = key.len() as u32;
|
||||
let val_len = value.len() as u32;
|
||||
|
||||
// Build header bytes (without CRC)
|
||||
let mut hdr = Vec::with_capacity(WAL_RECORD_HEADER);
|
||||
hdr.extend_from_slice(&WAL_RECORD_MAGIC.to_le_bytes());
|
||||
hdr.extend_from_slice(&seq.to_le_bytes());
|
||||
hdr.push(op as u8);
|
||||
hdr.extend_from_slice(&key_len.to_le_bytes());
|
||||
hdr.extend_from_slice(&val_len.to_le_bytes());
|
||||
// CRC placeholder
|
||||
hdr.extend_from_slice(&0u32.to_le_bytes());
|
||||
|
||||
// Compute CRC over header (without crc field) + payload
|
||||
let mut hasher = crc32fast::Hasher::new();
|
||||
hasher.update(&hdr[0..19]); // magic + seq + op + key_len + val_len
|
||||
hasher.update(key);
|
||||
hasher.update(value);
|
||||
let crc = hasher.finalize();
|
||||
hdr[19..23].copy_from_slice(&crc.to_le_bytes());
|
||||
|
||||
// Append to file
|
||||
let mut f = std::fs::OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&self.path)?;
|
||||
f.write_all(&hdr)?;
|
||||
f.write_all(key)?;
|
||||
f.write_all(value)?;
|
||||
f.sync_all()?;
|
||||
|
||||
Ok(seq)
|
||||
}
|
||||
|
||||
/// Append a commit marker for the given sequence. Fsyncs.
|
||||
pub fn append_commit(&self, seq: u64) -> StorageResult<()> {
|
||||
let mut buf = Vec::with_capacity(WAL_COMMIT_SIZE);
|
||||
buf.extend_from_slice(&WAL_COMMIT_MAGIC.to_le_bytes());
|
||||
buf.extend_from_slice(&seq.to_le_bytes());
|
||||
|
||||
// CRC over magic + seq
|
||||
let mut hasher = crc32fast::Hasher::new();
|
||||
hasher.update(&buf[0..10]);
|
||||
let crc = hasher.finalize();
|
||||
buf.extend_from_slice(&crc.to_le_bytes());
|
||||
|
||||
let mut f = std::fs::OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&self.path)?;
|
||||
f.write_all(&buf)?;
|
||||
f.sync_all()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Recover: return all WAL entries that were NOT committed.
|
||||
pub fn recover(&self) -> StorageResult<Vec<WalEntry>> {
|
||||
let items = self.read_all_items()?;
|
||||
|
||||
// Collect committed seq numbers
|
||||
let committed: std::collections::HashSet<u64> = items
|
||||
.iter()
|
||||
.filter_map(|item| {
|
||||
if let WalItem::Commit(s) = item {
|
||||
Some(*s)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Return records without a commit marker
|
||||
let uncommitted: Vec<WalEntry> = items
|
||||
.into_iter()
|
||||
.filter_map(|item| {
|
||||
if let WalItem::Record(entry) = item {
|
||||
if !committed.contains(&entry.seq) {
|
||||
return Some(entry);
|
||||
}
|
||||
}
|
||||
None
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(uncommitted)
|
||||
}
|
||||
|
||||
/// Truncate the WAL: rewrite with just the file header (clears all entries).
|
||||
pub fn truncate(&self) -> StorageResult<()> {
|
||||
let mut f = std::fs::File::create(&self.path)?;
|
||||
let hdr = FileHeader::new(FileType::Wal);
|
||||
f.write_all(&hdr.encode())?;
|
||||
f.flush()?;
|
||||
f.sync_all()?;
|
||||
// Don't reset next_seq — it should keep incrementing
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Path to the WAL file.
|
||||
pub fn path(&self) -> &Path {
|
||||
&self.path
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Internal: read all items from the WAL file
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
fn read_all_items(&self) -> StorageResult<Vec<WalItem>> {
|
||||
if !self.path.exists() {
|
||||
return Ok(vec![]);
|
||||
}
|
||||
|
||||
let file = std::fs::File::open(&self.path)?;
|
||||
let mut reader = BufReader::new(file);
|
||||
|
||||
// Skip file header (if present)
|
||||
let file_len = std::fs::metadata(&self.path)?.len();
|
||||
if file_len >= FILE_HEADER_SIZE as u64 {
|
||||
let mut hdr_buf = [0u8; FILE_HEADER_SIZE];
|
||||
reader.read_exact(&mut hdr_buf)?;
|
||||
// Validate but don't fail hard — allow reading even slightly off headers
|
||||
let _ = FileHeader::decode(&hdr_buf);
|
||||
}
|
||||
|
||||
let mut items = Vec::new();
|
||||
|
||||
loop {
|
||||
// Peek at the magic to determine if this is a record or commit marker
|
||||
let mut magic_buf = [0u8; 2];
|
||||
match reader.read_exact(&mut magic_buf) {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break,
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
let magic = u16::from_le_bytes(magic_buf);
|
||||
|
||||
match magic {
|
||||
WAL_RECORD_MAGIC => {
|
||||
// Read rest of header: seq(8) + op(1) + key_len(4) + val_len(4) + crc(4) = 21
|
||||
let mut rest = [0u8; 21];
|
||||
match reader.read_exact(&mut rest) {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break,
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
|
||||
let seq = u64::from_le_bytes(rest[0..8].try_into().unwrap());
|
||||
let op = WalOpType::from_u8(rest[8])?;
|
||||
let key_len = u32::from_le_bytes(rest[9..13].try_into().unwrap()) as usize;
|
||||
let val_len = u32::from_le_bytes(rest[13..17].try_into().unwrap()) as usize;
|
||||
let stored_crc = u32::from_le_bytes(rest[17..21].try_into().unwrap());
|
||||
|
||||
let mut payload = vec![0u8; key_len + val_len];
|
||||
match reader.read_exact(&mut payload) {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break,
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
|
||||
// Verify CRC
|
||||
let mut hasher = crc32fast::Hasher::new();
|
||||
hasher.update(&magic_buf);
|
||||
hasher.update(&rest[0..17]); // seq + op + key_len + val_len
|
||||
hasher.update(&payload);
|
||||
let computed = hasher.finalize();
|
||||
|
||||
if computed != stored_crc {
|
||||
// Corrupt WAL entry — skip it (best-effort recovery)
|
||||
tracing::warn!(
|
||||
seq,
|
||||
"skipping corrupt WAL record: CRC mismatch (expected 0x{stored_crc:08X}, got 0x{computed:08X})"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
let key = payload[..key_len].to_vec();
|
||||
let value = payload[key_len..].to_vec();
|
||||
items.push(WalItem::Record(WalEntry {
|
||||
seq,
|
||||
op,
|
||||
key,
|
||||
value,
|
||||
}));
|
||||
}
|
||||
WAL_COMMIT_MAGIC => {
|
||||
// Read rest: seq(8) + crc(4) = 12
|
||||
let mut rest = [0u8; 12];
|
||||
match reader.read_exact(&mut rest) {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break,
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
|
||||
let seq = u64::from_le_bytes(rest[0..8].try_into().unwrap());
|
||||
let stored_crc = u32::from_le_bytes(rest[8..12].try_into().unwrap());
|
||||
|
||||
let mut hasher = crc32fast::Hasher::new();
|
||||
hasher.update(&magic_buf);
|
||||
hasher.update(&rest[0..8]);
|
||||
let computed = hasher.finalize();
|
||||
|
||||
if computed != stored_crc {
|
||||
tracing::warn!(
|
||||
seq,
|
||||
"skipping corrupt WAL commit marker: CRC mismatch"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
items.push(WalItem::Commit(seq));
|
||||
}
|
||||
_ => {
|
||||
// Unknown magic — file is corrupt past this point
|
||||
tracing::warn!("unknown WAL magic 0x{magic:04X}, stopping scan");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(items)
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a freshly initialized WAL inside the given temp directory.
    fn make_wal(dir: &tempfile::TempDir) -> BinaryWal {
        let wal = BinaryWal::new(dir.path().join("test.wal"));
        wal.initialize().unwrap();
        wal
    }

    #[test]
    fn append_and_commit() {
        let tmp = tempfile::tempdir().unwrap();
        let wal = make_wal(&tmp);

        let seq = wal.append(WalOpType::Insert, b"key1", b"value1").unwrap();
        assert_eq!(seq, 1);
        wal.append_commit(seq).unwrap();

        // Everything committed, so recovery finds nothing to replay.
        assert!(wal.recover().unwrap().is_empty());
    }

    #[test]
    fn uncommitted_entries_recovered() {
        let tmp = tempfile::tempdir().unwrap();
        let wal = make_wal(&tmp);

        let committed_seq = wal.append(WalOpType::Insert, b"k1", b"v1").unwrap();
        wal.append_commit(committed_seq).unwrap();

        // The second record never gets its commit marker.
        let pending_seq = wal.append(WalOpType::Update, b"k2", b"v2").unwrap();

        let pending = wal.recover().unwrap();
        assert_eq!(pending.len(), 1);
        let entry = &pending[0];
        assert_eq!(entry.seq, pending_seq);
        assert_eq!(entry.op, WalOpType::Update);
        assert_eq!(entry.key, b"k2");
        assert_eq!(entry.value, b"v2");
    }

    #[test]
    fn truncate_clears_wal() {
        let tmp = tempfile::tempdir().unwrap();
        let wal = make_wal(&tmp);

        wal.append(WalOpType::Insert, b"k", b"v").unwrap();
        wal.truncate().unwrap();

        assert!(wal.recover().unwrap().is_empty());
    }

    #[test]
    fn multiple_operations() {
        let tmp = tempfile::tempdir().unwrap();
        let wal = make_wal(&tmp);

        let first = wal.append(WalOpType::Insert, b"a", b"1").unwrap();
        let second = wal.append(WalOpType::Update, b"b", b"2").unwrap();
        let third = wal.append(WalOpType::Delete, b"c", b"").unwrap();

        // Commit everything except the middle record.
        for seq in [first, third] {
            wal.append_commit(seq).unwrap();
        }

        let pending = wal.recover().unwrap();
        assert_eq!(pending.len(), 1);
        assert_eq!(pending[0].seq, second);
    }

    #[test]
    fn sequence_numbers_persist_across_reinit() {
        let tmp = tempfile::tempdir().unwrap();
        let path = tmp.path().join("persist.wal");

        {
            let wal = BinaryWal::new(path.clone());
            wal.initialize().unwrap();
            let first = wal.append(WalOpType::Insert, b"k", b"v").unwrap();
            assert_eq!(first, 1);
            wal.append_commit(first).unwrap();
        }

        // Re-open: the counter is rebuilt from the file, so the next
        // sequence continues past the previous maximum.
        {
            let wal = BinaryWal::new(path);
            wal.initialize().unwrap();
            let next = wal.append(WalOpType::Insert, b"k2", b"v2").unwrap();
            assert!(next >= 2, "seq should continue: got {next}");
        }
    }

    #[test]
    fn delete_has_empty_value() {
        let tmp = tempfile::tempdir().unwrap();
        let wal = make_wal(&tmp);

        let seq = wal.append(WalOpType::Delete, b"key", b"").unwrap();

        let pending = wal.recover().unwrap();
        assert_eq!(pending.len(), 1);
        assert_eq!(pending[0].seq, seq);
        assert_eq!(pending[0].op, WalOpType::Delete);
        assert!(pending[0].value.is_empty());
    }
}
|
||||
270
rust/crates/rustdb-storage/src/compaction.rs
Normal file
270
rust/crates/rustdb-storage/src/compaction.rs
Normal file
@@ -0,0 +1,270 @@
|
||||
//! Compaction for the Bitcask-style storage engine.
|
||||
//!
|
||||
//! Over time, the data file accumulates dead records (superseded by updates,
|
||||
//! tombstones from deletes). Compaction rewrites the data file with only live
|
||||
//! records, reclaiming disk space.
|
||||
//!
|
||||
//! The process is:
|
||||
//! 1. Create a new `data.rdb.compact` file with a fresh file header.
|
||||
//! 2. Iterate all live entries from the KeyDir.
|
||||
//! 3. Read each live document from the old data file, write to the new file.
|
||||
//! 4. Atomically rename `data.rdb.compact` → `data.rdb`.
|
||||
//! 5. Update KeyDir entries with new offsets.
|
||||
//! 6. Reset dead_bytes counter.
|
||||
|
||||
use std::io::{Seek, SeekFrom, Write};
|
||||
use std::path::Path;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use tracing::info;
|
||||
|
||||
use crate::error::StorageResult;
|
||||
use crate::keydir::{KeyDir, KeyDirEntry};
|
||||
use crate::record::{DataRecord, FileHeader, FileType, FILE_HEADER_SIZE};
|
||||
|
||||
/// Result of a compaction operation. All sizes are in bytes.
#[derive(Debug)]
pub struct CompactionResult {
    /// Number of live records written to the compacted file.
    pub records_written: u64,
    /// Bytes reclaimed (old file size - new file size; saturates at 0).
    pub bytes_reclaimed: u64,
    /// New data file size, including the file header.
    pub new_file_size: u64,
}
|
||||
|
||||
/// Compact a collection's data file.
///
/// This function:
/// - Reads all live documents (entries present in the KeyDir) from the old data file
/// - Writes them sequentially to a new file
/// - Atomically renames the new file over the old one
/// - Updates all KeyDir entries with their new offsets
///
/// The caller must hold the collection's write lock during this operation:
/// between the rename and the KeyDir update below, the KeyDir offsets still
/// point into the OLD file layout, so concurrent reads would be wrong.
///
/// NOTE(review): `std::fs::rename` over an existing destination fails on
/// Windows — confirm target platforms, or remove the destination first.
/// NOTE(review): on error, the partial `*.rdb.compact` file is left behind;
/// it is truncated by `File::create` on the next attempt.
pub fn compact_data_file(
    data_path: &Path,
    keydir: &KeyDir,
    dead_bytes: &std::sync::atomic::AtomicU64,
    data_file_size: &std::sync::atomic::AtomicU64,
) -> StorageResult<CompactionResult> {
    // "data.rdb" -> "data.rdb.compact" (with_extension appends after ".rdb").
    let compact_path = data_path.with_extension("rdb.compact");

    // Missing/unreadable old file is treated as size 0 (reclaimed saturates).
    let old_file_size = std::fs::metadata(data_path)
        .map(|m| m.len())
        .unwrap_or(0);

    // Collect all live entries with their keys
    let mut live_entries: Vec<(String, KeyDirEntry)> = Vec::with_capacity(keydir.len() as usize);
    keydir.for_each(|key, entry| {
        live_entries.push((key.to_string(), *entry));
    });

    // Sort by offset for sequential reads (cache-friendly)
    live_entries.sort_by_key(|(_, e)| e.offset);

    // Create compact file with header
    let mut compact_file = std::fs::File::create(&compact_path)?;
    let hdr = FileHeader::new(FileType::Data);
    compact_file.write_all(&hdr.encode())?;

    // Running write position in the new file; becomes each entry's new offset.
    let mut current_offset = FILE_HEADER_SIZE as u64;
    let mut new_entries: Vec<(String, KeyDirEntry)> = Vec::with_capacity(live_entries.len());
    let mut old_data_file = std::fs::File::open(data_path)?;

    for (key, entry) in &live_entries {
        // Read the record from the old file. A live KeyDir entry pointing at
        // EOF means the file and index disagree — surface it as corruption.
        old_data_file.seek(SeekFrom::Start(entry.offset))?;
        let (record, _disk_size) = DataRecord::decode_from(&mut old_data_file)?
            .ok_or_else(|| {
                crate::error::StorageError::CorruptRecord(format!(
                    "compaction: unexpected EOF reading doc '{key}' at offset {}",
                    entry.offset
                ))
            })?;

        // Write to compact file
        let encoded = record.encode();
        let new_disk_size = encoded.len() as u32;
        compact_file.write_all(&encoded)?;

        // Stage the updated index entry; applied only after the rename succeeds.
        new_entries.push((
            key.clone(),
            KeyDirEntry {
                offset: current_offset,
                record_len: new_disk_size,
                value_len: entry.value_len,
                timestamp: entry.timestamp,
            },
        ));

        current_offset += new_disk_size as u64;
    }

    // Make the new file durable, then release both handles before the rename
    // (required on platforms that forbid renaming open files).
    compact_file.sync_all()?;
    drop(compact_file);
    drop(old_data_file);

    // Atomic rename
    std::fs::rename(&compact_path, data_path)?;

    // Update KeyDir with new offsets
    for (key, new_entry) in new_entries {
        keydir.insert(key, new_entry);
    }

    // Reset counters
    dead_bytes.store(0, Ordering::Relaxed);
    data_file_size.store(current_offset, Ordering::Relaxed);

    let bytes_reclaimed = old_file_size.saturating_sub(current_offset);

    info!(
        records = live_entries.len(),
        old_size = old_file_size,
        new_size = current_offset,
        reclaimed = bytes_reclaimed,
        "compaction complete"
    );

    Ok(CompactionResult {
        records_written: live_entries.len() as u64,
        bytes_reclaimed,
        new_file_size: current_offset,
    })
}
|
||||
|
||||
/// Check if compaction is warranted for a collection.
|
||||
/// Returns true if dead bytes exceed 50% of live data.
|
||||
pub fn should_compact(dead_bytes: u64, data_file_size: u64) -> bool {
|
||||
if data_file_size <= FILE_HEADER_SIZE as u64 {
|
||||
return false;
|
||||
}
|
||||
let useful_bytes = data_file_size - FILE_HEADER_SIZE as u64;
|
||||
// Trigger when dead > 50% of total useful data
|
||||
dead_bytes > useful_bytes / 2
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::keydir::KeyDir;
    use crate::record::{now_ms, DataRecord, FileHeader, FileType};
    use std::io::Write;
    use std::sync::atomic::AtomicU64;

    /// End-to-end compaction: a file with one superseded record shrinks to
    /// just the live records, the KeyDir is repointed, and the compacted
    /// file can be rescanned with zero dead bytes.
    #[test]
    fn compact_removes_dead_records() {
        let dir = tempfile::tempdir().unwrap();
        let data_path = dir.path().join("data.rdb");

        // Write a data file: insert A, update A (new version), insert B
        let mut f = std::fs::File::create(&data_path).unwrap();
        let hdr = FileHeader::new(FileType::Data);
        f.write_all(&hdr.encode()).unwrap();

        let ts = now_ms();

        // Record 1: A v1 (will be superseded)
        let r1 = DataRecord {
            timestamp: ts,
            key: b"aaa".to_vec(),
            value: b"old_value".to_vec(),
        };
        let r1_enc = r1.encode();
        // Offsets are cumulative: first record starts right after the header.
        let r1_offset = FILE_HEADER_SIZE as u64;
        let r1_size = r1_enc.len();
        f.write_all(&r1_enc).unwrap();

        // Record 2: A v2 (current)
        let r2 = DataRecord {
            timestamp: ts + 1,
            key: b"aaa".to_vec(),
            value: b"new_value".to_vec(),
        };
        let r2_enc = r2.encode();
        let r2_offset = r1_offset + r1_size as u64;
        let r2_size = r2_enc.len();
        f.write_all(&r2_enc).unwrap();

        // Record 3: B (live)
        let r3 = DataRecord {
            timestamp: ts + 2,
            key: b"bbb".to_vec(),
            value: b"bbb_value".to_vec(),
        };
        let r3_enc = r3.encode();
        let r3_offset = r2_offset + r2_size as u64;
        f.write_all(&r3_enc).unwrap();
        f.sync_all().unwrap();
        drop(f);

        let total_size = std::fs::metadata(&data_path).unwrap().len();

        // Build KeyDir — only points to latest versions
        let keydir = KeyDir::new();
        keydir.insert(
            "aaa".into(),
            KeyDirEntry {
                offset: r2_offset,
                record_len: r2_size as u32,
                value_len: r2.value.len() as u32,
                timestamp: ts + 1,
            },
        );
        keydir.insert(
            "bbb".into(),
            KeyDirEntry {
                offset: r3_offset,
                record_len: r3.encode().len() as u32,
                value_len: r3.value.len() as u32,
                timestamp: ts + 2,
            },
        );

        // Only record 1 (superseded A v1) counts as dead.
        let dead_bytes_counter = AtomicU64::new(r1_size as u64);
        let data_file_size_counter = AtomicU64::new(total_size);

        let result = compact_data_file(
            &data_path,
            &keydir,
            &dead_bytes_counter,
            &data_file_size_counter,
        )
        .unwrap();

        assert_eq!(result.records_written, 2);
        assert!(result.bytes_reclaimed > 0);
        assert!(result.new_file_size < total_size);

        // Verify dead_bytes was reset
        assert_eq!(dead_bytes_counter.load(Ordering::Relaxed), 0);

        // Verify KeyDir was updated with new offsets
        let a_entry = keydir.get("aaa").unwrap();
        assert_eq!(a_entry.offset, FILE_HEADER_SIZE as u64); // first record after header
        assert_eq!(a_entry.value_len, b"new_value".len() as u32);

        let b_entry = keydir.get("bbb").unwrap();
        assert!(b_entry.offset > a_entry.offset);

        // Verify the compacted file can be used to rebuild KeyDir
        let (rebuilt, dead) = KeyDir::build_from_data_file(&data_path).unwrap();
        assert_eq!(rebuilt.len(), 2);
        assert_eq!(dead, 0); // no dead records in compacted file
    }

    /// Threshold check: compaction triggers only when dead bytes exceed
    /// half of the useful (non-header) data, and never on an empty file.
    #[test]
    fn should_compact_thresholds() {
        // Under threshold
        assert!(!should_compact(10, 100 + FILE_HEADER_SIZE as u64));
        // Over threshold (dead > 50% of useful)
        assert!(should_compact(60, 100 + FILE_HEADER_SIZE as u64));
        // Empty file
        assert!(!should_compact(0, FILE_HEADER_SIZE as u64));
    }
}
|
||||
@@ -17,6 +17,15 @@ pub enum StorageError {
|
||||
|
||||
#[error("conflict detected: {0}")]
|
||||
ConflictError(String),
|
||||
|
||||
#[error("corrupt record: {0}")]
|
||||
CorruptRecord(String),
|
||||
|
||||
#[error("checksum mismatch: expected 0x{expected:08X}, got 0x{actual:08X}")]
|
||||
ChecksumMismatch { expected: u32, actual: u32 },
|
||||
|
||||
#[error("WAL error: {0}")]
|
||||
WalError(String),
|
||||
}
|
||||
|
||||
impl From<serde_json::Error> for StorageError {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
453
rust/crates/rustdb-storage/src/keydir.rs
Normal file
453
rust/crates/rustdb-storage/src/keydir.rs
Normal file
@@ -0,0 +1,453 @@
|
||||
//! KeyDir — in-memory document location index for the Bitcask storage engine.
|
||||
//!
|
||||
//! Maps document `_id` (hex string) to its location in the append-only data file.
|
||||
//! Backed by `DashMap` for lock-free concurrent reads and fine-grained write locking.
|
||||
//!
|
||||
//! The KeyDir can be rebuilt from a data file scan, or loaded quickly from a
|
||||
//! persisted hint file for fast restart.
|
||||
|
||||
use std::io::{self, BufReader, BufWriter, Read, Write};
|
||||
use std::path::Path;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use dashmap::DashMap;
|
||||
|
||||
use crate::error::{StorageError, StorageResult};
|
||||
use crate::record::{
|
||||
FileHeader, FileType, RecordScanner, FILE_HEADER_SIZE, FORMAT_VERSION,
|
||||
};
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// KeyDirEntry
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Location of a single document in the data file.
///
/// `Copy` because entries are small fixed-size descriptors that are freely
/// duplicated out of the `DashMap` on lookup.
#[derive(Debug, Clone, Copy)]
pub struct KeyDirEntry {
    /// Byte offset of the record in `data.rdb`.
    pub offset: u64,
    /// Total record size on disk (header + payload).
    pub record_len: u32,
    /// BSON value length. 0 means tombstone (used during compaction accounting).
    pub value_len: u32,
    /// Timestamp (epoch ms) from the record. Used for conflict detection.
    pub timestamp: u64,
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// KeyDir
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// In-memory index mapping document ID → data file location.
pub struct KeyDir {
    /// `DashMap` gives lock-free concurrent reads with sharded write locking.
    map: DashMap<String, KeyDirEntry>,
    /// Running count of live documents, maintained alongside map mutations
    /// so `len()` avoids iterating the sharded map.
    doc_count: AtomicU64,
}
|
||||
|
||||
impl KeyDir {
|
||||
/// Create an empty KeyDir.
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
map: DashMap::new(),
|
||||
doc_count: AtomicU64::new(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Insert or update an entry. Returns the previous entry if one existed.
|
||||
pub fn insert(&self, key: String, entry: KeyDirEntry) -> Option<KeyDirEntry> {
|
||||
let prev = self.map.insert(key, entry);
|
||||
if prev.is_none() {
|
||||
self.doc_count.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
prev
|
||||
}
|
||||
|
||||
/// Look up an entry by key.
|
||||
pub fn get(&self, key: &str) -> Option<KeyDirEntry> {
|
||||
self.map.get(key).map(|r| *r.value())
|
||||
}
|
||||
|
||||
/// Remove an entry. Returns the removed entry if it existed.
|
||||
pub fn remove(&self, key: &str) -> Option<KeyDirEntry> {
|
||||
let removed = self.map.remove(key).map(|(_, v)| v);
|
||||
if removed.is_some() {
|
||||
self.doc_count.fetch_sub(1, Ordering::Relaxed);
|
||||
}
|
||||
removed
|
||||
}
|
||||
|
||||
/// Number of live documents.
|
||||
pub fn len(&self) -> u64 {
|
||||
self.doc_count.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
/// Whether the index is empty.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
/// Check if a key exists.
|
||||
pub fn contains(&self, key: &str) -> bool {
|
||||
self.map.contains_key(key)
|
||||
}
|
||||
|
||||
/// Iterate over all entries. The closure receives (key, entry).
|
||||
pub fn for_each(&self, mut f: impl FnMut(&str, &KeyDirEntry)) {
|
||||
for entry in self.map.iter() {
|
||||
f(entry.key(), entry.value());
|
||||
}
|
||||
}
|
||||
|
||||
/// Collect all keys.
|
||||
pub fn keys(&self) -> Vec<String> {
|
||||
self.map.iter().map(|e| e.key().clone()).collect()
|
||||
}
|
||||
|
||||
/// Clear all entries.
|
||||
pub fn clear(&self) {
|
||||
self.map.clear();
|
||||
self.doc_count.store(0, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Build from data file
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
/// Rebuild the KeyDir by scanning an entire data file.
|
||||
/// The file must start with a valid `FileHeader`.
|
||||
/// Returns `(keydir, dead_bytes)` where `dead_bytes` is the total size of
|
||||
/// stale records (superseded by later writes or tombstoned).
|
||||
pub fn build_from_data_file(path: &Path) -> StorageResult<(Self, u64)> {
|
||||
let file = std::fs::File::open(path)?;
|
||||
let mut reader = BufReader::new(file);
|
||||
|
||||
// Read and validate file header
|
||||
let mut hdr_buf = [0u8; FILE_HEADER_SIZE];
|
||||
reader.read_exact(&mut hdr_buf)?;
|
||||
let hdr = FileHeader::decode(&hdr_buf)?;
|
||||
if hdr.file_type != FileType::Data {
|
||||
return Err(StorageError::CorruptRecord(format!(
|
||||
"expected data file (type 1), got type {:?}",
|
||||
hdr.file_type
|
||||
)));
|
||||
}
|
||||
|
||||
let keydir = KeyDir::new();
|
||||
let mut dead_bytes: u64 = 0;
|
||||
|
||||
let scanner = RecordScanner::new(reader, FILE_HEADER_SIZE as u64);
|
||||
for result in scanner {
|
||||
let (offset, record) = result?;
|
||||
let is_tombstone = record.is_tombstone();
|
||||
let disk_size = record.disk_size() as u32;
|
||||
let value_len = record.value.len() as u32;
|
||||
let timestamp = record.timestamp;
|
||||
let key = String::from_utf8(record.key)
|
||||
.map_err(|e| StorageError::CorruptRecord(format!("invalid UTF-8 key: {e}")))?;
|
||||
|
||||
if is_tombstone {
|
||||
// Remove from index; the tombstone itself is dead weight
|
||||
if let Some(prev) = keydir.remove(&key) {
|
||||
dead_bytes += prev.record_len as u64;
|
||||
}
|
||||
dead_bytes += disk_size as u64;
|
||||
} else {
|
||||
let entry = KeyDirEntry {
|
||||
offset,
|
||||
record_len: disk_size,
|
||||
value_len,
|
||||
timestamp,
|
||||
};
|
||||
if let Some(prev) = keydir.insert(key, entry) {
|
||||
// Previous version of same key is now dead
|
||||
dead_bytes += prev.record_len as u64;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok((keydir, dead_bytes))
|
||||
}
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// Hint file persistence (for fast startup)
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
/// Persist the KeyDir to a hint file for fast restart.
|
||||
///
|
||||
/// Hint file format (after the 64-byte file header):
|
||||
/// For each entry: [key_len:u32 LE][key bytes][offset:u64 LE][record_len:u32 LE][value_len:u32 LE][timestamp:u64 LE]
|
||||
pub fn persist_to_hint_file(&self, path: &Path) -> StorageResult<()> {
|
||||
let file = std::fs::File::create(path)?;
|
||||
let mut writer = BufWriter::new(file);
|
||||
|
||||
// Write file header
|
||||
let hdr = FileHeader::new(FileType::Hint);
|
||||
writer.write_all(&hdr.encode())?;
|
||||
|
||||
// Write entries
|
||||
for entry in self.map.iter() {
|
||||
let key_bytes = entry.key().as_bytes();
|
||||
let key_len = key_bytes.len() as u32;
|
||||
writer.write_all(&key_len.to_le_bytes())?;
|
||||
writer.write_all(key_bytes)?;
|
||||
writer.write_all(&entry.value().offset.to_le_bytes())?;
|
||||
writer.write_all(&entry.value().record_len.to_le_bytes())?;
|
||||
writer.write_all(&entry.value().value_len.to_le_bytes())?;
|
||||
writer.write_all(&entry.value().timestamp.to_le_bytes())?;
|
||||
}
|
||||
|
||||
writer.flush()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Load a KeyDir from a hint file. Returns None if the file doesn't exist.
|
||||
pub fn load_from_hint_file(path: &Path) -> StorageResult<Option<Self>> {
|
||||
if !path.exists() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let file = std::fs::File::open(path)?;
|
||||
let mut reader = BufReader::new(file);
|
||||
|
||||
// Read and validate header
|
||||
let mut hdr_buf = [0u8; FILE_HEADER_SIZE];
|
||||
match reader.read_exact(&mut hdr_buf) {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
let hdr = FileHeader::decode(&hdr_buf)?;
|
||||
if hdr.file_type != FileType::Hint {
|
||||
return Err(StorageError::CorruptRecord(format!(
|
||||
"expected hint file (type 3), got type {:?}",
|
||||
hdr.file_type
|
||||
)));
|
||||
}
|
||||
if hdr.version > FORMAT_VERSION {
|
||||
return Err(StorageError::CorruptRecord(format!(
|
||||
"hint file version {} is newer than supported {}",
|
||||
hdr.version, FORMAT_VERSION
|
||||
)));
|
||||
}
|
||||
|
||||
let keydir = KeyDir::new();
|
||||
|
||||
loop {
|
||||
// Read key_len
|
||||
let mut key_len_buf = [0u8; 4];
|
||||
match reader.read_exact(&mut key_len_buf) {
|
||||
Ok(()) => {}
|
||||
Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break,
|
||||
Err(e) => return Err(e.into()),
|
||||
}
|
||||
let key_len = u32::from_le_bytes(key_len_buf) as usize;
|
||||
|
||||
// Read key
|
||||
let mut key_buf = vec![0u8; key_len];
|
||||
reader.read_exact(&mut key_buf)?;
|
||||
let key = String::from_utf8(key_buf)
|
||||
.map_err(|e| StorageError::CorruptRecord(format!("invalid UTF-8 key: {e}")))?;
|
||||
|
||||
// Read entry fields
|
||||
let mut fields = [0u8; 8 + 4 + 4 + 8]; // offset + record_len + value_len + timestamp = 24
|
||||
reader.read_exact(&mut fields)?;
|
||||
|
||||
let offset = u64::from_le_bytes(fields[0..8].try_into().unwrap());
|
||||
let record_len = u32::from_le_bytes(fields[8..12].try_into().unwrap());
|
||||
let value_len = u32::from_le_bytes(fields[12..16].try_into().unwrap());
|
||||
let timestamp = u64::from_le_bytes(fields[16..24].try_into().unwrap());
|
||||
|
||||
keydir.insert(
|
||||
key,
|
||||
KeyDirEntry {
|
||||
offset,
|
||||
record_len,
|
||||
value_len,
|
||||
timestamp,
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
Ok(Some(keydir))
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for KeyDir {
    /// Equivalent to [`KeyDir::new`]: an empty index.
    fn default() -> Self {
        Self::new()
    }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::record::DataRecord;
    use std::io::Write;

    // Exercises the basic CRUD surface and the live-document counter.
    #[test]
    fn basic_insert_get_remove() {
        let kd = KeyDir::new();
        assert!(kd.is_empty());

        let entry = KeyDirEntry {
            offset: 100,
            record_len: 50,
            value_len: 30,
            timestamp: 1700000000000,
        };

        assert!(kd.insert("abc".into(), entry).is_none());
        assert_eq!(kd.len(), 1);
        assert!(kd.contains("abc"));

        let got = kd.get("abc").unwrap();
        assert_eq!(got.offset, 100);
        assert_eq!(got.value_len, 30);

        let removed = kd.remove("abc").unwrap();
        assert_eq!(removed.offset, 100);
        assert_eq!(kd.len(), 0);
        assert!(!kd.contains("abc"));
    }

    // Overwriting a key must return the previous entry and keep len() stable.
    #[test]
    fn insert_overwrites_returns_previous() {
        let kd = KeyDir::new();
        let e1 = KeyDirEntry {
            offset: 100,
            record_len: 50,
            value_len: 30,
            timestamp: 1,
        };
        let e2 = KeyDirEntry {
            offset: 200,
            record_len: 60,
            value_len: 40,
            timestamp: 2,
        };

        kd.insert("k".into(), e1);
        assert_eq!(kd.len(), 1);

        let prev = kd.insert("k".into(), e2).unwrap();
        assert_eq!(prev.offset, 100);
        // Count stays at 1 (overwrite, not new)
        assert_eq!(kd.len(), 1);
        assert_eq!(kd.get("k").unwrap().offset, 200);
    }

    // End-to-end scan: a tombstone must drop the key and count as dead bytes.
    #[test]
    fn build_from_data_file() {
        let dir = tempfile::tempdir().unwrap();
        let data_path = dir.path().join("data.rdb");

        // Write a data file with 3 records: insert A, insert B, delete A
        {
            let mut f = std::fs::File::create(&data_path).unwrap();
            let hdr = FileHeader::new(FileType::Data);
            f.write_all(&hdr.encode()).unwrap();

            let r1 = DataRecord {
                timestamp: 1,
                key: b"aaa".to_vec(),
                value: b"val_a".to_vec(),
            };
            let r2 = DataRecord {
                timestamp: 2,
                key: b"bbb".to_vec(),
                value: b"val_b".to_vec(),
            };
            let r3 = DataRecord {
                timestamp: 3,
                key: b"aaa".to_vec(),
                value: vec![], // tombstone
            };
            f.write_all(&r1.encode()).unwrap();
            f.write_all(&r2.encode()).unwrap();
            f.write_all(&r3.encode()).unwrap();
        }

        let (kd, dead_bytes) = KeyDir::build_from_data_file(&data_path).unwrap();

        // Only B should be live
        assert_eq!(kd.len(), 1);
        assert!(kd.contains("bbb"));
        assert!(!kd.contains("aaa"));

        // Dead bytes: r1 (aaa live, then superseded by tombstone) + r3 (tombstone itself)
        assert!(dead_bytes > 0);
    }

    // Persist → load must reproduce every field of every entry.
    #[test]
    fn hint_file_roundtrip() {
        let dir = tempfile::tempdir().unwrap();
        let hint_path = dir.path().join("keydir.hint");

        let kd = KeyDir::new();
        kd.insert(
            "doc1".into(),
            KeyDirEntry {
                offset: 64,
                record_len: 100,
                value_len: 80,
                timestamp: 1000,
            },
        );
        kd.insert(
            "doc2".into(),
            KeyDirEntry {
                offset: 164,
                record_len: 200,
                value_len: 150,
                timestamp: 2000,
            },
        );

        kd.persist_to_hint_file(&hint_path).unwrap();
        let loaded = KeyDir::load_from_hint_file(&hint_path).unwrap().unwrap();

        assert_eq!(loaded.len(), 2);
        let e1 = loaded.get("doc1").unwrap();
        assert_eq!(e1.offset, 64);
        assert_eq!(e1.record_len, 100);
        assert_eq!(e1.value_len, 80);
        assert_eq!(e1.timestamp, 1000);

        let e2 = loaded.get("doc2").unwrap();
        assert_eq!(e2.offset, 164);
        assert_eq!(e2.timestamp, 2000);
    }

    // A missing hint file is Ok(None), not an error.
    #[test]
    fn hint_file_nonexistent_returns_none() {
        let result = KeyDir::load_from_hint_file(Path::new("/tmp/nonexistent_hint_file.hint"));
        assert!(result.unwrap().is_none());
    }

    // Iteration helpers must visit every key exactly once.
    #[test]
    fn for_each_and_keys() {
        let kd = KeyDir::new();
        let e = KeyDirEntry {
            offset: 0,
            record_len: 10,
            value_len: 5,
            timestamp: 1,
        };
        kd.insert("x".into(), e);
        kd.insert("y".into(), e);

        let mut collected = Vec::new();
        kd.for_each(|k, _| collected.push(k.to_string()));
        collected.sort();
        assert_eq!(collected, vec!["x", "y"]);

        let mut keys = kd.keys();
        keys.sort();
        assert_eq!(keys, vec!["x", "y"]);
    }
}
|
||||
@@ -2,21 +2,30 @@
|
||||
//!
|
||||
//! Provides the [`StorageAdapter`] trait and two concrete implementations:
|
||||
//! - [`MemoryStorageAdapter`] -- fast in-memory store backed by `DashMap`
|
||||
//! - [`FileStorageAdapter`] -- Bitcask-style append-only log with crash recovery
|
||||
//!
|
||||
//! Also includes an [`OpLog`] for operation logging, a [`BinaryWal`] for
//! write-ahead logging, and [`compaction`] for dead record reclamation.
|
||||
|
||||
pub mod adapter;
|
||||
pub mod binary_wal;
|
||||
pub mod compaction;
|
||||
pub mod error;
|
||||
pub mod file;
|
||||
pub mod keydir;
|
||||
pub mod memory;
|
||||
pub mod oplog;
|
||||
pub mod wal;
|
||||
pub mod record;
|
||||
|
||||
pub use adapter::StorageAdapter;
|
||||
pub use binary_wal::{BinaryWal, WalEntry, WalOpType};
|
||||
pub use compaction::{compact_data_file, should_compact, CompactionResult};
|
||||
pub use error::{StorageError, StorageResult};
|
||||
pub use file::FileStorageAdapter;
|
||||
pub use keydir::{KeyDir, KeyDirEntry};
|
||||
pub use memory::MemoryStorageAdapter;
|
||||
pub use oplog::{OpLog, OpLogEntry, OpLogStats, OpType};
|
||||
pub use wal::{WalOp, WalRecord, WriteAheadLog};
|
||||
pub use record::{
|
||||
DataRecord, FileHeader, FileType, RecordScanner, FILE_HEADER_SIZE, FILE_MAGIC, FORMAT_VERSION,
|
||||
RECORD_HEADER_SIZE, RECORD_MAGIC,
|
||||
};
|
||||
|
||||
452
rust/crates/rustdb-storage/src/record.rs
Normal file
452
rust/crates/rustdb-storage/src/record.rs
Normal file
@@ -0,0 +1,452 @@
|
||||
//! Binary data record format for the Bitcask-style storage engine.
|
||||
//!
|
||||
//! # File Version Header (64 bytes, at offset 0 of every .rdb / .hint file)
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌──────────────┬──────────┬──────────┬──────────┬──────────┬───────────────┐
|
||||
//! │ magic │ version │ file_type│ flags │ created │ reserved │
|
||||
//! │ 8 bytes │ u16 LE │ u8 │ u32 LE │ u64 LE │ 41 bytes │
|
||||
//! │ "SMARTDB\0" │ │ │ │ epoch_ms │ (zeros) │
|
||||
//! └──────────────┴──────────┴──────────┴──────────┴──────────┴───────────────┘
|
||||
//! ```
|
||||
//!
|
||||
//! # Data Record (appended after the header)
|
||||
//!
|
||||
//! ```text
|
||||
//! ┌──────────┬──────────┬──────────┬──────────┬──────────┬──────────────────┐
|
||||
//! │ magic │ timestamp│ key_len │ val_len │ crc32 │ payload │
|
||||
//! │ u16 LE │ u64 LE │ u32 LE │ u32 LE │ u32 LE │ [key][value] │
|
||||
//! │ 0xDB01 │ epoch_ms │ │ 0=delete │ │ │
|
||||
//! └──────────┴──────────┴──────────┴──────────┴──────────┴──────────────────┘
|
||||
//! ```
|
||||
|
||||
use std::io::{self, Read};
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
|
||||
use crate::error::{StorageError, StorageResult};
|
||||
|
||||
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------

/// File-level magic: b"SMARTDB\0".
/// Occupies the first 8 bytes of every storage file header.
pub const FILE_MAGIC: &[u8; 8] = b"SMARTDB\0";

/// Current storage format version.
/// `FileHeader::decode` rejects files whose version is greater than this.
pub const FORMAT_VERSION: u16 = 1;

/// File version header size.
/// 23 bytes of fields (magic + version + file_type + flags + created_ms)
/// zero-padded to a fixed 64 bytes; see `FileHeader::encode`.
pub const FILE_HEADER_SIZE: usize = 64;

/// Per-record magic.
pub const RECORD_MAGIC: u16 = 0xDB01;

/// Per-record header size (before payload):
/// magic(2) + timestamp(8) + key_len(4) + val_len(4) + crc32(4).
pub const RECORD_HEADER_SIZE: usize = 2 + 8 + 4 + 4 + 4; // 22 bytes
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// File type tag stored in the version header
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
#[repr(u8)]
|
||||
pub enum FileType {
|
||||
Data = 1,
|
||||
Wal = 2,
|
||||
Hint = 3,
|
||||
}
|
||||
|
||||
impl FileType {
|
||||
pub fn from_u8(v: u8) -> StorageResult<Self> {
|
||||
match v {
|
||||
1 => Ok(FileType::Data),
|
||||
2 => Ok(FileType::Wal),
|
||||
3 => Ok(FileType::Hint),
|
||||
_ => Err(StorageError::CorruptRecord(format!(
|
||||
"unknown file type tag: {v}"
|
||||
))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// File Version Header
// ---------------------------------------------------------------------------

/// Decoded form of the fixed 64-byte header at the start of every storage file.
#[derive(Debug, Clone)]
pub struct FileHeader {
    /// Storage format version; decodable range is 1..=`FORMAT_VERSION`.
    pub version: u16,
    /// What kind of file this is (data / WAL / hint).
    pub file_type: FileType,
    /// Flag bits; currently always written as 0.
    pub flags: u32,
    /// File creation time, milliseconds since UNIX epoch.
    pub created_ms: u64,
}

impl FileHeader {
    /// Create a new header for the current format version.
    pub fn new(file_type: FileType) -> Self {
        Self {
            version: FORMAT_VERSION,
            file_type,
            flags: 0,
            created_ms: now_ms(),
        }
    }

    /// Encode the header to a 64-byte buffer.
    ///
    /// Layout: magic [0..8], version [8..10], file_type [10], flags [11..15],
    /// created_ms [15..23]; bytes 23..64 remain zero (reserved).
    pub fn encode(&self) -> [u8; FILE_HEADER_SIZE] {
        let mut buf = [0u8; FILE_HEADER_SIZE];
        buf[0..8].copy_from_slice(FILE_MAGIC);
        buf[8..10].copy_from_slice(&self.version.to_le_bytes());
        buf[10] = self.file_type as u8;
        buf[11..15].copy_from_slice(&self.flags.to_le_bytes());
        buf[15..23].copy_from_slice(&self.created_ms.to_le_bytes());
        // bytes 23..64 are reserved (zeros)
        buf
    }

    /// Decode a 64-byte header. Validates magic and version.
    ///
    /// # Errors
    /// Returns `StorageError::CorruptRecord` when the magic doesn't match,
    /// the version is 0 or newer than [`FORMAT_VERSION`], or the file-type
    /// tag is unknown.
    pub fn decode(buf: &[u8; FILE_HEADER_SIZE]) -> StorageResult<Self> {
        if &buf[0..8] != FILE_MAGIC {
            return Err(StorageError::CorruptRecord(
                "invalid file magic — not a SmartDB file".into(),
            ));
        }
        let version = u16::from_le_bytes([buf[8], buf[9]]);
        if version > FORMAT_VERSION {
            return Err(StorageError::CorruptRecord(format!(
                "file format version {version} is newer than supported version {FORMAT_VERSION} — please upgrade"
            )));
        }
        if version == 0 {
            return Err(StorageError::CorruptRecord(
                "file format version 0 is invalid".into(),
            ));
        }
        let file_type = FileType::from_u8(buf[10])?;
        let flags = u32::from_le_bytes([buf[11], buf[12], buf[13], buf[14]]);
        let created_ms = u64::from_le_bytes([
            buf[15], buf[16], buf[17], buf[18], buf[19], buf[20], buf[21], buf[22],
        ]);
        Ok(Self {
            version,
            file_type,
            flags,
            created_ms,
        })
    }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Data Record
// ---------------------------------------------------------------------------

/// A single data record (live document or tombstone).
///
/// A tombstone is simply a record with an empty `value`
/// (`val_len == 0` on disk); see [`DataRecord::is_tombstone`].
#[derive(Debug, Clone)]
pub struct DataRecord {
    /// Write time, milliseconds since UNIX epoch.
    pub timestamp: u64,
    /// Key bytes; the keydir decodes these as UTF-8 document IDs.
    pub key: Vec<u8>,
    /// BSON value bytes. Empty for tombstones.
    pub value: Vec<u8>,
}
|
||||
|
||||
impl DataRecord {
    /// Whether this record is a tombstone (delete marker).
    pub fn is_tombstone(&self) -> bool {
        self.value.is_empty()
    }

    /// Total size on disk (header + payload).
    pub fn disk_size(&self) -> usize {
        RECORD_HEADER_SIZE + self.key.len() + self.value.len()
    }

    /// Encode to bytes. CRC32 covers magic + timestamp + key_len + val_len + payload.
    ///
    /// The CRC field itself (bytes 18..22) is excluded from the checksum and
    /// patched in after hashing.
    pub fn encode(&self) -> Vec<u8> {
        let key_len = self.key.len() as u32;
        let val_len = self.value.len() as u32;
        let total = RECORD_HEADER_SIZE + self.key.len() + self.value.len();
        let mut buf = Vec::with_capacity(total);

        // Write fields WITHOUT crc first to compute checksum.
        buf.extend_from_slice(&RECORD_MAGIC.to_le_bytes()); // 2
        buf.extend_from_slice(&self.timestamp.to_le_bytes()); // 8
        buf.extend_from_slice(&key_len.to_le_bytes()); // 4
        buf.extend_from_slice(&val_len.to_le_bytes()); // 4
        // placeholder for crc32 — we'll fill it after computing
        buf.extend_from_slice(&0u32.to_le_bytes()); // 4
        buf.extend_from_slice(&self.key); // key_len
        buf.extend_from_slice(&self.value); // val_len

        // CRC covers everything except the crc32 field itself:
        // bytes [0..18] (magic+ts+key_len+val_len) + bytes [22..] (payload)
        let mut hasher = crc32fast::Hasher::new();
        hasher.update(&buf[0..18]);
        hasher.update(&buf[22..]);
        let crc = hasher.finalize();
        buf[18..22].copy_from_slice(&crc.to_le_bytes());

        buf
    }

    /// Decode a record from a reader. Returns the record and its total disk size.
    /// On EOF at the very start (no bytes to read), returns Ok(None).
    ///
    /// # Errors
    /// Returns `CorruptRecord` on a bad record magic, `ChecksumMismatch` if
    /// the stored CRC32 doesn't match the recomputed one, and an I/O error
    /// (including `UnexpectedEof` mid-record) on short reads.
    pub fn decode_from<R: Read>(reader: &mut R) -> StorageResult<Option<(Self, usize)>> {
        // Read header
        let mut hdr = [0u8; RECORD_HEADER_SIZE];
        match reader.read_exact(&mut hdr) {
            Ok(()) => {}
            Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(None),
            Err(e) => return Err(e.into()),
        }

        let magic = u16::from_le_bytes([hdr[0], hdr[1]]);
        if magic != RECORD_MAGIC {
            return Err(StorageError::CorruptRecord(format!(
                "invalid record magic: 0x{magic:04X}, expected 0x{RECORD_MAGIC:04X}"
            )));
        }

        let timestamp = u64::from_le_bytes(hdr[2..10].try_into().unwrap());
        let key_len = u32::from_le_bytes(hdr[10..14].try_into().unwrap()) as usize;
        let val_len = u32::from_le_bytes(hdr[14..18].try_into().unwrap()) as usize;
        let stored_crc = u32::from_le_bytes(hdr[18..22].try_into().unwrap());

        // Read payload
        // NOTE(review): key_len/val_len come straight off disk and are not
        // bounded before this allocation — a corrupt header could request a
        // very large buffer before the CRC check runs. Consider a sanity cap.
        let payload_len = key_len + val_len;
        let mut payload = vec![0u8; payload_len];
        reader.read_exact(&mut payload)?;

        // Verify CRC: covers header bytes [0..18] + payload
        let mut hasher = crc32fast::Hasher::new();
        hasher.update(&hdr[0..18]);
        hasher.update(&payload);
        let computed_crc = hasher.finalize();
        if computed_crc != stored_crc {
            return Err(StorageError::ChecksumMismatch {
                expected: stored_crc,
                actual: computed_crc,
            });
        }

        let key = payload[..key_len].to_vec();
        let value = payload[key_len..].to_vec();
        let disk_size = RECORD_HEADER_SIZE + payload_len;

        Ok(Some((
            DataRecord {
                timestamp,
                key,
                value,
            },
            disk_size,
        )))
    }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Record Scanner — iterate records from a byte slice or reader
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// Scans records sequentially from a reader, yielding (offset, record) pairs.
|
||||
/// Starts reading from the current reader position. The `base_offset` parameter
|
||||
/// indicates the byte offset in the file where reading begins (typically
|
||||
/// `FILE_HEADER_SIZE` for a data file).
|
||||
pub struct RecordScanner<R> {
|
||||
reader: R,
|
||||
offset: u64,
|
||||
}
|
||||
|
||||
impl<R: Read> RecordScanner<R> {
|
||||
pub fn new(reader: R, base_offset: u64) -> Self {
|
||||
Self {
|
||||
reader,
|
||||
offset: base_offset,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<R: Read> Iterator for RecordScanner<R> {
|
||||
/// (file_offset, record) or an error. Iteration stops on EOF or error.
|
||||
type Item = StorageResult<(u64, DataRecord)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match DataRecord::decode_from(&mut self.reader) {
|
||||
Ok(Some((record, disk_size))) => {
|
||||
let offset = self.offset;
|
||||
self.offset += disk_size as u64;
|
||||
Some(Ok((offset, record)))
|
||||
}
|
||||
Ok(None) => None, // clean EOF
|
||||
Err(e) => Some(Err(e)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------

/// Current time in milliseconds since UNIX epoch.
///
/// # Panics
/// Panics if the system clock reads earlier than the UNIX epoch.
pub fn now_ms() -> u64 {
    let since_epoch = SystemTime::now().duration_since(UNIX_EPOCH).unwrap();
    since_epoch.as_millis() as u64
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;

    // Encode → decode of the 64-byte header preserves every field.
    #[test]
    fn file_header_roundtrip() {
        let hdr = FileHeader::new(FileType::Data);
        let buf = hdr.encode();
        assert_eq!(buf.len(), FILE_HEADER_SIZE);

        let decoded = FileHeader::decode(&buf).unwrap();
        assert_eq!(decoded.version, FORMAT_VERSION);
        assert_eq!(decoded.file_type, FileType::Data);
        assert_eq!(decoded.flags, 0);
        assert_eq!(decoded.created_ms, hdr.created_ms);
    }

    #[test]
    fn file_header_rejects_bad_magic() {
        let mut buf = [0u8; FILE_HEADER_SIZE];
        buf[0..8].copy_from_slice(b"BADMAGIC");
        assert!(FileHeader::decode(&buf).is_err());
    }

    // A header claiming a version newer than FORMAT_VERSION must be rejected.
    #[test]
    fn file_header_rejects_future_version() {
        let mut hdr = FileHeader::new(FileType::Data);
        hdr.version = FORMAT_VERSION + 1;
        let buf = hdr.encode();
        // Manually patch the version in the buffer
        let mut buf2 = buf;
        buf2[8..10].copy_from_slice(&(FORMAT_VERSION + 1).to_le_bytes());
        assert!(FileHeader::decode(&buf2).is_err());
    }

    // Live record round-trips through encode/decode byte-exactly.
    #[test]
    fn record_roundtrip_live() {
        let rec = DataRecord {
            timestamp: 1700000000000,
            key: b"abc123".to_vec(),
            value: b"\x10\x00\x00\x00\x02hi\x00\x03\x00\x00\x00ok\x00\x00".to_vec(),
        };
        let encoded = rec.encode();
        assert_eq!(encoded.len(), rec.disk_size());

        let mut cursor = std::io::Cursor::new(&encoded);
        let (decoded, size) = DataRecord::decode_from(&mut cursor).unwrap().unwrap();
        assert_eq!(size, encoded.len());
        assert_eq!(decoded.timestamp, rec.timestamp);
        assert_eq!(decoded.key, rec.key);
        assert_eq!(decoded.value, rec.value);
        assert!(!decoded.is_tombstone());
    }

    // An empty value marks a tombstone and survives the round trip.
    #[test]
    fn record_roundtrip_tombstone() {
        let rec = DataRecord {
            timestamp: 1700000000000,
            key: b"def456".to_vec(),
            value: vec![],
        };
        assert!(rec.is_tombstone());
        let encoded = rec.encode();

        let mut cursor = std::io::Cursor::new(&encoded);
        let (decoded, _) = DataRecord::decode_from(&mut cursor).unwrap().unwrap();
        assert!(decoded.is_tombstone());
        assert_eq!(decoded.key, b"def456");
    }

    // A single flipped payload byte must surface as ChecksumMismatch.
    #[test]
    fn record_detects_corruption() {
        let rec = DataRecord {
            timestamp: 42,
            key: b"key".to_vec(),
            value: b"value".to_vec(),
        };
        let mut encoded = rec.encode();
        // Flip a bit in the payload
        let last = encoded.len() - 1;
        encoded[last] ^= 0xFF;

        let mut cursor = std::io::Cursor::new(&encoded);
        let result = DataRecord::decode_from(&mut cursor);
        assert!(matches!(result, Err(StorageError::ChecksumMismatch { .. })));
    }

    // A wrong record magic must surface as CorruptRecord (checked before CRC).
    #[test]
    fn record_detects_bad_magic() {
        let rec = DataRecord {
            timestamp: 42,
            key: b"key".to_vec(),
            value: b"value".to_vec(),
        };
        let mut encoded = rec.encode();
        encoded[0] = 0xFF;
        encoded[1] = 0xFF;

        let mut cursor = std::io::Cursor::new(&encoded);
        let result = DataRecord::decode_from(&mut cursor);
        assert!(matches!(result, Err(StorageError::CorruptRecord(_))));
    }

    // EOF before any bytes is a clean end, not an error.
    #[test]
    fn eof_returns_none() {
        let empty: &[u8] = &[];
        let mut cursor = std::io::Cursor::new(empty);
        let result = DataRecord::decode_from(&mut cursor).unwrap();
        assert!(result.is_none());
    }

    // Scanner yields all records in order with correct cumulative offsets.
    #[test]
    fn scanner_iterates_multiple_records() {
        let records = vec![
            DataRecord {
                timestamp: 1,
                key: b"a".to_vec(),
                value: b"v1".to_vec(),
            },
            DataRecord {
                timestamp: 2,
                key: b"b".to_vec(),
                value: b"v2".to_vec(),
            },
            DataRecord {
                timestamp: 3,
                key: b"c".to_vec(),
                value: vec![],
            },
        ];

        let mut buf = Vec::new();
        for r in &records {
            buf.extend_from_slice(&r.encode());
        }

        let scanner = RecordScanner::new(std::io::Cursor::new(&buf), 0);
        let results: Vec<_> = scanner.collect::<Result<Vec<_>, _>>().unwrap();
        assert_eq!(results.len(), 3);
        assert_eq!(results[0].1.key, b"a");
        assert_eq!(results[1].1.key, b"b");
        assert!(results[2].1.is_tombstone());

        // Verify offsets are correct
        assert_eq!(results[0].0, 0);
        assert_eq!(results[1].0, records[0].disk_size() as u64);
        assert_eq!(
            results[2].0,
            (records[0].disk_size() + records[1].disk_size()) as u64
        );
    }
}
|
||||
@@ -1,186 +0,0 @@
|
||||
//! Write-Ahead Log (WAL) for crash recovery.
|
||||
//!
|
||||
//! Before any mutation is applied to storage, it is first written to the WAL.
|
||||
//! On recovery, uncommitted WAL entries can be replayed or discarded.
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use bson::Document;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
use crate::error::StorageResult;
|
||||
|
||||
/// WAL operation kind.
///
/// Serialized (via serde) as part of each [`WalRecord`]; `PartialEq`/`Eq`
/// let recovery code compare operations directly.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum WalOp {
    /// A document was inserted.
    Insert,
    /// An existing document was updated.
    Update,
    /// A document was deleted.
    Delete,
}
|
||||
|
||||
/// A single WAL record.
///
/// Persisted as one JSON line per record (see `WriteAheadLog::append`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRecord {
    /// Sequence number.
    pub seq: u64,
    /// Operation kind.
    pub op: WalOp,
    /// Database name.
    pub db: String,
    /// Collection name.
    pub collection: String,
    /// Document id (hex string).
    pub document_id: String,
    /// Document data (for insert/update).
    pub document: Option<Document>,
    /// Whether this record has been committed (applied to storage).
    pub committed: bool,
    /// CRC32 checksum of the serialized payload for integrity verification.
    /// Computed in `WriteAheadLog::append` over a JSON object of
    /// (op, db, collection, document_id) — the document body is not covered.
    pub checksum: u32,
}
|
||||
|
||||
/// Write-ahead log that persists records to a file.
///
/// Records are appended as JSON lines; `next_seq` hands out monotonically
/// increasing sequence numbers (reloaded from the file on `initialize`).
pub struct WriteAheadLog {
    /// Path of the WAL file on disk.
    path: PathBuf,
    /// Next sequence number to assign.
    next_seq: AtomicU64,
}
|
||||
|
||||
impl WriteAheadLog {
|
||||
/// Create a new WAL at the given file path.
|
||||
pub fn new(path: PathBuf) -> Self {
|
||||
Self {
|
||||
path,
|
||||
next_seq: AtomicU64::new(1),
|
||||
}
|
||||
}
|
||||
|
||||
/// Initialize the WAL (create file if needed, load sequence counter).
|
||||
pub async fn initialize(&self) -> StorageResult<()> {
|
||||
if let Some(parent) = self.path.parent() {
|
||||
tokio::fs::create_dir_all(parent).await?;
|
||||
}
|
||||
if self.path.exists() {
|
||||
// Load existing records to find the max sequence number.
|
||||
let records = self.read_all().await?;
|
||||
if let Some(max_seq) = records.iter().map(|r| r.seq).max() {
|
||||
self.next_seq.store(max_seq + 1, Ordering::SeqCst);
|
||||
}
|
||||
}
|
||||
debug!("WAL initialized at {:?}", self.path);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Append a record to the WAL. Returns the sequence number.
|
||||
pub async fn append(
|
||||
&self,
|
||||
op: WalOp,
|
||||
db: &str,
|
||||
collection: &str,
|
||||
document_id: &str,
|
||||
document: Option<Document>,
|
||||
) -> StorageResult<u64> {
|
||||
let seq = self.next_seq.fetch_add(1, Ordering::SeqCst);
|
||||
|
||||
// Compute checksum over the payload.
|
||||
let payload = serde_json::json!({
|
||||
"op": op,
|
||||
"db": db,
|
||||
"collection": collection,
|
||||
"document_id": document_id,
|
||||
});
|
||||
let payload_bytes = serde_json::to_vec(&payload)?;
|
||||
let checksum = crc32fast::hash(&payload_bytes);
|
||||
|
||||
let record = WalRecord {
|
||||
seq,
|
||||
op,
|
||||
db: db.to_string(),
|
||||
collection: collection.to_string(),
|
||||
document_id: document_id.to_string(),
|
||||
document,
|
||||
committed: false,
|
||||
checksum,
|
||||
};
|
||||
|
||||
let line = serde_json::to_string(&record)?;
|
||||
let mut file = tokio::fs::OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&self.path)
|
||||
.await?;
|
||||
file.write_all(line.as_bytes()).await?;
|
||||
file.write_all(b"\n").await?;
|
||||
file.flush().await?;
|
||||
|
||||
Ok(seq)
|
||||
}
|
||||
|
||||
/// Mark a WAL record as committed by rewriting the file.
|
||||
pub async fn mark_committed(&self, seq: u64) -> StorageResult<()> {
|
||||
let mut records = self.read_all().await?;
|
||||
for record in &mut records {
|
||||
if record.seq == seq {
|
||||
record.committed = true;
|
||||
}
|
||||
}
|
||||
self.write_all(&records).await
|
||||
}
|
||||
|
||||
/// Read all WAL records.
|
||||
pub async fn read_all(&self) -> StorageResult<Vec<WalRecord>> {
|
||||
if !self.path.exists() {
|
||||
return Ok(vec![]);
|
||||
}
|
||||
let data = tokio::fs::read_to_string(&self.path).await?;
|
||||
let mut records = Vec::new();
|
||||
for line in data.lines() {
|
||||
if line.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
match serde_json::from_str::<WalRecord>(line) {
|
||||
Ok(record) => records.push(record),
|
||||
Err(e) => {
|
||||
warn!("skipping corrupt WAL record: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(records)
|
||||
}
|
||||
|
||||
/// Get all uncommitted records (for replay during recovery).
|
||||
pub async fn uncommitted(&self) -> StorageResult<Vec<WalRecord>> {
|
||||
let records = self.read_all().await?;
|
||||
Ok(records.into_iter().filter(|r| !r.committed).collect())
|
||||
}
|
||||
|
||||
/// Truncate the WAL, removing all committed records.
|
||||
pub async fn truncate_committed(&self) -> StorageResult<()> {
|
||||
let records = self.read_all().await?;
|
||||
let uncommitted: Vec<_> = records.into_iter().filter(|r| !r.committed).collect();
|
||||
self.write_all(&uncommitted).await
|
||||
}
|
||||
|
||||
/// Clear the entire WAL.
|
||||
pub async fn clear(&self) -> StorageResult<()> {
|
||||
if self.path.exists() {
|
||||
tokio::fs::write(&self.path, "").await?;
|
||||
}
|
||||
self.next_seq.store(1, Ordering::SeqCst);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Write all records to the WAL file (overwrites).
|
||||
async fn write_all(&self, records: &[WalRecord]) -> StorageResult<()> {
|
||||
let mut content = String::new();
|
||||
for record in records {
|
||||
let line = serde_json::to_string(record)?;
|
||||
content.push_str(&line);
|
||||
content.push('\n');
|
||||
}
|
||||
tokio::fs::write(&self.path, content).await?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||