feat(storage): add offline data validation and strengthen storage/index integrity checks

This commit is contained in:
2026-04-05 02:46:05 +00:00
parent b8567ebe08
commit 418e8dc052
13 changed files with 724 additions and 41 deletions
+115 -6
View File
@@ -6,7 +6,7 @@
//! The KeyDir can be rebuilt from a data file scan, or loaded quickly from a
//! persisted hint file for fast restart.
use std::io::{self, BufReader, BufWriter, Read, Write};
use std::io::{self, BufReader, BufWriter, Read, Seek, SeekFrom, Write};
use std::path::Path;
use std::sync::atomic::{AtomicU64, Ordering};
@@ -14,7 +14,7 @@ use dashmap::DashMap;
use crate::error::{StorageError, StorageResult};
use crate::record::{
FileHeader, FileType, RecordScanner, FILE_HEADER_SIZE, FORMAT_VERSION,
DataRecord, FileHeader, FileType, RecordScanner, FILE_HEADER_SIZE, FORMAT_VERSION,
};
// ---------------------------------------------------------------------------
@@ -34,6 +34,23 @@ pub struct KeyDirEntry {
pub timestamp: u64,
}
// ---------------------------------------------------------------------------
// BuildStats — statistics from building KeyDir from a data file scan
// ---------------------------------------------------------------------------
/// Statistics collected while building a KeyDir from a data file scan.
///
/// All counters start at zero (`Default`). The type is comparable with `==`
/// so callers and tests can check a whole scan result in one assertion.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct BuildStats {
    /// Total records scanned (live + tombstones + superseded).
    pub total_records_scanned: u64,
    /// Number of live documents in the final KeyDir (its length once the
    /// scan has finished).
    pub live_documents: u64,
    /// Number of tombstone records encountered.
    pub tombstones: u64,
    /// Number of records superseded by a later write for the same key.
    /// A record deleted by a tombstone is not counted here.
    pub superseded_records: u64,
}
// ---------------------------------------------------------------------------
// KeyDir
// ---------------------------------------------------------------------------
@@ -116,9 +133,9 @@ impl KeyDir {
/// Rebuild the KeyDir by scanning an entire data file.
/// The file must start with a valid `FileHeader`.
/// Returns `(keydir, dead_bytes)` where `dead_bytes` is the total size of
/// Returns `(keydir, dead_bytes, stats)` where `dead_bytes` is the total size of
/// stale records (superseded by later writes or tombstoned).
pub fn build_from_data_file(path: &Path) -> StorageResult<(Self, u64)> {
pub fn build_from_data_file(path: &Path) -> StorageResult<(Self, u64, BuildStats)> {
let file = std::fs::File::open(path)?;
let mut reader = BufReader::new(file);
@@ -135,6 +152,7 @@ impl KeyDir {
let keydir = KeyDir::new();
let mut dead_bytes: u64 = 0;
let mut stats = BuildStats::default();
let scanner = RecordScanner::new(reader, FILE_HEADER_SIZE as u64);
for result in scanner {
@@ -146,7 +164,10 @@ impl KeyDir {
let key = String::from_utf8(record.key)
.map_err(|e| StorageError::CorruptRecord(format!("invalid UTF-8 key: {e}")))?;
stats.total_records_scanned += 1;
if is_tombstone {
stats.tombstones += 1;
// Remove from index; the tombstone itself is dead weight
if let Some(prev) = keydir.remove(&key) {
dead_bytes += prev.record_len as u64;
@@ -162,11 +183,13 @@ impl KeyDir {
if let Some(prev) = keydir.insert(key, entry) {
// Previous version of same key is now dead
dead_bytes += prev.record_len as u64;
stats.superseded_records += 1;
}
}
}
Ok((keydir, dead_bytes))
stats.live_documents = keydir.len();
Ok((keydir, dead_bytes, stats))
}
// -----------------------------------------------------------------------
@@ -271,6 +294,86 @@ impl KeyDir {
Ok(Some(keydir))
}
// -----------------------------------------------------------------------
// Hint file validation
// -----------------------------------------------------------------------
/// Validate this KeyDir (loaded from a hint file) against the actual data file.
///
/// Returns `Ok(true)` if the hint appears consistent, `Ok(false)` if a rebuild
/// from the data file is recommended.
///
/// Checks:
/// 1. All entry offsets are >= FILE_HEADER_SIZE.
/// 2. All entry offsets + record_len fit within the data file size. The sum
///    is computed with overflow checking, so an absurd offset in the index
///    is rejected instead of wrapping around.
/// 3. Up to `sample_size` entries, evenly spaced in offset order, are
///    spot-checked by decoding the record at the entry's offset and verifying
///    that its key bytes equal the indexed key.
///
/// A `sample_size` of 0 disables the spot-check: only the bounds checks run.
///
/// If the data file cannot be stat'ed, or is shorter than a file header, the
/// hint is considered consistent only when this KeyDir is empty.
///
/// # Errors
///
/// Returns an error if the data file cannot be opened or seeked for the
/// spot-check. A record that fails to decode is not an error; it yields
/// `Ok(false)`.
pub fn validate_against_data_file(&self, data_path: &Path, sample_size: usize) -> StorageResult<bool> {
    let header_len = FILE_HEADER_SIZE as u64;

    // A metadata failure is treated like a zero-length file, so it falls into
    // the "too small" branch below rather than surfacing as an error.
    let file_size = std::fs::metadata(data_path)
        .map(|m| m.len())
        .unwrap_or(0);
    if file_size < header_len {
        // Data file is too small to even contain a header
        return Ok(self.is_empty());
    }

    // Pass 1: bounds check all entries
    let mut all_keys: Vec<(String, KeyDirEntry)> = Vec::with_capacity(self.len() as usize);
    let mut bounds_ok = true;
    self.for_each(|key, entry| {
        if !bounds_ok {
            // The verdict is already `false`; do not keep cloning keys.
            return;
        }
        // `checked_add` guards against offset + len wrapping past u64::MAX,
        // which would let a corrupt entry slip through the size comparison.
        let in_bounds = entry.offset >= header_len
            && entry
                .offset
                .checked_add(entry.record_len as u64)
                .map_or(false, |end| end <= file_size);
        if in_bounds {
            all_keys.push((key.to_string(), *entry));
        } else {
            bounds_ok = false;
        }
    });
    if !bounds_ok {
        return Ok(false);
    }

    // Pass 2: spot-check a sample of entries by reading records from the data file.
    // Nothing to sample (empty index, or sampling disabled with sample_size == 0).
    if all_keys.is_empty() || sample_size == 0 {
        return Ok(true);
    }

    // Sort by offset for sequential I/O, then visit every `step`-th entry so
    // the sample is spread across the whole file. Offsets are only a sort key,
    // so an unstable sort is sufficient.
    all_keys.sort_unstable_by_key(|(_, e)| e.offset);
    // `sample_size` is non-zero here; `max(1)` covers len < sample_size.
    let step = (all_keys.len() / sample_size).max(1);

    let mut file = std::fs::File::open(data_path)?;
    for (expected_key, entry) in all_keys.iter().step_by(step).take(sample_size) {
        // Seek to the entry's offset and try to decode the record
        file.seek(SeekFrom::Start(entry.offset))?;
        match DataRecord::decode_from(&mut file) {
            Ok(Some((record, _disk_size))) => {
                // Compare raw bytes: a lossy UTF-8 conversion could make
                // different invalid byte sequences look equal.
                let on_disk_key: &[u8] = &record.key;
                if on_disk_key != expected_key.as_bytes() {
                    return Ok(false);
                }
            }
            Ok(None) | Err(_) => {
                return Ok(false);
            }
        }
    }
    Ok(true)
}
}
impl Default for KeyDir {
@@ -372,7 +475,7 @@ mod tests {
f.write_all(&r3.encode()).unwrap();
}
let (kd, dead_bytes) = KeyDir::build_from_data_file(&data_path).unwrap();
let (kd, dead_bytes, stats) = KeyDir::build_from_data_file(&data_path).unwrap();
// Only B should be live
assert_eq!(kd.len(), 1);
@@ -381,6 +484,12 @@ mod tests {
// Dead bytes: r1 (aaa, live until deleted by the tombstone) + r3 (the tombstone itself)
assert!(dead_bytes > 0);
// Stats
assert_eq!(stats.total_records_scanned, 3);
assert_eq!(stats.live_documents, 1);
assert_eq!(stats.tombstones, 1);
assert_eq!(stats.superseded_records, 0); // aaa was removed by tombstone, not superseded
}
#[test]