feat(storage): add offline data validation and strengthen storage/index integrity checks
This commit is contained in:
@@ -21,7 +21,7 @@ use std::sync::Arc;
|
||||
use async_trait::async_trait;
|
||||
use bson::{doc, oid::ObjectId, Document};
|
||||
use dashmap::DashMap;
|
||||
use tracing::debug;
|
||||
use tracing::{debug, info};
|
||||
|
||||
use crate::adapter::StorageAdapter;
|
||||
use crate::binary_wal::{BinaryWal, WalOpType};
|
||||
@@ -83,6 +83,20 @@ impl CollectionState {
|
||||
.map_err(|e| StorageError::SerializationError(format!("BSON decode: {e}")))
|
||||
}
|
||||
|
||||
/// Ensure a data file has the 64-byte SMARTDB header.
|
||||
/// If the file was just created (empty), writes the header and updates
|
||||
/// the data_file_size counter. Must be called under write_lock.
|
||||
fn ensure_data_header(&self, file: &mut std::fs::File) -> StorageResult<()> {
|
||||
let pos = file.seek(SeekFrom::End(0))?;
|
||||
if pos == 0 {
|
||||
let hdr = FileHeader::new(FileType::Data);
|
||||
file.write_all(&hdr.encode())?;
|
||||
self.data_file_size
|
||||
.fetch_add(FILE_HEADER_SIZE as u64, Ordering::Relaxed);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Append a data record and update the KeyDir. Must be called under write_lock.
|
||||
fn append_record(
|
||||
&self,
|
||||
@@ -104,6 +118,7 @@ impl CollectionState {
|
||||
.append(true)
|
||||
.open(&data_path)?;
|
||||
|
||||
self.ensure_data_header(&mut file)?;
|
||||
let offset = file.seek(SeekFrom::End(0))?;
|
||||
file.write_all(&encoded)?;
|
||||
file.sync_all()?;
|
||||
@@ -137,6 +152,7 @@ impl CollectionState {
|
||||
.append(true)
|
||||
.open(&data_path)?;
|
||||
|
||||
self.ensure_data_header(&mut file)?;
|
||||
file.write_all(&encoded)?;
|
||||
file.sync_all()?;
|
||||
|
||||
@@ -160,6 +176,11 @@ impl CollectionState {
|
||||
&self.data_file_size,
|
||||
) {
|
||||
tracing::warn!("compaction failed for {:?}: {e}", self.coll_dir);
|
||||
} else {
|
||||
// Persist hint file after successful compaction to prevent stale hints
|
||||
if let Err(e) = self.keydir.persist_to_hint_file(&self.hint_path()) {
|
||||
tracing::warn!("failed to persist hint after compaction for {:?}: {e}", self.coll_dir);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -234,33 +255,42 @@ impl FileStorageAdapter {
|
||||
let hint_path = coll_dir.join("keydir.hint");
|
||||
|
||||
// Try loading from hint file first, fall back to data file scan
|
||||
let (keydir, dead_bytes) = if hint_path.exists() && data_path.exists() {
|
||||
let (keydir, dead_bytes, loaded_from_hint) = if hint_path.exists() && data_path.exists() {
|
||||
match KeyDir::load_from_hint_file(&hint_path) {
|
||||
Ok(Some(kd)) => {
|
||||
debug!("loaded KeyDir from hint file: {:?}", hint_path);
|
||||
// We don't know dead_bytes from the hint file; estimate from file size
|
||||
let file_size = std::fs::metadata(&data_path)
|
||||
.map(|m| m.len())
|
||||
.unwrap_or(FILE_HEADER_SIZE as u64);
|
||||
let live_bytes: u64 = {
|
||||
let mut total = 0u64;
|
||||
kd.for_each(|_, e| total += e.record_len as u64);
|
||||
total
|
||||
};
|
||||
let dead = file_size.saturating_sub(FILE_HEADER_SIZE as u64).saturating_sub(live_bytes);
|
||||
(kd, dead)
|
||||
// Validate hint against actual data file
|
||||
let hint_valid = kd.validate_against_data_file(&data_path, 16)
|
||||
.unwrap_or(false);
|
||||
if hint_valid {
|
||||
debug!("loaded KeyDir from hint file: {:?}", hint_path);
|
||||
let file_size = std::fs::metadata(&data_path)
|
||||
.map(|m| m.len())
|
||||
.unwrap_or(FILE_HEADER_SIZE as u64);
|
||||
let live_bytes: u64 = {
|
||||
let mut total = 0u64;
|
||||
kd.for_each(|_, e| total += e.record_len as u64);
|
||||
total
|
||||
};
|
||||
let dead = file_size.saturating_sub(FILE_HEADER_SIZE as u64).saturating_sub(live_bytes);
|
||||
(kd, dead, true)
|
||||
} else {
|
||||
tracing::warn!("hint file {:?} is stale, rebuilding from data file", hint_path);
|
||||
let (kd, dead, _stats) = KeyDir::build_from_data_file(&data_path)?;
|
||||
(kd, dead, false)
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
debug!("hint file invalid, rebuilding KeyDir from data file");
|
||||
KeyDir::build_from_data_file(&data_path)?
|
||||
let (kd, dead, _stats) = KeyDir::build_from_data_file(&data_path)?;
|
||||
(kd, dead, false)
|
||||
}
|
||||
}
|
||||
} else if data_path.exists() {
|
||||
KeyDir::build_from_data_file(&data_path)?
|
||||
let (kd, dead, _stats) = KeyDir::build_from_data_file(&data_path)?;
|
||||
(kd, dead, false)
|
||||
} else {
|
||||
(KeyDir::new(), 0)
|
||||
(KeyDir::new(), 0, false)
|
||||
};
|
||||
|
||||
let doc_count = keydir.len();
|
||||
let data_file_size = if data_path.exists() {
|
||||
std::fs::metadata(&data_path)?.len()
|
||||
@@ -268,6 +298,15 @@ impl FileStorageAdapter {
|
||||
FILE_HEADER_SIZE as u64
|
||||
};
|
||||
|
||||
info!(
|
||||
collection = %coll_dir.display(),
|
||||
documents = doc_count,
|
||||
data_bytes = data_file_size,
|
||||
dead_bytes = dead_bytes,
|
||||
source = if loaded_from_hint { "hint" } else { "scan" },
|
||||
"loaded collection"
|
||||
);
|
||||
|
||||
// Initialize WAL and recover
|
||||
let wal = BinaryWal::new(wal_path);
|
||||
wal.initialize()?;
|
||||
@@ -275,10 +314,10 @@ impl FileStorageAdapter {
|
||||
// Recover uncommitted WAL entries
|
||||
let uncommitted = wal.recover()?;
|
||||
if !uncommitted.is_empty() {
|
||||
debug!(
|
||||
"recovering {} uncommitted WAL entries for {:?}",
|
||||
uncommitted.len(),
|
||||
coll_dir
|
||||
info!(
|
||||
collection = %coll_dir.display(),
|
||||
entries = uncommitted.len(),
|
||||
"recovering uncommitted WAL entries"
|
||||
);
|
||||
}
|
||||
|
||||
@@ -415,15 +454,18 @@ impl FileStorageAdapter {
|
||||
impl StorageAdapter for FileStorageAdapter {
|
||||
async fn initialize(&self) -> StorageResult<()> {
|
||||
std::fs::create_dir_all(&self.base_path)?;
|
||||
debug!("FileStorageAdapter initialized at {:?}", self.base_path);
|
||||
|
||||
// Pre-load all existing collections
|
||||
let mut db_count: usize = 0;
|
||||
if let Ok(entries) = std::fs::read_dir(&self.base_path) {
|
||||
for entry in entries.flatten() {
|
||||
if entry.file_type().map(|ft| ft.is_dir()).unwrap_or(false) {
|
||||
if let Some(db_name) = entry.file_name().to_str() {
|
||||
let db_name = db_name.to_string();
|
||||
if let Ok(colls) = self.list_collection_dirs(&db_name) {
|
||||
if !colls.is_empty() {
|
||||
db_count += 1;
|
||||
}
|
||||
for coll_name in colls {
|
||||
let _ = self.get_or_init_collection(&db_name, &coll_name);
|
||||
}
|
||||
@@ -433,6 +475,13 @@ impl StorageAdapter for FileStorageAdapter {
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
databases = db_count,
|
||||
collections = self.collections.len(),
|
||||
path = %self.base_path.display(),
|
||||
"FileStorageAdapter initialization complete"
|
||||
);
|
||||
|
||||
// Start periodic compaction task (runs every 24 hours)
|
||||
{
|
||||
let collections = self.collections.clone();
|
||||
|
||||
Reference in New Issue
Block a user