use anyhow::Result;
use chrono::{DateTime, Utc};
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use tokio::fs;
use tokio::sync::RwLock;

use super::config::ErasureConfig;
use super::erasure::ErasureCoder;
use super::metadata::ObjectManifest;
use super::shard_store::{ShardId, ShardStore};
use super::state::ClusterState;

/// Background healing service that scans for under-replicated shards
/// and reconstructs them.
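///
/// A construction sketch (hypothetical `state`, `config`, and `stores`, plus an
/// illustrative manifest path; adapt to the deployment):
///
/// ```ignore
/// let healer = HealingService::new(
///     state,
///     &config,
///     stores,
///     PathBuf::from("/var/lib/store/manifests"),
///     24, // scan once a day
///     Arc::new(RwLock::new(HealingRuntimeState::default())),
/// )?;
/// ```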
pub struct HealingService {
    state: Arc<ClusterState>,
    erasure_coder: ErasureCoder,
    local_shard_stores: Vec<Arc<ShardStore>>,
    manifest_dir: PathBuf,
    scan_interval: Duration,
    runtime_state: Arc<RwLock<HealingRuntimeState>>,
}

impl HealingService {
    pub fn new(
        state: Arc<ClusterState>,
        erasure_config: &ErasureConfig,
        local_shard_stores: Vec<Arc<ShardStore>>,
        manifest_dir: PathBuf,
        scan_interval_hours: u64,
        runtime_state: Arc<RwLock<HealingRuntimeState>>,
    ) -> Result<Self> {
        let scan_interval = Duration::from_secs(scan_interval_hours * 3600);
        // Best-effort: if the lock is contended during construction, the
        // interval is published again at the start of every scan.
        if let Ok(mut state_guard) = runtime_state.try_write() {
            state_guard.scan_interval_ms = scan_interval.as_millis() as u64;
        }

        Ok(Self {
            state,
            erasure_coder: ErasureCoder::new(erasure_config)?,
            local_shard_stores,
            manifest_dir,
            scan_interval,
            runtime_state,
        })
    }

    /// Shared handle to the healing runtime state, e.g. for a status endpoint.
    pub fn runtime_state(&self) -> Arc<RwLock<HealingRuntimeState>> {
        self.runtime_state.clone()
    }

    /// Run the healing loop as a background task.
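    ///
    /// A minimal wiring sketch (hypothetical `service: Arc<HealingService>`);
    /// the `watch` channel provides cooperative shutdown:
    ///
    /// ```ignore
    /// let (shutdown_tx, shutdown_rx) = tokio::sync::watch::channel(false);
    /// let svc = service.clone();
    /// tokio::spawn(async move { svc.run(shutdown_rx).await });
    /// // ... later: inspect status, then stop the loop.
    /// let snapshot = service.runtime_state().read().await.clone();
    /// tracing::info!(active = snapshot.active, "healing status");
    /// let _ = shutdown_tx.send(true);
    /// ```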
    pub async fn run(&self, mut shutdown: tokio::sync::watch::Receiver<bool>) {
        let mut interval = tokio::time::interval(self.scan_interval);

        // Skip the first immediate tick
        interval.tick().await;

        loop {
            tokio::select! {
                _ = interval.tick() => {
                    let started_at = Utc::now();
                    self.mark_healing_started(started_at).await;
                    tracing::info!("Starting healing scan");
                    match self.heal_scan().await {
                        Ok(stats) => {
                            self.mark_healing_finished(started_at, Some(stats.clone()), None).await;
                            tracing::info!(
                                checked = stats.shards_checked,
                                healed = stats.shards_healed,
                                errors = stats.errors,
                                "Healing scan completed"
                            );
                        }
                        Err(e) => {
                            self.mark_healing_finished(started_at, None, Some(e.to_string())).await;
                            tracing::error!("Healing scan failed: {}", e);
                        }
                    }
                }
                _ = shutdown.changed() => {
                    tracing::info!("Healing service shutting down");
                    break;
                }
            }
        }
    }

    async fn mark_healing_started(&self, started_at: DateTime<Utc>) {
        let mut runtime_state = self.runtime_state.write().await;
        runtime_state.active = true;
        runtime_state.scan_interval_ms = self.scan_interval.as_millis() as u64;
        runtime_state.last_run_started_at = Some(started_at);
        runtime_state.last_error = None;
    }

    async fn mark_healing_finished(
        &self,
        started_at: DateTime<Utc>,
        stats: Option<HealStats>,
        last_error: Option<String>,
    ) {
        let finished_at = Utc::now();
        let mut runtime_state = self.runtime_state.write().await;
        runtime_state.active = false;
        runtime_state.scan_interval_ms = self.scan_interval.as_millis() as u64;
        runtime_state.last_run_completed_at = Some(finished_at);
        runtime_state.last_duration_ms = Some(
            finished_at
                .signed_duration_since(started_at)
                .num_milliseconds()
                .max(0) as u64,
        );
        if let Some(stats) = stats {
            runtime_state.last_stats = Some(stats);
        }
        runtime_state.last_error = last_error;
    }

    /// Scan all manifests for shards on offline nodes, reconstruct and re-place them.
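    ///
    /// A chunk stays healable as long as at least `data_shards` of its
    /// `data_shards + parity_shards` shards remain reachable: with a 4+2
    /// layout, for example, any 4 of the 6 shards can rebuild the other 2.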
    async fn heal_scan(&self) -> Result<HealStats> {
        let mut stats = HealStats::default();

        let offline_nodes = self.state.offline_nodes().await;
        if offline_nodes.is_empty() {
            tracing::debug!("No offline nodes, skipping heal scan");
            return Ok(stats);
        }

        // Check that we have majority before healing (split-brain prevention)
        if !self.state.has_majority().await {
            tracing::warn!("No majority quorum, skipping heal to prevent split-brain");
            return Ok(stats);
        }

        tracing::info!(
            "Found {} offline nodes, scanning for affected shards",
            offline_nodes.len()
        );

        // Iterate all bucket directories under manifest_dir
        let mut bucket_entries = match fs::read_dir(&self.manifest_dir).await {
            Ok(e) => e,
            Err(_) => return Ok(stats),
        };

        while let Some(bucket_entry) = bucket_entries.next_entry().await? {
            if !bucket_entry.metadata().await?.is_dir() {
                continue;
            }
            let bucket_name = bucket_entry.file_name().to_string_lossy().to_string();
            if bucket_name.starts_with('.') {
                continue;
            }

            // Scan manifests in this bucket
            self.heal_bucket(&bucket_name, &offline_nodes, &mut stats)
                .await;

            // Yield to avoid starving foreground I/O
            tokio::task::yield_now().await;
        }

        Ok(stats)
    }

    async fn heal_bucket(&self, bucket: &str, offline_nodes: &[String], stats: &mut HealStats) {
        let bucket_dir = self.manifest_dir.join(bucket);
        let manifests = match self.collect_manifests(&bucket_dir).await {
            Ok(m) => m,
            Err(e) => {
                tracing::warn!(bucket = bucket, error = %e, "Failed to list manifests");
                stats.errors += 1;
                return;
            }
        };

        let local_id = self.state.local_node_id().to_string();

        for manifest in &manifests {
            for chunk in &manifest.chunks {
                // Check if any shard in this chunk is on an offline node
                let affected: Vec<_> = chunk
                    .shard_placements
                    .iter()
                    .filter(|p| offline_nodes.contains(&p.node_id))
                    .collect();

                if affected.is_empty() {
                    continue;
                }

                stats.shards_checked += chunk.shard_placements.len() as u64;

                // Try to reconstruct missing shards from available ones
                let k = manifest.data_shards;
                let total = manifest.data_shards + manifest.parity_shards;

                // Count available shards (those NOT on offline nodes)
                let available_count = chunk
                    .shard_placements
                    .iter()
                    .filter(|p| !offline_nodes.contains(&p.node_id))
                    .count();

                if available_count < k {
                    tracing::error!(
                        bucket = manifest.bucket,
                        key = manifest.key,
                        chunk = chunk.chunk_index,
                        available = available_count,
                        needed = k,
                        "Cannot heal chunk: not enough available shards"
                    );
                    stats.errors += 1;
                    continue;
                }

                // Fetch available shards (only local ones for now)
                let mut shards: Vec<Option<Vec<u8>>> = vec![None; total];
                let mut fetched = 0usize;

                for placement in &chunk.shard_placements {
                    if offline_nodes.contains(&placement.node_id) {
                        continue; // Skip offline nodes
                    }
                    if fetched >= k {
                        break; // k shards are enough to decode
                    }

                    if placement.node_id == local_id {
                        let shard_id = ShardId {
                            bucket: manifest.bucket.clone(),
                            key: manifest.key.clone(),
                            chunk_index: chunk.chunk_index,
                            shard_index: placement.shard_index,
                        };
                        let store_idx = placement.drive_id.parse::<usize>().unwrap_or(0);
                        if let Some(store) = self.local_shard_stores.get(store_idx) {
                            if let Ok((data, _)) = store.read_shard(&shard_id).await {
                                shards[placement.shard_index as usize] = Some(data);
                                fetched += 1;
                            }
                        }
                    }
                    // TODO: fetch from other online remote nodes
                }

                if fetched < k {
                    tracing::warn!(
                        bucket = manifest.bucket,
                        key = manifest.key,
                        chunk = chunk.chunk_index,
                        "Not enough local shards to heal, skipping"
                    );
                    continue;
                }

                // Reconstruct the missing shards in place
                let reconstructed = match self
                    .erasure_coder
                    .decode_chunk(&mut shards, chunk.data_size)
                {
                    Ok(_) => true,
                    Err(e) => {
                        tracing::error!(
                            bucket = manifest.bucket,
                            key = manifest.key,
                            chunk = chunk.chunk_index,
                            error = %e,
                            "Reconstruction failed"
                        );
                        stats.errors += 1;
                        false
                    }
                };

                if !reconstructed {
                    continue;
                }

                // Re-encode to get all shards back (including the missing ones).
                // After a successful decode every data shard is populated, so
                // concatenating them in index order recovers the chunk payload.
                let full_data_size = chunk.data_size;
                let mut data_buf = Vec::with_capacity(full_data_size);
                for i in 0..k {
                    if let Some(ref shard) = shards[i] {
                        data_buf.extend_from_slice(shard);
                    }
                }
                data_buf.truncate(full_data_size);

                let all_shards = match self.erasure_coder.encode_chunk(&data_buf) {
                    Ok(s) => s,
                    Err(e) => {
                        tracing::error!(error = %e, "Re-encoding for heal failed");
                        stats.errors += 1;
                        continue;
                    }
                };

                // Verify reconstructed shards are consistent
                if !self.erasure_coder.verify(&all_shards).unwrap_or(false) {
                    tracing::error!(
                        bucket = manifest.bucket,
                        key = manifest.key,
                        chunk = chunk.chunk_index,
                        "Shard verification failed after reconstruction"
                    );
                    stats.errors += 1;
                    continue;
                }

                // Write the missing shards to the first available local drive
                for affected_placement in &affected {
                    let shard_idx = affected_placement.shard_index as usize;
                    if shard_idx < all_shards.len() {
                        let shard_data = &all_shards[shard_idx];
                        let checksum = crc32c::crc32c(shard_data);

                        let shard_id = ShardId {
                            bucket: manifest.bucket.clone(),
                            key: manifest.key.clone(),
                            chunk_index: chunk.chunk_index,
                            shard_index: affected_placement.shard_index,
                        };

                        // Place on first available local drive
                        if let Some(store) = self.local_shard_stores.first() {
                            match store.write_shard(&shard_id, shard_data, checksum).await {
                                Ok(()) => {
                                    stats.shards_healed += 1;
                                    tracing::info!(
                                        bucket = manifest.bucket,
                                        key = manifest.key,
                                        chunk = chunk.chunk_index,
                                        shard = affected_placement.shard_index,
                                        "Shard healed successfully"
                                    );
                                }
                                Err(e) => {
                                    tracing::error!(error = %e, "Failed to write healed shard");
                                    stats.errors += 1;
                                }
                            }
                        }
                    }
                }
            }

            // Yield between manifests to avoid starving foreground I/O
            tokio::task::yield_now().await;
        }
    }

    /// Collect all manifests under a bucket directory.
    async fn collect_manifests(&self, dir: &std::path::Path) -> Result<Vec<ObjectManifest>> {
        let mut manifests = Vec::new();
        self.collect_manifests_recursive(dir, &mut manifests)
            .await?;
        Ok(manifests)
    }
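
    // A recursive `async fn` cannot name its own future type (it would be
    // infinitely sized), so the recursion is expressed by boxing the returned
    // future behind `Pin<Box<dyn Future>>`.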
    fn collect_manifests_recursive<'a>(
        &'a self,
        dir: &'a std::path::Path,
        manifests: &'a mut Vec<ObjectManifest>,
    ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<()>> + Send + 'a>> {
        Box::pin(async move {
            let mut entries = match fs::read_dir(dir).await {
                Ok(e) => e,
                Err(_) => return Ok(()),
            };

            while let Some(entry) = entries.next_entry().await? {
                let meta = entry.metadata().await?;
                let name = entry.file_name().to_string_lossy().to_string();

                if meta.is_dir() {
                    self.collect_manifests_recursive(&entry.path(), manifests)
                        .await?;
                } else if name.ends_with(".manifest.json") {
                    if let Ok(content) = fs::read_to_string(entry.path()).await {
                        if let Ok(manifest) = serde_json::from_str::<ObjectManifest>(&content) {
                            manifests.push(manifest);
                        }
                    }
                }
            }

            Ok(())
        })
    }
}

#[derive(Debug, Clone, Default)]
pub struct HealStats {
    pub shards_checked: u64,
    pub shards_healed: u64,
    pub errors: u64,
}

#[derive(Debug, Clone, Default)]
pub struct HealingRuntimeState {
    pub active: bool,
    pub scan_interval_ms: u64,
    pub last_run_started_at: Option<DateTime<Utc>>,
    pub last_run_completed_at: Option<DateTime<Utc>>,
    pub last_duration_ms: Option<u64>,
    pub last_stats: Option<HealStats>,
    pub last_error: Option<String>,
}
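
// A minimal sanity check of the bookkeeping types; constructing the service
// itself needs a live ClusterState and shard stores, so it is not attempted here.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn runtime_state_defaults_to_idle() {
        let state = HealingRuntimeState::default();
        assert!(!state.active);
        assert!(state.last_stats.is_none());
        assert!(state.last_error.is_none());
    }
}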