Files
smartstorage/rust/src/cluster/healing.rs
T

93 lines
3.0 KiB
Rust
Raw Normal View History

use anyhow::Result;
use std::sync::Arc;
use std::time::Duration;
use super::coordinator::DistributedStore;
use super::state::ClusterState;
/// Periodic background service meant to find under-replicated shards and
/// rebuild them.
///
/// The service itself only holds the shared cluster view and the scan
/// cadence; the actual shard reconstruction is still a TODO inside
/// `heal_scan`.
pub struct HealingService {
    /// Shared cluster membership/health view queried on every scan.
    state: Arc<ClusterState>,
    /// Time between two consecutive healing scans.
    scan_interval: Duration,
}
impl HealingService {
    /// Creates a healing service that scans every `scan_interval_hours` hours.
    ///
    /// The hour count is converted to seconds with saturating arithmetic, so
    /// an extremely large value clamps to the largest representable number of
    /// seconds instead of overflowing. A value of `0` yields a zero interval,
    /// which [`Self::run`] treats as "periodic healing disabled".
    pub fn new(state: Arc<ClusterState>, scan_interval_hours: u64) -> Self {
        Self {
            state,
            scan_interval: Duration::from_secs(scan_interval_hours.saturating_mul(3600)),
        }
    }

    /// Run the healing loop as a background task.
    ///
    /// One scan is started per interval tick. The tick that fires immediately
    /// after the interval is created is consumed up front, so the first scan
    /// happens one full interval after start-up. The loop ends as soon as
    /// `shutdown` reports a change (an error from `changed()` also ends it).
    ///
    /// A scan is awaited inside the tick arm, so a shutdown notification that
    /// arrives while a scan is in progress is only acted on once that scan
    /// has finished.
    pub async fn run(&self, mut shutdown: tokio::sync::watch::Receiver<bool>) {
        // `tokio::time::interval` panics when given a zero period; return
        // early with a warning rather than crashing the background task.
        if self.scan_interval.is_zero() {
            tracing::warn!("Healing scan interval is zero, periodic healing is disabled");
            return;
        }

        let mut interval = tokio::time::interval(self.scan_interval);
        // If a scan overruns its slot, push later scans back instead of
        // firing a burst of catch-up scans (the default behaviour).
        interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
        // Skip the first immediate tick
        interval.tick().await;

        loop {
            tokio::select! {
                _ = interval.tick() => {
                    tracing::info!("Starting healing scan");
                    match self.heal_scan().await {
                        Ok(stats) => {
                            tracing::info!(
                                checked = stats.shards_checked,
                                healed = stats.shards_healed,
                                errors = stats.errors,
                                "Healing scan completed"
                            );
                        }
                        Err(e) => {
                            // `{:#}` prints the whole anyhow context chain,
                            // not just the outermost message.
                            tracing::error!("Healing scan failed: {:#}", e);
                        }
                    }
                }
                _ = shutdown.changed() => {
                    tracing::info!("Healing service shutting down");
                    break;
                }
            }
        }
    }

    /// Scan for offline nodes and identify objects that need healing.
    ///
    /// Returns zeroed [`HealStats`] without doing any work when there are no
    /// offline nodes or when the cluster state reports no majority. The
    /// reconstruction step itself is not implemented yet (see the TODO below),
    /// so the returned counters are currently always zero.
    async fn heal_scan(&self) -> Result<HealStats> {
        let stats = HealStats::default();

        let offline_nodes = self.state.offline_nodes().await;
        if offline_nodes.is_empty() {
            tracing::debug!("No offline nodes, skipping heal scan");
            return Ok(stats);
        }

        tracing::info!(
            "Found {} offline nodes, scanning for affected shards",
            offline_nodes.len()
        );

        // Check that we have majority before healing
        // (prevents healing during split-brain)
        if !self.state.has_majority().await {
            tracing::warn!("No majority quorum, skipping heal to prevent split-brain");
            return Ok(stats);
        }

        // TODO: Iterate all manifests, find shards on offline nodes,
        // reconstruct from remaining shards and place on healthy nodes.
        // This requires access to the DistributedStore and manifest listing
        // which will be wired in when the full healing pipeline is implemented.
        Ok(stats)
    }
}
/// Counters summarising one healing scan.
///
/// `Default` yields all-zero counters. `Clone`, `PartialEq` and `Eq` are
/// derived so results can be duplicated and compared, e.g. in tests.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct HealStats {
    /// Number of shards examined during the scan.
    pub shards_checked: u64,
    /// Number of shards that were healed.
    pub shards_healed: u64,
    /// Number of errors recorded during the scan.
    pub errors: u64,
}