93 lines
3.0 KiB
Rust
93 lines
3.0 KiB
Rust
|
|
use anyhow::Result;
|
||
|
|
use std::sync::Arc;
|
||
|
|
use std::time::Duration;
|
||
|
|
|
||
|
|
use super::coordinator::DistributedStore;
|
||
|
|
use super::state::ClusterState;
|
||
|
|
|
||
|
|
/// Background healing service that scans for under-replicated shards
|
||
|
|
/// and reconstructs them.
|
||
|
|
pub struct HealingService {
|
||
|
|
state: Arc<ClusterState>,
|
||
|
|
scan_interval: Duration,
|
||
|
|
}
|
||
|
|
|
||
|
|
impl HealingService {
|
||
|
|
pub fn new(state: Arc<ClusterState>, scan_interval_hours: u64) -> Self {
|
||
|
|
Self {
|
||
|
|
state,
|
||
|
|
scan_interval: Duration::from_secs(scan_interval_hours * 3600),
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Run the healing loop as a background task.
|
||
|
|
pub async fn run(&self, mut shutdown: tokio::sync::watch::Receiver<bool>) {
|
||
|
|
let mut interval = tokio::time::interval(self.scan_interval);
|
||
|
|
|
||
|
|
// Skip the first immediate tick
|
||
|
|
interval.tick().await;
|
||
|
|
|
||
|
|
loop {
|
||
|
|
tokio::select! {
|
||
|
|
_ = interval.tick() => {
|
||
|
|
tracing::info!("Starting healing scan");
|
||
|
|
match self.heal_scan().await {
|
||
|
|
Ok(stats) => {
|
||
|
|
tracing::info!(
|
||
|
|
checked = stats.shards_checked,
|
||
|
|
healed = stats.shards_healed,
|
||
|
|
errors = stats.errors,
|
||
|
|
"Healing scan completed"
|
||
|
|
);
|
||
|
|
}
|
||
|
|
Err(e) => {
|
||
|
|
tracing::error!("Healing scan failed: {}", e);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
_ = shutdown.changed() => {
|
||
|
|
tracing::info!("Healing service shutting down");
|
||
|
|
break;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Scan for offline nodes and identify objects that need healing.
|
||
|
|
async fn heal_scan(&self) -> Result<HealStats> {
|
||
|
|
let mut stats = HealStats::default();
|
||
|
|
|
||
|
|
let offline_nodes = self.state.offline_nodes().await;
|
||
|
|
if offline_nodes.is_empty() {
|
||
|
|
tracing::debug!("No offline nodes, skipping heal scan");
|
||
|
|
return Ok(stats);
|
||
|
|
}
|
||
|
|
|
||
|
|
tracing::info!(
|
||
|
|
"Found {} offline nodes, scanning for affected shards",
|
||
|
|
offline_nodes.len()
|
||
|
|
);
|
||
|
|
|
||
|
|
// Check that we have majority before healing
|
||
|
|
// (prevents healing during split-brain)
|
||
|
|
if !self.state.has_majority().await {
|
||
|
|
tracing::warn!("No majority quorum, skipping heal to prevent split-brain");
|
||
|
|
return Ok(stats);
|
||
|
|
}
|
||
|
|
|
||
|
|
// TODO: Iterate all manifests, find shards on offline nodes,
|
||
|
|
// reconstruct from remaining shards and place on healthy nodes.
|
||
|
|
// This requires access to the DistributedStore and manifest listing
|
||
|
|
// which will be wired in when the full healing pipeline is implemented.
|
||
|
|
|
||
|
|
Ok(stats)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Counters reported by a single healing scan.
///
/// All counters start at zero via `Default`. `Clone`, `PartialEq` and `Eq`
/// are derived so results can be copied and compared directly.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct HealStats {
    /// Shards examined during the scan (intended meaning; nothing in this
    /// file increments it yet).
    pub shards_checked: u64,
    /// Shards successfully rebuilt during the scan (intended meaning; nothing
    /// in this file increments it yet).
    pub shards_healed: u64,
    /// Failures encountered during the scan (intended meaning; nothing in
    /// this file increments it yet).
    pub errors: u64,
}
|