feat(cluster): add clustered storage backend with QUIC transport, erasure coding, and shard management
This commit is contained in:
@@ -0,0 +1,92 @@
|
||||
use anyhow::Result;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use super::coordinator::DistributedStore;
|
||||
use super::state::ClusterState;
|
||||
|
||||
/// Background healing service that scans for under-replicated shards
|
||||
/// and reconstructs them.
|
||||
pub struct HealingService {
|
||||
state: Arc<ClusterState>,
|
||||
scan_interval: Duration,
|
||||
}
|
||||
|
||||
impl HealingService {
|
||||
pub fn new(state: Arc<ClusterState>, scan_interval_hours: u64) -> Self {
|
||||
Self {
|
||||
state,
|
||||
scan_interval: Duration::from_secs(scan_interval_hours * 3600),
|
||||
}
|
||||
}
|
||||
|
||||
/// Run the healing loop as a background task.
|
||||
pub async fn run(&self, mut shutdown: tokio::sync::watch::Receiver<bool>) {
|
||||
let mut interval = tokio::time::interval(self.scan_interval);
|
||||
|
||||
// Skip the first immediate tick
|
||||
interval.tick().await;
|
||||
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = interval.tick() => {
|
||||
tracing::info!("Starting healing scan");
|
||||
match self.heal_scan().await {
|
||||
Ok(stats) => {
|
||||
tracing::info!(
|
||||
checked = stats.shards_checked,
|
||||
healed = stats.shards_healed,
|
||||
errors = stats.errors,
|
||||
"Healing scan completed"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::error!("Healing scan failed: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
_ = shutdown.changed() => {
|
||||
tracing::info!("Healing service shutting down");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Scan for offline nodes and identify objects that need healing.
|
||||
async fn heal_scan(&self) -> Result<HealStats> {
|
||||
let mut stats = HealStats::default();
|
||||
|
||||
let offline_nodes = self.state.offline_nodes().await;
|
||||
if offline_nodes.is_empty() {
|
||||
tracing::debug!("No offline nodes, skipping heal scan");
|
||||
return Ok(stats);
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
"Found {} offline nodes, scanning for affected shards",
|
||||
offline_nodes.len()
|
||||
);
|
||||
|
||||
// Check that we have majority before healing
|
||||
// (prevents healing during split-brain)
|
||||
if !self.state.has_majority().await {
|
||||
tracing::warn!("No majority quorum, skipping heal to prevent split-brain");
|
||||
return Ok(stats);
|
||||
}
|
||||
|
||||
// TODO: Iterate all manifests, find shards on offline nodes,
|
||||
// reconstruct from remaining shards and place on healthy nodes.
|
||||
// This requires access to the DistributedStore and manifest listing
|
||||
// which will be wired in when the full healing pipeline is implemented.
|
||||
|
||||
Ok(stats)
|
||||
}
|
||||
}
|
||||
|
||||
/// Counters summarizing the outcome of a single healing scan.
///
/// All counters start at zero via `Default`.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct HealStats {
    /// Number of shards examined during the scan.
    pub shards_checked: u64,
    /// Number of shards that were healed.
    pub shards_healed: u64,
    /// Number of errors encountered during the scan.
    pub errors: u64,
}
|
||||
Reference in New Issue
Block a user