feat(cluster): add clustered storage backend with QUIC transport, erasure coding, and shard management

2026-03-21 21:50:42 +00:00
parent 4fcd05d3c6
commit d12d321079
25 changed files with 7472 additions and 3467 deletions
@@ -0,0 +1,92 @@
+use anyhow::Result;
+use std::sync::Arc;
+use std::time::Duration;
+
+use super::coordinator::DistributedStore;
+use super::state::ClusterState;
+
+/// Periodic repair worker for the clustered storage backend.
+///
+/// Each scan asks the shared [`ClusterState`] which nodes are offline and
+/// whether a majority is still present. Rebuilding the shards held by offline
+/// nodes is the goal of the service, but that step is still a TODO inside
+/// `heal_scan`.
+pub struct HealingService {
+    /// Time between two consecutive healing scans.
+    scan_interval: Duration,
+    /// Shared cluster view, queried for offline nodes and for quorum.
+    state: Arc<ClusterState>,
+}
+
+impl HealingService {
+    /// Creates a healing service that scans every `scan_interval_hours` hours.
+    ///
+    /// The hour count is converted to seconds with saturating arithmetic, so a
+    /// huge value yields a very long interval instead of overflowing (which
+    /// would panic in debug builds and silently wrap in release builds).
+    ///
+    /// Passing `0` produces a zero interval, which [`run`](Self::run) treats as
+    /// "periodic scanning disabled".
+    pub fn new(state: Arc<ClusterState>, scan_interval_hours: u64) -> Self {
+        const SECS_PER_HOUR: u64 = 3600;
+
+        let interval_secs = scan_interval_hours.saturating_mul(SECS_PER_HOUR);
+        Self {
+            state,
+            scan_interval: Duration::from_secs(interval_secs),
+        }
+    }
+
+    /// Runs the healing loop as a background task until shutdown is signalled.
+    ///
+    /// One scan is started per `scan_interval`; the first scan happens one full
+    /// interval after this method is called, not immediately. The loop ends as
+    /// soon as `shutdown` observes any change, or when the sending half of the
+    /// channel is dropped (`changed()` then resolves with an error). A scan
+    /// that is already running is not interrupted; shutdown is noticed once it
+    /// returns.
+    ///
+    /// If the configured interval is zero, no scans are scheduled and the
+    /// method only waits for shutdown.
+    pub async fn run(&self, mut shutdown: tokio::sync::watch::Receiver<bool>) {
+        // `tokio::time::interval` panics on a zero period, so a zero interval
+        // must never reach it. Keep the task alive so callers that join on it
+        // during shutdown behave the same as with a real interval.
+        if self.scan_interval.is_zero() {
+            tracing::warn!("Healing scan interval is zero, periodic healing scans are disabled");
+            let _ = shutdown.changed().await;
+            tracing::info!("Healing service shutting down");
+            return;
+        }
+
+        let mut interval = tokio::time::interval(self.scan_interval);
+        // After a scan overruns its slot, do not try to catch up on the
+        // original schedule (the default `Burst` behaviour); keep later scans
+        // a full interval apart instead.
+        interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
+
+        // Skip the first immediate tick
+        interval.tick().await;
+
+        loop {
+            tokio::select! {
+                _ = interval.tick() => {
+                    tracing::info!("Starting healing scan");
+                    match self.heal_scan().await {
+                        Ok(stats) => {
+                            tracing::info!(
+                                checked = stats.shards_checked,
+                                healed = stats.shards_healed,
+                                errors = stats.errors,
+                                "Healing scan completed"
+                            );
+                        }
+                        Err(e) => {
+                            // `{:#}` prints the whole anyhow cause chain, not
+                            // just the outermost message.
+                            tracing::error!("Healing scan failed: {:#}", e);
+                        }
+                    }
+                }
+                _ = shutdown.changed() => {
+                    tracing::info!("Healing service shutting down");
+                    break;
+                }
+            }
+        }
+    }
+
+    /// Performs one healing pass and returns its counters.
+    ///
+    /// Returns early with default (all-zero) stats when the cluster state
+    /// reports no offline nodes, or when `has_majority` is false. Shard
+    /// reconstruction is not implemented yet, so the returned stats are
+    /// currently always zero.
+    async fn heal_scan(&self) -> Result<HealStats> {
+        // Becomes `mut` once the pipeline below starts recording work.
+        let stats = HealStats::default();
+
+        let offline_nodes = self.state.offline_nodes().await;
+        if offline_nodes.is_empty() {
+            tracing::debug!("No offline nodes, skipping heal scan");
+            return Ok(stats);
+        }
+
+        tracing::info!(
+            "Found {} offline nodes, scanning for affected shards",
+            offline_nodes.len()
+        );
+
+        // Check that we have majority before healing
+        // (prevents healing during split-brain)
+        if !self.state.has_majority().await {
+            tracing::warn!("No majority quorum, skipping heal to prevent split-brain");
+            return Ok(stats);
+        }
+
+        // TODO: Iterate all manifests, find shards on offline nodes,
+        // reconstruct from remaining shards and place on healthy nodes.
+        // This requires access to the DistributedStore and manifest listing
+        // which will be wired in when the full healing pipeline is implemented.
+
+        Ok(stats)
+    }
+}
+
+/// Counters describing the outcome of a single healing scan.
+///
+/// `Default` yields all-zero counters.
+#[derive(Default, Debug)]
+pub struct HealStats {
+    /// Shards inspected during the scan.
+    pub shards_checked: u64,
+    /// Shards that were repaired during the scan.
+    pub shards_healed: u64,
+    /// Failures encountered during the scan.
+    pub errors: u64,
+}