// smartstorage/rust/src/cluster/healing.rs

use anyhow::Result;
use chrono::{DateTime, Utc};
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use tokio::fs;
use tokio::sync::RwLock;

use super::config::ErasureConfig;
use super::erasure::ErasureCoder;
use super::metadata::ObjectManifest;
use super::shard_store::{ShardId, ShardStore};
use super::state::ClusterState;

/// Background healing service that scans for under-replicated shards
/// and reconstructs them.
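///
/// # Example (sketch)
///
/// A minimal construction sketch; `cluster_state`, `erasure_config`,
/// `shard_stores`, and the manifest path are assumed to come from the
/// caller's cluster setup:
///
/// ```ignore
/// let runtime_state = Arc::new(RwLock::new(HealingRuntimeState::default()));
/// let service = HealingService::new(
///     cluster_state,
///     &erasure_config,
///     shard_stores,
///     PathBuf::from("/var/lib/smartstorage/manifests"),
///     6, // scan every 6 hours
///     runtime_state,
/// )?;
/// ```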
pub struct HealingService {
    state: Arc<ClusterState>,
    erasure_coder: ErasureCoder,
    local_shard_stores: Vec<Arc<ShardStore>>,
    manifest_dir: PathBuf,
    scan_interval: Duration,
    runtime_state: Arc<RwLock<HealingRuntimeState>>,
}

impl HealingService {
    pub fn new(
        state: Arc<ClusterState>,
        erasure_config: &ErasureConfig,
        local_shard_stores: Vec<Arc<ShardStore>>,
        manifest_dir: PathBuf,
        scan_interval_hours: u64,
        runtime_state: Arc<RwLock<HealingRuntimeState>>,
    ) -> Result<Self> {
        // Clamp to at least one hour: tokio::time::interval panics if it is
        // given a zero period.
        let scan_interval = Duration::from_secs(scan_interval_hours.max(1) * 3600);
        // Best-effort seed of the shared runtime state; skipped if the lock
        // is currently held elsewhere.
        if let Ok(mut state_guard) = runtime_state.try_write() {
            state_guard.scan_interval_ms = scan_interval.as_millis() as u64;
        }
        Ok(Self {
            state,
            erasure_coder: ErasureCoder::new(erasure_config)?,
            local_shard_stores,
            manifest_dir,
            scan_interval,
            runtime_state,
        })
    }

    pub fn runtime_state(&self) -> Arc<RwLock<HealingRuntimeState>> {
        self.runtime_state.clone()
    }

    /// Run the healing loop as a background task.
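    ///
    /// A sketch of spawning the loop with a shutdown signal; the channel and
    /// variable names here are illustrative:
    ///
    /// ```ignore
    /// let (shutdown_tx, shutdown_rx) = tokio::sync::watch::channel(false);
    /// tokio::spawn(async move { service.run(shutdown_rx).await });
    /// // ... later, during shutdown:
    /// let _ = shutdown_tx.send(true);
    /// ```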
    pub async fn run(&self, mut shutdown: tokio::sync::watch::Receiver<bool>) {
        let mut interval = tokio::time::interval(self.scan_interval);
        // The first tick of a tokio interval completes immediately; consume
        // it so the first scan runs one full interval after startup.
        interval.tick().await;
        loop {
            tokio::select! {
                _ = interval.tick() => {
                    let started_at = Utc::now();
                    self.mark_healing_started(started_at).await;
                    tracing::info!("Starting healing scan");
                    match self.heal_scan().await {
                        Ok(stats) => {
                            self.mark_healing_finished(started_at, Some(stats.clone()), None).await;
                            tracing::info!(
                                checked = stats.shards_checked,
                                healed = stats.shards_healed,
                                errors = stats.errors,
                                "Healing scan completed"
                            );
                        }
                        Err(e) => {
                            self.mark_healing_finished(started_at, None, Some(e.to_string())).await;
                            tracing::error!("Healing scan failed: {}", e);
                        }
                    }
                }
                _ = shutdown.changed() => {
                    tracing::info!("Healing service shutting down");
                    break;
                }
            }
        }
    }

    async fn mark_healing_started(&self, started_at: DateTime<Utc>) {
        let mut runtime_state = self.runtime_state.write().await;
        runtime_state.active = true;
        runtime_state.scan_interval_ms = self.scan_interval.as_millis() as u64;
        runtime_state.last_run_started_at = Some(started_at);
        runtime_state.last_error = None;
    }

    async fn mark_healing_finished(
        &self,
        started_at: DateTime<Utc>,
        stats: Option<HealStats>,
        last_error: Option<String>,
    ) {
        let finished_at = Utc::now();
        let mut runtime_state = self.runtime_state.write().await;
        runtime_state.active = false;
        runtime_state.scan_interval_ms = self.scan_interval.as_millis() as u64;
        runtime_state.last_run_completed_at = Some(finished_at);
        runtime_state.last_duration_ms = Some(
            finished_at
                .signed_duration_since(started_at)
                .num_milliseconds()
                .max(0) as u64,
        );
        if let Some(stats) = stats {
            runtime_state.last_stats = Some(stats);
        }
        runtime_state.last_error = last_error;
    }

    /// Scan all manifests for shards on offline nodes, reconstruct and re-place them.
    async fn heal_scan(&self) -> Result<HealStats> {
        let mut stats = HealStats::default();
        let offline_nodes = self.state.offline_nodes().await;
        if offline_nodes.is_empty() {
            tracing::debug!("No offline nodes, skipping heal scan");
            return Ok(stats);
        }
        // Check that we have majority before healing (split-brain prevention)
        if !self.state.has_majority().await {
            tracing::warn!("No majority quorum, skipping heal to prevent split-brain");
            return Ok(stats);
        }
        tracing::info!(
            "Found {} offline nodes, scanning for affected shards",
            offline_nodes.len()
        );
        // Iterate all bucket directories under manifest_dir
        let mut bucket_entries = match fs::read_dir(&self.manifest_dir).await {
            Ok(e) => e,
            Err(e) => {
                tracing::warn!(error = %e, "Failed to read manifest directory, skipping scan");
                return Ok(stats);
            }
        };
        while let Some(bucket_entry) = bucket_entries.next_entry().await? {
            if !bucket_entry.metadata().await?.is_dir() {
                continue;
            }
            let bucket_name = bucket_entry.file_name().to_string_lossy().to_string();
            if bucket_name.starts_with('.') {
                continue;
            }
            // Scan manifests in this bucket
            self.heal_bucket(&bucket_name, &offline_nodes, &mut stats)
                .await;
            // Yield to avoid starving foreground I/O
            tokio::task::yield_now().await;
        }
        Ok(stats)
    }

    async fn heal_bucket(
        &self,
        bucket: &str,
        offline_nodes: &[String],
        stats: &mut HealStats,
    ) {
        let bucket_dir = self.manifest_dir.join(bucket);
        let manifests = match self.collect_manifests(&bucket_dir).await {
            Ok(m) => m,
            Err(e) => {
                tracing::warn!(bucket = bucket, error = %e, "Failed to list manifests");
                stats.errors += 1;
                return;
            }
        };
        let local_id = self.state.local_node_id().to_string();
        for manifest in &manifests {
            for chunk in &manifest.chunks {
                // Check if any shard in this chunk is on an offline node
                let affected: Vec<_> = chunk
                    .shard_placements
                    .iter()
                    .filter(|p| offline_nodes.contains(&p.node_id))
                    .collect();
                if affected.is_empty() {
                    continue;
                }
                stats.shards_checked += chunk.shard_placements.len() as u64;
                // Try to reconstruct missing shards from available ones
                let k = manifest.data_shards;
                let total = manifest.data_shards + manifest.parity_shards;
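                // With an MDS erasure code (e.g. Reed-Solomon) any `k` of
                // the `total` shards are enough to reconstruct the chunk, so
                // the check below requires at least `k` reachable shards.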
                // Count available shards (those NOT on offline nodes)
                let available_count = chunk
                    .shard_placements
                    .iter()
                    .filter(|p| !offline_nodes.contains(&p.node_id))
                    .count();
                if available_count < k {
                    tracing::error!(
                        bucket = manifest.bucket,
                        key = manifest.key,
                        chunk = chunk.chunk_index,
                        available = available_count,
                        needed = k,
                        "Cannot heal chunk: not enough available shards"
                    );
                    stats.errors += 1;
                    continue;
                }
                // Fetch available shards (only local ones for now)
                let mut shards: Vec<Option<Vec<u8>>> = vec![None; total];
                let mut fetched = 0usize;
                for placement in &chunk.shard_placements {
                    if offline_nodes.contains(&placement.node_id) {
                        continue; // Skip offline nodes
                    }
                    if fetched >= k {
                        break;
                    }
                    if placement.node_id == local_id {
                        let shard_id = ShardId {
                            bucket: manifest.bucket.clone(),
                            key: manifest.key.clone(),
                            chunk_index: chunk.chunk_index,
                            shard_index: placement.shard_index,
                        };
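                        // drive_id is expected to be a numeric index into
                        // local_shard_stores; fall back to the first drive
                        // if it does not parse.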
                        let store_idx = placement.drive_id.parse::<usize>().unwrap_or(0);
                        if let Some(store) = self.local_shard_stores.get(store_idx) {
                            if let Ok((data, _)) = store.read_shard(&shard_id).await {
                                shards[placement.shard_index as usize] = Some(data);
                                fetched += 1;
                            }
                        }
                    }
                    // TODO: fetch from other online remote nodes
                }
                if fetched < k {
                    tracing::warn!(
                        bucket = manifest.bucket,
                        key = manifest.key,
                        chunk = chunk.chunk_index,
                        "Not enough local shards to heal, skipping"
                    );
                    continue;
                }
                // Reconstruct all shards
                let reconstructed = match self.erasure_coder.decode_chunk(
                    &mut shards,
                    chunk.data_size,
                ) {
                    Ok(_) => true,
                    Err(e) => {
                        tracing::error!(
                            bucket = manifest.bucket,
                            key = manifest.key,
                            chunk = chunk.chunk_index,
                            error = %e,
                            "Reconstruction failed"
                        );
                        stats.errors += 1;
                        false
                    }
                };
                if !reconstructed {
                    continue;
                }
                // Re-encode the recovered data to regenerate the full shard
                // set: decoding restores the data shards, and re-encoding
                // also rebuilds any missing parity shards. After a successful
                // decode every data shard in 0..k is Some, so concatenating
                // them (and truncating the padding) yields the original chunk.
                let full_data_size = chunk.data_size;
                let mut data_buf = Vec::with_capacity(full_data_size);
                for i in 0..k {
                    if let Some(ref shard) = shards[i] {
                        data_buf.extend_from_slice(shard);
                    }
                }
                data_buf.truncate(full_data_size);
                let all_shards = match self.erasure_coder.encode_chunk(&data_buf) {
                    Ok(s) => s,
                    Err(e) => {
                        tracing::error!(error = %e, "Re-encoding for heal failed");
                        stats.errors += 1;
                        continue;
                    }
                };
                // Verify reconstructed shards are consistent
                if !self.erasure_coder.verify(&all_shards).unwrap_or(false) {
                    tracing::error!(
                        bucket = manifest.bucket,
                        key = manifest.key,
                        chunk = chunk.chunk_index,
                        "Shard verification failed after reconstruction"
                    );
                    stats.errors += 1;
                    continue;
                }
                // Write the missing shards to the first available local drive
                for affected_placement in &affected {
                    let shard_idx = affected_placement.shard_index as usize;
                    if shard_idx < all_shards.len() {
                        let shard_data = &all_shards[shard_idx];
                        let checksum = crc32c::crc32c(shard_data);
                        let shard_id = ShardId {
                            bucket: manifest.bucket.clone(),
                            key: manifest.key.clone(),
                            chunk_index: chunk.chunk_index,
                            shard_index: affected_placement.shard_index,
                        };
                        // Place on first available local drive
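                        // Note: the manifest's shard_placements are not
                        // rewritten here, so the manifest still records the
                        // offline node for this shard.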
                        if let Some(store) = self.local_shard_stores.first() {
                            match store.write_shard(&shard_id, shard_data, checksum).await {
                                Ok(()) => {
                                    stats.shards_healed += 1;
                                    tracing::info!(
                                        bucket = manifest.bucket,
                                        key = manifest.key,
                                        chunk = chunk.chunk_index,
                                        shard = affected_placement.shard_index,
                                        "Shard healed successfully"
                                    );
                                }
                                Err(e) => {
                                    tracing::error!(error = %e, "Failed to write healed shard");
                                    stats.errors += 1;
                                }
                            }
                        }
                    }
                }
                tokio::task::yield_now().await;
            }
        }
    }

    /// Collect all manifests under a bucket directory.
    async fn collect_manifests(&self, dir: &std::path::Path) -> Result<Vec<ObjectManifest>> {
        let mut manifests = Vec::new();
        self.collect_manifests_recursive(dir, &mut manifests).await?;
        Ok(manifests)
    }

    // Boxed recursive async: async fns cannot call themselves directly, so
    // the future is pinned and boxed by hand.
    fn collect_manifests_recursive<'a>(
        &'a self,
        dir: &'a std::path::Path,
        manifests: &'a mut Vec<ObjectManifest>,
    ) -> std::pin::Pin<Box<dyn std::future::Future<Output = Result<()>> + Send + 'a>> {
        Box::pin(async move {
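            // Unreadable directories are skipped rather than failing the
            // whole scan.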
            let mut entries = match fs::read_dir(dir).await {
                Ok(e) => e,
                Err(_) => return Ok(()),
            };
            while let Some(entry) = entries.next_entry().await? {
                let meta = entry.metadata().await?;
                let name = entry.file_name().to_string_lossy().to_string();
                if meta.is_dir() {
                    self.collect_manifests_recursive(&entry.path(), manifests)
                        .await?;
                } else if name.ends_with(".manifest.json") {
                    if let Ok(content) = fs::read_to_string(entry.path()).await {
                        if let Ok(manifest) = serde_json::from_str::<ObjectManifest>(&content) {
                            manifests.push(manifest);
                        }
                    }
                }
            }
            Ok(())
        })
    }
}

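/// Counters accumulated over a single healing scan.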
#[derive(Debug, Clone, Default)]
pub struct HealStats {
    pub shards_checked: u64,
    pub shards_healed: u64,
    pub errors: u64,
}

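/// Runtime status of the healing service, shared with observers (e.g. an
/// admin or metrics endpoint) through an `RwLock`.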
#[derive(Debug, Clone, Default)]
pub struct HealingRuntimeState {
    pub active: bool,
    pub scan_interval_ms: u64,
    pub last_run_started_at: Option<DateTime<Utc>>,
    pub last_run_completed_at: Option<DateTime<Utc>>,
    pub last_duration_ms: Option<u64>,
    pub last_stats: Option<HealStats>,
    pub last_error: Option<String>,
}