feat: enhance storage stats and cluster health reporting

- Introduced new data structures for bucket and storage statistics, including BucketSummary, StorageStats, and ClusterHealth.
- Implemented runtime statistics tracking for buckets, including object count and total size.
- Added methods to retrieve storage stats and bucket summaries in the FileStore.
- Enhanced the SmartStorage interface to expose storage stats and cluster health.
- Implemented tests for runtime stats, cluster health, and credential management.
- Added support for runtime-managed credentials with atomic replacement.
- Improved filesystem usage reporting for storage locations.
This commit is contained in:
2026-04-19 11:57:28 +00:00
parent c683b02e8c
commit 0e9862efca
16 changed files with 1803 additions and 85 deletions
+62 -2
View File
@@ -1,8 +1,10 @@
use anyhow::Result;
use chrono::{DateTime, Utc};
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use tokio::fs;
use tokio::sync::RwLock;
use super::config::ErasureConfig;
use super::erasure::ErasureCoder;
@@ -18,6 +20,7 @@ pub struct HealingService {
local_shard_stores: Vec<Arc<ShardStore>>,
manifest_dir: PathBuf,
scan_interval: Duration,
runtime_state: Arc<RwLock<HealingRuntimeState>>,
}
impl HealingService {
@@ -27,16 +30,27 @@ impl HealingService {
local_shard_stores: Vec<Arc<ShardStore>>,
manifest_dir: PathBuf,
scan_interval_hours: u64,
runtime_state: Arc<RwLock<HealingRuntimeState>>,
) -> Result<Self> {
let scan_interval = Duration::from_secs(scan_interval_hours * 3600);
if let Ok(mut state_guard) = runtime_state.try_write() {
state_guard.scan_interval_ms = scan_interval.as_millis() as u64;
}
Ok(Self {
state,
erasure_coder: ErasureCoder::new(erasure_config)?,
local_shard_stores,
manifest_dir,
scan_interval: Duration::from_secs(scan_interval_hours * 3600),
scan_interval,
runtime_state,
})
}
pub fn runtime_state(&self) -> Arc<RwLock<HealingRuntimeState>> {
self.runtime_state.clone()
}
/// Run the healing loop as a background task.
pub async fn run(&self, mut shutdown: tokio::sync::watch::Receiver<bool>) {
let mut interval = tokio::time::interval(self.scan_interval);
@@ -47,9 +61,12 @@ impl HealingService {
loop {
tokio::select! {
_ = interval.tick() => {
let started_at = Utc::now();
self.mark_healing_started(started_at).await;
tracing::info!("Starting healing scan");
match self.heal_scan().await {
Ok(stats) => {
self.mark_healing_finished(started_at, Some(stats.clone()), None).await;
tracing::info!(
checked = stats.shards_checked,
healed = stats.shards_healed,
@@ -58,6 +75,7 @@ impl HealingService {
);
}
Err(e) => {
self.mark_healing_finished(started_at, None, Some(e.to_string())).await;
tracing::error!("Healing scan failed: {}", e);
}
}
@@ -70,6 +88,37 @@ impl HealingService {
}
}
async fn mark_healing_started(&self, started_at: DateTime<Utc>) {
let mut runtime_state = self.runtime_state.write().await;
runtime_state.active = true;
runtime_state.scan_interval_ms = self.scan_interval.as_millis() as u64;
runtime_state.last_run_started_at = Some(started_at);
runtime_state.last_error = None;
}
async fn mark_healing_finished(
&self,
started_at: DateTime<Utc>,
stats: Option<HealStats>,
last_error: Option<String>,
) {
let finished_at = Utc::now();
let mut runtime_state = self.runtime_state.write().await;
runtime_state.active = false;
runtime_state.scan_interval_ms = self.scan_interval.as_millis() as u64;
runtime_state.last_run_completed_at = Some(finished_at);
runtime_state.last_duration_ms = Some(
finished_at
.signed_duration_since(started_at)
.num_milliseconds()
.max(0) as u64,
);
if let Some(stats) = stats {
runtime_state.last_stats = Some(stats);
}
runtime_state.last_error = last_error;
}
/// Scan all manifests for shards on offline nodes, reconstruct and re-place them.
async fn heal_scan(&self) -> Result<HealStats> {
let mut stats = HealStats::default();
@@ -348,9 +397,20 @@ impl HealingService {
}
}
#[derive(Debug, Default)]
#[derive(Debug, Clone, Default)]
pub struct HealStats {
pub shards_checked: u64,
pub shards_healed: u64,
pub errors: u64,
}
#[derive(Debug, Clone, Default)]
pub struct HealingRuntimeState {
pub active: bool,
pub scan_interval_ms: u64,
pub last_run_started_at: Option<DateTime<Utc>>,
pub last_run_completed_at: Option<DateTime<Utc>>,
pub last_duration_ms: Option<u64>,
pub last_stats: Option<HealStats>,
pub last_error: Option<String>,
}