feat: enhance storage stats and cluster health reporting
- Introduced new data structures for bucket and storage statistics, including BucketSummary, StorageStats, and ClusterHealth.
- Implemented runtime statistics tracking for buckets, including object count and total size.
- Added methods to retrieve storage stats and bucket summaries in the FileStore.
- Enhanced the SmartStorage interface to expose storage stats and cluster health.
- Implemented tests for runtime stats, cluster health, and credential management.
- Added support for runtime-managed credentials with atomic replacement.
- Improved filesystem usage reporting for storage locations.
This commit is contained in:
@@ -1,8 +1,10 @@
|
||||
use anyhow::Result;
|
||||
use chrono::{DateTime, Utc};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tokio::fs;
|
||||
use tokio::sync::RwLock;
|
||||
|
||||
use super::config::ErasureConfig;
|
||||
use super::erasure::ErasureCoder;
|
||||
@@ -18,6 +20,7 @@ pub struct HealingService {
|
||||
local_shard_stores: Vec<Arc<ShardStore>>,
|
||||
manifest_dir: PathBuf,
|
||||
scan_interval: Duration,
|
||||
runtime_state: Arc<RwLock<HealingRuntimeState>>,
|
||||
}
|
||||
|
||||
impl HealingService {
|
||||
@@ -27,16 +30,27 @@ impl HealingService {
|
||||
local_shard_stores: Vec<Arc<ShardStore>>,
|
||||
manifest_dir: PathBuf,
|
||||
scan_interval_hours: u64,
|
||||
runtime_state: Arc<RwLock<HealingRuntimeState>>,
|
||||
) -> Result<Self> {
|
||||
let scan_interval = Duration::from_secs(scan_interval_hours * 3600);
|
||||
if let Ok(mut state_guard) = runtime_state.try_write() {
|
||||
state_guard.scan_interval_ms = scan_interval.as_millis() as u64;
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
state,
|
||||
erasure_coder: ErasureCoder::new(erasure_config)?,
|
||||
local_shard_stores,
|
||||
manifest_dir,
|
||||
scan_interval: Duration::from_secs(scan_interval_hours * 3600),
|
||||
scan_interval,
|
||||
runtime_state,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns a new handle to the shared healing runtime state.
///
/// Only the `Arc` is cloned; the returned handle points at the same
/// `RwLock<HealingRuntimeState>` that this service updates.
pub fn runtime_state(&self) -> Arc<RwLock<HealingRuntimeState>> {
    Arc::clone(&self.runtime_state)
}
|
||||
|
||||
/// Run the healing loop as a background task.
|
||||
pub async fn run(&self, mut shutdown: tokio::sync::watch::Receiver<bool>) {
|
||||
let mut interval = tokio::time::interval(self.scan_interval);
|
||||
@@ -47,9 +61,12 @@ impl HealingService {
|
||||
loop {
|
||||
tokio::select! {
|
||||
_ = interval.tick() => {
|
||||
let started_at = Utc::now();
|
||||
self.mark_healing_started(started_at).await;
|
||||
tracing::info!("Starting healing scan");
|
||||
match self.heal_scan().await {
|
||||
Ok(stats) => {
|
||||
self.mark_healing_finished(started_at, Some(stats.clone()), None).await;
|
||||
tracing::info!(
|
||||
checked = stats.shards_checked,
|
||||
healed = stats.shards_healed,
|
||||
@@ -58,6 +75,7 @@ impl HealingService {
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
self.mark_healing_finished(started_at, None, Some(e.to_string())).await;
|
||||
tracing::error!("Healing scan failed: {}", e);
|
||||
}
|
||||
}
|
||||
@@ -70,6 +88,37 @@ impl HealingService {
|
||||
}
|
||||
}
|
||||
|
||||
/// Records in the shared runtime state that a healing scan has begun.
///
/// Under a single write lock this marks the service as active, refreshes
/// the published scan interval (in milliseconds), stores `started_at` as
/// the start of the latest run, and clears any previously recorded error.
async fn mark_healing_started(&self, started_at: DateTime<Utc>) {
    // Pure computation; done up front so the lock only covers the field writes.
    let interval_ms = self.scan_interval.as_millis() as u64;

    let mut guard = self.runtime_state.write().await;
    guard.last_error = None;
    guard.last_run_started_at = Some(started_at);
    guard.scan_interval_ms = interval_ms;
    guard.active = true;
}
|
||||
|
||||
async fn mark_healing_finished(
|
||||
&self,
|
||||
started_at: DateTime<Utc>,
|
||||
stats: Option<HealStats>,
|
||||
last_error: Option<String>,
|
||||
) {
|
||||
let finished_at = Utc::now();
|
||||
let mut runtime_state = self.runtime_state.write().await;
|
||||
runtime_state.active = false;
|
||||
runtime_state.scan_interval_ms = self.scan_interval.as_millis() as u64;
|
||||
runtime_state.last_run_completed_at = Some(finished_at);
|
||||
runtime_state.last_duration_ms = Some(
|
||||
finished_at
|
||||
.signed_duration_since(started_at)
|
||||
.num_milliseconds()
|
||||
.max(0) as u64,
|
||||
);
|
||||
if let Some(stats) = stats {
|
||||
runtime_state.last_stats = Some(stats);
|
||||
}
|
||||
runtime_state.last_error = last_error;
|
||||
}
|
||||
|
||||
/// Scan all manifests for shards on offline nodes, reconstruct and re-place them.
|
||||
async fn heal_scan(&self) -> Result<HealStats> {
|
||||
let mut stats = HealStats::default();
|
||||
@@ -348,9 +397,20 @@ impl HealingService {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
/// Counters produced by a single healing scan.
///
/// All counters start at zero via `Default`.
#[derive(Clone, Debug, Default)]
pub struct HealStats {
    /// Number of shards checked (logged as `checked` by the healing loop).
    pub shards_checked: u64,
    /// Number of shards healed (logged as `healed` by the healing loop).
    pub shards_healed: u64,
    /// Error count for the scan — presumably per-shard failures; confirm
    /// against `heal_scan`.
    pub errors: u64,
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct HealingRuntimeState {
|
||||
pub active: bool,
|
||||
pub scan_interval_ms: u64,
|
||||
pub last_run_started_at: Option<DateTime<Utc>>,
|
||||
pub last_run_completed_at: Option<DateTime<Utc>>,
|
||||
pub last_duration_ms: Option<u64>,
|
||||
pub last_stats: Option<HealStats>,
|
||||
pub last_error: Option<String>,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user