fix(cluster): improve shard reconstruction validation and start background healing service
This commit is contained in:
+23
-4
@@ -24,8 +24,8 @@ use crate::config::SmartStorageConfig;
|
||||
use crate::policy::{self, PolicyDecision, PolicyStore};
|
||||
use crate::error::StorageError;
|
||||
use crate::cluster::coordinator::DistributedStore;
|
||||
use crate::cluster::config::ErasureConfig;
|
||||
use crate::cluster::drive_manager::DriveManager;
|
||||
use crate::cluster::healing::HealingService;
|
||||
use crate::cluster::membership::MembershipManager;
|
||||
use crate::cluster::placement;
|
||||
use crate::cluster::protocol::NodeInfo;
|
||||
@@ -237,9 +237,15 @@ impl StorageServer {
|
||||
.join_cluster(&cluster_config.seed_nodes)
|
||||
.await?;
|
||||
|
||||
// Build local shard stores (one per drive) for shared use
|
||||
let local_shard_stores: Vec<Arc<ShardStore>> = drive_paths
|
||||
.iter()
|
||||
.map(|p| Arc::new(ShardStore::new(p.clone())))
|
||||
.collect();
|
||||
|
||||
// Start QUIC accept loop for incoming connections
|
||||
let shard_store_for_accept = Arc::new(ShardStore::new(drive_paths[0].clone()));
|
||||
let (quic_shutdown_tx, quic_shutdown_rx) = watch::channel(false);
|
||||
let shard_store_for_accept = local_shard_stores[0].clone();
|
||||
let (_quic_shutdown_tx, quic_shutdown_rx) = watch::channel(false);
|
||||
let transport_clone = transport.clone();
|
||||
tokio::spawn(async move {
|
||||
transport_clone
|
||||
@@ -249,11 +255,24 @@ impl StorageServer {
|
||||
|
||||
// Start heartbeat loop
|
||||
let membership_clone = membership.clone();
|
||||
let (hb_shutdown_tx, hb_shutdown_rx) = watch::channel(false);
|
||||
let (_hb_shutdown_tx, hb_shutdown_rx) = watch::channel(false);
|
||||
tokio::spawn(async move {
|
||||
membership_clone.heartbeat_loop(hb_shutdown_rx).await;
|
||||
});
|
||||
|
||||
// Start healing service
|
||||
let healing_service = HealingService::new(
|
||||
cluster_state.clone(),
|
||||
&erasure_config,
|
||||
local_shard_stores.clone(),
|
||||
manifest_dir.clone(),
|
||||
24, // scan every 24 hours
|
||||
)?;
|
||||
let (_heal_shutdown_tx, heal_shutdown_rx) = watch::channel(false);
|
||||
tokio::spawn(async move {
|
||||
healing_service.run(heal_shutdown_rx).await;
|
||||
});
|
||||
|
||||
// Create distributed store
|
||||
let distributed_store = DistributedStore::new(
|
||||
cluster_state,
|
||||
|
||||
Reference in New Issue
Block a user