feat(cluster): add shard healing, drive health heartbeats, and clustered policy directory support

This commit is contained in:
2026-03-21 22:00:41 +00:00
parent 639eb5d36c
commit 3eb0045676
7 changed files with 780 additions and 70 deletions

View File

@@ -3,8 +3,12 @@ use std::net::SocketAddr;
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::Mutex;
use super::drive_manager::{DriveManager, DriveStatus};
use super::protocol::{
ClusterRequest, ClusterResponse, HeartbeatMessage, JoinRequestMessage, NodeInfo,
ClusterRequest, ClusterResponse, DriveStateInfo, HeartbeatMessage, JoinRequestMessage,
NodeInfo,
};
use super::quic_transport::QuicTransport;
use super::state::ClusterState;
@@ -15,6 +19,7 @@ pub struct MembershipManager {
transport: Arc<QuicTransport>,
heartbeat_interval: Duration,
local_node_info: NodeInfo,
drive_manager: Option<Arc<Mutex<DriveManager>>>,
}
impl MembershipManager {
@@ -29,9 +34,16 @@ impl MembershipManager {
transport,
heartbeat_interval: Duration::from_millis(heartbeat_interval_ms),
local_node_info,
drive_manager: None,
}
}
/// Set the drive manager for health reporting in heartbeats.
pub fn with_drive_manager(mut self, dm: Arc<Mutex<DriveManager>>) -> Self {
self.drive_manager = Some(dm);
self
}
/// Join the cluster by contacting seed nodes.
/// Sends a JoinRequest to each seed node until one accepts.
pub async fn join_cluster(&self, seed_nodes: &[String]) -> Result<()> {
@@ -129,6 +141,9 @@ impl MembershipManager {
let topology_version = self.state.version().await;
let mut responded = Vec::new();
// Collect drive health states
let drive_states = self.collect_drive_states().await;
for peer in &peers {
let addr: SocketAddr = match peer.quic_addr.parse() {
Ok(a) => a,
@@ -138,7 +153,7 @@ impl MembershipManager {
let heartbeat = ClusterRequest::Heartbeat(HeartbeatMessage {
node_id: self.local_node_info.node_id.clone(),
timestamp: chrono::Utc::now().to_rfc3339(),
drive_states: Vec::new(), // TODO: populate from DriveManager
drive_states: drive_states.clone(),
topology_version,
});
@@ -181,4 +196,31 @@ impl MembershipManager {
let _response = self.transport.send_request(&conn, heartbeat).await?;
Ok(())
}
/// Collect drive health states from the DriveManager, if available.
async fn collect_drive_states(&self) -> Vec<DriveStateInfo> {
let dm = match &self.drive_manager {
Some(dm) => dm,
None => return Vec::new(),
};
let mut manager = dm.lock().await;
let results = manager.check_all_drives().await;
results
.into_iter()
.map(|(idx, status)| {
let status_str = match status {
DriveStatus::Online => "online",
DriveStatus::Degraded => "degraded",
DriveStatus::Offline => "offline",
DriveStatus::Healing => "healing",
};
DriveStateInfo {
drive_index: idx as u32,
status: status_str.to_string(),
}
})
.collect()
}
}