feat(cluster,server,auth): add operational health endpoints, persist cluster topology, and hide credential secrets from runtime listings

This commit is contained in:
2026-04-30 06:08:42 +00:00
parent c2b40ee240
commit a31e477359
16 changed files with 1120 additions and 123 deletions
+31 -14
View File
@@ -18,6 +18,7 @@ pub struct MembershipManager {
state: Arc<ClusterState>,
transport: Arc<QuicTransport>,
heartbeat_interval: Duration,
heartbeat_timeout: Duration,
local_node_info: NodeInfo,
drive_manager: Option<Arc<Mutex<DriveManager>>>,
}
@@ -27,12 +28,14 @@ impl MembershipManager {
state: Arc<ClusterState>,
transport: Arc<QuicTransport>,
heartbeat_interval_ms: u64,
heartbeat_timeout_ms: u64,
local_node_info: NodeInfo,
) -> Self {
Self {
state,
transport,
heartbeat_interval: Duration::from_millis(heartbeat_interval_ms),
heartbeat_timeout: Duration::from_millis(heartbeat_timeout_ms),
local_node_info,
drive_manager: None,
}
@@ -46,7 +49,7 @@ impl MembershipManager {
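/// Join the cluster by contacting seed nodes.
/// Sends a JoinRequest to each seed node until one accepts.
/// If no configured seed can be reached, a new cluster is bootstrapped only when
/// `allow_bootstrap_on_failure` is true; otherwise an error is returned.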
pub async fn join_cluster(&self, seed_nodes: &[String]) -> Result<()> {
pub async fn join_cluster(&self, seed_nodes: &[String], allow_bootstrap_on_failure: bool) -> Result<()> {
if seed_nodes.is_empty() {
tracing::info!("No seed nodes configured, starting as initial cluster node");
self.state.add_node(self.local_node_info.clone()).await;
@@ -75,10 +78,13 @@ impl MembershipManager {
}
}
// If no seed responded, bootstrap a new cluster only when explicitly allowed; otherwise fail
tracing::info!("Could not reach any seed nodes, starting as initial cluster node");
self.state.add_node(self.local_node_info.clone()).await;
Ok(())
if allow_bootstrap_on_failure {
tracing::warn!("Could not reach any seed nodes, bootstrapping a new cluster because no persisted topology exists");
self.state.add_node(self.local_node_info.clone()).await;
return Ok(());
}
anyhow::bail!("Could not reach any configured seed nodes; refusing unsafe cluster bootstrap")
}
async fn try_join(&self, addr: SocketAddr) -> Result<()> {
@@ -97,9 +103,14 @@ impl MembershipManager {
ClusterResponse::JoinResponse(join_resp) => {
if join_resp.accepted {
if let Some(topology) = &join_resp.topology {
let topology_contains_self = topology
.nodes
.iter()
.any(|node| node.node_id == self.local_node_info.node_id);
self.state.apply_topology(topology).await;
// Also register self when the received topology does not already list this node
self.state.add_node(self.local_node_info.clone()).await;
if !topology_contains_self {
self.state.add_node(self.local_node_info.clone()).await;
}
tracing::info!(
"Applied cluster topology (version {}, {} nodes, {} erasure sets)",
topology.version,
@@ -137,7 +148,13 @@ impl MembershipManager {
}
async fn send_heartbeats(&self) {
let peers = self.state.online_peers().await;
let peers = self
.state
.all_nodes()
.await
.into_iter()
.filter(|node| node.info.node_id != self.local_node_info.node_id)
.collect::<Vec<_>>();
let topology_version = self.state.version().await;
let mut responded = Vec::new();
@@ -145,7 +162,7 @@ impl MembershipManager {
let drive_states = self.collect_drive_states().await;
for peer in &peers {
let addr: SocketAddr = match peer.quic_addr.parse() {
let addr: SocketAddr = match peer.info.quic_addr.parse() {
Ok(a) => a,
Err(_) => continue,
};
@@ -158,23 +175,23 @@ impl MembershipManager {
});
match tokio::time::timeout(
Duration::from_secs(5),
self.send_heartbeat_to_peer(&peer.node_id, addr, &heartbeat),
self.heartbeat_timeout,
self.send_heartbeat_to_peer(&peer.info.node_id, addr, &heartbeat),
)
.await
{
Ok(Ok(())) => {
responded.push(peer.node_id.clone());
responded.push(peer.info.node_id.clone());
}
Ok(Err(e)) => {
tracing::debug!(
peer = %peer.node_id,
peer = %peer.info.node_id,
error = %e,
"Heartbeat failed"
);
}
Err(_) => {
tracing::debug!(peer = %peer.node_id, "Heartbeat timed out");
tracing::debug!(peer = %peer.info.node_id, "Heartbeat timed out");
}
}
}