feat(cluster,server,auth): add operational health endpoints, persist cluster topology, and hide credential secrets from runtime listings
This commit is contained in:
@@ -18,6 +18,7 @@ pub struct MembershipManager {
|
||||
state: Arc<ClusterState>,
|
||||
transport: Arc<QuicTransport>,
|
||||
heartbeat_interval: Duration,
|
||||
heartbeat_timeout: Duration,
|
||||
local_node_info: NodeInfo,
|
||||
drive_manager: Option<Arc<Mutex<DriveManager>>>,
|
||||
}
|
||||
@@ -27,12 +28,14 @@ impl MembershipManager {
|
||||
state: Arc<ClusterState>,
|
||||
transport: Arc<QuicTransport>,
|
||||
heartbeat_interval_ms: u64,
|
||||
heartbeat_timeout_ms: u64,
|
||||
local_node_info: NodeInfo,
|
||||
) -> Self {
|
||||
Self {
|
||||
state,
|
||||
transport,
|
||||
heartbeat_interval: Duration::from_millis(heartbeat_interval_ms),
|
||||
heartbeat_timeout: Duration::from_millis(heartbeat_timeout_ms),
|
||||
local_node_info,
|
||||
drive_manager: None,
|
||||
}
|
||||
@@ -46,7 +49,7 @@ impl MembershipManager {
|
||||
|
||||
/// Join the cluster by contacting seed nodes.
|
||||
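/// Sends a JoinRequest to each seed node until one accepts. With an empty seed list the
/// local node registers itself as the initial cluster node. If no seed can be reached, the
/// node bootstraps a new cluster only when `allow_bootstrap_on_failure` is set; otherwise
/// an error is returned.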
|
||||
pub async fn join_cluster(&self, seed_nodes: &[String]) -> Result<()> {
|
||||
pub async fn join_cluster(&self, seed_nodes: &[String], allow_bootstrap_on_failure: bool) -> Result<()> {
|
||||
if seed_nodes.is_empty() {
|
||||
tracing::info!("No seed nodes configured, starting as initial cluster node");
|
||||
self.state.add_node(self.local_node_info.clone()).await;
|
||||
@@ -75,10 +78,13 @@ impl MembershipManager {
|
||||
}
|
||||
}
|
||||
|
||||
// If no seed responded, start as a new cluster
|
||||
tracing::info!("Could not reach any seed nodes, starting as initial cluster node");
|
||||
self.state.add_node(self.local_node_info.clone()).await;
|
||||
Ok(())
|
||||
if allow_bootstrap_on_failure {
|
||||
tracing::warn!("Could not reach any seed nodes, bootstrapping a new cluster because no persisted topology exists");
|
||||
self.state.add_node(self.local_node_info.clone()).await;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
anyhow::bail!("Could not reach any configured seed nodes; refusing unsafe cluster bootstrap")
|
||||
}
|
||||
|
||||
async fn try_join(&self, addr: SocketAddr) -> Result<()> {
|
||||
@@ -97,9 +103,14 @@ impl MembershipManager {
|
||||
ClusterResponse::JoinResponse(join_resp) => {
|
||||
if join_resp.accepted {
|
||||
if let Some(topology) = &join_resp.topology {
|
||||
let topology_contains_self = topology
|
||||
.nodes
|
||||
.iter()
|
||||
.any(|node| node.node_id == self.local_node_info.node_id);
|
||||
self.state.apply_topology(topology).await;
|
||||
// Also register self
|
||||
self.state.add_node(self.local_node_info.clone()).await;
|
||||
if !topology_contains_self {
|
||||
self.state.add_node(self.local_node_info.clone()).await;
|
||||
}
|
||||
tracing::info!(
|
||||
"Applied cluster topology (version {}, {} nodes, {} erasure sets)",
|
||||
topology.version,
|
||||
@@ -137,7 +148,13 @@ impl MembershipManager {
|
||||
}
|
||||
|
||||
async fn send_heartbeats(&self) {
|
||||
let peers = self.state.online_peers().await;
|
||||
let peers = self
|
||||
.state
|
||||
.all_nodes()
|
||||
.await
|
||||
.into_iter()
|
||||
.filter(|node| node.info.node_id != self.local_node_info.node_id)
|
||||
.collect::<Vec<_>>();
|
||||
let topology_version = self.state.version().await;
|
||||
let mut responded = Vec::new();
|
||||
|
||||
@@ -145,7 +162,7 @@ impl MembershipManager {
|
||||
let drive_states = self.collect_drive_states().await;
|
||||
|
||||
for peer in &peers {
|
||||
let addr: SocketAddr = match peer.quic_addr.parse() {
|
||||
let addr: SocketAddr = match peer.info.quic_addr.parse() {
|
||||
Ok(a) => a,
|
||||
Err(_) => continue,
|
||||
};
|
||||
@@ -158,23 +175,23 @@ impl MembershipManager {
|
||||
});
|
||||
|
||||
match tokio::time::timeout(
|
||||
Duration::from_secs(5),
|
||||
self.send_heartbeat_to_peer(&peer.node_id, addr, &heartbeat),
|
||||
self.heartbeat_timeout,
|
||||
self.send_heartbeat_to_peer(&peer.info.node_id, addr, &heartbeat),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(Ok(())) => {
|
||||
responded.push(peer.node_id.clone());
|
||||
responded.push(peer.info.node_id.clone());
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
tracing::debug!(
|
||||
peer = %peer.node_id,
|
||||
peer = %peer.info.node_id,
|
||||
error = %e,
|
||||
"Heartbeat failed"
|
||||
);
|
||||
}
|
||||
Err(_) => {
|
||||
tracing::debug!(peer = %peer.node_id, "Heartbeat timed out");
|
||||
tracing::debug!(peer = %peer.info.node_id, "Heartbeat timed out");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user