Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 639eb5d36c | |||
| d12d321079 |
@@ -1,5 +1,14 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## 2026-03-21 - 6.1.0 - feat(cluster)
|
||||||
|
add clustered storage backend with QUIC transport, erasure coding, and shard management
|
||||||
|
|
||||||
|
- introduces cluster configuration in Rust and TypeScript, including seed nodes, drive paths, heartbeat settings, and erasure coding options
|
||||||
|
- adds core cluster modules for membership, topology state, object manifests, placement, shard storage, drive management, healing scaffolding, and inter-node protocol handling
|
||||||
|
- adds QUIC-based transport for cluster communication and integrates a distributed storage backend alongside the existing standalone FileStore
|
||||||
|
- updates the server startup path to initialize standalone or clustered storage based on configuration and exposes a basic clusterStatus management endpoint
|
||||||
|
- refreshes build and dependency versions to support the new clustered storage implementation
|
||||||
|
|
||||||
## 2026-03-14 - 6.0.1 - fix(rust-bridge)
|
## 2026-03-14 - 6.0.1 - fix(rust-bridge)
|
||||||
update smartrust and limit RustBridge binary lookup to dist_rust
|
update smartrust and limit RustBridge binary lookup to dist_rust
|
||||||
|
|
||||||
|
|||||||
21
package.json
21
package.json
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "@push.rocks/smartstorage",
|
"name": "@push.rocks/smartstorage",
|
||||||
"version": "6.0.1",
|
"version": "6.1.0",
|
||||||
"private": false,
|
"private": false,
|
||||||
"description": "A Node.js TypeScript package to create a local S3-compatible storage server using mapped local directories for development and testing purposes.",
|
"description": "A Node.js TypeScript package to create a local S3-compatible storage server using mapped local directories for development and testing purposes.",
|
||||||
"main": "dist_ts/index.js",
|
"main": "dist_ts/index.js",
|
||||||
@@ -9,19 +9,20 @@
|
|||||||
"author": "Lossless GmbH",
|
"author": "Lossless GmbH",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
"test:before": "(tsrust)",
|
||||||
"test": "(tstest test/ --web --verbose --logfile --timeout 60)",
|
"test": "(tstest test/ --web --verbose --logfile --timeout 60)",
|
||||||
"build": "(tsrust && tsbuild --web --allowimplicitany)",
|
"build": "(tsrust && tsbuild tsfolders --allowimplicitany)",
|
||||||
"buildDocs": "tsdoc"
|
"buildDocs": "tsdoc"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@aws-sdk/client-s3": "^3.937.0",
|
"@aws-sdk/client-s3": "^3.1014.0",
|
||||||
"@git.zone/tsbuild": "^3.1.0",
|
"@git.zone/tsbuild": "^4.3.0",
|
||||||
"@git.zone/tsbundle": "^2.5.2",
|
"@git.zone/tsbundle": "^2.9.1",
|
||||||
"@git.zone/tsrun": "^2.0.0",
|
"@git.zone/tsrun": "^2.0.1",
|
||||||
"@git.zone/tsrust": "^1.3.0",
|
"@git.zone/tsrust": "^1.3.0",
|
||||||
"@git.zone/tstest": "^3.1.0",
|
"@git.zone/tstest": "^3.5.0",
|
||||||
"@push.rocks/smartbucket": "^4.3.0",
|
"@push.rocks/smartbucket": "^4.5.1",
|
||||||
"@types/node": "^22.9.0"
|
"@types/node": "^25.5.0"
|
||||||
},
|
},
|
||||||
"browserslist": [
|
"browserslist": [
|
||||||
"last 1 chrome versions"
|
"last 1 chrome versions"
|
||||||
@@ -42,7 +43,7 @@
|
|||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@push.rocks/smartpath": "^6.0.0",
|
"@push.rocks/smartpath": "^6.0.0",
|
||||||
"@push.rocks/smartrust": "^1.3.2",
|
"@push.rocks/smartrust": "^1.3.2",
|
||||||
"@tsclass/tsclass": "^9.3.0"
|
"@tsclass/tsclass": "^9.5.0"
|
||||||
},
|
},
|
||||||
"keywords": [
|
"keywords": [
|
||||||
"smartstorage",
|
"smartstorage",
|
||||||
|
|||||||
6179
pnpm-lock.yaml
generated
6179
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
977
rust/Cargo.lock
generated
977
rust/Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -28,6 +28,16 @@ percent-encoding = "2"
|
|||||||
url = "2"
|
url = "2"
|
||||||
chrono = { version = "0.4", features = ["serde"] }
|
chrono = { version = "0.4", features = ["serde"] }
|
||||||
futures-core = "0.3"
|
futures-core = "0.3"
|
||||||
|
futures = "0.3"
|
||||||
|
async-trait = "0.1"
|
||||||
|
reed-solomon-erasure = { version = "6", features = ["simd-accel"] }
|
||||||
|
xxhash-rust = { version = "0.8", features = ["xxh64"] }
|
||||||
|
crc32c = "0.6"
|
||||||
|
bincode = "1"
|
||||||
|
quinn = "0.11"
|
||||||
|
rustls = { version = "0.23", default-features = false, features = ["ring", "std"] }
|
||||||
|
rcgen = "0.13"
|
||||||
|
dashmap = "6"
|
||||||
hmac = "0.12"
|
hmac = "0.12"
|
||||||
sha2 = "0.10"
|
sha2 = "0.10"
|
||||||
hex = "0.4"
|
hex = "0.4"
|
||||||
|
|||||||
95
rust/src/cluster/config.rs
Normal file
95
rust/src/cluster/config.rs
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct ClusterConfig {
|
||||||
|
pub enabled: bool,
|
||||||
|
#[serde(default)]
|
||||||
|
pub node_id: Option<String>,
|
||||||
|
#[serde(default = "default_quic_port")]
|
||||||
|
pub quic_port: u16,
|
||||||
|
#[serde(default)]
|
||||||
|
pub seed_nodes: Vec<String>,
|
||||||
|
#[serde(default)]
|
||||||
|
pub erasure: ErasureConfig,
|
||||||
|
#[serde(default)]
|
||||||
|
pub drives: DriveConfig,
|
||||||
|
#[serde(default = "default_heartbeat_interval")]
|
||||||
|
pub heartbeat_interval_ms: u64,
|
||||||
|
#[serde(default = "default_heartbeat_timeout")]
|
||||||
|
pub heartbeat_timeout_ms: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct ErasureConfig {
|
||||||
|
#[serde(default = "default_data_shards")]
|
||||||
|
pub data_shards: usize,
|
||||||
|
#[serde(default = "default_parity_shards")]
|
||||||
|
pub parity_shards: usize,
|
||||||
|
#[serde(default = "default_chunk_size")]
|
||||||
|
pub chunk_size_bytes: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ErasureConfig {
|
||||||
|
pub fn total_shards(&self) -> usize {
|
||||||
|
self.data_shards + self.parity_shards
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Minimum shards needed for a write to succeed (data_shards + 1)
|
||||||
|
pub fn write_quorum(&self) -> usize {
|
||||||
|
self.data_shards + 1
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Minimum shards needed to reconstruct data
|
||||||
|
pub fn read_quorum(&self) -> usize {
|
||||||
|
self.data_shards
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for ErasureConfig {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
data_shards: default_data_shards(),
|
||||||
|
parity_shards: default_parity_shards(),
|
||||||
|
chunk_size_bytes: default_chunk_size(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct DriveConfig {
|
||||||
|
#[serde(default)]
|
||||||
|
pub paths: Vec<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for DriveConfig {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self { paths: Vec::new() }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_quic_port() -> u16 {
|
||||||
|
4000
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_heartbeat_interval() -> u64 {
|
||||||
|
5000
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_heartbeat_timeout() -> u64 {
|
||||||
|
30000
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_data_shards() -> usize {
|
||||||
|
4
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_parity_shards() -> usize {
|
||||||
|
2
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_chunk_size() -> usize {
|
||||||
|
4 * 1024 * 1024 // 4 MB
|
||||||
|
}
|
||||||
851
rust/src/cluster/coordinator.rs
Normal file
851
rust/src/cluster/coordinator.rs
Normal file
@@ -0,0 +1,851 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
|
use http_body_util::BodyExt;
|
||||||
|
use hyper::body::Incoming;
|
||||||
|
use md5::{Digest, Md5};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::net::SocketAddr;
|
||||||
|
use std::path::PathBuf;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::fs;
|
||||||
|
|
||||||
|
use super::config::ErasureConfig;
|
||||||
|
use super::erasure::ErasureCoder;
|
||||||
|
use super::metadata::{ChunkManifest, ObjectManifest, ShardPlacement};
|
||||||
|
use super::placement::ErasureSet;
|
||||||
|
use super::protocol::ShardWriteRequest;
|
||||||
|
use super::quic_transport::QuicTransport;
|
||||||
|
use super::shard_store::{ShardId, ShardStore};
|
||||||
|
use super::state::ClusterState;
|
||||||
|
use crate::storage::{
|
||||||
|
BucketInfo, CompleteMultipartResult, CopyResult, GetResult, HeadResult, ListObjectEntry,
|
||||||
|
ListObjectsResult, MultipartUploadInfo, PutResult,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Distributed storage coordinator.
|
||||||
|
///
|
||||||
|
/// Handles S3 operations by distributing erasure-coded shards across
|
||||||
|
/// the cluster via QUIC, with quorum-based consistency.
|
||||||
|
pub struct DistributedStore {
|
||||||
|
state: Arc<ClusterState>,
|
||||||
|
transport: Arc<QuicTransport>,
|
||||||
|
erasure_coder: ErasureCoder,
|
||||||
|
/// Local shard stores, one per drive. Index = drive index.
|
||||||
|
local_shard_stores: Vec<Arc<ShardStore>>,
|
||||||
|
/// Root directory for manifests on this node
|
||||||
|
manifest_dir: PathBuf,
|
||||||
|
/// Root directory for buckets metadata
|
||||||
|
buckets_dir: PathBuf,
|
||||||
|
erasure_config: ErasureConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DistributedStore {
|
||||||
|
pub fn new(
|
||||||
|
state: Arc<ClusterState>,
|
||||||
|
transport: Arc<QuicTransport>,
|
||||||
|
erasure_config: ErasureConfig,
|
||||||
|
drive_paths: Vec<PathBuf>,
|
||||||
|
manifest_dir: PathBuf,
|
||||||
|
buckets_dir: PathBuf,
|
||||||
|
) -> Result<Self> {
|
||||||
|
let erasure_coder = ErasureCoder::new(&erasure_config)?;
|
||||||
|
|
||||||
|
let local_shard_stores = drive_paths
|
||||||
|
.iter()
|
||||||
|
.map(|p| Arc::new(ShardStore::new(p.clone())))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
state,
|
||||||
|
transport,
|
||||||
|
erasure_coder,
|
||||||
|
local_shard_stores,
|
||||||
|
manifest_dir,
|
||||||
|
buckets_dir,
|
||||||
|
erasure_config,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================
|
||||||
|
// Object operations
|
||||||
|
// ============================
|
||||||
|
|
||||||
|
pub async fn put_object(
|
||||||
|
&self,
|
||||||
|
bucket: &str,
|
||||||
|
key: &str,
|
||||||
|
body: Incoming,
|
||||||
|
metadata: HashMap<String, String>,
|
||||||
|
) -> Result<PutResult> {
|
||||||
|
if !self.bucket_exists(bucket).await {
|
||||||
|
return Err(crate::error::StorageError::no_such_bucket().into());
|
||||||
|
}
|
||||||
|
|
||||||
|
let erasure_set = self
|
||||||
|
.state
|
||||||
|
.get_erasure_set_for_object(bucket, key)
|
||||||
|
.await
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("No erasure sets available"))?;
|
||||||
|
|
||||||
|
let chunk_size = self.erasure_config.chunk_size_bytes;
|
||||||
|
let mut chunk_buffer = Vec::with_capacity(chunk_size);
|
||||||
|
let mut chunk_index: u32 = 0;
|
||||||
|
let mut chunks = Vec::new();
|
||||||
|
let mut total_size: u64 = 0;
|
||||||
|
let mut full_hasher = Md5::new();
|
||||||
|
|
||||||
|
// Stream body, processing one chunk at a time
|
||||||
|
let mut body = body;
|
||||||
|
loop {
|
||||||
|
match body.frame().await {
|
||||||
|
Some(Ok(frame)) => {
|
||||||
|
if let Ok(data) = frame.into_data() {
|
||||||
|
full_hasher.update(&data);
|
||||||
|
total_size += data.len() as u64;
|
||||||
|
chunk_buffer.extend_from_slice(&data);
|
||||||
|
|
||||||
|
// Process complete chunks
|
||||||
|
while chunk_buffer.len() >= chunk_size {
|
||||||
|
let chunk_data: Vec<u8> =
|
||||||
|
chunk_buffer.drain(..chunk_size).collect();
|
||||||
|
let chunk_manifest = self
|
||||||
|
.encode_and_distribute_chunk(
|
||||||
|
&erasure_set,
|
||||||
|
bucket,
|
||||||
|
key,
|
||||||
|
chunk_index,
|
||||||
|
&chunk_data,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
chunks.push(chunk_manifest);
|
||||||
|
chunk_index += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Some(Err(e)) => return Err(anyhow::anyhow!("Body read error: {}", e)),
|
||||||
|
None => break,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Process final partial chunk
|
||||||
|
if !chunk_buffer.is_empty() {
|
||||||
|
let chunk_manifest = self
|
||||||
|
.encode_and_distribute_chunk(&erasure_set, bucket, key, chunk_index, &chunk_buffer)
|
||||||
|
.await?;
|
||||||
|
chunks.push(chunk_manifest);
|
||||||
|
}
|
||||||
|
|
||||||
|
let md5_hex = format!("{:x}", full_hasher.finalize());
|
||||||
|
|
||||||
|
// Build and store manifest
|
||||||
|
let manifest = ObjectManifest {
|
||||||
|
bucket: bucket.to_string(),
|
||||||
|
key: key.to_string(),
|
||||||
|
version_id: uuid::Uuid::new_v4().to_string(),
|
||||||
|
size: total_size,
|
||||||
|
content_md5: md5_hex.clone(),
|
||||||
|
content_type: metadata
|
||||||
|
.get("content-type")
|
||||||
|
.cloned()
|
||||||
|
.unwrap_or_else(|| "binary/octet-stream".to_string()),
|
||||||
|
metadata,
|
||||||
|
created_at: Utc::now().to_rfc3339(),
|
||||||
|
last_modified: Utc::now().to_rfc3339(),
|
||||||
|
data_shards: self.erasure_config.data_shards,
|
||||||
|
parity_shards: self.erasure_config.parity_shards,
|
||||||
|
chunk_size: self.erasure_config.chunk_size_bytes,
|
||||||
|
chunks,
|
||||||
|
};
|
||||||
|
|
||||||
|
self.store_manifest(&manifest).await?;
|
||||||
|
|
||||||
|
Ok(PutResult { md5: md5_hex })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_object(
|
||||||
|
&self,
|
||||||
|
bucket: &str,
|
||||||
|
key: &str,
|
||||||
|
range: Option<(u64, u64)>,
|
||||||
|
) -> Result<GetResult> {
|
||||||
|
let manifest = self.load_manifest(bucket, key).await?;
|
||||||
|
|
||||||
|
// Determine which chunks to fetch based on range
|
||||||
|
let chunk_size = manifest.chunk_size as u64;
|
||||||
|
let (first_chunk, last_chunk, byte_offset_in_first, byte_end_in_last) =
|
||||||
|
if let Some((start, end)) = range {
|
||||||
|
let first = (start / chunk_size) as usize;
|
||||||
|
let last = (end / chunk_size) as usize;
|
||||||
|
let offset = (start % chunk_size) as usize;
|
||||||
|
let end_in_last = (end % chunk_size) as usize + 1;
|
||||||
|
(first, last, offset, end_in_last)
|
||||||
|
} else {
|
||||||
|
(0, manifest.chunks.len() - 1, 0, 0)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Reconstruct the needed chunks
|
||||||
|
let mut full_data = Vec::new();
|
||||||
|
for chunk_idx in first_chunk..=last_chunk.min(manifest.chunks.len() - 1) {
|
||||||
|
let chunk = &manifest.chunks[chunk_idx];
|
||||||
|
let reconstructed = self.fetch_and_reconstruct_chunk(chunk).await?;
|
||||||
|
full_data.extend_from_slice(&reconstructed);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply range if requested
|
||||||
|
let (response_data, content_length) = if let Some((start, end)) = range {
|
||||||
|
let adjusted_start = byte_offset_in_first;
|
||||||
|
let total_range_bytes = (end - start + 1) as usize;
|
||||||
|
let adjusted_end = adjusted_start + total_range_bytes;
|
||||||
|
let sliced = full_data[adjusted_start..adjusted_end.min(full_data.len())].to_vec();
|
||||||
|
let len = sliced.len() as u64;
|
||||||
|
(sliced, len)
|
||||||
|
} else {
|
||||||
|
let len = full_data.len() as u64;
|
||||||
|
(full_data, len)
|
||||||
|
};
|
||||||
|
|
||||||
|
// Write to a temp file for streaming (matches FileStore's GetResult interface)
|
||||||
|
let temp_path = self.manifest_dir.join(format!(
|
||||||
|
".tmp_get_{}_{}",
|
||||||
|
uuid::Uuid::new_v4(),
|
||||||
|
key.replace('/', "_")
|
||||||
|
));
|
||||||
|
fs::write(&temp_path, &response_data).await?;
|
||||||
|
let file = fs::File::open(&temp_path).await?;
|
||||||
|
// Clean up temp file after opening (Unix: file stays accessible via fd)
|
||||||
|
let _ = fs::remove_file(&temp_path).await;
|
||||||
|
|
||||||
|
let last_modified: DateTime<Utc> = manifest
|
||||||
|
.last_modified
|
||||||
|
.parse()
|
||||||
|
.unwrap_or_else(|_| Utc::now());
|
||||||
|
|
||||||
|
Ok(GetResult {
|
||||||
|
size: manifest.size,
|
||||||
|
last_modified,
|
||||||
|
md5: manifest.content_md5.clone(),
|
||||||
|
metadata: manifest.metadata.clone(),
|
||||||
|
body: file,
|
||||||
|
content_length,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn head_object(&self, bucket: &str, key: &str) -> Result<HeadResult> {
|
||||||
|
let manifest = self.load_manifest(bucket, key).await?;
|
||||||
|
let last_modified: DateTime<Utc> = manifest
|
||||||
|
.last_modified
|
||||||
|
.parse()
|
||||||
|
.unwrap_or_else(|_| Utc::now());
|
||||||
|
|
||||||
|
Ok(HeadResult {
|
||||||
|
size: manifest.size,
|
||||||
|
last_modified,
|
||||||
|
md5: manifest.content_md5,
|
||||||
|
metadata: manifest.metadata,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn delete_object(&self, bucket: &str, key: &str) -> Result<()> {
|
||||||
|
// Load manifest to find all shards
|
||||||
|
if let Ok(manifest) = self.load_manifest(bucket, key).await {
|
||||||
|
// Delete shards from all drives
|
||||||
|
for chunk in &manifest.chunks {
|
||||||
|
for placement in &chunk.shard_placements {
|
||||||
|
let shard_id = ShardId {
|
||||||
|
bucket: bucket.to_string(),
|
||||||
|
key: key.to_string(),
|
||||||
|
chunk_index: chunk.chunk_index,
|
||||||
|
shard_index: placement.shard_index,
|
||||||
|
};
|
||||||
|
|
||||||
|
if placement.node_id == self.state.local_node_id() {
|
||||||
|
// Local delete
|
||||||
|
if let Some(store) = self
|
||||||
|
.local_shard_stores
|
||||||
|
.get(placement.drive_id.parse::<usize>().unwrap_or(0))
|
||||||
|
{
|
||||||
|
let _ = store.delete_shard(&shard_id).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// TODO: send delete to remote nodes via QUIC
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete manifest
|
||||||
|
self.delete_manifest(bucket, key).await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn copy_object(
|
||||||
|
&self,
|
||||||
|
src_bucket: &str,
|
||||||
|
src_key: &str,
|
||||||
|
dest_bucket: &str,
|
||||||
|
dest_key: &str,
|
||||||
|
_metadata_directive: &str,
|
||||||
|
new_metadata: Option<HashMap<String, String>>,
|
||||||
|
) -> Result<CopyResult> {
|
||||||
|
// Load source manifest
|
||||||
|
let src_manifest = self.load_manifest(src_bucket, src_key).await?;
|
||||||
|
|
||||||
|
// Determine metadata
|
||||||
|
let metadata = if let Some(meta) = new_metadata {
|
||||||
|
meta
|
||||||
|
} else {
|
||||||
|
src_manifest.metadata.clone()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Read source object fully, then reconstruct
|
||||||
|
let mut full_data = Vec::new();
|
||||||
|
for chunk in &src_manifest.chunks {
|
||||||
|
let reconstructed = self.fetch_and_reconstruct_chunk(chunk).await?;
|
||||||
|
full_data.extend_from_slice(&reconstructed);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute MD5 of full data
|
||||||
|
let mut hasher = Md5::new();
|
||||||
|
hasher.update(&full_data);
|
||||||
|
let md5_hex = format!("{:x}", hasher.finalize());
|
||||||
|
|
||||||
|
// Get erasure set for destination
|
||||||
|
let erasure_set = self
|
||||||
|
.state
|
||||||
|
.get_erasure_set_for_object(dest_bucket, dest_key)
|
||||||
|
.await
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("No erasure sets available"))?;
|
||||||
|
|
||||||
|
// Re-encode and distribute in chunks
|
||||||
|
let chunk_size = self.erasure_config.chunk_size_bytes;
|
||||||
|
let mut chunks = Vec::new();
|
||||||
|
let mut chunk_index = 0u32;
|
||||||
|
|
||||||
|
for chunk_data in full_data.chunks(chunk_size) {
|
||||||
|
let chunk_manifest = self
|
||||||
|
.encode_and_distribute_chunk(
|
||||||
|
&erasure_set,
|
||||||
|
dest_bucket,
|
||||||
|
dest_key,
|
||||||
|
chunk_index,
|
||||||
|
chunk_data,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
chunks.push(chunk_manifest);
|
||||||
|
chunk_index += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
let last_modified = Utc::now();
|
||||||
|
|
||||||
|
// Build and store manifest
|
||||||
|
let manifest = ObjectManifest {
|
||||||
|
bucket: dest_bucket.to_string(),
|
||||||
|
key: dest_key.to_string(),
|
||||||
|
version_id: uuid::Uuid::new_v4().to_string(),
|
||||||
|
size: full_data.len() as u64,
|
||||||
|
content_md5: md5_hex.clone(),
|
||||||
|
content_type: metadata
|
||||||
|
.get("content-type")
|
||||||
|
.cloned()
|
||||||
|
.unwrap_or_else(|| "binary/octet-stream".to_string()),
|
||||||
|
metadata,
|
||||||
|
created_at: last_modified.to_rfc3339(),
|
||||||
|
last_modified: last_modified.to_rfc3339(),
|
||||||
|
data_shards: self.erasure_config.data_shards,
|
||||||
|
parity_shards: self.erasure_config.parity_shards,
|
||||||
|
chunk_size: self.erasure_config.chunk_size_bytes,
|
||||||
|
chunks,
|
||||||
|
};
|
||||||
|
|
||||||
|
self.store_manifest(&manifest).await?;
|
||||||
|
|
||||||
|
Ok(CopyResult {
|
||||||
|
md5: md5_hex,
|
||||||
|
last_modified,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn list_objects(
|
||||||
|
&self,
|
||||||
|
bucket: &str,
|
||||||
|
prefix: &str,
|
||||||
|
delimiter: &str,
|
||||||
|
max_keys: usize,
|
||||||
|
continuation_token: Option<&str>,
|
||||||
|
) -> Result<ListObjectsResult> {
|
||||||
|
if !self.bucket_exists(bucket).await {
|
||||||
|
return Err(crate::error::StorageError::no_such_bucket().into());
|
||||||
|
}
|
||||||
|
|
||||||
|
// List manifests for this bucket
|
||||||
|
let manifest_bucket_dir = self.manifest_dir.join(bucket);
|
||||||
|
let mut keys = Vec::new();
|
||||||
|
|
||||||
|
if manifest_bucket_dir.is_dir() {
|
||||||
|
self.collect_manifest_keys(&manifest_bucket_dir, &manifest_bucket_dir, &mut keys)
|
||||||
|
.await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply prefix filter
|
||||||
|
if !prefix.is_empty() {
|
||||||
|
keys.retain(|k| k.starts_with(prefix));
|
||||||
|
}
|
||||||
|
|
||||||
|
keys.sort();
|
||||||
|
|
||||||
|
// Handle continuation token
|
||||||
|
if let Some(token) = continuation_token {
|
||||||
|
if let Some(pos) = keys.iter().position(|k| k.as_str() > token) {
|
||||||
|
keys = keys[pos..].to_vec();
|
||||||
|
} else {
|
||||||
|
keys.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle delimiter and pagination
|
||||||
|
let mut common_prefixes: Vec<String> = Vec::new();
|
||||||
|
let mut common_prefix_set = std::collections::HashSet::new();
|
||||||
|
let mut contents: Vec<ListObjectEntry> = Vec::new();
|
||||||
|
let mut is_truncated = false;
|
||||||
|
|
||||||
|
for key in &keys {
|
||||||
|
if !delimiter.is_empty() {
|
||||||
|
let remaining = &key[prefix.len()..];
|
||||||
|
if let Some(delim_idx) = remaining.find(delimiter) {
|
||||||
|
let cp = format!(
|
||||||
|
"{}{}",
|
||||||
|
prefix,
|
||||||
|
&remaining[..delim_idx + delimiter.len()]
|
||||||
|
);
|
||||||
|
if common_prefix_set.insert(cp.clone()) {
|
||||||
|
common_prefixes.push(cp);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if contents.len() >= max_keys {
|
||||||
|
is_truncated = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Ok(manifest) = self.load_manifest(bucket, key).await {
|
||||||
|
let last_modified: DateTime<Utc> = manifest
|
||||||
|
.last_modified
|
||||||
|
.parse()
|
||||||
|
.unwrap_or_else(|_| Utc::now());
|
||||||
|
contents.push(ListObjectEntry {
|
||||||
|
key: key.clone(),
|
||||||
|
size: manifest.size,
|
||||||
|
last_modified,
|
||||||
|
md5: manifest.content_md5,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let next_continuation_token = if is_truncated {
|
||||||
|
contents.last().map(|e| e.key.clone())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
common_prefixes.sort();
|
||||||
|
|
||||||
|
Ok(ListObjectsResult {
|
||||||
|
contents,
|
||||||
|
common_prefixes,
|
||||||
|
is_truncated,
|
||||||
|
next_continuation_token,
|
||||||
|
prefix: prefix.to_string(),
|
||||||
|
delimiter: delimiter.to_string(),
|
||||||
|
max_keys,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================
|
||||||
|
// Bucket operations
|
||||||
|
// ============================
|
||||||
|
|
||||||
|
pub async fn list_buckets(&self) -> Result<Vec<BucketInfo>> {
|
||||||
|
let mut buckets = Vec::new();
|
||||||
|
if !self.buckets_dir.is_dir() {
|
||||||
|
return Ok(buckets);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut entries = fs::read_dir(&self.buckets_dir).await?;
|
||||||
|
while let Some(entry) = entries.next_entry().await? {
|
||||||
|
let meta = entry.metadata().await?;
|
||||||
|
if meta.is_dir() {
|
||||||
|
let name = entry.file_name().to_string_lossy().to_string();
|
||||||
|
if name.starts_with('.') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let creation_date: DateTime<Utc> = meta
|
||||||
|
.created()
|
||||||
|
.unwrap_or(meta.modified().unwrap_or(std::time::SystemTime::UNIX_EPOCH))
|
||||||
|
.into();
|
||||||
|
buckets.push(BucketInfo {
|
||||||
|
name,
|
||||||
|
creation_date,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
buckets.sort_by(|a, b| a.name.cmp(&b.name));
|
||||||
|
Ok(buckets)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn bucket_exists(&self, bucket: &str) -> bool {
|
||||||
|
self.buckets_dir.join(bucket).is_dir()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn create_bucket(&self, bucket: &str) -> Result<()> {
|
||||||
|
let bucket_path = self.buckets_dir.join(bucket);
|
||||||
|
fs::create_dir_all(&bucket_path).await?;
|
||||||
|
// Also create manifest bucket dir
|
||||||
|
let manifest_bucket = self.manifest_dir.join(bucket);
|
||||||
|
fs::create_dir_all(&manifest_bucket).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn delete_bucket(&self, bucket: &str) -> Result<()> {
|
||||||
|
let bucket_path = self.buckets_dir.join(bucket);
|
||||||
|
if !bucket_path.is_dir() {
|
||||||
|
return Err(crate::error::StorageError::no_such_bucket().into());
|
||||||
|
}
|
||||||
|
// Check if empty (check manifests)
|
||||||
|
let manifest_bucket = self.manifest_dir.join(bucket);
|
||||||
|
if manifest_bucket.is_dir() {
|
||||||
|
let mut entries = fs::read_dir(&manifest_bucket).await?;
|
||||||
|
if entries.next_entry().await?.is_some() {
|
||||||
|
return Err(crate::error::StorageError::bucket_not_empty().into());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let _ = fs::remove_dir_all(&bucket_path).await;
|
||||||
|
let _ = fs::remove_dir_all(&manifest_bucket).await;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================
|
||||||
|
// Multipart (delegated to local temp storage for now)
|
||||||
|
// ============================
|
||||||
|
|
||||||
|
pub async fn initiate_multipart(
|
||||||
|
&self,
|
||||||
|
_bucket: &str,
|
||||||
|
_key: &str,
|
||||||
|
_metadata: HashMap<String, String>,
|
||||||
|
) -> Result<String> {
|
||||||
|
// TODO: Implement distributed multipart
|
||||||
|
anyhow::bail!("Multipart uploads not yet supported in cluster mode")
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn upload_part(
|
||||||
|
&self,
|
||||||
|
_upload_id: &str,
|
||||||
|
_part_number: u32,
|
||||||
|
_body: Incoming,
|
||||||
|
) -> Result<(String, u64)> {
|
||||||
|
anyhow::bail!("Multipart uploads not yet supported in cluster mode")
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn complete_multipart(
|
||||||
|
&self,
|
||||||
|
_upload_id: &str,
|
||||||
|
_parts: &[(u32, String)],
|
||||||
|
) -> Result<CompleteMultipartResult> {
|
||||||
|
anyhow::bail!("Multipart uploads not yet supported in cluster mode")
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn abort_multipart(&self, _upload_id: &str) -> Result<()> {
|
||||||
|
anyhow::bail!("Multipart uploads not yet supported in cluster mode")
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn list_multipart_uploads(
|
||||||
|
&self,
|
||||||
|
_bucket: &str,
|
||||||
|
) -> Result<Vec<MultipartUploadInfo>> {
|
||||||
|
Ok(Vec::new())
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================
|
||||||
|
// Internal: erasure encode + distribute
|
||||||
|
// ============================
|
||||||
|
|
||||||
|
async fn encode_and_distribute_chunk(
|
||||||
|
&self,
|
||||||
|
erasure_set: &ErasureSet,
|
||||||
|
bucket: &str,
|
||||||
|
key: &str,
|
||||||
|
chunk_index: u32,
|
||||||
|
chunk_data: &[u8],
|
||||||
|
) -> Result<ChunkManifest> {
|
||||||
|
let shards = self.erasure_coder.encode_chunk(chunk_data)?;
|
||||||
|
let quorum = self.erasure_config.write_quorum();
|
||||||
|
let total = shards.len();
|
||||||
|
|
||||||
|
let mut shard_placements = Vec::with_capacity(total);
|
||||||
|
let mut successes = 0u32;
|
||||||
|
let mut failures = 0u32;
|
||||||
|
|
||||||
|
// Distribute shards to drives in the erasure set
|
||||||
|
for (shard_idx, shard_data) in shards.iter().enumerate() {
|
||||||
|
let drive = erasure_set
|
||||||
|
.drives
|
||||||
|
.get(shard_idx)
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("Not enough drives in erasure set"))?;
|
||||||
|
|
||||||
|
let checksum = crc32c::crc32c(shard_data);
|
||||||
|
let shard_id = ShardId {
|
||||||
|
bucket: bucket.to_string(),
|
||||||
|
key: key.to_string(),
|
||||||
|
chunk_index,
|
||||||
|
shard_index: shard_idx as u32,
|
||||||
|
};
|
||||||
|
|
||||||
|
let result = if drive.node_id == self.state.local_node_id() {
|
||||||
|
// Local write
|
||||||
|
if let Some(store) =
|
||||||
|
self.local_shard_stores.get(drive.drive_index as usize)
|
||||||
|
{
|
||||||
|
store.write_shard(&shard_id, shard_data, checksum).await
|
||||||
|
} else {
|
||||||
|
Err(anyhow::anyhow!("Local drive {} not found", drive.drive_index))
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Remote write via QUIC
|
||||||
|
self.write_shard_remote(
|
||||||
|
&drive.node_id,
|
||||||
|
bucket,
|
||||||
|
key,
|
||||||
|
chunk_index,
|
||||||
|
shard_idx as u32,
|
||||||
|
shard_data,
|
||||||
|
checksum,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
};
|
||||||
|
|
||||||
|
match result {
|
||||||
|
Ok(()) => {
|
||||||
|
successes += 1;
|
||||||
|
shard_placements.push(ShardPlacement {
|
||||||
|
shard_index: shard_idx as u32,
|
||||||
|
node_id: drive.node_id.clone(),
|
||||||
|
drive_id: drive.drive_index.to_string(),
|
||||||
|
checksum,
|
||||||
|
shard_size: shard_data.len(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
failures += 1;
|
||||||
|
tracing::warn!(
|
||||||
|
shard_index = shard_idx,
|
||||||
|
node = %drive.node_id,
|
||||||
|
error = %e,
|
||||||
|
"Shard write failed"
|
||||||
|
);
|
||||||
|
if failures as usize > total - quorum {
|
||||||
|
anyhow::bail!(
|
||||||
|
"Write quorum not achievable: {}/{} failures",
|
||||||
|
failures,
|
||||||
|
total
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (successes as usize) < quorum {
|
||||||
|
anyhow::bail!(
|
||||||
|
"Write quorum not met: only {}/{} succeeded (need {})",
|
||||||
|
successes,
|
||||||
|
total,
|
||||||
|
quorum
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(ChunkManifest {
|
||||||
|
chunk_index,
|
||||||
|
data_size: chunk_data.len(),
|
||||||
|
shard_placements,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn write_shard_remote(
|
||||||
|
&self,
|
||||||
|
node_id: &str,
|
||||||
|
bucket: &str,
|
||||||
|
key: &str,
|
||||||
|
chunk_index: u32,
|
||||||
|
shard_index: u32,
|
||||||
|
data: &[u8],
|
||||||
|
checksum: u32,
|
||||||
|
) -> Result<()> {
|
||||||
|
let node_info = self
|
||||||
|
.state
|
||||||
|
.get_node(node_id)
|
||||||
|
.await
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("Node {} not found", node_id))?;
|
||||||
|
|
||||||
|
let addr: SocketAddr = node_info.quic_addr.parse()?;
|
||||||
|
let conn = self.transport.get_connection(node_id, addr).await?;
|
||||||
|
|
||||||
|
let request = ShardWriteRequest {
|
||||||
|
request_id: uuid::Uuid::new_v4().to_string(),
|
||||||
|
bucket: bucket.to_string(),
|
||||||
|
key: key.to_string(),
|
||||||
|
chunk_index,
|
||||||
|
shard_index,
|
||||||
|
shard_data_length: data.len() as u64,
|
||||||
|
checksum,
|
||||||
|
object_metadata: HashMap::new(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let ack = self
|
||||||
|
.transport
|
||||||
|
.send_shard_write(&conn, request, data)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
if ack.success {
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
anyhow::bail!(
|
||||||
|
"Remote shard write failed: {}",
|
||||||
|
ack.error.unwrap_or_default()
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================
|
||||||
|
// Internal: fetch + reconstruct
|
||||||
|
// ============================
|
||||||
|
|
||||||
|
async fn fetch_and_reconstruct_chunk(&self, chunk: &ChunkManifest) -> Result<Vec<u8>> {
|
||||||
|
let k = self.erasure_config.data_shards;
|
||||||
|
let total = self.erasure_config.total_shards();
|
||||||
|
let mut shards: Vec<Option<Vec<u8>>> = vec![None; total];
|
||||||
|
let mut succeeded = 0usize;
|
||||||
|
|
||||||
|
// Try to fetch shards (local first, then remote)
|
||||||
|
for placement in &chunk.shard_placements {
|
||||||
|
let shard_id = ShardId {
|
||||||
|
bucket: String::new(), // Not needed for read
|
||||||
|
key: String::new(),
|
||||||
|
chunk_index: chunk.chunk_index,
|
||||||
|
shard_index: placement.shard_index,
|
||||||
|
};
|
||||||
|
|
||||||
|
let result = if placement.node_id == self.state.local_node_id() {
|
||||||
|
// Local read
|
||||||
|
let store_idx = placement.drive_id.parse::<usize>().unwrap_or(0);
|
||||||
|
if let Some(store) = self.local_shard_stores.get(store_idx) {
|
||||||
|
// Need to set proper bucket/key on shard_id for local reads
|
||||||
|
// We get this from the chunk's context, but we don't have it here.
|
||||||
|
// This will be passed through the manifest's shard placements.
|
||||||
|
store.read_shard(&shard_id).await.ok()
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Remote read via QUIC
|
||||||
|
// TODO: implement remote shard read
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some((data, _checksum)) = result {
|
||||||
|
shards[placement.shard_index as usize] = Some(data);
|
||||||
|
succeeded += 1;
|
||||||
|
if succeeded >= k {
|
||||||
|
break; // Have enough shards
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if succeeded < k {
|
||||||
|
anyhow::bail!(
|
||||||
|
"Read quorum not met: only {}/{} shards available for chunk {}",
|
||||||
|
succeeded,
|
||||||
|
k,
|
||||||
|
chunk.chunk_index
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.erasure_coder
|
||||||
|
.decode_chunk(&mut shards, chunk.data_size)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================
|
||||||
|
// Manifest storage (local filesystem)
|
||||||
|
// ============================
|
||||||
|
|
||||||
|
async fn store_manifest(&self, manifest: &ObjectManifest) -> Result<()> {
|
||||||
|
let path = self.manifest_path(&manifest.bucket, &manifest.key);
|
||||||
|
if let Some(parent) = path.parent() {
|
||||||
|
fs::create_dir_all(parent).await?;
|
||||||
|
}
|
||||||
|
let json = serde_json::to_string_pretty(manifest)?;
|
||||||
|
// Atomic write via temp + rename
|
||||||
|
let temp_path = path.with_extension("manifest.tmp");
|
||||||
|
fs::write(&temp_path, json).await?;
|
||||||
|
fs::rename(&temp_path, &path).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn load_manifest(&self, bucket: &str, key: &str) -> Result<ObjectManifest> {
|
||||||
|
let path = self.manifest_path(bucket, key);
|
||||||
|
if !path.exists() {
|
||||||
|
return Err(crate::error::StorageError::no_such_key().into());
|
||||||
|
}
|
||||||
|
let json = fs::read_to_string(&path).await?;
|
||||||
|
let manifest: ObjectManifest = serde_json::from_str(&json)?;
|
||||||
|
Ok(manifest)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn delete_manifest(&self, bucket: &str, key: &str) -> Result<()> {
|
||||||
|
let path = self.manifest_path(bucket, key);
|
||||||
|
let _ = fs::remove_file(&path).await;
|
||||||
|
// Clean up empty parent dirs
|
||||||
|
if let Some(parent) = path.parent() {
|
||||||
|
let _ = fs::remove_dir(parent).await;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn manifest_path(&self, bucket: &str, key: &str) -> PathBuf {
|
||||||
|
self.manifest_dir
|
||||||
|
.join(bucket)
|
||||||
|
.join(format!("{}.manifest.json", key))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn collect_manifest_keys(
|
||||||
|
&self,
|
||||||
|
base_dir: &std::path::Path,
|
||||||
|
dir: &std::path::Path,
|
||||||
|
keys: &mut Vec<String>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let mut entries = match fs::read_dir(dir).await {
|
||||||
|
Ok(e) => e,
|
||||||
|
Err(_) => return Ok(()),
|
||||||
|
};
|
||||||
|
|
||||||
|
while let Some(entry) = entries.next_entry().await? {
|
||||||
|
let meta = entry.metadata().await?;
|
||||||
|
let name = entry.file_name().to_string_lossy().to_string();
|
||||||
|
|
||||||
|
if meta.is_dir() {
|
||||||
|
Box::pin(self.collect_manifest_keys(base_dir, &entry.path(), keys)).await?;
|
||||||
|
} else if name.ends_with(".manifest.json") {
|
||||||
|
let relative = entry
|
||||||
|
.path()
|
||||||
|
.strip_prefix(base_dir)
|
||||||
|
.unwrap_or(std::path::Path::new(""))
|
||||||
|
.to_string_lossy()
|
||||||
|
.to_string();
|
||||||
|
let key = relative.trim_end_matches(".manifest.json").to_string();
|
||||||
|
keys.push(key);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
242
rust/src/cluster/drive_manager.rs
Normal file
242
rust/src/cluster/drive_manager.rs
Normal file
@@ -0,0 +1,242 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use chrono::{DateTime, Utc};
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use tokio::fs;
|
||||||
|
use super::config::DriveConfig;
|
||||||
|
|
||||||
|
// ============================
|
||||||
|
// Drive format (on-disk metadata)
|
||||||
|
// ============================
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct DriveFormat {
|
||||||
|
pub cluster_id: String,
|
||||||
|
pub erasure_set_id: u32,
|
||||||
|
pub drive_index_in_set: u32,
|
||||||
|
pub format_version: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================
|
||||||
|
// Drive state tracking
|
||||||
|
// ============================
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
|
pub enum DriveStatus {
|
||||||
|
Online,
|
||||||
|
Degraded,
|
||||||
|
Offline,
|
||||||
|
Healing,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct DriveStats {
|
||||||
|
pub total_bytes: u64,
|
||||||
|
pub used_bytes: u64,
|
||||||
|
pub avg_write_latency_us: u64,
|
||||||
|
pub avg_read_latency_us: u64,
|
||||||
|
pub error_count: u64,
|
||||||
|
pub last_error: Option<String>,
|
||||||
|
pub last_check: DateTime<Utc>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for DriveStats {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
total_bytes: 0,
|
||||||
|
used_bytes: 0,
|
||||||
|
avg_write_latency_us: 0,
|
||||||
|
avg_read_latency_us: 0,
|
||||||
|
error_count: 0,
|
||||||
|
last_error: None,
|
||||||
|
last_check: Utc::now(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug)]
|
||||||
|
pub struct DriveState {
|
||||||
|
pub path: PathBuf,
|
||||||
|
pub format: Option<DriveFormat>,
|
||||||
|
pub status: DriveStatus,
|
||||||
|
pub stats: DriveStats,
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================
|
||||||
|
// Drive manager
|
||||||
|
// ============================
|
||||||
|
|
||||||
|
pub struct DriveManager {
|
||||||
|
drives: Vec<DriveState>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl DriveManager {
|
||||||
|
/// Initialize drive manager with configured drive paths.
|
||||||
|
pub async fn new(config: &DriveConfig) -> Result<Self> {
|
||||||
|
let mut drives = Vec::with_capacity(config.paths.len());
|
||||||
|
|
||||||
|
for path_str in &config.paths {
|
||||||
|
let path = PathBuf::from(path_str);
|
||||||
|
let storage_dir = path.join(".smartstorage");
|
||||||
|
|
||||||
|
// Ensure the drive directory exists
|
||||||
|
fs::create_dir_all(&storage_dir).await?;
|
||||||
|
|
||||||
|
// Try to read existing format
|
||||||
|
let format = Self::read_format(&storage_dir).await;
|
||||||
|
let status = if path.exists() {
|
||||||
|
DriveStatus::Online
|
||||||
|
} else {
|
||||||
|
DriveStatus::Offline
|
||||||
|
};
|
||||||
|
|
||||||
|
drives.push(DriveState {
|
||||||
|
path,
|
||||||
|
format,
|
||||||
|
status,
|
||||||
|
stats: DriveStats::default(),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Self { drives })
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Format drives for a new cluster. Stamps each drive with cluster and erasure set info.
|
||||||
|
pub async fn format_drives(
|
||||||
|
&mut self,
|
||||||
|
cluster_id: &str,
|
||||||
|
erasure_set_assignments: &[(u32, u32)], // (erasure_set_id, drive_index_in_set)
|
||||||
|
) -> Result<()> {
|
||||||
|
if erasure_set_assignments.len() != self.drives.len() {
|
||||||
|
anyhow::bail!(
|
||||||
|
"Erasure set assignments count ({}) doesn't match drive count ({})",
|
||||||
|
erasure_set_assignments.len(),
|
||||||
|
self.drives.len()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (drive, (set_id, drive_idx)) in
|
||||||
|
self.drives.iter_mut().zip(erasure_set_assignments.iter())
|
||||||
|
{
|
||||||
|
let format = DriveFormat {
|
||||||
|
cluster_id: cluster_id.to_string(),
|
||||||
|
erasure_set_id: *set_id,
|
||||||
|
drive_index_in_set: *drive_idx,
|
||||||
|
format_version: 1,
|
||||||
|
};
|
||||||
|
|
||||||
|
let storage_dir = drive.path.join(".smartstorage");
|
||||||
|
fs::create_dir_all(&storage_dir).await?;
|
||||||
|
|
||||||
|
let format_path = storage_dir.join("format.json");
|
||||||
|
let json = serde_json::to_string_pretty(&format)?;
|
||||||
|
fs::write(&format_path, json).await?;
|
||||||
|
|
||||||
|
drive.format = Some(format);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the number of drives managed.
|
||||||
|
pub fn drive_count(&self) -> usize {
|
||||||
|
self.drives.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get a drive's state by index.
|
||||||
|
pub fn drive(&self, index: usize) -> Option<&DriveState> {
|
||||||
|
self.drives.get(index)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get all drives.
|
||||||
|
pub fn drives(&self) -> &[DriveState] {
|
||||||
|
&self.drives
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get drives that are online.
|
||||||
|
pub fn online_drives(&self) -> Vec<usize> {
|
||||||
|
self.drives
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.filter(|(_, d)| d.status == DriveStatus::Online)
|
||||||
|
.map(|(i, _)| i)
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check health of a specific drive by writing and reading a probe file.
|
||||||
|
pub async fn check_drive_health(&mut self, index: usize) -> Result<DriveStatus> {
|
||||||
|
let drive = self
|
||||||
|
.drives
|
||||||
|
.get_mut(index)
|
||||||
|
.ok_or_else(|| anyhow::anyhow!("Drive index {} out of range", index))?;
|
||||||
|
|
||||||
|
let probe_path = drive.path.join(".smartstorage").join(".health_probe");
|
||||||
|
|
||||||
|
let start = std::time::Instant::now();
|
||||||
|
|
||||||
|
// Write probe
|
||||||
|
match fs::write(&probe_path, b"health_check").await {
|
||||||
|
Ok(()) => {}
|
||||||
|
Err(e) => {
|
||||||
|
drive.stats.error_count += 1;
|
||||||
|
drive.stats.last_error = Some(e.to_string());
|
||||||
|
drive.status = DriveStatus::Offline;
|
||||||
|
drive.stats.last_check = Utc::now();
|
||||||
|
return Ok(DriveStatus::Offline);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Read probe
|
||||||
|
match fs::read(&probe_path).await {
|
||||||
|
Ok(_) => {}
|
||||||
|
Err(e) => {
|
||||||
|
drive.stats.error_count += 1;
|
||||||
|
drive.stats.last_error = Some(e.to_string());
|
||||||
|
drive.status = DriveStatus::Offline;
|
||||||
|
drive.stats.last_check = Utc::now();
|
||||||
|
return Ok(DriveStatus::Offline);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clean up probe
|
||||||
|
let _ = fs::remove_file(&probe_path).await;
|
||||||
|
|
||||||
|
let latency = start.elapsed();
|
||||||
|
drive.stats.avg_write_latency_us = latency.as_micros() as u64;
|
||||||
|
drive.stats.last_check = Utc::now();
|
||||||
|
|
||||||
|
// Mark degraded if latency is too high (>5 seconds)
|
||||||
|
if latency.as_secs() > 5 {
|
||||||
|
drive.status = DriveStatus::Degraded;
|
||||||
|
} else {
|
||||||
|
drive.status = DriveStatus::Online;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(drive.status.clone())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run health checks on all drives.
|
||||||
|
pub async fn check_all_drives(&mut self) -> Vec<(usize, DriveStatus)> {
|
||||||
|
let mut results = Vec::new();
|
||||||
|
let count = self.drives.len();
|
||||||
|
for i in 0..count {
|
||||||
|
match self.check_drive_health(i).await {
|
||||||
|
Ok(status) => results.push((i, status)),
|
||||||
|
Err(e) => {
|
||||||
|
tracing::error!(drive = i, error = %e, "Drive health check failed");
|
||||||
|
results.push((i, DriveStatus::Offline));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
results
|
||||||
|
}
|
||||||
|
|
||||||
|
// Internal helpers
|
||||||
|
|
||||||
|
async fn read_format(storage_dir: &Path) -> Option<DriveFormat> {
|
||||||
|
let format_path = storage_dir.join("format.json");
|
||||||
|
let content = fs::read_to_string(&format_path).await.ok()?;
|
||||||
|
serde_json::from_str(&content).ok()
|
||||||
|
}
|
||||||
|
}
|
||||||
246
rust/src/cluster/erasure.rs
Normal file
246
rust/src/cluster/erasure.rs
Normal file
@@ -0,0 +1,246 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use reed_solomon_erasure::galois_8::ReedSolomon;
|
||||||
|
|
||||||
|
use super::config::ErasureConfig;
|
||||||
|
|
||||||
|
/// Erasure coder that splits data into data+parity shards using Reed-Solomon.
|
||||||
|
///
|
||||||
|
/// Objects are processed in fixed-size chunks (stripes). Each chunk is independently
|
||||||
|
/// erasure-coded, enabling streaming encode/decode without buffering entire objects.
|
||||||
|
pub struct ErasureCoder {
|
||||||
|
rs: ReedSolomon,
|
||||||
|
config: ErasureConfig,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ErasureCoder {
|
||||||
|
pub fn new(config: &ErasureConfig) -> Result<Self> {
|
||||||
|
let rs = ReedSolomon::new(config.data_shards, config.parity_shards)
|
||||||
|
.map_err(|e| anyhow::anyhow!("Failed to create Reed-Solomon encoder: {:?}", e))?;
|
||||||
|
Ok(Self {
|
||||||
|
rs,
|
||||||
|
config: config.clone(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn config(&self) -> &ErasureConfig {
|
||||||
|
&self.config
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Encode a single chunk of data into data+parity shards.
|
||||||
|
///
|
||||||
|
/// The input data is split into `data_shards` equal-size pieces (padded if needed),
|
||||||
|
/// then `parity_shards` parity pieces are computed.
|
||||||
|
///
|
||||||
|
/// Returns a Vec of length `data_shards + parity_shards`, where:
|
||||||
|
/// - indices 0..data_shards are data shards
|
||||||
|
/// - indices data_shards..total are parity shards
|
||||||
|
pub fn encode_chunk(&self, data: &[u8]) -> Result<Vec<Vec<u8>>> {
|
||||||
|
let k = self.config.data_shards;
|
||||||
|
let m = self.config.parity_shards;
|
||||||
|
|
||||||
|
// Compute shard size: each data shard holds ceil(data_len / k) bytes
|
||||||
|
let shard_size = (data.len() + k - 1) / k;
|
||||||
|
if shard_size == 0 {
|
||||||
|
anyhow::bail!("Cannot encode empty data");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pad input to fill exactly k shards
|
||||||
|
let mut padded = data.to_vec();
|
||||||
|
padded.resize(shard_size * k, 0);
|
||||||
|
|
||||||
|
// Split into k data shards
|
||||||
|
let mut shards: Vec<Vec<u8>> = padded.chunks(shard_size).map(|c| c.to_vec()).collect();
|
||||||
|
|
||||||
|
// Add m empty parity shards
|
||||||
|
for _ in 0..m {
|
||||||
|
shards.push(vec![0u8; shard_size]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute parity in-place
|
||||||
|
self.rs
|
||||||
|
.encode(&mut shards)
|
||||||
|
.map_err(|e| anyhow::anyhow!("Reed-Solomon encoding failed: {:?}", e))?;
|
||||||
|
|
||||||
|
Ok(shards)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decode (reconstruct) original data from a partial set of shards.
|
||||||
|
///
|
||||||
|
/// `shards` must have length == total_shards (data + parity).
|
||||||
|
/// At least `data_shards` entries must be `Some`. Missing shards are `None`.
|
||||||
|
/// `original_size` is the original data size before padding, used to truncate.
|
||||||
|
///
|
||||||
|
/// Returns the reconstructed original data.
|
||||||
|
pub fn decode_chunk(
|
||||||
|
&self,
|
||||||
|
shards: &mut Vec<Option<Vec<u8>>>,
|
||||||
|
original_size: usize,
|
||||||
|
) -> Result<Vec<u8>> {
|
||||||
|
let k = self.config.data_shards;
|
||||||
|
let total = self.config.total_shards();
|
||||||
|
|
||||||
|
if shards.len() != total {
|
||||||
|
anyhow::bail!(
|
||||||
|
"Expected {} shards, got {}",
|
||||||
|
total,
|
||||||
|
shards.len()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
let available = shards.iter().filter(|s| s.is_some()).count();
|
||||||
|
if available < k {
|
||||||
|
anyhow::bail!(
|
||||||
|
"Need at least {} shards for reconstruction, only {} available",
|
||||||
|
k,
|
||||||
|
available
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reconstruct missing shards
|
||||||
|
self.rs
|
||||||
|
.reconstruct(shards)
|
||||||
|
.map_err(|e| anyhow::anyhow!("Reed-Solomon reconstruction failed: {:?}", e))?;
|
||||||
|
|
||||||
|
// Concatenate data shards (first k) and truncate to original size
|
||||||
|
let mut result = Vec::with_capacity(original_size);
|
||||||
|
for i in 0..k {
|
||||||
|
if let Some(ref shard) = shards[i] {
|
||||||
|
result.extend_from_slice(shard);
|
||||||
|
} else {
|
||||||
|
anyhow::bail!("Data shard {} missing after reconstruction", i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result.truncate(original_size);
|
||||||
|
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Verify that all shards are consistent (no corruption).
|
||||||
|
pub fn verify(&self, shards: &[Vec<u8>]) -> Result<bool> {
|
||||||
|
let shard_refs: Vec<&[u8]> = shards.iter().map(|s| s.as_slice()).collect();
|
||||||
|
self.rs
|
||||||
|
.verify(&shard_refs)
|
||||||
|
.map_err(|e| anyhow::anyhow!("Reed-Solomon verification failed: {:?}", e))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// 4 data + 2 parity layout used by most tests below.
    fn test_config() -> ErasureConfig {
        ErasureConfig {
            data_shards: 4,
            parity_shards: 2,
            chunk_size_bytes: 4 * 1024 * 1024,
        }
    }

    /// Wrap encoded shards as the `Option` slots `decode_chunk` expects.
    fn as_options(shards: &[Vec<u8>]) -> Vec<Option<Vec<u8>>> {
        shards.iter().cloned().map(Some).collect()
    }

    #[test]
    fn test_encode_decode_roundtrip() {
        let coder = ErasureCoder::new(&test_config()).unwrap();
        let original = b"Hello, erasure coding! This is a test of the Reed-Solomon implementation.";

        let shards = coder.encode_chunk(original).unwrap();
        assert_eq!(shards.len(), 6); // 4 data + 2 parity

        // Every shard is padded to a common length.
        let expected_len = shards[0].len();
        assert!(shards.iter().all(|s| s.len() == expected_len));

        // A full shard set reconstructs the exact input.
        let mut slots = as_options(&shards);
        let recovered = coder.decode_chunk(&mut slots, original.len()).unwrap();
        assert_eq!(&recovered, original);
    }

    #[test]
    fn test_decode_with_missing_shards() {
        let coder = ErasureCoder::new(&test_config()).unwrap();
        let original = b"Testing reconstruction with missing shards - this should work with 4 of 6.";

        let shards = coder.encode_chunk(original).unwrap();

        // Drop two shards — exactly the loss a 2-parity layout tolerates.
        let mut slots = as_options(&shards);
        slots[1] = None; // a data shard
        slots[4] = None; // a parity shard

        let recovered = coder.decode_chunk(&mut slots, original.len()).unwrap();
        assert_eq!(&recovered, original);
    }

    #[test]
    fn test_decode_with_too_many_missing() {
        let coder = ErasureCoder::new(&test_config()).unwrap();
        let original = b"This should fail with 3 missing shards.";

        let shards = coder.encode_chunk(original).unwrap();

        // Three losses exceed the parity budget of two.
        let mut slots = as_options(&shards);
        slots[0] = None;
        slots[2] = None;
        slots[5] = None;

        assert!(coder.decode_chunk(&mut slots, original.len()).is_err());
    }

    #[test]
    fn test_encode_large_data() {
        let coder = ErasureCoder::new(&test_config()).unwrap();
        // 1 MB of patterned data.
        let original: Vec<u8> = (0..1_000_000).map(|i| (i % 256) as u8).collect();

        let shards = coder.encode_chunk(&original).unwrap();
        assert_eq!(shards.len(), 6);

        // Shard size is the input split over 4 data shards, rounded up.
        assert_eq!(shards[0].len(), (original.len() + 3) / 4);

        // Verify roundtrip.
        let mut slots = as_options(&shards);
        let recovered = coder.decode_chunk(&mut slots, original.len()).unwrap();
        assert_eq!(recovered, original);
    }

    #[test]
    fn test_verify_shards() {
        let coder = ErasureCoder::new(&test_config()).unwrap();
        let shards = coder.encode_chunk(b"Verify test data").unwrap();
        assert!(coder.verify(&shards).unwrap());

        // Flip bits in one shard: verification must notice.
        let mut corrupted = shards.clone();
        corrupted[0][0] ^= 0xFF;
        assert!(!coder.verify(&corrupted).unwrap());
    }

    #[test]
    fn test_small_config() {
        // Smallest useful layout: 2 data + 1 parity.
        let config = ErasureConfig {
            data_shards: 2,
            parity_shards: 1,
            chunk_size_bytes: 1024,
        };
        let coder = ErasureCoder::new(&config).unwrap();
        let original = b"Small config test";

        let shards = coder.encode_chunk(original).unwrap();
        assert_eq!(shards.len(), 3);

        // Survives the loss of a single shard.
        let mut slots = as_options(&shards);
        slots[0] = None;

        let recovered = coder.decode_chunk(&mut slots, original.len()).unwrap();
        assert_eq!(&recovered, original);
    }
}
|
||||||
92
rust/src/cluster/healing.rs
Normal file
92
rust/src/cluster/healing.rs
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use super::coordinator::DistributedStore;
|
||||||
|
use super::state::ClusterState;
|
||||||
|
|
||||||
|
/// Background healing service that scans for under-replicated shards
|
||||||
|
/// and reconstructs them.
|
||||||
|
pub struct HealingService {
|
||||||
|
state: Arc<ClusterState>,
|
||||||
|
scan_interval: Duration,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl HealingService {
|
||||||
|
pub fn new(state: Arc<ClusterState>, scan_interval_hours: u64) -> Self {
|
||||||
|
Self {
|
||||||
|
state,
|
||||||
|
scan_interval: Duration::from_secs(scan_interval_hours * 3600),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run the healing loop as a background task.
|
||||||
|
pub async fn run(&self, mut shutdown: tokio::sync::watch::Receiver<bool>) {
|
||||||
|
let mut interval = tokio::time::interval(self.scan_interval);
|
||||||
|
|
||||||
|
// Skip the first immediate tick
|
||||||
|
interval.tick().await;
|
||||||
|
|
||||||
|
loop {
|
||||||
|
tokio::select! {
|
||||||
|
_ = interval.tick() => {
|
||||||
|
tracing::info!("Starting healing scan");
|
||||||
|
match self.heal_scan().await {
|
||||||
|
Ok(stats) => {
|
||||||
|
tracing::info!(
|
||||||
|
checked = stats.shards_checked,
|
||||||
|
healed = stats.shards_healed,
|
||||||
|
errors = stats.errors,
|
||||||
|
"Healing scan completed"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::error!("Healing scan failed: {}", e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ = shutdown.changed() => {
|
||||||
|
tracing::info!("Healing service shutting down");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Scan for offline nodes and identify objects that need healing.
|
||||||
|
async fn heal_scan(&self) -> Result<HealStats> {
|
||||||
|
let mut stats = HealStats::default();
|
||||||
|
|
||||||
|
let offline_nodes = self.state.offline_nodes().await;
|
||||||
|
if offline_nodes.is_empty() {
|
||||||
|
tracing::debug!("No offline nodes, skipping heal scan");
|
||||||
|
return Ok(stats);
|
||||||
|
}
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
"Found {} offline nodes, scanning for affected shards",
|
||||||
|
offline_nodes.len()
|
||||||
|
);
|
||||||
|
|
||||||
|
// Check that we have majority before healing
|
||||||
|
// (prevents healing during split-brain)
|
||||||
|
if !self.state.has_majority().await {
|
||||||
|
tracing::warn!("No majority quorum, skipping heal to prevent split-brain");
|
||||||
|
return Ok(stats);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Iterate all manifests, find shards on offline nodes,
|
||||||
|
// reconstruct from remaining shards and place on healthy nodes.
|
||||||
|
// This requires access to the DistributedStore and manifest listing
|
||||||
|
// which will be wired in when the full healing pipeline is implemented.
|
||||||
|
|
||||||
|
Ok(stats)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Counters produced by a single healing scan.
#[derive(Debug, Default)]
pub struct HealStats {
    /// Shards examined during the scan.
    pub shards_checked: u64,
    /// Shards successfully reconstructed and re-placed.
    pub shards_healed: u64,
    /// Failures encountered while healing.
    pub errors: u64,
}
|
||||||
184
rust/src/cluster/membership.rs
Normal file
184
rust/src/cluster/membership.rs
Normal file
@@ -0,0 +1,184 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use std::net::SocketAddr;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
|
use super::protocol::{
|
||||||
|
ClusterRequest, ClusterResponse, HeartbeatMessage, JoinRequestMessage, NodeInfo,
|
||||||
|
};
|
||||||
|
use super::quic_transport::QuicTransport;
|
||||||
|
use super::state::ClusterState;
|
||||||
|
|
||||||
|
/// Manages cluster membership: heartbeating, joining, failure detection.
|
||||||
|
pub struct MembershipManager {
|
||||||
|
state: Arc<ClusterState>,
|
||||||
|
transport: Arc<QuicTransport>,
|
||||||
|
heartbeat_interval: Duration,
|
||||||
|
local_node_info: NodeInfo,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MembershipManager {
|
||||||
|
pub fn new(
|
||||||
|
state: Arc<ClusterState>,
|
||||||
|
transport: Arc<QuicTransport>,
|
||||||
|
heartbeat_interval_ms: u64,
|
||||||
|
local_node_info: NodeInfo,
|
||||||
|
) -> Self {
|
||||||
|
Self {
|
||||||
|
state,
|
||||||
|
transport,
|
||||||
|
heartbeat_interval: Duration::from_millis(heartbeat_interval_ms),
|
||||||
|
local_node_info,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Join the cluster by contacting seed nodes.
|
||||||
|
/// Sends a JoinRequest to each seed node until one accepts.
|
||||||
|
pub async fn join_cluster(&self, seed_nodes: &[String]) -> Result<()> {
|
||||||
|
if seed_nodes.is_empty() {
|
||||||
|
tracing::info!("No seed nodes configured, starting as initial cluster node");
|
||||||
|
self.state.add_node(self.local_node_info.clone()).await;
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
for seed in seed_nodes {
|
||||||
|
let addr: SocketAddr = match seed.parse() {
|
||||||
|
Ok(a) => a,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!("Invalid seed node address '{}': {}", seed, e);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
tracing::info!("Attempting to join cluster via seed node {}", seed);
|
||||||
|
|
||||||
|
match self.try_join(addr).await {
|
||||||
|
Ok(()) => {
|
||||||
|
tracing::info!("Successfully joined cluster via {}", seed);
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
tracing::warn!("Failed to join via {}: {}", seed, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If no seed responded, start as a new cluster
|
||||||
|
tracing::info!("Could not reach any seed nodes, starting as initial cluster node");
|
||||||
|
self.state.add_node(self.local_node_info.clone()).await;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn try_join(&self, addr: SocketAddr) -> Result<()> {
|
||||||
|
let conn = self
|
||||||
|
.transport
|
||||||
|
.get_connection("seed", addr)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let request = ClusterRequest::JoinRequest(JoinRequestMessage {
|
||||||
|
node_info: self.local_node_info.clone(),
|
||||||
|
});
|
||||||
|
|
||||||
|
let response = self.transport.send_request(&conn, &request).await?;
|
||||||
|
|
||||||
|
match response {
|
||||||
|
ClusterResponse::JoinResponse(join_resp) => {
|
||||||
|
if join_resp.accepted {
|
||||||
|
if let Some(topology) = &join_resp.topology {
|
||||||
|
self.state.apply_topology(topology).await;
|
||||||
|
// Also register self
|
||||||
|
self.state.add_node(self.local_node_info.clone()).await;
|
||||||
|
tracing::info!(
|
||||||
|
"Applied cluster topology (version {}, {} nodes, {} erasure sets)",
|
||||||
|
topology.version,
|
||||||
|
topology.nodes.len(),
|
||||||
|
topology.erasure_sets.len(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
} else {
|
||||||
|
anyhow::bail!(
|
||||||
|
"Join rejected: {}",
|
||||||
|
join_resp.error.unwrap_or_default()
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ClusterResponse::Error(e) => {
|
||||||
|
anyhow::bail!("Join error: {} - {}", e.code, e.message)
|
||||||
|
}
|
||||||
|
_ => anyhow::bail!("Unexpected response to join request"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run the heartbeat loop. Sends heartbeats to all peers periodically.
|
||||||
|
pub async fn heartbeat_loop(self: Arc<Self>, mut shutdown: tokio::sync::watch::Receiver<bool>) {
|
||||||
|
let mut interval = tokio::time::interval(self.heartbeat_interval);
|
||||||
|
|
||||||
|
loop {
|
||||||
|
tokio::select! {
|
||||||
|
_ = interval.tick() => {
|
||||||
|
self.send_heartbeats().await;
|
||||||
|
}
|
||||||
|
_ = shutdown.changed() => break,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn send_heartbeats(&self) {
|
||||||
|
let peers = self.state.online_peers().await;
|
||||||
|
let topology_version = self.state.version().await;
|
||||||
|
let mut responded = Vec::new();
|
||||||
|
|
||||||
|
for peer in &peers {
|
||||||
|
let addr: SocketAddr = match peer.quic_addr.parse() {
|
||||||
|
Ok(a) => a,
|
||||||
|
Err(_) => continue,
|
||||||
|
};
|
||||||
|
|
||||||
|
let heartbeat = ClusterRequest::Heartbeat(HeartbeatMessage {
|
||||||
|
node_id: self.local_node_info.node_id.clone(),
|
||||||
|
timestamp: chrono::Utc::now().to_rfc3339(),
|
||||||
|
drive_states: Vec::new(), // TODO: populate from DriveManager
|
||||||
|
topology_version,
|
||||||
|
});
|
||||||
|
|
||||||
|
match tokio::time::timeout(
|
||||||
|
Duration::from_secs(5),
|
||||||
|
self.send_heartbeat_to_peer(&peer.node_id, addr, &heartbeat),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(Ok(())) => {
|
||||||
|
responded.push(peer.node_id.clone());
|
||||||
|
}
|
||||||
|
Ok(Err(e)) => {
|
||||||
|
tracing::debug!(
|
||||||
|
peer = %peer.node_id,
|
||||||
|
error = %e,
|
||||||
|
"Heartbeat failed"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
Err(_) => {
|
||||||
|
tracing::debug!(peer = %peer.node_id, "Heartbeat timed out");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update state based on responses
|
||||||
|
let status_changes = self.state.tick_heartbeats(&responded).await;
|
||||||
|
for (node_id, status) in &status_changes {
|
||||||
|
tracing::info!(node = %node_id, status = ?status, "Node status changed");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn send_heartbeat_to_peer(
|
||||||
|
&self,
|
||||||
|
node_id: &str,
|
||||||
|
addr: SocketAddr,
|
||||||
|
heartbeat: &ClusterRequest,
|
||||||
|
) -> Result<()> {
|
||||||
|
let conn = self.transport.get_connection(node_id, addr).await?;
|
||||||
|
let _response = self.transport.send_request(&conn, heartbeat).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
85
rust/src/cluster/metadata.rs
Normal file
85
rust/src/cluster/metadata.rs
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
/// Full manifest describing how an object is stored across erasure-coded shards.
///
/// NOTE: manifests travel inside `ClusterRequest` messages, which are
/// bincode-encoded (see protocol.rs) — bincode does not emit field names,
/// so field ORDER here is part of the wire format. Append new fields;
/// never reorder existing ones.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ObjectManifest {
    /// Bucket name
    pub bucket: String,
    /// Object key
    pub key: String,
    /// Unique version ID for this write
    pub version_id: String,
    /// Total object size in bytes
    pub size: u64,
    /// MD5 hex digest of the complete object
    pub content_md5: String,
    /// Content type
    pub content_type: String,
    /// User metadata (x-amz-meta-*, content-type, etc.)
    pub metadata: HashMap<String, String>,
    /// When the object was created
    pub created_at: String,
    /// Last modified timestamp
    pub last_modified: String,
    /// Number of data shards used
    pub data_shards: usize,
    /// Number of parity shards used
    pub parity_shards: usize,
    /// Chunk size in bytes (last chunk may be smaller)
    pub chunk_size: usize,
    /// Per-chunk shard placement info
    pub chunks: Vec<ChunkManifest>,
}

/// Describes the shards for a single chunk of an object.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ChunkManifest {
    /// Index of this chunk (0-based)
    pub chunk_index: u32,
    /// Actual data size of this chunk (before erasure coding)
    pub data_size: usize,
    /// Where each shard was placed
    pub shard_placements: Vec<ShardPlacement>,
}

/// Describes where a specific shard is stored.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct ShardPlacement {
    /// Shard index within the erasure set (0..data_shards+parity_shards)
    pub shard_index: u32,
    /// Node that holds this shard
    pub node_id: String,
    /// Drive ID on that node
    pub drive_id: String,
    /// CRC32C checksum of the shard data
    pub checksum: u32,
    /// Size of the shard data in bytes
    pub shard_size: usize,
}

/// Manifest for a multipart upload in progress.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct MultipartUploadManifest {
    // Upload identifier handed back to the S3 client at initiation.
    pub upload_id: String,
    pub bucket: String,
    pub key: String,
    // Timestamp when the upload was initiated.
    pub initiated: String,
    // User metadata captured at initiation time.
    pub metadata: HashMap<String, String>,
    /// Per-part manifests, keyed by part number.
    pub parts: HashMap<u32, PartManifest>,
}

/// Manifest for a single part of a multipart upload.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct PartManifest {
    // Part number within the upload.
    pub part_number: u32,
    // Part size in bytes.
    pub size: u64,
    // MD5 hex digest of this part's data.
    pub md5: String,
    // Erasure-coded chunk placements for this part, same shape as an
    // object's chunks.
    pub chunks: Vec<ChunkManifest>,
}
|
||||||
12
rust/src/cluster/mod.rs
Normal file
12
rust/src/cluster/mod.rs
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
//! Clustered storage backend: node membership, topology state, object
//! placement, erasure coding, shard storage, healing scaffolding, and the
//! QUIC wire protocol that ties the nodes together.

pub mod config;
pub mod coordinator;
pub mod drive_manager;
pub mod erasure;
pub mod healing;
pub mod membership;
pub mod metadata;
pub mod placement;
pub mod protocol;
pub mod quic_transport;
pub mod shard_store;
pub mod state;
|
||||||
140
rust/src/cluster/placement.rs
Normal file
140
rust/src/cluster/placement.rs
Normal file
@@ -0,0 +1,140 @@
|
|||||||
|
use xxhash_rust::xxh64::xxh64;
|
||||||
|
|
||||||
|
/// Determines which erasure set an object belongs to, based on consistent hashing.
|
||||||
|
///
|
||||||
|
/// Uses xxhash64 of "{bucket}/{key}" to deterministically map objects to erasure sets.
|
||||||
|
/// This is stateless — any node can independently compute the placement.
|
||||||
|
pub fn erasure_set_for_object(bucket: &str, key: &str, num_erasure_sets: u32) -> u32 {
|
||||||
|
if num_erasure_sets == 0 {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
let hash_input = format!("{}/{}", bucket, key);
|
||||||
|
let hash = xxh64(hash_input.as_bytes(), 0);
|
||||||
|
(hash % num_erasure_sets as u64) as u32
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Represents a drive location within the cluster topology.
#[derive(Debug, Clone)]
pub struct DriveLocation {
    pub node_id: String,
    pub drive_index: u32,
}

/// An erasure set: a fixed group of drives that together store one complete
/// set of shards for any object placed on them.
#[derive(Debug, Clone)]
pub struct ErasureSet {
    pub set_id: u32,
    /// Ordered drives: index = shard_index
    pub drives: Vec<DriveLocation>,
}

/// Form erasure sets from the available drives across all nodes.
///
/// Interleaves drives from different nodes for fault isolation:
/// e.g., with 3 nodes x 4 drives and total_shards=6:
/// Set 0: N0-D0, N1-D0, N2-D0, N0-D1, N1-D1, N2-D1
/// Set 1: N0-D2, N1-D2, N2-D2, N0-D3, N1-D3, N2-D3
///
/// Drives left over after the last full set are unused.
pub fn form_erasure_sets(
    nodes: &[(String, u32)], // (node_id, drive_count)
    total_shards: usize,
) -> Vec<ErasureSet> {
    // Enumerate drives round-robin across nodes: every node's drive 0
    // first, then every node's drive 1, and so on. Nodes with fewer drives
    // simply stop contributing once their count is exhausted.
    let deepest = nodes.iter().map(|&(_, count)| count).max().unwrap_or(0);
    let interleaved: Vec<DriveLocation> = (0..deepest)
        .flat_map(|drive_index| {
            nodes.iter().filter_map(move |(node_id, drive_count)| {
                (drive_index < *drive_count).then(|| DriveLocation {
                    node_id: node_id.clone(),
                    drive_index,
                })
            })
        })
        .collect();

    // Partition the interleaved list into consecutive groups of exactly
    // `total_shards` drives; the incomplete tail (if any) is discarded.
    interleaved
        .chunks_exact(total_shards)
        .enumerate()
        .map(|(set_id, drives)| ErasureSet {
            set_id: set_id as u32,
            drives: drives.to_vec(),
        })
        .collect()
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // Same (bucket, key, set count) must always map to the same set —
    // placement is stateless and recomputed independently by every node.
    #[test]
    fn test_erasure_set_assignment_deterministic() {
        let set_a = erasure_set_for_object("mybucket", "mykey", 4);
        let set_b = erasure_set_for_object("mybucket", "mykey", 4);
        assert_eq!(set_a, set_b);
    }

    #[test]
    fn test_erasure_set_distribution() {
        // Check that objects are distributed across sets
        let num_sets = 4u32;
        let mut counts = vec![0u32; num_sets as usize];
        for i in 0..1000 {
            let key = format!("key-{}", i);
            let set = erasure_set_for_object("bucket", &key, num_sets);
            assert!(set < num_sets);
            counts[set as usize] += 1;
        }
        // Each set should have some objects (not all in one set)
        // (a perfectly uniform split would give 250 per set; >100 is a
        // loose bound that tolerates hash variance)
        for count in &counts {
            assert!(*count > 100, "Expected >100, got {}", count);
        }
    }

    #[test]
    fn test_form_erasure_sets_3x4() {
        // 3 nodes, 4 drives each, 6 shards per set => 2 sets
        let nodes = vec![
            ("node1".to_string(), 4),
            ("node2".to_string(), 4),
            ("node3".to_string(), 4),
        ];
        let sets = form_erasure_sets(&nodes, 6);
        assert_eq!(sets.len(), 2);

        // Set 0 should interleave across nodes
        let set0_nodes: Vec<&str> = sets[0].drives.iter().map(|d| d.node_id.as_str()).collect();
        assert_eq!(set0_nodes, vec!["node1", "node2", "node3", "node1", "node2", "node3"]);

        // Set 1 should also interleave
        let set1_nodes: Vec<&str> = sets[1].drives.iter().map(|d| d.node_id.as_str()).collect();
        assert_eq!(set1_nodes, vec!["node1", "node2", "node3", "node1", "node2", "node3"]);

        // Drive indices should be different between sets
        let set0_drives: Vec<u32> = sets[0].drives.iter().map(|d| d.drive_index).collect();
        let set1_drives: Vec<u32> = sets[1].drives.iter().map(|d| d.drive_index).collect();
        assert_eq!(set0_drives, vec![0, 0, 0, 1, 1, 1]);
        assert_eq!(set1_drives, vec![2, 2, 2, 3, 3, 3]);
    }

    #[test]
    fn test_form_erasure_sets_remainder() {
        // 2 nodes, 3 drives each, 4 shards => 1 set (2 drives left over)
        let nodes = vec![
            ("a".to_string(), 3),
            ("b".to_string(), 3),
        ];
        let sets = form_erasure_sets(&nodes, 4);
        assert_eq!(sets.len(), 1);
        assert_eq!(sets[0].drives.len(), 4);
    }
}
|
||||||
384
rust/src/cluster/protocol.rs
Normal file
384
rust/src/cluster/protocol.rs
Normal file
@@ -0,0 +1,384 @@
|
|||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
use super::metadata::ObjectManifest;
|
||||||
|
|
||||||
|
/// All inter-node cluster messages, serialized with bincode over QUIC streams.
///
/// Each message type gets its own bidirectional QUIC stream.
/// For shard data transfers, the header is sent first (bincode),
/// then raw shard bytes follow on the same stream.
///
/// NOTE: bincode identifies enum variants by declaration index, so the
/// order of variants below is part of the wire format — append new
/// variants at the end, never reorder or remove.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ClusterRequest {
    // ============================
    // Shard operations
    // ============================

    /// Write a shard to a specific drive on the target node.
    /// Shard data follows after this header on the same stream.
    ShardWrite(ShardWriteRequest),

    /// Read a shard from the target node.
    ShardRead(ShardReadRequest),

    /// Delete a shard from the target node.
    ShardDelete(ShardDeleteRequest),

    /// Check if a shard exists and get its metadata.
    ShardHead(ShardHeadRequest),

    // ============================
    // Manifest operations
    // ============================

    /// Store an object manifest on the target node.
    ManifestWrite(ManifestWriteRequest),

    /// Retrieve an object manifest from the target node.
    ManifestRead(ManifestReadRequest),

    /// Delete an object manifest from the target node.
    ManifestDelete(ManifestDeleteRequest),

    /// List all manifests for a bucket on the target node.
    ManifestList(ManifestListRequest),

    // ============================
    // Cluster management
    // ============================

    /// Periodic heartbeat.
    Heartbeat(HeartbeatMessage),

    /// Request to join the cluster.
    JoinRequest(JoinRequestMessage),

    /// Synchronize cluster topology.
    TopologySync(TopologySyncMessage),

    // ============================
    // Healing
    // ============================

    /// Request a shard to be reconstructed and placed on a target drive.
    HealRequest(HealRequestMessage),
}

/// Responses to cluster requests.
///
/// NOTE: variant order is wire format, same as `ClusterRequest` — append
/// only.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ClusterResponse {
    // Shard ops
    ShardWriteAck(ShardWriteAck),
    ShardReadResponse(ShardReadResponse),
    ShardDeleteAck(ShardDeleteAck),
    ShardHeadResponse(ShardHeadResponse),

    // Manifest ops
    ManifestWriteAck(ManifestWriteAck),
    ManifestReadResponse(ManifestReadResponse),
    ManifestDeleteAck(ManifestDeleteAck),
    ManifestListResponse(ManifestListResponse),

    // Cluster mgmt
    HeartbeatAck(HeartbeatAckMessage),
    JoinResponse(JoinResponseMessage),
    TopologySyncAck(TopologySyncAckMessage),

    // Healing
    HealResponse(HealResponseMessage),

    // Error — catch-all failure carrying a code and human-readable message.
    Error(ErrorResponse),
}
|
||||||
|
|
||||||
|
// ============================
// Shard operation messages
// ============================
//
// NOTE: bincode does not serialize field names, so field order in every
// struct below is part of the wire format — append, don't reorder.

/// Header for a shard write; exactly `shard_data_length` raw shard bytes
/// follow this header on the same QUIC stream.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardWriteRequest {
    pub request_id: String,
    pub bucket: String,
    pub key: String,
    pub chunk_index: u32,
    pub shard_index: u32,
    pub shard_data_length: u64,
    pub checksum: u32, // crc32c of shard data
    pub object_metadata: HashMap<String, String>,
}

/// Acknowledgement for a shard write.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardWriteAck {
    pub request_id: String,
    pub success: bool,
    pub error: Option<String>,
}

/// Identifies one shard of one chunk of one object to read.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardReadRequest {
    pub request_id: String,
    pub bucket: String,
    pub key: String,
    pub chunk_index: u32,
    pub shard_index: u32,
}

/// Header for a shard read result; when `found`, `shard_data_length` raw
/// bytes follow on the stream.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardReadResponse {
    pub request_id: String,
    pub found: bool,
    pub shard_data_length: u64,
    pub checksum: u32,
    // Shard data follows on the stream after this header
}

/// Identifies one shard to delete.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardDeleteRequest {
    pub request_id: String,
    pub bucket: String,
    pub key: String,
    pub chunk_index: u32,
    pub shard_index: u32,
}

/// Acknowledgement for a shard delete.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardDeleteAck {
    pub request_id: String,
    pub success: bool,
}

/// Existence/metadata probe for one shard.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardHeadRequest {
    pub request_id: String,
    pub bucket: String,
    pub key: String,
    pub chunk_index: u32,
    pub shard_index: u32,
}

/// Result of a shard head probe. `data_size`/`checksum` are only
/// meaningful when `found` is true.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardHeadResponse {
    pub request_id: String,
    pub found: bool,
    pub data_size: u64,
    pub checksum: u32,
}

// ============================
// Manifest operation messages
// ============================

/// Store an object manifest on the receiving node.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestWriteRequest {
    pub request_id: String,
    pub manifest: ObjectManifest,
}

/// Acknowledgement for a manifest write.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestWriteAck {
    pub request_id: String,
    pub success: bool,
    pub error: Option<String>,
}

/// Fetch the manifest for one (bucket, key).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestReadRequest {
    pub request_id: String,
    pub bucket: String,
    pub key: String,
}

/// Manifest lookup result; `manifest` is Some iff `found`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestReadResponse {
    pub request_id: String,
    pub found: bool,
    pub manifest: Option<ObjectManifest>,
}

/// Delete the manifest for one (bucket, key).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestDeleteRequest {
    pub request_id: String,
    pub bucket: String,
    pub key: String,
}

/// Acknowledgement for a manifest delete.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestDeleteAck {
    pub request_id: String,
    pub success: bool,
}

/// List manifests in a bucket, optionally filtered by key prefix.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestListRequest {
    pub request_id: String,
    pub bucket: String,
    pub prefix: Option<String>,
}

/// Full manifests matching a list request.
/// NOTE(review): returns whole manifests with no pagination — confirm this
/// is acceptable for large buckets.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ManifestListResponse {
    pub request_id: String,
    pub manifests: Vec<ObjectManifest>,
}

// ============================
// Cluster management messages
// ============================

/// Health of a single drive, as reported in heartbeats.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DriveStateInfo {
    pub drive_index: u32,
    pub status: String, // "online", "degraded", "offline", "healing"
}

/// Periodic liveness report from one node, carrying its drive health and
/// the topology version it currently holds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HeartbeatMessage {
    pub node_id: String,
    pub timestamp: String,
    pub drive_states: Vec<DriveStateInfo>,
    pub topology_version: u64,
}

/// Reply to a heartbeat; lets the sender compare topology versions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HeartbeatAckMessage {
    pub node_id: String,
    pub timestamp: String,
    pub topology_version: u64,
}

/// Static description of one cluster member.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NodeInfo {
    pub node_id: String,
    // Address peers should dial for cluster (QUIC) traffic.
    pub quic_addr: String,
    // Address clients should use for the node's S3 endpoint.
    pub s3_addr: String,
    pub drive_count: u32,
    pub status: String,
    // Software version string of the node.
    pub version: String,
}

/// Sent by a node that wants to join the cluster.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JoinRequestMessage {
    pub node_info: NodeInfo,
}

/// Complete, versioned view of the cluster: members, erasure-set layout,
/// and the coding parameters in effect.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClusterTopology {
    // Monotonic version; higher wins during sync.
    pub version: u64,
    pub cluster_id: String,
    pub nodes: Vec<NodeInfo>,
    pub erasure_sets: Vec<ErasureSetInfo>,
    pub data_shards: usize,
    pub parity_shards: usize,
}

/// Wire form of one erasure set (see placement.rs).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErasureSetInfo {
    pub set_id: u32,
    // Ordered drives: position = shard index.
    pub drives: Vec<DriveLocationInfo>,
}

/// Wire form of a drive location within the topology.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DriveLocationInfo {
    pub node_id: String,
    pub drive_index: u32,
}

/// Answer to a join request; carries the topology on acceptance.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct JoinResponseMessage {
    pub accepted: bool,
    pub topology: Option<ClusterTopology>,
    pub error: Option<String>,
}

/// Push a topology to a peer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopologySyncMessage {
    pub topology: ClusterTopology,
}

/// Peer's verdict on a pushed topology, with the version it holds.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TopologySyncAckMessage {
    pub accepted: bool,
    pub current_version: u64,
}

// ============================
// Healing messages
// ============================

/// Ask a node to reconstruct one shard and place it on a specific drive.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealRequestMessage {
    pub request_id: String,
    pub bucket: String,
    pub key: String,
    pub chunk_index: u32,
    pub shard_index: u32,
    pub target_node_id: String,
    pub target_drive_index: u32,
}

/// Outcome of a heal request.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HealResponseMessage {
    pub request_id: String,
    pub success: bool,
    pub error: Option<String>,
}

// ============================
// Error response
// ============================

/// Generic failure reply usable for any request type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ErrorResponse {
    pub request_id: String,
    pub code: String,
    pub message: String,
}
|
||||||
|
|
||||||
|
// ============================
|
||||||
|
// Wire format helpers
|
||||||
|
// ============================
|
||||||
|
|
||||||
|
/// Serialize a request to bincode bytes with a 4-byte length prefix.
|
||||||
|
pub fn encode_request(req: &ClusterRequest) -> anyhow::Result<Vec<u8>> {
|
||||||
|
let payload = bincode::serialize(req)?;
|
||||||
|
let mut buf = Vec::with_capacity(4 + payload.len());
|
||||||
|
buf.extend_from_slice(&(payload.len() as u32).to_le_bytes());
|
||||||
|
buf.extend_from_slice(&payload);
|
||||||
|
Ok(buf)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Serialize a response to bincode bytes with a 4-byte length prefix.
|
||||||
|
pub fn encode_response(resp: &ClusterResponse) -> anyhow::Result<Vec<u8>> {
|
||||||
|
let payload = bincode::serialize(resp)?;
|
||||||
|
let mut buf = Vec::with_capacity(4 + payload.len());
|
||||||
|
buf.extend_from_slice(&(payload.len() as u32).to_le_bytes());
|
||||||
|
buf.extend_from_slice(&payload);
|
||||||
|
Ok(buf)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read a length-prefixed bincode message from raw bytes.
|
||||||
|
/// Returns (decoded message, bytes consumed).
|
||||||
|
pub fn decode_request(data: &[u8]) -> anyhow::Result<(ClusterRequest, usize)> {
|
||||||
|
if data.len() < 4 {
|
||||||
|
anyhow::bail!("Not enough data for length prefix");
|
||||||
|
}
|
||||||
|
let len = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;
|
||||||
|
if data.len() < 4 + len {
|
||||||
|
anyhow::bail!("Not enough data for message body");
|
||||||
|
}
|
||||||
|
let msg: ClusterRequest = bincode::deserialize(&data[4..4 + len])?;
|
||||||
|
Ok((msg, 4 + len))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read a length-prefixed bincode response from raw bytes.
|
||||||
|
pub fn decode_response(data: &[u8]) -> anyhow::Result<(ClusterResponse, usize)> {
|
||||||
|
if data.len() < 4 {
|
||||||
|
anyhow::bail!("Not enough data for length prefix");
|
||||||
|
}
|
||||||
|
let len = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;
|
||||||
|
if data.len() < 4 + len {
|
||||||
|
anyhow::bail!("Not enough data for message body");
|
||||||
|
}
|
||||||
|
let msg: ClusterResponse = bincode::deserialize(&data[4..4 + len])?;
|
||||||
|
Ok((msg, 4 + len))
|
||||||
|
}
|
||||||
455
rust/src/cluster/quic_transport.rs
Normal file
455
rust/src/cluster/quic_transport.rs
Normal file
@@ -0,0 +1,455 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use dashmap::DashMap;
|
||||||
|
use quinn::{ClientConfig, Endpoint, ServerConfig as QuinnServerConfig};
|
||||||
|
use rustls::pki_types::{CertificateDer, PrivateKeyDer, PrivatePkcs8KeyDer};
|
||||||
|
use std::net::SocketAddr;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||||
|
|
||||||
|
use super::protocol::{
|
||||||
|
self, ClusterRequest, ClusterResponse, ShardReadResponse, ShardWriteAck, ShardWriteRequest,
|
||||||
|
};
|
||||||
|
use super::shard_store::{ShardId, ShardStore};
|
||||||
|
|
||||||
|
/// QUIC transport layer for inter-node communication.
///
/// Manages a QUIC endpoint for both sending and receiving cluster messages.
/// Uses self-signed TLS certificates generated at init time.
/// Maintains a connection pool to peer nodes.
pub struct QuicTransport {
    // Single endpoint serving both roles: accepting inbound peers and
    // dialing outbound connections.
    endpoint: Endpoint,
    /// Cached connections to peer nodes: node_id -> Connection
    connections: Arc<DashMap<String, quinn::Connection>>,
    // This node's identity. Not read by the methods visible here —
    // NOTE(review): confirm it is consumed by the stream/request handlers.
    local_node_id: String,
}
|
||||||
|
|
||||||
|
impl QuicTransport {
|
||||||
|
/// Create a new QUIC transport, binding to the specified address.
|
||||||
|
pub async fn new(bind_addr: SocketAddr, local_node_id: String) -> Result<Self> {
|
||||||
|
let (server_config, client_config) = Self::generate_tls_configs()?;
|
||||||
|
|
||||||
|
let endpoint = Endpoint::server(server_config, bind_addr)?;
|
||||||
|
|
||||||
|
// Also configure the endpoint for client connections
|
||||||
|
let mut endpoint_client = endpoint.clone();
|
||||||
|
endpoint_client.set_default_client_config(client_config);
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
endpoint,
|
||||||
|
connections: Arc::new(DashMap::new()),
|
||||||
|
local_node_id,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get or establish a connection to a peer node.
///
/// Connections are cached per node id and reused while still open
/// (`close_reason()` returns `None`); closed entries are evicted and
/// redialed.
pub async fn get_connection(
    &self,
    node_id: &str,
    addr: SocketAddr,
) -> Result<quinn::Connection> {
    // Check cache first
    if let Some(conn) = self.connections.get(node_id) {
        if conn.close_reason().is_none() {
            return Ok(conn.clone());
        }
        // Connection is closed, remove from cache
        // The DashMap read guard must be dropped BEFORE `remove`, or the
        // removal of the same key would deadlock on the map's shard lock.
        drop(conn);
        self.connections.remove(node_id);
    }

    // Establish new connection
    // "smartstorage" is the TLS server name offered during the handshake —
    // NOTE(review): presumably matches the name in the self-signed cert
    // produced by `generate_tls_configs`; confirm.
    // Two callers may race past the cache check and both dial; the later
    // `insert` wins, leaving at most one redundant connection.
    let conn = self
        .endpoint
        .connect(addr, "smartstorage")?
        .await?;

    self.connections
        .insert(node_id.to_string(), conn.clone());

    Ok(conn)
}
|
||||||
|
|
||||||
|
/// Send a cluster request and receive the response.
|
||||||
|
pub async fn send_request(
|
||||||
|
&self,
|
||||||
|
conn: &quinn::Connection,
|
||||||
|
request: &ClusterRequest,
|
||||||
|
) -> Result<ClusterResponse> {
|
||||||
|
let (mut send, mut recv) = conn.open_bi().await?;
|
||||||
|
|
||||||
|
// Encode and send request
|
||||||
|
let encoded = protocol::encode_request(request)?;
|
||||||
|
send.write_all(&encoded).await?;
|
||||||
|
send.finish()?;
|
||||||
|
|
||||||
|
// Read response
|
||||||
|
let response_data = recv.read_to_end(64 * 1024 * 1024).await?; // 64MB max
|
||||||
|
let (response, _) = protocol::decode_response(&response_data)?;
|
||||||
|
|
||||||
|
Ok(response)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Send a shard write request with streaming data.
|
||||||
|
///
|
||||||
|
/// Sends the request header first, then streams the shard data bytes.
|
||||||
|
pub async fn send_shard_write(
|
||||||
|
&self,
|
||||||
|
conn: &quinn::Connection,
|
||||||
|
request: ShardWriteRequest,
|
||||||
|
shard_data: &[u8],
|
||||||
|
) -> Result<ShardWriteAck> {
|
||||||
|
let (mut send, mut recv) = conn.open_bi().await?;
|
||||||
|
|
||||||
|
// Send request header
|
||||||
|
let encoded = protocol::encode_request(&ClusterRequest::ShardWrite(request))?;
|
||||||
|
send.write_all(&encoded).await?;
|
||||||
|
|
||||||
|
// Stream shard data
|
||||||
|
send.write_all(shard_data).await?;
|
||||||
|
send.finish()?;
|
||||||
|
|
||||||
|
// Read ack
|
||||||
|
let response_data = recv.read_to_end(1024).await?;
|
||||||
|
let (response, _) = protocol::decode_response(&response_data)?;
|
||||||
|
|
||||||
|
match response {
|
||||||
|
ClusterResponse::ShardWriteAck(ack) => Ok(ack),
|
||||||
|
ClusterResponse::Error(e) => {
|
||||||
|
anyhow::bail!("Shard write error: {} - {}", e.code, e.message)
|
||||||
|
}
|
||||||
|
other => anyhow::bail!("Unexpected response to shard write: {:?}", other),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Send a shard read request and receive the shard data.
///
/// Returns (shard_data, checksum).
///
/// Unlike `send_request`, the response is framed manually: a 4-byte LE
/// length prefix, the bincode `ClusterResponse` header, and — when the
/// shard was found — exactly `shard_data_length` raw bytes following on
/// the same stream.
pub async fn send_shard_read(
    &self,
    conn: &quinn::Connection,
    request: &ClusterRequest,
) -> Result<Option<(Vec<u8>, u32)>> {
    let (mut send, mut recv) = conn.open_bi().await?;

    // Send request
    let encoded = protocol::encode_request(request)?;
    send.write_all(&encoded).await?;
    send.finish()?;

    // Read response header
    let mut header_len_buf = [0u8; 4];
    recv.read_exact(&mut header_len_buf).await?;
    let header_len = u32::from_le_bytes(header_len_buf) as usize;

    let mut header_buf = vec![0u8; header_len];
    recv.read_exact(&mut header_buf).await?;
    let response: ClusterResponse = bincode::deserialize(&header_buf)?;

    match response {
        ClusterResponse::ShardReadResponse(read_resp) => {
            if !read_resp.found {
                return Ok(None);
            }
            // Read shard data that follows
            // NOTE(review): `shard_data_length` comes from the peer and is
            // allocated up front — consider bounding it. The checksum is
            // handed to the caller for verification; it is NOT checked here.
            let mut shard_data = vec![0u8; read_resp.shard_data_length as usize];
            recv.read_exact(&mut shard_data).await?;
            Ok(Some((shard_data, read_resp.checksum)))
        }
        ClusterResponse::Error(e) => {
            anyhow::bail!("Shard read error: {} - {}", e.code, e.message)
        }
        other => anyhow::bail!("Unexpected response to shard read: {:?}", other),
    }
}
|
||||||
|
|
||||||
|
    /// Accept incoming connections and dispatch to the handler.
    ///
    /// Runs until the endpoint stops producing connections (`accept()`
    /// returns `None`, i.e. the endpoint was closed) or the `shutdown`
    /// watch channel signals a change. Each accepted connection is driven
    /// on its own spawned task so a slow handshake cannot stall accepts.
    pub async fn accept_loop(
        self: Arc<Self>,
        shard_store: Arc<ShardStore>,
        mut shutdown: tokio::sync::watch::Receiver<bool>,
    ) {
        loop {
            tokio::select! {
                incoming = self.endpoint.accept() => {
                    match incoming {
                        Some(incoming_conn) => {
                            // Clone the Arc handles so the spawned task owns them.
                            let transport = self.clone();
                            let store = shard_store.clone();
                            tokio::spawn(async move {
                                // The QUIC handshake completes inside the
                                // task, off the accept path.
                                match incoming_conn.await {
                                    Ok(conn) => {
                                        transport.handle_connection(conn, store).await;
                                    }
                                    Err(e) => {
                                        tracing::error!("Failed to accept QUIC connection: {}", e);
                                    }
                                }
                            });
                        }
                        None => break,
                    }
                }
                // NOTE(review): exits on any change notification, not on the
                // value becoming `true` — confirm senders only flip it to true.
                _ = shutdown.changed() => break,
            }
        }
    }
|
||||||
|
|
||||||
|
/// Handle a single QUIC connection (may have multiple streams).
|
||||||
|
async fn handle_connection(
|
||||||
|
&self,
|
||||||
|
conn: quinn::Connection,
|
||||||
|
shard_store: Arc<ShardStore>,
|
||||||
|
) {
|
||||||
|
loop {
|
||||||
|
match conn.accept_bi().await {
|
||||||
|
Ok((send, recv)) => {
|
||||||
|
let store = shard_store.clone();
|
||||||
|
tokio::spawn(async move {
|
||||||
|
if let Err(e) = Self::handle_stream(send, recv, store).await {
|
||||||
|
tracing::error!("Stream handler error: {}", e);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Err(quinn::ConnectionError::ApplicationClosed(_)) => break,
|
||||||
|
Err(e) => {
|
||||||
|
tracing::error!("Connection error: {}", e);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
    /// Handle a single bidirectional stream (one request-response exchange).
    ///
    /// Wire format: a 4-byte little-endian length prefix followed by a
    /// bincode-encoded `ClusterRequest`. For writes, the raw shard bytes
    /// follow the request on the same stream; for reads, the response is a
    /// length-prefixed header followed by the raw shard bytes.
    async fn handle_stream(
        mut send: quinn::SendStream,
        mut recv: quinn::RecvStream,
        shard_store: Arc<ShardStore>,
    ) -> Result<()> {
        // Read the length-prefixed request header
        let mut len_buf = [0u8; 4];
        recv.read_exact(&mut len_buf).await?;
        let msg_len = u32::from_le_bytes(len_buf) as usize;

        // NOTE(review): `msg_len` is peer-controlled and sizes this
        // allocation unchecked — assumes trusted cluster peers; confirm.
        let mut msg_buf = vec![0u8; msg_len];
        recv.read_exact(&mut msg_buf).await?;
        let request: ClusterRequest = bincode::deserialize(&msg_buf)?;

        match request {
            ClusterRequest::ShardWrite(write_req) => {
                // Read shard data from the stream
                let mut shard_data = vec![0u8; write_req.shard_data_length as usize];
                recv.read_exact(&mut shard_data).await?;

                let shard_id = ShardId {
                    bucket: write_req.bucket,
                    key: write_req.key,
                    chunk_index: write_req.chunk_index,
                    shard_index: write_req.shard_index,
                };

                let result = shard_store
                    .write_shard(&shard_id, &shard_data, write_req.checksum)
                    .await;

                // Ack carries success plus a stringified error, if any.
                let ack = ShardWriteAck {
                    request_id: write_req.request_id,
                    success: result.is_ok(),
                    error: result.err().map(|e| e.to_string()),
                };
                let response = protocol::encode_response(&ClusterResponse::ShardWriteAck(ack))?;
                send.write_all(&response).await?;
                send.finish()?;
            }

            ClusterRequest::ShardRead(read_req) => {
                let shard_id = ShardId {
                    bucket: read_req.bucket,
                    key: read_req.key,
                    chunk_index: read_req.chunk_index,
                    shard_index: read_req.shard_index,
                };

                // NOTE(review): any read failure (I/O error, corrupt meta)
                // is reported as found: false, indistinguishable from a
                // genuinely missing shard — confirm this is intended.
                match shard_store.read_shard(&shard_id).await {
                    Ok((data, checksum)) => {
                        let header = ShardReadResponse {
                            request_id: read_req.request_id,
                            found: true,
                            shard_data_length: data.len() as u64,
                            checksum,
                        };
                        // Send header
                        // (length prefix written manually here because the
                        // raw shard bytes follow the header on this stream)
                        let header_bytes = bincode::serialize(&ClusterResponse::ShardReadResponse(header))?;
                        send.write_all(&(header_bytes.len() as u32).to_le_bytes()).await?;
                        send.write_all(&header_bytes).await?;
                        // Send shard data
                        send.write_all(&data).await?;
                        send.finish()?;
                    }
                    Err(_) => {
                        let header = ShardReadResponse {
                            request_id: read_req.request_id,
                            found: false,
                            shard_data_length: 0,
                            checksum: 0,
                        };
                        let header_bytes = bincode::serialize(&ClusterResponse::ShardReadResponse(header))?;
                        send.write_all(&(header_bytes.len() as u32).to_le_bytes()).await?;
                        send.write_all(&header_bytes).await?;
                        send.finish()?;
                    }
                }
            }

            ClusterRequest::ShardDelete(del_req) => {
                let shard_id = ShardId {
                    bucket: del_req.bucket,
                    key: del_req.key,
                    chunk_index: del_req.chunk_index,
                    shard_index: del_req.shard_index,
                };
                let result = shard_store.delete_shard(&shard_id).await;
                let ack = protocol::ClusterResponse::ShardDeleteAck(protocol::ShardDeleteAck {
                    request_id: del_req.request_id,
                    success: result.is_ok(),
                });
                let response = protocol::encode_response(&ack)?;
                send.write_all(&response).await?;
                send.finish()?;
            }

            ClusterRequest::ShardHead(head_req) => {
                let shard_id = ShardId {
                    bucket: head_req.bucket,
                    key: head_req.key,
                    chunk_index: head_req.chunk_index,
                    shard_index: head_req.shard_index,
                };
                // Both "not found" and lookup errors collapse into the
                // found: false response.
                let resp = match shard_store.head_shard(&shard_id).await {
                    Ok(Some(meta)) => protocol::ShardHeadResponse {
                        request_id: head_req.request_id,
                        found: true,
                        data_size: meta.data_size,
                        checksum: meta.checksum,
                    },
                    _ => protocol::ShardHeadResponse {
                        request_id: head_req.request_id,
                        found: false,
                        data_size: 0,
                        checksum: 0,
                    },
                };
                let response =
                    protocol::encode_response(&ClusterResponse::ShardHeadResponse(resp))?;
                send.write_all(&response).await?;
                send.finish()?;
            }

            // Heartbeat, Join, TopologySync, Heal, and Manifest operations
            // will be handled by the membership and coordinator modules.
            // For now, send a generic ack.
            _ => {
                // Drain any trailing bytes (size limit 0 errors if data
                // remains; that error is swallowed via unwrap_or_default)
                // before replying with NotImplemented.
                let response_data = recv.read_to_end(0).await.unwrap_or_default();
                drop(response_data);
                let err = protocol::ErrorResponse {
                    request_id: String::new(),
                    code: "NotImplemented".to_string(),
                    message: "This cluster operation is not yet implemented".to_string(),
                };
                let response = protocol::encode_response(&ClusterResponse::Error(err))?;
                send.write_all(&response).await?;
                send.finish()?;
            }
        }

        Ok(())
    }
|
||||||
|
|
||||||
|
    /// Generate self-signed TLS certificates for cluster-internal communication.
    ///
    /// Builds a matched (server, client) QUIC config pair: the server
    /// presents a freshly generated self-signed certificate, and the client
    /// skips certificate verification entirely (see `SkipServerVerification`),
    /// so this is only suitable for traffic between trusted cluster nodes.
    /// Both sides pin the ALPN protocol "smartstorage".
    fn generate_tls_configs() -> Result<(QuinnServerConfig, ClientConfig)> {
        // Generate self-signed certificate
        let cert = rcgen::generate_simple_self_signed(vec!["smartstorage".to_string()])?;
        let cert_der = CertificateDer::from(cert.cert);
        let key_der = PrivateKeyDer::Pkcs8(PrivatePkcs8KeyDer::from(cert.key_pair.serialize_der()));

        // Server config
        let mut server_crypto = rustls::ServerConfig::builder()
            .with_no_client_auth()
            .with_single_cert(vec![cert_der.clone()], key_der.clone_key())?;
        server_crypto.alpn_protocols = vec![b"smartstorage".to_vec()];
        let server_config = QuinnServerConfig::with_crypto(Arc::new(
            quinn::crypto::rustls::QuicServerConfig::try_from(server_crypto)?,
        ));

        // Client config: skip server certificate verification (cluster-internal)
        let mut client_crypto = rustls::ClientConfig::builder()
            .dangerous()
            .with_custom_certificate_verifier(Arc::new(SkipServerVerification))
            .with_no_client_auth();
        client_crypto.alpn_protocols = vec![b"smartstorage".to_vec()];
        let client_config = ClientConfig::new(Arc::new(
            quinn::crypto::rustls::QuicClientConfig::try_from(client_crypto)?,
        ));

        Ok((server_config, client_config))
    }
|
||||||
|
|
||||||
|
/// Close the QUIC endpoint gracefully.
|
||||||
|
pub fn close(&self) {
|
||||||
|
self.endpoint
|
||||||
|
.close(quinn::VarInt::from_u32(0), b"shutdown");
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get the local node ID.
|
||||||
|
pub fn local_node_id(&self) -> &str {
|
||||||
|
&self.local_node_id
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Certificate verifier that skips verification (for cluster-internal self-signed certs).
///
/// DANGER: every certificate and signature check below unconditionally
/// succeeds, so a client using this verifier authenticates nothing about
/// the server. Acceptable only because cluster peers use ephemeral
/// self-signed certificates on a trusted internal network.
#[derive(Debug)]
struct SkipServerVerification;

impl rustls::client::danger::ServerCertVerifier for SkipServerVerification {
    // Accept any server certificate without inspecting it.
    fn verify_server_cert(
        &self,
        _end_entity: &CertificateDer<'_>,
        _intermediates: &[CertificateDer<'_>],
        _server_name: &rustls::pki_types::ServerName<'_>,
        _ocsp_response: &[u8],
        _now: rustls::pki_types::UnixTime,
    ) -> Result<rustls::client::danger::ServerCertVerified, rustls::Error> {
        Ok(rustls::client::danger::ServerCertVerified::assertion())
    }

    // Accept any TLS 1.2 handshake signature.
    fn verify_tls12_signature(
        &self,
        _message: &[u8],
        _cert: &CertificateDer<'_>,
        _dss: &rustls::DigitallySignedStruct,
    ) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
        Ok(rustls::client::danger::HandshakeSignatureValid::assertion())
    }

    // Accept any TLS 1.3 handshake signature.
    fn verify_tls13_signature(
        &self,
        _message: &[u8],
        _cert: &CertificateDer<'_>,
        _dss: &rustls::DigitallySignedStruct,
    ) -> Result<rustls::client::danger::HandshakeSignatureValid, rustls::Error> {
        Ok(rustls::client::danger::HandshakeSignatureValid::assertion())
    }

    // Advertise the common signature schemes so peers can negotiate freely.
    fn supported_verify_schemes(&self) -> Vec<rustls::SignatureScheme> {
        vec![
            rustls::SignatureScheme::RSA_PKCS1_SHA256,
            rustls::SignatureScheme::RSA_PKCS1_SHA384,
            rustls::SignatureScheme::RSA_PKCS1_SHA512,
            rustls::SignatureScheme::ECDSA_NISTP256_SHA256,
            rustls::SignatureScheme::ECDSA_NISTP384_SHA384,
            rustls::SignatureScheme::ED25519,
            rustls::SignatureScheme::RSA_PSS_SHA256,
            rustls::SignatureScheme::RSA_PSS_SHA384,
            rustls::SignatureScheme::RSA_PSS_SHA512,
        ]
    }
}
|
||||||
226
rust/src/cluster/shard_store.rs
Normal file
226
rust/src/cluster/shard_store.rs
Normal file
@@ -0,0 +1,226 @@
|
|||||||
|
use anyhow::Result;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use tokio::fs;
|
||||||
|
use tokio::io::AsyncWriteExt;
|
||||||
|
|
||||||
|
/// Identifies a specific shard on disk.
///
/// Addresses one erasure-coded shard: the owning object (`bucket` + `key`),
/// which chunk of the object, and which shard within that chunk.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct ShardId {
    /// Bucket that holds the object this shard belongs to.
    pub bucket: String,
    /// Object key within the bucket.
    pub key: String,
    /// Index of the object chunk this shard encodes.
    pub chunk_index: u32,
    /// Position of this shard within the chunk's shard set.
    pub shard_index: u32,
}
|
||||||
|
|
||||||
|
/// Per-shard metadata stored alongside shard data.
///
/// Persisted as JSON in a `.meta` sidecar next to the `.dat` file so a
/// shard can be validated without re-reading its payload.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ShardMeta {
    /// Position of the shard within its chunk.
    pub shard_index: u32,
    /// Index of the object chunk the shard belongs to.
    pub chunk_index: u32,
    /// Size of the shard payload in bytes.
    pub data_size: u64,
    pub checksum: u32, // crc32c
}
|
||||||
|
|
||||||
|
/// Manages shard storage on a single drive.
///
/// Layout on disk:
/// ```text
/// {base_path}/.smartstorage/data/{bucket}/{key_prefix}/{key}/
///     chunk-{N}/shard-{M}.dat   (shard data)
///     chunk-{N}/shard-{M}.meta  (shard metadata JSON)
/// ```
/// `key_prefix` is a 2-hex-char fan-out directory derived from a hash of
/// the key, keeping per-directory entry counts bounded.
pub struct ShardStore {
    // Root of the drive this store manages; all paths derive from it.
    base_path: PathBuf,
}
|
||||||
|
|
||||||
|
impl ShardStore {
|
||||||
|
pub fn new(base_path: PathBuf) -> Self {
|
||||||
|
Self { base_path }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Write a shard to disk atomically (write to temp file, then rename).
|
||||||
|
pub async fn write_shard(
|
||||||
|
&self,
|
||||||
|
shard_id: &ShardId,
|
||||||
|
data: &[u8],
|
||||||
|
checksum: u32,
|
||||||
|
) -> Result<()> {
|
||||||
|
let shard_path = self.shard_data_path(shard_id);
|
||||||
|
let meta_path = self.shard_meta_path(shard_id);
|
||||||
|
|
||||||
|
// Ensure parent directory exists
|
||||||
|
if let Some(parent) = shard_path.parent() {
|
||||||
|
fs::create_dir_all(parent).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write data atomically via temp file + rename
|
||||||
|
let temp_data_path = shard_path.with_extension("dat.tmp");
|
||||||
|
{
|
||||||
|
let mut file = fs::File::create(&temp_data_path).await?;
|
||||||
|
file.write_all(data).await?;
|
||||||
|
file.flush().await?;
|
||||||
|
file.sync_all().await?;
|
||||||
|
}
|
||||||
|
fs::rename(&temp_data_path, &shard_path).await?;
|
||||||
|
|
||||||
|
// Write metadata
|
||||||
|
let meta = ShardMeta {
|
||||||
|
shard_index: shard_id.shard_index,
|
||||||
|
chunk_index: shard_id.chunk_index,
|
||||||
|
data_size: data.len() as u64,
|
||||||
|
checksum,
|
||||||
|
};
|
||||||
|
let meta_json = serde_json::to_string(&meta)?;
|
||||||
|
let temp_meta_path = meta_path.with_extension("meta.tmp");
|
||||||
|
fs::write(&temp_meta_path, meta_json).await?;
|
||||||
|
fs::rename(&temp_meta_path, &meta_path).await?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read a shard's data from disk.
|
||||||
|
pub async fn read_shard(&self, shard_id: &ShardId) -> Result<(Vec<u8>, u32)> {
|
||||||
|
let shard_path = self.shard_data_path(shard_id);
|
||||||
|
let meta_path = self.shard_meta_path(shard_id);
|
||||||
|
|
||||||
|
let data = fs::read(&shard_path).await?;
|
||||||
|
let meta_json = fs::read_to_string(&meta_path).await?;
|
||||||
|
let meta: ShardMeta = serde_json::from_str(&meta_json)?;
|
||||||
|
|
||||||
|
Ok((data, meta.checksum))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if a shard exists and return its metadata.
|
||||||
|
pub async fn head_shard(&self, shard_id: &ShardId) -> Result<Option<ShardMeta>> {
|
||||||
|
let meta_path = self.shard_meta_path(shard_id);
|
||||||
|
if !meta_path.exists() {
|
||||||
|
return Ok(None);
|
||||||
|
}
|
||||||
|
let meta_json = fs::read_to_string(&meta_path).await?;
|
||||||
|
let meta: ShardMeta = serde_json::from_str(&meta_json)?;
|
||||||
|
Ok(Some(meta))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Delete a shard and its metadata.
|
||||||
|
pub async fn delete_shard(&self, shard_id: &ShardId) -> Result<()> {
|
||||||
|
let shard_path = self.shard_data_path(shard_id);
|
||||||
|
let meta_path = self.shard_meta_path(shard_id);
|
||||||
|
|
||||||
|
let _ = fs::remove_file(&shard_path).await;
|
||||||
|
let _ = fs::remove_file(&meta_path).await;
|
||||||
|
|
||||||
|
// Clean up empty parent directories
|
||||||
|
self.cleanup_empty_dirs(shard_id).await;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// List all shard IDs for a given bucket and key (across all chunks).
|
||||||
|
pub async fn list_shards_for_object(
|
||||||
|
&self,
|
||||||
|
bucket: &str,
|
||||||
|
key: &str,
|
||||||
|
) -> Result<Vec<ShardId>> {
|
||||||
|
let key_dir = self.key_dir(bucket, key);
|
||||||
|
if !key_dir.exists() {
|
||||||
|
return Ok(Vec::new());
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut result = Vec::new();
|
||||||
|
let mut entries = fs::read_dir(&key_dir).await?;
|
||||||
|
|
||||||
|
while let Some(entry) = entries.next_entry().await? {
|
||||||
|
let name = entry.file_name().to_string_lossy().to_string();
|
||||||
|
if !name.starts_with("chunk-") || !entry.metadata().await?.is_dir() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let chunk_index: u32 = match name.strip_prefix("chunk-").and_then(|s| s.parse().ok()) {
|
||||||
|
Some(idx) => idx,
|
||||||
|
None => continue,
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut chunk_entries = fs::read_dir(entry.path()).await?;
|
||||||
|
while let Some(shard_entry) = chunk_entries.next_entry().await? {
|
||||||
|
let shard_name = shard_entry.file_name().to_string_lossy().to_string();
|
||||||
|
if shard_name.starts_with("shard-") && shard_name.ends_with(".dat") {
|
||||||
|
let shard_index: u32 = match shard_name
|
||||||
|
.strip_prefix("shard-")
|
||||||
|
.and_then(|s| s.strip_suffix(".dat"))
|
||||||
|
.and_then(|s| s.parse().ok())
|
||||||
|
{
|
||||||
|
Some(idx) => idx,
|
||||||
|
None => continue,
|
||||||
|
};
|
||||||
|
|
||||||
|
result.push(ShardId {
|
||||||
|
bucket: bucket.to_string(),
|
||||||
|
key: key.to_string(),
|
||||||
|
chunk_index,
|
||||||
|
shard_index,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
result.sort_by(|a, b| {
|
||||||
|
a.chunk_index
|
||||||
|
.cmp(&b.chunk_index)
|
||||||
|
.then(a.shard_index.cmp(&b.shard_index))
|
||||||
|
});
|
||||||
|
|
||||||
|
Ok(result)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ============================
|
||||||
|
// Path helpers
|
||||||
|
// ============================
|
||||||
|
|
||||||
|
fn data_root(&self) -> PathBuf {
|
||||||
|
self.base_path.join(".smartstorage").join("data")
|
||||||
|
}
|
||||||
|
|
||||||
|
fn key_prefix(key: &str) -> String {
|
||||||
|
// Use first 2 hex chars of a simple hash for directory fan-out
|
||||||
|
let hash = xxhash_rust::xxh64::xxh64(key.as_bytes(), 0);
|
||||||
|
format!("{:02x}", hash & 0xFF)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn key_dir(&self, bucket: &str, key: &str) -> PathBuf {
|
||||||
|
self.data_root()
|
||||||
|
.join(bucket)
|
||||||
|
.join(Self::key_prefix(key))
|
||||||
|
.join(key)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn chunk_dir(&self, shard_id: &ShardId) -> PathBuf {
|
||||||
|
self.key_dir(&shard_id.bucket, &shard_id.key)
|
||||||
|
.join(format!("chunk-{}", shard_id.chunk_index))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn shard_data_path(&self, shard_id: &ShardId) -> PathBuf {
|
||||||
|
self.chunk_dir(shard_id)
|
||||||
|
.join(format!("shard-{}.dat", shard_id.shard_index))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn shard_meta_path(&self, shard_id: &ShardId) -> PathBuf {
|
||||||
|
self.chunk_dir(shard_id)
|
||||||
|
.join(format!("shard-{}.meta", shard_id.shard_index))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn cleanup_empty_dirs(&self, shard_id: &ShardId) {
|
||||||
|
// Try to remove chunk dir if empty
|
||||||
|
let chunk_dir = self.chunk_dir(shard_id);
|
||||||
|
let _ = fs::remove_dir(&chunk_dir).await; // fails silently if not empty
|
||||||
|
|
||||||
|
// Try to remove key dir if empty
|
||||||
|
let key_dir = self.key_dir(&shard_id.bucket, &shard_id.key);
|
||||||
|
let _ = fs::remove_dir(&key_dir).await;
|
||||||
|
|
||||||
|
// Try to remove prefix dir if empty
|
||||||
|
if let Some(prefix_dir) = key_dir.parent() {
|
||||||
|
let _ = fs::remove_dir(prefix_dir).await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
291
rust/src/cluster/state.rs
Normal file
291
rust/src/cluster/state.rs
Normal file
@@ -0,0 +1,291 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::sync::RwLock;
|
||||||
|
|
||||||
|
use super::placement::{DriveLocation, ErasureSet};
|
||||||
|
use super::protocol::{ClusterTopology, ErasureSetInfo, DriveLocationInfo, NodeInfo};
|
||||||
|
|
||||||
|
/// Node status for tracking liveness.
///
/// Thresholds are applied in `ClusterState::tick_heartbeats`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NodeStatus {
    /// Responding to heartbeats (zero missed).
    Online,
    Suspect, // missed 2+ heartbeats
    Offline, // missed 5+ heartbeats
}
|
||||||
|
|
||||||
|
/// Tracked state for a peer node.
#[derive(Debug, Clone)]
pub struct NodeState {
    /// Static node description (identity, addresses, etc.) as exchanged
    /// over the cluster protocol.
    pub info: NodeInfo,
    /// Current liveness classification.
    pub status: NodeStatus,
    /// Consecutive heartbeat rounds this node failed to answer.
    pub missed_heartbeats: u32,
    /// Wall-clock time of the last successful heartbeat (UTC).
    pub last_heartbeat: chrono::DateTime<chrono::Utc>,
}
|
||||||
|
|
||||||
|
/// Shared cluster state, protected by RwLock for concurrent access.
///
/// Cheap to share: the mutable topology lives behind an `Arc<RwLock<_>>`;
/// only the immutable local node ID sits outside the lock.
pub struct ClusterState {
    // All mutable topology/membership data, guarded by one lock.
    inner: Arc<RwLock<ClusterStateInner>>,
    // This node's own ID; never changes, so kept lock-free.
    local_node_id: String,
}
|
||||||
|
|
||||||
|
// Lock-guarded portion of the cluster state.
struct ClusterStateInner {
    // Cluster-wide identifier shared by all members.
    cluster_id: String,
    // Monotonic topology version; bumped on every membership/layout change.
    version: u64,
    // Known nodes keyed by node ID (includes self once registered).
    nodes: HashMap<String, NodeState>,
    // Drive groupings used for erasure-coded placement.
    erasure_sets: Vec<ErasureSet>,
    // Erasure coding parameters (k data + m parity shards per chunk).
    data_shards: usize,
    parity_shards: usize,
}
|
||||||
|
|
||||||
|
impl ClusterState {
    /// Create an empty cluster state for this node with the given erasure
    /// coding parameters; topology version starts at 0.
    pub fn new(
        local_node_id: String,
        cluster_id: String,
        data_shards: usize,
        parity_shards: usize,
    ) -> Self {
        Self {
            inner: Arc::new(RwLock::new(ClusterStateInner {
                cluster_id,
                version: 0,
                nodes: HashMap::new(),
                erasure_sets: Vec::new(),
                data_shards,
                parity_shards,
            })),
            local_node_id,
        }
    }

    /// This node's own ID.
    pub fn local_node_id(&self) -> &str {
        &self.local_node_id
    }

    /// Register a node in the cluster.
    ///
    /// New or re-registered nodes start as Online with a fresh heartbeat
    /// timestamp; bumps the topology version.
    pub async fn add_node(&self, info: NodeInfo) {
        let mut inner = self.inner.write().await;
        let node_id = info.node_id.clone();
        inner.nodes.insert(
            node_id,
            NodeState {
                info,
                status: NodeStatus::Online,
                missed_heartbeats: 0,
                last_heartbeat: chrono::Utc::now(),
            },
        );
        inner.version += 1;
    }

    /// Remove a node from the cluster.
    ///
    /// Bumps the topology version even if the node was unknown.
    pub async fn remove_node(&self, node_id: &str) {
        let mut inner = self.inner.write().await;
        inner.nodes.remove(node_id);
        inner.version += 1;
    }

    /// Update heartbeat for a node (reset missed count).
    ///
    /// Unknown node IDs are ignored silently.
    pub async fn record_heartbeat(&self, node_id: &str) {
        let mut inner = self.inner.write().await;
        if let Some(node) = inner.nodes.get_mut(node_id) {
            node.missed_heartbeats = 0;
            node.status = NodeStatus::Online;
            node.last_heartbeat = chrono::Utc::now();
        }
    }

    /// Increment missed heartbeat count for all nodes, updating status.
    /// Called by the heartbeat checker when a round completes.
    ///
    /// `responded_nodes` lists the peers that answered this round; all
    /// others accumulate a miss (2 misses => Suspect, 5 => Offline).
    /// Returns the (node_id, new_status) pairs whose status changed.
    /// NOTE(review): `contains` on the slice makes this O(nodes × responders);
    /// fine for small clusters, consider a HashSet if membership grows.
    pub async fn tick_heartbeats(&self, responded_nodes: &[String]) -> Vec<(String, NodeStatus)> {
        let mut inner = self.inner.write().await;
        let mut status_changes = Vec::new();

        for (node_id, node) in inner.nodes.iter_mut() {
            if *node_id == self.local_node_id {
                continue; // Don't track self
            }

            if responded_nodes.contains(node_id) {
                node.missed_heartbeats = 0;
                if node.status != NodeStatus::Online {
                    node.status = NodeStatus::Online;
                    status_changes.push((node_id.clone(), NodeStatus::Online));
                }
            } else {
                node.missed_heartbeats += 1;
                let new_status = if node.missed_heartbeats >= 5 {
                    NodeStatus::Offline
                } else if node.missed_heartbeats >= 2 {
                    NodeStatus::Suspect
                } else {
                    NodeStatus::Online
                };

                if new_status != node.status {
                    node.status = new_status.clone();
                    status_changes.push((node_id.clone(), new_status));
                }
            }
        }

        status_changes
    }

    /// Set erasure sets (typically done once during cluster formation).
    pub async fn set_erasure_sets(&self, sets: Vec<ErasureSet>) {
        let mut inner = self.inner.write().await;
        inner.erasure_sets = sets;
        inner.version += 1;
    }

    /// Get the erasure set for a given object based on consistent hashing.
    ///
    /// Returns `None` when no erasure sets are configured yet.
    pub async fn get_erasure_set_for_object(&self, bucket: &str, key: &str) -> Option<ErasureSet> {
        let inner = self.inner.read().await;
        if inner.erasure_sets.is_empty() {
            return None;
        }
        let set_idx = super::placement::erasure_set_for_object(
            bucket,
            key,
            inner.erasure_sets.len() as u32,
        );
        inner.erasure_sets.get(set_idx as usize).cloned()
    }

    /// Get all erasure sets.
    pub async fn erasure_sets(&self) -> Vec<ErasureSet> {
        self.inner.read().await.erasure_sets.clone()
    }

    /// Get current topology version.
    pub async fn version(&self) -> u64 {
        self.inner.read().await.version
    }

    /// Get all online node IDs (excluding self).
    pub async fn online_peers(&self) -> Vec<NodeInfo> {
        let inner = self.inner.read().await;
        inner
            .nodes
            .values()
            .filter(|n| n.status == NodeStatus::Online && n.info.node_id != self.local_node_id)
            .map(|n| n.info.clone())
            .collect()
    }

    /// Get all nodes.
    pub async fn all_nodes(&self) -> Vec<NodeState> {
        self.inner.read().await.nodes.values().cloned().collect()
    }

    /// Get node info by ID.
    pub async fn get_node(&self, node_id: &str) -> Option<NodeInfo> {
        self.inner
            .read()
            .await
            .nodes
            .get(node_id)
            .map(|n| n.info.clone())
    }

    /// Get offline node IDs.
    pub async fn offline_nodes(&self) -> Vec<String> {
        self.inner
            .read()
            .await
            .nodes
            .values()
            .filter(|n| n.status == NodeStatus::Offline)
            .map(|n| n.info.node_id.clone())
            .collect()
    }

    /// Check if a majority of nodes are reachable (for split-brain prevention).
    ///
    /// Vacuously true with zero registered nodes.
    /// NOTE(review): the denominator counts all known nodes (self included
    /// if registered); confirm self is expected to count toward quorum.
    pub async fn has_majority(&self) -> bool {
        let inner = self.inner.read().await;
        let total = inner.nodes.len();
        if total == 0 {
            return true;
        }
        let online = inner
            .nodes
            .values()
            .filter(|n| n.status == NodeStatus::Online)
            .count();
        online > total / 2
    }

    /// Export the current topology as a protocol message.
    pub async fn to_topology(&self) -> ClusterTopology {
        let inner = self.inner.read().await;
        ClusterTopology {
            version: inner.version,
            cluster_id: inner.cluster_id.clone(),
            nodes: inner.nodes.values().map(|n| n.info.clone()).collect(),
            erasure_sets: inner
                .erasure_sets
                .iter()
                .map(|set| ErasureSetInfo {
                    set_id: set.set_id,
                    drives: set
                        .drives
                        .iter()
                        .map(|d| DriveLocationInfo {
                            node_id: d.node_id.clone(),
                            drive_index: d.drive_index,
                        })
                        .collect(),
                })
                .collect(),
            data_shards: inner.data_shards,
            parity_shards: inner.parity_shards,
        }
    }

    /// Import topology from a protocol message (e.g., received from a peer during join).
    ///
    /// Stale topologies (version <= ours) are ignored. Incoming nodes not
    /// yet known locally are added as Online; existing entries keep their
    /// tracked liveness state.
    /// NOTE(review): nodes present locally but absent from the incoming
    /// topology are NOT removed — confirm whether departures should
    /// propagate here or only via `remove_node`.
    pub async fn apply_topology(&self, topology: &ClusterTopology) {
        let mut inner = self.inner.write().await;

        // Only apply if newer
        if topology.version <= inner.version {
            return;
        }

        inner.cluster_id = topology.cluster_id.clone();
        inner.version = topology.version;
        inner.data_shards = topology.data_shards;
        inner.parity_shards = topology.parity_shards;

        // Update nodes
        for node_info in &topology.nodes {
            if !inner.nodes.contains_key(&node_info.node_id) {
                inner.nodes.insert(
                    node_info.node_id.clone(),
                    NodeState {
                        info: node_info.clone(),
                        status: NodeStatus::Online,
                        missed_heartbeats: 0,
                        last_heartbeat: chrono::Utc::now(),
                    },
                );
            }
        }

        // Update erasure sets
        inner.erasure_sets = topology
            .erasure_sets
            .iter()
            .map(|set| ErasureSet {
                set_id: set.set_id,
                drives: set
                    .drives
                    .iter()
                    .map(|d| DriveLocation {
                        node_id: d.node_id.clone(),
                        drive_index: d.drive_index,
                    })
                    .collect(),
            })
            .collect();
    }
}
|
||||||
@@ -1,5 +1,7 @@
|
|||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::cluster::config::ClusterConfig;
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
#[serde(rename_all = "camelCase")]
|
#[serde(rename_all = "camelCase")]
|
||||||
pub struct SmartStorageConfig {
|
pub struct SmartStorageConfig {
|
||||||
@@ -10,6 +12,8 @@ pub struct SmartStorageConfig {
|
|||||||
pub logging: LoggingConfig,
|
pub logging: LoggingConfig,
|
||||||
pub limits: LimitsConfig,
|
pub limits: LimitsConfig,
|
||||||
pub multipart: MultipartConfig,
|
pub multipart: MultipartConfig,
|
||||||
|
#[serde(default)]
|
||||||
|
pub cluster: Option<ClusterConfig>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
mod action;
|
mod action;
|
||||||
mod auth;
|
mod auth;
|
||||||
|
mod cluster;
|
||||||
mod config;
|
mod config;
|
||||||
mod management;
|
mod management;
|
||||||
mod policy;
|
mod policy;
|
||||||
|
|||||||
@@ -140,6 +140,15 @@ pub async fn management_loop() -> Result<()> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
"clusterStatus" => {
|
||||||
|
send_response(
|
||||||
|
id,
|
||||||
|
serde_json::json!({
|
||||||
|
"status": "ok",
|
||||||
|
"message": "Cluster status endpoint ready"
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
}
|
||||||
_ => {
|
_ => {
|
||||||
send_error(id, format!("Unknown method: {}", method));
|
send_error(id, format!("Unknown method: {}", method));
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -23,25 +23,34 @@ use crate::auth::{self, AuthenticatedIdentity};
|
|||||||
use crate::config::SmartStorageConfig;
|
use crate::config::SmartStorageConfig;
|
||||||
use crate::policy::{self, PolicyDecision, PolicyStore};
|
use crate::policy::{self, PolicyDecision, PolicyStore};
|
||||||
use crate::error::StorageError;
|
use crate::error::StorageError;
|
||||||
use crate::storage::FileStore;
|
use crate::cluster::coordinator::DistributedStore;
|
||||||
|
use crate::cluster::config::ErasureConfig;
|
||||||
|
use crate::cluster::membership::MembershipManager;
|
||||||
|
use crate::cluster::placement;
|
||||||
|
use crate::cluster::protocol::NodeInfo;
|
||||||
|
use crate::cluster::quic_transport::QuicTransport;
|
||||||
|
use crate::cluster::shard_store::ShardStore;
|
||||||
|
use crate::cluster::state::ClusterState;
|
||||||
|
use crate::storage::{FileStore, StorageBackend};
|
||||||
use crate::xml_response;
|
use crate::xml_response;
|
||||||
|
|
||||||
pub struct StorageServer {
|
pub struct StorageServer {
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
shutdown_tx: watch::Sender<bool>,
|
shutdown_tx: watch::Sender<bool>,
|
||||||
server_handle: tokio::task::JoinHandle<()>,
|
server_handle: tokio::task::JoinHandle<()>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl StorageServer {
|
impl StorageServer {
|
||||||
pub async fn start(config: SmartStorageConfig) -> Result<Self> {
|
pub async fn start(config: SmartStorageConfig) -> Result<Self> {
|
||||||
let store = Arc::new(FileStore::new(config.storage.directory.clone().into()));
|
let store: Arc<StorageBackend> = if let Some(ref cluster_config) = config.cluster {
|
||||||
|
if cluster_config.enabled {
|
||||||
// Initialize or reset storage
|
Self::start_clustered(&config, cluster_config).await?
|
||||||
if config.storage.clean_slate {
|
} else {
|
||||||
store.reset().await?;
|
Self::start_standalone(&config).await?
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
store.initialize().await?;
|
Self::start_standalone(&config).await?
|
||||||
}
|
};
|
||||||
|
|
||||||
// Initialize policy store
|
// Initialize policy store
|
||||||
let policy_store = Arc::new(PolicyStore::new(store.policies_dir()));
|
let policy_store = Arc::new(PolicyStore::new(store.policies_dir()));
|
||||||
@@ -119,9 +128,145 @@ impl StorageServer {
|
|||||||
let _ = self.server_handle.await;
|
let _ = self.server_handle.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn store(&self) -> &FileStore {
|
pub fn store(&self) -> &StorageBackend {
|
||||||
&self.store
|
&self.store
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn start_standalone(config: &SmartStorageConfig) -> Result<Arc<StorageBackend>> {
|
||||||
|
let store = Arc::new(StorageBackend::Standalone(
|
||||||
|
FileStore::new(config.storage.directory.clone().into()),
|
||||||
|
));
|
||||||
|
if config.storage.clean_slate {
|
||||||
|
store.reset().await?;
|
||||||
|
} else {
|
||||||
|
store.initialize().await?;
|
||||||
|
}
|
||||||
|
Ok(store)
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn start_clustered(
|
||||||
|
config: &SmartStorageConfig,
|
||||||
|
cluster_config: &crate::cluster::config::ClusterConfig,
|
||||||
|
) -> Result<Arc<StorageBackend>> {
|
||||||
|
let erasure_config = cluster_config.erasure.clone();
|
||||||
|
let node_id = cluster_config
|
||||||
|
.node_id
|
||||||
|
.clone()
|
||||||
|
.unwrap_or_else(|| uuid::Uuid::new_v4().to_string());
|
||||||
|
|
||||||
|
// Determine drive paths
|
||||||
|
let drive_paths: Vec<std::path::PathBuf> = if cluster_config.drives.paths.is_empty() {
|
||||||
|
// Default: use storage directory as a single drive
|
||||||
|
vec![std::path::PathBuf::from(&config.storage.directory)]
|
||||||
|
} else {
|
||||||
|
cluster_config
|
||||||
|
.drives
|
||||||
|
.paths
|
||||||
|
.iter()
|
||||||
|
.map(std::path::PathBuf::from)
|
||||||
|
.collect()
|
||||||
|
};
|
||||||
|
|
||||||
|
// Ensure directories exist
|
||||||
|
let manifest_dir = std::path::PathBuf::from(&config.storage.directory).join(".manifests");
|
||||||
|
let buckets_dir = std::path::PathBuf::from(&config.storage.directory).join(".buckets");
|
||||||
|
tokio::fs::create_dir_all(&manifest_dir).await?;
|
||||||
|
tokio::fs::create_dir_all(&buckets_dir).await?;
|
||||||
|
for path in &drive_paths {
|
||||||
|
tokio::fs::create_dir_all(path.join(".smartstorage")).await?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize QUIC transport
|
||||||
|
let quic_addr: SocketAddr =
|
||||||
|
format!("{}:{}", config.server.address, cluster_config.quic_port).parse()?;
|
||||||
|
let transport = Arc::new(QuicTransport::new(quic_addr, node_id.clone()).await?);
|
||||||
|
|
||||||
|
// Initialize cluster state
|
||||||
|
let cluster_state = Arc::new(ClusterState::new(
|
||||||
|
node_id.clone(),
|
||||||
|
uuid::Uuid::new_v4().to_string(),
|
||||||
|
erasure_config.data_shards,
|
||||||
|
erasure_config.parity_shards,
|
||||||
|
));
|
||||||
|
|
||||||
|
// Form erasure sets from local drives (single-node for now)
|
||||||
|
let nodes = vec![(node_id.clone(), drive_paths.len() as u32)];
|
||||||
|
let erasure_sets =
|
||||||
|
placement::form_erasure_sets(&nodes, erasure_config.total_shards());
|
||||||
|
|
||||||
|
if erasure_sets.is_empty() {
|
||||||
|
tracing::warn!(
|
||||||
|
"Not enough drives ({}) for erasure set size ({}). \
|
||||||
|
Need at least {} drives.",
|
||||||
|
drive_paths.len(),
|
||||||
|
erasure_config.total_shards(),
|
||||||
|
erasure_config.total_shards(),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
cluster_state.set_erasure_sets(erasure_sets).await;
|
||||||
|
|
||||||
|
// Register self as a node
|
||||||
|
let local_node_info = NodeInfo {
|
||||||
|
node_id: node_id.clone(),
|
||||||
|
quic_addr: quic_addr.to_string(),
|
||||||
|
s3_addr: format!("{}:{}", config.server.address, config.server.port),
|
||||||
|
drive_count: drive_paths.len() as u32,
|
||||||
|
status: "online".to_string(),
|
||||||
|
version: env!("CARGO_PKG_VERSION").to_string(),
|
||||||
|
};
|
||||||
|
cluster_state.add_node(local_node_info.clone()).await;
|
||||||
|
|
||||||
|
// Join cluster if seed nodes are configured
|
||||||
|
let membership = Arc::new(MembershipManager::new(
|
||||||
|
cluster_state.clone(),
|
||||||
|
transport.clone(),
|
||||||
|
cluster_config.heartbeat_interval_ms,
|
||||||
|
local_node_info,
|
||||||
|
));
|
||||||
|
membership
|
||||||
|
.join_cluster(&cluster_config.seed_nodes)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
// Start QUIC accept loop for incoming connections
|
||||||
|
let shard_store_for_accept = Arc::new(ShardStore::new(drive_paths[0].clone()));
|
||||||
|
let (quic_shutdown_tx, quic_shutdown_rx) = watch::channel(false);
|
||||||
|
let transport_clone = transport.clone();
|
||||||
|
tokio::spawn(async move {
|
||||||
|
transport_clone
|
||||||
|
.accept_loop(shard_store_for_accept, quic_shutdown_rx)
|
||||||
|
.await;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Start heartbeat loop
|
||||||
|
let membership_clone = membership.clone();
|
||||||
|
let (hb_shutdown_tx, hb_shutdown_rx) = watch::channel(false);
|
||||||
|
tokio::spawn(async move {
|
||||||
|
membership_clone.heartbeat_loop(hb_shutdown_rx).await;
|
||||||
|
});
|
||||||
|
|
||||||
|
// Create distributed store
|
||||||
|
let distributed_store = DistributedStore::new(
|
||||||
|
cluster_state,
|
||||||
|
transport,
|
||||||
|
erasure_config,
|
||||||
|
drive_paths,
|
||||||
|
manifest_dir,
|
||||||
|
buckets_dir,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let store = Arc::new(StorageBackend::Clustered(distributed_store));
|
||||||
|
|
||||||
|
if !config.server.silent {
|
||||||
|
tracing::info!(
|
||||||
|
"Cluster mode enabled (node_id={}, quic_port={})",
|
||||||
|
node_id,
|
||||||
|
cluster_config.quic_port
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(store)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SmartStorageConfig {
|
impl SmartStorageConfig {
|
||||||
@@ -204,7 +349,7 @@ fn storage_error_response(err: &StorageError, request_id: &str) -> Response<BoxB
|
|||||||
|
|
||||||
async fn handle_request(
|
async fn handle_request(
|
||||||
req: Request<Incoming>,
|
req: Request<Incoming>,
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
config: SmartStorageConfig,
|
config: SmartStorageConfig,
|
||||||
policy_store: Arc<PolicyStore>,
|
policy_store: Arc<PolicyStore>,
|
||||||
) -> Result<Response<BoxBody>, std::convert::Infallible> {
|
) -> Result<Response<BoxBody>, std::convert::Infallible> {
|
||||||
@@ -325,7 +470,7 @@ async fn authorize_request(
|
|||||||
|
|
||||||
async fn route_request(
|
async fn route_request(
|
||||||
req: Request<Incoming>,
|
req: Request<Incoming>,
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
_config: &SmartStorageConfig,
|
_config: &SmartStorageConfig,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
policy_store: &Arc<PolicyStore>,
|
policy_store: &Arc<PolicyStore>,
|
||||||
@@ -430,7 +575,7 @@ async fn route_request(
|
|||||||
// ============================
|
// ============================
|
||||||
|
|
||||||
async fn handle_list_buckets(
|
async fn handle_list_buckets(
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
) -> Result<Response<BoxBody>> {
|
) -> Result<Response<BoxBody>> {
|
||||||
let buckets = store.list_buckets().await?;
|
let buckets = store.list_buckets().await?;
|
||||||
@@ -439,7 +584,7 @@ async fn handle_list_buckets(
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_create_bucket(
|
async fn handle_create_bucket(
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
bucket: &str,
|
bucket: &str,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
) -> Result<Response<BoxBody>> {
|
) -> Result<Response<BoxBody>> {
|
||||||
@@ -448,7 +593,7 @@ async fn handle_create_bucket(
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_delete_bucket(
|
async fn handle_delete_bucket(
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
bucket: &str,
|
bucket: &str,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
policy_store: &Arc<PolicyStore>,
|
policy_store: &Arc<PolicyStore>,
|
||||||
@@ -460,7 +605,7 @@ async fn handle_delete_bucket(
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_head_bucket(
|
async fn handle_head_bucket(
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
bucket: &str,
|
bucket: &str,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
) -> Result<Response<BoxBody>> {
|
) -> Result<Response<BoxBody>> {
|
||||||
@@ -472,7 +617,7 @@ async fn handle_head_bucket(
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_list_objects(
|
async fn handle_list_objects(
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
bucket: &str,
|
bucket: &str,
|
||||||
query: &HashMap<String, String>,
|
query: &HashMap<String, String>,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
@@ -501,7 +646,7 @@ async fn handle_list_objects(
|
|||||||
|
|
||||||
async fn handle_put_object(
|
async fn handle_put_object(
|
||||||
req: Request<Incoming>,
|
req: Request<Incoming>,
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
bucket: &str,
|
bucket: &str,
|
||||||
key: &str,
|
key: &str,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
@@ -523,7 +668,7 @@ async fn handle_put_object(
|
|||||||
|
|
||||||
async fn handle_get_object(
|
async fn handle_get_object(
|
||||||
req: Request<Incoming>,
|
req: Request<Incoming>,
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
bucket: &str,
|
bucket: &str,
|
||||||
key: &str,
|
key: &str,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
@@ -576,7 +721,7 @@ async fn handle_get_object(
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_head_object(
|
async fn handle_head_object(
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
bucket: &str,
|
bucket: &str,
|
||||||
key: &str,
|
key: &str,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
@@ -608,7 +753,7 @@ async fn handle_head_object(
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_delete_object(
|
async fn handle_delete_object(
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
bucket: &str,
|
bucket: &str,
|
||||||
key: &str,
|
key: &str,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
@@ -619,7 +764,7 @@ async fn handle_delete_object(
|
|||||||
|
|
||||||
async fn handle_copy_object(
|
async fn handle_copy_object(
|
||||||
req: Request<Incoming>,
|
req: Request<Incoming>,
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
dest_bucket: &str,
|
dest_bucket: &str,
|
||||||
dest_key: &str,
|
dest_key: &str,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
@@ -688,7 +833,7 @@ async fn handle_get_bucket_policy(
|
|||||||
|
|
||||||
async fn handle_put_bucket_policy(
|
async fn handle_put_bucket_policy(
|
||||||
req: Request<Incoming>,
|
req: Request<Incoming>,
|
||||||
store: &Arc<FileStore>,
|
store: &Arc<StorageBackend>,
|
||||||
policy_store: &Arc<PolicyStore>,
|
policy_store: &Arc<PolicyStore>,
|
||||||
bucket: &str,
|
bucket: &str,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
@@ -732,7 +877,7 @@ async fn handle_delete_bucket_policy(
|
|||||||
|
|
||||||
async fn handle_initiate_multipart(
|
async fn handle_initiate_multipart(
|
||||||
req: Request<Incoming>,
|
req: Request<Incoming>,
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
bucket: &str,
|
bucket: &str,
|
||||||
key: &str,
|
key: &str,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
@@ -745,7 +890,7 @@ async fn handle_initiate_multipart(
|
|||||||
|
|
||||||
async fn handle_upload_part(
|
async fn handle_upload_part(
|
||||||
req: Request<Incoming>,
|
req: Request<Incoming>,
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
query: &HashMap<String, String>,
|
query: &HashMap<String, String>,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
) -> Result<Response<BoxBody>> {
|
) -> Result<Response<BoxBody>> {
|
||||||
@@ -774,7 +919,7 @@ async fn handle_upload_part(
|
|||||||
|
|
||||||
async fn handle_complete_multipart(
|
async fn handle_complete_multipart(
|
||||||
req: Request<Incoming>,
|
req: Request<Incoming>,
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
bucket: &str,
|
bucket: &str,
|
||||||
key: &str,
|
key: &str,
|
||||||
upload_id: &str,
|
upload_id: &str,
|
||||||
@@ -794,7 +939,7 @@ async fn handle_complete_multipart(
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_abort_multipart(
|
async fn handle_abort_multipart(
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
upload_id: &str,
|
upload_id: &str,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
) -> Result<Response<BoxBody>> {
|
) -> Result<Response<BoxBody>> {
|
||||||
@@ -803,7 +948,7 @@ async fn handle_abort_multipart(
|
|||||||
}
|
}
|
||||||
|
|
||||||
async fn handle_list_multipart_uploads(
|
async fn handle_list_multipart_uploads(
|
||||||
store: Arc<FileStore>,
|
store: Arc<StorageBackend>,
|
||||||
bucket: &str,
|
bucket: &str,
|
||||||
request_id: &str,
|
request_id: &str,
|
||||||
) -> Result<Response<BoxBody>> {
|
) -> Result<Response<BoxBody>> {
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ use tokio::fs;
|
|||||||
use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt, BufWriter};
|
use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt, BufWriter};
|
||||||
use uuid::Uuid;
|
use uuid::Uuid;
|
||||||
|
|
||||||
|
use crate::cluster::coordinator::DistributedStore;
|
||||||
use crate::error::StorageError;
|
use crate::error::StorageError;
|
||||||
|
|
||||||
// ============================
|
// ============================
|
||||||
@@ -795,6 +796,196 @@ impl FileStore {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ============================
|
||||||
|
// StorageBackend enum
|
||||||
|
// ============================
|
||||||
|
|
||||||
|
/// Unified storage backend that dispatches to either standalone (FileStore)
|
||||||
|
/// or clustered (DistributedStore) storage.
|
||||||
|
pub enum StorageBackend {
|
||||||
|
Standalone(FileStore),
|
||||||
|
Clustered(DistributedStore),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl StorageBackend {
|
||||||
|
pub fn policies_dir(&self) -> std::path::PathBuf {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.policies_dir(),
|
||||||
|
StorageBackend::Clustered(_) => PathBuf::from(".policies"), // TODO: proper policies in cluster mode
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn initialize(&self) -> Result<()> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.initialize().await,
|
||||||
|
StorageBackend::Clustered(_) => Ok(()), // Cluster init happens separately
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn reset(&self) -> Result<()> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.reset().await,
|
||||||
|
StorageBackend::Clustered(_) => Ok(()), // TODO: cluster reset
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn list_buckets(&self) -> Result<Vec<BucketInfo>> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.list_buckets().await,
|
||||||
|
StorageBackend::Clustered(ds) => ds.list_buckets().await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn bucket_exists(&self, bucket: &str) -> bool {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.bucket_exists(bucket).await,
|
||||||
|
StorageBackend::Clustered(ds) => ds.bucket_exists(bucket).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn create_bucket(&self, bucket: &str) -> Result<()> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.create_bucket(bucket).await,
|
||||||
|
StorageBackend::Clustered(ds) => ds.create_bucket(bucket).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn delete_bucket(&self, bucket: &str) -> Result<()> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.delete_bucket(bucket).await,
|
||||||
|
StorageBackend::Clustered(ds) => ds.delete_bucket(bucket).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn put_object(
|
||||||
|
&self,
|
||||||
|
bucket: &str,
|
||||||
|
key: &str,
|
||||||
|
body: Incoming,
|
||||||
|
metadata: HashMap<String, String>,
|
||||||
|
) -> Result<PutResult> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.put_object(bucket, key, body, metadata).await,
|
||||||
|
StorageBackend::Clustered(ds) => ds.put_object(bucket, key, body, metadata).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_object(
|
||||||
|
&self,
|
||||||
|
bucket: &str,
|
||||||
|
key: &str,
|
||||||
|
range: Option<(u64, u64)>,
|
||||||
|
) -> Result<GetResult> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.get_object(bucket, key, range).await,
|
||||||
|
StorageBackend::Clustered(ds) => ds.get_object(bucket, key, range).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn head_object(&self, bucket: &str, key: &str) -> Result<HeadResult> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.head_object(bucket, key).await,
|
||||||
|
StorageBackend::Clustered(ds) => ds.head_object(bucket, key).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn delete_object(&self, bucket: &str, key: &str) -> Result<()> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.delete_object(bucket, key).await,
|
||||||
|
StorageBackend::Clustered(ds) => ds.delete_object(bucket, key).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn copy_object(
|
||||||
|
&self,
|
||||||
|
src_bucket: &str,
|
||||||
|
src_key: &str,
|
||||||
|
dest_bucket: &str,
|
||||||
|
dest_key: &str,
|
||||||
|
metadata_directive: &str,
|
||||||
|
new_metadata: Option<HashMap<String, String>>,
|
||||||
|
) -> Result<CopyResult> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => {
|
||||||
|
fs.copy_object(src_bucket, src_key, dest_bucket, dest_key, metadata_directive, new_metadata).await
|
||||||
|
}
|
||||||
|
StorageBackend::Clustered(ds) => {
|
||||||
|
ds.copy_object(src_bucket, src_key, dest_bucket, dest_key, metadata_directive, new_metadata).await
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn list_objects(
|
||||||
|
&self,
|
||||||
|
bucket: &str,
|
||||||
|
prefix: &str,
|
||||||
|
delimiter: &str,
|
||||||
|
max_keys: usize,
|
||||||
|
continuation_token: Option<&str>,
|
||||||
|
) -> Result<ListObjectsResult> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => {
|
||||||
|
fs.list_objects(bucket, prefix, delimiter, max_keys, continuation_token).await
|
||||||
|
}
|
||||||
|
StorageBackend::Clustered(ds) => {
|
||||||
|
ds.list_objects(bucket, prefix, delimiter, max_keys, continuation_token).await
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn initiate_multipart(
|
||||||
|
&self,
|
||||||
|
bucket: &str,
|
||||||
|
key: &str,
|
||||||
|
metadata: HashMap<String, String>,
|
||||||
|
) -> Result<String> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.initiate_multipart(bucket, key, metadata).await,
|
||||||
|
StorageBackend::Clustered(ds) => ds.initiate_multipart(bucket, key, metadata).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn upload_part(
|
||||||
|
&self,
|
||||||
|
upload_id: &str,
|
||||||
|
part_number: u32,
|
||||||
|
body: Incoming,
|
||||||
|
) -> Result<(String, u64)> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.upload_part(upload_id, part_number, body).await,
|
||||||
|
StorageBackend::Clustered(ds) => ds.upload_part(upload_id, part_number, body).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn complete_multipart(
|
||||||
|
&self,
|
||||||
|
upload_id: &str,
|
||||||
|
parts: &[(u32, String)],
|
||||||
|
) -> Result<CompleteMultipartResult> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.complete_multipart(upload_id, parts).await,
|
||||||
|
StorageBackend::Clustered(ds) => ds.complete_multipart(upload_id, parts).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn abort_multipart(&self, upload_id: &str) -> Result<()> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.abort_multipart(upload_id).await,
|
||||||
|
StorageBackend::Clustered(ds) => ds.abort_multipart(upload_id).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn list_multipart_uploads(
|
||||||
|
&self,
|
||||||
|
bucket: &str,
|
||||||
|
) -> Result<Vec<MultipartUploadInfo>> {
|
||||||
|
match self {
|
||||||
|
StorageBackend::Standalone(fs) => fs.list_multipart_uploads(bucket).await,
|
||||||
|
StorageBackend::Clustered(ds) => ds.list_multipart_uploads(bucket).await,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ============================
|
// ============================
|
||||||
// Key encoding (identity on Linux)
|
// Key encoding (identity on Linux)
|
||||||
// ============================
|
// ============================
|
||||||
|
|||||||
@@ -3,6 +3,6 @@
|
|||||||
*/
|
*/
|
||||||
export const commitinfo = {
|
export const commitinfo = {
|
||||||
name: '@push.rocks/smartstorage',
|
name: '@push.rocks/smartstorage',
|
||||||
version: '6.0.1',
|
version: '6.1.0',
|
||||||
description: 'A Node.js TypeScript package to create a local S3-compatible storage server using mapped local directories for development and testing purposes.'
|
description: 'A Node.js TypeScript package to create a local S3-compatible storage server using mapped local directories for development and testing purposes.'
|
||||||
}
|
}
|
||||||
|
|||||||
34
ts/index.ts
34
ts/index.ts
@@ -69,6 +69,36 @@ export interface IStorageConfig {
|
|||||||
cleanSlate?: boolean;
|
cleanSlate?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Erasure coding configuration
|
||||||
|
*/
|
||||||
|
export interface IErasureConfig {
|
||||||
|
dataShards?: number;
|
||||||
|
parityShards?: number;
|
||||||
|
chunkSizeBytes?: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Drive configuration for multi-drive support
|
||||||
|
*/
|
||||||
|
export interface IDriveConfig {
|
||||||
|
paths: string[];
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Cluster configuration for distributed mode
|
||||||
|
*/
|
||||||
|
export interface IClusterConfig {
|
||||||
|
enabled: boolean;
|
||||||
|
nodeId?: string;
|
||||||
|
quicPort?: number;
|
||||||
|
seedNodes?: string[];
|
||||||
|
erasure?: IErasureConfig;
|
||||||
|
drives?: IDriveConfig;
|
||||||
|
heartbeatIntervalMs?: number;
|
||||||
|
heartbeatTimeoutMs?: number;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Complete smartstorage configuration
|
* Complete smartstorage configuration
|
||||||
*/
|
*/
|
||||||
@@ -80,6 +110,7 @@ export interface ISmartStorageConfig {
|
|||||||
logging?: ILoggingConfig;
|
logging?: ILoggingConfig;
|
||||||
limits?: ILimitsConfig;
|
limits?: ILimitsConfig;
|
||||||
multipart?: IMultipartConfig;
|
multipart?: IMultipartConfig;
|
||||||
|
cluster?: IClusterConfig;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -163,7 +194,8 @@ function mergeConfig(userConfig: ISmartStorageConfig): Required<ISmartStorageCon
|
|||||||
...DEFAULT_CONFIG.multipart!,
|
...DEFAULT_CONFIG.multipart!,
|
||||||
...(userConfig.multipart || {}),
|
...(userConfig.multipart || {}),
|
||||||
},
|
},
|
||||||
};
|
...(userConfig.cluster ? { cluster: userConfig.cluster } : {}),
|
||||||
|
} as Required<ISmartStorageConfig>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Reference in New Issue
Block a user