141 lines
4.8 KiB
Rust
141 lines
4.8 KiB
Rust
use xxhash_rust::xxh64::xxh64;
|
|
|
|
/// Determines which erasure set an object belongs to, based on consistent hashing.
|
|
///
|
|
/// Uses xxhash64 of "{bucket}/{key}" to deterministically map objects to erasure sets.
|
|
/// This is stateless — any node can independently compute the placement.
|
|
pub fn erasure_set_for_object(bucket: &str, key: &str, num_erasure_sets: u32) -> u32 {
|
|
if num_erasure_sets == 0 {
|
|
return 0;
|
|
}
|
|
let hash_input = format!("{}/{}", bucket, key);
|
|
let hash = xxh64(hash_input.as_bytes(), 0);
|
|
(hash % num_erasure_sets as u64) as u32
|
|
}
|
|
|
|
/// Represents a drive location within the cluster topology.
///
/// A location is the pair of the hosting node's identifier and the drive's
/// index on that node. Equality and hashing are derived field-by-field, so
/// locations can be compared directly and used as `HashSet`/`HashMap` keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct DriveLocation {
    /// Identifier of the node that hosts the drive.
    pub node_id: String,
    /// Index of the drive within its node.
    pub drive_index: u32,
}
|
|
|
|
/// An erasure set: a fixed group of drives that together store one complete
|
|
/// set of shards for any object placed on them.
|
|
#[derive(Debug, Clone)]
|
|
pub struct ErasureSet {
|
|
pub set_id: u32,
|
|
/// Ordered drives: index = shard_index
|
|
pub drives: Vec<DriveLocation>,
|
|
}
|
|
|
|
/// Form erasure sets from the available drives across all nodes.
|
|
///
|
|
/// Interleaves drives from different nodes for fault isolation:
|
|
/// e.g., with 3 nodes x 4 drives and total_shards=6:
|
|
/// Set 0: N0-D0, N1-D0, N2-D0, N0-D1, N1-D1, N2-D1
|
|
/// Set 1: N0-D2, N1-D2, N2-D2, N0-D3, N1-D3, N2-D3
|
|
pub fn form_erasure_sets(
|
|
nodes: &[(String, u32)], // (node_id, drive_count)
|
|
total_shards: usize,
|
|
) -> Vec<ErasureSet> {
|
|
// Collect all drives as (node_id, drive_index), interleaved by node
|
|
let max_drives = nodes.iter().map(|(_, count)| *count).max().unwrap_or(0) as usize;
|
|
let mut all_drives: Vec<DriveLocation> = Vec::new();
|
|
|
|
for drive_idx in 0..max_drives {
|
|
for (node_id, drive_count) in nodes {
|
|
if (drive_idx as u32) < *drive_count {
|
|
all_drives.push(DriveLocation {
|
|
node_id: node_id.clone(),
|
|
drive_index: drive_idx as u32,
|
|
});
|
|
}
|
|
}
|
|
}
|
|
|
|
// Form sets of total_shards drives each
|
|
let num_sets = all_drives.len() / total_shards;
|
|
let mut sets = Vec::with_capacity(num_sets);
|
|
|
|
for set_idx in 0..num_sets {
|
|
let start = set_idx * total_shards;
|
|
let end = start + total_shards;
|
|
let drives = all_drives[start..end].to_vec();
|
|
|
|
sets.push(ErasureSet {
|
|
set_id: set_idx as u32,
|
|
drives,
|
|
});
|
|
}
|
|
|
|
sets
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Placing the same bucket/key pair twice must give the same set.
    #[test]
    fn test_erasure_set_assignment_deterministic() {
        let first = erasure_set_for_object("mybucket", "mykey", 4);
        let second = erasure_set_for_object("mybucket", "mykey", 4);
        assert_eq!(first, second);
    }

    /// 1000 distinct keys over 4 sets: every index is in range and no set
    /// receives 100 or fewer objects.
    #[test]
    fn test_erasure_set_distribution() {
        const SET_COUNT: u32 = 4;
        let mut per_set = vec![0u32; SET_COUNT as usize];

        (0..1000)
            .map(|i| format!("key-{}", i))
            .map(|key| erasure_set_for_object("bucket", &key, SET_COUNT))
            .for_each(|set| {
                assert!(set < SET_COUNT);
                per_set[set as usize] += 1;
            });

        // Every set must receive a meaningful share of the objects.
        per_set
            .iter()
            .for_each(|count| assert!(*count > 100, "Expected >100, got {}", count));
    }

    /// 3 nodes with 4 drives each and 6 shards per set give 2 interleaved sets.
    #[test]
    fn test_form_erasure_sets_3x4() {
        let nodes: Vec<(String, u32)> = ["node1", "node2", "node3"]
            .iter()
            .map(|name| (name.to_string(), 4))
            .collect();

        let sets = form_erasure_sets(&nodes, 6);
        assert_eq!(sets.len(), 2);

        // Both sets cycle through the nodes in the same order...
        let expected_nodes = ["node1", "node2", "node3", "node1", "node2", "node3"];
        // ...but each set draws from its own pair of drive indices.
        let expected_indices: [[u32; 6]; 2] = [[0, 0, 0, 1, 1, 1], [2, 2, 2, 3, 3, 3]];

        for (set, indices) in sets.iter().zip(expected_indices.iter()) {
            let node_ids: Vec<&str> = set.drives.iter().map(|d| d.node_id.as_str()).collect();
            assert_eq!(node_ids, expected_nodes);

            let drive_indices: Vec<u32> = set.drives.iter().map(|d| d.drive_index).collect();
            assert_eq!(drive_indices, *indices);
        }
    }

    /// 2 nodes with 3 drives each and 4 shards per set give 1 set; the 2
    /// remaining drives are left unassigned.
    #[test]
    fn test_form_erasure_sets_remainder() {
        let sets = form_erasure_sets(&[("a".to_string(), 3), ("b".to_string(), 3)], 4);
        let set_sizes: Vec<usize> = sets.iter().map(|set| set.drives.len()).collect();
        assert_eq!(set_sizes, vec![4]);
    }
}
|