use xxhash_rust::xxh64::xxh64; /// Determines which erasure set an object belongs to, based on consistent hashing. /// /// Uses xxhash64 of "{bucket}/{key}" to deterministically map objects to erasure sets. /// This is stateless — any node can independently compute the placement. pub fn erasure_set_for_object(bucket: &str, key: &str, num_erasure_sets: u32) -> u32 { if num_erasure_sets == 0 { return 0; } let hash_input = format!("{}/{}", bucket, key); let hash = xxh64(hash_input.as_bytes(), 0); (hash % num_erasure_sets as u64) as u32 } /// Represents a drive location within the cluster topology. #[derive(Debug, Clone)] pub struct DriveLocation { pub node_id: String, pub drive_index: u32, } /// An erasure set: a fixed group of drives that together store one complete /// set of shards for any object placed on them. #[derive(Debug, Clone)] pub struct ErasureSet { pub set_id: u32, /// Ordered drives: index = shard_index pub drives: Vec, } /// Form erasure sets from the available drives across all nodes. /// /// Interleaves drives from different nodes for fault isolation: /// e.g., with 3 nodes x 4 drives and total_shards=6: /// Set 0: N0-D0, N1-D0, N2-D0, N0-D1, N1-D1, N2-D1 /// Set 1: N0-D2, N1-D2, N2-D2, N0-D3, N1-D3, N2-D3 pub fn form_erasure_sets( nodes: &[(String, u32)], // (node_id, drive_count) total_shards: usize, ) -> Vec { // Collect all drives as (node_id, drive_index), interleaved by node let max_drives = nodes.iter().map(|(_, count)| *count).max().unwrap_or(0) as usize; let mut all_drives: Vec = Vec::new(); for drive_idx in 0..max_drives { for (node_id, drive_count) in nodes { if (drive_idx as u32) < *drive_count { all_drives.push(DriveLocation { node_id: node_id.clone(), drive_index: drive_idx as u32, }); } } } // Form sets of total_shards drives each let num_sets = all_drives.len() / total_shards; let mut sets = Vec::with_capacity(num_sets); for set_idx in 0..num_sets { let start = set_idx * total_shards; let end = start + total_shards; let drives = all_drives[start..end].to_vec(); sets.push(ErasureSet { set_id: set_idx as u32, drives, }); } sets } #[cfg(test)] mod tests { use super::*; #[test] fn test_erasure_set_assignment_deterministic() { let set_a = erasure_set_for_object("mybucket", "mykey", 4); let set_b = erasure_set_for_object("mybucket", "mykey", 4); assert_eq!(set_a, set_b); } #[test] fn test_erasure_set_distribution() { // Check that objects are distributed across sets let num_sets = 4u32; let mut counts = vec![0u32; num_sets as usize]; for i in 0..1000 { let key = format!("key-{}", i); let set = erasure_set_for_object("bucket", &key, num_sets); assert!(set < num_sets); counts[set as usize] += 1; } // Each set should have some objects (not all in one set) for count in &counts { assert!(*count > 100, "Expected >100, got {}", count); } } #[test] fn test_form_erasure_sets_3x4() { // 3 nodes, 4 drives each, 6 shards per set => 2 sets let nodes = vec![ ("node1".to_string(), 4), ("node2".to_string(), 4), ("node3".to_string(), 4), ]; let sets = form_erasure_sets(&nodes, 6); assert_eq!(sets.len(), 2); // Set 0 should interleave across nodes let set0_nodes: Vec<&str> = sets[0].drives.iter().map(|d| d.node_id.as_str()).collect(); assert_eq!(set0_nodes, vec!["node1", "node2", "node3", "node1", "node2", "node3"]); // Set 1 should also interleave let set1_nodes: Vec<&str> = sets[1].drives.iter().map(|d| d.node_id.as_str()).collect(); assert_eq!(set1_nodes, vec!["node1", "node2", "node3", "node1", "node2", "node3"]); // Drive indices should be different between sets let set0_drives: Vec = sets[0].drives.iter().map(|d| d.drive_index).collect(); let set1_drives: Vec = sets[1].drives.iter().map(|d| d.drive_index).collect(); assert_eq!(set0_drives, vec![0, 0, 0, 1, 1, 1]); assert_eq!(set1_drives, vec![2, 2, 2, 3, 3, 3]); } #[test] fn test_form_erasure_sets_remainder() { // 2 nodes, 3 drives each, 4 shards => 1 set (2 drives left over) let nodes = vec![ ("a".to_string(), 3), ("b".to_string(), 3), ]; let sets = form_erasure_sets(&nodes, 4); assert_eq!(sets.len(), 1); assert_eq!(sets[0].drives.len(), 4); } }