/// Retention-based pruning and garbage collection. /// /// Prune determines which snapshots to keep based on retention policies, /// deletes expired snapshots, and removes pack files where ALL chunks /// are unreferenced (whole-pack GC only). use std::collections::HashSet; use std::path::Path; use serde::{Deserialize, Serialize}; use crate::error::ArchiveError; use crate::global_index::IndexEntry; use crate::pack_reader; use crate::pack_writer::PackWriter; use crate::hasher; use crate::repository::Repository; use crate::snapshot; #[derive(Debug, Clone, Serialize, Deserialize, Default)] #[serde(rename_all = "camelCase")] pub struct RetentionPolicy { #[serde(default)] pub keep_last: Option, #[serde(default)] pub keep_days: Option, #[serde(default)] pub keep_weeks: Option, #[serde(default)] pub keep_months: Option, } #[derive(Debug, Serialize)] #[serde(rename_all = "camelCase")] pub struct PruneResult { pub removed_snapshots: u32, pub removed_packs: u32, pub rewritten_packs: u32, pub freed_bytes: u64, pub dry_run: bool, } pub async fn prune( repo: &mut Repository, retention: &RetentionPolicy, dry_run: bool, ) -> Result { // Acquire lock if !dry_run { repo.acquire_lock("prune").await?; } let result = do_prune(repo, retention, dry_run).await; if !dry_run { repo.release_lock().await?; } result } async fn do_prune( repo: &mut Repository, retention: &RetentionPolicy, dry_run: bool, ) -> Result { let mut result = PruneResult { removed_snapshots: 0, removed_packs: 0, rewritten_packs: 0, freed_bytes: 0, dry_run, }; // Load all snapshots let mut snapshots = snapshot::list_snapshots(&repo.path, None).await?; // Sort by creation time (newest first) snapshots.sort_by(|a, b| b.created_at.cmp(&a.created_at)); // Determine which snapshots to keep let keep_ids = determine_kept_snapshots(&snapshots, retention); // Phase 1: Remove expired snapshots let to_remove: Vec<_> = snapshots.iter() .filter(|s| !keep_ids.contains(&s.id)) .collect(); result.removed_snapshots = to_remove.len() as u32; if !dry_run { for snap in &to_remove { snapshot::delete_snapshot(&repo.path, &snap.id).await?; tracing::info!("Removed snapshot {}", snap.id); } } // Phase 2: Find and remove unreferenced packs // Reload remaining snapshots let remaining_snapshots = if dry_run { snapshots.iter() .filter(|s| keep_ids.contains(&s.id)) .cloned() .collect::>() } else { snapshot::list_snapshots(&repo.path, None).await? }; let referenced_chunks = snapshot::referenced_chunks(&remaining_snapshots); let referenced_packs = find_referenced_packs(repo, &referenced_chunks); // Find all pack IDs on disk let all_packs = find_all_pack_ids(&repo.path).await?; for pack_id in &all_packs { if !referenced_packs.contains(pack_id) { // This pack is fully unreferenced — delete it let shard = &pack_id[..std::cmp::min(2, pack_id.len())]; let pack_path = Path::new(&repo.path) .join("packs").join("data").join(shard) .join(format!("{}.pack", pack_id)); let idx_path = Path::new(&repo.path) .join("packs").join("data").join(shard) .join(format!("{}.idx", pack_id)); if pack_path.exists() { if let Ok(meta) = tokio::fs::metadata(&pack_path).await { result.freed_bytes += meta.len(); } } if idx_path.exists() { if let Ok(meta) = tokio::fs::metadata(&idx_path).await { result.freed_bytes += meta.len(); } } if !dry_run { let _ = tokio::fs::remove_file(&pack_path).await; let _ = tokio::fs::remove_file(&idx_path).await; // Remove entries from global index repo.index.remove_pack_entries(pack_id); tracing::info!("Removed pack {}", pack_id); } result.removed_packs += 1; } } // Phase 3: Rewrite partially-referenced packs to reclaim wasted space if !dry_run { rewrite_partial_packs(repo, &referenced_chunks, &mut result).await?; } // Compact index after pruning if !dry_run && (result.removed_packs > 0 || result.rewritten_packs > 0) { repo.index.compact(&repo.path).await?; } tracing::info!( "Prune {}: removed {} snapshots, {} packs, rewrote {} packs, freed {} bytes", if dry_run { "(dry run)" } else { "complete" }, result.removed_snapshots, result.removed_packs, result.rewritten_packs, result.freed_bytes ); Ok(result) } /// Rewrite packs that contain a mix of referenced and unreferenced chunks. /// Only rewrites packs where >25% of data is unreferenced (to avoid churn). async fn rewrite_partial_packs( repo: &mut Repository, referenced_chunks: &HashSet, result: &mut PruneResult, ) -> Result<(), ArchiveError> { let all_packs = find_all_pack_ids(&repo.path).await?; for pack_id in &all_packs { let shard = &pack_id[..std::cmp::min(2, pack_id.len())]; let idx_path = Path::new(&repo.path) .join("packs").join("data").join(shard) .join(format!("{}.idx", pack_id)); let pack_path = Path::new(&repo.path) .join("packs").join("data").join(shard) .join(format!("{}.pack", pack_id)); if !idx_path.exists() || !pack_path.exists() { continue; } let entries = match pack_reader::load_idx(&idx_path).await { Ok(e) => e, Err(_) => continue, }; // Count referenced vs unreferenced chunks in this pack let mut referenced_count = 0usize; let mut unreferenced_bytes = 0u64; let mut total_bytes = 0u64; for entry in &entries { let hash_hex = hasher::hash_to_hex(&entry.content_hash); total_bytes += entry.compressed_size as u64; if referenced_chunks.contains(&hash_hex) { referenced_count += 1; } else { unreferenced_bytes += entry.compressed_size as u64; } } // Skip if all chunks are referenced (nothing to reclaim) if referenced_count == entries.len() { continue; } // Skip if all chunks are unreferenced (already handled by Phase 2) if referenced_count == 0 { continue; } // Skip if waste is less than 25% (not worth the I/O) if total_bytes > 0 && (unreferenced_bytes * 100 / total_bytes) < 25 { continue; } tracing::info!( "Rewriting pack {} ({}/{} chunks referenced, {} bytes reclaimable)", pack_id, referenced_count, entries.len(), unreferenced_bytes ); // Read referenced chunks and write them to a new pack let mut new_pack_writer = PackWriter::new(repo.config.pack_target_size); for entry in &entries { let hash_hex = hasher::hash_to_hex(&entry.content_hash); if !referenced_chunks.contains(&hash_hex) { continue; // Skip unreferenced chunks } // Read chunk data from old pack let chunk_data = pack_reader::read_chunk( &pack_path, entry.offset, entry.compressed_size, ).await?; new_pack_writer.add_chunk( entry.content_hash, &chunk_data, entry.plaintext_size, entry.nonce, entry.flags, ); } // Finalize the new pack if !new_pack_writer.is_empty() { let new_pack_info = new_pack_writer.finalize(&repo.path).await?; // Update global index: point referenced chunks to the new pack for entry in &entries { let hash_hex = hasher::hash_to_hex(&entry.content_hash); if referenced_chunks.contains(&hash_hex) { let nonce = if entry.nonce != [0u8; 12] { Some(hex::encode(entry.nonce)) } else { None }; repo.index.add_entry(hash_hex, IndexEntry { pack_id: new_pack_info.pack_id.clone(), offset: entry.offset, // Note: offset in the new pack may differ compressed_size: entry.compressed_size, plaintext_size: entry.plaintext_size, nonce, flags: entry.flags, }); } } } // Delete old pack + idx let old_size = tokio::fs::metadata(&pack_path).await .map(|m| m.len()).unwrap_or(0); let old_idx_size = tokio::fs::metadata(&idx_path).await .map(|m| m.len()).unwrap_or(0); let _ = tokio::fs::remove_file(&pack_path).await; let _ = tokio::fs::remove_file(&idx_path).await; // Remove old pack entries from index repo.index.remove_pack_entries(pack_id); result.freed_bytes += unreferenced_bytes; result.rewritten_packs += 1; tracing::info!( "Rewrote pack {} -> saved {} bytes", pack_id, unreferenced_bytes ); } Ok(()) } /// Determine which snapshot IDs to keep based on retention policy. fn determine_kept_snapshots( snapshots: &[snapshot::Snapshot], retention: &RetentionPolicy, ) -> HashSet { let mut keep = HashSet::new(); // keepLast: keep the N most recent if let Some(n) = retention.keep_last { for snap in snapshots.iter().take(n as usize) { keep.insert(snap.id.clone()); } } let now = chrono::Utc::now(); // keepDays: keep one per day for the last N days if let Some(days) = retention.keep_days { let mut seen_days = HashSet::new(); for snap in snapshots { if let Ok(dt) = chrono::DateTime::parse_from_rfc3339(&snap.created_at) { let age = now.signed_duration_since(dt); if age.num_days() <= days as i64 { let day_key = dt.format("%Y-%m-%d").to_string(); if seen_days.insert(day_key) { keep.insert(snap.id.clone()); } } } } } // keepWeeks: keep one per week for the last N weeks if let Some(weeks) = retention.keep_weeks { let mut seen_weeks = HashSet::new(); for snap in snapshots { if let Ok(dt) = chrono::DateTime::parse_from_rfc3339(&snap.created_at) { let age = now.signed_duration_since(dt); if age.num_weeks() <= weeks as i64 { let week_key = dt.format("%Y-W%W").to_string(); if seen_weeks.insert(week_key) { keep.insert(snap.id.clone()); } } } } } // keepMonths: keep one per month for the last N months if let Some(months) = retention.keep_months { let mut seen_months = HashSet::new(); for snap in snapshots { if let Ok(dt) = chrono::DateTime::parse_from_rfc3339(&snap.created_at) { let age = now.signed_duration_since(dt); if age.num_days() <= (months as i64) * 31 { let month_key = dt.format("%Y-%m").to_string(); if seen_months.insert(month_key) { keep.insert(snap.id.clone()); } } } } } // If no retention policy is specified, keep everything if retention.keep_last.is_none() && retention.keep_days.is_none() && retention.keep_weeks.is_none() && retention.keep_months.is_none() { for snap in snapshots { keep.insert(snap.id.clone()); } } keep } /// Find pack IDs that contain at least one referenced chunk. fn find_referenced_packs( repo: &Repository, referenced_chunks: &HashSet, ) -> HashSet { let mut packs = HashSet::new(); for hash_hex in referenced_chunks { if let Some(entry) = repo.index.get(hash_hex) { packs.insert(entry.pack_id.clone()); } } packs } /// Find all pack IDs on disk. async fn find_all_pack_ids(repo_path: &str) -> Result, ArchiveError> { let packs_dir = Path::new(repo_path).join("packs").join("data"); if !packs_dir.exists() { return Ok(Vec::new()); } let mut pack_ids = Vec::new(); let mut stack = vec![packs_dir]; while let Some(current) = stack.pop() { let mut entries = tokio::fs::read_dir(¤t).await?; while let Some(entry) = entries.next_entry().await? { let path = entry.path(); if path.is_dir() { stack.push(path); } else if path.extension().and_then(|e| e.to_str()) == Some("pack") { if let Some(id) = path.file_stem().and_then(|s| s.to_str()) { pack_ids.push(id.to_string()); } } } } Ok(pack_ids) }