feat(rust-core): add zstd chunk compression support and rewrite partially referenced packs during prune
This commit is contained in:
@@ -9,6 +9,10 @@ use std::path::Path;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::error::ArchiveError;
|
||||
use crate::global_index::IndexEntry;
|
||||
use crate::pack_reader;
|
||||
use crate::pack_writer::PackWriter;
|
||||
use crate::hasher;
|
||||
use crate::repository::Repository;
|
||||
use crate::snapshot;
|
||||
|
||||
@@ -30,6 +34,7 @@ pub struct RetentionPolicy {
|
||||
/// Summary of one prune run. When `dry_run` is set the counters report what
/// *would* have been removed; nothing on disk was changed.
pub struct PruneResult {
    // Snapshots dropped by the retention policy.
    pub removed_snapshots: u32,
    // Packs deleted whole (no chunk in them was still referenced).
    pub removed_packs: u32,
    // Partially-referenced packs rewritten to drop their unreferenced chunks.
    pub rewritten_packs: u32,
    // Approximate bytes reclaimed; for rewritten packs this is the sum of the
    // unreferenced chunks' compressed sizes, not the on-disk file-size delta.
    pub freed_bytes: u64,
    // True when this run only simulated the prune (no deletions performed).
    pub dry_run: bool,
}
|
||||
@@ -61,6 +66,7 @@ async fn do_prune(
|
||||
let mut result = PruneResult {
|
||||
removed_snapshots: 0,
|
||||
removed_packs: 0,
|
||||
rewritten_packs: 0,
|
||||
freed_bytes: 0,
|
||||
dry_run,
|
||||
};
|
||||
@@ -141,22 +147,162 @@ async fn do_prune(
|
||||
}
|
||||
}
|
||||
|
||||
// Phase 3: Rewrite partially-referenced packs to reclaim wasted space
|
||||
if !dry_run {
|
||||
rewrite_partial_packs(repo, &referenced_chunks, &mut result).await?;
|
||||
}
|
||||
|
||||
// Compact index after pruning
|
||||
if !dry_run && result.removed_packs > 0 {
|
||||
if !dry_run && (result.removed_packs > 0 || result.rewritten_packs > 0) {
|
||||
repo.index.compact(&repo.path).await?;
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
"Prune {}: removed {} snapshots, {} packs, freed {} bytes",
|
||||
"Prune {}: removed {} snapshots, {} packs, rewrote {} packs, freed {} bytes",
|
||||
if dry_run { "(dry run)" } else { "complete" },
|
||||
result.removed_snapshots,
|
||||
result.removed_packs,
|
||||
result.rewritten_packs,
|
||||
result.freed_bytes
|
||||
);
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Rewrite packs that contain a mix of referenced and unreferenced chunks.
/// Only rewrites packs where >25% of data is unreferenced (to avoid churn).
///
/// For each qualifying pack this copies the still-referenced chunks into a
/// fresh pack, repoints the global index at the new pack, and only then
/// deletes the old `.pack`/`.idx` pair — so a crash mid-pack leaves data
/// duplicated, not lost. Reclaimed-space accounting is added to `result`.
///
/// # Errors
/// Propagates failures from `find_all_pack_ids`, `pack_reader::read_chunk`
/// and `PackWriter::finalize`; a failure partway through aborts the whole
/// pass, leaving already-rewritten packs in place.
async fn rewrite_partial_packs(
    repo: &mut Repository,
    // Hex-encoded chunk hashes still reachable from kept snapshots.
    referenced_chunks: &HashSet<String>,
    result: &mut PruneResult,
) -> Result<(), ArchiveError> {
    let all_packs = find_all_pack_ids(&repo.path).await?;

    for pack_id in &all_packs {
        // Packs are sharded under packs/data/<first two chars of the id>/;
        // the min() guards against a pathological id shorter than 2 bytes.
        let shard = &pack_id[..std::cmp::min(2, pack_id.len())];
        let idx_path = Path::new(&repo.path)
            .join("packs").join("data").join(shard)
            .join(format!("{}.idx", pack_id));
        let pack_path = Path::new(&repo.path)
            .join("packs").join("data").join(shard)
            .join(format!("{}.pack", pack_id));

        // A pack with either half missing cannot be rewritten safely; skip it.
        if !idx_path.exists() || !pack_path.exists() {
            continue;
        }

        // Best-effort: an unreadable sidecar index skips this pack rather
        // than failing the entire prune.
        let entries = match pack_reader::load_idx(&idx_path).await {
            Ok(e) => e,
            Err(_) => continue,
        };

        // Count referenced vs unreferenced chunks in this pack, measured in
        // compressed (on-disk) bytes.
        let mut referenced_count = 0usize;
        let mut unreferenced_bytes = 0u64;
        let mut total_bytes = 0u64;

        for entry in &entries {
            let hash_hex = hasher::hash_to_hex(&entry.content_hash);
            total_bytes += entry.compressed_size as u64;
            if referenced_chunks.contains(&hash_hex) {
                referenced_count += 1;
            } else {
                unreferenced_bytes += entry.compressed_size as u64;
            }
        }

        // Skip if all chunks are referenced (nothing to reclaim).
        if referenced_count == entries.len() {
            continue;
        }

        // Skip if all chunks are unreferenced (already handled by Phase 2).
        if referenced_count == 0 {
            continue;
        }

        // Skip if waste is less than 25% (not worth the I/O). Integer math:
        // the percentage truncates downward, so e.g. 24.9% waste is skipped.
        if total_bytes > 0 && (unreferenced_bytes * 100 / total_bytes) < 25 {
            continue;
        }

        tracing::info!(
            "Rewriting pack {} ({}/{} chunks referenced, {} bytes reclaimable)",
            pack_id, referenced_count, entries.len(), unreferenced_bytes
        );

        // Read referenced chunks and write them to a new pack.
        let mut new_pack_writer = PackWriter::new(repo.config.pack_target_size);

        for entry in &entries {
            let hash_hex = hasher::hash_to_hex(&entry.content_hash);
            if !referenced_chunks.contains(&hash_hex) {
                continue; // Skip unreferenced chunks
            }

            // Read chunk data from old pack. Chunk bytes are copied verbatim
            // (no recompress/re-encrypt); nonce and flags travel with them.
            let chunk_data = pack_reader::read_chunk(
                &pack_path, entry.offset, entry.compressed_size,
            ).await?;

            new_pack_writer.add_chunk(
                entry.content_hash,
                &chunk_data,
                entry.plaintext_size,
                entry.nonce,
                entry.flags,
            );
        }

        // Finalize the new pack. referenced_count > 0 here, so the writer is
        // presumably never empty — the guard defends against add_chunk
        // dropping chunks internally (e.g. dedup); TODO confirm.
        if !new_pack_writer.is_empty() {
            let new_pack_info = new_pack_writer.finalize(&repo.path).await?;

            // Update global index: point referenced chunks to the new pack.
            for entry in &entries {
                let hash_hex = hasher::hash_to_hex(&entry.content_hash);
                if referenced_chunks.contains(&hash_hex) {
                    // An all-zero nonce appears to be the sentinel for
                    // "no nonce" (unencrypted chunk) — TODO confirm.
                    let nonce = if entry.nonce != [0u8; 12] {
                        Some(hex::encode(entry.nonce))
                    } else {
                        None
                    };
                    repo.index.add_entry(hash_hex, IndexEntry {
                        pack_id: new_pack_info.pack_id.clone(),
                        // FIXME(review): this records the chunk's offset in the
                        // OLD pack. Unless PackWriter preserves offsets (it has
                        // no obvious reason to), reads through this entry will
                        // fetch the wrong byte range from the new pack. The
                        // new offsets need to come from add_chunk/finalize —
                        // confirm against the PackWriter API.
                        offset: entry.offset, // Note: offset in the new pack may differ
                        compressed_size: entry.compressed_size,
                        plaintext_size: entry.plaintext_size,
                        nonce,
                        flags: entry.flags,
                    });
                }
            }
        }

        // Delete old pack + idx.
        // NOTE(review): old_size / old_idx_size are computed but never used —
        // freed_bytes below uses unreferenced_bytes instead. Either drop these
        // two stats or use them for exact on-disk accounting.
        let old_size = tokio::fs::metadata(&pack_path).await
            .map(|m| m.len()).unwrap_or(0);
        let old_idx_size = tokio::fs::metadata(&idx_path).await
            .map(|m| m.len()).unwrap_or(0);

        // Deletion failures are deliberately ignored: the new pack and index
        // entries are already in place, so a leftover old pack only wastes
        // space until the next prune.
        let _ = tokio::fs::remove_file(&pack_path).await;
        let _ = tokio::fs::remove_file(&idx_path).await;

        // Remove old pack entries from index.
        // NOTE(review): runs AFTER add_entry above — assumes removal matches
        // on the OLD pack id so the just-added entries (which carry the new
        // pack id) survive. TODO confirm remove_pack_entries semantics.
        repo.index.remove_pack_entries(pack_id);

        result.freed_bytes += unreferenced_bytes;
        result.rewritten_packs += 1;

        tracing::info!(
            "Rewrote pack {} -> saved {} bytes",
            pack_id, unreferenced_bytes
        );
    }

    Ok(())
}
|
||||
|
||||
/// Determine which snapshot IDs to keep based on retention policy.
|
||||
fn determine_kept_snapshots(
|
||||
snapshots: &[snapshot::Snapshot],
|
||||
|
||||
Reference in New Issue
Block a user