prune: Enhance treatment of duplicates

2024-12-23 00:07:25 +00:00 · 2021-02-19 16:57:51 +01:00 · 2021-02-19 16:57:51 +01:00 · 7478cbf70e
commit 7478cbf70e
parent 6cbeb4a9f9
2 changed files with 91 additions and 34 deletions
--- a/changelog/unreleased/issue-3114
+++ b/changelog/unreleased/issue-3114
@ -0,0 +1,12 @@
 Enhancement: Improve `prune` in presence of duplicate blobs
 Restic `prune` always used to repack all pack files containing duplicate
 blobs. This effectively removed all duplicates during prune. However, one 
 of the consequences was that all those pack files were downloaded and
 duplicate blobs did not contribute to the threshold for unused repository
 space.
 This is now changed and `prune` works nice and fast even if there are lots
 of duplicates.
 https://github.com/restic/restic/issues/3114
 https://github.com/restic/restic/pull/3290
--- a/cmd/restic/cmd_prune.go
+++ b/cmd/restic/cmd_prune.go
@ -195,13 +195,12 @@ func runPruneWithRepo(opts PruneOptions, gopts GlobalOptions, repo *repository.R
 }
 type packInfo struct {
-	usedBlobs      uint
+	usedBlobs    uint
-	unusedBlobs    uint
+	unusedBlobs  uint
-	duplicateBlobs uint
+	usedSize     uint64
-	usedSize       uint64
+	unusedSize   uint64
-	unusedSize     uint64
+	tpe          restic.BlobType
-	tpe            restic.BlobType
+	uncompressed bool
 	uncompressed   bool
 }
 type packInfoWithID struct {
@ -243,7 +242,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 	Verbosef("searching used packs...\n")
 	keepBlobs := restic.NewBlobSet()
-	duplicateBlobs := restic.NewBlobSet()
+	duplicateBlobs := make(map[restic.BlobHandle]uint8)
 	// iterate over all blobs in index to find out which blobs are duplicates
 	for blob := range repo.Index().Each(ctx) {
@ -256,7 +255,17 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 			stats.size.used += size
 			stats.blobs.used++
 		case keepBlobs.Has(bh): // duplicate blob
-			duplicateBlobs.Insert(bh)
+			count, ok := duplicateBlobs[bh]
 			if !ok {
 				count = 2 // this one is already the second blob!
 			} else {
 				count++
 				if count == 0 {
 					// catch uint8 overflow
 					panic("too many duplicates, prune can only handly up to 255!")
 				}
 			}
 			duplicateBlobs[bh] = count
 			stats.size.duplicate += size
 			stats.blobs.duplicate++
 		default:
@ -299,10 +308,9 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 		bh := blob.BlobHandle
 		size := uint64(blob.Length)
 		_, isDuplicate := duplicateBlobs[bh]
 		switch {
-		case duplicateBlobs.Has(bh): // duplicate blob
+		case isDuplicate: // duplicate blobs will be handled later
 			ip.usedSize += size
 			ip.duplicateBlobs++
 		case keepBlobs.Has(bh): // used blob, not duplicate
 			ip.usedSize += size
 			ip.usedBlobs++
@ -317,19 +325,52 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 		indexPack[blob.PackID] = ip
 	}
 	// if duplicate blobs exist, those will be set to either "used" or "unused":
 	// - mark only one occurrence of duplicate blobs as used
 	// - if there are already some used blobs in a pack, possibly mark duplicates in this pack as "used"
 	// - if there are no used blobs in a pack, possibly mark duplicates as "unused"
 	if len(duplicateBlobs) > 0 {
 		// iterate again over all blobs in index (this is pretty cheap, all in-mem)
 		for blob := range repo.Index().Each(ctx) {
 			bh := blob.BlobHandle
 			count, isDuplicate := duplicateBlobs[bh]
 			if !isDuplicate {
 				continue
 			}
 			ip := indexPack[blob.PackID]
 			size := uint64(blob.Length)
 			switch {
 			case count == 0:
 				// used duplicate exists ->  mark as unused
 				ip.unusedSize += size
 				ip.unusedBlobs++
 			case ip.usedBlobs > 0, count == 1:
 				// other used blobs in pack or "last" occurrence ->  mark as used
 				ip.usedSize += size
 				ip.usedBlobs++
 				// let other occurrences be marked as unused
 				duplicateBlobs[bh] = 0
 			default:
 				// mark as unused and decrease counter
 				ip.unusedSize += size
 				ip.unusedBlobs++
 				duplicateBlobs[bh] = count - 1
 			}
 			// update indexPack
 			indexPack[blob.PackID] = ip
 		}
 	}
 	Verbosef("collecting packs for deletion and repacking\n")
 	removePacksFirst := restic.NewIDSet()
 	removePacks := restic.NewIDSet()
 	repackPacks := restic.NewIDSet()
 	var repackCandidates []packInfoWithID
 	repackAllPacksWithDuplicates := true
 	keep := func(p packInfo) {
 		stats.packs.keep++
 		if p.duplicateBlobs > 0 {
 			repackAllPacksWithDuplicates = false
 		}
 	}
 	repoVersion := repo.Config().Version
@ -347,7 +388,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 		}
 		if p.unusedSize+p.usedSize != uint64(packSize) &&
-			!(p.usedBlobs == 0 && p.duplicateBlobs == 0) {
+			p.usedBlobs != 0 {
 			// Pack size does not fit and pack is needed => error
 			// If the pack is not needed, this is no error, the pack can
 			// and will be simply removed, see below.
@ -358,7 +399,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 		// statistics
 		switch {
-		case p.usedBlobs == 0 && p.duplicateBlobs == 0:
+		case p.usedBlobs == 0:
 			stats.packs.unused++
 		case p.unusedBlobs == 0:
 			stats.packs.used++
@ -377,7 +418,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 		// decide what to do
 		switch {
-		case p.usedBlobs == 0 && p.duplicateBlobs == 0:
+		case p.usedBlobs == 0:
 			// All blobs in pack are no longer used => remove pack!
 			removePacks.Insert(id)
 			stats.blobs.remove += p.unusedBlobs
@ -387,8 +428,8 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 			// if this is a data pack and --repack-cacheable-only is set => keep pack!
 			keep(p)
-		case p.unusedBlobs == 0 && p.duplicateBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress:
+		case p.unusedBlobs == 0 && p.tpe != restic.InvalidBlob && !mustCompress:
-			// All blobs in pack are used and not duplicates/mixed => keep pack!
+			// All blobs in pack are used and not mixed => keep pack!
 			keep(p)
 		default:
@ -410,7 +451,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 	// missing packs that are not needed can be ignored
 	ignorePacks := restic.NewIDSet()
 	for id, p := range indexPack {
-		if p.usedBlobs == 0 && p.duplicateBlobs == 0 {
+		if p.usedBlobs == 0 {
 			ignorePacks.Insert(id)
 			stats.blobs.remove += p.unusedBlobs
 			stats.size.remove += p.unusedSize
@ -439,15 +480,11 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 	// This is equivalent to sorting by unused / total space.
 	// Instead of unused[i] / used[i] > unused[j] / used[j] we use
 	// unused[i] * used[j] > unused[j] * used[i] as uint32*uint32 < uint64
-	// Morover duplicates and packs containing trees are sorted to the beginning
+	// Moreover packs containing trees are sorted to the beginning
 	sort.Slice(repackCandidates, func(i, j int) bool {
 		pi := repackCandidates[i].packInfo
 		pj := repackCandidates[j].packInfo
 		switch {
 		case pi.duplicateBlobs > 0 && pj.duplicateBlobs == 0:
 			return true
 		case pj.duplicateBlobs > 0 && pi.duplicateBlobs == 0:
 			return false
 		case pi.tpe != restic.DataBlob && pj.tpe == restic.DataBlob:
 			return true
 		case pj.tpe != restic.DataBlob && pi.tpe == restic.DataBlob:
@ -458,7 +495,7 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 	repack := func(id restic.ID, p packInfo) {
 		repackPacks.Insert(id)
-		stats.blobs.repack += p.unusedBlobs + p.duplicateBlobs + p.usedBlobs
+		stats.blobs.repack += p.unusedBlobs + p.usedBlobs
 		stats.size.repack += p.unusedSize + p.usedSize
 		stats.blobs.repackrm += p.unusedBlobs
 		stats.size.repackrm += p.unusedSize
@ -472,8 +509,8 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 		case reachedRepackSize:
 			keep(p.packInfo)
-		case p.duplicateBlobs > 0, p.tpe != restic.DataBlob, p.uncompressed:
+		case p.tpe != restic.DataBlob, p.uncompressed:
-			// repacking duplicates/non-data/uncompressed-trees is only limited by repackSize
+			// repacking non-data packs / uncompressed-trees is only limited by repackSize
 			repack(p.ID, p.packInfo)
 		case reachedUnusedSizeAfter:
@ -485,10 +522,18 @@ func prune(opts PruneOptions, gopts GlobalOptions, repo restic.Repository, usedB
 		}
 	}
-	// if all duplicates are repacked, print out correct statistics
+	if len(repackPacks) != 0 {
-	if repackAllPacksWithDuplicates {
+		// when repacking, we do not want to keep blobs which are
-		stats.blobs.repackrm += stats.blobs.duplicate
+		// already contained in kept packs, so delete them from keepBlobs
-		stats.size.repackrm += stats.size.duplicate
+		for blob := range repo.Index().Each(ctx) {
 			if removePacks.Has(blob.PackID) || repackPacks.Has(blob.PackID) {
 				continue
 			}
 			keepBlobs.Delete(blob.BlobHandle)
 		}
 	} else {
 		// keepBlobs is only needed if packs are repacked
 		keepBlobs = nil
 	}
 	Verboseff("\nused:         %10d blobs / %s\n", stats.blobs.used, formatBytes(stats.size.used))