From 231076fa4ae9825a1b2f9385a4ae260743486709 Mon Sep 17 00:00:00 2001 From: Igor Fedorenko Date: Thu, 11 Jan 2018 21:00:48 -0500 Subject: [PATCH] checker: Optimize checker.Packs() Use result of single repository.List() to find both missing and orphaned data packs. For 500GB repository this eliminates ~100K repository.Test() calls and improves check time by >30M in my environment (~45min before this change and ~7min after). Signed-off-by: Igor Fedorenko --- changelog/0.8.2/issue-1541 | 6 +++ internal/checker/checker.go | 87 +++++++++++-------------------------- 2 files changed, 32 insertions(+), 61 deletions(-) create mode 100644 changelog/0.8.2/issue-1541 diff --git a/changelog/0.8.2/issue-1541 b/changelog/0.8.2/issue-1541 new file mode 100644 index 000000000..c8f63fb3c --- /dev/null +++ b/changelog/0.8.2/issue-1541 @@ -0,0 +1,6 @@ +Enhancement: Reduce number of remote requests during repository check + +This change eliminates redundant remote repository calls and significantly +improves repository check time. + +https://github.com/restic/restic/issues/1541 diff --git a/internal/checker/checker.go b/internal/checker/checker.go index efef7ae94..6f134dbb7 100644 --- a/internal/checker/checker.go +++ b/internal/checker/checker.go @@ -30,8 +30,7 @@ type Checker struct { sync.Mutex M map[restic.ID]uint } - indexes map[restic.ID]*repository.Index - orphanedPacks restic.IDs + indexes map[restic.ID]*repository.Index masterIndex *repository.MasterIndex @@ -183,38 +182,6 @@ func (e PackError) Error() string { return "pack " + e.ID.String() + ": " + e.Err.Error() } -func packIDTester(ctx context.Context, repo restic.Repository, inChan <-chan restic.ID, errChan chan<- error, wg *sync.WaitGroup) { - debug.Log("worker start") - defer debug.Log("worker done") - - defer wg.Done() - - for id := range inChan { - h := restic.Handle{Type: restic.DataFile, Name: id.String()} - ok, err := repo.Backend().Test(ctx, h) - if err != nil { - err = PackError{ID: id, Err: err} - } else { - if !ok { - err = PackError{ID: id, Err: errors.New("does not exist")} - } - } - - if err != nil { - debug.Log("error checking for pack %s: %v", id.Str(), err) - select { - case <-ctx.Done(): - return - case errChan <- err: - } - - continue - } - - debug.Log("pack %s exists", id.Str()) - } -} - // Packs checks that all packs referenced in the index are still available and // there are no packs that aren't in an index. errChan is closed after all // packs have been checked. @@ -222,35 +189,33 @@ func (c *Checker) Packs(ctx context.Context, errChan chan<- error) { defer close(errChan) debug.Log("checking for %d packs", len(c.packs)) - seenPacks := restic.NewIDSet() - - var workerWG sync.WaitGroup - - IDChan := make(chan restic.ID) - for i := 0; i < defaultParallelism; i++ { - workerWG.Add(1) - go packIDTester(ctx, c.repo, IDChan, errChan, &workerWG) - } - - for id := range c.packs { - seenPacks.Insert(id) - IDChan <- id - } - close(IDChan) - - debug.Log("waiting for %d workers to terminate", defaultParallelism) - workerWG.Wait() - debug.Log("workers terminated") + debug.Log("listing repository packs") + repoPacks := restic.NewIDSet() for id := range c.repo.List(ctx, restic.DataFile) { - debug.Log("check data blob %v", id.Str()) - if !seenPacks.Has(id) { - c.orphanedPacks = append(c.orphanedPacks, id) - select { - case <-ctx.Done(): - return - case errChan <- PackError{ID: id, Orphaned: true, Err: errors.New("not referenced in any index")}: - } + select { + case <-ctx.Done(): + return + default: + } + repoPacks.Insert(id) + } + + // orphaned: present in the repo but not in c.packs + for orphanID := range repoPacks.Sub(c.packs) { + select { + case <-ctx.Done(): + return + case errChan <- PackError{ID: orphanID, Orphaned: true, Err: errors.New("not referenced in any index")}: + } + } + + // missing: present in c.packs but not in the repo + for missingID := range c.packs.Sub(repoPacks) { + select { + case <-ctx.Done(): + return + case errChan <- PackError{ID: missingID, Err: errors.New("does not exist")}: } } }