From 3b1c0df7c87a9016eb8d651ecc57cc303c723f9b Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 28 Oct 2023 17:12:54 +0200 Subject: [PATCH 1/6] test the shadowing-by-double-put behaviour, see #5661 the new test is currently failing due to a bug in the repository code. --- src/borg/testsuite/repository.py | 47 +++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/src/borg/testsuite/repository.py b/src/borg/testsuite/repository.py index f3a72f47..a9a2f8a6 100644 --- a/src/borg/testsuite/repository.py +++ b/src/borg/testsuite/repository.py @@ -516,7 +516,8 @@ def test_moved_deletes_are_tracked(repository): assert H(1) not in repository.shadow_index -def test_shadowed_entries_are_preserved(repository): +def test_shadowed_entries_are_preserved1(repository): + # this tests the shadowing-by-del behaviour with repository: get_latest_segment = repository.io.get_latest_segment repository.put(H(1), fchunk(b"1")) @@ -546,6 +547,50 @@ def test_shadowed_entries_are_preserved(repository): assert H(1) not in repository +def test_shadowed_entries_are_preserved2(repository): + # this tests the shadowing-by-double-put behaviour, see issue #5661 + # assume this repo state: + # seg1: PUT H1 + # seg2: COMMIT + # seg3: DEL H1, PUT H1, DEL H1, PUT H2 + # seg4: COMMIT + # Note how due to the final DEL H1 in seg3, H1 is effectively deleted. + # + # compaction of only seg3: + # PUT H1 gets dropped because it is not needed any more. + # DEL H1 must be kept, because there is still a PUT H1 in seg1 which must not + # "reappear" in the index if the index gets rebuilt. + with repository: + get_latest_segment = repository.io.get_latest_segment + repository.put(H(1), fchunk(b"1")) + # This is the segment with our original PUT of interest + put_segment = get_latest_segment() + repository.commit(compact=False) + # We now put H(1) again (which implicitly does DEL(H(1)) followed by PUT(H(1), ...)), + # delete H(1) afterwards, and force this segment to not be compacted, which can happen + # if it's not sparse enough (symbolized by H(2) here). + repository.put(H(1), fchunk(b"1")) + repository.delete(H(1)) + repository.put(H(2), fchunk(b"1")) + delete_segment = get_latest_segment() + # We pretend these are mostly dense (not sparse) and won't be compacted + del repository.compact[put_segment] + del repository.compact[delete_segment] + repository.commit(compact=True) + # Now we perform an unrelated operation on the segment containing the DELETE, + # causing it to be compacted. + repository.delete(H(2)) + repository.commit(compact=True) + assert repository.io.segment_exists(put_segment) + assert not repository.io.segment_exists(delete_segment) + # Basic case, since the index survived this must be ok + assert H(1) not in repository + # Nuke index, force replay + os.unlink(os.path.join(repository.path, "index.%d" % get_latest_segment())) + # Must not reappear + assert H(1) not in repository # F + + def test_shadow_index_rollback(repository): with repository: repository.put(H(1), fchunk(b"1")) From 6d9513f507a4fcbd2663b769e22ff40b181fe9c7 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 28 Oct 2023 17:17:47 +0200 Subject: [PATCH 2/6] update shadow index when doing a double-put, fixes #5661 this fixes the test added in previous commit (avoids that the PUT reappears after index rebuild). --- src/borg/repository.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index 673cf758..8f7c31e9 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -1300,10 +1300,7 @@ class Repository: pass else: # note: doing a delete first will do some bookkeeping. - # we do not want to update the shadow_index here, because - # we know already that we will PUT to this id, so it will - # be in the repo index (and we won't need it in the shadow_index). - self._delete(id, in_index.segment, in_index.offset, in_index.size, update_shadow_index=False) + self._delete(id, in_index.segment, in_index.offset, in_index.size, update_shadow_index=True) segment, offset = self.io.write_put(id, data) self.storage_quota_use += header_size(TAG_PUT2) + len(data) self.segments.setdefault(segment, 0) From 3ba533ac50c34fb23f5d0e39532d0aa340cd5598 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 28 Oct 2023 17:36:10 +0200 Subject: [PATCH 3/6] shadow index updates: simplify and more comments no functional change here. --- src/borg/repository.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index 8f7c31e9..2ac69c0c 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -1299,8 +1299,11 @@ class Repository: except KeyError: pass else: - # note: doing a delete first will do some bookkeeping. - self._delete(id, in_index.segment, in_index.offset, in_index.size, update_shadow_index=True) + # this put call supersedes a previous put to same id. + # it is essential to do a delete first to get correct quota bookkeeping + # and also a correctly updated shadow_index, so that the compaction code + # does not wrongly resurrect an old PUT by dropping a DEL that is still needed. + self._delete(id, in_index.segment, in_index.offset, in_index.size) segment, offset = self.io.write_put(id, data) self.storage_quota_use += header_size(TAG_PUT2) + len(data) self.segments.setdefault(segment, 0) @@ -1324,16 +1327,15 @@ class Repository: in_index = self.index.pop(id) except KeyError: raise self.ObjectNotFound(id, self.path) from None - # if we get here, there is an object with this id in the repo, - # we write a DEL here that shadows the respective PUT. - # after the delete, the object is not in the repo index any more, - # for the compaction code, we need to update the shadow_index in this case. - self._delete(id, in_index.segment, in_index.offset, in_index.size, update_shadow_index=True) + self._delete(id, in_index.segment, in_index.offset, in_index.size) - def _delete(self, id, segment, offset, size, *, update_shadow_index): + def _delete(self, id, segment, offset, size): # common code used by put and delete - if update_shadow_index: - self.shadow_index.setdefault(id, []).append(segment) + # because we'll write a DEL tag to the repository, we must update the shadow index. + # this is always true, no matter whether we are called from put() or delete(). + # the compaction code needs this to not drop DEL tags if they are still required + # to keep a PUT in an earlier segment in the "effectively deleted" state. + self.shadow_index.setdefault(id, []).append(segment) self.segments[segment] -= 1 self.compact[segment] += header_size(TAG_PUT2) + size segment, size = self.io.write_delete(id) From 5d28992b51522bb36186bd8e238043e3331fb283 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sat, 28 Oct 2023 17:36:10 +0200 Subject: [PATCH 4/6] shadow index: add more comments --- src/borg/repository.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/borg/repository.py b/src/borg/repository.py index 2ac69c0c..05333bc2 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -195,6 +195,9 @@ class Repository: # segment_n PUT A, segment_x DELETE A # After the "DELETE A" in segment_x the shadow index will contain "A -> [n]". # .delete() is updating this index, it is persisted into "hints" file and is later used by .compact_segments(). + # We need the entries in the shadow_index to not accidentally drop the "DELETE A" when we compact segment_x + # only (and we do not compact segment_n), because DELETE A is still needed then because PUT A will be still + # there. Otherwise chunk A would reappear although it was previously deleted. self.shadow_index = {} self._active_txn = False self.lock_wait = lock_wait From 7892e4212b8e677653c6697579c3b17fe2f18cbd Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 29 Oct 2023 01:01:17 +0200 Subject: [PATCH 5/6] check --repair: test if shadow index is recreated still failing here, because it is not. --- src/borg/testsuite/repository.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/borg/testsuite/repository.py b/src/borg/testsuite/repository.py index a9a2f8a6..ca60b53b 100644 --- a/src/borg/testsuite/repository.py +++ b/src/borg/testsuite/repository.py @@ -1061,6 +1061,15 @@ def test_hints_persistence(repository): assert compact_expected == repository.compact del repository.segments[2] # ignore the segment created by put(H(42), ...) assert segments_expected == repository.segments + with reopen(repository) as repository: + check(repository, repository.path, repair=True) + with reopen(repository) as repository: + repository.put(H(42), fchunk(b"foobar")) # this will call prepare_txn() and load the hints data + assert shadow_index_expected == repository.shadow_index + # sizes do not match, with vs. without header? + # assert compact_expected == repository.compact + del repository.segments[2] # ignore the segment created by put(H(42), ...) + assert segments_expected == repository.segments def test_hints_behaviour(repository): From 57f3dd1dae7f8c14d3a17bb5015a47865fcee5e1 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Sun, 29 Oct 2023 01:07:16 +0200 Subject: [PATCH 6/6] check --repair: recreate shadow index, see #6687 before this fix, borg check --repair just created an empty shadow index, which can lead to incomplete entries if entries are added later. and such incomplete (but present) entries can lead to compact_segments() resurrecting old PUTs by accidentally dropping related DELs. --- src/borg/repository.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/borg/repository.py b/src/borg/repository.py index 05333bc2..bdc801b4 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -963,6 +963,7 @@ class Repository: in_index = self.index[key] self.compact[in_index.segment] += header_size(tag) + size self.segments[in_index.segment] -= 1 + self.shadow_index.setdefault(key, []).append(in_index.segment) except KeyError: pass self.index[key] = NSIndexEntry(segment, offset, size) @@ -980,6 +981,7 @@ class Repository: # is already gone, then it was already compacted. self.segments[in_index.segment] -= 1 self.compact[in_index.segment] += header_size(tag) + in_index.size + self.shadow_index.setdefault(key, []).append(in_index.segment) elif tag == TAG_COMMIT: continue else: