Merge pull request #7896 from ThomasWaldmann/fix-shadow-index-1.2

fix shadow index update for double-put (1.2-maint)
This commit is contained in:
TW 2023-11-03 14:42:57 +01:00 committed by GitHub
commit 686714c366
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 62 additions and 14 deletions

View File

@ -1225,11 +1225,11 @@ class Repository:
except KeyError:
pass
else:
# note: doing a delete first will do some bookkeeping.
# we do not want to update the shadow_index here, because
# we know already that we will PUT to this id, so it will
# be in the repo index (and we won't need it in the shadow_index).
self._delete(id, segment, offset, update_shadow_index=False)
# this put call supersedes a previous put to same id.
# it is essential to do a delete first to get correct quota bookkeeping
# and also a correctly updated shadow_index, so that the compaction code
# does not wrongly resurrect an old PUT by dropping a DEL that is still needed.
self._delete(id, segment, offset)
segment, offset = self.io.write_put(id, data)
self.storage_quota_use += len(data) + self.io.put_header_fmt.size
self.segments.setdefault(segment, 0)
@ -1252,16 +1252,15 @@ class Repository:
segment, offset = self.index.pop(id)
except KeyError:
raise self.ObjectNotFound(id, self.path) from None
# if we get here, there is an object with this id in the repo,
# we write a DEL here that shadows the respective PUT.
# after the delete, the object is not in the repo index any more,
# for the compaction code, we need to update the shadow_index in this case.
self._delete(id, segment, offset, update_shadow_index=True)
self._delete(id, segment, offset)
def _delete(self, id, segment, offset, *, update_shadow_index):
def _delete(self, id, segment, offset):
# common code used by put and delete
if update_shadow_index:
self.shadow_index.setdefault(id, []).append(segment)
# because we'll write a DEL tag to the repository, we must update the shadow index.
# this is always true, no matter whether we are called from put() or delete().
# the compaction code needs this to not drop DEL tags if they are still required
# to keep a PUT in an earlier segment in the "effectively deleted" state.
self.shadow_index.setdefault(id, []).append(segment)
self.segments[segment] -= 1
size = self.io.read(segment, offset, id, read_data=False)
self.compact[segment] += size

View File

@ -345,7 +345,8 @@ class RepositoryCommitTestCase(RepositoryTestCaseBase):
# we have no put or del any more for H(1), so we lost knowledge about H(1).
assert H(1) not in self.repository.shadow_index
def test_shadowed_entries_are_preserved(self):
def test_shadowed_entries_are_preserved1(self):
# this tests the shadowing-by-del behaviour
get_latest_segment = self.repository.io.get_latest_segment
self.repository.put(H(1), b'1')
# This is the segment with our original PUT of interest
@ -379,6 +380,54 @@ class RepositoryCommitTestCase(RepositoryTestCaseBase):
# Must not reappear
assert H(1) not in self.repository
def test_shadowed_entries_are_preserved2(self):
# this tests the shadowing-by-double-put behaviour, see issue #5661
# assume this repo state:
# seg1: PUT H1
# seg2: COMMIT
# seg3: DEL H1, PUT H1, DEL H1, PUT H2
# seg4: COMMIT
# Note how due to the final DEL H1 in seg3, H1 is effectively deleted.
#
# compaction of only seg3:
# PUT H1 gets dropped because it is not needed any more.
# DEL H1 must be kept, because there is still a PUT H1 in seg1 which must not
# "reappear" in the index if the index gets rebuilt.
get_latest_segment = self.repository.io.get_latest_segment
self.repository.put(H(1), b'1')
# This is the segment with our original PUT of interest
put_segment = get_latest_segment()
self.repository.commit(compact=False)
# We now put H(1) again (which implicitly does DEL(H(1)) followed by PUT(H(1), ...)),
# delete H(1) afterwards, and force this segment to not be compacted, which can happen
# if it's not sparse enough (symbolized by H(2) here).
self.repository.put(H(1), b'1')
self.repository.delete(H(1))
self.repository.put(H(2), b'1')
delete_segment = get_latest_segment()
# We pretend these are mostly dense (not sparse) and won't be compacted
del self.repository.compact[put_segment]
del self.repository.compact[delete_segment]
self.repository.commit(compact=True)
# Now we perform an unrelated operation on the segment containing the DELETE,
# causing it to be compacted.
self.repository.delete(H(2))
self.repository.commit(compact=True)
assert self.repository.io.segment_exists(put_segment)
assert not self.repository.io.segment_exists(delete_segment)
# Basic case, since the index survived this must be ok
assert H(1) not in self.repository
# Nuke index, force replay
os.unlink(os.path.join(self.repository.path, 'index.%d' % get_latest_segment()))
# Must not reappear
assert H(1) not in self.repository
def test_shadow_index_rollback(self):
self.repository.put(H(1), b'1')
self.repository.delete(H(1))