Merge pull request #7038 from ThomasWaldmann/fix-scan

Fix repository.scan
Commit c0e674ce61, authored by TW on 2022-09-19 12:28:09 +02:00, committed by GitHub.
4 changed files with 78 additions and 35 deletions
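
The change is the same at every call site in the diff below: Repository.scan() now returns a (chunk ids, marker) tuple instead of a bare list of ids, and callers feed that marker back into the next call instead of deriving it from the last returned id. A minimal caller-side sketch of the resulting paging loop, assuming an open Repository object like the one used in the hunks below (iterate_chunks itself is only illustrative, not part of this PR):

    def iterate_chunks(repository, limit=100):
        # Page through all committed chunks in on-disk order; scan() returns
        # the ids plus an opaque continuation marker to pass to the next call.
        marker = None
        while True:
            ids, marker = repository.scan(limit=limit, marker=marker)
            if not ids:
                break
            for id in ids:
                # reads happen in on-disk order, so this does linear I/O
                yield id, repository.get(id)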


@@ -1749,11 +1749,10 @@ def verify_data(self):
         )
         marker = None
         while True:
-            chunk_ids = self.repository.scan(limit=100, marker=marker)
+            chunk_ids, marker = self.repository.scan(limit=100, marker=marker)
             if not chunk_ids:
                 break
             chunks_count_segments += len(chunk_ids)
-            marker = chunk_ids[-1]
             chunk_data_iter = self.repository.get_many(chunk_ids)
             chunk_ids_revd = list(reversed(chunk_ids))
             while chunk_ids_revd:


@@ -155,11 +155,12 @@ def decrypt_dump(i, id, cdata, tag=None, segment=None, offset=None):
         marker = None
         i = 0
         while True:
-            result = repository.scan(limit=LIST_SCAN_LIMIT, marker=marker)  # must use on-disk order scanning here
-            if not result:
+            ids, marker = repository.scan(
+                limit=LIST_SCAN_LIMIT, marker=marker
+            )  # must use on-disk order scanning here
+            if not ids:
                 break
-            marker = result[-1]
-            for id in result:
+            for id in ids:
                 cdata = repository.get(id)
                 decrypt_dump(i, id, cdata)
                 i += 1
@@ -207,11 +208,10 @@ def print_finding(info, wanted, data, offset):
         last_id = None
         i = 0
         while True:
-            result = repository.scan(limit=LIST_SCAN_LIMIT, marker=marker)  # must use on-disk order scanning here
-            if not result:
+            ids, marker = repository.scan(limit=LIST_SCAN_LIMIT, marker=marker)  # must use on-disk order scanning here
+            if not ids:
                 break
-            marker = result[-1]
-            for id in result:
+            for id in ids:
                 cdata = repository.get(id)
                 _, data = repo_objs.parse(id, cdata)


@@ -1163,9 +1163,7 @@ def scan_low_level(self, segment=None, offset=None):
         When segment or segment+offset is given, limit processing to this location only.
         """
-        for current_segment, filename in self.io.segment_iterator(segment=segment):
-            if segment is not None and current_segment > segment:
-                break
+        for current_segment, filename in self.io.segment_iterator(start_segment=segment, end_segment=segment):
             try:
                 for tag, key, current_offset, _, data in self.io.iter_objects(
                     segment=current_segment, offset=offset or 0
@@ -1211,9 +1209,14 @@ def list(self, limit=None, marker=None, mask=0, value=0):
     def scan(self, limit=None, marker=None):
         """
-        list <limit> IDs starting from after id <marker> - in on-disk order, so that a client
+        list <limit> IDs starting from after <marker> - in on-disk order, so that a client
         fetching data in this order does linear reads and reuses stuff from disk cache.
+
+        marker can either be None (default, meaning "start from the beginning") or the object
+        returned from a previous scan call (meaning "continue scanning where we stopped previously").
+
+        returns: list of chunk ids, marker
+
         We rely on repository.check() has run already (either now or some time before) and that:
         - if we are called from a borg check command, self.index is a valid, fresh, in-sync repo index.
@@ -1223,14 +1226,15 @@ def scan(self, limit=None, marker=None):
         """
         if limit is not None and limit < 1:
             raise ValueError("please use limit > 0 or limit = None")
-        if not self.index:
-            transaction_id = self.get_transaction_id()
+        transaction_id = self.get_transaction_id()
+        if not self.index:
             self.index = self.open_index(transaction_id)
-        at_start = marker is None
-        # smallest valid seg is <uint32> 0, smallest valid offs is <uint32> 8
-        start_segment, start_offset, _ = (0, 0, 0) if at_start else self.index[marker]
-        result = []
-        for segment, filename in self.io.segment_iterator(start_segment):
+        start_segment, start_offset = marker if marker is not None else (0, 0)
+        ids, segment, offset = [], 0, 0
+        # we only scan up to end_segment == transaction_id to only scan **committed** chunks,
+        # avoiding scanning into newly written chunks.
+        for segment, filename in self.io.segment_iterator(start_segment, transaction_id):
             obj_iterator = self.io.iter_objects(segment, start_offset, read_data=False)
             while True:
                 try:
@@ -1249,10 +1253,10 @@ def scan(self, limit=None, marker=None):
                     in_index = self.index.get(id)
                     if in_index and (in_index.segment, in_index.offset) == (segment, offset):
                         # we have found an existing and current object
-                        result.append(id)
-                        if len(result) == limit:
-                            return result
-        return result
+                        ids.append(id)
+                        if len(ids) == limit:
+                            return ids, (segment, offset)
+        return ids, (segment, offset)

     def flags(self, id, mask=0xFFFFFFFF, value=None):
         """
@@ -1392,23 +1396,34 @@ def _close_fd(self, ts_fd):
         safe_fadvise(fd.fileno(), 0, 0, "DONTNEED")
         fd.close()

-    def segment_iterator(self, segment=None, reverse=False):
-        if segment is None:
-            segment = 0 if not reverse else 2**32 - 1
+    def segment_iterator(self, start_segment=None, end_segment=None, reverse=False):
+        if start_segment is None:
+            start_segment = 0 if not reverse else 2**32 - 1
+        if end_segment is None:
+            end_segment = 2**32 - 1 if not reverse else 0
         data_path = os.path.join(self.path, "data")
-        start_segment_dir = segment // self.segments_per_dir
+        start_segment_dir = start_segment // self.segments_per_dir
+        end_segment_dir = end_segment // self.segments_per_dir
         dirs = os.listdir(data_path)
         if not reverse:
-            dirs = [dir for dir in dirs if dir.isdigit() and int(dir) >= start_segment_dir]
+            dirs = [dir for dir in dirs if dir.isdigit() and start_segment_dir <= int(dir) <= end_segment_dir]
         else:
-            dirs = [dir for dir in dirs if dir.isdigit() and int(dir) <= start_segment_dir]
+            dirs = [dir for dir in dirs if dir.isdigit() and start_segment_dir >= int(dir) >= end_segment_dir]
         dirs = sorted(dirs, key=int, reverse=reverse)
         for dir in dirs:
             filenames = os.listdir(os.path.join(data_path, dir))
             if not reverse:
-                filenames = [filename for filename in filenames if filename.isdigit() and int(filename) >= segment]
+                filenames = [
+                    filename
+                    for filename in filenames
+                    if filename.isdigit() and start_segment <= int(filename) <= end_segment
+                ]
             else:
-                filenames = [filename for filename in filenames if filename.isdigit() and int(filename) <= segment]
+                filenames = [
+                    filename
+                    for filename in filenames
+                    if filename.isdigit() and start_segment >= int(filename) >= end_segment
+                ]
             filenames = sorted(filenames, key=int, reverse=reverse)
             for filename in filenames:
                 # Note: Do not filter out logically deleted segments (see "File system interaction" above),


@@ -189,19 +189,48 @@ def test_scan(self):
         for x in range(100):
             self.repository.put(H(x), fchunk(b"SOMEDATA"))
         self.repository.commit(compact=False)
-        all = self.repository.scan()
+        all, _ = self.repository.scan()
         assert len(all) == 100
-        first_half = self.repository.scan(limit=50)
+        first_half, marker = self.repository.scan(limit=50)
         assert len(first_half) == 50
         assert first_half == all[:50]
-        second_half = self.repository.scan(marker=first_half[-1])
+        second_half, _ = self.repository.scan(marker=marker)
         assert len(second_half) == 50
         assert second_half == all[50:]
-        assert len(self.repository.scan(limit=50)) == 50
         # check result order == on-disk order (which is hash order)
         for x in range(100):
             assert all[x] == H(x)
+
+    def test_scan_modify(self):
+        for x in range(100):
+            self.repository.put(H(x), fchunk(b"ORIGINAL"))
+        self.repository.commit(compact=False)
+        # now we scan, read and modify chunks at the same time
+        count = 0
+        ids, _ = self.repository.scan()
+        for id in ids:
+            # scan results are in same order as we put the chunks into the repo (into the segment file)
+            assert id == H(count)
+            chunk = self.repository.get(id)
+            # check that we **only** get data that was committed when we started scanning
+            # and that we do not run into the new data we put into the repo.
+            assert pdchunk(chunk) == b"ORIGINAL"
+            count += 1
+            self.repository.put(id, fchunk(b"MODIFIED"))
+        assert count == 100
+        self.repository.commit()
+
+        # now we have committed all the modified chunks, and **only** must get the modified ones.
+        count = 0
+        ids, _ = self.repository.scan()
+        for id in ids:
+            # scan results are in same order as we put the chunks into the repo (into the segment file)
+            assert id == H(count)
+            chunk = self.repository.get(id)
+            assert pdchunk(chunk) == b"MODIFIED"
+            count += 1
+        assert count == 100

     def test_max_data_size(self):
         max_data = b"x" * (MAX_DATA_SIZE - RepoObj.meta_len_hdr.size)
         self.repository.put(H(0), fchunk(max_data))