1
0
Fork 0
mirror of https://github.com/borgbackup/borg.git synced 2025-01-19 14:02:55 +00:00

unify scanning and listing of segment dirs / segment files and apply good practices

+ os.scandir instead of os.listdir
  Improved speed and added flexibility with attributes (name, path, is_dir(), is_file())
+ use is_dir() / is_file() to make sure we only read dirs / files, respectively
+ Filtering to particular start, end index range built in
+ Move the min/max bounds of segment and segment dir indexes into the constants module and use them instead of hard-coded literals

Resolves #7597

(forward patch from commits c9f35a16e9bf9e7073c486553177cef79ff1cb06^..edb5e749f512b7737b6933e13b7e61fefcd17bcb)
This commit is contained in:
nain 2023-05-27 07:23:31 -04:00
parent 85b6126629
commit ffe237ce0c
2 changed files with 38 additions and 20 deletions

View file

@ -75,6 +75,12 @@
FD_MAX_AGE = 4 * 60 # 4 minutes FD_MAX_AGE = 4 * 60 # 4 minutes
# Inclusive lower/upper limits for segment numbers and for segment directory numbers.
# Both upper limits are the largest value that fits into an unsigned 32-bit integer.
MIN_SEGMENT_INDEX = 0
MAX_SEGMENT_INDEX = (1 << 32) - 1
MIN_SEGMENT_DIR_INDEX = 0
MAX_SEGMENT_DIR_INDEX = (1 << 32) - 1
# chunker algorithms # chunker algorithms
CH_BUZHASH = "buzhash" CH_BUZHASH = "buzhash"
CH_FIXED = "fixed" CH_FIXED = "fixed"

View file

@ -1376,39 +1376,51 @@ def _close_fd(self, ts_fd):
safe_fadvise(fd.fileno(), 0, 0, "DONTNEED") safe_fadvise(fd.fileno(), 0, 0, "DONTNEED")
fd.close() fd.close()
def get_segment_dirs(self, data_dir, start_index=MIN_SEGMENT_DIR_INDEX, end_index=MAX_SEGMENT_DIR_INDEX):
    """Return a generator yielding the segment dirs found in data_dir as `os.DirEntry` objects.

    An entry is yielded only if it is a directory, its name consists solely of
    decimal digits and its numeric value lies within start_index..end_index
    (both bounds are inclusive). Entries are yielded in the order os.scandir
    reports them, i.e. they are NOT sorted.

    :param data_dir: directory to scan (anything os.scandir accepts)
    :param start_index: lowest directory number to include
    :param end_index: highest directory number to include
    """
    # str.isdecimal() is used rather than str.isdigit(): isdigit() also accepts
    # characters like superscript digits, for which int() raises ValueError.
    # A single stray entry with such a name must not break the whole scan.
    return (
        entry
        for entry in os.scandir(data_dir)
        if entry.is_dir() and entry.name.isdecimal() and start_index <= int(entry.name) <= end_index
    )
def get_segment_files(self, segment_dir, start_index=MIN_SEGMENT_INDEX, end_index=MAX_SEGMENT_INDEX):
    """Return a generator yielding the segment files found in segment_dir as `os.DirEntry` objects.

    An entry is yielded only if it is a file, its name consists solely of
    decimal digits and its numeric value lies within start_index..end_index
    (both bounds are inclusive). Entries are yielded in the order os.scandir
    reports them, i.e. they are NOT sorted.

    :param segment_dir: directory to scan (anything os.scandir accepts)
    :param start_index: lowest segment number to include
    :param end_index: highest segment number to include
    """
    # str.isdecimal() is used rather than str.isdigit(): isdigit() also accepts
    # characters like superscript digits, for which int() raises ValueError.
    # A single stray entry with such a name must not break the whole scan.
    return (
        entry
        for entry in os.scandir(segment_dir)
        if entry.is_file() and entry.name.isdecimal() and start_index <= int(entry.name) <= end_index
    )
def segment_iterator(self, start_segment=None, end_segment=None, reverse=False):
    """Yield (segment number, segment file path) tuples for the segment files below <self.path>/data.

    Segments are yielded in ascending order, or in descending order if reverse is True.
    Both bounds are inclusive. When iterating in reverse, start_segment is the
    HIGHEST and end_segment the LOWEST segment number of interest.

    :param start_segment: segment number to start at; None means "no limit"
    :param end_segment: segment number to stop at; None means "no limit"
    :param reverse: iterate from high to low segment numbers
    """
    if start_segment is None:
        start_segment = MIN_SEGMENT_INDEX if not reverse else MAX_SEGMENT_INDEX
    if end_segment is None:
        end_segment = MAX_SEGMENT_INDEX if not reverse else MIN_SEGMENT_INDEX
    data_path = os.path.join(self.path, "data")
    start_segment_dir = start_segment // self.segments_per_dir
    end_segment_dir = end_segment // self.segments_per_dir
    # The scan helpers expect (low, high) bounds, so swap start/end for reverse iteration.
    if not reverse:
        low_dir, high_dir = start_segment_dir, end_segment_dir
        low_segment, high_segment = start_segment, end_segment
    else:
        low_dir, high_dir = end_segment_dir, start_segment_dir
        low_segment, high_segment = end_segment, start_segment
    segment_dirs = sorted(
        self.get_segment_dirs(data_path, start_index=low_dir, end_index=high_dir),
        key=lambda entry: int(entry.name),
        reverse=reverse,
    )
    for segment_dir in segment_dirs:
        segment_files = sorted(
            self.get_segment_files(segment_dir, start_index=low_segment, end_index=high_segment),
            key=lambda entry: int(entry.name),
            reverse=reverse,
        )
        for segment_file in segment_files:
            # Note: Do not filter out logically deleted segments (see "File system interaction" above),
            # since this is used by cleanup and txn state detection as well.
            yield int(segment_file.name), segment_file.path
def get_latest_segment(self): def get_latest_segment(self):
for segment, filename in self.segment_iterator(reverse=True): for segment, filename in self.segment_iterator(reverse=True):