From c9f35a16e9bf9e7073c486553177cef79ff1cb06 Mon Sep 17 00:00:00 2001 From: nain <126972030+F49FF806@users.noreply.github.com> Date: Thu, 25 May 2023 11:25:10 -0400 Subject: [PATCH 1/3] unify scanning and listing of segment dirs / segment files and apply good practices + os.scandir instead of os.listdir Improved speed and added flexibility with attributes (name,path,is_dir(),is_file()) + use is_dir / is_file to make sure we're reading only dirs / files respectively + Filtering to particular start, end index range built in + Have seperate generators to avoid unnecessary less_than / greater_than checking for every dir/file when not required Resolves #7597 --- src/borg/repository.py | 46 ++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index 61905b394..203596c85 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -1319,28 +1319,52 @@ class LoggedIO: safe_fadvise(fd.fileno(), 0, 0, 'DONTNEED') fd.close() + def get_segment_dirs(self, start_index=None, end_index=None): + """Returns generator yeilding required segment dirs as `os.DirEntry` objects. Start and end are inclusive. + """ + data_dir = os.path.join(self.path, 'data') + if start_index is None and end_index is None: + segment_dirs = (f for f in os.scandir(data_dir) if f.is_dir() and f.name.isdigit()) + elif start_index is not None and end_index is None: + segment_dirs = (f for f in os.scandir(data_dir) if f.is_dir() and f.name.isdigit() and start_index <= int(f.name)) + elif start_index is None and end_index is not None: + segment_dirs = (f for f in os.scandir(data_dir) if f.is_dir() and f.name.isdigit() and int(f.name) <= end_index) + elif start_index is not None and end_index is not None: + segment_dirs = (f for f in os.scandir(data_dir) if f.is_dir() and f.name.isdigit() and start_index <= int(f.name) <= end_index) + return segment_dirs + + def get_segment_files(self, segment_dir, start_index=None, end_index=None): + """Returns generator yeilding required segment files in segment_dir as `os.DirEntry` objects. Start and end are inclusive. + """ + if start_index is None and end_index is None: + segment_files = (f for f in os.scandir(segment_dir) if f.is_file() and f.name.isdigit()) + elif start_index is not None and end_index is None: + segment_files = (f for f in os.scandir(segment_dir) if f.is_file() and f.name.isdigit() and start_index <= int(f.name)) + elif start_index is None and end_index is not None: + segment_files = (f for f in os.scandir(segment_dir) if f.is_file() and f.name.isdigit() and int(f.name) <= end_index) + elif start_index is not None and end_index is not None: + segment_files = (f for f in os.scandir(segment_dir) if f.is_file() and f.name.isdigit() and start_index <= int(f.name) <= end_index) + return segment_files + def segment_iterator(self, segment=None, reverse=False): if segment is None: segment = 0 if not reverse else 2 ** 32 - 1 - data_path = os.path.join(self.path, 'data') start_segment_dir = segment // self.segments_per_dir - dirs = os.listdir(data_path) if not reverse: - dirs = [dir for dir in dirs if dir.isdigit() and int(dir) >= start_segment_dir] + dirs = self.get_segment_dirs(start_index=start_segment_dir) else: - dirs = [dir for dir in dirs if dir.isdigit() and int(dir) <= start_segment_dir] - dirs = sorted(dirs, key=int, reverse=reverse) + dirs = self.get_segment_dirs(end_index=start_segment_dir) + dirs = sorted(dirs, key=lambda dir: int(dir.name), reverse=reverse) for dir in dirs: - filenames = os.listdir(os.path.join(data_path, dir)) if not reverse: - filenames = [filename for filename in filenames if filename.isdigit() and int(filename) >= segment] + files = self.get_segment_files(dir, start_index=segment) else: - filenames = [filename for filename in filenames if filename.isdigit() and int(filename) <= segment] - filenames = sorted(filenames, key=int, reverse=reverse) - for filename in filenames: + files = self.get_segment_files(dir, end_index=segment) + files = sorted(files, key=lambda file: int(file.name), reverse=reverse) + for file in files: # Note: Do not filter out logically deleted segments (see "File system interaction" above), # since this is used by cleanup and txn state detection as well. - yield int(filename), os.path.join(data_path, dir, filename) + yield int(file.name), file.path def get_latest_segment(self): for segment, filename in self.segment_iterator(reverse=True): From 3f2da1bba9079b3287a99dff0de919fce26349a2 Mon Sep 17 00:00:00 2001 From: nain <126972030+F49FF806@users.noreply.github.com> Date: Fri, 26 May 2023 02:33:31 -0400 Subject: [PATCH 2/3] Simplify generator functions and make get_segment_dirs inputs consistent with get_segment_files + get_segment_dirs explicitly takes data_dir for clarity and future flexibility + removed multiple cases for one general purpose generator expression --- src/borg/repository.py | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/src/borg/repository.py b/src/borg/repository.py index 203596c85..2910ce261 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -1319,41 +1319,37 @@ class LoggedIO: safe_fadvise(fd.fileno(), 0, 0, 'DONTNEED') fd.close() - def get_segment_dirs(self, start_index=None, end_index=None): - """Returns generator yeilding required segment dirs as `os.DirEntry` objects. Start and end are inclusive. + def get_segment_dirs(self, data_dir, start_index=0, end_index=2**30): + """Returns generator yielding required segment dirs in data_dir as `os.DirEntry` objects. + Start and end are inclusive. """ - data_dir = os.path.join(self.path, 'data') - if start_index is None and end_index is None: - segment_dirs = (f for f in os.scandir(data_dir) if f.is_dir() and f.name.isdigit()) - elif start_index is not None and end_index is None: - segment_dirs = (f for f in os.scandir(data_dir) if f.is_dir() and f.name.isdigit() and start_index <= int(f.name)) - elif start_index is None and end_index is not None: - segment_dirs = (f for f in os.scandir(data_dir) if f.is_dir() and f.name.isdigit() and int(f.name) <= end_index) - elif start_index is not None and end_index is not None: - segment_dirs = (f for f in os.scandir(data_dir) if f.is_dir() and f.name.isdigit() and start_index <= int(f.name) <= end_index) + segment_dirs = ( + f + for f in os.scandir(data_dir) + if f.is_dir() and f.name.isdigit() and start_index <= int(f.name) <= end_index + ) return segment_dirs - def get_segment_files(self, segment_dir, start_index=None, end_index=None): - """Returns generator yeilding required segment files in segment_dir as `os.DirEntry` objects. Start and end are inclusive. + def get_segment_files(self, segment_dir, start_index=0, end_index=2**32): + """Returns generator yielding required segment files in segment_dir as `os.DirEntry` objects. + Start and end are inclusive. """ - if start_index is None and end_index is None: - segment_files = (f for f in os.scandir(segment_dir) if f.is_file() and f.name.isdigit()) - elif start_index is not None and end_index is None: - segment_files = (f for f in os.scandir(segment_dir) if f.is_file() and f.name.isdigit() and start_index <= int(f.name)) - elif start_index is None and end_index is not None: - segment_files = (f for f in os.scandir(segment_dir) if f.is_file() and f.name.isdigit() and int(f.name) <= end_index) - elif start_index is not None and end_index is not None: - segment_files = (f for f in os.scandir(segment_dir) if f.is_file() and f.name.isdigit() and start_index <= int(f.name) <= end_index) + segment_files = ( + f + for f in os.scandir(segment_dir) + if f.is_file() and f.name.isdigit() and start_index <= int(f.name) <= end_index + ) return segment_files def segment_iterator(self, segment=None, reverse=False): if segment is None: segment = 0 if not reverse else 2 ** 32 - 1 start_segment_dir = segment // self.segments_per_dir + data_path = os.path.join(self.path, 'data') if not reverse: - dirs = self.get_segment_dirs(start_index=start_segment_dir) + dirs = self.get_segment_dirs(data_path, start_index=start_segment_dir) else: - dirs = self.get_segment_dirs(end_index=start_segment_dir) + dirs = self.get_segment_dirs(data_path, end_index=start_segment_dir) dirs = sorted(dirs, key=lambda dir: int(dir.name), reverse=reverse) for dir in dirs: if not reverse: From edb5e749f512b7737b6933e13b7e61fefcd17bcb Mon Sep 17 00:00:00 2001 From: nain <126972030+F49FF806@users.noreply.github.com> Date: Sat, 27 May 2023 00:11:54 -0400 Subject: [PATCH 3/3] Move value bounds of segment (index) into constants module and use them instead --- src/borg/constants.py | 6 ++++++ src/borg/repository.py | 6 +++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/borg/constants.py b/src/borg/constants.py index a1aa3da98..c2ebd4d7c 100644 --- a/src/borg/constants.py +++ b/src/borg/constants.py @@ -62,6 +62,12 @@ LIST_SCAN_LIMIT = 100000 DEFAULT_SEGMENTS_PER_DIR = 1000 +# Some bounds on segment / segment_dir indexes +MIN_SEGMENT_INDEX = 0 +MAX_SEGMENT_INDEX = 2**32 - 1 +MIN_SEGMENT_DIR_INDEX = 0 +MAX_SEGMENT_DIR_INDEX = 2**32 - 1 + FD_MAX_AGE = 4 * 60 # 4 minutes CHUNK_MIN_EXP = 19 # 2**19 == 512kiB diff --git a/src/borg/repository.py b/src/borg/repository.py index 2910ce261..24079954c 100644 --- a/src/borg/repository.py +++ b/src/borg/repository.py @@ -1319,7 +1319,7 @@ class LoggedIO: safe_fadvise(fd.fileno(), 0, 0, 'DONTNEED') fd.close() - def get_segment_dirs(self, data_dir, start_index=0, end_index=2**30): + def get_segment_dirs(self, data_dir, start_index=MIN_SEGMENT_DIR_INDEX, end_index=MAX_SEGMENT_DIR_INDEX): """Returns generator yielding required segment dirs in data_dir as `os.DirEntry` objects. Start and end are inclusive. """ @@ -1330,7 +1330,7 @@ class LoggedIO: ) return segment_dirs - def get_segment_files(self, segment_dir, start_index=0, end_index=2**32): + def get_segment_files(self, segment_dir, start_index=MIN_SEGMENT_INDEX, end_index=MAX_SEGMENT_INDEX): """Returns generator yielding required segment files in segment_dir as `os.DirEntry` objects. Start and end are inclusive. """ @@ -1343,7 +1343,7 @@ class LoggedIO: def segment_iterator(self, segment=None, reverse=False): if segment is None: - segment = 0 if not reverse else 2 ** 32 - 1 + segment = MIN_SEGMENT_INDEX if not reverse else MAX_SEGMENT_INDEX start_segment_dir = segment // self.segments_per_dir data_path = os.path.join(self.path, 'data') if not reverse: