
Merge pull request #6941 from ThomasWaldmann/archive-items-indirect

massively increase archive metadata stream size limit, fixes #1473
TW 2022-08-06 22:46:23 +02:00 committed by GitHub
commit 76ef20105f
11 changed files with 82 additions and 70 deletions

View file

@@ -115,15 +115,6 @@ Which file types, attributes, etc. are *not* preserved?
Are there other known limitations?
----------------------------------
- A single archive can only reference a limited volume of file/dir metadata,
usually corresponding to tens or hundreds of millions of files/dirs.
When trying to go beyond that limit, you will get a fatal IntegrityError
exception telling you that the (archive) object is too big.
An easy workaround is to create multiple archives with fewer items each.
See also the :ref:`archive_limitation` and :issue:`1452`.
:ref:`borg_info` shows how large (relative to the maximum size) existing
archives are.
- borg extract only supports restoring into an empty destination. After that,
the destination will have exactly the contents of the extracted archive.
If you extract into a non-empty destination, borg will (for example) not

View file

@@ -511,7 +511,8 @@ The archive object itself further contains some metadata:
When :ref:`borg_check` rebuilds the manifest (e.g. if it was corrupted) and finds
more than one archive object with the same name, it adds a counter to the name
in the manifest, but leaves the *name* field of the archives as it was.
* *items*, a list of chunk IDs containing item metadata (size: count * ~34B)
* *item_ptrs*, a list of "pointer chunk" IDs.
Each "pointer chunk" contains a list of chunk IDs of item metadata.
* *cmdline*, the command line which was used to create the archive
* *hostname*
* *username*
@@ -521,34 +522,6 @@ The archive object itself further contains some metadata:
This is used by :ref:`borg_recreate` to determine whether a given archive needs rechunking.
* Some other pieces of information related to recreate.
.. _archive_limitation:
.. rubric:: Note about archive limitations
The archive is currently stored as a single object in the repository
and thus limited in size to MAX_OBJECT_SIZE (20MiB).
As one chunk list entry is ~40B, that means we can reference ~500,000 item
metadata stream chunks per archive.
Each item metadata stream chunk is ~128kiB (see hardcoded ITEMS_CHUNKER_PARAMS).
So the whole item metadata stream is limited to ~500,000 * 128kiB = ~64GiB.
If compression is used, the amount of storable metadata is bigger, roughly by
the compression factor.
If the average size of an item entry is 100B (small files, no ACLs/xattrs),
that means a limit of ~640 million files/directories per archive.
If the average size of an item entry is 2kB (~100MB files or files with many
ACLs/xattrs), the limit will be ~32 million files/directories per archive.
If one tries to create an archive object bigger than MAX_OBJECT_SIZE, a fatal
IntegrityError will be raised.
A workaround is to create multiple archives with fewer items each, see
also :issue:`1452`.
.. _item:
Items
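
The removed note's arithmetic can be redone for the new two-level layout to see why the commit title says "massively". The following is a rough, back-of-the-envelope sketch, not borg code; the pointer-chunk capacity it assumes is the ``MAX_DATA_SIZE // 40`` figure mentioned in the ``IDS_PER_CHUNK`` comment of the constants.py hunk further down, not the literal value committed there::

    # Back-of-the-envelope comparison of the old flat "items" list vs. the new
    # "item_ptrs" indirection, using the figures from the note above.
    MAX_OBJECT_SIZE = 20 * 1024**2       # ~20 MiB archive object limit
    ENTRY_SIZE = 40                      # ~40 B per chunk list entry
    META_CHUNK_SIZE = 128 * 1024         # ~128 kiB per item metadata stream chunk
    ITEM_SIZE = 100                      # ~100 B per small item entry

    # old: the archive object itself holds every metadata stream chunk ID
    old_chunks = MAX_OBJECT_SIZE // ENTRY_SIZE              # ~500,000 chunk IDs
    old_items = old_chunks * META_CHUNK_SIZE // ITEM_SIZE   # hundreds of millions of items

    # new: the archive object holds pointer chunk IDs; each pointer chunk is itself
    # a repo object holding up to IDS_PER_CHUNK metadata stream chunk IDs
    # (assumed here to be roughly MAX_DATA_SIZE // 40, per the constants.py comment)
    IDS_PER_CHUNK = MAX_OBJECT_SIZE // ENTRY_SIZE
    new_chunks = old_chunks * IDS_PER_CHUNK
    new_items = new_chunks * META_CHUNK_SIZE // ITEM_SIZE

    print(f"old: ~{old_items:,} items; new: ~{new_items:,} items")

With the indirection the limit becomes so large that it is no longer practically relevant, which is why the FAQ entry removed in the first hunk could go.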

View file

@@ -331,11 +331,6 @@ stats
Deduplicated size (against the current repository, not when the archive was created)
nfiles
Number of regular files in the archive
limits
Object describing the utilization of Borg limits
max_archive_size
Float between 0 and 1 describing how large this archive is relative to the maximum size allowed by Borg
command_line
Array of strings of the command line that created the archive
@@ -405,9 +400,6 @@ The same archive with more information (``borg info --last 1 --json``)::
"end": "2017-02-27T12:27:20.789123",
"hostname": "host",
"id": "80cd07219ad725b3c5f665c1dcf119435c4dee1647a560ecac30f8d40221a46a",
"limits": {
"max_archive_size": 0.0001330855110409714
},
"name": "host-system-backup-2017-02-27",
"start": "2017-02-27T12:27:20.789123",
"stats": {

View file

@@ -391,6 +391,38 @@ def get_item_uid_gid(item, *, numeric, uid_forced=None, gid_forced=None, uid_def
return uid, gid
def archive_get_items(metadata, key, repository):
if "item_ptrs" in metadata: # looks like a v2+ archive
assert "items" not in metadata
items = []
for id, data in zip(metadata.item_ptrs, repository.get_many(metadata.item_ptrs)):
data = key.decrypt(id, data)
ids = msgpack.unpackb(data)
items.extend(ids)
return items
if "items" in metadata: # legacy, v1 archive
assert "item_ptrs" not in metadata
return metadata.items
def archive_put_items(chunk_ids, *, key, cache=None, stats=None, add_reference=None):
"""gets a (potentially large) list of archive metadata stream chunk ids and writes them to repo objects"""
item_ptrs = []
for i in range(0, len(chunk_ids), IDS_PER_CHUNK):
data = msgpack.packb(chunk_ids[i : i + IDS_PER_CHUNK])
id = key.id_hash(data)
if cache is not None and stats is not None:
cache.add_chunk(id, data, stats)
elif add_reference is not None:
cdata = key.encrypt(id, data)
add_reference(id, len(data), cdata)
else:
raise NotImplementedError
item_ptrs.append(id)
return item_ptrs
class Archive:
class DoesNotExist(Error):
"""Archive {} does not exist"""
@@ -479,6 +511,8 @@ def _load_meta(self, id):
metadata = ArchiveItem(internal_dict=msgpack.unpackb(data))
if metadata.version not in (1, 2): # legacy: still need to read v1 archives
raise Exception("Unknown archive metadata version")
# note: metadata.items must not get written to disk!
metadata.items = archive_get_items(metadata, self.key, self.repository)
return metadata
def load(self, id):
@@ -512,10 +546,6 @@ def duration(self):
def duration_from_meta(self):
return format_timedelta(self.ts_end - self.ts)
def _archive_csize(self):
cdata = self.repository.get(self.id)
return len(cdata)
def info(self):
if self.create:
stats = self.stats
@@ -532,7 +562,6 @@ def info(self):
"end": OutputTimestamp(end),
"duration": (end - start).total_seconds(),
"stats": stats.as_dict(),
"limits": {"max_archive_size": self._archive_csize() / MAX_DATA_SIZE},
}
if self.create:
info["command_line"] = sys.argv
@@ -556,12 +585,10 @@ def __str__(self):
Time (start): {start}
Time (end): {end}
Duration: {0.duration}
Utilization of max. archive size: {csize_max:.0%}
""".format(
self,
start=OutputTimestamp(self.start.replace(tzinfo=timezone.utc)),
end=OutputTimestamp(self.end.replace(tzinfo=timezone.utc)),
csize_max=self._archive_csize() / MAX_DATA_SIZE,
location=self.repository._location.canonical_path(),
)
@@ -599,6 +626,7 @@ def save(self, name=None, comment=None, timestamp=None, stats=None, additional_m
if name in self.manifest.archives:
raise self.AlreadyExists(name)
self.items_buffer.flush(flush=True)
item_ptrs = archive_put_items(self.items_buffer.chunks, key=self.key, cache=self.cache, stats=self.stats)
duration = timedelta(seconds=time.monotonic() - self.start_monotonic)
if timestamp is None:
end = datetime.utcnow()
@@ -612,7 +640,7 @@ def save(self, name=None, comment=None, timestamp=None, stats=None, additional_m
"version": 2,
"name": name,
"comment": comment or "",
"items": self.items_buffer.chunks,
"item_ptrs": item_ptrs, # see #1473
"cmdline": sys.argv,
"hostname": hostname,
"username": getuser(),
@@ -930,6 +958,8 @@ def restore_attrs(self, path, item, symlink=False, fd=None):
def set_meta(self, key, value):
metadata = self._load_meta(self.id)
setattr(metadata, key, value)
if "items" in metadata:
del metadata.items
data = msgpack.packb(metadata.as_dict())
new_id = self.key.id_hash(data)
self.cache.add_chunk(new_id, data, self.stats)
@@ -1004,6 +1034,11 @@ def chunk_decref(id, stats, part=False):
if forced == 0:
raise
error = True
# delete the blocks that store all the references that end up being loaded into metadata.items:
for id in self.metadata.item_ptrs:
chunk_decref(id, stats)
# in forced delete mode, we try hard to delete at least the manifest entry,
# if possible also the archive superblock, even if processing the items raises
# some harmless exception.
@@ -1997,7 +2032,8 @@ def valid_item(obj):
return True, ""
i = 0
for state, items in groupby(archive.items, missing_chunk_detector):
archive_items = archive_get_items(archive, self.key, repository)
for state, items in groupby(archive_items, missing_chunk_detector):
items = list(items)
if state % 2:
for chunk_id in items:
@@ -2078,9 +2114,11 @@ def valid_item(obj):
verify_file_chunks(info.name, item)
items_buffer.add(item)
items_buffer.flush(flush=True)
for previous_item_id in archive.items:
for previous_item_id in archive_get_items(archive, self.key, self.repository):
mark_as_possibly_superseded(previous_item_id)
archive.items = items_buffer.chunks
for previous_item_ptr in archive.item_ptrs:
mark_as_possibly_superseded(previous_item_ptr)
archive.item_ptrs = archive_put_items(items_buffer.chunks, key=self.key, add_reference=add_reference)
data = msgpack.packb(archive.as_dict())
new_archive_id = self.key.id_hash(data)
cdata = self.key.encrypt(new_archive_id, data)
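
Read together, ``archive_put_items`` and ``archive_get_items`` form a round trip: metadata stream chunk IDs in, pointer chunks out, the same IDs back. A self-contained sketch with throwaway stand-ins for borg's key and repository objects (``FakeKey``, ``FakeRepository`` and the reversed-bytes "encryption" are illustrative only, not borg's interfaces)::

    # Round trip of the pointer-chunk scheme used by archive_put_items()/archive_get_items().
    import hashlib
    import msgpack

    IDS_PER_CHUNK = 2  # tiny batch size, just for the sketch

    class FakeKey:
        def id_hash(self, data):
            return hashlib.sha256(data).digest()
        def encrypt(self, id, data):
            return data[::-1]   # stand-in "encryption": just reverse the bytes
        def decrypt(self, id, data):
            return data[::-1]

    class FakeRepository(dict):
        def get_many(self, ids):
            return [self[id] for id in ids]

    key, repo = FakeKey(), FakeRepository()

    # write path (cf. archive_put_items): batch the metadata stream chunk IDs
    # into msgpack'd, "encrypted" pointer chunks stored as repo objects
    item_chunk_ids = [bytes([n]) * 32 for n in range(5)]
    item_ptrs = []
    for i in range(0, len(item_chunk_ids), IDS_PER_CHUNK):
        data = msgpack.packb(item_chunk_ids[i : i + IDS_PER_CHUNK])
        ptr_id = key.id_hash(data)
        repo[ptr_id] = key.encrypt(ptr_id, data)
        item_ptrs.append(ptr_id)

    # read path (cf. archive_get_items): resolve pointer chunks back to the flat list
    resolved = []
    for ptr_id, cdata in zip(item_ptrs, repo.get_many(item_ptrs)):
        resolved.extend(msgpack.unpackb(key.decrypt(ptr_id, cdata)))
    assert resolved == item_chunk_ids

In the hunks above the write side is split between the ``cache.add_chunk`` path used by ``Archive.save`` and the ``add_reference`` path used during check/recreate; the sketch collapses both into a plain dict store.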

View file

@@ -72,7 +72,11 @@ def output(fd):
unpacker = msgpack.Unpacker(use_list=False, object_hook=StableDict)
first = True
for item_id in archive_org_dict["items"]:
items = []
for chunk_id in archive_org_dict["item_ptrs"]:
data = key.decrypt(chunk_id, repository.get(chunk_id))
items.extend(msgpack.unpackb(data))
for item_id in items:
data = key.decrypt(item_id, repository.get(item_id))
unpacker.feed(data)
for item in unpacker:
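
The dump code above leans on ``msgpack.Unpacker`` being a streaming decoder: decrypted chunks are fed in as they arrive and complete items fall out, regardless of where the chunk boundaries lie. A minimal illustration with made-up items::

    import msgpack

    stream = msgpack.packb({"path": "a"}) + msgpack.packb({"path": "b"})
    unpacker = msgpack.Unpacker(use_list=False)
    for piece in (stream[:5], stream[5:]):   # deliberately split mid-object
        unpacker.feed(piece)
        for item in unpacker:                # yields only fully decoded objects
            print(item)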

View file

@@ -55,7 +55,6 @@ def format_cmdline(cmdline):
Time (end): {end}
Duration: {duration}
Command line: {command_line}
Utilization of maximum supported archive size: {limits[max_archive_size]:.0%}
Number of files: {stats[nfiles]}
Original size: {stats[original_size]}
Deduplicated size: {stats[deduplicated_size]}
@@ -88,11 +87,6 @@ def build_parser_info(self, subparsers, common_parser, mid_common_parser):
= unique chunks of this archive.
All archives / deduplicated size = amount of data stored in the repo
= all chunks in the repository.
Borg archives can only contain a limited amount of file metadata.
The size of an archive relative to this limit depends on a number of factors,
mainly the number of files, the lengths of paths and other metadata stored for files.
This is shown as *utilization of maximum supported archive size*.
"""
)
subparser = subparsers.add_parser(

View file

@@ -775,8 +775,16 @@ def fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx):
archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
if archive.version not in (1, 2): # legacy
raise Exception("Unknown archive metadata version")
if archive.version == 1:
items = archive.items
elif archive.version == 2:
items = []
for chunk_id, (csize, data) in zip(archive.item_ptrs, decrypted_repository.get_many(archive.item_ptrs)):
chunk_idx.add(chunk_id, 1, len(data))
ids = msgpack.unpackb(data)
items.extend(ids)
sync = CacheSynchronizer(chunk_idx)
for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)):
for item_id, (csize, data) in zip(items, decrypted_repository.get_many(items)):
chunk_idx.add(item_id, 1, len(data))
processed_item_metadata_bytes += len(data)
processed_item_metadata_chunks += 1

View file

@@ -11,7 +11,9 @@
# this set must be kept complete, otherwise rebuild_manifest might malfunction:
# fmt: off
ARCHIVE_KEYS = frozenset(['version', 'name', 'items', 'cmdline', 'hostname', 'username', 'time', 'time_end',
ARCHIVE_KEYS = frozenset(['version', 'name', 'cmdline', 'hostname', 'username', 'time', 'time_end',
'items', # legacy v1 archives
'item_ptrs', # v2+ archives
'comment', 'chunker_params',
'recreate_cmdline',
'recreate_source_id', 'recreate_args', 'recreate_partial_chunks', # used in 1.1.0b1 .. b2
@@ -19,7 +21,7 @@
# fmt: on
# this is the set of keys that are always present in archives:
REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "items", "cmdline", "time"])
REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "item_ptrs", "cmdline", "time"])
# default umask, overridden by --umask, defaults to read/write only for owner
UMASK_DEFAULT = 0o077
@@ -47,6 +49,9 @@
# borg < 1.3, but this is not expected to cause any issues.
MAX_OBJECT_SIZE = MAX_DATA_SIZE + 41 + 8 # see assertion at end of repository module
# how many metadata stream chunk ids do we store into a "pointer chunk" of the ArchiveItem.item_ptrs list?
IDS_PER_CHUNK = 3 # MAX_DATA_SIZE // 40
# repo config max_segment_size value must be below this limit to stay within uint32 offsets:
MAX_SEGMENT_SIZE_LIMIT = 2**32 - MAX_OBJECT_SIZE
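
The ``~34B`` per chunk ID noted in the docs hunk above and the ``// 40`` in the ``IDS_PER_CHUNK`` comment fit together: a 32-byte ID serializes to 34 bytes of msgpack (a 2-byte bin header plus the ID itself), so budgeting 40 bytes per entry leaves headroom. A quick check::

    import msgpack

    ids = [b"\x00" * 32] * 1000
    per_entry = len(msgpack.packb(ids)) / len(ids)
    print(per_entry)   # ~34.0, plus a negligible share of the list header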

View file

@@ -98,6 +98,10 @@ class ArchiveItem(PropDict):
def items(self) -> List: ...
@items.setter
def items(self, val: List) -> None: ...
@property
def item_ptrs(self) -> List: ...
@item_ptrs.setter
def item_ptrs(self, val: List) -> None: ...
class ChunkListEntry(NamedTuple):
id: bytes

View file

@@ -483,7 +483,8 @@ class ArchiveItem(PropDict):
version = PropDict._make_property('version', int)
name = PropDict._make_property('name', str, 'surrogate-escaped str')
items = PropDict._make_property('items', list)
items = PropDict._make_property('items', list) # list of chunk ids of item metadata stream (only in memory)
item_ptrs = PropDict._make_property('item_ptrs', list) # list of pointer chunks, each with a list of chunk ids of the item metadata stream, archive v2+
cmdline = PropDict._make_property('cmdline', list) # list of s-e-str
hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str')
username = PropDict._make_property('username', str, 'surrogate-escaped str')
@@ -515,7 +516,9 @@ class ArchiveItem(PropDict):
v = fix_tuple_of_str_and_int(v)
if k in ('cmdline', 'recreate_cmdline'):
v = fix_list_of_str(v)
if k == 'items':
if k == 'items': # legacy
v = fix_list_of_bytes(v)
if k == 'item_ptrs':
v = fix_list_of_bytes(v)
self._dict[k] = v
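
For readers unfamiliar with ``PropDict``, the pattern above amounts to properties that proxy a backing ``_dict`` with type checks. The class below is an illustrative stand-in, not borg's actual implementation; it only exists to show why the in-memory ``items`` attribute and the ``del metadata.items`` in ``set_meta`` behave the way they do::

    # Illustrative stand-in for the PropDict property pattern (not borg's real code).
    class MiniPropDict:
        def __init__(self, **kw):
            self._dict = dict(kw)

        @staticmethod
        def _make_property(key, value_type):
            def _get(self):
                return self._dict[key]
            def _set(self, value):
                if not isinstance(value, value_type):
                    raise TypeError(f"{key} must be a {value_type.__name__}")
                self._dict[key] = value
            def _del(self):
                del self._dict[key]
            return property(_get, _set, _del)

        def __contains__(self, key):
            return key in self._dict

    class MiniArchiveItem(MiniPropDict):
        items = MiniPropDict._make_property("items", list)          # in-memory only
        item_ptrs = MiniPropDict._make_property("item_ptrs", list)  # persisted, archive v2+

    a = MiniArchiveItem(item_ptrs=[b"\x00" * 32])
    a.items = [b"\x01" * 32]    # what _load_meta() attaches after resolving item_ptrs
    assert "items" in a         # the check set_meta() performs
    del a.items                 # ...and the cleanup it does before re-serializing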

View file

@@ -3981,7 +3981,7 @@ def test_manifest_rebuild_duplicate_archive(self):
archive = msgpack.packb(
{
"cmdline": [],
"items": [],
"item_ptrs": [],
"hostname": "foo",
"username": "bar",
"name": "archive1",