massively increase per archive metadata stream size limit, fixes #1473
Implemented by introducing one level of indirection: the limit is now so high that it is no longer practically relevant. We always use the indirection (the metadata stream chunk ID list is not stored directly in the archive item, but in repo objects referenced by the new ArchiveItem.item_ptrs list), so the code behaves the same for all archive sizes.
parent 02580c09ea
commit fb74fdb710

11 changed files with 82 additions and 70 deletions
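For scale, here is a rough back-of-the-envelope estimate of the new capacity. It uses only numbers that appear in this diff and in the old documentation (~40 B per chunk ID list entry, ~128 kiB per item metadata stream chunk) and assumes IDS_PER_CHUNK is set to its intended value of MAX_DATA_SIZE // 40 rather than the debug value 3 used in this commit; treat it as an estimate, not a guarantee.

```python
# Rough estimate of the new two-level limit (numbers from this diff / the old docs).
MAX_DATA_SIZE = 20971479          # max usable bytes per repo object (constants.py)
BYTES_PER_ID_ENTRY = 40           # ~40 B per chunk id list entry (old docs estimate)
ITEM_CHUNK_SIZE = 128 * 1024      # ~128 kiB per item metadata stream chunk (old docs estimate)

ids_per_pointer_chunk = MAX_DATA_SIZE // BYTES_PER_ID_ENTRY       # ~524k ids per pointer chunk
pointer_chunks_per_archive = MAX_DATA_SIZE // BYTES_PER_ID_ENTRY  # the archive item now only holds pointer chunk ids

stream_chunks = ids_per_pointer_chunk * pointer_chunks_per_archive
stream_bytes = stream_chunks * ITEM_CHUNK_SIZE

print(f"~{stream_chunks:.1e} metadata stream chunks per archive")  # ~2.7e+11
print(f"~{stream_bytes / 2**50:.0f} PiB of item metadata stream")  # ~32 PiB
```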
@@ -115,15 +115,6 @@ Which file types, attributes, etc. are *not* preserved?

Are there other known limitations?
----------------------------------

- A single archive can only reference a limited volume of file/dir metadata,
  usually corresponding to tens or hundreds of millions of files/dirs.
  When trying to go beyond that limit, you will get a fatal IntegrityError
  exception telling that the (archive) object is too big.
  An easy workaround is to create multiple archives with fewer items each.
  See also the :ref:`archive_limitation` and :issue:`1452`.

  :ref:`borg_info` shows how large (relative to the maximum size) existing
  archives are.
- borg extract only supports restoring into an empty destination. After that,
  the destination will exactly have the contents of the extracted archive.
  If you extract into a non-empty destination, borg will (for example) not
@@ -511,7 +511,8 @@ The archive object itself further contains some metadata:

  When :ref:`borg_check` rebuilds the manifest (e.g. if it was corrupted) and finds
  more than one archive object with the same name, it adds a counter to the name
  in the manifest, but leaves the *name* field of the archives as it was.
* *items*, a list of chunk IDs containing item metadata (size: count * ~34B)
* *item_ptrs*, a list of "pointer chunk" IDs.
  Each "pointer chunk" contains a list of chunk IDs of item metadata.
* *cmdline*, the command line which was used to create the archive
* *hostname*
* *username*
@@ -521,34 +522,6 @@ The archive object itself further contains some metadata:

  This is used by :ref:`borg_recreate` to determine whether a given archive needs rechunking.
* Some other pieces of information related to recreate.

.. _archive_limitation:

.. rubric:: Note about archive limitations

The archive is currently stored as a single object in the repository
and thus limited in size to MAX_OBJECT_SIZE (20MiB).

As one chunk list entry is ~40B, that means we can reference ~500.000 item
metadata stream chunks per archive.

Each item metadata stream chunk is ~128kiB (see hardcoded ITEMS_CHUNKER_PARAMS).

So that means the whole item metadata stream is limited to ~64GiB of chunks.
If compression is used, the amount of storable metadata is bigger - by the
compression factor.

If the medium size of an item entry is 100B (small size file, no ACLs/xattrs),
that means a limit of ~640 million files/directories per archive.

If the medium size of an item entry is 2kB (~100MB size files or more
ACLs/xattrs), the limit will be ~32 million files/directories per archive.

If one tries to create an archive object bigger than MAX_OBJECT_SIZE, a fatal
IntegrityError will be raised.

A workaround is to create multiple archives with fewer items each, see
also :issue:`1452`.

.. _item:

Items
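The numbers in the limitation note removed above can be reproduced with a few lines of arithmetic (approximate, since the note rounds both the 20 MiB object size and the per-entry item sizes):

```python
# Reproducing the arithmetic of the removed limitation note, as a sanity check of its figures.
MAX_OBJECT_SIZE = 20 * 1024 * 1024   # the "20MiB" single-object limit quoted above
ENTRY_SIZE = 40                      # "~40B" per chunk list entry
ITEM_CHUNK_SIZE = 128 * 1024         # "~128kiB" per item metadata stream chunk

max_stream_chunks = MAX_OBJECT_SIZE // ENTRY_SIZE        # the "~500.000" above
max_stream_bytes = max_stream_chunks * ITEM_CHUNK_SIZE   # the "~64GiB" above

print(max_stream_chunks)                                 # 524288
print(max_stream_bytes // 2**30, "GiB")                  # 64 GiB
print(max_stream_bytes // 100, "items at ~100B each")    # ~687 million (docs round down to ~640 million)
print(max_stream_bytes // 2048, "items at ~2kB each")    # ~33.5 million (docs round down to ~32 million)
```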
@@ -331,11 +331,6 @@ stats

        Deduplicated size (against the current repository, not when the archive was created)
    nfiles
        Number of regular files in the archive
limits
    Object describing the utilization of Borg limits

    max_archive_size
        Float between 0 and 1 describing how large this archive is relative to the maximum size allowed by Borg
command_line
    Array of strings of the command line that created the archive
@@ -405,9 +400,6 @@ The same archive with more information (``borg info --last 1 --json``)::

            "end": "2017-02-27T12:27:20.789123",
            "hostname": "host",
            "id": "80cd07219ad725b3c5f665c1dcf119435c4dee1647a560ecac30f8d40221a46a",
            "limits": {
                "max_archive_size": 0.0001330855110409714
            },
            "name": "host-system-backup-2017-02-27",
            "start": "2017-02-27T12:27:20.789123",
            "stats": {
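For the pre-change JSON format shown above, a consumer could read the utilization value like this. This is a hedged sketch for the old output (this commit removes the ``limits`` block), and it assumes the repository is selected via the BORG_REPO environment variable:

```python
# Read "limits.max_archive_size" from pre-change `borg info --last 1 --json` output.
import json
import subprocess

out = subprocess.run(
    ["borg", "info", "--last", "1", "--json"],  # repository assumed to come from BORG_REPO
    capture_output=True, text=True, check=True,
).stdout
info = json.loads(out)
for archive in info["archives"]:
    utilization = archive["limits"]["max_archive_size"]
    print(f'{archive["name"]}: {utilization:.2%} of the maximum archive size is used')
```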
@@ -391,6 +391,38 @@ def get_item_uid_gid(item, *, numeric, uid_forced=None, gid_forced=None, uid_def
    return uid, gid


def archive_get_items(metadata, key, repository):
    if "item_ptrs" in metadata:  # looks like a v2+ archive
        assert "items" not in metadata
        items = []
        for id, data in zip(metadata.item_ptrs, repository.get_many(metadata.item_ptrs)):
            data = key.decrypt(id, data)
            ids = msgpack.unpackb(data)
            items.extend(ids)
        return items

    if "items" in metadata:  # legacy, v1 archive
        assert "item_ptrs" not in metadata
        return metadata.items


def archive_put_items(chunk_ids, *, key, cache=None, stats=None, add_reference=None):
    """gets a (potentially large) list of archive metadata stream chunk ids and writes them to repo objects"""
    item_ptrs = []
    for i in range(0, len(chunk_ids), IDS_PER_CHUNK):
        data = msgpack.packb(chunk_ids[i : i + IDS_PER_CHUNK])
        id = key.id_hash(data)
        if cache is not None and stats is not None:
            cache.add_chunk(id, data, stats)
        elif add_reference is not None:
            cdata = key.encrypt(id, data)
            add_reference(id, len(data), cdata)
        else:
            raise NotImplementedError
        item_ptrs.append(id)
    return item_ptrs


class Archive:
    class DoesNotExist(Error):
        """Archive {} does not exist"""
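To make the indirection concrete, here is a self-contained sketch that mirrors the logic of archive_put_items / archive_get_items above, using plain msgpack and trivial stand-ins for the key and repository. PlainKey, DictRepository and the helper names are illustrative assumptions, not borg classes or API:

```python
# Minimal round-trip sketch mirroring archive_put_items / archive_get_items above.
import hashlib
import msgpack

IDS_PER_CHUNK = 3  # tiny value, as in this commit, so the example produces several pointer chunks


class PlainKey:
    def id_hash(self, data):
        return hashlib.sha256(data).digest()

    def encrypt(self, id, data):
        return data  # no real crypto in this sketch

    def decrypt(self, id, data):
        return data


class DictRepository(dict):
    def get_many(self, ids):
        return [self[id] for id in ids]


def put_items(chunk_ids, *, key, repo):
    """split a long chunk id list into msgpack'ed "pointer chunks", store them, return their ids"""
    item_ptrs = []
    for i in range(0, len(chunk_ids), IDS_PER_CHUNK):
        data = msgpack.packb(chunk_ids[i : i + IDS_PER_CHUNK])
        id = key.id_hash(data)
        repo[id] = key.encrypt(id, data)
        item_ptrs.append(id)
    return item_ptrs


def get_items(item_ptrs, *, key, repo):
    """resolve the pointer chunks back into the flat list of item metadata chunk ids"""
    items = []
    for id, data in zip(item_ptrs, repo.get_many(item_ptrs)):
        items.extend(msgpack.unpackb(key.decrypt(id, data)))
    return items


key, repo = PlainKey(), DictRepository()
chunk_ids = [hashlib.sha256(bytes([i])).digest() for i in range(10)]
ptrs = put_items(chunk_ids, key=key, repo=repo)
assert get_items(ptrs, key=key, repo=repo) == chunk_ids
print(f"{len(chunk_ids)} item metadata chunk ids stored behind {len(ptrs)} pointer chunks")
```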
@@ -479,6 +511,8 @@ class Archive:
        metadata = ArchiveItem(internal_dict=msgpack.unpackb(data))
        if metadata.version not in (1, 2):  # legacy: still need to read v1 archives
            raise Exception("Unknown archive metadata version")
        # note: metadata.items must not get written to disk!
        metadata.items = archive_get_items(metadata, self.key, self.repository)
        return metadata

    def load(self, id):
@@ -512,10 +546,6 @@
    def duration_from_meta(self):
        return format_timedelta(self.ts_end - self.ts)

    def _archive_csize(self):
        cdata = self.repository.get(self.id)
        return len(cdata)

    def info(self):
        if self.create:
            stats = self.stats
@@ -532,7 +562,6 @@
            "end": OutputTimestamp(end),
            "duration": (end - start).total_seconds(),
            "stats": stats.as_dict(),
            "limits": {"max_archive_size": self._archive_csize() / MAX_DATA_SIZE},
        }
        if self.create:
            info["command_line"] = sys.argv
@@ -556,12 +585,10 @@ Archive fingerprint: {0.fpr}
Time (start): {start}
Time (end): {end}
Duration: {0.duration}
Utilization of max. archive size: {csize_max:.0%}
""".format(
            self,
            start=OutputTimestamp(self.start.replace(tzinfo=timezone.utc)),
            end=OutputTimestamp(self.end.replace(tzinfo=timezone.utc)),
            csize_max=self._archive_csize() / MAX_DATA_SIZE,
            location=self.repository._location.canonical_path(),
        )
@@ -599,6 +626,7 @@ Utilization of max. archive size: {csize_max:.0%}
        if name in self.manifest.archives:
            raise self.AlreadyExists(name)
        self.items_buffer.flush(flush=True)
        item_ptrs = archive_put_items(self.items_buffer.chunks, key=self.key, cache=self.cache, stats=self.stats)
        duration = timedelta(seconds=time.monotonic() - self.start_monotonic)
        if timestamp is None:
            end = datetime.utcnow()
@@ -612,7 +640,7 @@ Utilization of max. archive size: {csize_max:.0%}
            "version": 2,
            "name": name,
            "comment": comment or "",
            "items": self.items_buffer.chunks,
            "item_ptrs": item_ptrs,  # see #1473
            "cmdline": sys.argv,
            "hostname": hostname,
            "username": getuser(),
@@ -930,6 +958,8 @@ Utilization of max. archive size: {csize_max:.0%}
    def set_meta(self, key, value):
        metadata = self._load_meta(self.id)
        setattr(metadata, key, value)
        if "items" in metadata:
            del metadata.items
        data = msgpack.packb(metadata.as_dict())
        new_id = self.key.id_hash(data)
        self.cache.add_chunk(new_id, data, self.stats)
@@ -1004,6 +1034,11 @@ Utilization of max. archive size: {csize_max:.0%}
            if forced == 0:
                raise
            error = True

        # delete the blocks that store all the references that end up being loaded into metadata.items:
        for id in self.metadata.item_ptrs:
            chunk_decref(id, stats)

        # in forced delete mode, we try hard to delete at least the manifest entry,
        # if possible also the archive superblock, even if processing the items raises
        # some harmless exception.
@@ -1997,7 +2032,8 @@ class ArchiveChecker:
                return True, ""

            i = 0
            for state, items in groupby(archive.items, missing_chunk_detector):
            archive_items = archive_get_items(archive, self.key, repository)
            for state, items in groupby(archive_items, missing_chunk_detector):
                items = list(items)
                if state % 2:
                    for chunk_id in items:
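The ``state % 2`` test above relies on a detector that bumps a counter every time the "chunk is missing" status flips, so that itertools.groupby yields alternating runs of present and missing chunk IDs. A hedged, self-contained illustration of that idiom follows; the detector and the sample data are simplified stand-ins, not borg's exact implementation:

```python
# Simplified illustration of the groupby + state-counter grouping idiom used above.
from itertools import groupby

present = {b"a", b"c", b"d"}          # stand-in for "chunk id exists in the repo index"
chunk_ids = [b"a", b"x", b"y", b"c", b"d", b"z"]

state = 0

def missing_chunk_detector(chunk_id):
    global state
    # bump the counter whenever we cross a present/missing boundary,
    # so ids in the same run share the same state value
    if int(chunk_id not in present) != state % 2:
        state += 1
    return state

for state_value, ids in groupby(chunk_ids, missing_chunk_detector):
    ids = list(ids)
    if state_value % 2:               # odd state -> a run of missing chunks
        print("missing run:", ids)    # [b'x', b'y'] then [b'z']
    else:
        print("present run:", ids)    # [b'a'] then [b'c', b'd']
```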
@@ -2078,9 +2114,11 @@ class ArchiveChecker:
                    verify_file_chunks(info.name, item)
                items_buffer.add(item)
            items_buffer.flush(flush=True)
            for previous_item_id in archive.items:
            for previous_item_id in archive_get_items(archive, self.key, self.repository):
                mark_as_possibly_superseded(previous_item_id)
            archive.items = items_buffer.chunks
            for previous_item_ptr in archive.item_ptrs:
                mark_as_possibly_superseded(previous_item_ptr)
            archive.item_ptrs = archive_put_items(items_buffer.chunks, key=self.key, add_reference=add_reference)
            data = msgpack.packb(archive.as_dict())
            new_archive_id = self.key.id_hash(data)
            cdata = self.key.encrypt(new_archive_id, data)
@@ -72,7 +72,11 @@ class DebugMixIn:
        unpacker = msgpack.Unpacker(use_list=False, object_hook=StableDict)
        first = True
        for item_id in archive_org_dict["items"]:
        items = []
        for chunk_id in archive_org_dict["item_ptrs"]:
            data = key.decrypt(chunk_id, repository.get(chunk_id))
            items.extend(msgpack.unpackb(data))
        for item_id in items:
            data = key.decrypt(item_id, repository.get(item_id))
            unpacker.feed(data)
            for item in unpacker:
@@ -55,7 +55,6 @@ class InfoMixIn:
        Time (end): {end}
        Duration: {duration}
        Command line: {command_line}
        Utilization of maximum supported archive size: {limits[max_archive_size]:.0%}
        Number of files: {stats[nfiles]}
        Original size: {stats[original_size]}
        Deduplicated size: {stats[deduplicated_size]}
@@ -88,11 +87,6 @@ class InfoMixIn:
                                         = unique chunks of this archive.
        All archives / deduplicated size = amount of data stored in the repo
                                         = all chunks in the repository.

        Borg archives can only contain a limited amount of file metadata.
        The size of an archive relative to this limit depends on a number of factors,
        mainly the number of files, the lengths of paths and other metadata stored for files.
        This is shown as *utilization of maximum supported archive size*.
        """
        )
        subparser = subparsers.add_parser(
@@ -775,8 +775,16 @@ class LocalCache(CacheStatsMixin):
            archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
            if archive.version not in (1, 2):  # legacy
                raise Exception("Unknown archive metadata version")
            if archive.version == 1:
                items = archive.items
            elif archive.version == 2:
                items = []
                for chunk_id, (csize, data) in zip(archive.item_ptrs, decrypted_repository.get_many(archive.item_ptrs)):
                    chunk_idx.add(chunk_id, 1, len(data))
                    ids = msgpack.unpackb(data)
                    items.extend(ids)
            sync = CacheSynchronizer(chunk_idx)
            for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)):
            for item_id, (csize, data) in zip(items, decrypted_repository.get_many(items)):
                chunk_idx.add(item_id, 1, len(data))
                processed_item_metadata_bytes += len(data)
                processed_item_metadata_chunks += 1
@@ -11,7 +11,9 @@ REQUIRED_ITEM_KEYS = frozenset(["path", "mtime"])
# this set must be kept complete, otherwise rebuild_manifest might malfunction:
# fmt: off
ARCHIVE_KEYS = frozenset(['version', 'name', 'items', 'cmdline', 'hostname', 'username', 'time', 'time_end',
ARCHIVE_KEYS = frozenset(['version', 'name', 'cmdline', 'hostname', 'username', 'time', 'time_end',
                          'items',  # legacy v1 archives
                          'item_ptrs',  # v2+ archives
                          'comment', 'chunker_params',
                          'recreate_cmdline',
                          'recreate_source_id', 'recreate_args', 'recreate_partial_chunks',  # used in 1.1.0b1 .. b2
@@ -19,7 +21,7 @@ ARCHIVE_KEYS = frozenset(['version', 'name', 'items', 'cmdline', 'hostname', 'us
# fmt: on

# this is the set of keys that are always present in archives:
REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "items", "cmdline", "time"])
REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "item_ptrs", "cmdline", "time"])

# default umask, overridden by --umask, defaults to read/write only for owner
UMASK_DEFAULT = 0o077
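REQUIRED_ARCHIVE_KEYS is the set a manifest-rebuilding check can use to decide whether an unpacked msgpack object plausibly is an archive. A minimal hedged sketch of that kind of key-set check follows; the function name and the candidate dicts are made up for illustration and are not borg's actual implementation:

```python
# Hedged sketch of a key-set plausibility check; names here are illustrative only.
REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "item_ptrs", "cmdline", "time"])

def looks_like_archive(obj) -> bool:
    """True if the unpacked msgpack object carries all keys every v2 archive must have."""
    return isinstance(obj, dict) and REQUIRED_ARCHIVE_KEYS.issubset(obj.keys())

print(looks_like_archive({"version": 2, "name": "a1", "item_ptrs": [], "cmdline": [], "time": "t"}))  # True
print(looks_like_archive({"version": 1, "name": "a1", "items": []}))  # False: a v1 archive has no item_ptrs
```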
@@ -47,6 +49,9 @@ MAX_DATA_SIZE = 20971479
# borg < 1.3, but this is not expected to cause any issues.
MAX_OBJECT_SIZE = MAX_DATA_SIZE + 41 + 8  # see assertion at end of repository module

# how many metadata stream chunk ids do we store into a "pointer chunk" of the ArchiveItem.item_ptrs list?
IDS_PER_CHUNK = 3  # MAX_DATA_SIZE // 40

# repo config max_segment_size value must be below this limit to stay within uint32 offsets:
MAX_SEGMENT_SIZE_LIMIT = 2**32 - MAX_OBJECT_SIZE
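The ~40 B per ID behind the ``MAX_DATA_SIZE // 40`` comment (and the "count * ~34B" figure in the internals docs above) can be sanity-checked directly with msgpack; a quick check, assuming 32-byte chunk IDs:

```python
# A 32-byte chunk id packs to 34 B as a msgpack bin value, so ~40 B per list entry
# is a comfortable upper bound for the size estimates used above.
import msgpack

chunk_id = b"\x00" * 32
print(len(msgpack.packb(chunk_id)))            # 34  (2 bytes of bin header + 32 bytes of id)
print(len(msgpack.packb([chunk_id] * 1000)))   # ~34 per entry plus a few bytes of list header
```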
@@ -98,6 +98,10 @@ class ArchiveItem(PropDict):
    def items(self) -> List: ...
    @items.setter
    def items(self, val: List) -> None: ...
    @property
    def item_ptrs(self) -> List: ...
    @items.setter
    def item_ptrs(self, val: List) -> None: ...

class ChunkListEntry(NamedTuple):
    id: bytes
@@ -483,7 +483,8 @@ class ArchiveItem(PropDict):
    version = PropDict._make_property('version', int)
    name = PropDict._make_property('name', str, 'surrogate-escaped str')
    items = PropDict._make_property('items', list)
    items = PropDict._make_property('items', list)  # list of chunk ids of item metadata stream (only in memory)
    item_ptrs = PropDict._make_property('item_ptrs', list)  # list of blocks with list of chunk ids of ims, arch v2
    cmdline = PropDict._make_property('cmdline', list)  # list of s-e-str
    hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str')
    username = PropDict._make_property('username', str, 'surrogate-escaped str')
@@ -515,7 +516,9 @@ class ArchiveItem(PropDict):
                v = fix_tuple_of_str_and_int(v)
            if k in ('cmdline', 'recreate_cmdline'):
                v = fix_list_of_str(v)
            if k == 'items':
            if k == 'items':  # legacy
                v = fix_list_of_bytes(v)
            if k == 'item_ptrs':
                v = fix_list_of_bytes(v)
            self._dict[k] = v
@@ -3981,7 +3981,7 @@ class ArchiverCheckTestCase(ArchiverTestCaseBase):
        archive = msgpack.packb(
            {
                "cmdline": [],
                "items": [],
                "item_ptrs": [],
                "hostname": "foo",
                "username": "bar",
                "name": "archive1",