From fb74fdb7102295a798dc23cd82a227172c725724 Mon Sep 17 00:00:00 2001 From: Thomas Waldmann Date: Fri, 5 Aug 2022 22:06:08 +0200 Subject: [PATCH] massively increase per archive metadata stream size limit, fixes #1473 implemented by introducing one level of indirection, the limit is now very high, so it is not practically relevant any more. we always use the indirection (storing the metadata stream chunk ids list not directly into the archive item, but into some repo objects referenced by the new ArchiveItem.item_ptrs list). thus, the code behaves the same for all archive sizes. --- docs/faq.rst | 9 ----- docs/internals/data-structures.rst | 31 +-------------- docs/internals/frontends.rst | 8 ---- src/borg/archive.py | 60 ++++++++++++++++++++++++------ src/borg/archiver/debug.py | 6 ++- src/borg/archiver/info.py | 6 --- src/borg/cache.py | 10 ++++- src/borg/constants.py | 9 ++++- src/borg/item.pyi | 4 ++ src/borg/item.pyx | 7 +++- src/borg/testsuite/archiver.py | 2 +- 11 files changed, 82 insertions(+), 70 deletions(-) diff --git a/docs/faq.rst b/docs/faq.rst index dbcdf9d6a..f767a277e 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -115,15 +115,6 @@ Which file types, attributes, etc. are *not* preserved? Are there other known limitations? ---------------------------------- -- A single archive can only reference a limited volume of file/dir metadata, - usually corresponding to tens or hundreds of millions of files/dirs. - When trying to go beyond that limit, you will get a fatal IntegrityError - exception telling that the (archive) object is too big. - An easy workaround is to create multiple archives with fewer items each. - See also the :ref:`archive_limitation` and :issue:`1452`. - - :ref:`borg_info` shows how large (relative to the maximum size) existing - archives are. - borg extract only supports restoring into an empty destination. After that, the destination will exactly have the contents of the extracted archive. If you extract into a non-empty destination, borg will (for example) not diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst index e33ce550d..f337eb18a 100644 --- a/docs/internals/data-structures.rst +++ b/docs/internals/data-structures.rst @@ -511,7 +511,8 @@ The archive object itself further contains some metadata: When :ref:`borg_check` rebuilds the manifest (e.g. if it was corrupted) and finds more than one archive object with the same name, it adds a counter to the name in the manifest, but leaves the *name* field of the archives as it was. -* *items*, a list of chunk IDs containing item metadata (size: count * ~34B) +* *item_ptrs*, a list of "pointer chunk" IDs. + Each "pointer chunk" contains a list of chunk IDs of item metadata. * *cmdline*, the command line which was used to create the archive * *hostname* * *username* @@ -521,34 +522,6 @@ The archive object itself further contains some metadata: This is used by :ref:`borg_recreate` to determine whether a given archive needs rechunking. * Some other pieces of information related to recreate. -.. _archive_limitation: - -.. rubric:: Note about archive limitations - -The archive is currently stored as a single object in the repository -and thus limited in size to MAX_OBJECT_SIZE (20MiB). - -As one chunk list entry is ~40B, that means we can reference ~500.000 item -metadata stream chunks per archive. - -Each item metadata stream chunk is ~128kiB (see hardcoded ITEMS_CHUNKER_PARAMS). - -So that means the whole item metadata stream is limited to ~64GiB chunks. 
-If compression is used, the amount of storable metadata is bigger - by the -compression factor. - -If the medium size of an item entry is 100B (small size file, no ACLs/xattrs), -that means a limit of ~640 million files/directories per archive. - -If the medium size of an item entry is 2kB (~100MB size files or more -ACLs/xattrs), the limit will be ~32 million files/directories per archive. - -If one tries to create an archive object bigger than MAX_OBJECT_SIZE, a fatal -IntegrityError will be raised. - -A workaround is to create multiple archives with fewer items each, see -also :issue:`1452`. - .. _item: Items diff --git a/docs/internals/frontends.rst b/docs/internals/frontends.rst index 9eab9c64d..cd9b431c5 100644 --- a/docs/internals/frontends.rst +++ b/docs/internals/frontends.rst @@ -331,11 +331,6 @@ stats Deduplicated size (against the current repository, not when the archive was created) nfiles Number of regular files in the archive -limits - Object describing the utilization of Borg limits - - max_archive_size - Float between 0 and 1 describing how large this archive is relative to the maximum size allowed by Borg command_line Array of strings of the command line that created the archive @@ -405,9 +400,6 @@ The same archive with more information (``borg info --last 1 --json``):: "end": "2017-02-27T12:27:20.789123", "hostname": "host", "id": "80cd07219ad725b3c5f665c1dcf119435c4dee1647a560ecac30f8d40221a46a", - "limits": { - "max_archive_size": 0.0001330855110409714 - }, "name": "host-system-backup-2017-02-27", "start": "2017-02-27T12:27:20.789123", "stats": { diff --git a/src/borg/archive.py b/src/borg/archive.py index 481017f54..020ba5ac9 100644 --- a/src/borg/archive.py +++ b/src/borg/archive.py @@ -391,6 +391,38 @@ def get_item_uid_gid(item, *, numeric, uid_forced=None, gid_forced=None, uid_def return uid, gid +def archive_get_items(metadata, key, repository): + if "item_ptrs" in metadata: # looks like a v2+ archive + assert "items" not in metadata + items = [] + for id, data in zip(metadata.item_ptrs, repository.get_many(metadata.item_ptrs)): + data = key.decrypt(id, data) + ids = msgpack.unpackb(data) + items.extend(ids) + return items + + if "items" in metadata: # legacy, v1 archive + assert "item_ptrs" not in metadata + return metadata.items + + +def archive_put_items(chunk_ids, *, key, cache=None, stats=None, add_reference=None): + """gets a (potentially large) list of archive metadata stream chunk ids and writes them to repo objects""" + item_ptrs = [] + for i in range(0, len(chunk_ids), IDS_PER_CHUNK): + data = msgpack.packb(chunk_ids[i : i + IDS_PER_CHUNK]) + id = key.id_hash(data) + if cache is not None and stats is not None: + cache.add_chunk(id, data, stats) + elif add_reference is not None: + cdata = key.encrypt(id, data) + add_reference(id, len(data), cdata) + else: + raise NotImplementedError + item_ptrs.append(id) + return item_ptrs + + class Archive: class DoesNotExist(Error): """Archive {} does not exist""" @@ -479,6 +511,8 @@ class Archive: metadata = ArchiveItem(internal_dict=msgpack.unpackb(data)) if metadata.version not in (1, 2): # legacy: still need to read v1 archives raise Exception("Unknown archive metadata version") + # note: metadata.items must not get written to disk! 
+ metadata.items = archive_get_items(metadata, self.key, self.repository) return metadata def load(self, id): @@ -512,10 +546,6 @@ class Archive: def duration_from_meta(self): return format_timedelta(self.ts_end - self.ts) - def _archive_csize(self): - cdata = self.repository.get(self.id) - return len(cdata) - def info(self): if self.create: stats = self.stats @@ -532,7 +562,6 @@ class Archive: "end": OutputTimestamp(end), "duration": (end - start).total_seconds(), "stats": stats.as_dict(), - "limits": {"max_archive_size": self._archive_csize() / MAX_DATA_SIZE}, } if self.create: info["command_line"] = sys.argv @@ -556,12 +585,10 @@ Archive fingerprint: {0.fpr} Time (start): {start} Time (end): {end} Duration: {0.duration} -Utilization of max. archive size: {csize_max:.0%} """.format( self, start=OutputTimestamp(self.start.replace(tzinfo=timezone.utc)), end=OutputTimestamp(self.end.replace(tzinfo=timezone.utc)), - csize_max=self._archive_csize() / MAX_DATA_SIZE, location=self.repository._location.canonical_path(), ) @@ -599,6 +626,7 @@ Utilization of max. archive size: {csize_max:.0%} if name in self.manifest.archives: raise self.AlreadyExists(name) self.items_buffer.flush(flush=True) + item_ptrs = archive_put_items(self.items_buffer.chunks, key=self.key, cache=self.cache, stats=self.stats) duration = timedelta(seconds=time.monotonic() - self.start_monotonic) if timestamp is None: end = datetime.utcnow() @@ -612,7 +640,7 @@ Utilization of max. archive size: {csize_max:.0%} "version": 2, "name": name, "comment": comment or "", - "items": self.items_buffer.chunks, + "item_ptrs": item_ptrs, # see #1473 "cmdline": sys.argv, "hostname": hostname, "username": getuser(), @@ -930,6 +958,8 @@ Utilization of max. archive size: {csize_max:.0%} def set_meta(self, key, value): metadata = self._load_meta(self.id) setattr(metadata, key, value) + if "items" in metadata: + del metadata.items data = msgpack.packb(metadata.as_dict()) new_id = self.key.id_hash(data) self.cache.add_chunk(new_id, data, self.stats) @@ -1004,6 +1034,11 @@ Utilization of max. archive size: {csize_max:.0%} if forced == 0: raise error = True + + # delete the blocks that store all the references that end up being loaded into metadata.items: + for id in self.metadata.item_ptrs: + chunk_decref(id, stats) + # in forced delete mode, we try hard to delete at least the manifest entry, # if possible also the archive superblock, even if processing the items raises # some harmless exception. 
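The two helpers introduced above are the heart of the change: archive_put_items fans the (potentially huge) list of item metadata stream chunk ids out into small "pointer chunks" stored as ordinary repo objects, and archive_get_items folds them back into one flat list when the archive is loaded. The following is a minimal, self-contained sketch of that round trip; it is not borg's API (the sha256 call stands in for key.id_hash, the dict stands in for the repository, and put_items/get_items are illustrative names), and encryption as well as refcounting are omitted:

    # illustrative sketch only -- needs the third-party "msgpack" package
    import hashlib
    import msgpack

    IDS_PER_CHUNK = 3   # tiny value, as in this patch's constants.py
    fake_repo = {}      # stands in for the repository key/value store

    def put_items(chunk_ids):
        """pack item metadata chunk ids into "pointer chunks", return their ids"""
        item_ptrs = []
        for i in range(0, len(chunk_ids), IDS_PER_CHUNK):
            data = msgpack.packb(chunk_ids[i:i + IDS_PER_CHUNK])
            ptr_id = hashlib.sha256(data).digest()  # stands in for key.id_hash
            fake_repo[ptr_id] = data                # stands in for cache.add_chunk / add_reference
            item_ptrs.append(ptr_id)
        return item_ptrs

    def get_items(item_ptrs):
        """resolve the pointer chunks back into the flat list of item chunk ids"""
        items = []
        for ptr_id in item_ptrs:
            items.extend(msgpack.unpackb(fake_repo[ptr_id]))
        return items

    ids = [bytes([n]) * 32 for n in range(10)]  # ten fake 32-byte chunk ids
    assert get_items(put_items(ids)) == ids     # the round trip is lossless

With this indirection the archive object stores only the much shorter item_ptrs list, so the single-object size limit now applies to that list rather than to the full item chunk list.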
@@ -1997,7 +2032,8 @@ class ArchiveChecker: return True, "" i = 0 - for state, items in groupby(archive.items, missing_chunk_detector): + archive_items = archive_get_items(archive, self.key, repository) + for state, items in groupby(archive_items, missing_chunk_detector): items = list(items) if state % 2: for chunk_id in items: @@ -2078,9 +2114,11 @@ class ArchiveChecker: verify_file_chunks(info.name, item) items_buffer.add(item) items_buffer.flush(flush=True) - for previous_item_id in archive.items: + for previous_item_id in archive_get_items(archive, self.key, self.repository): mark_as_possibly_superseded(previous_item_id) - archive.items = items_buffer.chunks + for previous_item_ptr in archive.item_ptrs: + mark_as_possibly_superseded(previous_item_ptr) + archive.item_ptrs = archive_put_items(items_buffer.chunks, key=self.key, add_reference=add_reference) data = msgpack.packb(archive.as_dict()) new_archive_id = self.key.id_hash(data) cdata = self.key.encrypt(new_archive_id, data) diff --git a/src/borg/archiver/debug.py b/src/borg/archiver/debug.py index bc68090f3..2b463cbeb 100644 --- a/src/borg/archiver/debug.py +++ b/src/borg/archiver/debug.py @@ -72,7 +72,11 @@ class DebugMixIn: unpacker = msgpack.Unpacker(use_list=False, object_hook=StableDict) first = True - for item_id in archive_org_dict["items"]: + items = [] + for chunk_id in archive_org_dict["item_ptrs"]: + data = key.decrypt(chunk_id, repository.get(chunk_id)) + items.extend(msgpack.unpackb(data)) + for item_id in items: data = key.decrypt(item_id, repository.get(item_id)) unpacker.feed(data) for item in unpacker: diff --git a/src/borg/archiver/info.py b/src/borg/archiver/info.py index 011c7748a..16205185f 100644 --- a/src/borg/archiver/info.py +++ b/src/borg/archiver/info.py @@ -55,7 +55,6 @@ class InfoMixIn: Time (end): {end} Duration: {duration} Command line: {command_line} - Utilization of maximum supported archive size: {limits[max_archive_size]:.0%} Number of files: {stats[nfiles]} Original size: {stats[original_size]} Deduplicated size: {stats[deduplicated_size]} @@ -88,11 +87,6 @@ class InfoMixIn: = unique chunks of this archive. All archives / deduplicated size = amount of data stored in the repo = all chunks in the repository. - - Borg archives can only contain a limited amount of file metadata. - The size of an archive relative to this limit depends on a number of factors, - mainly the number of files, the lengths of paths and other metadata stored for files. - This is shown as *utilization of maximum supported archive size*. 
""" ) subparser = subparsers.add_parser( diff --git a/src/borg/cache.py b/src/borg/cache.py index a7b2ae22e..8fd6b6ef2 100644 --- a/src/borg/cache.py +++ b/src/borg/cache.py @@ -775,8 +775,16 @@ class LocalCache(CacheStatsMixin): archive = ArchiveItem(internal_dict=msgpack.unpackb(data)) if archive.version not in (1, 2): # legacy raise Exception("Unknown archive metadata version") + if archive.version == 1: + items = archive.items + elif archive.version == 2: + items = [] + for chunk_id, (csize, data) in zip(archive.item_ptrs, decrypted_repository.get_many(archive.item_ptrs)): + chunk_idx.add(chunk_id, 1, len(data)) + ids = msgpack.unpackb(data) + items.extend(ids) sync = CacheSynchronizer(chunk_idx) - for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)): + for item_id, (csize, data) in zip(items, decrypted_repository.get_many(items)): chunk_idx.add(item_id, 1, len(data)) processed_item_metadata_bytes += len(data) processed_item_metadata_chunks += 1 diff --git a/src/borg/constants.py b/src/borg/constants.py index b5a001518..c45ccf465 100644 --- a/src/borg/constants.py +++ b/src/borg/constants.py @@ -11,7 +11,9 @@ REQUIRED_ITEM_KEYS = frozenset(["path", "mtime"]) # this set must be kept complete, otherwise rebuild_manifest might malfunction: # fmt: off -ARCHIVE_KEYS = frozenset(['version', 'name', 'items', 'cmdline', 'hostname', 'username', 'time', 'time_end', +ARCHIVE_KEYS = frozenset(['version', 'name', 'cmdline', 'hostname', 'username', 'time', 'time_end', + 'items', # legacy v1 archives + 'item_ptrs', # v2+ archives 'comment', 'chunker_params', 'recreate_cmdline', 'recreate_source_id', 'recreate_args', 'recreate_partial_chunks', # used in 1.1.0b1 .. b2 @@ -19,7 +21,7 @@ ARCHIVE_KEYS = frozenset(['version', 'name', 'items', 'cmdline', 'hostname', 'us # fmt: on # this is the set of keys that are always present in archives: -REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "items", "cmdline", "time"]) +REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "item_ptrs", "cmdline", "time"]) # default umask, overridden by --umask, defaults to read/write only for owner UMASK_DEFAULT = 0o077 @@ -47,6 +49,9 @@ MAX_DATA_SIZE = 20971479 # borg < 1.3, but this is not expected to cause any issues. MAX_OBJECT_SIZE = MAX_DATA_SIZE + 41 + 8 # see assertion at end of repository module +# how many metadata stream chunk ids do we store into a "pointer chunk" of the ArchiveItem.item_ptrs list? +IDS_PER_CHUNK = 3 # MAX_DATA_SIZE // 40 + # repo config max_segment_size value must be below this limit to stay within uint32 offsets: MAX_SEGMENT_SIZE_LIMIT = 2**32 - MAX_OBJECT_SIZE diff --git a/src/borg/item.pyi b/src/borg/item.pyi index 6e67b464c..e1683a5e3 100644 --- a/src/borg/item.pyi +++ b/src/borg/item.pyi @@ -98,6 +98,10 @@ class ArchiveItem(PropDict): def items(self) -> List: ... @items.setter def items(self, val: List) -> None: ... + @property + def item_ptrs(self) -> List: ... + @items.setter + def item_ptrs(self, val: List) -> None: ... 
class ChunkListEntry(NamedTuple): id: bytes diff --git a/src/borg/item.pyx b/src/borg/item.pyx index 5d38e0fe6..a78c692f0 100644 --- a/src/borg/item.pyx +++ b/src/borg/item.pyx @@ -483,7 +483,8 @@ class ArchiveItem(PropDict): version = PropDict._make_property('version', int) name = PropDict._make_property('name', str, 'surrogate-escaped str') - items = PropDict._make_property('items', list) + items = PropDict._make_property('items', list) # list of chunk ids of item metadata stream (only in memory) + item_ptrs = PropDict._make_property('item_ptrs', list) # list of blocks with list of chunk ids of ims, arch v2 cmdline = PropDict._make_property('cmdline', list) # list of s-e-str hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str') username = PropDict._make_property('username', str, 'surrogate-escaped str') @@ -515,7 +516,9 @@ class ArchiveItem(PropDict): v = fix_tuple_of_str_and_int(v) if k in ('cmdline', 'recreate_cmdline'): v = fix_list_of_str(v) - if k == 'items': + if k == 'items': # legacy + v = fix_list_of_bytes(v) + if k == 'item_ptrs': v = fix_list_of_bytes(v) self._dict[k] = v diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py index 3ebdcaf20..846c9ef8a 100644 --- a/src/borg/testsuite/archiver.py +++ b/src/borg/testsuite/archiver.py @@ -3981,7 +3981,7 @@ class ArchiverCheckTestCase(ArchiverTestCaseBase): archive = msgpack.packb( { "cmdline": [], - "items": [], + "item_ptrs": [], "hostname": "foo", "username": "bar", "name": "archive1",
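As a closing sanity check on the commit message's claim that the limit "is not practically relevant any more", the numbers already present in this patch give a rough upper bound. This is only a back-of-the-envelope sketch: it assumes the production value MAX_DATA_SIZE // 40 hinted at in the constants.py comment (rather than the test value 3) and reuses the ~40 B per chunk-list entry and ~128 kiB per item metadata stream chunk figures from the documentation removed above; the variable names are illustrative, not borg identifiers:

    MAX_DATA_SIZE = 20971479                   # from src/borg/constants.py
    ENTRY_SIZE = 40                            # ~bytes per chunk list entry (removed docs)
    ITEM_CHUNK_SIZE = 128 * 1024               # ~bytes per item metadata stream chunk

    ids_per_ptr = MAX_DATA_SIZE // ENTRY_SIZE       # ids one "pointer chunk" can hold, ~524k
    ptrs_per_archive = MAX_DATA_SIZE // ENTRY_SIZE  # pointer ids the archive object can hold

    max_item_chunks = ids_per_ptr * ptrs_per_archive      # ~2.7e11 metadata stream chunks
    max_stream_bytes = max_item_chunks * ITEM_CHUNK_SIZE  # ~3.6e16 bytes

    print(f"~{max_item_chunks:.1e} item chunks, ~{max_stream_bytes / 2**50:.0f} PiB of metadata stream")

Compared to the previous ~64 GiB metadata stream ceiling per archive, one level of indirection raises the bound by a factor of roughly half a million, which is why the FAQ entry, the limits/max_archive_size field and the "Utilization of maximum supported archive size" output can simply be dropped.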