diff --git a/docs/faq.rst b/docs/faq.rst
index dbcdf9d6a..f767a277e 100644
--- a/docs/faq.rst
+++ b/docs/faq.rst
@@ -115,15 +115,6 @@ Which file types, attributes, etc. are *not* preserved?
 Are there other known limitations?
 ----------------------------------
 
-- A single archive can only reference a limited volume of file/dir metadata,
-  usually corresponding to tens or hundreds of millions of files/dirs.
-  When trying to go beyond that limit, you will get a fatal IntegrityError
-  exception telling that the (archive) object is too big.
-  An easy workaround is to create multiple archives with fewer items each.
-  See also the :ref:`archive_limitation` and :issue:`1452`.
-
-  :ref:`borg_info` shows how large (relative to the maximum size) existing
-  archives are.
 - borg extract only supports restoring into an empty destination. After that,
   the destination will exactly have the contents of the extracted archive.
   If you extract into a non-empty destination, borg will (for example) not
diff --git a/docs/internals/data-structures.rst b/docs/internals/data-structures.rst
index e33ce550d..f337eb18a 100644
--- a/docs/internals/data-structures.rst
+++ b/docs/internals/data-structures.rst
@@ -511,7 +511,8 @@ The archive object itself further contains some metadata:
   When :ref:`borg_check` rebuilds the manifest (e.g. if it was corrupted) and finds
   more than one archive object with the same name, it adds a counter to the name
   in the manifest, but leaves the *name* field of the archives as it was.
-* *items*, a list of chunk IDs containing item metadata (size: count * ~34B)
+* *item_ptrs*, a list of "pointer chunk" IDs.
+  Each "pointer chunk" contains a list of chunk IDs of item metadata.
 * *cmdline*, the command line which was used to create the archive
 * *hostname*
 * *username*
@@ -521,34 +522,6 @@ The archive object itself further contains some metadata:
   This is used by :ref:`borg_recreate` to determine whether a given archive needs rechunking.
 * Some other pieces of information related to recreate.
 
-.. _archive_limitation:
-
-.. rubric:: Note about archive limitations
-
-The archive is currently stored as a single object in the repository
-and thus limited in size to MAX_OBJECT_SIZE (20MiB).
-
-As one chunk list entry is ~40B, that means we can reference ~500.000 item
-metadata stream chunks per archive.
-
-Each item metadata stream chunk is ~128kiB (see hardcoded ITEMS_CHUNKER_PARAMS).
-
-So that means the whole item metadata stream is limited to ~64GiB chunks.
-If compression is used, the amount of storable metadata is bigger - by the
-compression factor.
-
-If the medium size of an item entry is 100B (small size file, no ACLs/xattrs),
-that means a limit of ~640 million files/directories per archive.
-
-If the medium size of an item entry is 2kB (~100MB size files or more
-ACLs/xattrs), the limit will be ~32 million files/directories per archive.
-
-If one tries to create an archive object bigger than MAX_OBJECT_SIZE, a fatal
-IntegrityError will be raised.
-
-A workaround is to create multiple archives with fewer items each, see
-also :issue:`1452`.
-
 .. _item:
 
 Items
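The indirection introduced above is easiest to see in code. Below is a minimal sketch of how a reader resolves *item_ptrs* down to the item metadata chunk IDs; it is not Borg's actual implementation, and the `repository`/`key` objects are stand-ins assumed to behave like their Borg counterparts (`get_many`, `decrypt`).

```python
# Minimal sketch of the two-level item metadata lookup described above.
import msgpack

def resolve_item_chunk_ids(archive_metadata, key, repository):
    """Return the flat list of item metadata chunk IDs for a v2 archive."""
    item_chunk_ids = []
    ptr_ids = archive_metadata["item_ptrs"]
    for ptr_id, cdata in zip(ptr_ids, repository.get_many(ptr_ids)):
        data = key.decrypt(ptr_id, cdata)  # authenticated decryption, as in Borg
        # one pointer chunk is one msgpack'd list of item chunk IDs:
        item_chunk_ids.extend(msgpack.unpackb(data))
    return item_chunk_ids
```
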
diff --git a/docs/internals/frontends.rst b/docs/internals/frontends.rst
index 9eab9c64d..cd9b431c5 100644
--- a/docs/internals/frontends.rst
+++ b/docs/internals/frontends.rst
@@ -331,11 +331,6 @@ stats
         Deduplicated size (against the current repository, not when the archive was created)
     nfiles
         Number of regular files in the archive
-limits
-    Object describing the utilization of Borg limits
-
-    max_archive_size
-        Float between 0 and 1 describing how large this archive is relative to the maximum size allowed by Borg
 command_line
     Array of strings of the command line that created the archive
@@ -405,9 +400,6 @@ The same archive with more information (``borg info --last 1 --json``)::
             "end": "2017-02-27T12:27:20.789123",
             "hostname": "host",
             "id": "80cd07219ad725b3c5f665c1dcf119435c4dee1647a560ecac30f8d40221a46a",
-            "limits": {
-                "max_archive_size": 0.0001330855110409714
-            },
             "name": "host-system-backup-2017-02-27",
             "start": "2017-02-27T12:27:20.789123",
             "stats": {
diff --git a/src/borg/archive.py b/src/borg/archive.py
index 481017f54..020ba5ac9 100644
--- a/src/borg/archive.py
+++ b/src/borg/archive.py
@@ -391,6 +391,38 @@ def get_item_uid_gid(item, *, numeric, uid_forced=None, gid_forced=None, uid_def
     return uid, gid
 
 
+def archive_get_items(metadata, key, repository):
+    if "item_ptrs" in metadata:  # looks like a v2+ archive
+        assert "items" not in metadata
+        items = []
+        for id, data in zip(metadata.item_ptrs, repository.get_many(metadata.item_ptrs)):
+            data = key.decrypt(id, data)
+            ids = msgpack.unpackb(data)
+            items.extend(ids)
+        return items
+
+    if "items" in metadata:  # legacy, v1 archive
+        assert "item_ptrs" not in metadata
+        return metadata.items
+
+
+def archive_put_items(chunk_ids, *, key, cache=None, stats=None, add_reference=None):
+    """gets a (potentially large) list of archive metadata stream chunk ids and writes them to repo objects"""
+    item_ptrs = []
+    for i in range(0, len(chunk_ids), IDS_PER_CHUNK):
+        data = msgpack.packb(chunk_ids[i : i + IDS_PER_CHUNK])
+        id = key.id_hash(data)
+        if cache is not None and stats is not None:
+            cache.add_chunk(id, data, stats)
+        elif add_reference is not None:
+            cdata = key.encrypt(id, data)
+            add_reference(id, len(data), cdata)
+        else:
+            raise NotImplementedError
+        item_ptrs.append(id)
+    return item_ptrs
+
+
 class Archive:
     class DoesNotExist(Error):
         """Archive {} does not exist"""
@@ -479,6 +511,8 @@ class Archive:
         metadata = ArchiveItem(internal_dict=msgpack.unpackb(data))
         if metadata.version not in (1, 2):  # legacy: still need to read v1 archives
             raise Exception("Unknown archive metadata version")
+        # note: metadata.items must not get written to disk!
+        metadata.items = archive_get_items(metadata, self.key, self.repository)
         return metadata
 
     def load(self, id):
@@ -512,10 +546,6 @@ class Archive:
     def duration_from_meta(self):
         return format_timedelta(self.ts_end - self.ts)
 
-    def _archive_csize(self):
-        cdata = self.repository.get(self.id)
-        return len(cdata)
-
     def info(self):
         if self.create:
             stats = self.stats
@@ -532,7 +562,6 @@ class Archive:
             "end": OutputTimestamp(end),
             "duration": (end - start).total_seconds(),
             "stats": stats.as_dict(),
-            "limits": {"max_archive_size": self._archive_csize() / MAX_DATA_SIZE},
         }
         if self.create:
             info["command_line"] = sys.argv
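To see that `archive_put_items()` and the read path in `archive_get_items()` round-trip, here is a rough self-contained check. The key/repository objects are hypothetical stand-ins (no real crypto, a dict as repository), written only to mirror the `add_reference()` branch of the code above.

```python
# Round-trip sketch for the pointer-chunk write/read paths; FakeKey and
# FakeRepo are illustrative stand-ins, not Borg's real test fixtures.
import hashlib
import msgpack

IDS_PER_CHUNK = 3  # matches the (deliberately low) value used in this diff

class FakeKey:
    def id_hash(self, data):
        return hashlib.sha256(data).digest()
    def encrypt(self, id, data):
        return data  # no real crypto in this sketch
    def decrypt(self, id, data):
        return data

class FakeRepo(dict):
    def get_many(self, ids):
        return [self[id] for id in ids]

key, repo = FakeKey(), FakeRepo()

def put_items(chunk_ids):
    # mirrors archive_put_items(), add_reference() branch, simplified
    item_ptrs = []
    for i in range(0, len(chunk_ids), IDS_PER_CHUNK):
        data = msgpack.packb(chunk_ids[i : i + IDS_PER_CHUNK])
        id = key.id_hash(data)
        repo[id] = key.encrypt(id, data)
        item_ptrs.append(id)
    return item_ptrs

chunk_ids = [bytes([n]) * 32 for n in range(10)]  # 10 fake item chunk IDs
ptrs = put_items(chunk_ids)
assert len(ptrs) == 4  # ceil(10 / 3) pointer chunks
resolved = [id for p in ptrs for id in msgpack.unpackb(key.decrypt(p, repo[p]))]
assert resolved == chunk_ids  # read path reassembles the original ID list
```
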
@@ -556,12 +585,10 @@ Archive fingerprint: {0.fpr}
 Time (start): {start}
 Time (end): {end}
 Duration: {0.duration}
-Utilization of max. archive size: {csize_max:.0%}
 """.format(
             self,
             start=OutputTimestamp(self.start.replace(tzinfo=timezone.utc)),
             end=OutputTimestamp(self.end.replace(tzinfo=timezone.utc)),
-            csize_max=self._archive_csize() / MAX_DATA_SIZE,
             location=self.repository._location.canonical_path(),
         )
 
@@ -599,6 +626,7 @@ Utilization of max. archive size: {csize_max:.0%}
         if name in self.manifest.archives:
             raise self.AlreadyExists(name)
         self.items_buffer.flush(flush=True)
+        item_ptrs = archive_put_items(self.items_buffer.chunks, key=self.key, cache=self.cache, stats=self.stats)
         duration = timedelta(seconds=time.monotonic() - self.start_monotonic)
         if timestamp is None:
             end = datetime.utcnow()
@@ -612,7 +640,7 @@ Utilization of max. archive size: {csize_max:.0%}
             "version": 2,
             "name": name,
             "comment": comment or "",
-            "items": self.items_buffer.chunks,
+            "item_ptrs": item_ptrs,  # see #1473
             "cmdline": sys.argv,
             "hostname": hostname,
             "username": getuser(),
@@ -930,6 +958,8 @@ Utilization of max. archive size: {csize_max:.0%}
     def set_meta(self, key, value):
         metadata = self._load_meta(self.id)
         setattr(metadata, key, value)
+        if "items" in metadata:
+            del metadata.items
         data = msgpack.packb(metadata.as_dict())
         new_id = self.key.id_hash(data)
         self.cache.add_chunk(new_id, data, self.stats)
@@ -1004,6 +1034,11 @@ Utilization of max. archive size: {csize_max:.0%}
                 if forced == 0:
                     raise
                 error = True
+
+        # delete the blocks that store all the references that end up being loaded into metadata.items:
+        for id in self.metadata.item_ptrs:
+            chunk_decref(id, stats)
+
         # in forced delete mode, we try hard to delete at least the manifest entry,
         # if possible also the archive superblock, even if processing the items raises
         # some harmless exception.
@@ -1997,7 +2032,8 @@ class ArchiveChecker:
                 return True, ""
 
             i = 0
-            for state, items in groupby(archive.items, missing_chunk_detector):
+            archive_items = archive_get_items(archive, self.key, self.repository)
+            for state, items in groupby(archive_items, missing_chunk_detector):
                 items = list(items)
                 if state % 2:
                     for chunk_id in items:
@@ -2078,9 +2114,11 @@ class ArchiveChecker:
                     verify_file_chunks(info.name, item)
                 items_buffer.add(item)
             items_buffer.flush(flush=True)
-            for previous_item_id in archive.items:
+            for previous_item_id in archive_get_items(archive, self.key, self.repository):
                 mark_as_possibly_superseded(previous_item_id)
-            archive.items = items_buffer.chunks
+            for previous_item_ptr in archive.item_ptrs:
+                mark_as_possibly_superseded(previous_item_ptr)
+            archive.item_ptrs = archive_put_items(items_buffer.chunks, key=self.key, add_reference=add_reference)
             data = msgpack.packb(archive.as_dict())
             new_archive_id = self.key.id_hash(data)
             cdata = self.key.encrypt(new_archive_id, data)
diff --git a/src/borg/archiver/debug.py b/src/borg/archiver/debug.py
index bc68090f3..2b463cbeb 100644
--- a/src/borg/archiver/debug.py
+++ b/src/borg/archiver/debug.py
@@ -72,7 +72,11 @@ class DebugMixIn:
             unpacker = msgpack.Unpacker(use_list=False, object_hook=StableDict)
             first = True
-            for item_id in archive_org_dict["items"]:
+            items = []
+            for chunk_id in archive_org_dict["item_ptrs"]:
+                data = key.decrypt(chunk_id, repository.get(chunk_id))
+                items.extend(msgpack.unpackb(data))
+            for item_id in items:
                 data = key.decrypt(item_id, repository.get(item_id))
                 unpacker.feed(data)
                 for item in unpacker:
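The `set_meta()` change above guards an invariant worth spelling out: `_load_meta()` materializes `items` in memory only, so it must be stripped again before the ArchiveItem is serialized, or a huge redundant chunk ID list would be persisted next to `item_ptrs`. A simplified sketch with a plain dict (hypothetical, not the real ArchiveItem API):

```python
# Why set_meta() drops "items" before packing, sketched with a plain dict.
import msgpack

def save_metadata(metadata: dict, key, cache, stats):
    metadata = dict(metadata)
    # "items" is only materialized in memory by _load_meta(); writing it
    # back would persist the full chunk ID list alongside "item_ptrs".
    metadata.pop("items", None)
    data = msgpack.packb(metadata)
    new_id = key.id_hash(data)
    cache.add_chunk(new_id, data, stats)  # assumed Borg-like cache interface
    return new_id
```
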
diff --git a/src/borg/archiver/info.py b/src/borg/archiver/info.py
index 011c7748a..16205185f 100644
--- a/src/borg/archiver/info.py
+++ b/src/borg/archiver/info.py
@@ -55,7 +55,6 @@ class InfoMixIn:
                 Time (end): {end}
                 Duration: {duration}
                 Command line: {command_line}
-                Utilization of maximum supported archive size: {limits[max_archive_size]:.0%}
                 Number of files: {stats[nfiles]}
                 Original size: {stats[original_size]}
                 Deduplicated size: {stats[deduplicated_size]}
@@ -88,11 +87,6 @@ class InfoMixIn:
                           = unique chunks of this archive.
         All archives / deduplicated size = amount of data stored in the repo
                                          = all chunks in the repository.
-
-        Borg archives can only contain a limited amount of file metadata.
-        The size of an archive relative to this limit depends on a number of factors,
-        mainly the number of files, the lengths of paths and other metadata stored for files.
-        This is shown as *utilization of maximum supported archive size*.
         """
         )
         subparser = subparsers.add_parser(
diff --git a/src/borg/cache.py b/src/borg/cache.py
index a7b2ae22e..8fd6b6ef2 100644
--- a/src/borg/cache.py
+++ b/src/borg/cache.py
@@ -775,8 +775,16 @@ class LocalCache(CacheStatsMixin):
             archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
             if archive.version not in (1, 2):  # legacy
                 raise Exception("Unknown archive metadata version")
+            if archive.version == 1:
+                items = archive.items
+            elif archive.version == 2:
+                items = []
+                for chunk_id, (csize, data) in zip(archive.item_ptrs, decrypted_repository.get_many(archive.item_ptrs)):
+                    chunk_idx.add(chunk_id, 1, len(data))
+                    ids = msgpack.unpackb(data)
+                    items.extend(ids)
             sync = CacheSynchronizer(chunk_idx)
-            for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)):
+            for item_id, (csize, data) in zip(items, decrypted_repository.get_many(items)):
                 chunk_idx.add(item_id, 1, len(data))
                 processed_item_metadata_bytes += len(data)
                 processed_item_metadata_chunks += 1
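The cache sync change dispatches on the archive version, since v1 archives carry their item chunk IDs inline while v2 archives need the pointer chunks dereferenced (and refcounted) first. Sketched standalone, assuming a `decrypted_repository.get_many` that yields `(csize, data)` pairs as in the diff:

```python
# Version dispatch used during cache sync, sketched as a free function.
import msgpack

def item_chunk_ids_for(archive, decrypted_repository, chunk_idx):
    if archive.version == 1:
        return archive.items  # v1: chunk IDs are inline in the archive object
    # v2: dereference pointer chunks first, and count them in the chunk
    # index so that check/delete refcounting stays consistent.
    items = []
    for ptr_id, (_csize, data) in zip(archive.item_ptrs, decrypted_repository.get_many(archive.item_ptrs)):
        chunk_idx.add(ptr_id, 1, len(data))
        items.extend(msgpack.unpackb(data))
    return items
```
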
diff --git a/src/borg/constants.py b/src/borg/constants.py
index b5a001518..c45ccf465 100644
--- a/src/borg/constants.py
+++ b/src/borg/constants.py
@@ -11,7 +11,9 @@ REQUIRED_ITEM_KEYS = frozenset(["path", "mtime"])
 
 # this set must be kept complete, otherwise rebuild_manifest might malfunction:
 # fmt: off
-ARCHIVE_KEYS = frozenset(['version', 'name', 'items', 'cmdline', 'hostname', 'username', 'time', 'time_end',
+ARCHIVE_KEYS = frozenset(['version', 'name', 'cmdline', 'hostname', 'username', 'time', 'time_end',
+                          'items',  # legacy v1 archives
+                          'item_ptrs',  # v2+ archives
                           'comment', 'chunker_params',
                           'recreate_cmdline',
                           'recreate_source_id', 'recreate_args', 'recreate_partial_chunks',  # used in 1.1.0b1 .. b2
@@ -19,7 +21,7 @@ ARCHIVE_KEYS = frozenset(['version', 'name', 'items', 'cmdline', 'hostname', 'us
 # fmt: on
 
 # this is the set of keys that are always present in archives:
-REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "items", "cmdline", "time"])
+REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "item_ptrs", "cmdline", "time"])
 
 # default umask, overridden by --umask, defaults to read/write only for owner
 UMASK_DEFAULT = 0o077
@@ -47,6 +49,9 @@ MAX_DATA_SIZE = 20971479
 # borg < 1.3, but this is not expected to cause any issues.
 MAX_OBJECT_SIZE = MAX_DATA_SIZE + 41 + 8  # see assertion at end of repository module
 
+# how many metadata stream chunk ids do we store into a "pointer chunk" of the ArchiveItem.item_ptrs list?
+IDS_PER_CHUNK = 3  # MAX_DATA_SIZE // 40
+
 # repo config max_segment_size value must be below this limit to stay within uint32 offsets:
 MAX_SEGMENT_SIZE_LIMIT = 2**32 - MAX_OBJECT_SIZE
diff --git a/src/borg/item.pyi b/src/borg/item.pyi
index 6e67b464c..e1683a5e3 100644
--- a/src/borg/item.pyi
+++ b/src/borg/item.pyi
@@ -98,6 +98,10 @@ class ArchiveItem(PropDict):
     def items(self) -> List: ...
     @items.setter
     def items(self, val: List) -> None: ...
+    @property
+    def item_ptrs(self) -> List: ...
+    @item_ptrs.setter
+    def item_ptrs(self, val: List) -> None: ...
 
 class ChunkListEntry(NamedTuple):
     id: bytes
diff --git a/src/borg/item.pyx b/src/borg/item.pyx
index 5d38e0fe6..a78c692f0 100644
--- a/src/borg/item.pyx
+++ b/src/borg/item.pyx
@@ -483,7 +483,8 @@ class ArchiveItem(PropDict):
 
     version = PropDict._make_property('version', int)
     name = PropDict._make_property('name', str, 'surrogate-escaped str')
-    items = PropDict._make_property('items', list)
+    items = PropDict._make_property('items', list)  # list of chunk ids of item metadata stream (only in memory)
+    item_ptrs = PropDict._make_property('item_ptrs', list)  # list of pointer chunks, each with a list of chunk ids of the item metadata stream (archive v2+)
     cmdline = PropDict._make_property('cmdline', list)  # list of s-e-str
     hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str')
     username = PropDict._make_property('username', str, 'surrogate-escaped str')
@@ -515,7 +516,9 @@ class ArchiveItem(PropDict):
             v = fix_tuple_of_str_and_int(v)
         if k in ('cmdline', 'recreate_cmdline'):
             v = fix_list_of_str(v)
-        if k == 'items':
+        if k == 'items':  # legacy
+            v = fix_list_of_bytes(v)
+        if k == 'item_ptrs':
             v = fix_list_of_bytes(v)
         self._dict[k] = v
diff --git a/src/borg/testsuite/archiver.py b/src/borg/testsuite/archiver.py
index 3ebdcaf20..846c9ef8a 100644
--- a/src/borg/testsuite/archiver.py
+++ b/src/borg/testsuite/archiver.py
@@ -3981,7 +3981,7 @@ class ArchiverCheckTestCase(ArchiverTestCaseBase):
         archive = msgpack.packb(
             {
                 "cmdline": [],
-                "items": [],
+                "item_ptrs": [],
                 "hostname": "foo",
                 "username": "bar",
                 "name": "archive1",
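For scale: back-of-the-envelope arithmetic (reusing the ~40 B per chunk ID estimate from the docs removed above, and assuming a production `IDS_PER_CHUNK` near `MAX_DATA_SIZE // 40` rather than the low value used in this diff) shows why one level of pointer chunks effectively removes the old archive size limit:

```python
# Rough capacity estimate under the ~40 B per msgpack'd chunk ID assumption.
MAX_DATA_SIZE = 20971479                 # ~20 MiB, from constants.py
IDS_PER_PTR_CHUNK = MAX_DATA_SIZE // 40  # ~524k item chunk IDs per full pointer chunk
PTRS_PER_ARCHIVE = MAX_DATA_SIZE // 40   # ~524k pointer IDs fit in the archive object itself

# old scheme: one level, ~524k item metadata chunks per archive
# new scheme: two levels, ~2.7e11 item metadata chunks per archive
print(IDS_PER_PTR_CHUNK * PTRS_PER_ARCHIVE)  # ~274_875_810_000
```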