
Merge pull request #6941 from ThomasWaldmann/archive-items-indirect

massively increase archive metadata stream size limit, fixes #1473
TW 2022-08-06 22:46:23 +02:00 committed by GitHub
commit 76ef20105f
11 changed files with 82 additions and 70 deletions

View file

@@ -115,15 +115,6 @@ Which file types, attributes, etc. are *not* preserved?
Are there other known limitations?
----------------------------------
- A single archive can only reference a limited volume of file/dir metadata,
usually corresponding to tens or hundreds of millions of files/dirs.
When trying to go beyond that limit, you will get a fatal IntegrityError
exception telling you that the (archive) object is too big.
An easy workaround is to create multiple archives with fewer items each.
See also the :ref:`archive_limitation` and :issue:`1452`.
:ref:`borg_info` shows how large (relative to the maximum size) existing
archives are.
- borg extract only supports restoring into an empty destination. After that,
the destination will have exactly the contents of the extracted archive.
If you extract into a non-empty destination, borg will (for example) not

View file

@@ -511,7 +511,8 @@ The archive object itself further contains some metadata:
When :ref:`borg_check` rebuilds the manifest (e.g. if it was corrupted) and finds
more than one archive object with the same name, it adds a counter to the name
in the manifest, but leaves the *name* field of the archives as it was.
* *items*, a list of chunk IDs containing item metadata (size: count * ~34B)
* *item_ptrs*, a list of "pointer chunk" IDs.
Each "pointer chunk" contains a list of chunk IDs of item metadata.
* *cmdline*, the command line which was used to create the archive
* *hostname*
* *username*
@@ -521,34 +522,6 @@ The archive object itself further contains some metadata:
This is used by :ref:`borg_recreate` to determine whether a given archive needs rechunking.
* Some other pieces of information related to recreate.
.. _archive_limitation:
.. rubric:: Note about archive limitations
The archive is currently stored as a single object in the repository
and thus limited in size to MAX_OBJECT_SIZE (20MiB).
As one chunk list entry is ~40B, that means we can reference ~500,000 item
metadata stream chunks per archive.
Each item metadata stream chunk is ~128kiB (see hardcoded ITEMS_CHUNKER_PARAMS).
So the whole item metadata stream is limited to ~500,000 * 128kiB = ~64GiB.
If compression is used, the amount of storable metadata is bigger, roughly by
the compression factor.
If the average size of an item entry is 100B (small files, no ACLs/xattrs),
that means a limit of ~640 million files/directories per archive.
If the average size of an item entry is 2kB (~100MB files or files with many
ACLs/xattrs), the limit will be ~32 million files/directories per archive.
If one tries to create an archive object bigger than MAX_OBJECT_SIZE, a fatal
IntegrityError will be raised.
A workaround is to create multiple archives with fewer items each, see
also :issue:`1452`.
.. _item:
Items
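
The removed note's arithmetic can be redone for the new two-level layout to see why the commit title says "massively". The following is a rough, back-of-the-envelope sketch, not borg code; the pointer-chunk capacity it assumes is the ``MAX_DATA_SIZE // 40`` figure mentioned in the ``IDS_PER_CHUNK`` comment of the constants.py hunk further down, not the literal value committed there::

    # Back-of-the-envelope comparison of the old flat "items" list vs. the new
    # "item_ptrs" indirection, using the figures from the note above.
    MAX_OBJECT_SIZE = 20 * 1024**2       # ~20 MiB archive object limit
    ENTRY_SIZE = 40                      # ~40 B per chunk list entry
    META_CHUNK_SIZE = 128 * 1024         # ~128 kiB per item metadata stream chunk
    ITEM_SIZE = 100                      # ~100 B per small item entry

    # old: the archive object itself holds every metadata stream chunk ID
    old_chunks = MAX_OBJECT_SIZE // ENTRY_SIZE              # ~500,000 chunk IDs
    old_items = old_chunks * META_CHUNK_SIZE // ITEM_SIZE   # hundreds of millions of items

    # new: the archive object holds pointer chunk IDs; each pointer chunk is itself
    # a repo object holding up to IDS_PER_CHUNK metadata stream chunk IDs
    # (assumed here to be roughly MAX_DATA_SIZE // 40, per the constants.py comment)
    IDS_PER_CHUNK = MAX_OBJECT_SIZE // ENTRY_SIZE
    new_chunks = old_chunks * IDS_PER_CHUNK
    new_items = new_chunks * META_CHUNK_SIZE // ITEM_SIZE

    print(f"old: ~{old_items:,} items; new: ~{new_items:,} items")

With the indirection the limit becomes so large that it is no longer practically relevant, which is why the FAQ entry removed in the first hunk could go.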

View file

@@ -331,11 +331,6 @@ stats
Deduplicated size (against the current repository, not when the archive was created)
nfiles
Number of regular files in the archive
limits
Object describing the utilization of Borg limits
max_archive_size
Float between 0 and 1 describing how large this archive is relative to the maximum size allowed by Borg
command_line
Array of strings of the command line that created the archive
@@ -405,9 +400,6 @@ The same archive with more information (``borg info --last 1 --json``)::
"end": "2017-02-27T12:27:20.789123",
"hostname": "host",
"id": "80cd07219ad725b3c5f665c1dcf119435c4dee1647a560ecac30f8d40221a46a",
"limits": {
"max_archive_size": 0.0001330855110409714
},
"name": "host-system-backup-2017-02-27",
"start": "2017-02-27T12:27:20.789123",
"stats": {

View file

@@ -391,6 +391,38 @@ def get_item_uid_gid(item, *, numeric, uid_forced=None, gid_forced=None, uid_def
return uid, gid
def archive_get_items(metadata, key, repository):
if "item_ptrs" in metadata: # looks like a v2+ archive
assert "items" not in metadata
items = []
for id, data in zip(metadata.item_ptrs, repository.get_many(metadata.item_ptrs)):
data = key.decrypt(id, data)
ids = msgpack.unpackb(data)
items.extend(ids)
return items
if "items" in metadata: # legacy, v1 archive
assert "item_ptrs" not in metadata
return metadata.items
def archive_put_items(chunk_ids, *, key, cache=None, stats=None, add_reference=None):
"""gets a (potentially large) list of archive metadata stream chunk ids and writes them to repo objects"""
item_ptrs = []
for i in range(0, len(chunk_ids), IDS_PER_CHUNK):
data = msgpack.packb(chunk_ids[i : i + IDS_PER_CHUNK])
id = key.id_hash(data)
if cache is not None and stats is not None:
cache.add_chunk(id, data, stats)
elif add_reference is not None:
cdata = key.encrypt(id, data)
add_reference(id, len(data), cdata)
else:
raise NotImplementedError
item_ptrs.append(id)
return item_ptrs
class Archive:
class DoesNotExist(Error):
"""Archive {} does not exist"""
@@ -479,6 +511,8 @@ def _load_meta(self, id):
metadata = ArchiveItem(internal_dict=msgpack.unpackb(data))
if metadata.version not in (1, 2): # legacy: still need to read v1 archives
raise Exception("Unknown archive metadata version")
# note: metadata.items must not get written to disk!
metadata.items = archive_get_items(metadata, self.key, self.repository)
return metadata
def load(self, id):
@@ -512,10 +546,6 @@ def duration(self):
def duration_from_meta(self):
return format_timedelta(self.ts_end - self.ts)
def _archive_csize(self):
cdata = self.repository.get(self.id)
return len(cdata)
def info(self):
if self.create:
stats = self.stats
@@ -532,7 +562,6 @@ def info(self):
"end": OutputTimestamp(end),
"duration": (end - start).total_seconds(),
"stats": stats.as_dict(),
"limits": {"max_archive_size": self._archive_csize() / MAX_DATA_SIZE},
}
if self.create:
info["command_line"] = sys.argv
@@ -556,12 +585,10 @@ def __str__(self):
Time (start): {start}
Time (end): {end}
Duration: {0.duration}
Utilization of max. archive size: {csize_max:.0%}
""".format(
self,
start=OutputTimestamp(self.start.replace(tzinfo=timezone.utc)),
end=OutputTimestamp(self.end.replace(tzinfo=timezone.utc)),
csize_max=self._archive_csize() / MAX_DATA_SIZE,
location=self.repository._location.canonical_path(),
)
@@ -599,6 +626,7 @@ def save(self, name=None, comment=None, timestamp=None, stats=None, additional_m
if name in self.manifest.archives:
raise self.AlreadyExists(name)
self.items_buffer.flush(flush=True)
item_ptrs = archive_put_items(self.items_buffer.chunks, key=self.key, cache=self.cache, stats=self.stats)
duration = timedelta(seconds=time.monotonic() - self.start_monotonic)
if timestamp is None:
end = datetime.utcnow()
@@ -612,7 +640,7 @@ def save(self, name=None, comment=None, timestamp=None, stats=None, additional_m
"version": 2,
"name": name,
"comment": comment or "",
"items": self.items_buffer.chunks,
"item_ptrs": item_ptrs, # see #1473
"cmdline": sys.argv,
"hostname": hostname,
"username": getuser(),
@@ -930,6 +958,8 @@ def restore_attrs(self, path, item, symlink=False, fd=None):
def set_meta(self, key, value):
metadata = self._load_meta(self.id)
setattr(metadata, key, value)
if "items" in metadata:
del metadata.items
data = msgpack.packb(metadata.as_dict())
new_id = self.key.id_hash(data)
self.cache.add_chunk(new_id, data, self.stats)
@@ -1004,6 +1034,11 @@ def chunk_decref(id, stats, part=False):
if forced == 0:
raise
error = True
# delete the blocks that store all the references that end up being loaded into metadata.items:
for id in self.metadata.item_ptrs:
chunk_decref(id, stats)
# in forced delete mode, we try hard to delete at least the manifest entry,
# if possible also the archive superblock, even if processing the items raises
# some harmless exception.
@@ -1997,7 +2032,8 @@ def valid_item(obj):
return True, ""
i = 0
for state, items in groupby(archive.items, missing_chunk_detector):
archive_items = archive_get_items(archive, self.key, repository)
for state, items in groupby(archive_items, missing_chunk_detector):
items = list(items)
if state % 2:
for chunk_id in items:
@@ -2078,9 +2114,11 @@ def valid_item(obj):
verify_file_chunks(info.name, item)
items_buffer.add(item)
items_buffer.flush(flush=True)
for previous_item_id in archive.items:
for previous_item_id in archive_get_items(archive, self.key, self.repository):
mark_as_possibly_superseded(previous_item_id)
archive.items = items_buffer.chunks
for previous_item_ptr in archive.item_ptrs:
mark_as_possibly_superseded(previous_item_ptr)
archive.item_ptrs = archive_put_items(items_buffer.chunks, key=self.key, add_reference=add_reference)
data = msgpack.packb(archive.as_dict())
new_archive_id = self.key.id_hash(data)
cdata = self.key.encrypt(new_archive_id, data)
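
Read together, ``archive_put_items`` and ``archive_get_items`` form a round trip: metadata stream chunk IDs in, pointer chunks out, the same IDs back. A self-contained sketch with throwaway stand-ins for borg's key and repository objects (``FakeKey``, ``FakeRepository`` and the reversed-bytes "encryption" are illustrative only, not borg's interfaces)::

    # Round trip of the pointer-chunk scheme used by archive_put_items()/archive_get_items().
    import hashlib
    import msgpack

    IDS_PER_CHUNK = 2  # tiny batch size, just for the sketch

    class FakeKey:
        def id_hash(self, data):
            return hashlib.sha256(data).digest()
        def encrypt(self, id, data):
            return data[::-1]   # stand-in "encryption": just reverse the bytes
        def decrypt(self, id, data):
            return data[::-1]

    class FakeRepository(dict):
        def get_many(self, ids):
            return [self[id] for id in ids]

    key, repo = FakeKey(), FakeRepository()

    # write path (cf. archive_put_items): batch the metadata stream chunk IDs
    # into msgpack'd, "encrypted" pointer chunks stored as repo objects
    item_chunk_ids = [bytes([n]) * 32 for n in range(5)]
    item_ptrs = []
    for i in range(0, len(item_chunk_ids), IDS_PER_CHUNK):
        data = msgpack.packb(item_chunk_ids[i : i + IDS_PER_CHUNK])
        ptr_id = key.id_hash(data)
        repo[ptr_id] = key.encrypt(ptr_id, data)
        item_ptrs.append(ptr_id)

    # read path (cf. archive_get_items): resolve pointer chunks back to the flat list
    resolved = []
    for ptr_id, cdata in zip(item_ptrs, repo.get_many(item_ptrs)):
        resolved.extend(msgpack.unpackb(key.decrypt(ptr_id, cdata)))
    assert resolved == item_chunk_ids

In the hunks above the write side is split between the ``cache.add_chunk`` path used by ``Archive.save`` and the ``add_reference`` path used during check/recreate; the sketch collapses both into a plain dict store.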

View file

@@ -72,7 +72,11 @@ def output(fd):
unpacker = msgpack.Unpacker(use_list=False, object_hook=StableDict)
first = True
for item_id in archive_org_dict["items"]:
items = []
for chunk_id in archive_org_dict["item_ptrs"]:
data = key.decrypt(chunk_id, repository.get(chunk_id))
items.extend(msgpack.unpackb(data))
for item_id in items:
data = key.decrypt(item_id, repository.get(item_id))
unpacker.feed(data)
for item in unpacker:
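
The dump code above leans on ``msgpack.Unpacker`` being a streaming decoder: decrypted chunks are fed in as they arrive and complete items fall out, regardless of where the chunk boundaries lie. A minimal illustration with made-up items::

    import msgpack

    stream = msgpack.packb({"path": "a"}) + msgpack.packb({"path": "b"})
    unpacker = msgpack.Unpacker(use_list=False)
    for piece in (stream[:5], stream[5:]):   # deliberately split mid-object
        unpacker.feed(piece)
        for item in unpacker:                # yields only fully decoded objects
            print(item)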

View file

@@ -55,7 +55,6 @@ def format_cmdline(cmdline):
Time (end): {end}
Duration: {duration}
Command line: {command_line}
Utilization of maximum supported archive size: {limits[max_archive_size]:.0%}
Number of files: {stats[nfiles]}
Original size: {stats[original_size]}
Deduplicated size: {stats[deduplicated_size]}
@@ -88,11 +87,6 @@ def build_parser_info(self, subparsers, common_parser, mid_common_parser):
= unique chunks of this archive.
All archives / deduplicated size = amount of data stored in the repo
= all chunks in the repository.
Borg archives can only contain a limited amount of file metadata.
The size of an archive relative to this limit depends on a number of factors,
mainly the number of files, the lengths of paths and other metadata stored for files.
This is shown as *utilization of maximum supported archive size*.
"""
)
subparser = subparsers.add_parser(

View file

@@ -775,8 +775,16 @@ def fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx):
archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
if archive.version not in (1, 2): # legacy
raise Exception("Unknown archive metadata version")
if archive.version == 1:
items = archive.items
elif archive.version == 2:
items = []
for chunk_id, (csize, data) in zip(archive.item_ptrs, decrypted_repository.get_many(archive.item_ptrs)):
chunk_idx.add(chunk_id, 1, len(data))
ids = msgpack.unpackb(data)
items.extend(ids)
sync = CacheSynchronizer(chunk_idx)
for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)):
for item_id, (csize, data) in zip(items, decrypted_repository.get_many(items)):
chunk_idx.add(item_id, 1, len(data))
processed_item_metadata_bytes += len(data)
processed_item_metadata_chunks += 1

View file

@@ -11,7 +11,9 @@
# this set must be kept complete, otherwise rebuild_manifest might malfunction:
# fmt: off
ARCHIVE_KEYS = frozenset(['version', 'name', 'items', 'cmdline', 'hostname', 'username', 'time', 'time_end',
ARCHIVE_KEYS = frozenset(['version', 'name', 'cmdline', 'hostname', 'username', 'time', 'time_end',
'items', # legacy v1 archives
'item_ptrs', # v2+ archives
'comment', 'chunker_params',
'recreate_cmdline',
'recreate_source_id', 'recreate_args', 'recreate_partial_chunks', # used in 1.1.0b1 .. b2
@@ -19,7 +21,7 @@
# fmt: on
# this is the set of keys that are always present in archives:
REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "items", "cmdline", "time"])
REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "item_ptrs", "cmdline", "time"])
# default umask, overridden by --umask, defaults to read/write only for owner
UMASK_DEFAULT = 0o077
@@ -47,6 +49,9 @@
# borg < 1.3, but this is not expected to cause any issues.
MAX_OBJECT_SIZE = MAX_DATA_SIZE + 41 + 8 # see assertion at end of repository module
# how many metadata stream chunk ids do we store into a "pointer chunk" of the ArchiveItem.item_ptrs list?
IDS_PER_CHUNK = 3 # MAX_DATA_SIZE // 40
# repo config max_segment_size value must be below this limit to stay within uint32 offsets:
MAX_SEGMENT_SIZE_LIMIT = 2**32 - MAX_OBJECT_SIZE
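
The ``~34B`` per chunk ID noted in the docs hunk above and the ``// 40`` in the ``IDS_PER_CHUNK`` comment fit together: a 32-byte ID serializes to 34 bytes of msgpack (a 2-byte bin header plus the ID itself), so budgeting 40 bytes per entry leaves headroom. A quick check::

    import msgpack

    ids = [b"\x00" * 32] * 1000
    per_entry = len(msgpack.packb(ids)) / len(ids)
    print(per_entry)   # ~34.0, plus a negligible share of the list header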

View file

@@ -98,6 +98,10 @@ class ArchiveItem(PropDict):
def items(self) -> List: ...
@items.setter
def items(self, val: List) -> None: ...
@property
def item_ptrs(self) -> List: ...
@item_ptrs.setter
def item_ptrs(self, val: List) -> None: ...
class ChunkListEntry(NamedTuple):
id: bytes

View file

@@ -483,7 +483,8 @@ class ArchiveItem(PropDict):
version = PropDict._make_property('version', int)
name = PropDict._make_property('name', str, 'surrogate-escaped str')
items = PropDict._make_property('items', list)
items = PropDict._make_property('items', list) # list of chunk ids of item metadata stream (only in memory)
item_ptrs = PropDict._make_property('item_ptrs', list) # list of pointer chunks, each with a list of chunk ids of the item metadata stream, archive v2+
cmdline = PropDict._make_property('cmdline', list) # list of s-e-str
hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str')
username = PropDict._make_property('username', str, 'surrogate-escaped str')
@@ -515,7 +516,9 @@ class ArchiveItem(PropDict):
v = fix_tuple_of_str_and_int(v)
if k in ('cmdline', 'recreate_cmdline'):
v = fix_list_of_str(v)
if k == 'items':
if k == 'items': # legacy
v = fix_list_of_bytes(v)
if k == 'item_ptrs':
v = fix_list_of_bytes(v)
self._dict[k] = v
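
For readers unfamiliar with ``PropDict``, the pattern above amounts to properties that proxy a backing ``_dict`` with type checks. The class below is an illustrative stand-in, not borg's actual implementation; it only exists to show why the in-memory ``items`` attribute and the ``del metadata.items`` in ``set_meta`` behave the way they do::

    # Illustrative stand-in for the PropDict property pattern (not borg's real code).
    class MiniPropDict:
        def __init__(self, **kw):
            self._dict = dict(kw)

        @staticmethod
        def _make_property(key, value_type):
            def _get(self):
                return self._dict[key]
            def _set(self, value):
                if not isinstance(value, value_type):
                    raise TypeError(f"{key} must be a {value_type.__name__}")
                self._dict[key] = value
            def _del(self):
                del self._dict[key]
            return property(_get, _set, _del)

        def __contains__(self, key):
            return key in self._dict

    class MiniArchiveItem(MiniPropDict):
        items = MiniPropDict._make_property("items", list)          # in-memory only
        item_ptrs = MiniPropDict._make_property("item_ptrs", list)  # persisted, archive v2+

    a = MiniArchiveItem(item_ptrs=[b"\x00" * 32])
    a.items = [b"\x01" * 32]    # what _load_meta() attaches after resolving item_ptrs
    assert "items" in a         # the check set_meta() performs
    del a.items                 # ...and the cleanup it does before re-serializing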

View file

@@ -3981,7 +3981,7 @@ def test_manifest_rebuild_duplicate_archive(self):
archive = msgpack.packb(
{
"cmdline": [],
"items": [],
"item_ptrs": [],
"hostname": "foo",
"username": "bar",
"name": "archive1",