Merge pull request #6941 from ThomasWaldmann/archive-items-indirect
massively increase archive metadata stream size limit, fixes #1473
commit 76ef20105f

11 changed files with 82 additions and 70 deletions

@@ -115,15 +115,6 @@ Which file types, attributes, etc. are *not* preserved?
 Are there other known limitations?
 ----------------------------------
 
-- A single archive can only reference a limited volume of file/dir metadata,
-  usually corresponding to tens or hundreds of millions of files/dirs.
-  When trying to go beyond that limit, you will get a fatal IntegrityError
-  exception telling that the (archive) object is too big.
-  An easy workaround is to create multiple archives with fewer items each.
-  See also the :ref:`archive_limitation` and :issue:`1452`.
-
-  :ref:`borg_info` shows how large (relative to the maximum size) existing
-  archives are.
 - borg extract only supports restoring into an empty destination. After that,
   the destination will exactly have the contents of the extracted archive.
   If you extract into a non-empty destination, borg will (for example) not

@@ -511,7 +511,8 @@ The archive object itself further contains some metadata:
   When :ref:`borg_check` rebuilds the manifest (e.g. if it was corrupted) and finds
   more than one archive object with the same name, it adds a counter to the name
   in the manifest, but leaves the *name* field of the archives as it was.
-* *items*, a list of chunk IDs containing item metadata (size: count * ~34B)
+* *item_ptrs*, a list of "pointer chunk" IDs.
+  Each "pointer chunk" contains a list of chunk IDs of item metadata.
 * *cmdline*, the command line which was used to create the archive
 * *hostname*
 * *username*

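To make the new indirection concrete, here is a minimal illustration of the v2 layout; the ids and values are invented for this sketch, only the structure follows the bullet above::

    # v2 archive object: holds only the short list of "pointer chunk" ids
    archive_metadata = {
        "version": 2,
        "item_ptrs": [b"ptr-0", b"ptr-1"],
    }
    # each pointer chunk is a separate repository object holding chunk ids of item metadata
    pointer_chunks = {
        b"ptr-0": [b"item-md-0", b"item-md-1", b"item-md-2"],
        b"ptr-1": [b"item-md-3", b"item-md-4"],
    }
    # flattening the pointer chunks yields the same flat list a v1 archive kept inline in *items*
    items = [cid for ptr in archive_metadata["item_ptrs"] for cid in pointer_chunks[ptr]]
    assert items == [b"item-md-0", b"item-md-1", b"item-md-2", b"item-md-3", b"item-md-4"]
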
@@ -521,34 +522,6 @@ The archive object itself further contains some metadata:
   This is used by :ref:`borg_recreate` to determine whether a given archive needs rechunking.
 * Some other pieces of information related to recreate.
 
-.. _archive_limitation:
-
-.. rubric:: Note about archive limitations
-
-The archive is currently stored as a single object in the repository
-and thus limited in size to MAX_OBJECT_SIZE (20MiB).
-
-As one chunk list entry is ~40B, that means we can reference ~500.000 item
-metadata stream chunks per archive.
-
-Each item metadata stream chunk is ~128kiB (see hardcoded ITEMS_CHUNKER_PARAMS).
-
-So that means the whole item metadata stream is limited to ~64GiB chunks.
-If compression is used, the amount of storable metadata is bigger - by the
-compression factor.
-
-If the medium size of an item entry is 100B (small size file, no ACLs/xattrs),
-that means a limit of ~640 million files/directories per archive.
-
-If the medium size of an item entry is 2kB (~100MB size files or more
-ACLs/xattrs), the limit will be ~32 million files/directories per archive.
-
-If one tries to create an archive object bigger than MAX_OBJECT_SIZE, a fatal
-IntegrityError will be raised.
-
-A workaround is to create multiple archives with fewer items each, see
-also :issue:`1452`.
-
 .. _item:
 
 Items

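For reference, the removed numbers above follow from a simple back-of-the-envelope calculation; the approximate constants are taken from the removed text itself::

    MAX_OBJECT_SIZE = 20 * 1024**2      # ~20 MiB archive object, as stated above
    CHUNK_LIST_ENTRY = 40               # ~40 B per referenced chunk id
    ITEM_CHUNK_SIZE = 128 * 1024        # ~128 kiB per item metadata stream chunk

    max_item_chunks = MAX_OBJECT_SIZE // CHUNK_LIST_ENTRY   # ~524_000, the "~500.000" above
    max_stream_bytes = max_item_chunks * ITEM_CHUNK_SIZE    # ~64 GiB of item metadata
    print(max_stream_bytes // 100)      # ~687 million items of ~100 B (the quoted ~640 million)
    print(max_stream_bytes // 2048)     # ~33 million items of ~2 kB (the quoted ~32 million)
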
@@ -331,11 +331,6 @@ stats
         Deduplicated size (against the current repository, not when the archive was created)
     nfiles
         Number of regular files in the archive
-limits
-    Object describing the utilization of Borg limits
-
-    max_archive_size
-        Float between 0 and 1 describing how large this archive is relative to the maximum size allowed by Borg
 command_line
     Array of strings of the command line that created the archive
 

@@ -405,9 +400,6 @@ The same archive with more information (``borg info --last 1 --json``)::
             "end": "2017-02-27T12:27:20.789123",
             "hostname": "host",
             "id": "80cd07219ad725b3c5f665c1dcf119435c4dee1647a560ecac30f8d40221a46a",
-            "limits": {
-                "max_archive_size": 0.0001330855110409714
-            },
             "name": "host-system-backup-2017-02-27",
             "start": "2017-02-27T12:27:20.789123",
             "stats": {

@@ -391,6 +391,38 @@ def get_item_uid_gid(item, *, numeric, uid_forced=None, gid_forced=None, uid_def
     return uid, gid
 
 
+def archive_get_items(metadata, key, repository):
+    if "item_ptrs" in metadata:  # looks like a v2+ archive
+        assert "items" not in metadata
+        items = []
+        for id, data in zip(metadata.item_ptrs, repository.get_many(metadata.item_ptrs)):
+            data = key.decrypt(id, data)
+            ids = msgpack.unpackb(data)
+            items.extend(ids)
+        return items
+
+    if "items" in metadata:  # legacy, v1 archive
+        assert "item_ptrs" not in metadata
+        return metadata.items
+
+
+def archive_put_items(chunk_ids, *, key, cache=None, stats=None, add_reference=None):
+    """gets a (potentially large) list of archive metadata stream chunk ids and writes them to repo objects"""
+    item_ptrs = []
+    for i in range(0, len(chunk_ids), IDS_PER_CHUNK):
+        data = msgpack.packb(chunk_ids[i : i + IDS_PER_CHUNK])
+        id = key.id_hash(data)
+        if cache is not None and stats is not None:
+            cache.add_chunk(id, data, stats)
+        elif add_reference is not None:
+            cdata = key.encrypt(id, data)
+            add_reference(id, len(data), cdata)
+        else:
+            raise NotImplementedError
+        item_ptrs.append(id)
+    return item_ptrs
+
+
 class Archive:
     class DoesNotExist(Error):
         """Archive {} does not exist"""

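As a rough, self-contained illustration of the round trip above: the FakeStore class below is a hypothetical stand-in for borg's key/repository/cache plumbing (no encryption, purely in-memory); only the grouping into msgpack-packed pointer chunks mirrors the archive_put_items()/archive_get_items() logic shown in the hunk::

    import hashlib
    import msgpack

    IDS_PER_CHUNK = 3  # same illustrative value as in the constants change further below

    class FakeStore:
        """hypothetical stand-in for key + repository (no encryption, in-memory)"""
        def __init__(self):
            self.objects = {}

        def id_hash(self, data):
            return hashlib.sha256(data).digest()

        def put(self, id, data):
            self.objects[id] = data

        def get_many(self, ids):
            return [self.objects[id] for id in ids]

    def put_items(chunk_ids, store):
        # group the metadata stream chunk ids into msgpack-packed "pointer chunks"
        item_ptrs = []
        for i in range(0, len(chunk_ids), IDS_PER_CHUNK):
            data = msgpack.packb(chunk_ids[i : i + IDS_PER_CHUNK])
            id = store.id_hash(data)
            store.put(id, data)
            item_ptrs.append(id)
        return item_ptrs

    def get_items(item_ptrs, store):
        # resolve the pointer chunks back into the flat list of metadata chunk ids
        items = []
        for data in store.get_many(item_ptrs):
            items.extend(msgpack.unpackb(data))
        return items

    chunk_ids = [bytes([n]) * 32 for n in range(10)]  # ten fake 32-byte chunk ids
    store = FakeStore()
    ptrs = put_items(chunk_ids, store)
    assert len(ptrs) == 4                             # ceil(10 / 3) pointer chunks
    assert get_items(ptrs, store) == chunk_ids
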
@@ -479,6 +511,8 @@ def _load_meta(self, id):
         metadata = ArchiveItem(internal_dict=msgpack.unpackb(data))
         if metadata.version not in (1, 2):  # legacy: still need to read v1 archives
             raise Exception("Unknown archive metadata version")
+        # note: metadata.items must not get written to disk!
+        metadata.items = archive_get_items(metadata, self.key, self.repository)
         return metadata
 
     def load(self, id):

@@ -512,10 +546,6 @@ def duration(self):
     def duration_from_meta(self):
         return format_timedelta(self.ts_end - self.ts)
 
-    def _archive_csize(self):
-        cdata = self.repository.get(self.id)
-        return len(cdata)
-
     def info(self):
         if self.create:
             stats = self.stats

@@ -532,7 +562,6 @@ def info(self):
             "end": OutputTimestamp(end),
             "duration": (end - start).total_seconds(),
             "stats": stats.as_dict(),
-            "limits": {"max_archive_size": self._archive_csize() / MAX_DATA_SIZE},
         }
         if self.create:
             info["command_line"] = sys.argv

@@ -556,12 +585,10 @@ def __str__(self):
 Time (start): {start}
 Time (end): {end}
 Duration: {0.duration}
-Utilization of max. archive size: {csize_max:.0%}
 """.format(
             self,
             start=OutputTimestamp(self.start.replace(tzinfo=timezone.utc)),
             end=OutputTimestamp(self.end.replace(tzinfo=timezone.utc)),
-            csize_max=self._archive_csize() / MAX_DATA_SIZE,
             location=self.repository._location.canonical_path(),
         )
 

@@ -599,6 +626,7 @@ def save(self, name=None, comment=None, timestamp=None, stats=None, additional_m
         if name in self.manifest.archives:
             raise self.AlreadyExists(name)
         self.items_buffer.flush(flush=True)
+        item_ptrs = archive_put_items(self.items_buffer.chunks, key=self.key, cache=self.cache, stats=self.stats)
         duration = timedelta(seconds=time.monotonic() - self.start_monotonic)
         if timestamp is None:
             end = datetime.utcnow()

@@ -612,7 +640,7 @@ def save(self, name=None, comment=None, timestamp=None, stats=None, additional_m
             "version": 2,
             "name": name,
             "comment": comment or "",
-            "items": self.items_buffer.chunks,
+            "item_ptrs": item_ptrs,  # see #1473
             "cmdline": sys.argv,
             "hostname": hostname,
             "username": getuser(),

@@ -930,6 +958,8 @@ def restore_attrs(self, path, item, symlink=False, fd=None):
     def set_meta(self, key, value):
         metadata = self._load_meta(self.id)
         setattr(metadata, key, value)
+        if "items" in metadata:
+            del metadata.items
         data = msgpack.packb(metadata.as_dict())
         new_id = self.key.id_hash(data)
         self.cache.add_chunk(new_id, data, self.stats)

@@ -1004,6 +1034,11 @@ def chunk_decref(id, stats, part=False):
                     if forced == 0:
                         raise
                     error = True
+
+        # delete the blocks that store all the references that end up being loaded into metadata.items:
+        for id in self.metadata.item_ptrs:
+            chunk_decref(id, stats)
+
         # in forced delete mode, we try hard to delete at least the manifest entry,
         # if possible also the archive superblock, even if processing the items raises
         # some harmless exception.

@@ -1997,7 +2032,8 @@ def valid_item(obj):
             return True, ""
 
         i = 0
-        for state, items in groupby(archive.items, missing_chunk_detector):
+        archive_items = archive_get_items(archive, self.key, repository)
+        for state, items in groupby(archive_items, missing_chunk_detector):
             items = list(items)
             if state % 2:
                 for chunk_id in items:

@@ -2078,9 +2114,11 @@ def valid_item(obj):
                     verify_file_chunks(info.name, item)
                 items_buffer.add(item)
             items_buffer.flush(flush=True)
-            for previous_item_id in archive.items:
+            for previous_item_id in archive_get_items(archive, self.key, self.repository):
                 mark_as_possibly_superseded(previous_item_id)
-            archive.items = items_buffer.chunks
+            for previous_item_ptr in archive.item_ptrs:
+                mark_as_possibly_superseded(previous_item_ptr)
+            archive.item_ptrs = archive_put_items(items_buffer.chunks, key=self.key, add_reference=add_reference)
             data = msgpack.packb(archive.as_dict())
             new_archive_id = self.key.id_hash(data)
             cdata = self.key.encrypt(new_archive_id, data)

@@ -72,7 +72,11 @@ def output(fd):
 
             unpacker = msgpack.Unpacker(use_list=False, object_hook=StableDict)
             first = True
-            for item_id in archive_org_dict["items"]:
+            items = []
+            for chunk_id in archive_org_dict["item_ptrs"]:
+                data = key.decrypt(chunk_id, repository.get(chunk_id))
+                items.extend(msgpack.unpackb(data))
+            for item_id in items:
                 data = key.decrypt(item_id, repository.get(item_id))
                 unpacker.feed(data)
                 for item in unpacker:

@@ -55,7 +55,6 @@ def format_cmdline(cmdline):
 Time (end): {end}
 Duration: {duration}
 Command line: {command_line}
-Utilization of maximum supported archive size: {limits[max_archive_size]:.0%}
 Number of files: {stats[nfiles]}
 Original size: {stats[original_size]}
 Deduplicated size: {stats[deduplicated_size]}

@@ -88,11 +87,6 @@ def build_parser_info(self, subparsers, common_parser, mid_common_parser):
                                          = unique chunks of this archive.
         All archives / deduplicated size = amount of data stored in the repo
                                          = all chunks in the repository.
-
-        Borg archives can only contain a limited amount of file metadata.
-        The size of an archive relative to this limit depends on a number of factors,
-        mainly the number of files, the lengths of paths and other metadata stored for files.
-        This is shown as *utilization of maximum supported archive size*.
         """
         )
         subparser = subparsers.add_parser(

@@ -775,8 +775,16 @@ def fetch_and_build_idx(archive_id, decrypted_repository, chunk_idx):
             archive = ArchiveItem(internal_dict=msgpack.unpackb(data))
             if archive.version not in (1, 2):  # legacy
                 raise Exception("Unknown archive metadata version")
+            if archive.version == 1:
+                items = archive.items
+            elif archive.version == 2:
+                items = []
+                for chunk_id, (csize, data) in zip(archive.item_ptrs, decrypted_repository.get_many(archive.item_ptrs)):
+                    chunk_idx.add(chunk_id, 1, len(data))
+                    ids = msgpack.unpackb(data)
+                    items.extend(ids)
             sync = CacheSynchronizer(chunk_idx)
-            for item_id, (csize, data) in zip(archive.items, decrypted_repository.get_many(archive.items)):
+            for item_id, (csize, data) in zip(items, decrypted_repository.get_many(items)):
                 chunk_idx.add(item_id, 1, len(data))
                 processed_item_metadata_bytes += len(data)
                 processed_item_metadata_chunks += 1

@@ -11,7 +11,9 @@
 
 # this set must be kept complete, otherwise rebuild_manifest might malfunction:
 # fmt: off
-ARCHIVE_KEYS = frozenset(['version', 'name', 'items', 'cmdline', 'hostname', 'username', 'time', 'time_end',
+ARCHIVE_KEYS = frozenset(['version', 'name', 'cmdline', 'hostname', 'username', 'time', 'time_end',
+                          'items',  # legacy v1 archives
+                          'item_ptrs',  # v2+ archives
                           'comment', 'chunker_params',
                           'recreate_cmdline',
                           'recreate_source_id', 'recreate_args', 'recreate_partial_chunks',  # used in 1.1.0b1 .. b2

@@ -19,7 +21,7 @@
 # fmt: on
 
 # this is the set of keys that are always present in archives:
-REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "items", "cmdline", "time"])
+REQUIRED_ARCHIVE_KEYS = frozenset(["version", "name", "item_ptrs", "cmdline", "time"])
 
 # default umask, overridden by --umask, defaults to read/write only for owner
 UMASK_DEFAULT = 0o077

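A sketch of how these two key sets could be used together, assuming the frozensets from the hunk above are in scope; borg's actual manifest-rebuild check is not part of this diff and may differ::

    def looks_like_archive(d: dict) -> bool:
        # assumption: a plausible archive dict must contain all required keys and
        # nothing outside the known complete set
        keys = set(d)
        return REQUIRED_ARCHIVE_KEYS <= keys <= ARCHIVE_KEYS
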
@@ -47,6 +49,9 @@
 # borg < 1.3, but this is not expected to cause any issues.
 MAX_OBJECT_SIZE = MAX_DATA_SIZE + 41 + 8  # see assertion at end of repository module
 
+# how many metadata stream chunk ids do we store into a "pointer chunk" of the ArchiveItem.item_ptrs list?
+IDS_PER_CHUNK = 3  # MAX_DATA_SIZE // 40
+
 # repo config max_segment_size value must be below this limit to stay within uint32 offsets:
 MAX_SEGMENT_SIZE_LIMIT = 2**32 - MAX_OBJECT_SIZE
 

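A back-of-the-envelope view of why this indirection lifts the old limit so drastically, assuming IDS_PER_CHUNK would in practice be MAX_DATA_SIZE // 40 (as the inline comment hints) and taking MAX_DATA_SIZE of roughly 20 MiB from the removed documentation above::

    MAX_DATA_SIZE = 20 * 1024**2 - 41 - 8           # assumption derived from MAX_OBJECT_SIZE ~ 20 MiB
    ids_per_pointer_chunk = MAX_DATA_SIZE // 40     # ids one pointer chunk can hold
    pointer_ids_per_archive = MAX_DATA_SIZE // 40   # pointer ids the archive object itself can hold
    item_chunks = ids_per_pointer_chunk * pointer_ids_per_archive
    print(f"~{item_chunks:.1e} item metadata chunks per archive")  # ~2.7e11, vs ~5e5 before
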
@@ -98,6 +98,10 @@ class ArchiveItem(PropDict):
     def items(self) -> List: ...
     @items.setter
     def items(self, val: List) -> None: ...
+    @property
+    def item_ptrs(self) -> List: ...
+    @item_ptrs.setter
+    def item_ptrs(self, val: List) -> None: ...
 
 class ChunkListEntry(NamedTuple):
     id: bytes

@@ -483,7 +483,8 @@ class ArchiveItem(PropDict):
 
     version = PropDict._make_property('version', int)
     name = PropDict._make_property('name', str, 'surrogate-escaped str')
-    items = PropDict._make_property('items', list)
+    items = PropDict._make_property('items', list)  # list of chunk ids of item metadata stream (only in memory)
+    item_ptrs = PropDict._make_property('item_ptrs', list)  # list of blocks with list of chunk ids of ims, arch v2
     cmdline = PropDict._make_property('cmdline', list)  # list of s-e-str
     hostname = PropDict._make_property('hostname', str, 'surrogate-escaped str')
     username = PropDict._make_property('username', str, 'surrogate-escaped str')

@@ -515,7 +516,9 @@ class ArchiveItem(PropDict):
                 v = fix_tuple_of_str_and_int(v)
             if k in ('cmdline', 'recreate_cmdline'):
                 v = fix_list_of_str(v)
-            if k == 'items':
+            if k == 'items':  # legacy
                 v = fix_list_of_bytes(v)
+            if k == 'item_ptrs':
+                v = fix_list_of_bytes(v)
             self._dict[k] = v
 

@@ -3981,7 +3981,7 @@ def test_manifest_rebuild_duplicate_archive(self):
         archive = msgpack.packb(
             {
                 "cmdline": [],
-                "items": [],
+                "item_ptrs": [],
                 "hostname": "foo",
                 "username": "bar",
                 "name": "archive1",