Add --format option to `borg diff`, resolve issue #4634 (#7534)

diff: add --format option

also: refactoring/improvements of BaseFormatter
This commit is contained in:
Tarrailt 2023-06-12 04:41:36 +08:00 committed by GitHub
parent 8506c05ab6
commit 616d5e7330
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 492 additions and 266 deletions

View File

@ -11,6 +11,7 @@ from functools import partial
from getpass import getuser
from io import BytesIO
from itertools import groupby, zip_longest
from typing import Iterator
from shutil import get_terminal_size
from .platformflags import is_win32
@ -297,31 +298,24 @@ class DownloadPipeline:
unpacker = msgpack.Unpacker(use_list=False)
for data in self.fetch_many(ids):
unpacker.feed(data)
items = [Item(internal_dict=item) for item in unpacker]
for item in items:
for _item in unpacker:
item = Item(internal_dict=_item)
if "chunks" in item:
item.chunks = [ChunkListEntry(*e) for e in item.chunks]
if filter:
items = [item for item in items if filter(item)]
if preload:
for item in items:
if "chunks" in item:
hlid = item.get("hlid", None)
if hlid is None:
preload_chunks = True
else:
if hlid in hlids_preloaded:
preload_chunks = False
else:
# not having the hardlink's chunks already preloaded for other hardlink to same inode
preload_chunks = True
hlids_preloaded.add(hlid)
if preload_chunks:
self.repository.preload([c.id for c in item.chunks])
for item in items:
if filter and not filter(item):
continue
if preload and "chunks" in item:
hlid = item.get("hlid", None)
if hlid is None:
preload_chunks = True
elif hlid in hlids_preloaded:
preload_chunks = False
else:
# not having the hardlink's chunks already preloaded for other hardlink to same inode
preload_chunks = True
hlids_preloaded.add(hlid)
if preload_chunks:
self.repository.preload([c.id for c in item.chunks])
yield item
def fetch_many(self, ids, is_preloaded=False):
@ -631,10 +625,9 @@ Duration: {0.duration}
def iter_items(self, filter=None, preload=False):
# note: when calling this with preload=True, later fetch_many() must be called with
# is_preloaded=True or the RemoteRepository code will leak memory!
for item in self.pipeline.unpack_many(
yield from self.pipeline.unpack_many(
self.metadata.items, preload=preload, filter=lambda item: self.item_filter(item, filter)
):
yield item
)
def add_item(self, item, show_progress=True, stats=None):
if show_progress and self.show_progress:
@ -1123,55 +1116,59 @@ Duration: {0.duration}
logger.warning("borg check --repair is required to free all space.")
@staticmethod
def compare_archives_iter(archive1, archive2, matcher=None, can_compare_chunk_ids=False, content_only=False):
def compare_archives_iter(
archive1: "Archive", archive2: "Archive", matcher=None, can_compare_chunk_ids=False
) -> Iterator[ItemDiff]:
"""
Yields tuples with a path and an ItemDiff instance describing changes/indicating equality.
Yields an ItemDiff instance describing changes/indicating equality.
:param matcher: PatternMatcher class to restrict results to only matching paths.
:param can_compare_chunk_ids: Whether --chunker-params are the same for both archives.
"""
def compare_items(item1, item2):
def compare_items(path: str, item1: Item, item2: Item):
return ItemDiff(
path,
item1,
item2,
archive1.pipeline.fetch_many([c.id for c in item1.get("chunks", [])]),
archive2.pipeline.fetch_many([c.id for c in item2.get("chunks", [])]),
can_compare_chunk_ids=can_compare_chunk_ids,
content_only=content_only,
)
orphans_archive1 = OrderedDict()
orphans_archive2 = OrderedDict()
orphans_archive1: OrderedDict[str, Item] = OrderedDict()
orphans_archive2: OrderedDict[str, Item] = OrderedDict()
assert matcher is not None, "matcher must be set"
for item1, item2 in zip_longest(
archive1.iter_items(lambda item: matcher.match(item.path)),
archive2.iter_items(lambda item: matcher.match(item.path)),
):
if item1 and item2 and item1.path == item2.path:
yield (item1.path, compare_items(item1, item2))
yield compare_items(item1.path, item1, item2)
continue
if item1:
matching_orphan = orphans_archive2.pop(item1.path, None)
if matching_orphan:
yield (item1.path, compare_items(item1, matching_orphan))
yield compare_items(item1.path, item1, matching_orphan)
else:
orphans_archive1[item1.path] = item1
if item2:
matching_orphan = orphans_archive1.pop(item2.path, None)
if matching_orphan:
yield (matching_orphan.path, compare_items(matching_orphan, item2))
yield compare_items(matching_orphan.path, matching_orphan, item2)
else:
orphans_archive2[item2.path] = item2
# At this point orphans_* contain items that had no matching partner in the other archive
for added in orphans_archive2.values():
path = added.path
deleted_item = Item.create_deleted(path)
yield (path, compare_items(deleted_item, added))
yield compare_items(path, deleted_item, added)
for deleted in orphans_archive1.values():
path = deleted.path
deleted_item = Item.create_deleted(path)
yield (path, compare_items(deleted, deleted_item))
yield compare_items(path, deleted, deleted_item)
class MetadataCollector:

View File

@ -1,13 +1,14 @@
import argparse
import textwrap
import json
import sys
import os
from ._common import with_repository, with_archive, build_matcher
from ._common import with_repository, with_archive, build_matcher, Highlander
from ..archive import Archive
from ..constants import * # NOQA
from ..helpers import archivename_validator
from ..helpers import BaseFormatter, DiffFormatter, archivename_validator, BorgJsonEncoder
from ..manifest import Manifest
from ..helpers.parseformat import BorgJsonEncoder
from ..logger import create_logger
logger = create_logger()
@ -18,14 +19,12 @@ class DiffMixIn:
@with_archive
def do_diff(self, args, repository, manifest, archive):
"""Diff contents of two archives"""
def print_json_output(diff, path):
print(json.dumps({"path": path, "changes": [j for j, str in diff]}, sort_keys=True, cls=BorgJsonEncoder))
def print_text_output(diff, path):
print("{:<19} {}".format(" ".join([str for j, str in diff]), path))
print_output = print_json_output if args.json_lines else print_text_output
if args.format is not None:
format = args.format
elif args.content_only:
format = "{content}{link}{directory}{blkdev}{chrdev}{fifo} {path}{NL}"
else:
format = os.environ.get("BORG_DIFF_FORMAT", "{change} {path}{NL}")
archive1 = archive
archive2 = Archive(manifest, args.other_name)
@ -43,17 +42,36 @@ class DiffMixIn:
matcher = build_matcher(args.patterns, args.paths)
diffs = Archive.compare_archives_iter(
archive1, archive2, matcher, can_compare_chunk_ids=can_compare_chunk_ids, content_only=args.content_only
diffs_iter = Archive.compare_archives_iter(
archive1, archive2, matcher, can_compare_chunk_ids=can_compare_chunk_ids
)
# Conversion to string and filtering for diff.equal to save memory if sorting
diffs = ((path, diff.changes()) for path, diff in diffs if not diff.equal)
diffs = (diff for diff in diffs_iter if not diff.equal(args.content_only))
if args.sort:
diffs = sorted(diffs)
diffs = sorted(diffs, key=lambda diff: diff.path)
for path, diff in diffs:
print_output(diff, path)
formatter = DiffFormatter(format, args.content_only)
for diff in diffs:
if args.json_lines:
print(
json.dumps(
{
"path": diff.path,
"changes": [
change.to_dict()
for name, change in diff.changes().items()
if not args.content_only or (name not in DiffFormatter.METADATA)
],
},
sort_keys=True,
cls=BorgJsonEncoder,
)
)
else:
res: str = formatter.format_item(diff)
if res.strip():
sys.stdout.write(res)
for pattern in matcher.get_unmatched_include_patterns():
self.print_warning("Include pattern '%s' never matched.", pattern)
@ -64,25 +82,48 @@ class DiffMixIn:
from ._common import process_epilog
from ._common import define_exclusion_group
diff_epilog = process_epilog(
"""
This command finds differences (file contents, user/group/mode) between archives.
diff_epilog = (
process_epilog(
"""
This command finds differences (file contents, metadata) between ARCHIVE1 and ARCHIVE2.
A repository location and an archive name must be specified for REPO::ARCHIVE1.
ARCHIVE2 is just another archive name in same repository (no repository location
allowed).
For more help on include/exclude patterns, see the :ref:`borg_patterns` command output.
For archives created with Borg 1.1 or newer diff automatically detects whether
the archives are created with the same chunker params. If so, only chunk IDs
are compared, which is very fast.
.. man NOTES
For archives prior to Borg 1.1 chunk contents are compared by default.
If you did not create the archives with different chunker params,
pass ``--same-chunker-params``.
Note that the chunker params changed from Borg 0.xx to 1.0.
The FORMAT specifier syntax
+++++++++++++++++++++++++++
For more help on include/exclude patterns, see the :ref:`borg_patterns` command output.
"""
The ``--format`` option uses python's `format string syntax
<https://docs.python.org/3.9/library/string.html#formatstrings>`_.
Examples:
::
$ borg diff --format '{content:30} {path}{NL}' ArchiveFoo ArchiveBar
modified: +4.1 kB -1.0 kB file-diff
...
# {VAR:<NUMBER} - pad to NUMBER columns left-aligned.
# {VAR:>NUMBER} - pad to NUMBER columns right-aligned.
$ borg diff --format '{content:>30} {path}{NL}' ArchiveFoo ArchiveBar
modified: +4.1 kB -1.0 kB file-diff
...
The following keys are always available:
"""
)
+ BaseFormatter.keys_help()
+ textwrap.dedent(
"""
Keys available only when showing differences between archives:
"""
)
+ DiffFormatter.keys_help()
)
subparser = subparsers.add_parser(
"diff",
@ -107,6 +148,13 @@ class DiffMixIn:
help="Override check of chunker parameters.",
)
subparser.add_argument("--sort", dest="sort", action="store_true", help="Sort the output lines by file path.")
subparser.add_argument(
"--format",
metavar="FORMAT",
dest="format",
action=Highlander,
help='specify format for differences between archives (default: "{change} {path}{NL}")',
)
subparser.add_argument("--json-lines", action="store_true", help="Format output as JSON Lines. ")
subparser.add_argument(
"--content-only",

View File

@ -29,10 +29,9 @@ class ListMixIn:
def _list_inner(cache):
archive = Archive(manifest, args.name, cache=cache)
formatter = ItemFormatter(archive, format, json_lines=args.json_lines)
formatter = ItemFormatter(archive, format)
for item in archive.iter_items(lambda item: matcher.match(item.path)):
sys.stdout.write(formatter.format_item(item))
sys.stdout.write(formatter.format_item(item, args.json_lines, sort=True))
# Only load the cache if it will be used
if ItemFormatter.format_needs_cache(format):

View File

@ -89,7 +89,7 @@ class PruneMixIn:
format = "{archive}"
else:
format = os.environ.get("BORG_PRUNE_FORMAT", "{archive:<36} {time} [{id}]")
formatter = ArchiveFormatter(format, repository, manifest, manifest.key, json=False, iec=args.iec)
formatter = ArchiveFormatter(format, repository, manifest, manifest.key, iec=args.iec)
checkpoint_re = r"\.checkpoint(\.\d+)?"
archives_checkpoints = manifest.archives.list(
@ -169,7 +169,7 @@ class PruneMixIn:
or (args.list_pruned and archive in to_delete)
or (args.list_kept and archive not in to_delete)
):
list_logger.info(f"{log_message:<40} {formatter.format_item(archive)}")
list_logger.info(f"{log_message:<40} {formatter.format_item(archive, jsonline=False)}")
pi.finish()
if sig_int:
# Ctrl-C / SIGINT: do not checkpoint (commit) again, we already have a checkpoint in this case.

View File

@ -23,15 +23,15 @@ class RListMixIn:
format = "{archive}{NL}"
else:
format = os.environ.get("BORG_RLIST_FORMAT", "{archive:<36} {time} [{id}]{NL}")
formatter = ArchiveFormatter(format, repository, manifest, manifest.key, json=args.json, iec=args.iec)
formatter = ArchiveFormatter(format, repository, manifest, manifest.key, iec=args.iec)
output_data = []
for archive_info in manifest.archives.list_considering(args):
if args.json:
output_data.append(formatter.get_item_data(archive_info))
output_data.append(formatter.get_item_data(archive_info, args.json))
else:
sys.stdout.write(formatter.format_item(archive_info))
sys.stdout.write(formatter.format_item(archive_info, args.json))
if args.json:
json_print(basic_json_data(manifest, extra={"archives": output_data}))

View File

@ -28,7 +28,7 @@ from .parseformat import sizeof_fmt, sizeof_fmt_iec, sizeof_fmt_decimal, Locatio
from .parseformat import format_line, replace_placeholders, PlaceholderError, relative_time_marker_validator
from .parseformat import format_archive, parse_stringified_list, clean_lines
from .parseformat import location_validator, archivename_validator, comment_validator
from .parseformat import BaseFormatter, ArchiveFormatter, ItemFormatter, file_status
from .parseformat import BaseFormatter, ArchiveFormatter, ItemFormatter, DiffFormatter, file_status
from .parseformat import swidth_slice, ellipsis_truncate
from .parseformat import BorgJsonEncoder, basic_json_data, json_print, json_dump, prepare_dump_dict
from .parseformat import Highlander, MakePathSafeAction

View File

@ -1,3 +1,4 @@
import abc
import argparse
import base64
import hashlib
@ -8,6 +9,7 @@ import re
import shlex
import stat
import uuid
from typing import List, Dict, Set, Tuple, ClassVar, Any, TYPE_CHECKING, Literal
from binascii import hexlify
from collections import Counter, OrderedDict
from datetime import datetime, timezone
@ -27,6 +29,9 @@ from .. import __version_tuple__ as borg_version_tuple
from ..constants import * # NOQA
from ..platformflags import is_win32
if TYPE_CHECKING:
from ..item import ItemDiff
def bin_to_hex(binary):
return hexlify(binary).decode("ascii")
@ -649,8 +654,10 @@ def archivename_validator(text):
return validate_text(text)
class BaseFormatter:
FIXED_KEYS = {
class BaseFormatter(metaclass=abc.ABCMeta):
format: str
static_data: Dict[str, Any]
FIXED_KEYS: ClassVar[Dict[str, str]] = {
# Formatting aids
"LF": "\n",
"SPACE": " ",
@ -660,25 +667,49 @@ class BaseFormatter:
"NEWLINE": "\n",
"NL": "\n", # \n is automatically converted to os.linesep on write
}
KEY_DESCRIPTIONS: ClassVar[Dict[str, str]] = {
"NEWLINE": "OS dependent line separator",
"NL": "alias of NEWLINE",
"NUL": "NUL character for creating print0 / xargs -0 like output",
"SPACE": "space character",
"TAB": "tab character",
"CR": "carriage return character",
"LF": "line feed character",
}
KEY_GROUPS: ClassVar[Tuple[Tuple[str, ...], ...]] = (("NEWLINE", "NL", "NUL", "SPACE", "TAB", "CR", "LF"),)
def get_item_data(self, item):
def __init__(self, format: str, static: Dict[str, Any]) -> None:
self.format = partial_format(format, static)
self.static_data = static
@abc.abstractmethod
def get_item_data(self, item, jsonline=False) -> dict:
raise NotImplementedError
def format_item(self, item):
return self.format.format_map(self.get_item_data(item))
@staticmethod
def keys_help():
def format_item(self, item, jsonline=False, sort=False):
data = self.get_item_data(item, jsonline)
return (
"- NEWLINE: OS dependent line separator\n"
"- NL: alias of NEWLINE\n"
"- NUL: NUL character for creating print0 / xargs -0 like output\n"
"- SPACE\n"
"- TAB\n"
"- CR\n"
"- LF"
f"{json.dumps(data, cls=BorgJsonEncoder, sort_keys=sort)}\n" if jsonline else self.format.format_map(data)
)
@classmethod
def keys_help(cls):
help = []
keys: Set[str] = set()
keys.update(cls.KEY_DESCRIPTIONS.keys())
keys.update(key for group in cls.KEY_GROUPS for key in group)
for group in cls.KEY_GROUPS:
for key in group:
keys.remove(key)
text = "- " + key
if key in cls.KEY_DESCRIPTIONS:
text += ": " + cls.KEY_DESCRIPTIONS[key]
help.append(text)
help.append("")
assert not keys, str(keys)
return "\n".join(help)
class ArchiveFormatter(BaseFormatter):
KEY_DESCRIPTIONS = {
@ -703,47 +734,17 @@ class ArchiveFormatter(BaseFormatter):
("size", "nfiles"),
)
@classmethod
def available_keys(cls):
from ..manifest import ArchiveInfo
fake_archive_info = ArchiveInfo("archivename", b"\1" * 32, datetime(1970, 1, 1, tzinfo=timezone.utc))
formatter = cls("", None, None, None)
keys = []
keys.extend(formatter.call_keys.keys())
keys.extend(formatter.get_item_data(fake_archive_info).keys())
return keys
@classmethod
def keys_help(cls):
help = []
keys = cls.available_keys()
for key in cls.FIXED_KEYS:
keys.remove(key)
for group in cls.KEY_GROUPS:
for key in group:
keys.remove(key)
text = "- " + key
if key in cls.KEY_DESCRIPTIONS:
text += ": " + cls.KEY_DESCRIPTIONS[key]
help.append(text)
help.append("")
assert not keys, str(keys)
return "\n".join(help)
def __init__(self, format, repository, manifest, key, *, json=False, iec=False):
def __init__(self, format, repository, manifest, key, *, iec=False):
static_data = {} # here could be stuff on repo level, above archive level
static_data.update(self.FIXED_KEYS)
super().__init__(format, static_data)
self.repository = repository
self.manifest = manifest
self.key = key
self.name = None
self.id = None
self._archive = None
self.json = json
self.iec = iec
static_keys = {} # here could be stuff on repo level, above archive level
static_keys.update(self.FIXED_KEYS)
self.format = partial_format(format, static_keys)
self.format_keys = {f[1] for f in Formatter().parse(format)}
self.call_keys = {
"hostname": partial(self.get_meta, "hostname", ""),
@ -755,20 +756,12 @@ class ArchiveFormatter(BaseFormatter):
"end": self.get_ts_end,
}
self.used_call_keys = set(self.call_keys) & self.format_keys
if self.json:
self.item_data = {}
self.format_item = self.format_item_json
else:
self.item_data = static_keys
def format_item_json(self, item):
return json.dumps(self.get_item_data(item), cls=BorgJsonEncoder) + "\n"
def get_item_data(self, archive_info):
def get_item_data(self, archive_info, jsonline=False):
self.name = archive_info.name
self.id = archive_info.id
item_data = {}
item_data.update(self.item_data)
item_data.update({} if jsonline else self.static_data)
item_data.update(
{
"name": archive_info.name,
@ -812,15 +805,31 @@ class ItemFormatter(BaseFormatter):
# shake_* is not provided because it uses an incompatible .digest() method to support variable length.
hash_algorithms = set(hashlib.algorithms_guaranteed).union({"xxh64"}).difference({"shake_128", "shake_256"})
KEY_DESCRIPTIONS = {
"type": "file type (file, dir, symlink, ...)",
"mode": "file mode (as in stat)",
"uid": "user id of file owner",
"gid": "group id of file owner",
"user": "user name of file owner",
"group": "group name of file owner",
"path": "file path",
"target": "link target for symlinks",
"hlid": "hard link identity (same if hardlinking same fs object)",
"flags": "file flags",
"extra": 'prepends {target} with " -> " for soft links and " link to " for hard links',
"size": "file size",
"dsize": "deduplicated size",
"num_chunks": "number of chunks in this file",
"unique_chunks": "number of unique chunks in this file",
"mtime": "file modification time",
"ctime": "file change time",
"atime": "file access time",
"isomtime": "file modification time (ISO 8601 format)",
"isoctime": "file change time (ISO 8601 format)",
"isoatime": "file access time (ISO 8601 format)",
"xxh64": "XXH64 checksum of this file (note: this is NOT a cryptographic hash!)",
"health": 'either "healthy" (file ok) or "broken" (if file has all-zero replacement chunks)',
"archiveid": "internal ID of the archive",
"archivename": "name of the archive",
}
KEY_GROUPS = (
("type", "mode", "uid", "gid", "user", "group", "path", "target", "hlid", "flags"),
@ -833,57 +842,19 @@ class ItemFormatter(BaseFormatter):
KEYS_REQUIRING_CACHE = ("dsize", "unique_chunks")
@classmethod
def available_keys(cls):
class FakeArchive:
fpr = name = ""
from ..item import Item
fake_item = Item(mode=0, path="foo", user="", group="", mtime=0, uid=0, gid=0)
formatter = cls(FakeArchive, "")
keys = []
keys.extend(formatter.call_keys.keys())
keys.extend(formatter.get_item_data(fake_item).keys())
return keys
@classmethod
def keys_help(cls):
help = []
keys = cls.available_keys()
for key in cls.FIXED_KEYS:
keys.remove(key)
for group in cls.KEY_GROUPS:
for key in group:
keys.remove(key)
text = "- " + key
if key in cls.KEY_DESCRIPTIONS:
text += ": " + cls.KEY_DESCRIPTIONS[key]
help.append(text)
help.append("")
assert not keys, str(keys)
return "\n".join(help)
@classmethod
def format_needs_cache(cls, format):
format_keys = {f[1] for f in Formatter().parse(format)}
return any(key in cls.KEYS_REQUIRING_CACHE for key in format_keys)
def __init__(self, archive, format, *, json_lines=False):
def __init__(self, archive, format):
from ..checksums import StreamingXXH64
static_data = {"archivename": archive.name, "archiveid": archive.fpr}
static_data.update(self.FIXED_KEYS)
super().__init__(format, static_data)
self.xxh64 = StreamingXXH64
self.archive = archive
self.json_lines = json_lines
static_keys = {"archivename": archive.name, "archiveid": archive.fpr}
static_keys.update(self.FIXED_KEYS)
if self.json_lines:
self.item_data = {}
self.format_item = self.format_item_json
else:
self.item_data = static_keys
self.format = partial_format(format, static_keys)
self.format_keys = {f[1] for f in Formatter().parse(format)}
self.call_keys = {
"size": self.calculate_size,
@ -901,17 +872,14 @@ class ItemFormatter(BaseFormatter):
self.call_keys[hash_function] = partial(self.hash_item, hash_function)
self.used_call_keys = set(self.call_keys) & self.format_keys
def format_item_json(self, item):
return json.dumps(self.get_item_data(item), cls=BorgJsonEncoder, sort_keys=True) + "\n"
def get_item_data(self, item):
def get_item_data(self, item, jsonline=False):
item_data = {}
item_data.update(self.item_data)
item_data.update({} if jsonline else self.static_data)
item_data.update(text_to_json("path", item.path))
target = item.get("target", "")
item_data.update(text_to_json("target", target))
if not self.json_lines:
if not jsonline:
item_data["extra"] = "" if not target else f" -> {item_data['target']}"
hlid = item.get("hlid")
@ -928,7 +896,7 @@ class ItemFormatter(BaseFormatter):
item_data.update(text_to_json("user", item.get("user", str(item_data["uid"]))))
item_data.update(text_to_json("group", item.get("group", str(item_data["gid"]))))
if self.json_lines:
if jsonline:
item_data["healthy"] = "chunks_healthy" not in item
else:
item_data["health"] = "broken" if "chunks_healthy" in item else "healthy"
@ -944,7 +912,7 @@ class ItemFormatter(BaseFormatter):
item: The item to sum its unique chunks' metadata
metadata_func: A function that takes a parameter of type ChunkIndexEntry and returns a number, used to return
the metadata needed from the chunk
the metadata needed from the chunk
"""
chunk_index = self.archive.cache.chunks
chunks = item.get("chunks", [])
@ -976,6 +944,134 @@ class ItemFormatter(BaseFormatter):
return self.format_time(key, item).isoformat()
class DiffFormatter(BaseFormatter):
    """Formatter for ``borg diff`` output: renders one ItemDiff per line via a format string."""

    KEY_DESCRIPTIONS = {
        "path": "archived file path",
        "change": "all available changes",
        "content": "file content change",
        "mode": "file mode change",
        "type": "file type change",
        "owner": "file owner (user/group) change",
        "user": "file user change",
        "group": "file group change",
        "link": "file link change",
        "directory": "file directory change",
        "blkdev": "file block device change",
        "chrdev": "file character device change",
        "fifo": "file fifo change",
        "mtime": "file modification time change",
        "ctime": "file change time change",
        "isomtime": "file modification time change (ISO 8601)",
        "isoctime": "file change time change (ISO 8601)",
    }
    KEY_GROUPS = (
        ("path", "change"),
        ("content", "mode", "type", "owner", "group", "user"),
        ("link", "directory", "blkdev", "chrdev", "fifo"),
        ("mtime", "ctime", "isomtime", "isoctime"),
    )
    # keys suppressed when --content-only is given (metadata-only changes)
    METADATA = ("mode", "type", "owner", "group", "user", "mtime", "ctime")

    def __init__(self, format, content_only=False):
        """
        :param format: user-supplied format string; may be None/empty, in which case the
                       default diff format is used.
        :param content_only: if True, metadata change keys (see METADATA) are not rendered.
        """
        # Resolve the default once, *before* parsing: the original parsed the raw
        # ``format`` argument, so ``Formatter().parse(None)`` raised TypeError whenever
        # the caller relied on the default that super().__init__ applied.
        format = format or "{content}{link}{directory}{blkdev}{chrdev}{fifo} {path}{NL}"
        static_data = {}
        static_data.update(self.FIXED_KEYS)
        super().__init__(format, static_data)
        self.content_only = content_only
        # names of all replacement fields referenced by the format string
        self.format_keys = {f[1] for f in Formatter().parse(format)}
        self.call_keys = {
            "content": self.format_content,
            "mode": self.format_mode,
            "type": partial(self.format_mode, filetype=True),
            "owner": self.format_owner,
            "group": partial(self.format_owner, spec="group"),
            "user": partial(self.format_owner, spec="user"),
            "link": partial(self.format_other, "link"),
            "directory": partial(self.format_other, "directory"),
            "blkdev": partial(self.format_other, "blkdev"),
            "chrdev": partial(self.format_other, "chrdev"),
            "fifo": partial(self.format_other, "fifo"),
            "mtime": partial(self.format_time, "mtime"),
            "ctime": partial(self.format_time, "ctime"),
            "isomtime": partial(self.format_iso_time, "mtime"),
            "isoctime": partial(self.format_iso_time, "ctime"),
        }
        # only evaluate the keys the format string actually uses
        self.used_call_keys = set(self.call_keys) & self.format_keys
        if self.content_only:
            self.used_call_keys -= set(self.METADATA)

    def get_item_data(self, item: "ItemDiff", jsonline=False) -> dict:
        """Build the mapping consumed by ``format.format_map`` (or the JSON line)."""
        diff_data = {}
        for key in self.used_call_keys:
            diff_data[key] = self.call_keys[key](item)
        # "{change}" aggregates every non-ISO change rendering (metadata filtered if requested)
        change = []
        for key in self.call_keys:
            if key in ("isomtime", "isoctime"):
                continue
            if self.content_only and key in self.METADATA:
                continue
            change.append(self.call_keys[key](item))
        diff_data["change"] = " ".join([v for v in change if v])
        diff_data["path"] = item.path
        diff_data.update({} if jsonline else self.static_data)
        return diff_data

    def format_other(self, key, diff: "ItemDiff"):
        """Render presence-style changes (link/directory/blkdev/chrdev/fifo)."""
        change = diff.changes().get(key)
        return f"{change.diff_type}".ljust(27) if change else ""  # 27 is the length of the content change

    def format_mode(self, diff: "ItemDiff", filetype=False):
        """Render a mode (or, with filetype=True, a file type) change as ``[old -> new]``."""
        change = diff.type() if filetype else diff.mode()
        return f"[{change.diff_data['item1']} -> {change.diff_data['item2']}]" if change else ""

    def format_owner(self, diff: "ItemDiff", spec: Literal["owner", "user", "group"] = "owner"):
        """Render owner change: user only, group only, or combined ``[user:group -> user:group]``."""
        if spec == "user":
            change = diff.user()
            return f"[{change.diff_data['item1']} -> {change.diff_data['item2']}]" if change else ""
        if spec == "group":
            change = diff.group()
            return f"[{change.diff_data['item1']} -> {change.diff_data['item2']}]" if change else ""
        if spec != "owner":
            raise ValueError(f"Invalid owner spec: {spec}")
        change = diff.owner()
        if change:
            return "[{}:{} -> {}:{}]".format(
                change.diff_data["item1"][0],
                change.diff_data["item1"][1],
                change.diff_data["item2"][0],
                change.diff_data["item2"][1],
            )
        return ""

    def format_content(self, diff: "ItemDiff"):
        """Render a content change with human-readable sizes (added/removed/modified)."""
        change = diff.content()
        if change:
            if change.diff_type == "added":
                return "{}: {:>20}".format(change.diff_type, format_file_size(change.diff_data["added"]))
            if change.diff_type == "removed":
                return "{}: {:>18}".format(change.diff_type, format_file_size(change.diff_data["removed"]))
            if "added" not in change.diff_data and "removed" not in change.diff_data:
                # chunker params differ between archives, so sizes could not be computed
                return "modified: (can't get size)"
            return "{}: {:>8} {:>8}".format(
                change.diff_type,
                format_file_size(change.diff_data["added"], precision=1, sign=True),
                format_file_size(-change.diff_data["removed"], precision=1, sign=True),
            )
        return ""

    def format_time(self, key, diff: "ItemDiff"):
        """Render an mtime/ctime change using the stored datetime reprs."""
        change = diff.changes().get(key)
        return f"[{key}: {change.diff_data['item1']} -> {change.diff_data['item2']}]" if change else ""

    def format_iso_time(self, key, diff: "ItemDiff"):
        """Render an mtime/ctime change in ISO 8601 form."""
        change = diff.changes().get(key)
        return (
            f"[{key}: {change.diff_data['item1'].isoformat()} -> {change.diff_data['item2'].isoformat()}]"
            if change
            else ""
        )
def file_status(mode):
if stat.S_ISREG(mode):
return "A"

View File

@ -1,4 +1,4 @@
from typing import FrozenSet, Set, NamedTuple, Tuple, Mapping, Dict, List, Iterator, Callable, Any
from typing import FrozenSet, Set, NamedTuple, Tuple, Mapping, Dict, List, Iterator, Callable, Any, Optional
from .helpers import StableDict
@ -247,9 +247,36 @@ class ManifestItem(PropDict):
@item_keys.setter
def item_keys(self, val: Tuple) -> None: ...
class DiffChange:
    # Type stub for the DiffChange class implemented in the compiled item module
    # (stores one change of an item diff; presumably mirrors item.pyx — verify there).
    diff_type: str
    diff_data: Dict[str, Any]
    def __init__(self, diff_type: str, diff_data: Optional[Dict[str, Any]] = ...) -> None: ...
    def to_dict(self) -> Dict[str, Any]: ...
class ItemDiff:
def __init__(self, *args, **kwargs) -> None: ...
def _chunk_content_equal(self, c1: Iterator, c2: Iterator) -> bool: ...
path: str
def __init__(
self,
path: str,
item1: Item,
item2: Item,
chunk_1: Iterator,
chunk_2: Iterator,
numeric_ids: bool = ...,
can_compare_chunk_ids: bool = ...,
) -> None: ...
def changes(self) -> Dict[str, DiffChange]: ...
def equal(self, content_only: bool = ...) -> bool: ...
def content(self) -> Optional[DiffChange]: ...
def ctime(self) -> Optional[DiffChange]: ...
def mtime(self) -> Optional[DiffChange]: ...
def mode(self) -> Optional[DiffChange]: ...
def type(self) -> Optional[DiffChange]: ...
def owner(self) -> Optional[DiffChange]: ...
def user(self) -> Optional[DiffChange]: ...
def group(self) -> Optional[DiffChange]: ...
def chunk_content_equal(chunks_a: Iterator, chunks_b: Iterator) -> bool: ...
class Key(PropDict):
@property

View File

@ -620,66 +620,76 @@ cpdef _init_names():
_init_names()
class DiffChange:
    """
    A single change detected while diffing two archive items.

    ``diff_type`` names the kind of change, e.g. "added", "removed", "modified".
    ``diff_data`` carries optional extra detail about it, e.g. the old and new mode.
    """

    def __init__(self, diff_type, diff_data=None):
        self.diff_type = diff_type
        self.diff_data = diff_data or {}

    def to_dict(self):
        # serialize with the change kind first, then any detail fields
        result = {"type": self.diff_type}
        result.update(self.diff_data)
        return result
class ItemDiff:
"""
Comparison of two items from different archives.
The items may have different paths and still be considered equal (e.g. for renames).
It does not include extended or time attributes in the comparison.
"""
def __init__(self, item1, item2, chunk_iterator1, chunk_iterator2, numeric_ids=False, can_compare_chunk_ids=False, content_only=False):
def __init__(self, path, item1, item2, chunk_1, chunk_2, numeric_ids=False, can_compare_chunk_ids=False):
self.path = path
self._item1 = item1
self._item2 = item2
self._content_only = content_only
self._numeric_ids = numeric_ids
self._can_compare_chunk_ids = can_compare_chunk_ids
self.equal = self._equal(chunk_iterator1, chunk_iterator2)
changes = []
self._chunk_1 = chunk_1
self._chunk_2 = chunk_2
self._changes = {}
if self._item1.is_link() or self._item2.is_link():
changes.append(self._link_diff())
self._link_diff()
if 'chunks' in self._item1 and 'chunks' in self._item2:
changes.append(self._content_diff())
self._content_diff()
if self._item1.is_dir() or self._item2.is_dir():
changes.append(self._presence_diff('directory'))
self._presence_diff('directory')
if self._item1.is_blk() or self._item2.is_blk():
changes.append(self._presence_diff('blkdev'))
self._presence_diff('blkdev')
if self._item1.is_chr() or self._item2.is_chr():
changes.append(self._presence_diff('chrdev'))
self._presence_diff('chrdev')
if self._item1.is_fifo() or self._item2.is_fifo():
changes.append(self._presence_diff('fifo'))
self._presence_diff('fifo')
if not self._content_only:
if not (self._item1.get('deleted') or self._item2.get('deleted')):
changes.append(self._owner_diff())
changes.append(self._mode_diff())
changes.extend(self._time_diffs())
if not (self._item1.get('deleted') or self._item2.get('deleted')):
self._owner_diff()
self._mode_diff()
self._time_diffs()
# filter out empty changes
self._changes = [ch for ch in changes if ch]
def changes(self):
return self._changes
def __repr__(self):
if self.equal:
return 'equal'
return ' '.join(str for d, str in self._changes)
return (' '.join(self._changes.keys())) or 'equal'
def _equal(self, chunk_iterator1, chunk_iterator2):
def equal(self, content_only=False):
# if both are deleted, there is nothing at path regardless of what was deleted
if self._item1.get('deleted') and self._item2.get('deleted'):
return True
attr_list = ['deleted', 'target']
if not self._content_only:
if not content_only:
attr_list += ['mode', 'ctime', 'mtime']
attr_list += ['uid', 'gid'] if self._numeric_ids else ['user', 'group']
@ -693,74 +703,107 @@ class ItemDiff:
return False
if 'chunks' in self._item1 and 'chunks' in self._item2:
return self._content_equal(chunk_iterator1, chunk_iterator2)
return self._content_equal()
return True
def _presence_diff(self, item_type):
if not self._item1.get('deleted') and self._item2.get('deleted'):
chg = 'removed ' + item_type
return ({"type": chg}, chg)
self._changes[item_type] = DiffChange(f"removed {item_type}")
return True
if self._item1.get('deleted') and not self._item2.get('deleted'):
chg = 'added ' + item_type
return ({"type": chg}, chg)
self._changes[item_type] = DiffChange(f"added {item_type}")
return True
def _link_diff(self):
pd = self._presence_diff('link')
if pd is not None:
return pd
if self._presence_diff('link'):
return True
if 'target' in self._item1 and 'target' in self._item2 and self._item1.target != self._item2.target:
return ({"type": 'changed link'}, 'changed link')
self._changes['link'] = DiffChange('changed link')
return True
def _content_diff(self):
if self._item1.get('deleted'):
sz = self._item2.get_size()
return ({"type": "added", "size": sz}, 'added {:>13}'.format(format_file_size(sz)))
self._changes['content'] = DiffChange("added", {"added": sz, "removed": 0})
return True
if self._item2.get('deleted'):
sz = self._item1.get_size()
return ({"type": "removed", "size": sz}, 'removed {:>11}'.format(format_file_size(sz)))
self._changes['content'] = DiffChange("removed", {"added": 0, "removed": sz})
return True
if not self._can_compare_chunk_ids:
return ({"type": "modified"}, "modified")
self._changes['content'] = DiffChange("modified")
return True
chunk_ids1 = {c.id for c in self._item1.chunks}
chunk_ids2 = {c.id for c in self._item2.chunks}
added_ids = chunk_ids2 - chunk_ids1
removed_ids = chunk_ids1 - chunk_ids2
added = self._item2.get_size(consider_ids=added_ids)
removed = self._item1.get_size(consider_ids=removed_ids)
return ({"type": "modified", "added": added, "removed": removed},
'{:>9} {:>9}'.format(format_file_size(added, precision=1, sign=True),
format_file_size(-removed, precision=1, sign=True)))
self._changes['content'] = DiffChange("modified", {"added": added, "removed": removed})
return True
def _owner_diff(self):
u_attr, g_attr = ('uid', 'gid') if self._numeric_ids else ('user', 'group')
u1, g1 = self._item1.get(u_attr), self._item1.get(g_attr)
u2, g2 = self._item2.get(u_attr), self._item2.get(g_attr)
if (u1, g1) != (u2, g2):
return ({"type": "owner", "old_user": u1, "old_group": g1, "new_user": u2, "new_group": g2},
'[{}:{} -> {}:{}]'.format(u1, g1, u2, g2))
if (u1, g1) == (u2, g2):
return False
self._changes['owner'] = DiffChange("changed owner", {"item1": (u1, g1), "item2": (u2, g2)})
if u1 != u2:
self._changes['user'] = DiffChange("changed user", {"item1": u1, "item2": u2})
if g1 != g2:
self._changes['group'] = DiffChange("changed group", {"item1": g1, "item2": g2})
return True
def _mode_diff(self):
if 'mode' in self._item1 and 'mode' in self._item2 and self._item1.mode != self._item2.mode:
mode1 = stat.filemode(self._item1.mode)
mode2 = stat.filemode(self._item2.mode)
return ({"type": "mode", "old_mode": mode1, "new_mode": mode2}, '[{} -> {}]'.format(mode1, mode2))
self._changes['mode'] = DiffChange("changed mode", {"item1": mode1, "item2": mode2})
if mode1[0] != mode2[0]:
self._changes['type'] = DiffChange("changed type", {"item1": mode1[0], "item2": mode2[0]})
def _time_diffs(self):
changes = []
attrs = ["ctime", "mtime"]
for attr in attrs:
if attr in self._item1 and attr in self._item2 and self._item1.get(attr) != self._item2.get(attr):
ts1 = OutputTimestamp(safe_timestamp(self._item1.get(attr)))
ts2 = OutputTimestamp(safe_timestamp(self._item2.get(attr)))
changes.append(({"type": attr, f"old_{attr}": ts1, f"new_{attr}": ts2}, '[{}: {} -> {}]'.format(attr, ts1, ts2)))
return changes
self._changes[attr] = DiffChange(attr, {"item1": ts1, "item2": ts2},)
return True
def _content_equal(self, chunk_iterator1, chunk_iterator2):
def content(self):
return self._changes.get('content')
def ctime(self):
return self._changes.get('ctime')
def mtime(self):
return self._changes.get('mtime')
def mode(self):
return self._changes.get('mode')
def type(self):
return self._changes.get('type')
def owner(self):
return self._changes.get('owner')
def user(self):
return self._changes.get('user')
def group(self):
return self._changes.get('group')
def _content_equal(self):
if self._can_compare_chunk_ids:
return self._item1.chunks == self._item2.chunks
if self._item1.get_size() != self._item2.get_size():
return False
return chunks_contents_equal(chunk_iterator1, chunk_iterator2)
return chunks_contents_equal(self._chunk_1, self._chunk_2)
def chunks_contents_equal(chunks_a, chunks_b):

View File

@ -72,22 +72,20 @@ class ArchiverTestCase(ArchiverTestCaseBase):
self.cmd(f"--repo={self.repository_location}", "create", "test1b", "input", "--chunker-params", "16,18,17,4095")
def do_asserts(output, can_compare_ids, content_only=False):
# File contents changed (deleted and replaced with a new file)
change = "B" if can_compare_ids else "{:<19}".format("modified")
lines = output.splitlines()
lines: list = output.splitlines()
assert "file_replaced" in output # added to debug #3494
change = "modified.*B" if can_compare_ids else r"modified: \(can't get size\)"
self.assert_line_exists(lines, f"{change}.*input/file_replaced")
# File unchanged
assert "input/file_unchanged" not in output
# Directory replaced with a regular file
if "BORG_TESTS_IGNORE_MODES" not in os.environ and not is_win32 and not content_only:
self.assert_line_exists(lines, "drwxr-xr-x -> -rwxr-xr-x.*input/dir_replaced_with_file")
self.assert_line_exists(lines, "[drwxr-xr-x -> -rwxr-xr-x].*input/dir_replaced_with_file")
# Basic directory cases
assert "added directory input/dir_added" in output
assert "removed directory input/dir_removed" in output
assert "added directory input/dir_added" in output
assert "removed directory input/dir_removed" in output
if are_symlinks_supported():
# Basic symlink cases
@ -96,8 +94,9 @@ class ArchiverTestCase(ArchiverTestCaseBase):
self.assert_line_exists(lines, "removed link.*input/link_removed")
# Symlink replacing or being replaced
assert "input/dir_replaced_with_link" in output
assert "input/link_replaced_by_file" in output
if not content_only:
assert "input/dir_replaced_with_link" in output
assert "input/link_replaced_by_file" in output
# Symlink target removed. Should not affect the symlink at all.
assert "input/link_target_removed" not in output
@ -105,7 +104,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
# The inode has two links and the file contents changed. Borg
# should notice the changes in both links. However, the symlink
# pointing to the file is not changed.
change = "0 B" if can_compare_ids else "{:<19}".format("modified")
change = "modified.*0 B" if can_compare_ids else r"modified: \(can't get size\)"
self.assert_line_exists(lines, f"{change}.*input/empty")
if are_hardlinks_supported():
self.assert_line_exists(lines, f"{change}.*input/hardlink_contents_changed")
@ -114,18 +113,18 @@ class ArchiverTestCase(ArchiverTestCaseBase):
# Added a new file and a hard link to it. Both links to the same
# inode should appear as separate files.
assert "added 2.05 kB input/file_added" in output
assert "added: 2.05 kB input/file_added" in output
if are_hardlinks_supported():
assert "added 2.05 kB input/hardlink_added" in output
assert "added: 2.05 kB input/hardlink_added" in output
# check if a diff between nonexistent and empty new file is found
assert "added 0 B input/file_empty_added" in output
assert "added: 0 B input/file_empty_added" in output
# The inode has two links and both of them are deleted. They should
# appear as two deleted files.
assert "removed 256 B input/file_removed" in output
assert "removed: 256 B input/file_removed" in output
if are_hardlinks_supported():
assert "removed 256 B input/hardlink_removed" in output
assert "removed: 256 B input/hardlink_removed" in output
if are_hardlinks_supported() and content_only:
# Another link (marked previously as the source in borg) to the
@ -143,7 +142,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
chgsets = [j["changes"] for j in data if j["path"] == filename]
assert len(chgsets) < 2
# return a flattened list of changes for given filename
return [chg for chgset in chgsets for chg in chgset]
return sum(chgsets, [])
# convert output to list of dicts
joutput = [json.loads(line) for line in output.split("\n") if line]
@ -157,7 +156,7 @@ class ArchiverTestCase(ArchiverTestCaseBase):
# Directory replaced with a regular file
if "BORG_TESTS_IGNORE_MODES" not in os.environ and not is_win32 and not content_only:
assert {"type": "mode", "old_mode": "drwxr-xr-x", "new_mode": "-rwxr-xr-x"} in get_changes(
assert {"type": "changed mode", "item1": "drwxr-xr-x", "item2": "-rwxr-xr-x"} in get_changes(
"input/dir_replaced_with_file", joutput
)
@ -175,11 +174,11 @@ class ArchiverTestCase(ArchiverTestCaseBase):
if not content_only:
assert any(
chg["type"] == "mode" and chg["new_mode"].startswith("l")
chg["type"] == "changed mode" and chg["item1"].startswith("d") and chg["item2"].startswith("l")
for chg in get_changes("input/dir_replaced_with_link", joutput)
), get_changes("input/dir_replaced_with_link", joutput)
assert any(
chg["type"] == "mode" and chg["old_mode"].startswith("l")
chg["type"] == "changed mode" and chg["item1"].startswith("l") and chg["item2"].startswith("-")
for chg in get_changes("input/link_replaced_by_file", joutput)
), get_changes("input/link_replaced_by_file", joutput)
@ -198,18 +197,18 @@ class ArchiverTestCase(ArchiverTestCaseBase):
# Added a new file and a hard link to it. Both links to the same
# inode should appear as separate files.
assert {"type": "added", "size": 2048} in get_changes("input/file_added", joutput)
assert {"added": 2048, "removed": 0, "type": "added"} in get_changes("input/file_added", joutput)
if are_hardlinks_supported():
assert {"type": "added", "size": 2048} in get_changes("input/hardlink_added", joutput)
assert {"added": 2048, "removed": 0, "type": "added"} in get_changes("input/hardlink_added", joutput)
# check if a diff between nonexistent and empty new file is found
assert {"type": "added", "size": 0} in get_changes("input/file_empty_added", joutput)
assert {"added": 0, "removed": 0, "type": "added"} in get_changes("input/file_empty_added", joutput)
# The inode has two links and both of them are deleted. They should
# appear as two deleted files.
assert {"type": "removed", "size": 256} in get_changes("input/file_removed", joutput)
assert {"added": 0, "removed": 256, "type": "removed"} in get_changes("input/file_removed", joutput)
if are_hardlinks_supported():
assert {"type": "removed", "size": 256} in get_changes("input/hardlink_removed", joutput)
assert {"added": 0, "removed": 256, "type": "removed"} in get_changes("input/hardlink_removed", joutput)
if are_hardlinks_supported() and content_only:
# Another link (marked previously as the source in borg) to the
@ -251,14 +250,28 @@ class ArchiverTestCase(ArchiverTestCaseBase):
time.sleep(1) # HFS has a 1s timestamp granularity
self.create_regular_file("test_file", size=15)
self.cmd(f"--repo={self.repository_location}", "create", "archive2", "input")
output = self.cmd(f"--repo={self.repository_location}", "diff", "archive1", "archive2")
output = self.cmd(
f"--repo={self.repository_location}",
"diff",
"archive1",
"archive2",
"--format",
"'{mtime}{ctime} {path}{NL}'",
)
self.assert_in("mtime", output)
self.assert_in("ctime", output) # Should show up on windows as well since it is a new file.
if is_darwin:
time.sleep(1) # HFS has a 1s timestamp granularity
os.chmod("input/test_file", 0o777)
self.cmd(f"--repo={self.repository_location}", "create", "archive3", "input")
output = self.cmd(f"--repo={self.repository_location}", "diff", "archive2", "archive3")
output = self.cmd(
f"--repo={self.repository_location}",
"diff",
"archive2",
"archive3",
"--format",
"'{mtime}{ctime} {path}{NL}'",
)
self.assert_not_in("mtime", output)
# Checking platform because ctime should not be shown on windows since it wasn't recreated.
if not is_win32:
@ -294,7 +307,10 @@ class ArchiverTestCase(ArchiverTestCaseBase):
"e_file_changed",
"f_file_removed",
]
assert all(x in line for x, line in zip(expected, output.splitlines()))
assert isinstance(output, str)
outputs = output.splitlines()
assert len(outputs) == len(expected)
assert all(x in line for x, line in zip(expected, outputs))
class RemoteArchiverTestCase(RemoteArchiverTestCaseBase, ArchiverTestCase):