borg/src/borg/archiver/tar_cmds.py

import argparse
import base64
import logging
import os
import stat
import tarfile
import time

from ..archive import Archive, TarfileObjectProcessors, ChunksProcessor
from ..compress import CompressionSpec
from ..constants import *  # NOQA
from ..helpers import HardLinkManager
from ..helpers import ProgressIndicatorPercent
from ..helpers import dash_open
from ..helpers import msgpack
from ..helpers import create_filter_process
from ..helpers import ChunkIteratorFileWrapper
from ..helpers import archivename_validator, comment_validator, PathSpec, ChunkerParams
from ..helpers import remove_surrogates
from ..helpers import timestamp, archive_ts_now
from ..helpers import basic_json_data, json_print
from ..helpers import log_multi
from ..manifest import Manifest

from ._common import with_repository, with_archive, Highlander, define_exclusion_group
from ._common import build_matcher, build_filter

from ..logger import create_logger

# Module-level logger, used below for the debug message about the auto-detected tar filter.
logger = create_logger(__name__)

# Python 3.12+ gives a deprecation warning if TarFile.extraction_filter is None.
# https://docs.python.org/3.12/library/tarfile.html#tarfile-extraction-filter
# The hasattr() guard makes this a no-op when the running tarfile module does not offer
# fully_trusted_filter; when it does, that filter is installed as the class-wide default
# for every TarFile, at import time of this module.
if hasattr(tarfile, "fully_trusted_filter"):
    tarfile.TarFile.extraction_filter = staticmethod(tarfile.fully_trusted_filter)  # type: ignore


def get_tar_filter(fname, decompress):
    """
    Derive the external (de)compression command from the file extension of *fname*.

    Return the program name, with " -d" appended when *decompress* is true, or None when
    *fname* carries none of the known compressed-tarball suffixes (this includes '-').
    Matching is case-sensitive. The outcome is also written to the debug log.
    """
    # (suffixes, program) pairs; the suffix groups do not overlap, so the order is irrelevant.
    known_suffixes = (
        ((".tar.gz", ".tgz"), "gzip"),
        ((".tar.bz2", ".tbz"), "bzip2"),
        ((".tar.xz", ".txz"), "xz"),
        ((".tar.lz4",), "lz4"),
        ((".tar.zstd", ".tar.zst"), "zstd"),
    )
    command = None
    for suffixes, program in known_suffixes:
        if fname.endswith(suffixes):
            command = f"{program} -d" if decompress else program
            break
    logger.debug("Automatically determined tar filter: %s", command)
    return command


class TarMixIn:
    @with_repository(compatibility=(Manifest.Operation.READ,))
    @with_archive
    def do_export_tar(self, args, repository, manifest, archive):
        """Export archive contents as a tarball"""
        # Design note on tar_filter vs. the tarfile module's own compression:
        #
        # Python's tarfile module could compress by itself via the builtin gzip, bz2 and
        # lzma modules (tar modes such as "w:xz"). We do not use that, for three reasons:
        #
        # - the compressor would run on the same thread as the program using the tarfile,
        #   stealing valuable CPU time from Borg and thus reducing throughput,
        # - it limits the available options - what about lz4? Brotli? zstd?
        # - systems can ship more optimized versions than those built into Python,
        #   e.g. pigz or pxz, which can use more than one thread for compression.
        #
        # An external filter program has none of these drawbacks. Its only requirement is
        # that it has to be installed -- hardly a problem, considering that the decompressor
        # must be installed as well to make use of the exported tarball!
        self.output_list = args.output_list

        if args.tar_filter == "auto":
            tar_filter = get_tar_filter(args.tarfile, decompress=False)
        else:
            tar_filter = args.tar_filter

        # Closing the output is only requested when FILE is not "-".
        close_when_done = args.tarfile != "-"
        out_stream = dash_open(args.tarfile, "wb")

        with create_filter_process(
            tar_filter, stream=out_stream, stream_close=close_when_done, inbound=False
        ) as filtered_stream:
            self._export_tar(args, archive, filtered_stream)

    def _export_tar(self, args, archive, tarstream):
        """
        Write the items of *archive* selected by *args* as a tar stream to *tarstream*.

        From *args*, the attributes patterns, paths, progress, output_list, strip_components
        and tar_format are read. *tarstream* is a writable binary file object; it is only
        written to sequentially and it is not closed here.

        Items of a file type that cannot be represented in a tarball are skipped with a
        warning. After the export, a warning is emitted for every include pattern that
        never matched an item.
        """
        # This name is not among the module's explicit top-level imports, so import it here;
        # otherwise the warning loop at the end could fail with a NameError.
        from ..helpers import IncludePatternNeverMatchedWarning

        matcher = build_matcher(args.patterns, args.paths)

        progress = args.progress
        output_list = args.output_list
        strip_components = args.strip_components
        hlm = HardLinkManager(id_type=bytes, info_type=str)  # hlid -> path

        item_filter = build_filter(matcher, strip_components)

        # The | (pipe) symbol instructs tarfile to use a streaming mode of operation
        # where it never seeks on the passed fileobj.
        # Note: the BORG format is written as PAX; it only differs by the additional pax headers.
        tar_format = dict(GNU=tarfile.GNU_FORMAT, PAX=tarfile.PAX_FORMAT, BORG=tarfile.PAX_FORMAT)[args.tar_format]
        tar = tarfile.open(fileobj=tarstream, mode="w|", format=tar_format)

        if progress:
            pi = ProgressIndicatorPercent(msg="%5.1f%% Processing: %s", step=0.1, msgid="extract")
            pi.output("Calculating size")
            # One extra pass over the selected items, just to learn the total size.
            extracted_size = sum(item.get_size() for item in archive.iter_items(item_filter))
            pi.total = extracted_size
        else:
            pi = None

        def item_content_stream(item):
            """
            Return a file-like object that reads from the chunks of *item*.

            With progress display enabled, every read also advances the progress indicator.
            """
            chunk_iterator = archive.pipeline.fetch_many(
                [chunk_id for chunk_id, _ in item.chunks], is_preloaded=True, ro_type=ROBJ_FILE_STREAM
            )
            if pi:
                info = [remove_surrogates(item.path)]
                return ChunkIteratorFileWrapper(
                    chunk_iterator, lambda read_bytes: pi.show(increase=len(read_bytes), info=info)
                )
            else:
                return ChunkIteratorFileWrapper(chunk_iterator)

        def item_to_tarinfo(item):
            """
            Transform a Borg *item* into a tarfile.TarInfo object.

            Return a tuple (tarinfo, stream), where stream may be a file-like object that represents
            the file contents, if any, and is None otherwise. When *tarinfo* is None, the *item*
            cannot be represented as a TarInfo object and should be skipped.
            """
            stream = None
            tarinfo = tarfile.TarInfo()
            tarinfo.name = item.path
            tarinfo.mtime = item.mtime / 1e9  # item.mtime is in nanoseconds, tar wants seconds
            tarinfo.mode = stat.S_IMODE(item.mode)
            tarinfo.uid = item.get("uid", 0)
            tarinfo.gid = item.get("gid", 0)
            tarinfo.uname = item.get("user", "")
            tarinfo.gname = item.get("group", "")
            # The linkname in tar has 2 uses:
            # for symlinks it means the destination, while for hardlinks it refers to the file.
            # Since hardlinks in tar have a different type code (LNKTYPE) the format might
            # support hardlinking arbitrary objects (including symlinks and directories), but
            # whether implementations actually support that is a whole different question...
            tarinfo.linkname = ""

            modebits = stat.S_IFMT(item.mode)
            if modebits == stat.S_IFREG:
                tarinfo.type = tarfile.REGTYPE
                if "hlid" in item:
                    linkname = hlm.retrieve(id=item.hlid)
                    if linkname is not None:
                        # the first hardlink was already added to the archive, add a tar-hardlink reference to it.
                        tarinfo.type = tarfile.LNKTYPE
                        tarinfo.linkname = linkname
                    else:
                        # first occurrence of this hardlink group: store the content and
                        # remember the path, so that later members can refer to it.
                        tarinfo.size = item.get_size()
                        stream = item_content_stream(item)
                        hlm.remember(id=item.hlid, info=item.path)
                else:
                    tarinfo.size = item.get_size()
                    stream = item_content_stream(item)
            elif modebits == stat.S_IFDIR:
                tarinfo.type = tarfile.DIRTYPE
            elif modebits == stat.S_IFLNK:
                tarinfo.type = tarfile.SYMTYPE
                tarinfo.linkname = item.target
            elif modebits == stat.S_IFBLK:
                tarinfo.type = tarfile.BLKTYPE
                tarinfo.devmajor = os.major(item.rdev)
                tarinfo.devminor = os.minor(item.rdev)
            elif modebits == stat.S_IFCHR:
                tarinfo.type = tarfile.CHRTYPE
                tarinfo.devmajor = os.major(item.rdev)
                tarinfo.devminor = os.minor(item.rdev)
            elif modebits == stat.S_IFIFO:
                tarinfo.type = tarfile.FIFOTYPE
            else:
                self.print_warning(
                    "%s: unsupported file type %o for tar export", remove_surrogates(item.path), modebits
                )
                return None, stream
            return tarinfo, stream

        def item_to_paxheaders(tar_format_name, item):
            """
            Transform (parts of) a Borg *item* into a pax_headers dict.

            *tar_format_name* is the value of the tar_format option ("PAX" or "BORG").
            """
            # PAX format
            # ----------
            # When using the PAX (POSIX) format, we can support some things that aren't possible
            # with classic tar formats, including GNU tar, such as:
            # - atime, ctime (DONE)
            # - possibly Linux capabilities, security.* xattrs (TODO)
            # - various additions supported by GNU tar in POSIX mode (TODO)
            #
            # BORG format
            # -----------
            # This is based on PAX, but additionally adds BORG.* pax headers.
            # Additionally to the standard tar / PAX metadata and data, it transfers
            # ALL borg item metadata in a BORG specific way.
            #
            ph = {}
            # note: for mtime this is a bit redundant as it is already done by tarfile module,
            #       but we just do it in our way to be consistent for sure.
            for name in "atime", "ctime", "mtime":
                if hasattr(item, name):
                    ns = getattr(item, name)
                    ph[name] = str(ns / 1e9)
            if tar_format_name == "BORG":  # BORG format additions
                ph["BORG.item.version"] = "1"
                # BORG.item.meta - just serialize all metadata we have:
                meta_bin = msgpack.packb(item.as_dict())
                meta_text = base64.b64encode(meta_bin).decode()
                ph["BORG.item.meta"] = meta_text
            return ph

        for item in archive.iter_items(item_filter, preload=True):
            orig_path = item.path
            if strip_components:
                # The tar member name loses its leading path components, while the
                # list output below still shows the original path.
                item.path = os.sep.join(orig_path.split(os.sep)[strip_components:])
            tarinfo, stream = item_to_tarinfo(item)
            if tarinfo:
                if args.tar_format in ("BORG", "PAX"):
                    tarinfo.pax_headers = item_to_paxheaders(args.tar_format, item)
                if output_list:
                    logging.getLogger("borg.output.list").info(remove_surrogates(orig_path))
                tar.addfile(tarinfo, stream)

        if pi:
            pi.finish()

        # This does not close the fileobj (tarstream) we passed to it -- a side effect of the | mode.
        tar.close()

        for pattern in matcher.get_unmatched_include_patterns():
            self.print_warning_instance(IncludePatternNeverMatchedWarning(pattern))

    @with_repository(cache=True, exclusive=True, compatibility=(Manifest.Operation.WRITE,))
    def do_import_tar(self, args, repository, manifest, cache):
        """Create a backup archive from a tarball"""
        self.output_filter = args.output_filter
        self.output_list = args.output_list

        # "auto" derives the decompressor from the file extension of the tarball,
        # any other value is taken as the filter given by the user.
        if args.tar_filter == "auto":
            tar_filter = get_tar_filter(args.tarfile, decompress=True)
        else:
            tar_filter = args.tar_filter

        # Closing the input is only requested when TARFILE is not "-".
        close_when_done = args.tarfile != "-"
        in_stream = dash_open(args.tarfile, "rb")

        with create_filter_process(
            tar_filter, stream=in_stream, stream_close=close_when_done, inbound=True
        ) as filtered_stream:
            self._import_tar(args, repository, manifest, manifest.key, cache, filtered_stream)

    def _import_tar(self, args, repository, manifest, key, cache, tarstream):
        """
        Read an uncompressed tar stream from *tarstream* and save its members as a new archive.

        The archive is named args.name. Every member is handed to the processor matching
        its type and its status is reported via print_file_status; members of an
        unsupported type get a warning and the status "E". Finally, the archive is saved
        and, if requested by --stats or --json, its statistics are output.
        """
        started = archive_ts_now()
        started_monotonic = time.monotonic()
        show_progress = args.progress
        chunker_params = args.chunker_params
        log_json = args.log_json

        archive = Archive(
            manifest,
            args.name,
            cache=cache,
            create=True,
            progress=show_progress,
            chunker_params=chunker_params,
            start=started,
            start_monotonic=started_monotonic,
            log_json=log_json,
        )
        chunks_processor = ChunksProcessor(
            cache=cache,
            key=key,
            add_item=archive.add_item,
            prepare_checkpoint=archive.prepare_checkpoint,
            write_checkpoint=archive.write_checkpoint,
            checkpoint_interval=args.checkpoint_interval,
            checkpoint_volume=args.checkpoint_volume,
            rechunkify=False,
        )
        tfo = TarfileObjectProcessors(
            cache=cache,
            key=key,
            process_file_chunks=chunks_processor.process_file_chunks,
            add_item=archive.add_item,
            chunker_params=chunker_params,
            show_progress=show_progress,
            log_json=log_json,
            iec=args.iec,
            file_status_printer=self.print_file_status,
        )

        # "r|" is the streaming read mode of tarfile, the counterpart of the "w|" mode.
        tar = tarfile.open(fileobj=tarstream, mode="r|", ignore_zeros=args.ignore_zeros)

        def import_member(member):
            """Hand *member* to the processor for its type and return the resulting status."""
            if member.isreg():
                status = tfo.process_file(tarinfo=member, status="A", type=stat.S_IFREG, tar=tar)
                archive.stats.nfiles += 1
                return status
            if member.isdir():
                return tfo.process_dir(tarinfo=member, status="d", type=stat.S_IFDIR)
            if member.issym():
                return tfo.process_symlink(tarinfo=member, status="s", type=stat.S_IFLNK)
            if member.islnk():
                # tar uses a hardlink model like: the first instance of a hardlink is stored as a regular file,
                # later instances are special entries referencing back to the first instance.
                return tfo.process_hardlink(tarinfo=member, status="h", type=stat.S_IFREG)
            if member.isblk():
                return tfo.process_dev(tarinfo=member, status="b", type=stat.S_IFBLK)
            if member.ischr():
                return tfo.process_dev(tarinfo=member, status="c", type=stat.S_IFCHR)
            if member.isfifo():
                return tfo.process_fifo(tarinfo=member, status="f", type=stat.S_IFIFO)
            self.print_warning("%s: Unsupported tarinfo type %s", member.name, member.type)
            return "E"

        # tar.next() yields a false value once the stream is exhausted.
        tarinfo = tar.next()
        while tarinfo:
            member_status = import_member(tarinfo)
            self.print_file_status(member_status, tarinfo.name)
            tarinfo = tar.next()

        # This does not close the fileobj (tarstream) we passed to it -- a side effect of the | mode.
        tar.close()

        if show_progress:
            archive.stats.show_progress(final=True)
        archive.stats += tfo.stats
        archive.save(comment=args.comment, timestamp=args.timestamp)

        # --json implies --stats
        args.stats |= args.json
        if not args.stats:
            return
        if args.json:
            json_print(basic_json_data(archive.manifest, cache=archive.cache, extra={"archive": archive}))
            return
        log_multi(str(archive), str(archive.stats), logger=logging.getLogger("borg.output.stats"))

    def build_parser_tar(self, subparsers, common_parser, mid_common_parser):
        """
        Register the ``export-tar`` and ``import-tar`` subcommands on *subparsers*.

        *common_parser* is used as parent parser of both subcommands. *mid_common_parser*
        is accepted, but not used here. The docstrings of do_export_tar and do_import_tar
        are used as description (and for import-tar also as help) of the subcommands.
        """
        from ._common import process_epilog

        # ---- export-tar ----
        export_tar_epilog = process_epilog(
            """
        This command creates a tarball from an archive.

        When giving '-' as the output FILE, Borg will write a tar stream to standard output.

        By default (``--tar-filter=auto``) Borg will detect whether the FILE should be compressed
        based on its file extension and pipe the tarball through an appropriate filter
        before writing it to FILE:

        - .tar.gz or .tgz: gzip
        - .tar.bz2 or .tbz: bzip2
        - .tar.xz or .txz: xz
        - .tar.zstd or .tar.zst: zstd
        - .tar.lz4: lz4

        Alternatively, a ``--tar-filter`` program may be explicitly specified. It should
        read the uncompressed tar stream from stdin and write a compressed/filtered
        tar stream to stdout.

        Depending on the ``--tar-format`` option, these formats are created:

        +--------------+---------------------------+----------------------------+
        | --tar-format | Specification             | Metadata                   |
        +--------------+---------------------------+----------------------------+
        | BORG         | BORG specific, like PAX   | all as supported by borg   |
        +--------------+---------------------------+----------------------------+
        | PAX          | POSIX.1-2001 (pax) format | GNU + atime/ctime/mtime ns |
        +--------------+---------------------------+----------------------------+
        | GNU          | GNU tar format            | mtime s, no atime/ctime,   |
        |              |                           | no ACLs/xattrs/bsdflags    |
        +--------------+---------------------------+----------------------------+

        A ``--sparse`` option (as found in borg extract) is not supported.

        By default the entire archive is extracted but a subset of files and directories
        can be selected by passing a list of ``PATHs`` as arguments.
        The file selection can further be restricted by using the ``--exclude`` option.

        For more help on include/exclude patterns, see the :ref:`borg_patterns` command output.

        ``--progress`` can be slower than no progress display, since it makes one additional
        pass over the archive metadata.
        """
        )
        subparser = subparsers.add_parser(
            "export-tar",
            parents=[common_parser],
            add_help=False,
            description=self.do_export_tar.__doc__,
            epilog=export_tar_epilog,
            formatter_class=argparse.RawDescriptionHelpFormatter,
            help="create tarball from archive",
        )
        subparser.set_defaults(func=self.do_export_tar)
        subparser.add_argument(
            "--tar-filter",
            dest="tar_filter",
            default="auto",
            action=Highlander,
            help="filter program to pipe data through",
        )
        subparser.add_argument(
            "--list", dest="output_list", action="store_true", help="output verbose list of items (files, dirs, ...)"
        )
        subparser.add_argument(
            "--tar-format",
            metavar="FMT",
            dest="tar_format",
            default="GNU",
            choices=("BORG", "PAX", "GNU"),
            action=Highlander,
            help="select tar format: BORG, PAX or GNU",
        )
        subparser.add_argument("name", metavar="NAME", type=archivename_validator, help="specify the archive name")
        subparser.add_argument("tarfile", metavar="FILE", help='output tar file. "-" to write to stdout instead.')
        subparser.add_argument(
            "paths", metavar="PATH", nargs="*", type=PathSpec, help="paths to extract; patterns are supported"
        )
        define_exclusion_group(subparser, strip_components=True)

        # ---- import-tar ----
        import_tar_epilog = process_epilog(
            """
        This command creates a backup archive from a tarball.

        When giving '-' as path, Borg will read a tar stream from standard input.

        By default (--tar-filter=auto) Borg will detect whether the file is compressed
        based on its file extension and pipe the file through an appropriate filter:

        - .tar.gz or .tgz: gzip -d
        - .tar.bz2 or .tbz: bzip2 -d
        - .tar.xz or .txz: xz -d
        - .tar.zstd or .tar.zst: zstd -d
        - .tar.lz4: lz4 -d

        Alternatively, a --tar-filter program may be explicitly specified. It should
        read compressed data from stdin and output an uncompressed tar stream on
        stdout.

        Most documentation of borg create applies. Note that this command does not
        support excluding files.

        A ``--sparse`` option (as found in borg create) is not supported.

        About tar formats and metadata conservation or loss, please see ``borg export-tar``.

        import-tar reads these tar formats:

        - BORG: borg specific (PAX-based)
        - PAX: POSIX.1-2001
        - GNU: GNU tar
        - POSIX.1-1988 (ustar)
        - UNIX V7 tar
        - SunOS tar with extended attributes

        To import multiple tarballs into a single archive, they can be simply
        concatenated (e.g. using "cat") into a single file, and imported with an
        ``--ignore-zeros`` option to skip through the stop markers between them.
        """
        )
        subparser = subparsers.add_parser(
            "import-tar",
            parents=[common_parser],
            add_help=False,
            description=self.do_import_tar.__doc__,
            epilog=import_tar_epilog,
            formatter_class=argparse.RawDescriptionHelpFormatter,
            help=self.do_import_tar.__doc__,
        )
        subparser.set_defaults(func=self.do_import_tar)
        subparser.add_argument(
            "--tar-filter",
            dest="tar_filter",
            default="auto",
            action=Highlander,
            help="filter program to pipe data through",
        )
        subparser.add_argument(
            "-s",
            "--stats",
            dest="stats",
            action="store_true",
            default=False,
            help="print statistics for the created archive",
        )
        subparser.add_argument(
            "--list",
            dest="output_list",
            action="store_true",
            default=False,
            help="output verbose list of items (files, dirs, ...)",
        )
        subparser.add_argument(
            "--filter",
            dest="output_filter",
            metavar="STATUSCHARS",
            action=Highlander,
            help="only display items with the given status characters",
        )
        subparser.add_argument("--json", action="store_true", help="output stats as JSON (implies --stats)")
        subparser.add_argument(
            "--ignore-zeros",
            dest="ignore_zeros",
            action="store_true",
            help="ignore zero-filled blocks in the input tarball",
        )

        # Options describing the archive that import-tar creates.
        archive_group = subparser.add_argument_group("Archive options")
        archive_group.add_argument(
            "--comment",
            metavar="COMMENT",
            dest="comment",
            type=comment_validator,
            default="",
            action=Highlander,
            help="add a comment text to the archive",
        )
        archive_group.add_argument(
            "--timestamp",
            dest="timestamp",
            type=timestamp,
            default=None,
            action=Highlander,
            metavar="TIMESTAMP",
            help="manually specify the archive creation date/time (yyyy-mm-ddThh:mm:ss[(+|-)HH:MM] format, "
            "(+|-)HH:MM is the UTC offset, default: local time zone). Alternatively, give a reference file/directory.",
        )
        archive_group.add_argument(
            "-c",
            "--checkpoint-interval",
            dest="checkpoint_interval",
            type=int,
            default=1800,
            action=Highlander,
            metavar="SECONDS",
            help="write checkpoint every SECONDS seconds (Default: 1800)",
        )
        archive_group.add_argument(
            "--checkpoint-volume",
            metavar="BYTES",
            dest="checkpoint_volume",
            type=int,
            default=0,
            action=Highlander,
            help="write checkpoint every BYTES bytes (Default: 0, meaning no volume based checkpointing)",
        )
        archive_group.add_argument(
            "--chunker-params",
            dest="chunker_params",
            type=ChunkerParams,
            default=CHUNKER_PARAMS,
            action=Highlander,
            metavar="PARAMS",
            help="specify the chunker parameters (ALGO, CHUNK_MIN_EXP, CHUNK_MAX_EXP, "
            "HASH_MASK_BITS, HASH_WINDOW_SIZE). default: %s,%d,%d,%d,%d" % CHUNKER_PARAMS,
        )
        archive_group.add_argument(
            "-C",
            "--compression",
            metavar="COMPRESSION",
            dest="compression",
            type=CompressionSpec,
            default=CompressionSpec("lz4"),
            action=Highlander,
            help="select compression algorithm, see the output of the " '"borg help compression" command for details.',
        )

        subparser.add_argument("name", metavar="NAME", type=archivename_validator, help="specify the archive name")
        subparser.add_argument("tarfile", metavar="TARFILE", help='input tar file. "-" to read from stdin instead.')