mirror of https://github.com/borgbackup/borg.git
Merge pull request #2217 from ThomasWaldmann/bench-cmd
borg benchmark crud command
This commit is contained in:
commit
0db058345b
|
@ -0,0 +1,64 @@
|
||||||
|
borg benchmark crud
|
||||||
|
===================
|
||||||
|
|
||||||
|
Here is an example of borg benchmark crud output.
|
||||||
|
|
||||||
|
I ran it on my laptop, Core i5-4200u, 8GB RAM, SATA SSD, Linux, ext4 fs.
|
||||||
|
"src" as well as repo is local, on this SSD.
|
||||||
|
|
||||||
|
$ BORG_PASSPHRASE=secret borg init --encryption repokey-blake2 repo
|
||||||
|
$ BORG_PASSPHRASE=secret borg benchmark crud repo src
|
||||||
|
|
||||||
|
C-Z-BIG 116.06 MB/s (10 * 100.00 MB all-zero files: 8.62s)
|
||||||
|
R-Z-BIG 197.00 MB/s (10 * 100.00 MB all-zero files: 5.08s)
|
||||||
|
U-Z-BIG 418.07 MB/s (10 * 100.00 MB all-zero files: 2.39s)
|
||||||
|
D-Z-BIG 724.94 MB/s (10 * 100.00 MB all-zero files: 1.38s)
|
||||||
|
C-R-BIG 42.21 MB/s (10 * 100.00 MB random files: 23.69s)
|
||||||
|
R-R-BIG 134.45 MB/s (10 * 100.00 MB random files: 7.44s)
|
||||||
|
U-R-BIG 316.83 MB/s (10 * 100.00 MB random files: 3.16s)
|
||||||
|
D-R-BIG 251.10 MB/s (10 * 100.00 MB random files: 3.98s)
|
||||||
|
C-Z-MEDIUM 118.53 MB/s (1000 * 1.00 MB all-zero files: 8.44s)
|
||||||
|
R-Z-MEDIUM 218.49 MB/s (1000 * 1.00 MB all-zero files: 4.58s)
|
||||||
|
U-Z-MEDIUM 591.59 MB/s (1000 * 1.00 MB all-zero files: 1.69s)
|
||||||
|
D-Z-MEDIUM 730.04 MB/s (1000 * 1.00 MB all-zero files: 1.37s)
|
||||||
|
C-R-MEDIUM 31.46 MB/s (1000 * 1.00 MB random files: 31.79s)
|
||||||
|
R-R-MEDIUM 129.64 MB/s (1000 * 1.00 MB random files: 7.71s)
|
||||||
|
U-R-MEDIUM 621.86 MB/s (1000 * 1.00 MB random files: 1.61s)
|
||||||
|
D-R-MEDIUM 234.82 MB/s (1000 * 1.00 MB random files: 4.26s)
|
||||||
|
C-Z-SMALL 19.81 MB/s (10000 * 10.00 kB all-zero files: 5.05s)
|
||||||
|
R-Z-SMALL 97.69 MB/s (10000 * 10.00 kB all-zero files: 1.02s)
|
||||||
|
U-Z-SMALL 36.35 MB/s (10000 * 10.00 kB all-zero files: 2.75s)
|
||||||
|
D-Z-SMALL 57.04 MB/s (10000 * 10.00 kB all-zero files: 1.75s)
|
||||||
|
C-R-SMALL 9.81 MB/s (10000 * 10.00 kB random files: 10.19s)
|
||||||
|
R-R-SMALL 92.21 MB/s (10000 * 10.00 kB random files: 1.08s)
|
||||||
|
U-R-SMALL 64.62 MB/s (10000 * 10.00 kB random files: 1.55s)
|
||||||
|
D-R-SMALL 51.62 MB/s (10000 * 10.00 kB random files: 1.94s)
|
||||||
|
|
||||||
|
|
||||||
|
A second run some time later gave:
|
||||||
|
|
||||||
|
C-Z-BIG 115.22 MB/s (10 * 100.00 MB all-zero files: 8.68s)
|
||||||
|
R-Z-BIG 196.06 MB/s (10 * 100.00 MB all-zero files: 5.10s)
|
||||||
|
U-Z-BIG 439.50 MB/s (10 * 100.00 MB all-zero files: 2.28s)
|
||||||
|
D-Z-BIG 671.11 MB/s (10 * 100.00 MB all-zero files: 1.49s)
|
||||||
|
C-R-BIG 43.40 MB/s (10 * 100.00 MB random files: 23.04s)
|
||||||
|
R-R-BIG 133.17 MB/s (10 * 100.00 MB random files: 7.51s)
|
||||||
|
U-R-BIG 464.50 MB/s (10 * 100.00 MB random files: 2.15s)
|
||||||
|
D-R-BIG 245.19 MB/s (10 * 100.00 MB random files: 4.08s)
|
||||||
|
C-Z-MEDIUM 110.82 MB/s (1000 * 1.00 MB all-zero files: 9.02s)
|
||||||
|
R-Z-MEDIUM 217.96 MB/s (1000 * 1.00 MB all-zero files: 4.59s)
|
||||||
|
U-Z-MEDIUM 601.54 MB/s (1000 * 1.00 MB all-zero files: 1.66s)
|
||||||
|
D-Z-MEDIUM 686.99 MB/s (1000 * 1.00 MB all-zero files: 1.46s)
|
||||||
|
C-R-MEDIUM 39.91 MB/s (1000 * 1.00 MB random files: 25.06s)
|
||||||
|
R-R-MEDIUM 128.91 MB/s (1000 * 1.00 MB random files: 7.76s)
|
||||||
|
U-R-MEDIUM 599.00 MB/s (1000 * 1.00 MB random files: 1.67s)
|
||||||
|
D-R-MEDIUM 230.69 MB/s (1000 * 1.00 MB random files: 4.33s)
|
||||||
|
C-Z-SMALL 14.78 MB/s (10000 * 10.00 kB all-zero files: 6.76s)
|
||||||
|
R-Z-SMALL 96.86 MB/s (10000 * 10.00 kB all-zero files: 1.03s)
|
||||||
|
U-Z-SMALL 35.22 MB/s (10000 * 10.00 kB all-zero files: 2.84s)
|
||||||
|
D-Z-SMALL 64.93 MB/s (10000 * 10.00 kB all-zero files: 1.54s)
|
||||||
|
C-R-SMALL 11.08 MB/s (10000 * 10.00 kB random files: 9.02s)
|
||||||
|
R-R-SMALL 92.34 MB/s (10000 * 10.00 kB random files: 1.08s)
|
||||||
|
U-R-SMALL 64.49 MB/s (10000 * 10.00 kB random files: 1.55s)
|
||||||
|
D-R-SMALL 46.96 MB/s (10000 * 10.00 kB random files: 2.13s)
|
||||||
|
|
|
@ -9,6 +9,7 @@ import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import shlex
|
import shlex
|
||||||
|
import shutil
|
||||||
import signal
|
import signal
|
||||||
import stat
|
import stat
|
||||||
import subprocess
|
import subprocess
|
||||||
|
@ -17,6 +18,7 @@ import textwrap
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from binascii import unhexlify
|
from binascii import unhexlify
|
||||||
|
from contextlib import contextmanager
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from itertools import zip_longest
|
from itertools import zip_longest
|
||||||
|
|
||||||
|
@ -57,7 +59,7 @@ from .helpers import basic_json_data, json_print
|
||||||
from .item import Item
|
from .item import Item
|
||||||
from .key import key_creator, tam_required_file, tam_required, RepoKey, PassphraseKey
|
from .key import key_creator, tam_required_file, tam_required, RepoKey, PassphraseKey
|
||||||
from .keymanager import KeyManager
|
from .keymanager import KeyManager
|
||||||
from .platform import get_flags, umount, get_process_id
|
from .platform import get_flags, umount, get_process_id, SyncFile
|
||||||
from .remote import RepositoryServer, RemoteRepository, cache_if_remote
|
from .remote import RepositoryServer, RemoteRepository, cache_if_remote
|
||||||
from .repository import Repository, LIST_SCAN_LIMIT
|
from .repository import Repository, LIST_SCAN_LIMIT
|
||||||
from .selftest import selftest
|
from .selftest import selftest
|
||||||
|
@ -322,6 +324,72 @@ class Archiver:
|
||||||
logger.info('Key updated')
|
logger.info('Key updated')
|
||||||
return EXIT_SUCCESS
|
return EXIT_SUCCESS
|
||||||
|
|
||||||
|
def do_benchmark_crud(self, args):
    """Benchmark Create, Read, Update, Delete for archives."""
    # NOTE: the docstring above is required — the 'benchmark crud' subparser uses
    # self.do_benchmark_crud.__doc__ as its description; without it that is None.

    def measurement_run(repo, path):
        """Run one C/U/R/D cycle against *repo* using input files below *path*.

        Returns (dt_create, dt_update, dt_extract, dt_delete) wall-clock
        durations in seconds, measured with time.monotonic().
        """
        archive = repo + '::borg-benchmark-crud'
        compression = '--compression=none'
        # measure create perf (without files cache to always have it chunking)
        t_start = time.monotonic()
        rc = self.do_create(self.parse_args(['create', compression, '--no-files-cache', archive + '1', path]))
        t_end = time.monotonic()
        dt_create = t_end - t_start
        assert rc == 0
        # now build files cache
        rc1 = self.do_create(self.parse_args(['create', compression, archive + '2', path]))
        rc2 = self.do_delete(self.parse_args(['delete', archive + '2']))
        assert rc1 == rc2 == 0
        # measure a no-change update (archive1 is still present)
        t_start = time.monotonic()
        rc1 = self.do_create(self.parse_args(['create', compression, archive + '3', path]))
        t_end = time.monotonic()
        dt_update = t_end - t_start
        rc2 = self.do_delete(self.parse_args(['delete', archive + '3']))
        assert rc1 == rc2 == 0
        # measure extraction (dry-run: without writing result to disk)
        t_start = time.monotonic()
        rc = self.do_extract(self.parse_args(['extract', '--dry-run', archive + '1']))
        t_end = time.monotonic()
        dt_extract = t_end - t_start
        assert rc == 0
        # measure archive deletion (of LAST present archive with the data)
        t_start = time.monotonic()
        rc = self.do_delete(self.parse_args(['delete', archive + '1']))
        t_end = time.monotonic()
        dt_delete = t_end - t_start
        assert rc == 0
        return dt_create, dt_update, dt_extract, dt_delete

    @contextmanager
    def test_files(path, count, size, random):
        """Create *count* files of *size* bytes (all-zero, or urandom if *random*)
        below *path*, yield their directory, and always remove it afterwards.
        """
        path = os.path.join(path, 'borg-test-data')
        os.makedirs(path)
        # try/finally so the (up to ~1 GB of) test data is removed even if a
        # measurement run raises — the original leaked it on failure.
        try:
            for i in range(count):
                fname = os.path.join(path, 'file_%d' % i)
                data = b'\0' * size if not random else os.urandom(size)
                with SyncFile(fname, binary=True) as fd:  # used for posix_fadvise's sake
                    fd.write(data)
            yield path
        finally:
            shutil.rmtree(path)

    for msg, count, size, random in [
        ('Z-BIG', 10, 100000000, False),
        ('R-BIG', 10, 100000000, True),
        ('Z-MEDIUM', 1000, 1000000, False),
        ('R-MEDIUM', 1000, 1000000, True),
        ('Z-SMALL', 10000, 10000, False),
        ('R-SMALL', 10000, 10000, True),
    ]:
        with test_files(args.path, count, size, random) as path:
            dt_create, dt_update, dt_extract, dt_delete = measurement_run(args.location.canonical_path(), path)
        total_size_MB = count * size / 1e06
        file_size_formatted = format_file_size(size)
        content = 'random' if random else 'all-zero'
        fmt = '%s-%-10s %9.2f MB/s (%d * %s %s files: %.2fs)'
        print(fmt % ('C', msg, total_size_MB / dt_create, count, file_size_formatted, content, dt_create))
        print(fmt % ('R', msg, total_size_MB / dt_extract, count, file_size_formatted, content, dt_extract))
        print(fmt % ('U', msg, total_size_MB / dt_update, count, file_size_formatted, content, dt_update))
        print(fmt % ('D', msg, total_size_MB / dt_delete, count, file_size_formatted, content, dt_delete))
|
||||||
|
|
||||||
@with_repository(fake='dry_run', exclusive=True)
|
@with_repository(fake='dry_run', exclusive=True)
|
||||||
def do_create(self, args, repository, manifest=None, key=None):
|
def do_create(self, args, repository, manifest=None, key=None):
|
||||||
"""Create new archive"""
|
"""Create new archive"""
|
||||||
|
@ -3141,6 +3209,69 @@ class Archiver:
|
||||||
subparser.add_argument('ids', metavar='IDs', nargs='+', type=str,
|
subparser.add_argument('ids', metavar='IDs', nargs='+', type=str,
|
||||||
help='hex object ID(s) to show refcounts for')
|
help='hex object ID(s) to show refcounts for')
|
||||||
|
|
||||||
|
benchmark_epilog = process_epilog("These commands do various benchmarks.")
|
||||||
|
|
||||||
|
subparser = subparsers.add_parser('benchmark', parents=[common_parser], add_help=False,
|
||||||
|
description='benchmark command',
|
||||||
|
epilog=benchmark_epilog,
|
||||||
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
|
help='benchmark command')
|
||||||
|
|
||||||
|
benchmark_parsers = subparser.add_subparsers(title='required arguments', metavar='<command>')
|
||||||
|
subparser.set_defaults(fallback_func=functools.partial(self.do_subcommand_help, subparser))
|
||||||
|
|
||||||
|
bench_crud_epilog = process_epilog("""
|
||||||
|
This command benchmarks borg CRUD (create, read, update, delete) operations.
|
||||||
|
|
||||||
|
It creates input data below the given PATH and backups this data into the given REPO.
|
||||||
|
The REPO must already exist (it could be a fresh empty repo or an existing repo, the
|
||||||
|
command will create / read / update / delete some archives named borg-test-data* there.
|
||||||
|
|
||||||
|
Make sure you have free space there, you'll need about 1GB each (+ overhead).
|
||||||
|
|
||||||
|
If your repository is encrypted and borg needs a passphrase to unlock the key, use:
|
||||||
|
|
||||||
|
BORG_PASSPHRASE=mysecret borg benchmark crud REPO PATH
|
||||||
|
|
||||||
|
Measurements are done with different input file sizes and counts.
|
||||||
|
The file contents are very artificial (either all zero or all random),
|
||||||
|
thus the measurement results do not necessarily reflect performance with real data.
|
||||||
|
Also, due to the kind of content used, no compression is used in these benchmarks.
|
||||||
|
|
||||||
|
C- == borg create (1st archive creation, no compression, do not use files cache)
|
||||||
|
C-Z- == all-zero files. full dedup, this is primarily measuring reader/chunker/hasher.
|
||||||
|
C-R- == random files. no dedup, measuring throughput through all processing stages.
|
||||||
|
|
||||||
|
R- == borg extract (extract archive, dry-run, do everything, but do not write files to disk)
|
||||||
|
R-Z- == all zero files. Measuring heavily duplicated files.
|
||||||
|
R-R- == random files. No duplication here, measuring throughput through all processing
|
||||||
|
stages, except writing to disk.
|
||||||
|
|
||||||
|
U- == borg create (2nd archive creation of unchanged input files, measure files cache speed)
|
||||||
|
The throughput value is kind of virtual here, it does not actually read the file.
|
||||||
|
U-Z- == needs to check the 2 all-zero chunks' existence in the repo.
|
||||||
|
U-R- == needs to check existence of a lot of different chunks in the repo.
|
||||||
|
|
||||||
|
D- == borg delete archive (delete last remaining archive, measure deletion + compaction)
|
||||||
|
D-Z- == few chunks to delete / few segments to compact/remove.
|
||||||
|
D-R- == many chunks to delete / many segments to compact/remove.
|
||||||
|
|
||||||
|
Please note that there might be quite some variance in these measurements.
|
||||||
|
Try multiple measurements and having a otherwise idle machine (and network, if you use it).
|
||||||
|
""")
|
||||||
|
subparser = benchmark_parsers.add_parser('crud', parents=[common_parser], add_help=False,
|
||||||
|
description=self.do_benchmark_crud.__doc__,
|
||||||
|
epilog=bench_crud_epilog,
|
||||||
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
|
help='benchmarks borg CRUD (create, extract, update, delete).')
|
||||||
|
subparser.set_defaults(func=self.do_benchmark_crud)
|
||||||
|
|
||||||
|
subparser.add_argument('location', metavar='REPO',
|
||||||
|
type=location_validator(archive=False),
|
||||||
|
help='repo to use for benchmark (must exist)')
|
||||||
|
|
||||||
|
subparser.add_argument('path', metavar='PATH', help='path were to create benchmark input data')
|
||||||
|
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
Loading…
Reference in New Issue