diff --git a/changelog/unreleased/issue-4278 b/changelog/unreleased/issue-4278 new file mode 100644 index 000000000..3170fa0da --- /dev/null +++ b/changelog/unreleased/issue-4278 @@ -0,0 +1,31 @@ +Enhancement: include include filters in command rewrite + +The enhancement allows the standard include filter options + --iinclude pattern same as --include pattern but ignores the casing of filenames + --iinclude-file file same as --include-file but ignores casing of filenames in patterns + -i, --include pattern include a pattern (can be specified multiple times) + --include-file file read include patterns from a file (can be specified multiple times) + +In addition the option + -s, --snapshot-summary create snapshot summary record if it does not exist + +To improve space performance of the newly created snapshot via the include filter variants, +it is recommended to utilize the option + -X, --exclude-empty exclude empty directories from being created, needs a second walk through the tree + +The exclusion or inclusion of filter parameters is exclusive, as in other commands +which use both include and exclude filters. + +In order to make the include filter work efficiently, an additional read pass through the +directory tree is needed to identify the subdirectories and their parents for the +inclusion of files. Otherwise the full directory tree needs to be included +which as a consequence may contain quite a lot of empty subdirectories. The additiinal read pass +avoids this issue, but it might take a bit more time, depending on the network speed of +the backend storage and the size of the snapshot. + +The --snapshot-summary parameter adds summary data to the snapshot summary section, as already +described in the solution to issue 4942. + +https://github.com/restic/restic/issues/4278 +https://github.com/restic/restic/issues/4942 +https://github.com/restic/restic/pull/5185 diff --git a/cmd/restic/cmd_rewrite.go b/cmd/restic/cmd_rewrite.go index b62d1ed95..d4e855d6b 100644 --- a/cmd/restic/cmd_rewrite.go +++ b/cmd/restic/cmd_rewrite.go @@ -2,6 +2,7 @@ package main import ( "context" + "path/filepath" "time" "github.com/spf13/cobra" @@ -19,8 +20,9 @@ var cmdRewrite = &cobra.Command{ Use: "rewrite [flags] [snapshotID ...]", Short: "Rewrite snapshots to exclude unwanted files", Long: ` -The "rewrite" command excludes files from existing snapshots. It creates new -snapshots containing the same data as the original ones, but without the files +The "rewrite" command excludes files from existing snapshots. +Alternatively you can use rewrite command to include only wanted files and directories. +It creates new snapshots containing the same data as the original ones, but without the files you specify to exclude. All metadata (time, host, tags) will be preserved. The snapshots to rewrite are specified using the --host, --tag and --path options, @@ -35,6 +37,21 @@ Please note that the --forget option only removes the snapshots and not the actu data stored in the repository. In order to delete the no longer referenced data, use the "prune" command. +The option --snapshot-summary [-s] creates a new snapshot with snapshot summary data attached. +Only the two fields TotalFilesProcessed and TotalBytesProcessed are non-zero. + +For the include option to work more efficiently, it os advisable to use the flag +'--exclude-empty' so only directories needed will be included from the original +snapshot. Otherwise all directories from the original snapshot have to be included. +This however will produce an extra Walk() through the original snapshot tree. + +In order to make the include filter work efficiently, an additional read pass through the +directory tree is needed to identify the subdirectories and their parents for the +inclusion of files to work effectively. Otherwise the full directory tree needs to be included +which may contain quite a lot of empty subdirectories. The first read pass +avoids this issue, but it might take a bit more time, depending on the network speed of +the backend storage and the size of the snapshot. + EXIT STATUS =========== @@ -83,12 +100,15 @@ func (sma snapshotMetadataArgs) convert() (*snapshotMetadata, error) { // RewriteOptions collects all options for the rewrite command. type RewriteOptions struct { - Forget bool - DryRun bool + Forget bool + DryRun bool + SnapshotSummary bool + ExcludeEmptyDir bool Metadata snapshotMetadataArgs restic.SnapshotFilter filter.ExcludePatternOptions + filter.IncludePatternOptions } var rewriteOptions RewriteOptions @@ -101,13 +121,21 @@ func init() { f.BoolVarP(&rewriteOptions.DryRun, "dry-run", "n", false, "do not do anything, just print what would be done") f.StringVar(&rewriteOptions.Metadata.Hostname, "new-host", "", "replace hostname") f.StringVar(&rewriteOptions.Metadata.Time, "new-time", "", "replace time of the backup") + f.BoolVarP(&rewriteOptions.SnapshotSummary, "snapshot-summary", "s", false, "create snapshot summary record if it does not exist") + f.BoolVarP(&rewriteOptions.ExcludeEmptyDir, "exclude-empty", "X", false, "only for include patterns: exclude empty directories from being created, needs a second walk through the tree") initMultiSnapshotFilter(f, &rewriteOptions.SnapshotFilter, true) rewriteOptions.ExcludePatternOptions.Add(f) + rewriteOptions.IncludePatternOptions.Add(f) } type rewriteFilterFunc func(ctx context.Context, sn *restic.Snapshot) (restic.ID, error) +type DirectoryNeeded struct { + node *restic.Node + needed bool +} + func rewriteSnapshot(ctx context.Context, repo *repository.Repository, sn *restic.Snapshot, opts RewriteOptions) (bool, error) { if sn.Tree == nil { return false, errors.Errorf("snapshot %v has nil tree", sn.ID().Str()) @@ -118,12 +146,103 @@ func rewriteSnapshot(ctx context.Context, repo *repository.Repository, sn *resti return false, err } + includeByNameFuncs, err := opts.IncludePatternOptions.CollectPatterns(Warnf) + if err != nil { + return false, err + } + metadata, err := opts.Metadata.convert() if err != nil { return false, err } + // walk the complete snapshot tree and memorize the directory structure + directoriesNeeded := map[string]DirectoryNeeded{} + if opts.ExcludeEmptyDir { + err := walker.Walk(ctx, repo, *sn.Tree, walker.WalkVisitor{ProcessNode: func(parentTreeID restic.ID, nodepath string, node *restic.Node, err error) error { + if err != nil { + Printf("Unable to load tree %s\n ... which belongs to snapshot %s - reason %v\n", parentTreeID, sn.ID().Str(), err) + return walker.ErrSkipNode + } + + if node == nil { + return nil + } else if node.Type == restic.NodeTypeDir { + directoriesNeeded[nodepath] = DirectoryNeeded{ + node: node, + needed: false, + } + // filter directories + for _, include := range includeByNameFuncs { + matched, childMayMatch := include(nodepath) + if matched && childMayMatch { + parentData := directoriesNeeded[nodepath] + if !parentData.needed { // flip 'needed' bit: off->on + directoriesNeeded[nodepath] = DirectoryNeeded{ + node: parentData.node, + needed: true, + } + } + } + } + } else { // include filter processsing - filter file names + for _, include := range includeByNameFuncs { + if node.Type == restic.NodeTypeFile { + matched, childMayMatch := include(nodepath) + if matched && childMayMatch { + dirpath := filepath.Dir(nodepath) // parent path + parentData := directoriesNeeded[dirpath] + if !parentData.needed { // flip 'needed' bit: off->on + directoriesNeeded[dirpath] = DirectoryNeeded{ + node: parentData.node, + needed: true, + } + } + } + } + } + } + return nil + }}) // end walker.Walk + + if err != nil { + Printf("walker.Walk does not want to run for snapshot %s - reason %v\n", sn.ID().Str(), err) + return false, err + } + + // go over all directory structure an find all parent nodes needed + for { // ever + more := false + for dirpath, dirData := range directoriesNeeded { + if !dirData.needed { + continue + } + + parentPath := filepath.Dir(dirpath) + // TODO: don't know how this is expressed for Windows + if parentPath == "/" { + continue + } + + value := directoriesNeeded[parentPath] + if value.needed { + continue + } + + directoriesNeeded[parentPath] = DirectoryNeeded{ + node: value.node, + needed: true, + } + more = true + } // all directories in snapshot + + if !more { + break + } + } // for ever + } // opts.ExcludeEmptyDir + var filter rewriteFilterFunc if len(rejectByNameFuncs) > 0 { @@ -152,14 +271,99 @@ func rewriteSnapshot(ctx context.Context, repo *repository.Repository, sn *resti return restic.ID{}, err } ss := querySize() - if sn.Summary != nil { - sn.Summary.TotalFilesProcessed = ss.FileCount - sn.Summary.TotalBytesProcessed = ss.FileSize + if sn.Summary == nil { // change of logic: create summary if it wasn't there before + sn.Summary = &restic.SnapshotSummary{} } + sn.Summary.DataBlobs = ss.DataBlobs + sn.Summary.TreeBlobs = ss.TreeBlobs + sn.Summary.TotalFilesProcessed = ss.FileCount + sn.Summary.TotalBytesProcessed = ss.FileSize return id, err } + } else if len(includeByNameFuncs) > 0 { + selectByName := func(nodepath string, node *restic.Node) bool { + for _, include := range includeByNameFuncs { + if node.Type == restic.NodeTypeDir { + if opts.ExcludeEmptyDir { + return directoriesNeeded[nodepath].needed + } else { + // include directories unconditionally + return true + } + } else if node.Type == restic.NodeTypeFile { + ifun, childMayMatch := include(nodepath) + if ifun && childMayMatch { + return true + } + } + } + return false + } + + rewriteNode := func(node *restic.Node, path string) *restic.Node { + if selectByName(path, node) { + Verboseff("including %s\n", path) + return node + } + return nil + } + + rewriter, querySize := walker.NewSnapshotSizeRewriter(rewriteNode) + + filter = func(ctx context.Context, sn *restic.Snapshot) (restic.ID, error) { + id, err := rewriter.RewriteTree(ctx, repo, "/", *sn.Tree) + if err != nil { + return restic.ID{}, err + } + ss := querySize() + if sn.Summary == nil { + sn.Summary = &restic.SnapshotSummary{} + } + sn.Summary.DataBlobs = ss.DataBlobs + sn.Summary.TreeBlobs = ss.TreeBlobs + sn.Summary.TotalFilesProcessed = ss.FileCount + sn.Summary.TotalBytesProcessed = ss.FileSize + + return id, nil + } + + } else if opts.SnapshotSummary { + if sn.Summary != nil { + Printf("snapshot %s has already got snapshot summary data\n", sn.ID().Str()) + return false, nil + } + + rewriteNode := func(node *restic.Node, path string) *restic.Node { + return node + } + + rewriter, querySize := walker.NewSnapshotSizeRewriter(rewriteNode) + + filter = func(ctx context.Context, sn *restic.Snapshot) (restic.ID, error) { + id, err := rewriter.RewriteTree(ctx, repo, "/", *sn.Tree) + if err != nil { + return restic.ID{}, err + } + ss := querySize() + if sn.Summary == nil { + sn.Summary = &restic.SnapshotSummary{} + } + sn.Summary.DataBlobs = ss.DataBlobs + sn.Summary.TreeBlobs = ss.TreeBlobs + sn.Summary.TotalFilesProcessed = ss.FileCount + sn.Summary.TotalBytesProcessed = ss.FileSize + Verbosef("dataBlobs %12d\n", ss.DataBlobs) + Verbosef("treeBlobs %12d\n", ss.TreeBlobs) + Verbosef("totalFilesProcessed %12d\n", ss.FileCount) + Verbosef("totalBytesProcessed %12d\n", ss.FileSize) + + return id, nil + } + } else { + // TODO: question: should metadata modification be changed so that + // snapshot summary data will always be created?? filter = func(_ context.Context, sn *restic.Snapshot) (restic.ID, error) { return *sn.Tree, nil } @@ -203,7 +407,7 @@ func filterAndReplaceSnapshot(ctx context.Context, repo restic.Repository, sn *r return true, nil } - if filteredTree == *sn.Tree && newMetadata == nil { + if filteredTree == *sn.Tree && newMetadata == nil && sn.Summary == nil { debug.Log("Snapshot %v not modified", sn) return false, nil } @@ -230,6 +434,7 @@ func filterAndReplaceSnapshot(ctx context.Context, repo restic.Repository, sn *r // Always set the original snapshot id as this essentially a new snapshot. sn.Original = sn.ID() sn.Tree = &filteredTree + sn.ProgramVersion = version if !forget { sn.AddTags([]string{addTag}) @@ -263,8 +468,18 @@ func filterAndReplaceSnapshot(ctx context.Context, repo restic.Repository, sn *r } func runRewrite(ctx context.Context, opts RewriteOptions, gopts GlobalOptions, args []string) error { - if opts.ExcludePatternOptions.Empty() && opts.Metadata.empty() { - return errors.Fatal("Nothing to do: no excludes provided and no new metadata provided") + exEmpty := opts.ExcludePatternOptions.Empty() + inEmpty := opts.IncludePatternOptions.Empty() + if !opts.SnapshotSummary && exEmpty && inEmpty && opts.Metadata.empty() { + return errors.Fatal("Nothing to do: no includes/excludes provided and no new metadata provided") + } + + if !exEmpty && !inEmpty { + return errors.Fatal("You cannot specify include and exclude options simultaneously!") + } + + if opts.SnapshotSummary && (!exEmpty || !inEmpty) { + Warnf("option --snapshot-summary is ignored with include/exclude options\n") } var ( diff --git a/internal/filter/include.go b/internal/filter/include.go index 87d5f1207..e6eefe6b9 100644 --- a/internal/filter/include.go +++ b/internal/filter/include.go @@ -25,6 +25,10 @@ func (opts *IncludePatternOptions) Add(f *pflag.FlagSet) { f.StringArrayVar(&opts.InsensitiveIncludeFiles, "iinclude-file", nil, "same as --include-file but ignores casing of `file`names in patterns") } +func (opts *IncludePatternOptions) Empty() bool { + return len(opts.Includes) == 0 && len(opts.InsensitiveIncludes) == 0 && len(opts.IncludeFiles) == 0 && len(opts.InsensitiveIncludeFiles) == 0 +} + func (opts IncludePatternOptions) CollectPatterns(warnf func(msg string, args ...interface{})) ([]IncludeByNameFunc, error) { var fs []IncludeByNameFunc if len(opts.IncludeFiles) > 0 { diff --git a/internal/walker/rewriter.go b/internal/walker/rewriter.go index 968ef44f3..111e1ddb5 100644 --- a/internal/walker/rewriter.go +++ b/internal/walker/rewriter.go @@ -16,6 +16,8 @@ type QueryRewrittenSizeFunc func() SnapshotSize type SnapshotSize struct { FileCount uint FileSize uint64 + TreeBlobs int + DataBlobs int } type RewriteOpts struct { @@ -61,6 +63,7 @@ func NewTreeRewriter(opts RewriteOpts) *TreeRewriter { func NewSnapshotSizeRewriter(rewriteNode NodeRewriteFunc) (*TreeRewriter, QueryRewrittenSizeFunc) { var count uint var size uint64 + var treeBlobs, dataBlobs int t := NewTreeRewriter(RewriteOpts{ RewriteNode: func(node *restic.Node, path string) *restic.Node { @@ -68,6 +71,9 @@ func NewSnapshotSizeRewriter(rewriteNode NodeRewriteFunc) (*TreeRewriter, QueryR if node != nil && node.Type == restic.NodeTypeFile { count++ size += node.Size + dataBlobs += len(node.Content) + } else if node != nil && node.Type == restic.NodeTypeDir { + treeBlobs++ } return node }, @@ -75,7 +81,7 @@ func NewSnapshotSizeRewriter(rewriteNode NodeRewriteFunc) (*TreeRewriter, QueryR }) ss := func() SnapshotSize { - return SnapshotSize{count, size} + return SnapshotSize{count, size, treeBlobs, dataBlobs} } return t, ss