mirror of https://github.com/borgbackup/borg.git
Normalize paths before pattern matching on OS X
The OS X file system HFS+ stores path names as Unicode and converts them to a variant of Unicode NFD for storage. Because stored path names are always in this canonical form, it is unfriendly to require users to type that exact form. Convert both paths from the repository and patterns from the command line to NFD before comparing them. Unix (and, I think, Windows) file systems do not convert path names into a canonical form, so on those platforms users still have to match the path name exactly: two paths can look identical visually yet be composed of different byte sequences.
This commit is contained in:
parent
da5923ec04
commit
d9fb1d2b03
|
@ -7,6 +7,8 @@ import pwd
|
|||
import re
|
||||
import sys
|
||||
import time
|
||||
import unicodedata
|
||||
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from fnmatch import translate
|
||||
from operator import attrgetter
|
||||
|
@ -220,6 +222,10 @@ def exclude_path(path, patterns):
|
|||
# unify the two cases, we add a path separator to the end of
|
||||
# the path before matching.
|
||||
|
||||
##### !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
##### For discussion only, don't merge this code!
|
||||
##### !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
||||
|
||||
class IncludePattern:
|
||||
"""Literal files or directories listed on the command line
|
||||
for some operations (e.g. extract, but not create).
|
||||
|
@ -227,10 +233,22 @@ class IncludePattern:
|
|||
path match as well. A trailing slash makes no difference.
|
||||
"""
|
||||
def __init__(self, pattern):
|
||||
self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep
|
||||
def match(path):
|
||||
return (path+os.path.sep).startswith(self.pattern)
|
||||
|
||||
def match(self, path):
|
||||
return (path+os.path.sep).startswith(self.pattern)
|
||||
# HFS+ converts paths to a canonical form, so users shouldn't be
|
||||
# required to enter an exact match
|
||||
if sys.platform in ('darwin',):
|
||||
# repository paths will be mostly in NFD, as the OSX exception list
|
||||
# to NFD is small, so normalize to that form for best performance
|
||||
pattern = unicodedata.normalize("NFD", pattern)
|
||||
self.match = lambda p: match(unicodedata.normalize("NFD", p))
|
||||
# Windows and Unix filesystems allow different forms, so users
|
||||
# always have to enter an exact match
|
||||
else:
|
||||
self.match = match
|
||||
|
||||
self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep
|
||||
|
||||
def __repr__(self):
|
||||
return '%s(%s)' % (type(self), self.pattern)
|
||||
|
@ -241,17 +259,30 @@ class ExcludePattern(IncludePattern):
|
|||
exclude the contents of a directory, but not the directory itself.
|
||||
"""
|
||||
def __init__(self, pattern):
|
||||
def match(path):
|
||||
return self.regex.match(path+os.path.sep) is not None
|
||||
|
||||
if pattern.endswith(os.path.sep):
|
||||
self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep+'*'+os.path.sep
|
||||
else:
|
||||
self.pattern = os.path.normpath(pattern)+os.path.sep+'*'
|
||||
|
||||
# HFS+ converts paths to a canonical form, so users shouldn't be
|
||||
# required to enter an exact match
|
||||
if sys.platform in ('darwin',):
|
||||
# repository paths will be mostly in NFD, as the OSX exception list
|
||||
# to NFD is small, so normalize to that form for best performance
|
||||
self.pattern = unicodedata.normalize("NFD", self.pattern)
|
||||
self.match = lambda p: match(unicodedata.normalize("NFD", p))
|
||||
# Windows and Unix filesystems allow different forms, so users
|
||||
# always have to enter an exact match
|
||||
else:
|
||||
self.match = match
|
||||
|
||||
# fnmatch and re.match both cache compiled regular expressions.
|
||||
# Nevertheless, this is about 10 times faster.
|
||||
self.regex = re.compile(translate(self.pattern))
|
||||
|
||||
def match(self, path):
|
||||
return self.regex.match(path+os.path.sep) is not None
|
||||
|
||||
def __repr__(self):
|
||||
return '%s(%s)' % (type(self), self.pattern)
|
||||
|
||||
|
|
|
@ -3,9 +3,10 @@ from time import mktime, strptime
|
|||
from datetime import datetime, timezone, timedelta
|
||||
|
||||
import pytest
|
||||
import sys
|
||||
import msgpack
|
||||
|
||||
from ..helpers import adjust_patterns, exclude_path, Location, format_timedelta, ExcludePattern, make_path_safe, \
|
||||
from ..helpers import adjust_patterns, exclude_path, Location, format_timedelta, IncludePattern, ExcludePattern, make_path_safe, \
|
||||
prune_within, prune_split, \
|
||||
StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams
|
||||
from . import BaseTestCase
|
||||
|
@ -178,6 +179,99 @@ class PatternTestCase(BaseTestCase):
|
|||
['/etc/passwd', '/etc/hosts', '/var/log/messages', '/var/log/dmesg'])
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform.startswith('darwin'), reason='all but OS X test')
class IncludePatternNonAsciiTestCase(BaseTestCase):
    """IncludePattern matching of non-ASCII paths, skipped on OS X.

    The assertions expect a composed and a decomposed spelling of the same
    character to be treated as two different paths.
    """

    def testComposedUnicode(self):
        # Pattern written with the precomposed character.
        matcher = IncludePattern('b\N{LATIN SMALL LETTER A WITH ACUTE}')
        composed_path = "b\N{LATIN SMALL LETTER A WITH ACUTE}/foo"
        decomposed_path = "ba\N{COMBINING ACUTE ACCENT}/foo"

        assert matcher.match(composed_path)
        assert not matcher.match(decomposed_path)

    def testDecomposedUnicode(self):
        # Pattern written as base letter + combining accent.
        matcher = IncludePattern('ba\N{COMBINING ACUTE ACCENT}')
        composed_path = "b\N{LATIN SMALL LETTER A WITH ACUTE}/foo"
        decomposed_path = "ba\N{COMBINING ACUTE ACCENT}/foo"

        assert not matcher.match(composed_path)
        assert matcher.match(decomposed_path)

    def testInvalidUnicode(self):
        # Pattern ends with the byte 0x80, decoded through latin1.
        matcher = IncludePattern(str(b'ba\x80', 'latin1'))
        plain_path = "ba/foo"
        latin1_path = str(b"ba\x80/foo", 'latin1')

        assert not matcher.match(plain_path)
        assert matcher.match(latin1_path)
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.platform.startswith('darwin'), reason='all but OS X test')
class ExcludePatternNonAsciiTestCase(BaseTestCase):
    """ExcludePattern matching of non-ASCII paths, skipped on OS X.

    The assertions expect a composed and a decomposed spelling of the same
    character to be treated as two different paths.
    """

    def testComposedUnicode(self):
        # Pattern written with the precomposed character.
        excluder = ExcludePattern('b\N{LATIN SMALL LETTER A WITH ACUTE}')
        composed_path = "b\N{LATIN SMALL LETTER A WITH ACUTE}/foo"
        decomposed_path = "ba\N{COMBINING ACUTE ACCENT}/foo"

        assert excluder.match(composed_path)
        assert not excluder.match(decomposed_path)

    def testDecomposedUnicode(self):
        # Pattern written as base letter + combining accent.
        excluder = ExcludePattern('ba\N{COMBINING ACUTE ACCENT}')
        composed_path = "b\N{LATIN SMALL LETTER A WITH ACUTE}/foo"
        decomposed_path = "ba\N{COMBINING ACUTE ACCENT}/foo"

        assert not excluder.match(composed_path)
        assert excluder.match(decomposed_path)

    def testInvalidUnicode(self):
        # Pattern ends with the byte 0x80, decoded through latin1.
        excluder = ExcludePattern(str(b'ba\x80', 'latin1'))
        plain_path = "ba/foo"
        latin1_path = str(b"ba\x80/foo", 'latin1')

        assert not excluder.match(plain_path)
        assert excluder.match(latin1_path)
|
||||
|
||||
#@pytest.mark.skipif(not sys.platform.startswith('darwin'), reason='OS X only test')
class OSXPatternNormalizationTestCase(BaseTestCase):
    """Pattern matching with sys.platform set to 'darwin'.

    The assertions expect IncludePattern and ExcludePattern to match a path
    regardless of whether the pattern or the path uses the composed or the
    decomposed spelling of a character.

    NOTE: the decorator above is still commented out on purpose; when it is
    enabled it must skip on every platform *except* darwin, hence the `not`.
    """

    # monkey patch sys.platform to allow testing on non-OSX during development
    # remove setUp/tearDown and uncomment the OSX-only decorator before push
    def setUp(self):
        # Remember the real platform so tearDown can restore it.
        self.oldplatform = sys.platform
        sys.platform = 'darwin'

    def tearDown(self):
        sys.platform = self.oldplatform

    def testComposedUnicode(self):
        # Pattern written with the precomposed character.
        pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
        i = IncludePattern(pattern)
        e = ExcludePattern(pattern)

        assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
        assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
        assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
        assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")

    def testDecomposedUnicode(self):
        # Pattern written as base letter + combining accent.
        pattern = 'ba\N{COMBINING ACUTE ACCENT}'
        i = IncludePattern(pattern)
        e = ExcludePattern(pattern)

        assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
        assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
        assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
        assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")

    def testInvalidUnicode(self):
        # Pattern ends with the byte 0x80, decoded through latin1.
        pattern = str(b'ba\x80', 'latin1')
        i = IncludePattern(pattern)
        e = ExcludePattern(pattern)

        assert not i.match("ba/foo")
        assert i.match(str(b"ba\x80/foo", 'latin1'))
        assert not e.match("ba/foo")
        assert e.match(str(b"ba\x80/foo", 'latin1'))
|
||||
|
||||
|
||||
def test_compression_specs():
|
||||
with pytest.raises(ValueError):
|
||||
CompressionSpec('')
|
||||
|
|
Loading…
Reference in New Issue