Normalize paths before pattern matching on OS X

The OS X file system HFS+ stores path names as Unicode and converts them to a variant of Unicode NFD for storage. Because stored path names are always in this canonical form, it is unfriendly to require users to type that exact form in order to match them. On OS X, convert both the paths from the repository and the patterns from the command line to NFD before comparing them. Unix (and Windows, I think) file systems do not convert path names into a canonical form, so users on those platforms will still have to match the path name exactly, because two paths that look identical on screen can actually be composed of different byte sequences.
2015-09-08 23:33:34 -04:00 · 2015-09-08 23:33:34 -04:00 · d9fb1d2b03
parent da5923ec04
commit d9fb1d2b03
2 changed files with 132 additions and 7 deletions
--- a/borg/helpers.py
+++ b/borg/helpers.py
@ -7,6 +7,8 @@ import pwd
 import re
 import sys
 import time
+import unicodedata
+
 from datetime import datetime, timezone, timedelta
 from fnmatch import translate
 from operator import attrgetter
@ -220,6 +222,10 @@ def exclude_path(path, patterns):
 # unify the two cases, we add a path separator to the end of
 # the path before matching.

+##### !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+##### For discussion only, don't merge this code!
+##### !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
 class IncludePattern:
    """Literal files or directories listed on the command line
    for some operations (e.g. extract, but not create).
@ -227,10 +233,22 @@ class IncludePattern:
    path match as well.  A trailing slash makes no difference.
    """
    def __init__(self, pattern):
-        self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep
+        def match(path):
+            return (path+os.path.sep).startswith(self.pattern)

-    def match(self, path):
-        return (path+os.path.sep).startswith(self.pattern)
+        # HFS+ converts paths to a canonical form, so users shouldn't be
+        # required to enter an exact match
+        if sys.platform in ('darwin',):
+            # repository paths will be mostly in NFD, as the OSX exception list
+            # to NFD is small, so normalize to that form for best performance
+            pattern = unicodedata.normalize("NFD", pattern)
+            self.match = lambda p: match(unicodedata.normalize("NFD", p))
+        # Windows and Unix filesystems allow different forms, so users
+        # always have to enter an exact match
+        else:
+            self.match = match
+
+        self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep

    def __repr__(self):
        return '%s(%s)' % (type(self), self.pattern)
@ -241,17 +259,30 @@ class ExcludePattern(IncludePattern):
    exclude the contents of a directory, but not the directory itself.
    """
    def __init__(self, pattern):
+        def match(path):
+            return self.regex.match(path+os.path.sep) is not None
+
        if pattern.endswith(os.path.sep):
            self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep+'*'+os.path.sep
        else:
            self.pattern = os.path.normpath(pattern)+os.path.sep+'*'
+
+        # HFS+ converts paths to a canonical form, so users shouldn't be
+        # required to enter an exact match
+        if sys.platform in ('darwin',):
+            # repository paths will be mostly in NFD, as the OSX exception list
+            # to NFD is small, so normalize to that form for best performance
+            self.pattern = unicodedata.normalize("NFD", self.pattern)
+            self.match = lambda p: match(unicodedata.normalize("NFD", p))
+        # Windows and Unix filesystems allow different forms, so users
+        # always have to enter an exact match
+        else:
+            self.match = match
+
        # fnmatch and re.match both cache compiled regular expressions.
        # Nevertheless, this is about 10 times faster.
        self.regex = re.compile(translate(self.pattern))

-    def match(self, path):
-        return self.regex.match(path+os.path.sep) is not None
-
    def __repr__(self):
        return '%s(%s)' % (type(self), self.pattern)

--- a/borg/testsuite/helpers.py
+++ b/borg/testsuite/helpers.py
@ -3,9 +3,10 @@ from time import mktime, strptime
 from datetime import datetime, timezone, timedelta

 import pytest
+import sys
 import msgpack

-from ..helpers import adjust_patterns, exclude_path, Location, format_timedelta, ExcludePattern, make_path_safe, \
+from ..helpers import adjust_patterns, exclude_path, Location, format_timedelta, IncludePattern, ExcludePattern, make_path_safe, \
    prune_within, prune_split, \
    StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams
 from . import BaseTestCase
@ -178,6 +179,99 @@ class PatternTestCase(BaseTestCase):
                          ['/etc/passwd', '/etc/hosts', '/var/log/messages', '/var/log/dmesg'])


+@pytest.mark.skipif(sys.platform.startswith('darwin'), reason='all but OS X test')
+class IncludePatternNonAsciiTestCase(BaseTestCase):
+    def testComposedUnicode(self):
+        pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
+        i = IncludePattern(pattern)
+
+        assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert not i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+
+    def testDecomposedUnicode(self):
+        pattern = 'ba\N{COMBINING ACUTE ACCENT}'
+        i = IncludePattern(pattern)
+
+        assert not i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+    
+    def testInvalidUnicode(self):
+        pattern = str(b'ba\x80', 'latin1')
+        i = IncludePattern(pattern)
+
+        assert not i.match("ba/foo")
+        assert i.match(str(b"ba\x80/foo", 'latin1'))
+
+
+@pytest.mark.skipif(sys.platform.startswith('darwin'), reason='all but OS X test')
+class ExcludePatternNonAsciiTestCase(BaseTestCase):
+    def testComposedUnicode(self):
+        pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
+        e = ExcludePattern(pattern)
+
+        assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert not e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+
+    def testDecomposedUnicode(self):
+        pattern = 'ba\N{COMBINING ACUTE ACCENT}'
+        e = ExcludePattern(pattern)
+
+        assert not e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+    
+    def testInvalidUnicode(self):
+        pattern = str(b'ba\x80', 'latin1')
+        e = ExcludePattern(pattern)
+
+        assert not e.match("ba/foo")
+        assert e.match(str(b"ba\x80/foo", 'latin1'))
+
+#@pytest.mark.skipif(sys.platform.startswith('darwin'), reason='OS X only test')
+class OSXPatternNormalizationTestCase(BaseTestCase):
+    # monkey patch sys.platform to allow testing on non-OSX during development
+    # remove and uncomment OSX-only decorator before push
+    def setUp(self):
+        self.oldplatform = sys.platform
+        sys.platform = 'darwin'
+        pass
+
+    # monkey patch sys.platform to allow testing on non-OSX during development
+    # remove and uncomment OSX-only decorator before push
+    def tearDown(self):
+        sys.platform = self.oldplatform
+        pass
+        
+    def testComposedUnicode(self):
+        pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
+        i = IncludePattern(pattern)
+        e = ExcludePattern(pattern)
+
+        assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+        assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+    
+    def testDecomposedUnicode(self):
+        pattern = 'ba\N{COMBINING ACUTE ACCENT}'
+        i = IncludePattern(pattern)
+        e = ExcludePattern(pattern)
+
+        assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+        assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
+        assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")
+    
+    def testInvalidUnicode(self):
+        pattern = str(b'ba\x80', 'latin1')
+        i = IncludePattern(pattern)
+        e = ExcludePattern(pattern)
+
+        assert not i.match("ba/foo")
+        assert i.match(str(b"ba\x80/foo", 'latin1'))
+        assert not e.match("ba/foo")
+        assert e.match(str(b"ba\x80/foo", 'latin1'))
+
+
 def test_compression_specs():
    with pytest.raises(ValueError):
        CompressionSpec('')