Normalize paths before pattern matching on OS X

The OS X file system HFS+ stores path names as Unicode, and converts
them to a variant of Unicode NFD for storage.  Because path names will
always be in this canonical form, it's not friendly to require users to
match this form exactly.  Convert paths from the repository and patterns
from the command line to NFD before comparing them.

Unix (and Windows, I think) file systems don't convert path names into a
canonical form, so users there will still have to match the path name
they want exactly: two paths can look identical on screen while actually
being composed of different byte sequences.
This commit is contained in:
Ed Blackman 2015-09-08 23:33:34 -04:00
parent da5923ec04
commit d9fb1d2b03
2 changed files with 132 additions and 7 deletions

View File

@ -7,6 +7,8 @@ import pwd
import re
import sys
import time
import unicodedata
from datetime import datetime, timezone, timedelta
from fnmatch import translate
from operator import attrgetter
@ -220,6 +222,10 @@ def exclude_path(path, patterns):
# unify the two cases, we add a path separator to the end of
# the path before matching.
##### !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
##### For discussion only, don't merge this code!
##### !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
class IncludePattern:
"""Literal files or directories listed on the command line
for some operations (e.g. extract, but not create).
@ -227,10 +233,22 @@ class IncludePattern:
path match as well. A trailing slash makes no difference.
"""
def __init__(self, pattern):
self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep
def match(path):
return (path+os.path.sep).startswith(self.pattern)
def match(self, path):
return (path+os.path.sep).startswith(self.pattern)
# HFS+ converts paths to a canonical form, so users shouldn't be
# required to enter an exact match
if sys.platform in ('darwin',):
# repository paths will be mostly in NFD, as the OSX exception list
# to NFD is small, so normalize to that form for best performance
pattern = unicodedata.normalize("NFD", pattern)
self.match = lambda p: match(unicodedata.normalize("NFD", p))
# Windows and Unix filesystems allow different forms, so users
# always have to enter an exact match
else:
self.match = match
self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep
def __repr__(self):
return '%s(%s)' % (type(self), self.pattern)
@ -241,17 +259,30 @@ class ExcludePattern(IncludePattern):
exclude the contents of a directory, but not the directory itself.
"""
def __init__(self, pattern):
def match(path):
return self.regex.match(path+os.path.sep) is not None
if pattern.endswith(os.path.sep):
self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep+'*'+os.path.sep
else:
self.pattern = os.path.normpath(pattern)+os.path.sep+'*'
# HFS+ converts paths to a canonical form, so users shouldn't be
# required to enter an exact match
if sys.platform in ('darwin',):
# repository paths will be mostly in NFD, as the OSX exception list
# to NFD is small, so normalize to that form for best performance
self.pattern = unicodedata.normalize("NFD", self.pattern)
self.match = lambda p: match(unicodedata.normalize("NFD", p))
# Windows and Unix filesystems allow different forms, so users
# always have to enter an exact match
else:
self.match = match
# fnmatch and re.match both cache compiled regular expressions.
# Nevertheless, this is about 10 times faster.
self.regex = re.compile(translate(self.pattern))
def match(self, path):
return self.regex.match(path+os.path.sep) is not None
def __repr__(self):
return '%s(%s)' % (type(self), self.pattern)

View File

@ -3,9 +3,10 @@ from time import mktime, strptime
from datetime import datetime, timezone, timedelta
import pytest
import sys
import msgpack
from ..helpers import adjust_patterns, exclude_path, Location, format_timedelta, ExcludePattern, make_path_safe, \
from ..helpers import adjust_patterns, exclude_path, Location, format_timedelta, IncludePattern, ExcludePattern, make_path_safe, \
prune_within, prune_split, \
StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams
from . import BaseTestCase
@ -178,6 +179,99 @@ class PatternTestCase(BaseTestCase):
['/etc/passwd', '/etc/hosts', '/var/log/messages', '/var/log/dmesg'])
@pytest.mark.skipif(sys.platform.startswith('darwin'), reason='all but OS X test')
class IncludePatternNonAsciiTestCase(BaseTestCase):
    """IncludePattern matching of non-ASCII paths; skipped on darwin.

    Each test checks that a pattern matches a path spelled with the same
    code points and rejects a path spelled differently.
    """

    def testComposedUnicode(self):
        # Pattern uses the precomposed character: only the precomposed path matches.
        matcher = IncludePattern('b\N{LATIN SMALL LETTER A WITH ACUTE}')

        assert matcher.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
        assert not matcher.match("ba\N{COMBINING ACUTE ACCENT}/foo")

    def testDecomposedUnicode(self):
        # Pattern uses base letter + combining accent: only the decomposed path matches.
        matcher = IncludePattern('ba\N{COMBINING ACUTE ACCENT}')

        assert not matcher.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
        assert matcher.match("ba\N{COMBINING ACUTE ACCENT}/foo")

    def testInvalidUnicode(self):
        # Byte 0x80 decoded as latin1 must match itself and must not match
        # the path that lacks it.
        raw_pattern = str(b'ba\x80', 'latin1')
        matcher = IncludePattern(raw_pattern)

        assert not matcher.match("ba/foo")
        assert matcher.match(str(b"ba\x80/foo", 'latin1'))
@pytest.mark.skipif(sys.platform.startswith('darwin'), reason='all but OS X test')
class ExcludePatternNonAsciiTestCase(BaseTestCase):
    """ExcludePattern matching of non-ASCII paths; skipped on darwin.

    Each test checks that a pattern matches a path spelled with the same
    code points and rejects a path spelled differently.
    """

    def testComposedUnicode(self):
        # Pattern uses the precomposed character: only the precomposed path matches.
        excluder = ExcludePattern('b\N{LATIN SMALL LETTER A WITH ACUTE}')

        assert excluder.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
        assert not excluder.match("ba\N{COMBINING ACUTE ACCENT}/foo")

    def testDecomposedUnicode(self):
        # Pattern uses base letter + combining accent: only the decomposed path matches.
        excluder = ExcludePattern('ba\N{COMBINING ACUTE ACCENT}')

        assert not excluder.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
        assert excluder.match("ba\N{COMBINING ACUTE ACCENT}/foo")

    def testInvalidUnicode(self):
        # Byte 0x80 decoded as latin1 must match itself and must not match
        # the path that lacks it.
        raw_pattern = str(b'ba\x80', 'latin1')
        excluder = ExcludePattern(raw_pattern)

        assert not excluder.match("ba/foo")
        assert excluder.match(str(b"ba\x80/foo", 'latin1'))
# TODO: before push, enable this decorator and remove the sys.platform monkey
# patching in setUp/tearDown. The condition is negated so the tests are skipped
# everywhere EXCEPT OS X, which is what the reason string says.
#@pytest.mark.skipif(not sys.platform.startswith('darwin'), reason='OS X only test')
class OSXPatternNormalizationTestCase(BaseTestCase):
    """Pattern matching with sys.platform forced to 'darwin'.

    With the platform reported as darwin, both IncludePattern and
    ExcludePattern are expected to match a path regardless of whether the
    pattern or the path uses the composed or the decomposed spelling.
    """

    def setUp(self):
        # Monkey patch sys.platform to allow testing on non-OSX during
        # development; the original value is restored in tearDown.
        self.oldplatform = sys.platform
        sys.platform = 'darwin'

    def tearDown(self):
        # Undo the monkey patch applied in setUp.
        sys.platform = self.oldplatform

    def testComposedUnicode(self):
        # A precomposed pattern must match both spellings of the path.
        pattern = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
        i = IncludePattern(pattern)
        e = ExcludePattern(pattern)

        assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
        assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
        assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
        assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")

    def testDecomposedUnicode(self):
        # A decomposed pattern must match both spellings of the path.
        pattern = 'ba\N{COMBINING ACUTE ACCENT}'
        i = IncludePattern(pattern)
        e = ExcludePattern(pattern)

        assert i.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
        assert i.match("ba\N{COMBINING ACUTE ACCENT}/foo")
        assert e.match("b\N{LATIN SMALL LETTER A WITH ACUTE}/foo")
        assert e.match("ba\N{COMBINING ACUTE ACCENT}/foo")

    def testInvalidUnicode(self):
        # Byte 0x80 decoded as latin1 must still match itself on the
        # normalizing code path and must not match the path that lacks it.
        pattern = str(b'ba\x80', 'latin1')
        i = IncludePattern(pattern)
        e = ExcludePattern(pattern)

        assert not i.match("ba/foo")
        assert i.match(str(b"ba\x80/foo", 'latin1'))
        assert not e.match("ba/foo")
        assert e.match(str(b"ba\x80/foo", 'latin1'))
def test_compression_specs():
with pytest.raises(ValueError):
CompressionSpec('')