Merge pull request #193 from edgewood/osxPathNormalization

Normalize paths before pattern matching on OS X
This commit is contained in:
TW 2015-09-10 22:58:20 +02:00
commit 638204fd0e
2 changed files with 97 additions and 1 deletion

View File

@ -1,12 +1,15 @@
import argparse
import binascii
from collections import namedtuple
from functools import wraps
import grp
import os
import pwd
import re
import sys
import time
import unicodedata
from datetime import datetime, timezone, timedelta
from fnmatch import translate
from operator import attrgetter
@ -220,6 +223,23 @@ def exclude_path(path, patterns):
# unify the two cases, we add a path separator to the end of
# the path before matching.
def normalized(func):
    """Decorate a Pattern ``match`` method with OS X path normalization.

    On OS X the returned wrapper converts ``path`` to Unicode NFD form
    before passing it on to ``func``. On every other platform ``func``
    itself is returned, unchanged.
    """
    if sys.platform not in ('darwin',):
        # Windows and Unix filesystems allow different forms, so users
        # always have to enter an exact match
        return func

    # HFS+ converts paths to a canonical form, so users shouldn't be
    # required to enter an exact match
    @wraps(func)
    def normalize_wrapper(self, path):
        decomposed = unicodedata.normalize("NFD", path)
        return func(self, decomposed)

    return normalize_wrapper
class IncludePattern:
"""Literal files or directories listed on the command line
for some operations (e.g. extract, but not create).
@ -227,8 +247,12 @@ class IncludePattern:
path match as well. A trailing slash makes no difference.
"""
def __init__(self, pattern):
    """Store ``pattern`` as a normalized path ending in one separator.

    On OS X the pattern is first converted to Unicode NFD form.
    """
    on_osx = sys.platform in ('darwin',)
    text = unicodedata.normalize("NFD", pattern) if on_osx else pattern
    # Collapse the path, strip any trailing separators, then append
    # exactly one separator.
    stem = os.path.normpath(text).rstrip(os.path.sep)
    self.pattern = stem + os.path.sep
@normalized
def match(self, path):
    """Return True if ``path``, with a separator appended, starts with
    the stored pattern."""
    candidate = path + os.path.sep
    return candidate.startswith(self.pattern)
@ -245,10 +269,15 @@ class ExcludePattern(IncludePattern):
self.pattern = os.path.normpath(pattern).rstrip(os.path.sep)+os.path.sep+'*'+os.path.sep
else:
self.pattern = os.path.normpath(pattern)+os.path.sep+'*'
if sys.platform in ('darwin',):
self.pattern = unicodedata.normalize("NFD", self.pattern)
# fnmatch and re.match both cache compiled regular expressions.
# Nevertheless, this is about 10 times faster.
self.regex = re.compile(translate(self.pattern))
@normalized
def match(self, path):
    """Return True if the compiled pattern regex matches ``path`` with a
    separator appended."""
    found = self.regex.match(path + os.path.sep)
    return found is not None

View File

@ -3,9 +3,10 @@ from time import mktime, strptime
from datetime import datetime, timezone, timedelta
import pytest
import sys
import msgpack
from ..helpers import adjust_patterns, exclude_path, Location, format_timedelta, ExcludePattern, make_path_safe, \
from ..helpers import adjust_patterns, exclude_path, Location, format_timedelta, IncludePattern, ExcludePattern, make_path_safe, \
prune_within, prune_split, \
StableDict, int_to_bigint, bigint_to_int, parse_timestamp, CompressionSpec, ChunkerParams
from . import BaseTestCase
@ -178,6 +179,72 @@ class PatternTestCase(BaseTestCase):
['/etc/passwd', '/etc/hosts', '/var/log/messages', '/var/log/dmesg'])
@pytest.mark.skipif(sys.platform in ('darwin',), reason='all but OS X test')
class PatternNonAsciiTestCase(BaseTestCase):
    """Non-ASCII pattern matching on every platform except OS X.

    These tests expect exact matching: a composed pattern must not match
    a decomposed path, and vice versa.
    """

    def testComposedUnicode(self):
        """A precomposed pattern matches only the precomposed path."""
        composed = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
        decomposed = 'ba\N{COMBINING ACUTE ACCENT}'
        for matcher in (IncludePattern(composed), ExcludePattern(composed)):
            assert matcher.match(composed + "/foo")
            assert not matcher.match(decomposed + "/foo")

    def testDecomposedUnicode(self):
        """A decomposed pattern matches only the decomposed path."""
        composed = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
        decomposed = 'ba\N{COMBINING ACUTE ACCENT}'
        for matcher in (IncludePattern(decomposed), ExcludePattern(decomposed)):
            assert not matcher.match(composed + "/foo")
            assert matcher.match(decomposed + "/foo")

    def testInvalidUnicode(self):
        """A pattern built from the latin1-decoded byte 0x80 matches only
        a path containing that same character."""
        odd = str(b'ba\x80', 'latin1')
        for matcher in (IncludePattern(odd), ExcludePattern(odd)):
            assert not matcher.match("ba/foo")
            assert matcher.match(odd + "/foo")
@pytest.mark.skipif(sys.platform not in ('darwin',), reason='OS X test')
class OSXPatternNormalizationTestCase(BaseTestCase):
    """Non-ASCII pattern matching on OS X only.

    These tests expect composed and decomposed spellings of the same
    name to match each other, whichever form the pattern was given in.
    """

    def testComposedUnicode(self):
        """A precomposed pattern matches both spellings of the path."""
        composed = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
        decomposed = 'ba\N{COMBINING ACUTE ACCENT}'
        for matcher in (IncludePattern(composed), ExcludePattern(composed)):
            assert matcher.match(composed + "/foo")
            assert matcher.match(decomposed + "/foo")

    def testDecomposedUnicode(self):
        """A decomposed pattern matches both spellings of the path."""
        composed = 'b\N{LATIN SMALL LETTER A WITH ACUTE}'
        decomposed = 'ba\N{COMBINING ACUTE ACCENT}'
        for matcher in (IncludePattern(decomposed), ExcludePattern(decomposed)):
            assert matcher.match(composed + "/foo")
            assert matcher.match(decomposed + "/foo")

    def testInvalidUnicode(self):
        """A pattern built from the latin1-decoded byte 0x80 matches only
        a path containing that same character."""
        odd = str(b'ba\x80', 'latin1')
        for matcher in (IncludePattern(odd), ExcludePattern(odd)):
            assert not matcher.match("ba/foo")
            assert matcher.match(odd + "/foo")
def test_compression_specs():
with pytest.raises(ValueError):
CompressionSpec('')