Skip to content

Commit

Permalink
GH-72904: Add optional *seps* argument to fnmatch.translate()
Browse files Browse the repository at this point in the history
If a sequence of path separators is given to the new argument,
`translate()` produces a pattern that matches similarly to
`pathlib.Path.glob()`. Specifically:

- A `*` pattern segment matches precisely one path segment.
- A `**` pattern segment matches any number of path segments.
- If `**` appears in any other position within the pattern, `ValueError` is
  raised.
- `*` and `?` wildcards in other positions don't match path separators.

This change allows us to factor out a lot of complex code in pathlib.
  • Loading branch information
barneygale committed Jul 12, 2023
1 parent e4b88c1 commit bbfd404
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 125 deletions.
17 changes: 16 additions & 1 deletion Doc/library/fnmatch.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`,
``[n for n in names if fnmatch(n, pattern)]``, but implemented more efficiently.


.. function:: translate(pattern)
.. function:: translate(pattern, seps=None)

Return the shell-style *pattern* converted to a regular expression for
use with :func:`re.match`.
Expand All @@ -98,6 +98,21 @@ cache the compiled regex patterns in the following functions: :func:`fnmatch`,
>>> reobj.match('foobar.txt')
<re.Match object; span=(0, 10), match='foobar.txt'>

A sequence of path separator characters may be supplied to the *seps*
argument. If given, the separators are used to split the pattern into
segments, where:

- A ``*`` pattern segment matches precisely one path segment.
- A ``**`` pattern segment matches any number of path segments.
- If ``**`` appears in any other position within the pattern,
:exc:`ValueError` is raised.
- ``*`` and ``?`` wildcards in other positions don't match path separators.

This closely approximates the matching rules of the :mod:`glob` module.

.. versionchanged:: 3.13
The *seps* parameter was added.


.. seealso::

Expand Down
34 changes: 30 additions & 4 deletions Lib/fnmatch.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,19 @@ def fnmatchcase(name, pat):
return match(name) is not None


def translate(pat):
def translate(pat, seps=None):
"""Translate a shell PATTERN to a regular expression.
There is no way to quote meta-characters.
"""

STAR = object()
if seps:
SEPS = re.escape(seps)
DOT = f'[^{SEPS}]'
else:
SEPS = None
DOT = '.'
res = []
add = res.append
i, n = 0, len(pat)
Expand All @@ -86,10 +92,30 @@ def translate(pat):
i = i+1
if c == '*':
# compress consecutive `*` into one
if (not res) or res[-1] is not STAR:
h = i - 1
while i < n and pat[i] == '*':
i = i + 1

if seps:
star_count = i - h
is_segment = (h == 0 or pat[h - 1] in seps) and (i == n or pat[i] in seps)
if star_count == 1:
if is_segment:
add(f'{DOT}+')
else:
add(f'{DOT}*')
elif star_count == 2 and is_segment:
if i == n:
add('.*')
else:
add(f'(.*[{SEPS}])?')
i += 1
else:
raise ValueError("Invalid pattern: '**' can only be an entire path component")
else:
add(STAR)
elif c == '?':
add('.')
add(DOT)
elif c == '[':
j = i
if j < n and pat[j] == '!':
Expand Down Expand Up @@ -136,7 +162,7 @@ def translate(pat):
add('(?!)')
elif stuff == '!':
# Negated empty range: match any character.
add('.')
add(DOT)
else:
if stuff[0] == '!':
stuff = '^' + stuff[1:]
Expand Down
157 changes: 37 additions & 120 deletions Lib/pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,78 +64,12 @@ def _is_case_sensitive(flavour):
#


# fnmatch.translate() returns a regular expression that includes a prefix and
# a suffix, which enable matching newlines and ensure the end of the string is
# matched, respectively. These features are undesirable for our implementation
# of PurePath.match(), which represents path separators as newlines and joins
# pattern segments together. As a workaround, we define a slice object that
# can remove the prefix and suffix from any translate() result. See the
# _compile_pattern_lines() function for more details.
_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
_SWAP_SEP_AND_NEWLINE = {
'/': str.maketrans({'/': '\n', '\n': '/'}),
'\\': str.maketrans({'\\': '\n', '\n': '\\'}),
}


@functools.lru_cache(maxsize=256)
def _compile_pattern(pat, case_sensitive):
def _compile_pattern(pat, sep, case_sensitive):
"""Compile given glob pattern to a re.Pattern object (observing case
sensitivity), or None if the pattern should match everything."""
if pat == '*':
return None
flags = re.NOFLAG if case_sensitive else re.IGNORECASE
return re.compile(fnmatch.translate(pat), flags).match


@functools.lru_cache()
def _compile_pattern_lines(pattern_lines, case_sensitive):
"""Compile the given pattern lines to an `re.Pattern` object.
The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with
its path separators and newlines swapped (e.g. '**\n*.py`). By using
newlines to separate path components, and not setting `re.DOTALL`, we
ensure that the `*` wildcard cannot match path separators.
The returned `re.Pattern` object may have its `match()` method called to
match a complete pattern, or `search()` to match from the right. The
argument supplied to these methods must also have its path separators and
newlines swapped.
"""

# Match the start of the path, or just after a path separator
parts = ['^']
for part in pattern_lines.splitlines(keepends=True):
if part == '*\n':
part = r'.+\n'
elif part == '*':
part = r'.+'
elif part == '**\n':
# '**/' component: we use '[\s\S]' rather than '.' so that path
# separators (i.e. newlines) are matched. The trailing '^' ensures
# we terminate after a path separator (i.e. on a new line).
part = r'[\s\S]*^'
elif part == '**':
# '**' component.
part = r'[\s\S]*'
elif '**' in part:
raise ValueError("Invalid pattern: '**' can only be an entire path component")
else:
# Any other component: pass to fnmatch.translate(). We slice off
# the common prefix and suffix added by translate() to ensure that
# re.DOTALL is not set, and the end of the string not matched,
# respectively. With DOTALL not set, '*' wildcards will not match
# path separators, because the '.' characters in the pattern will
# not match newlines.
part = fnmatch.translate(part)[_FNMATCH_SLICE]
parts.append(part)
# Match the end of the path, always.
parts.append(r'\Z')
flags = re.MULTILINE
if not case_sensitive:
flags |= re.IGNORECASE
return re.compile(''.join(parts), flags=flags)
return re.compile(fnmatch.translate(pat, sep), flags).match


def _select_children(parent_paths, dir_only, follow_symlinks, match):
Expand All @@ -159,7 +93,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
except OSError:
continue
name = entry.name
if match is None or match(name):
if match(name):
yield parent_path._make_child_relpath(name)


Expand Down Expand Up @@ -196,7 +130,7 @@ def _select_unique(paths):
yielded = set()
try:
for path in paths:
path_str = str(path)
path_str = path._str
if path_str not in yielded:
yield path
yielded.add(path_str)
Expand Down Expand Up @@ -268,10 +202,10 @@ class PurePath:
# tail are normalized.
'_drv', '_root', '_tail_cached',

# The `_str` slot stores the string representation of the path,
# The `_str_cached` slot stores the string representation of the path,
# computed from the drive, root and tail when `__str__()` is called
# for the first time. It's used to implement `_str_normcase`
'_str',
'_str_cached',

# The `_str_normcase_cached` slot stores the string path with
# normalized case. It is set when the `_str_normcase` property is
Expand All @@ -285,10 +219,6 @@ class PurePath:
# to implement comparison methods like `__lt__()`.
'_parts_normcase_cached',

# The `_lines_cached` slot stores the string path with path separators
# and newlines swapped. This is used to implement `match()`.
'_lines_cached',

# The `_hash` slot stores the hash of the case-normalized string
# path. It's set when `__hash__()` is called for the first time.
'_hash',
Expand Down Expand Up @@ -375,7 +305,7 @@ def _load_parts(self):
def _from_parsed_parts(self, drv, root, tail):
path_str = self._format_parsed_parts(drv, root, tail)
path = self.with_segments(path_str)
path._str = path_str or '.'
path._str_cached = path_str
path._drv = drv
path._root = root
path._tail_cached = tail
Expand All @@ -392,12 +322,7 @@ def _format_parsed_parts(cls, drv, root, tail):
def __str__(self):
"""Return the string representation of the path, suitable for
passing to system calls."""
try:
return self._str
except AttributeError:
self._str = self._format_parsed_parts(self.drive, self.root,
self._tail) or '.'
return self._str
return self._str or '.'

def __fspath__(self):
return str(self)
Expand Down Expand Up @@ -436,16 +361,25 @@ def as_uri(self):
path = str(self)
return prefix + urlquote_from_bytes(os.fsencode(path))

@property
def _str(self):
try:
return self._str_cached
except AttributeError:
self._str_cached = self._format_parsed_parts(
self.drive, self.root, self._tail)
return self._str_cached

@property
def _str_normcase(self):
# String with normalized case, for hashing and equality checks
try:
return self._str_normcase_cached
except AttributeError:
if _is_case_sensitive(self._flavour):
self._str_normcase_cached = str(self)
self._str_normcase_cached = self._str
else:
self._str_normcase_cached = str(self).lower()
self._str_normcase_cached = self._str.lower()
return self._str_normcase_cached

@property
Expand All @@ -457,20 +391,6 @@ def _parts_normcase(self):
self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep)
return self._parts_normcase_cached

@property
def _lines(self):
# Path with separators and newlines swapped, for pattern matching.
try:
return self._lines_cached
except AttributeError:
path_str = str(self)
if path_str == '.':
self._lines_cached = ''
else:
trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep]
self._lines_cached = path_str.translate(trans)
return self._lines_cached

def __eq__(self, other):
if not isinstance(other, PurePath):
return NotImplemented
Expand Down Expand Up @@ -738,13 +658,16 @@ def match(self, path_pattern, *, case_sensitive=None):
path_pattern = self.with_segments(path_pattern)
if case_sensitive is None:
case_sensitive = _is_case_sensitive(self._flavour)
pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
sep = path_pattern._flavour.sep
pattern_str = path_pattern._str
if path_pattern.drive or path_pattern.root:
return pattern.match(self._lines) is not None
pass
elif path_pattern._tail:
return pattern.search(self._lines) is not None
pattern_str = f'**{sep}{pattern_str}'
else:
raise ValueError("empty pattern")
match = _compile_pattern(pattern_str, sep, case_sensitive)
return match(self._str) is not None


# Subclassing os.PathLike makes isinstance() checks slower,
Expand Down Expand Up @@ -1017,26 +940,19 @@ def _scandir(self):
return os.scandir(self)

def _make_child_relpath(self, name):
sep = self._flavour.sep
lines_name = name.replace('\n', sep)
lines_str = self._lines
path_str = str(self)
path_str = self._str
tail = self._tail
if tail:
path_str = f'{path_str}{sep}{name}'
lines_str = f'{lines_str}\n{lines_name}'
elif path_str != '.':
path_str = f'{path_str}{self._flavour.sep}{name}'
elif path_str:
path_str = f'{path_str}{name}'
lines_str = f'{lines_str}{lines_name}'
else:
path_str = name
lines_str = lines_name
path = self.with_segments(path_str)
path._str = path_str
path._str_cached = path_str
path._drv = self.drive
path._root = self.root
path._tail_cached = tail + [name]
path._lines_cached = lines_str
return path

def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
Expand Down Expand Up @@ -1082,6 +998,7 @@ def _glob(self, pattern, case_sensitive, follow_symlinks):
# do not perform any filesystem access, which can be much faster!
filter_paths = follow_symlinks is not None and '..' not in pattern_parts
deduplicate_paths = False
sep = self._flavour.sep
paths = iter([self] if self.is_dir() else [])
part_idx = 0
while part_idx < len(pattern_parts):
Expand All @@ -1102,9 +1019,9 @@ def _glob(self, pattern, case_sensitive, follow_symlinks):
paths = _select_recursive(paths, dir_only, follow_symlinks)

# Filter out paths that don't match pattern.
prefix_len = len(self._make_child_relpath('_')._lines) - 1
match = _compile_pattern_lines(path_pattern._lines, case_sensitive).match
paths = (path for path in paths if match(path._lines[prefix_len:]))
prefix_len = len(self._make_child_relpath('_')._str) - 1
match = _compile_pattern(path_pattern._str, sep, case_sensitive)
paths = (path for path in paths if match(path._str[prefix_len:]))
return paths

dir_only = part_idx < len(pattern_parts)
Expand All @@ -1117,7 +1034,7 @@ def _glob(self, pattern, case_sensitive, follow_symlinks):
raise ValueError("Invalid pattern: '**' can only be an entire path component")
else:
dir_only = part_idx < len(pattern_parts)
match = _compile_pattern(part, case_sensitive)
match = _compile_pattern(part, sep, case_sensitive)
paths = _select_children(paths, dir_only, follow_symlinks, match)
return paths

Expand Down Expand Up @@ -1210,11 +1127,11 @@ def absolute(self):
# Fast path for "empty" paths, e.g. Path("."), Path("") or Path().
# We pass only one argument to with_segments() to avoid the cost
# of joining, and we exploit the fact that getcwd() returns a
# fully-normalized string by storing it in _str. This is used to
# implement Path.cwd().
# fully-normalized string by storing it in _str_cached. This is
# used to implement Path.cwd().
if not self.root and not self._tail:
result = self.with_segments(cwd)
result._str = cwd
result._str_cached = cwd
return result
return self.with_segments(cwd, self)

Expand Down
Loading

0 comments on commit bbfd404

Please sign in to comment.