Skip to content

Commit

Permalink
pythonGH-102613: Improve performance of pathlib.Path.rglob() (pythonGH-104244)
Browse files Browse the repository at this point in the history

Stop de-duplicating results in `_RecursiveWildcardSelector`. A new
`_DoubleRecursiveWildcardSelector` class is introduced which performs
de-duplication, but this is used _only_ for patterns with multiple
non-adjacent `**` segments, such as `path.glob('**/foo/**')`. By avoiding
the use of a set, `PurePath.__hash__()` is not called, and so paths do not
need to be stringified and case-normalised.

Also merge adjacent '**' segments in patterns.
  • Loading branch information
barneygale authored and jbower-fb committed May 8, 2023
1 parent 44914e0 commit f6943eb
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 18 deletions.
54 changes: 37 additions & 17 deletions Lib/pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,25 @@ def _is_case_sensitive(flavour):
@functools.lru_cache()
def _make_selector(pattern_parts, flavour, case_sensitive):
pat = pattern_parts[0]
child_parts = pattern_parts[1:]
if not pat:
return _TerminatingSelector()
if pat == '**':
cls = _RecursiveWildcardSelector
elif pat == '..':
cls = _ParentSelector
elif '**' in pat:
raise ValueError("Invalid pattern: '**' can only be an entire path component")
child_parts_idx = 1
while child_parts_idx < len(pattern_parts) and pattern_parts[child_parts_idx] == '**':
child_parts_idx += 1
child_parts = pattern_parts[child_parts_idx:]
if '**' in child_parts:
cls = _DoubleRecursiveWildcardSelector
else:
cls = _RecursiveWildcardSelector
else:
cls = _WildcardSelector
child_parts = pattern_parts[1:]
if pat == '..':
cls = _ParentSelector
elif '**' in pat:
raise ValueError("Invalid pattern: '**' can only be an entire path component")
else:
cls = _WildcardSelector
return cls(pat, child_parts, flavour, case_sensitive)


Expand Down Expand Up @@ -183,20 +191,32 @@ def _iterate_directories(self, parent_path, scandir):

def _select_from(self, parent_path, scandir):
try:
yielded = set()
try:
successor_select = self.successor._select_from
for starting_point in self._iterate_directories(parent_path, scandir):
for p in successor_select(starting_point, scandir):
if p not in yielded:
yield p
yielded.add(p)
finally:
yielded.clear()
successor_select = self.successor._select_from
for starting_point in self._iterate_directories(parent_path, scandir):
for p in successor_select(starting_point, scandir):
yield p
except PermissionError:
return


class _DoubleRecursiveWildcardSelector(_RecursiveWildcardSelector):
    """
    Recursive '**' selector that suppresses repeated results.

    Behaves like _RecursiveWildcardSelector, except that each path produced
    by the successive selectors is yielded at most once. This is required
    when the pattern contains multiple non-adjacent '**' segments.
    """

    def _select_from(self, parent_path, scandir):
        # Paths already handed to the caller during this traversal.
        seen = set()
        try:
            for path in super()._select_from(parent_path, scandir):
                if path in seen:
                    continue
                # Yield first, then record: a path only counts as seen once
                # the consumer has resumed the generator after receiving it.
                yield path
                seen.add(path)
        finally:
            # Empty the set as soon as the generator finishes or is closed.
            seen.clear()


#
# Public API
#
Expand Down
6 changes: 5 additions & 1 deletion Lib/test/test_pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -1853,13 +1853,14 @@ def _check(path, pattern, case_sensitive, expected):

def test_rglob_common(self):
def _check(glob, expected):
self.assertEqual(set(glob), { P(BASE, q) for q in expected })
self.assertEqual(sorted(glob), sorted(P(BASE, q) for q in expected))
P = self.cls
p = P(BASE)
it = p.rglob("fileA")
self.assertIsInstance(it, collections.abc.Iterator)
_check(it, ["fileA"])
_check(p.rglob("fileB"), ["dirB/fileB"])
_check(p.rglob("**/fileB"), ["dirB/fileB"])
_check(p.rglob("*/fileA"), [])
if not os_helper.can_symlink():
_check(p.rglob("*/fileB"), ["dirB/fileB"])
Expand All @@ -1883,9 +1884,12 @@ def _check(glob, expected):
_check(p.rglob("*"), ["dirC/fileC", "dirC/novel.txt",
"dirC/dirD", "dirC/dirD/fileD"])
_check(p.rglob("file*"), ["dirC/fileC", "dirC/dirD/fileD"])
_check(p.rglob("**/file*"), ["dirC/fileC", "dirC/dirD/fileD"])
_check(p.rglob("dir*/**"), ["dirC/dirD"])
_check(p.rglob("*/*"), ["dirC/dirD/fileD"])
_check(p.rglob("*/"), ["dirC/dirD"])
_check(p.rglob(""), ["dirC", "dirC/dirD"])
_check(p.rglob("**"), ["dirC", "dirC/dirD"])
# gh-91616, a re module regression
_check(p.rglob("*.txt"), ["dirC/novel.txt"])
_check(p.rglob("*.*"), ["dirC/novel.txt"])
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Improve performance of :meth:`pathlib.Path.glob` when expanding recursive
wildcards ("``**``") by merging adjacent wildcards and de-duplicating
results only when necessary.

0 comments on commit f6943eb

Please sign in to comment.