diff --git a/datalad_next/commands/ls_file_collection.py b/datalad_next/commands/ls_file_collection.py index 181fe7e5c..cbc0aa867 100644 --- a/datalad_next/commands/ls_file_collection.py +++ b/datalad_next/commands/ls_file_collection.py @@ -52,6 +52,11 @@ FileSystemItemType, compute_multihash_from_fp, ) +from datalad_next.iter_collections.gitworktree import ( + GitTreeItemType, + GitWorktreeFileSystemItem, + iter_gitworktree, +) lgr = getLogger('datalad.local.ls_file_collection') @@ -64,6 +69,7 @@ _supported_collection_types = ( 'directory', 'tarfile', + 'gitworktree', ) @@ -104,7 +110,7 @@ def get_collection_iter(self, **kwargs): hash = kwargs['hash'] iter_fx = None iter_kwargs = None - if type in ('directory', 'tarfile'): + if type in ('directory', 'tarfile', 'gitworktree'): if not isinstance(collection, Path): self.raise_for( kwargs, @@ -118,10 +124,16 @@ def get_collection_iter(self, **kwargs): item2res = fsitem_to_dict if type == 'directory': iter_fx = iter_dir + item2res = fsitem_to_dict elif type == 'tarfile': iter_fx = iter_tar + item2res = fsitem_to_dict + elif type == 'gitworktree': + iter_fx = iter_gitworktree + item2res = gitworktreeitem_to_dict else: - raise RuntimeError('unhandled condition') + raise RuntimeError( + 'unhandled collection-type: this is a defect, please report.') assert iter_fx is not None return dict( collection=CollectionSpec( @@ -166,6 +178,33 @@ def fsitem_to_dict(item, hash) -> Dict: return d +def gitworktreeitem_to_dict(item, hash) -> Dict: + gitworktreeitem_type_to_res_type = { + # permission bits are not distinguished for types + GitTreeItemType.executablefile: 'file', + # 'dataset' is the commonly used label as the command API + # level + GitTreeItemType.submodule: 'dataset', + } + + gittype = gitworktreeitem_type_to_res_type.get( + item.gittype, item.gittype.value) if item.gittype else None + + if isinstance(item, GitWorktreeFileSystemItem): + d = fsitem_to_dict(item, hash) + else: + d = dict(item=item.name) + if gittype is 
not None: + d['type'] = gittype + + if item.gitsha: + d['gitsha'] = item.gitsha + + if gittype is not None: + d['gittype'] = gittype + return d + + @build_doc class LsFileCollection(ValidatedInterface): """Report information on files in a collection @@ -310,19 +349,20 @@ def custom_result_renderer(res, **kwargs): 'minutes ago', 'min ago').replace( 'seconds ago', 'sec ago') - ui.message('{mode} {size: >6} {uid: >4}:{gid: >4} {hts: >11} {item} ({type})'.format( + # stick with numerical IDs (although less accessible), we cannot + # know in general whether this particular system can map numerical + # IDs to valid target names (think stored name in tarballs) + owner_info = f'{res["uid"]}:{res["gid"]}' if 'uid' in res else '' + + ui.message('{mode} {size: >6} {owner: >9} {hts: >11} {item} ({type})'.format( mode=mode, size=size, - # stick with numerical IDs (although less accessible), we cannot - # know in general whether this particular system can map numerical - # IDs to valid target names (think stored name in tarballs) - uid=res.get('uid', '-'), - gid=res.get('gid', '-'), - hts=hts, + owner=owner_info, + hts=hts if mtime else '', item=ac.color_word( - res.get('item', ''), + res.get('item', ''), ac.BOLD), type=ac.color_word( - res.get('type', ''), + res.get('type', ''), ac.MAGENTA), )) diff --git a/datalad_next/commands/tests/test_ls_file_collection.py b/datalad_next/commands/tests/test_ls_file_collection.py index 2001f1da7..2455af7f4 100644 --- a/datalad_next/commands/tests/test_ls_file_collection.py +++ b/datalad_next/commands/tests/test_ls_file_collection.py @@ -63,6 +63,20 @@ def test_ls_file_collection_directory(tmp_path): assert len(res) == 0 +def test_ls_file_collection_gitworktree(existing_dataset): + kwa = dict(result_renderer='disabled') + # smoke test on a plain dataset + res = ls_file_collection('gitworktree', existing_dataset.pathobj, **kwa) + assert len(res) > 1 + assert all('gitsha' in r for r in res) + + # and with hashing + res_hash = 
ls_file_collection('gitworktree', existing_dataset.pathobj, + hash='md5', **kwa) + assert len(res) == len(res_hash) + assert all('hash-md5' in r for r in res_hash) + + def test_ls_file_collection_validator(): val = LsFileCollectionParamValidator() diff --git a/datalad_next/iter_collections/__init__.py b/datalad_next/iter_collections/__init__.py index a08c67ff4..e83816b86 100644 --- a/datalad_next/iter_collections/__init__.py +++ b/datalad_next/iter_collections/__init__.py @@ -18,6 +18,7 @@ :toctree: generated directory + gitworktree tarfile zipfile utils diff --git a/datalad_next/iter_collections/gitworktree.py b/datalad_next/iter_collections/gitworktree.py new file mode 100644 index 000000000..16350c04f --- /dev/null +++ b/datalad_next/iter_collections/gitworktree.py @@ -0,0 +1,273 @@ +"""Report on the content of a Git repository worktree + +The main functionality is provided by the :func:`iter_gitworktree()` function. +""" +from __future__ import annotations + +from dataclasses import dataclass +from enum import Enum +from itertools import chain +import logging +from pathlib import ( + Path, + PurePath, + PurePosixPath, +) +import re +from typing import ( + Dict, + Generator, + Tuple, +) + +from datalad_next.runners import ( + DEVNULL, + LineSplitter, + ThreadedRunner, + StdOutCaptureGeneratorProtocol, +) + +from .utils import ( + FileSystemItem, + FileSystemItemType, + PathBasedItem, +) + +lgr = logging.getLogger('datalad.ext.next.iter_collections.gitworktree') + + +# TODO Could be `StrEnum`, came with PY3.11 +class GitTreeItemType(Enum): + """Enumeration of item types of Git trees + """ + file = 'file' + executablefile = 'executablefile' + symlink = 'symlink' + directory = 'directory' + submodule = 'submodule' + + +# TODO maybe establish GitTreeItem and derive from that +@dataclass +class GitWorktreeItem(PathBasedItem): + name: PurePosixPath + # gitsha is not the sha1 of the file content, but the output + # of `git hash-object` which does something like + # 
`printf "blob $(wc -c < "$file_name")\0$(cat "$file_name")" | sha1sum` + gitsha: str | None = None + gittype: GitTreeItemType | None = None + + +@dataclass +class GitWorktreeFileSystemItem(FileSystemItem): + # gitsha is not the sha1 of the file content, but the output + # of `git hash-object` which does something like + # `printf "blob $(wc -c < "$file_name")\0$(cat "$file_name")" | sha1sum` + gitsha: str | None = None + gittype: GitTreeItemType | None = None + + +# stolen from GitRepo.get_content_info() +_lsfiles_props_re = re.compile( + r'(?P<mode>[0-9]+) (?P<gitsha>.*) (.*)\t(?P<fname>.*)$' +) + +_mode_type_map = { + '100644': GitTreeItemType.file, + '100755': GitTreeItemType.executablefile, + '040000': GitTreeItemType.directory, + '120000': GitTreeItemType.symlink, + '160000': GitTreeItemType.submodule, +} + +lsfiles_untracked_args = { + 'all': + ('--exclude-standard', '--others',), + 'whole-dir': + ('--exclude-standard', '--others', '--directory'), + 'no-empty-dir': + ('--exclude-standard', '--others', '--directory', '--no-empty-directory'), +} + + +def iter_gitworktree( + path: Path, + *, + untracked: str | None = 'all', + link_target: bool = False, + fp: bool = False, +) -> Generator[GitWorktreeItem | GitWorktreeFileSystemItem, None, None]: + """Uses ``git ls-files`` to report on a work tree of a Git repository + + This iterator can be used to report on all tracked, and untracked content + of a Git repository's work tree. This includes files that have been removed + from the work tree (deleted), unless their removal has already been staged. + + For any tracked content, yielded items include type information and gitsha + as last known to Git. This means that such reports reflect the last + committed or staged content, not the state of a potential unstaged + modification in the work tree. + + When no reporting of link targets or file-objects are requested, items of + type :class:`GitWorktreeItem` are yielded, otherwise + :class:`GitWorktreeFileSystemItem` instances.
In both cases, ``gitsha`` and + ``gittype`` properties are provided. Either of them being ``None`` + indicates untracked work tree content. + + .. note:: + The ``gitsha`` is not equivalent to a SHA1 hash of a file's content, + but is the SHA-type blob identifier as reported and used by Git. + + Parameters + ---------- + path: Path + Path of a directory in a Git repository to report on. This directory + need not be the root directory of the repository, but must be part of + the repository's work tree. + untracked: {'all', 'whole-dir', 'no-empty-dir'} or None, optional + If not ``None``, also reports on untracked work tree content. + ``all`` reports on any untracked file; ``whole-dir`` yields a single + report for a directory that is entirely untracked, and not individual + untracked files in it; ``no-empty-dir`` skips any reports on + untracked empty directories. Any untracked content is yielded as + a ``PurePosixPath``. + link_target: bool, optional + If ``True``, the result of ``readlink`` (the link target) is reported + for each symlink-type item. Items are then yielded as + :class:`GitWorktreeFileSystemItem` instances. + fp: bool, optional + If ``True``, each file-type item includes a file-like object + to access the file's content. This file handle will be closed + automatically when the next item is yielded. + + Yields + ------ + :class:`GitWorktreeItem` or :class:`GitWorktreeFileSystemItem` + """ + lsfiles_args = ['--stage', '--cached'] + if untracked: + lsfiles_args.extend(lsfiles_untracked_args[untracked]) + + # helper to handle multi-stage reports by ls-files + pending_item = None + + # we add a "fake" `None` record at the end to avoid a special + # case for submitting the last pending item after the loop.
+ # otherwise the context manager handling of the file pointer + # would lead to lots of code duplication + for line in chain(_git_ls_files(path, *lsfiles_args), [None]): + # a bit ugly, but we need to account for the `None` record + # that signals the final loop iteration + ipath, lsfiles_props = _lsfiles_line2props(line) \ + if line is not None else (None, None) + + # yield any pending item, if the current record is not an + # addendum of it + if ipath is None or ( + pending_item is not None and pending_item[0] != ipath): + # report on a pending item, this is not a "higher-stage" + # report by ls-files + item = _get_item(path, link_target, fp, *pending_item) + if fp and item.type == FileSystemItemType.file: + with (Path(path) / item.name).open('rb') as fp: + item.fp = fp + yield item + else: + yield item + pending_item = None + + if ipath is None: + # this is the trailing `None` record. we are done here + break + + if lsfiles_props is None: + # when no properties were produced, this is a + # category "other" report (i.e., untracked content) + # the path is always relative-POSIX + pending_item = (ipath,) + else: + pending_item = ( + ipath, + _mode_type_map[lsfiles_props['mode']], + lsfiles_props['gitsha'] + ) + # do not yield immediately, wait for a possible higher-stage + # report in the next loop iteration + + +def _get_item( + basepath: Path, + link_target: bool, + fp: bool, + ipath: PurePosixPath, + type: GitTreeItemType | None = None, + gitsha: str | None = None, +) -> GitWorktreeItem | GitWorktreeFileSystemItem: + if link_target or fp: + fullpath = basepath / ipath + item = GitWorktreeFileSystemItem.from_path( + fullpath, + link_target=link_target, + ) + # make sure the name/id is the path relative to the basepath + item.name = PurePath(ipath) + if type is not None: + item.gittype = type + if gitsha is not None: + item.gitsha = gitsha + return item + else: + return GitWorktreeItem( + name=ipath, + gittype=type, + gitsha=gitsha, + ) + + +def 
_lsfiles_line2props( + line: str +) -> Tuple[PurePosixPath, Dict[str, str] | None]: + props = _lsfiles_props_re.match(line) + if not props: + # Kludge: Filter out paths starting with .git/ to work around + # an `ls-files -o` bug that was fixed in Git 2.25. + # + # TODO: Drop this condition when GIT_MIN_VERSION is at least + # 2.25. + if line.startswith(".git/"): # pragma nocover + lgr.debug("Filtering out .git/ file: %s", line) + return + # not known to Git, but Git always reports POSIX + path = PurePosixPath(line) + # early exit, we have nothing but the path (untracked) + return path, None + + # again Git reports always in POSIX + path = PurePosixPath(props.group('fname')) + return path, dict( + gitsha=props.group('gitsha'), + mode=props.group('mode'), + ) + + +def _git_ls_files(path, *args): + # we use a plain runner to avoid the overhead of a GitRepo instance + runner = ThreadedRunner( + cmd=[ + 'git', 'ls-files', + # we rely on zero-byte splitting below + '-z', + # otherwise take whatever is coming in + *args, + ], + protocol_class=StdOutCaptureGeneratorProtocol, + stdin=DEVNULL, + # run in the directory we want info on + cwd=path, + ) + line_splitter = LineSplitter('\0', keep_ends=False) + # for each command output chunk received by the runner + for content in runner.run(): + # for each zerobyte-delimited "line" in the output + for line in line_splitter.process(content.decode('utf-8')): + yield line diff --git a/datalad_next/iter_collections/tests/test_itergitworktree.py b/datalad_next/iter_collections/tests/test_itergitworktree.py new file mode 100644 index 000000000..10176404b --- /dev/null +++ b/datalad_next/iter_collections/tests/test_itergitworktree.py @@ -0,0 +1,74 @@ +from pathlib import PurePath +from ..gitworktree import ( + GitWorktreeItem, + GitWorktreeFileSystemItem, + iter_gitworktree, +) + + +def test_iter_gitworktree(existing_dataset): + ds = existing_dataset + + (ds.pathobj / 'emptydir').mkdir() + untracked = ds.pathobj / 'subdir' /
'untracked' + untracked.parent.mkdir() + untracked.write_text('untracked') + + tracked_items = list(iter_gitworktree(ds.pathobj, untracked=None)) + # without untracked's and no link resolution this is plain and fast + assert all( + isinstance(i, GitWorktreeItem) and i.gitsha and i.gittype + for i in tracked_items + ) + + all_items = list(iter_gitworktree(ds.pathobj, untracked='all')) + # empty-dir is not reported, only untracked files + assert len(all_items) == len(tracked_items) + 1 + assert any( + i.name == PurePath('subdir', 'untracked') + and i.gitsha is None and i.gittype is None + for i in all_items + ) + # same again, but with a different untracked reporting + all_items = list(iter_gitworktree(ds.pathobj, untracked='whole-dir')) + # emptydir is reported too + assert len(all_items) == len(tracked_items) + 2 + assert any( + i.name == PurePath('subdir') + and i.gitsha is None and i.gittype is None + for i in all_items + ) + # and again for the last variant + all_items = list(iter_gitworktree(ds.pathobj, untracked='no-empty-dir')) + # and again no emptydir + assert len(all_items) == len(tracked_items) + 1 + assert any( + i.name == PurePath('subdir') + and i.gitsha is None and i.gittype is None + for i in all_items + ) + + # if we ask for file objects or link_targets, we get a different item type, + # but including the same + for kwargs in ( + dict(link_target=True, fp=False, untracked=None), + dict(link_target=False, fp=True, untracked=None), + dict(link_target=True, fp=True, untracked=None), + ): + assert all( + isinstance(i, GitWorktreeFileSystemItem) and i.gitsha and i.gittype + for i in iter_gitworktree(ds.pathobj, **kwargs) + ) + + # check that file pointers work for tracked and untracked content + checked_tracked = False + checked_untracked = False + for item in iter_gitworktree(ds.pathobj, fp=True): + if item.name == PurePath('.datalad', 'config'): + assert ds.id in (ds.pathobj / item.name).read_text() + checked_tracked = True + elif item.name == 
PurePath('subdir', 'untracked'): + assert 'untracked' == (ds.pathobj / item.name).read_text() + checked_untracked = True + assert checked_tracked + assert checked_untracked diff --git a/datalad_next/iter_collections/utils.py b/datalad_next/iter_collections/utils.py index d9196dedb..0f00a2e5d 100644 --- a/datalad_next/iter_collections/utils.py +++ b/datalad_next/iter_collections/utils.py @@ -69,14 +69,12 @@ def from_path( path: Path, *, link_target: bool = True, - fp: bool = False, ): """Populate item properties from a single `stat` and `readlink` call The given ``path`` must exist. The ``link_target`` flag indicates whether to report the result of ``readlink`` for a symlink-type - path. If `fp` is set, the item includes a file-like object - to access the file's content. + path. """ cstat = path.lstat() cmode = cstat.st_mode diff --git a/datalad_next/runners/__init__.py b/datalad_next/runners/__init__.py index b2d0df59f..b65b1fa77 100644 --- a/datalad_next/runners/__init__.py +++ b/datalad_next/runners/__init__.py @@ -23,3 +23,15 @@ from datalad.runner.exception import ( CommandError, ) + +# utilities +from datalad.runner.nonasyncrunner import ( + STDOUT_FILENO, + STDERR_FILENO, +) +from datalad.runner.utils import ( + LineSplitter, +) +from subprocess import ( + DEVNULL, +)