Support scoping within Dataverse datasets #273

Merged 3 commits on Mar 17, 2023
2 changes: 1 addition & 1 deletion datalad_dataverse/__init__.py
@@ -32,7 +32,7 @@
description="Convenience conversion of Dataverse dataset landing page "
"URLs to git-cloneable 'datalad-annex::'-type URLs. It enables cloning "
"from dataset webpage directly, and implies a remote sibling in 'annex' "
"mode (i.e., with keys, not exports) "
"mode (i.e., with keys, not exports) and no alternative root path being used"
"See https://docs.datalad.org/design/url_substitution.html for details",
dialog='question',
scope='global',
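
For orientation only: this substitution yields a `datalad-annex::` clone URL whose query string configures the Dataverse special remote, and the scoping added in this PR is expressed with the same `rootpath` query parameter that the sibling code below appends. A purely illustrative Python sketch; the exact shape of such a URL is an assumption here, and only the `rootpath` parameter is defined by this change:

    # hypothetical clone URL as produced for a Dataverse dataset landing page;
    # everything before '&rootpath=' is assumed for illustration only
    clone_url = (
        'datalad-annex::?type=external&externaltype=dataverse'
        '&url=https%3A//demo.dataverse.org&doi=doi%3A10.5072%2FFK2%2FEXAMPLE'
    )
    # scope the sibling to an alternative root path inside the Dataverse dataset
    scoped_url = clone_url + '&rootpath=myproject/subds1'
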
24 changes: 21 additions & 3 deletions datalad_dataverse/add_sibling_dataverse.py
@@ -6,6 +6,7 @@
__docformat__ = "numpy"

import logging
from pathlib import PurePosixPath
from urllib.parse import quote as urlquote

from datalad_next.commands import (
@@ -76,6 +77,12 @@ class AddSiblingDataverse(ValidatedInterface):
args=("ds_pid",),
doc="""""",
),
root_path=Parameter(
args=('--root-path',),
metavar='PATH',
doc="""optional alternative root path for the sibling inside the
Dataverse dataset. This can be used to represent multiple DataLad
datasets within a single Dataverse dataset without conflict."""),
dataset=Parameter(
args=("-d", "--dataset"),
doc="""specify the dataset to process. If
@@ -156,6 +163,7 @@ def __call__(
dv_url: str,
ds_pid: str,
*,
root_path: PurePosixPath | None = None,
dataset: DatasetParameter | None = None,
name: str = 'dataverse',
storage_name: str | None = None,
@@ -191,6 +199,7 @@ def __call__(
url=dv_url,
credential_name=credential,
ds_pid=ds_pid,
root_path=root_path,
mode=mode,
name=name,
storage_name=storage_name,
@@ -227,6 +236,7 @@ def custom_result_renderer(res, **kwargs):
def _add_sibling_dataverse(
ds, url, credential_name, ds_pid,
*,
root_path=None,
mode='git-only',
name=None,
storage_name=None,
@@ -256,6 +266,7 @@ def _add_sibling_dataverse(
ds=ds,
url=url,
doi=ds_pid,
root_path=root_path,
credential_name=credential_name,
export=export_storage,
existing=existing,
@@ -292,7 +303,7 @@ def _get_skip_sibling_result(name, ds, type_):

def _add_git_sibling(
*,
ds, url, doi, name, credential_name, export, existing,
ds, url, doi, root_path, name, credential_name, export, existing,
known, publish_depends=None):
"""
Parameters
@@ -328,6 +339,9 @@ def _add_git_sibling(
# e.g., it is not uncommon for credentials to be named after URLs
remote_url += f'&credential={urlquote(credential_name)}'

if root_path:
remote_url += f'&rootpath={urlquote(str(root_path))}'

# announce the sibling to not have an annex (we have a dedicated
# storage sibling for that) to avoid needless annex-related processing
# and speculative whining by `siblings()`
@@ -355,8 +369,9 @@


def _add_storage_sibling(
*,
ds, url, doi, name, credential_name, export, existing, known=False):
*, ds, url, doi, root_path, name, credential_name, export, existing,
known=False,
):
"""
Parameters
----------
@@ -393,6 +408,9 @@
# supply the credential identifier, if it was explicitly given
if credential_name:
cmd_args.append(f"credential={credential_name}")
if root_path:
cmd_args.append(f"rootpath={root_path}")

ds.repo.call_annex(cmd_args)
yield get_status_dict(
ds=ds,
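
A minimal usage sketch for the new `root_path` parameter, modeled on the test added at the end of this PR; the instance URL, PID, and credential name are placeholders:

    from datalad.api import Dataset

    ds = Dataset('/path/to/dataset')   # placeholder local dataset
    # register git and storage siblings under an alternative root path, so that
    # several DataLad datasets can share a single Dataverse dataset without
    # clobbering each other's files
    ds.add_sibling_dataverse(
        dv_url='https://demo.dataverse.org',   # placeholder Dataverse instance
        ds_pid='doi:10.5072/FK2/EXAMPLE',      # placeholder dataset PID
        credential='dataverse',                # placeholder credential name
        root_path=ds.id,                       # e.g. scope deposits by dataset ID
    )
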
14 changes: 8 additions & 6 deletions datalad_dataverse/baseremote.py
@@ -100,11 +100,13 @@ class DataverseRemote(SpecialRemote):

def __init__(self, *args):
super().__init__(*args)
self.configs['url'] = 'The Dataverse URL for the remote'
self.configs['doi'] = 'DOI to the dataset'
self.configs['url'] = 'URL of the Dataverse site'
self.configs['doi'] = \
'DOI-style persistent identifier of the Dataverse dataset'
self.configs['rootpath'] = \
'optional alternative root path to use in the Dataverse dataset'
self.configs['credential'] = \
'Identifier used to retrieve an API token from a local ' \
'credential store'
'name of a DataLad credential with a Dataverse API token to use'
# dataverse dataset interface
self._dvds = None

@@ -119,6 +121,7 @@ def prepare(self):
doi = self.annex.getconfig('doi')
if not doi:
raise ValueError('doi must be specified')
dv_root_path = self.annex.getconfig('rootpath')
# standardize formatting to minimize complexity downstream
doi = format_doi(doi)
# we need an access token, use the repo's configmanager to query for one
@@ -145,11 +148,10 @@
apitoken,
)
# TODO this can raise, capture and raise proper error
self._dvds = OnlineDataverseDataset(api, doi)
self._dvds = OnlineDataverseDataset(api, doi, root_path=dv_root_path)
# save the credential, now that it has successfully been used
credman.set(credential_name, _lastused=True, **cred)


def initremote(self):
"""
Use this command to initialize a remote
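
Because the special remote reads `rootpath` via `annex.getconfig()`, the same scoping can be requested when the remote is initialized directly through git-annex. A hedged sketch reusing the `ds` Dataset object and the `call_annex()` helper seen above; the remote name and all values are placeholders, and the `type`/`externaltype`/`encryption` settings are assumptions not shown in this diff:

    # sketch: initialize the dataverse special remote with an alternative root
    # path, analogous to the argument list _add_storage_sibling() assembles
    ds.repo.call_annex([
        'initremote', 'dataverse-storage',
        'type=external', 'externaltype=dataverse', 'encryption=none',  # assumed
        'url=https://demo.dataverse.org',   # placeholder instance URL
        'doi=doi:10.5072/FK2/EXAMPLE',      # placeholder dataset DOI
        'rootpath=myproject/subds1',        # alternative root path in the dataset
        'credential=dataverse',             # name of a DataLad credential
    ])
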
30 changes: 24 additions & 6 deletions datalad_dataverse/dataset.py
@@ -48,11 +48,21 @@ class OnlineDataverseDataset:
not representable by a subset of ASCII, and therefore any non-latin
alphabet. See the documentation of the ``mangle_path()`` function
for details.

If ``root_path`` is set, then all paths in the scope of the Dataverse
dataset will be prefixed with this path. This establishes an alternative
root path for all dataset operations. It will not be possible to upload,
download, rename (etc) files from outside this prefix scope, or across
scopes.
"""
def __init__(self, api, dsid: str):
def __init__(self, api, dsid: str, root_path: str | None = None):
# dataverse native API handle
self._api = api
self._dsid = dsid
# unconditional prefix of `directoryLabel` for any remote deposit
# in POSIX notation
# (an empty string counts as no root path)
self._root_path = PurePosixPath(root_path) if root_path else None

self._data_access_api = None
# mapping of dataverse database fileids to FileIdRecord
@@ -95,7 +105,7 @@ def get_fileid_from_path(
"""
if not latest_only:
self._ensure_file_records_for_all_versions()
path = mangle_path(path)
path = self._mangle_path(path)
# get all file id records that match the path, and are latest version,
# if desired
match_path = dict(
@@ -122,14 +132,14 @@ def has_fileid_in_latest_version(self, fid: int) -> bool:
return rec.is_latest_version

def has_path(self, path: PurePosixPath) -> bool:
path = mangle_path(path)
path = self._mangle_path(path)
self._ensure_file_records_for_all_versions()
return path in set(
f.path for f in self._file_records_by_fileid.values()
)

def has_path_in_latest_version(self, path: PurePosixPath) -> bool:
path = mangle_path(path)
path = self._mangle_path(path)
return path in set(
f.path for f in self._file_records_by_fileid.values()
if f.is_latest_version
@@ -171,7 +181,7 @@ def upload_file(self,
local_path: Path,
remote_path: PurePosixPath,
replace_id: int | None = None) -> int:
remote_path = mangle_path(remote_path)
remote_path = self._mangle_path(remote_path)
datafile = Datafile()
# remote file metadata
datafile.set({
@@ -238,7 +248,7 @@ def rename_file(self,

# mangle_path for rename_path is done inside get_fileid_from_path()
# in the conditional below
new_path = mangle_path(new_path)
new_path = self._mangle_path(new_path)

if rename_id is None:
# unclear to MIH why `latest_only=True`, presumably because
@@ -325,6 +335,14 @@ def data_access_api(self):
)
return self._data_access_api

def _mangle_path(self, path: str | PurePosixPath) -> PurePosixPath:
if self._root_path:
# we cannot hand the path to mangle_path() directly for type conversion,
# because the root_path must be prepended first, so that it is
# mangled properly too, in case it needs it
path = self._root_path / PurePosixPath(path)
return mangle_path(path)

def _ensure_file_records_for_all_versions(self) -> None:
if self._knows_all_versions:
return
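
To make the prefixing performed by `_mangle_path()` concrete, here is a small standalone sketch of just the root-path step; the subsequent `mangle_path()` transformation documented in the class docstring is left out, and the prefix value is hypothetical:

    from pathlib import PurePosixPath

    root_path = PurePosixPath('0a1b2c3d-subds-id')   # hypothetical per-dataset prefix
    remote_path = PurePosixPath('dir/file.dat')

    # every path handed to OnlineDataverseDataset is relocated under root_path
    # before mangling, so deposits from different DataLad datasets cannot collide
    scoped = root_path / remote_path
    assert str(scoped) == '0a1b2c3d-subds-id/dir/file.dat'
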
75 changes: 75 additions & 0 deletions datalad_dataverse/tests/test_add_sibling_dataverse.py
@@ -1,5 +1,7 @@
import pytest

from pathlib import PurePosixPath

from datalad.api import clone

from datalad_next.tests.utils import assert_result_count
@@ -77,6 +79,79 @@ def test_asdv_addpushclone(
cloned_repo.get_hexsha(cloned_repo.get_corresponding_branch())


def test_asdv_multiple_ds(
dataverse_admin_credential_setup,
dataverse_instance_url,
dataverse_dataset,
existing_dataset,
tmp_path,
):
dspid = dataverse_dataset

ds = existing_dataset
ds_repo = ds.repo
# create two-levels of nested datasets
subds = ds.create('subds', **ckwa)
subsubds = ds.create(subds.pathobj / 'subsubds', **ckwa)

# now add siblings for all of them in the same dataverse dataset
common_add_args = dict(
ckwa,
dv_url=dataverse_instance_url,
ds_pid=dspid,
credential="dataverse",
)

res = ds.add_sibling_dataverse(**common_add_args)
clone_url = [
r['url'] for r in res
if r['action'] == "add_sibling_dataverse"
][0]

# deposit all subdatasets, regardless of nesting level, under their
# (UU)ID. This is by no means mandatory or the best approach; it could
# also be done by relative path, or some other measure. But it gives
# a conflict-free layout
for d in (subds, subsubds):
d.add_sibling_dataverse(
root_path=d.id,
**common_add_args,
)

# let the superdataset know about the origination of subdatasets
# to enable a recursive installation
ds.configuration(
'set',
spec=[(
'datalad.get.subdataset-source-candidate-100dv',
clone_url + '&rootpath={id}',
)],
scope='branch',
**ckwa
)
# save the config update
ds.save(**ckwa)

ds.push(to='dataverse', recursive=True, **ckwa)

# And we should be able to clone
cloned_ds = clone(
source=clone_url,
path=tmp_path,
result_xfm='datasets',
**ckwa
)
# and perform a recursive get of subdatasets
cloned_ds.get(get_data=False, recursive=True, **ckwa)
# and we have two subdatasets (all levels)
assert_result_count(
cloned_ds.subdatasets(state='present', recursive=True, **ckwa),
2,
type='dataset',
status='ok',
)


# TODO despaghettify this monster
@pytest.mark.parametrize("mode", ["annex", "filetree"])
def test_workflow(dataverse_admin_credential_setup,