Support scoping within Dataverse datasets #273

Merged 3 commits on Mar 17, 2023
2 changes: 1 addition & 1 deletion datalad_dataverse/__init__.py
@@ -32,7 +32,7 @@
description="Convenience conversion of Dataverse dataset landing page "
"URLs to git-cloneable 'datalad-annex::'-type URLs. It enables cloning "
"from dataset webpage directly, and implies a remote sibling in 'annex' "
"mode (i.e., with keys, not exports) "
"mode (i.e., with keys, not exports) and no alternative root path being used"
"See https://docs.datalad.org/design/url_substitution.html for details",
dialog='question',
scope='global',
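
For orientation only: this substitution yields a `datalad-annex::` clone URL whose query string configures the Dataverse special remote, and the scoping added in this PR is expressed with the same `rootpath` query parameter that the sibling code below appends. A purely illustrative Python sketch; the exact shape of such a URL is an assumption here, and only the `rootpath` parameter is defined by this change:

    # hypothetical clone URL as produced for a Dataverse dataset landing page;
    # everything before '&rootpath=' is assumed for illustration only
    clone_url = (
        'datalad-annex::?type=external&externaltype=dataverse'
        '&url=https%3A//demo.dataverse.org&doi=doi%3A10.5072%2FFK2%2FEXAMPLE'
    )
    # scope the sibling to an alternative root path inside the Dataverse dataset
    scoped_url = clone_url + '&rootpath=myproject/subds1'
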
24 changes: 21 additions & 3 deletions datalad_dataverse/add_sibling_dataverse.py
@@ -6,6 +6,7 @@
__docformat__ = "numpy"

import logging
from pathlib import PurePosixPath
from urllib.parse import quote as urlquote

from datalad_next.commands import (
@@ -76,6 +77,12 @@ class AddSiblingDataverse(ValidatedInterface):
args=("ds_pid",),
doc="""""",
),
root_path=Parameter(
args=('--root-path',),
metavar='PATH',
doc="""optional alternative root path for the sibling inside the
Dataverse dataset. This can be used to represent multiple DataLad
datasets within a single Dataverse dataset without conflict."""),
dataset=Parameter(
args=("-d", "--dataset"),
doc="""specify the dataset to process. If
@@ -156,6 +163,7 @@ def __call__(
dv_url: str,
ds_pid: str,
*,
root_path: PurePosixPath | None = None,
dataset: DatasetParameter | None = None,
name: str = 'dataverse',
storage_name: str | None = None,
@@ -191,6 +199,7 @@ def __call__(
url=dv_url,
credential_name=credential,
ds_pid=ds_pid,
root_path=root_path,
mode=mode,
name=name,
storage_name=storage_name,
@@ -227,6 +236,7 @@ def custom_result_renderer(res, **kwargs):
def _add_sibling_dataverse(
ds, url, credential_name, ds_pid,
*,
root_path=None,
mode='git-only',
name=None,
storage_name=None,
@@ -256,6 +266,7 @@ def _add_sibling_dataverse(
ds=ds,
url=url,
doi=ds_pid,
root_path=root_path,
credential_name=credential_name,
export=export_storage,
existing=existing,
@@ -292,7 +303,7 @@ def _get_skip_sibling_result(name, ds, type_):

def _add_git_sibling(
*,
ds, url, doi, name, credential_name, export, existing,
ds, url, doi, root_path, name, credential_name, export, existing,
known, publish_depends=None):
"""
Parameters
@@ -328,6 +339,9 @@ def _add_git_sibling(
# e.g., it is not uncommon for credentials to be named after URLs
remote_url += f'&credential={urlquote(credential_name)}'

if root_path:
remote_url += f'&rootpath={urlquote(str(root_path))}'

# announce the sibling to not have an annex (we have a dedicated
# storage sibling for that) to avoid needless annex-related processing
# and speculative whining by `siblings()`
@@ -355,8 +369,9 @@


def _add_storage_sibling(
*,
ds, url, doi, name, credential_name, export, existing, known=False):
*, ds, url, doi, root_path, name, credential_name, export, existing,
known=False,
):
"""
Parameters
----------
@@ -393,6 +408,9 @@
# supply the credential identifier, if it was explicitly given
if credential_name:
cmd_args.append(f"credential={credential_name}")
if root_path:
cmd_args.append(f"rootpath={root_path}")

ds.repo.call_annex(cmd_args)
yield get_status_dict(
ds=ds,
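
A minimal usage sketch for the new `root_path` parameter, modeled on the test added at the end of this PR; the instance URL, PID, and credential name are placeholders:

    from datalad.api import Dataset

    ds = Dataset('/path/to/dataset')   # placeholder local dataset
    # register git and storage siblings under an alternative root path, so that
    # several DataLad datasets can share a single Dataverse dataset without
    # clobbering each other's files
    ds.add_sibling_dataverse(
        dv_url='https://demo.dataverse.org',   # placeholder Dataverse instance
        ds_pid='doi:10.5072/FK2/EXAMPLE',      # placeholder dataset PID
        credential='dataverse',                # placeholder credential name
        root_path=ds.id,                       # e.g. scope deposits by dataset ID
    )
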
14 changes: 8 additions & 6 deletions datalad_dataverse/baseremote.py
@@ -100,11 +100,13 @@ class DataverseRemote(SpecialRemote):

def __init__(self, *args):
super().__init__(*args)
self.configs['url'] = 'The Dataverse URL for the remote'
self.configs['doi'] = 'DOI to the dataset'
self.configs['url'] = 'URL of the Dataverse site'
self.configs['doi'] = \
'DOI-style persistent identifier of the Dataverse dataset'
self.configs['rootpath'] = \
'optional alternative root path to use in the Dataverse dataset'
self.configs['credential'] = \
'Identifier used to retrieve an API token from a local ' \
'credential store'
'name of a DataLad credential with a Dataverse API token to use'
# dataverse dataset interface
self._dvds = None

@@ -119,6 +121,7 @@ def prepare(self):
doi = self.annex.getconfig('doi')
if not doi:
raise ValueError('doi must be specified')
dv_root_path = self.annex.getconfig('rootpath')
# standardize formatting to minimize complexity downstream
doi = format_doi(doi)
# we need an access token, use the repo's configmanager to query for one
@@ -145,11 +148,10 @@
apitoken,
)
# TODO this can raise, capture and raise proper error
self._dvds = OnlineDataverseDataset(api, doi)
self._dvds = OnlineDataverseDataset(api, doi, root_path=dv_root_path)
# save the credential, now that it has successfully been used
credman.set(credential_name, _lastused=True, **cred)


def initremote(self):
"""
Use this command to initialize a remote
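
Because the special remote reads `rootpath` via `annex.getconfig()`, the same scoping can be requested when the remote is initialized directly through git-annex. A hedged sketch reusing the `ds` Dataset object and the `call_annex()` helper seen above; the remote name and all values are placeholders, and the `type`/`externaltype`/`encryption` settings are assumptions not shown in this diff:

    # sketch: initialize the dataverse special remote with an alternative root
    # path, analogous to the argument list _add_storage_sibling() assembles
    ds.repo.call_annex([
        'initremote', 'dataverse-storage',
        'type=external', 'externaltype=dataverse', 'encryption=none',  # assumed
        'url=https://demo.dataverse.org',   # placeholder instance URL
        'doi=doi:10.5072/FK2/EXAMPLE',      # placeholder dataset DOI
        'rootpath=myproject/subds1',        # alternative root path in the dataset
        'credential=dataverse',             # name of a DataLad credential
    ])
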
30 changes: 24 additions & 6 deletions datalad_dataverse/dataset.py
@@ -48,11 +48,21 @@ class OnlineDataverseDataset:
not representable by a subset of ASCII, and therefore any non-latin
alphabet. See the documentation of the ``mangle_path()`` function
for details.

If ``root_path`` is set, then all paths in the scope of the Dataverse
dataset will be prefixed with this path. This establishes an alternative
root path for all dataset operations. It will not be possible to upload,
download, rename (etc) files from outside this prefix scope, or across
scopes.
"""
def __init__(self, api, dsid: str):
def __init__(self, api, dsid: str, root_path: str | None = None):
# dataverse native API handle
self._api = api
self._dsid = dsid
# unconditional prefix of `directoryLabel` for any remote deposit
# in POSIX notation
# (an empty string counts as no root path)
self._root_path = PurePosixPath(root_path) if root_path else None

self._data_access_api = None
# mapping of dataverse database fileids to FileIdRecord
@@ -95,7 +105,7 @@ def get_fileid_from_path(
"""
if not latest_only:
self._ensure_file_records_for_all_versions()
path = mangle_path(path)
path = self._mangle_path(path)
# get all file id records that match the path, and are latest version,
# if desired
match_path = dict(
@@ -122,14 +132,14 @@ def has_fileid_in_latest_version(self, fid: int) -> bool:
return rec.is_latest_version

def has_path(self, path: PurePosixPath) -> bool:
path = mangle_path(path)
path = self._mangle_path(path)
self._ensure_file_records_for_all_versions()
return path in set(
f.path for f in self._file_records_by_fileid.values()
)

def has_path_in_latest_version(self, path: PurePosixPath) -> bool:
path = mangle_path(path)
path = self._mangle_path(path)
return path in set(
f.path for f in self._file_records_by_fileid.values()
if f.is_latest_version
@@ -171,7 +181,7 @@ def upload_file(self,
local_path: Path,
remote_path: PurePosixPath,
replace_id: int | None = None) -> int:
remote_path = mangle_path(remote_path)
remote_path = self._mangle_path(remote_path)
datafile = Datafile()
# remote file metadata
datafile.set({
@@ -238,7 +248,7 @@ def rename_file(self,

# mangle_path for rename_path is done inside get_fileid_from_path()
# in the conditional below
new_path = mangle_path(new_path)
new_path = self._mangle_path(new_path)

if rename_id is None:
# unclear to MIH why `latest_only=True`, presumably because
@@ -325,6 +335,14 @@ def data_access_api(self):
)
return self._data_access_api

def _mangle_path(self, path: str | PurePosixPath) -> PurePosixPath:
if self._root_path:
# we cannot hand the path to mangle_path() directly for type conversion,
# because the root_path must be prepended first, so that it is
# mangled properly too, in case it needs it
path = self._root_path / PurePosixPath(path)
return mangle_path(path)

def _ensure_file_records_for_all_versions(self) -> None:
if self._knows_all_versions:
return
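
To make the prefixing performed by `_mangle_path()` concrete, here is a small standalone sketch of just the root-path step; the subsequent `mangle_path()` transformation documented in the class docstring is left out, and the prefix value is hypothetical:

    from pathlib import PurePosixPath

    root_path = PurePosixPath('0a1b2c3d-subds-id')   # hypothetical per-dataset prefix
    remote_path = PurePosixPath('dir/file.dat')

    # every path handed to OnlineDataverseDataset is relocated under root_path
    # before mangling, so deposits from different DataLad datasets cannot collide
    scoped = root_path / remote_path
    assert str(scoped) == '0a1b2c3d-subds-id/dir/file.dat'
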
75 changes: 75 additions & 0 deletions datalad_dataverse/tests/test_add_sibling_dataverse.py
@@ -1,5 +1,7 @@
import pytest

from pathlib import PurePosixPath

from datalad.api import clone

from datalad_next.tests.utils import assert_result_count
@@ -77,6 +79,79 @@ def test_asdv_addpushclone(
cloned_repo.get_hexsha(cloned_repo.get_corresponding_branch())


def test_asdv_multiple_ds(
dataverse_admin_credential_setup,
dataverse_instance_url,
dataverse_dataset,
existing_dataset,
tmp_path,
):
dspid = dataverse_dataset

ds = existing_dataset
ds_repo = ds.repo
# create two-levels of nested datasets
subds = ds.create('subds', **ckwa)
subsubds = ds.create(subds.pathobj / 'subsubds', **ckwa)

# now add siblings for all of them in the same dataverse dataset
common_add_args = dict(
ckwa,
dv_url=dataverse_instance_url,
ds_pid=dspid,
credential="dataverse",
)

res = ds.add_sibling_dataverse(**common_add_args)
clone_url = [
r['url'] for r in res
if r['action'] == "add_sibling_dataverse"
][0]

# deposit all subdatasets, regardless of nesting level, under their
# (UU)ID. This is by no means mandatory or the best approach; it could
# also be done by relative path, or some other measure. But it gives
# a conflict-free layout
for d in (subds, subsubds):
d.add_sibling_dataverse(
root_path=d.id,
**common_add_args,
)

# let the superdataset know about the origination of subdatasets
# to enable a recursive installation
ds.configuration(
'set',
spec=[(
'datalad.get.subdataset-source-candidate-100dv',
clone_url + '&rootpath={id}',
)],
scope='branch',
**ckwa
)
# save the config update
ds.save(**ckwa)

ds.push(to='dataverse', recursive=True, **ckwa)

# And we should be able to clone
cloned_ds = clone(
source=clone_url,
path=tmp_path,
result_xfm='datasets',
**ckwa
)
# and perform a recursive get of subdatasets
cloned_ds.get(get_data=False, recursive=True, **ckwa)
# and we have two subdatasets (all levels)
assert_result_count(
cloned_ds.subdatasets(state='present', recursive=True, **ckwa),
2,
type='dataset',
status='ok',
)


# TODO despaghettify this monster
@pytest.mark.parametrize("mode", ["annex", "filetree"])
def test_workflow(dataverse_admin_credential_setup,