Merge pull request #7504 from GlobalDataverseCommunityConsortium/IQSS/6497-migrate_dataset_api

IQSS/6497 migrate dataset api
kcondon authored Jul 27, 2021
2 parents c284ffa + 93481bd commit 1c3dbe8
Showing 12 changed files with 371 additions and 37 deletions.
7 changes: 7 additions & 0 deletions doc/release-notes/6497-migrate-api.md
@@ -0,0 +1,7 @@
# Release Highlights

### Dataset Migration API (Experimental)

Datasets can now be imported following the format of an OAI-ORE export (RDA-conformant Bags), allowing for easier migration from one Dataverse installation to another, and migration from other systems. This experimental, superuser-only endpoint also allows keeping the existing persistent identifier (where the authority and shoulder match those for which the software is configured) and publication dates.
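
A minimal sketch of the two calls (the server URL, API token, collection alias, and dataset id below are placeholders; see the Dataset Migration API page in the Developer Guide for full examples):

```bash
# Create the migrated dataset from a JSON-LD (OAI-ORE style) metadata file
curl -H 'Content-Type: application/ld+json' -H X-Dataverse-key:$API_TOKEN -X POST "$SERVER_URL/api/dataverses/$DATAVERSE_ID/datasets/:startmigration" --upload-file dataset-migrate.jsonld

# After adding files/metadata through the standard APIs, republish with the original publication date
curl -H 'Content-Type: application/ld+json' -H X-Dataverse-key:$API_TOKEN -X POST -d '{"schema:datePublished": "2020-10-26","@context":{"schema":"http://schema.org/"}}' "$SERVER_URL/api/datasets/{id}/actions/:releasemigrated"
```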

This development was supported by DANS and the [Research Data Alliance](https://rd-alliance.org) and follows the recommendations from the [Research Data Repository Interoperability Working Group](http://dx.doi.org/10.15497/RDA00025).
43 changes: 43 additions & 0 deletions doc/sphinx-guides/source/_static/api/dataset-migrate.jsonld
@@ -0,0 +1,43 @@
{
"citation:Depositor": "Admin, Dataverse",
"Title": "Test Dataset",
"Subject": "Computer and Information Science",
"Creator": {
"author:Name": "Admin, Dataverse",
"author:Affiliation": "GDCC"
},
"Deposit Date": "2020-10-08",
"citation:Distributor": {
"distributor:Name": "Demo Dataverse Repository",
"distributor:Affiliation": "Dataverse Community",
"distributor:Abbreviation": "GDCC",
"distributor:URL": "https://dataverse.org/global-dataverse-community-consortium"
},
"citation:Contact": {
"datasetContact:Name": "Admin, Dataverse",
"datasetContact:Affiliation": "GDCC",
"datasetContact:E-mail": "admin@demo.dataverse.org"
},
"citation:Description": {
"dsDescription:Text": "A short description"
},
"@id": "doi:10.33564/FK27U7YBV",
"schema:version": "1.0",
"schema:license": "https://creativecommons.org/publicdomain/zero/1.0/",
"schema:datePublished": "2021-07-21",
"dvcore:fileTermsOfAccess": {
"dvcore:fileRequestAccess": false
},
"@context": {
"Creator": "http://purl.org/dc/terms/creator",
"Deposit Date": "http://purl.org/dc/terms/dateSubmitted",
"Subject": "http://purl.org/dc/terms/subject",
"Title": "http://purl.org/dc/terms/title",
"author": "https://dataverse.org/schema/citation/author#",
"citation": "https://dataverse.org/schema/citation/",
"datasetContact": "https://dataverse.org/schema/citation/datasetContact#",
"distributor": "https://dataverse.org/schema/citation/distributor#",
"dsDescription": "https://dataverse.org/schema/citation/dsDescription#",
"dvcore": "https://dataverse.org/schema/core#",
"schema": "http://schema.org/"
}}
2 changes: 1 addition & 1 deletion doc/sphinx-guides/source/admin/dashboard.rst
@@ -22,7 +22,7 @@ This dashboard tool allows you to define sets of local datasets to make availabl
Metadata Export
---------------

This part of the Dashboard is simply a reminder message that metadata export happens through the Dataverse Software API. See the :doc:`metadataexport` section and the :doc:`/api/native-api` section of the API Guide for more details.
This part of the Dashboard is simply a reminder message that metadata export happens through the Dataverse Software API. See the :doc:`/admin/metadataexport` section and the :doc:`/api/native-api` section of the API Guide for more details.

Users
-----
9 changes: 9 additions & 0 deletions doc/sphinx-guides/source/api/intro.rst
@@ -204,6 +204,15 @@ Please note that some APIs are only documented in other guides that are more sui

- :doc:`/installation/config`

- Developer Guide

- :doc:`/developers/aux-file-support`
- :doc:`/developers/big-data-support`
- :doc:`/developers/dataset-migration-api`
- :doc:`/developers/dataset-semantic-metadata-api`
- :doc:`/developers/s3-direct-upload-api`
- :doc:`/developers/workflows`

Client Libraries
~~~~~~~~~~~~~~~~

49 changes: 49 additions & 0 deletions doc/sphinx-guides/source/developers/dataset-migration-api.rst
@@ -0,0 +1,49 @@
Dataset Migration API
=====================

The Dataverse software includes several ways to add Datasets originally created elsewhere (not to mention Harvesting capabilities). These include the SWORD API (see the :doc:`/api/sword` guide) and the /dataverses/{id}/datasets/:import methods (JSON and DDI) (see the :doc:`/api/native-api` guide).

This experimental migration API offers an additional option with some potential advantages:

* metadata can be specified using the json-ld format used in the OAI-ORE metadata export
* existing publication dates and PIDs are maintained (currently limited to the case where the PID can be managed by the Dataverse software, e.g. where the authority and shoulder match those the software is configured for)
* adding files can be done via the standard APIs, including using direct-upload to S3

This API consists of two calls: one to create an initial Dataset version, and one to 'republish' the dataset through Dataverse with a specified publication date.
Both calls require superuser privileges.

These calls can be used in concert with other API calls to add files, update metadata, etc. before the 'republish' step is done.
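
For example, a minimal sketch of adding a file to the migrated draft via the standard native add-file API, using a hypothetical local file ``data.csv`` and the sample PID from ``dataset-migrate.jsonld`` as a placeholder for the PID returned by the ``:startmigration`` call:

.. code-block:: bash

  export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
  export SERVER_URL=https://demo.dataverse.org
  export PERSISTENT_ID=doi:10.33564/FK27U7YBV

  curl -H X-Dataverse-key:$API_TOKEN -X POST -F 'file=@data.csv' "$SERVER_URL/api/datasets/:persistentId/add?persistentId=$PERSISTENT_ID"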


Start Migrating a Dataset into a Dataverse Collection
-----------------------------------------------------

.. note:: This action requires a Dataverse installation account with superuser permissions.

To import a dataset with an existing persistent identifier (PID), the provided json-ld metadata should include it.

.. code-block:: bash

  export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
  export SERVER_URL=https://demo.dataverse.org
  export DATAVERSE_ID=root

  curl -H 'Content-Type: application/ld+json' -H X-Dataverse-key:$API_TOKEN -X POST "$SERVER_URL/api/dataverses/$DATAVERSE_ID/datasets/:startmigration" --upload-file dataset-migrate.jsonld

An example jsonld file is available at :download:`dataset-migrate.jsonld <../_static/api/dataset-migrate.jsonld>`. Note that you would need to replace the PID in the sample file with one supported in your Dataverse instance. (Also note that `Issue #8028 <https://github.com/IQSS/dataverse/issues/8028>`_ currently breaks testing this API with DataCite test DOIs.)

Publish a Migrated Dataset
--------------------------

The call above creates a Dataset. Once it is created, other APIs can be used to add files, add additional metadata, etc. When a version is complete, the following call can be used to publish it with its original publication date.

.. note:: This action requires a Dataverse installation account with superuser permissions.

.. code-block:: bash

  export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
  export SERVER_URL=https://demo.dataverse.org

  curl -H 'Content-Type: application/ld+json' -H X-Dataverse-key:$API_TOKEN -X POST -d '{"schema:datePublished": "2020-10-26","@context":{ "schema":"http://schema.org/"}}' "$SERVER_URL/api/datasets/{id}/actions/:releasemigrated"

``datePublished`` is the only metadata supported in this call.
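
Note that, based on the endpoint implementation in this pull request, the call returns 202 Accepted when a default PrePublishDataset workflow is configured (the workflow finalizes publication asynchronously) and 200 OK when publication is finalized directly.
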
1 change: 1 addition & 0 deletions doc/sphinx-guides/source/developers/index.rst
@@ -36,4 +36,5 @@ Developer Guide
aux-file-support
s3-direct-upload-api
dataset-semantic-metadata-api
dataset-migration-api
workflows
107 changes: 105 additions & 2 deletions src/main/java/edu/harvard/iq/dataverse/api/Datasets.java
@@ -51,6 +51,7 @@
import edu.harvard.iq.dataverse.engine.command.impl.DeleteDatasetLinkingDataverseCommand;
import edu.harvard.iq.dataverse.engine.command.impl.DeletePrivateUrlCommand;
import edu.harvard.iq.dataverse.engine.command.impl.DestroyDatasetCommand;
import edu.harvard.iq.dataverse.engine.command.impl.FinalizeDatasetPublicationCommand;
import edu.harvard.iq.dataverse.engine.command.impl.GetDatasetCommand;
import edu.harvard.iq.dataverse.engine.command.impl.GetSpecificPublishedDatasetVersionCommand;
import edu.harvard.iq.dataverse.engine.command.impl.GetDraftDatasetVersionCommand;
@@ -77,7 +78,6 @@
import edu.harvard.iq.dataverse.ingest.IngestServiceBean;
import edu.harvard.iq.dataverse.privateurl.PrivateUrl;
import edu.harvard.iq.dataverse.S3PackageImporter;
import edu.harvard.iq.dataverse.api.AbstractApiBean.WrappedResponse;
import edu.harvard.iq.dataverse.api.dto.RoleAssignmentDTO;
import edu.harvard.iq.dataverse.batch.util.LoggingUtil;
import edu.harvard.iq.dataverse.dataaccess.DataAccess;
@@ -104,16 +104,23 @@
import edu.harvard.iq.dataverse.util.SystemConfig;
import edu.harvard.iq.dataverse.util.bagit.OREMap;
import edu.harvard.iq.dataverse.util.json.JSONLDUtil;
import edu.harvard.iq.dataverse.util.json.JsonLDTerm;
import edu.harvard.iq.dataverse.util.json.JsonParseException;
import edu.harvard.iq.dataverse.search.IndexServiceBean;
import static edu.harvard.iq.dataverse.util.json.JsonPrinter.*;
import static edu.harvard.iq.dataverse.util.json.NullSafeJsonBuilder.jsonObjectBuilder;
import edu.harvard.iq.dataverse.util.json.NullSafeJsonBuilder;
import edu.harvard.iq.dataverse.workflow.Workflow;
import edu.harvard.iq.dataverse.workflow.WorkflowContext;
import edu.harvard.iq.dataverse.workflow.WorkflowServiceBean;
import edu.harvard.iq.dataverse.workflow.WorkflowContext.TriggerType;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.sql.Timestamp;
import java.text.MessageFormat;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
@@ -123,6 +130,7 @@
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Map.Entry;
import java.util.Set;
import java.util.logging.Level;
@@ -140,6 +148,7 @@
import javax.json.stream.JsonParsingException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.ws.rs.BadRequestException;
import javax.ws.rs.Consumes;
import javax.ws.rs.DELETE;
import javax.ws.rs.DefaultValue;
@@ -158,13 +167,14 @@
import javax.ws.rs.core.Response.Status;
import static javax.ws.rs.core.Response.Status.BAD_REQUEST;
import javax.ws.rs.core.UriInfo;

import org.apache.commons.lang3.StringUtils;
import org.apache.solr.client.solrj.SolrServerException;
import org.glassfish.jersey.media.multipart.FormDataBodyPart;
import org.glassfish.jersey.media.multipart.FormDataContentDisposition;
import org.glassfish.jersey.media.multipart.FormDataParam;

import com.amazonaws.services.s3.model.PartETag;
import edu.harvard.iq.dataverse.FileMetadata;

@Path("datasets")
public class Datasets extends AbstractApiBean {
@@ -224,6 +234,9 @@ public class Datasets extends AbstractApiBean {

@Inject
DataverseRequestServiceBean dvRequestService;

@Inject
WorkflowServiceBean wfService;

/**
* Used to consolidate the way we parse and handle dataset versions.
@@ -1190,6 +1203,96 @@ public Response publishDataset(@PathParam("id") String id, @QueryParam("type") S
}
}

@POST
@Path("{id}/actions/:releasemigrated")
@Consumes("application/ld+json, application/json-ld")
public Response publishMigratedDataset(String jsonldBody, @PathParam("id") String id) {
try {
AuthenticatedUser user = findAuthenticatedUserOrDie();
if (!user.isSuperuser()) {
return error(Response.Status.FORBIDDEN, "Only superusers can release migrated datasets");
}

Dataset ds = findDatasetOrDie(id);
try {
JsonObject metadata = JSONLDUtil.decontextualizeJsonLD(jsonldBody);
String pubDate = metadata.getString(JsonLDTerm.schemaOrg("datePublished").getUrl());
logger.fine("Submitted date: " + pubDate);
LocalDateTime dateTime = null;
if(!StringUtils.isEmpty(pubDate)) {
dateTime = JSONLDUtil.getDateTimeFrom(pubDate);
final Timestamp time = Timestamp.valueOf(dateTime);
//Set version release date
ds.getLatestVersion().setReleaseTime(new Date(time.getTime()));
}
// dataset.getPublicationDateFormattedYYYYMMDD())
// Assign a version number if not set
if (ds.getLatestVersion().getVersionNumber() == null) {

if (ds.getVersions().size() == 1) {
// First Release
ds.getLatestVersion().setVersionNumber(Long.valueOf(1));
ds.getLatestVersion().setMinorVersionNumber(Long.valueOf(0));
} else if (ds.getLatestVersion().isMinorUpdate()) {
ds.getLatestVersion().setVersionNumber(Long.valueOf(ds.getVersionNumber()));
ds.getLatestVersion().setMinorVersionNumber(Long.valueOf(ds.getMinorVersionNumber() + 1));
} else {
// major, non-first release
ds.getLatestVersion().setVersionNumber(Long.valueOf(ds.getVersionNumber() + 1));
ds.getLatestVersion().setMinorVersionNumber(Long.valueOf(0));
}
}
if(ds.getLatestVersion().getVersionNumber()==1 && ds.getLatestVersion().getMinorVersionNumber()==0) {
//Also set publication date if this is the first
if(dateTime != null) {
ds.setPublicationDate(Timestamp.valueOf(dateTime));
}
// Release User is only set in FinalizeDatasetPublicationCommand if the pub date
// is null, so set it here.
ds.setReleaseUser((AuthenticatedUser) user);
}
} catch (Exception e) {
logger.fine(e.getMessage());
throw new BadRequestException("Unable to set publication date ("
+ JsonLDTerm.schemaOrg("datePublished").getUrl() + "): " + e.getMessage());
}
/*
* Note: The code here mirrors that in the
* edu.harvard.iq.dataverse.DatasetPage:updateCurrentVersion method. Any changes
* to the core logic (i.e. beyond updating the messaging about results) should
* be applied to the code there as well.
*/
String errorMsg = null;
Optional<Workflow> prePubWf = wfService.getDefaultWorkflow(TriggerType.PrePublishDataset);

try {
// ToDo - should this be in onSuccess()? May relate to todo above
if (prePubWf.isPresent()) {
// Start the workflow, the workflow will call FinalizeDatasetPublication later
wfService.start(prePubWf.get(),
new WorkflowContext(createDataverseRequest(user), ds, TriggerType.PrePublishDataset, false),
false);
} else {
FinalizeDatasetPublicationCommand cmd = new FinalizeDatasetPublicationCommand(ds,
createDataverseRequest(user), false);
ds = commandEngine.submit(cmd);
}
} catch (CommandException ex) {
errorMsg = BundleUtil.getStringFromBundle("datasetversion.update.failure") + " - " + ex.toString();
logger.severe(ex.getMessage());
}

if (errorMsg != null) {
return error(Response.Status.INTERNAL_SERVER_ERROR, errorMsg);
} else {
return prePubWf.isPresent() ? accepted(json(ds)) : ok(json(ds));
}

} catch (WrappedResponse ex) {
return ex.getResponse();
}
}

@POST
@Path("{id}/move/{targetDataverseAlias}")
public Response moveDataset(@PathParam("id") String id, @PathParam("targetDataverseAlias") String targetDataverseAlias, @QueryParam("forceMove") Boolean force) {
54 changes: 54 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/api/Dataverses.java
@@ -88,6 +88,7 @@
import javax.json.stream.JsonParsingException;
import javax.validation.ConstraintViolation;
import javax.validation.ConstraintViolationException;
import javax.ws.rs.BadRequestException;
import javax.ws.rs.Consumes;
import javax.ws.rs.DELETE;
import javax.ws.rs.GET;
@@ -450,6 +451,59 @@ public Response importDatasetDdi(String xml, @PathParam("identifier") String par
}
}

@POST
@Path("{identifier}/datasets/:startmigration")
@Consumes("application/ld+json, application/json-ld")
public Response recreateDataset(String jsonLDBody, @PathParam("identifier") String parentIdtf) {
try {
User u = findUserOrDie();
if (!u.isSuperuser()) {
return error(Status.FORBIDDEN, "Not a superuser");
}
Dataverse owner = findDataverseOrDie(parentIdtf);

Dataset ds = new Dataset();

ds.setOwner(owner);
ds = JSONLDUtil.updateDatasetMDFromJsonLD(ds, jsonLDBody, metadataBlockSvc, datasetFieldSvc, false, true);
//ToDo - verify PID is one Dataverse can manage (protocol/authority/shoulder match)
if(!
(ds.getAuthority().equals(settingsService.getValueForKey(SettingsServiceBean.Key.Authority))&&
ds.getProtocol().equals(settingsService.getValueForKey(SettingsServiceBean.Key.Protocol))&&
ds.getIdentifier().startsWith(settingsService.getValueForKey(SettingsServiceBean.Key.Shoulder)))) {
throw new BadRequestException("Cannot recreate a dataset that has a PID that doesn't match the server's settings");
}
if(!datasetSvc.isIdentifierLocallyUnique(ds)) {
throw new BadRequestException("Cannot recreate a dataset whose PID is already in use");
}



if (ds.getVersions().isEmpty()) {
return badRequest("Supplied json must contain a single dataset version.");
}

DatasetVersion version = ds.getVersions().get(0);
if (!version.isPublished()) {
throw new BadRequestException("Cannot recreate a dataset that hasn't been published.");
}
//While the datasetversion whose metadata we're importing has been published, we consider it in draft until the API caller adds files and then completes the migration
version.setVersionState(DatasetVersion.VersionState.DRAFT);

DataverseRequest request = createDataverseRequest(u);

Dataset managedDs = execCommand(new ImportDatasetCommand(ds, request));
JsonObjectBuilder responseBld = Json.createObjectBuilder()
.add("id", managedDs.getId())
.add("persistentId", managedDs.getGlobalId().toString());

return created("/datasets/" + managedDs.getId(), responseBld);

} catch (WrappedResponse ex) {
return ex.getResponse();
}
}

private Dataset parseDataset(String datasetJson) throws WrappedResponse {
try (StringReader rdr = new StringReader(datasetJson)) {
return jsonParser().parseDataset(Json.createReader(rdr).readObject());
@@ -115,12 +115,16 @@ public Dataset execute(CommandContext ctxt) throws CommandException {
}

// update metadata
theDataset.getLatestVersion().setReleaseTime(getTimestamp());
if (theDataset.getLatestVersion().getReleaseTime() == null) {
// Allow migrated versions to keep original release dates
theDataset.getLatestVersion().setReleaseTime(getTimestamp());
}
theDataset.getLatestVersion().setLastUpdateTime(getTimestamp());
theDataset.setModificationTime(getTimestamp());
theDataset.setFileAccessRequest(theDataset.getLatestVersion().getTermsOfUseAndAccess().isFileAccessRequest());

updateFiles(getTimestamp(), ctxt);
//Use dataset pub date (which may not be the current date for migrated datasets)
updateFiles(new Timestamp(theDataset.getLatestVersion().getReleaseTime().getTime()), ctxt);

//
// TODO: Not sure if this .merge() is necessary here - ?