diff --git a/doc/release-notes/7140-google-cloud.md b/doc/release-notes/7140-google-cloud.md new file mode 100644 index 00000000000..62aef73acd0 --- /dev/null +++ b/doc/release-notes/7140-google-cloud.md @@ -0,0 +1,12 @@ +## Google Cloud Archiver + +Dataverse Bags can now be sent to a bucket in Google Cloud, including buckets in the 'Coldline' storage class, which provides less expensive but slower access. + +## Use Cases + +- As an Administrator, I can set up a regular export to Google Cloud so that my users' data is preserved. + +## New Settings + +:GoogleCloudProject - the name of the project managing the bucket. +:GoogleCloudBucket - the name of the bucket to use. \ No newline at end of file diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index 137144c8cdd..de8fbad3687 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -776,6 +776,8 @@ For Google Analytics, the example script at :download:`analytics-code.html `_ serialized `OAI-ORE `_ map file, which is also available as a metadata export format in the Dataverse web interface. -At present, the DPNSubmitToArchiveCommand and LocalSubmitToArchiveCommand are the only implementations extending the AbstractSubmitToArchiveCommand and using the configurable mechanisms discussed below. +At present, the DPNSubmitToArchiveCommand, LocalSubmitToArchiveCommand, and GoogleCloudSubmitToArchiveCommand are the only implementations extending the AbstractSubmitToArchiveCommand and using the configurable mechanisms discussed below. .. _Duracloud Configuration: @@ -831,10 +833,41 @@ ArchiverClassName - the fully qualified class to be used for archiving. For exam \:ArchiverSettings - the archiver class can access required settings including existing Dataverse settings and dynamically defined ones specific to the class. This setting is a comma-separated list of those settings. For example\: -``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":BagItLocalPath”`` +``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":BagItLocalPath"`` :BagItLocalPath is the file path that you've set in :ArchiverSettings. +.. _Google Cloud Configuration: + +Google Cloud Configuration +++++++++++++++++++++++++++ + +The Google Cloud Archiver can send Dataverse Bags to a bucket in Google Cloud, including buckets in the 'Coldline' storage class (cheaper, with slower access). + +``curl http://localhost:8080/api/admin/settings/:ArchiverClassName -X PUT -d "edu.harvard.iq.dataverse.engine.command.impl.GoogleCloudSubmitToArchiveCommand"`` + +``curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":GoogleCloudBucket, :GoogleCloudProject"`` + +The Google Cloud Archiver defines two custom settings, both of which are required. The credentials for your account, in the form of a JSON key file, must also be obtained and stored locally (see below). + +In order to use the Google Cloud Archiver, you must have a Google account. You will need to create a project and a bucket within that account and provide those values in the settings: + +\:GoogleCloudBucket - the name of the bucket to use. For example: + +``curl http://localhost:8080/api/admin/settings/:GoogleCloudBucket -X PUT -d "qdr-archive"`` + +\:GoogleCloudProject - the name of the project managing the bucket.
For example: + +``curl http://localhost:8080/api/admin/settings/:GoogleCloudProject -X PUT -d "qdr-project"`` + +The Google Cloud Archiver also requires a key file that must be renamed to 'googlecloudkey.json' and placed in the directory identified by your 'dataverse.files.directory' JVM option. This file can be created in the Google Cloud Console. (One method: navigate to your Project 'Settings'/'Service Accounts', create an account, give this account the 'Cloud Storage'/'Storage Admin' role, and once it's created, use the 'Actions' menu to 'Create Key', selecting the 'JSON' format option. Use this as the 'googlecloudkey.json' file.) + +For example: + +``cp <your key file> /usr/local/payara5/glassfish/domains/domain1/files/googlecloudkey.json`` + +.. _Archiving API Call: + API Call ++++++++ @@ -2124,3 +2157,40 @@ To enable redirects to the zipper installed on the same server as the main Datav To enable redirects to the zipper on a different server: ``curl -X PUT -d 'https://zipper.example.edu/cgi-bin/zipdownload' http://localhost:8080/api/admin/settings/:CustomZipDownloadServiceUrl`` + +:ArchiverClassName +++++++++++++++++++ + +Dataverse can export archival "Bag" files to an extensible set of storage systems (see :ref:`BagIt Export` above for details about this and for further explanation of the other archiving-related settings below). +This setting specifies which storage system to use by identifying the particular Java class that should be run. Current options include DuraCloudSubmitToArchiveCommand, LocalSubmitToArchiveCommand, and GoogleCloudSubmitToArchiveCommand. + +``curl -X PUT -d 'LocalSubmitToArchiveCommand' http://localhost:8080/api/admin/settings/:ArchiverClassName`` + +:ArchiverSettings ++++++++++++++++++ + +Each Archiver class may have its own custom settings. Along with setting which Archiver class to use, one must use this setting to identify which setting values should be sent to it when it is invoked. The value should be a comma-separated list of setting names. +For example, the LocalSubmitToArchiveCommand only uses the :BagItLocalPath setting. To allow the class to use that setting, this setting must be set as: + +``curl -X PUT -d ':BagItLocalPath' http://localhost:8080/api/admin/settings/:ArchiverSettings`` + +:DuraCloudHost +++++++++++++++ +:DuraCloudPort +++++++++++++++ +:DuraCloudContext ++++++++++++++++++ + +These three settings define the host, port, and context used by the DuraCloudSubmitToArchiveCommand. :DuraCloudHost is required. The other settings have default values as noted in the :ref:`Duracloud Configuration` section above. + +:BagItLocalPath ++++++++++++++++ + +This is the local file system path to be used with the LocalSubmitToArchiveCommand class. It is recommended to use an absolute path. See the :ref:`Local Path Configuration` section above. + +:GoogleCloudBucket +++++++++++++++++++ +:GoogleCloudProject ++++++++++++++++++++ + +These are the bucket and project names to be used with the GoogleCloudSubmitToArchiveCommand class. Further information is in the :ref:`Google Cloud Configuration` section above.
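For reference, a minimal end-to-end Google Cloud archiver setup might look like the following sketch. The bucket name, project name, and key file name are illustrative, and the last step assumes 'dataverse.files.directory' points at the default Payara files directory::

    # Select the Google Cloud archiver class
    curl http://localhost:8080/api/admin/settings/:ArchiverClassName -X PUT -d "edu.harvard.iq.dataverse.engine.command.impl.GoogleCloudSubmitToArchiveCommand"

    # Tell the archiver framework which custom settings to pass to the class
    curl http://localhost:8080/api/admin/settings/:ArchiverSettings -X PUT -d ":GoogleCloudBucket, :GoogleCloudProject"

    # Name the bucket and the project that manages it (illustrative values)
    curl http://localhost:8080/api/admin/settings/:GoogleCloudBucket -X PUT -d "my-archive-bucket"
    curl http://localhost:8080/api/admin/settings/:GoogleCloudProject -X PUT -d "my-project"

    # Install the service account key where the archiver expects to find it
    cp my-service-account-key.json /usr/local/payara5/glassfish/domains/domain1/files/googlecloudkey.json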
diff --git a/pom.xml b/pom.xml index 9a4f0c05fa0..f95f97af112 100644 --- a/pom.xml +++ b/pom.xml @@ -57,7 +57,7 @@ - @@ -127,6 +127,13 @@ httpclient ${httpcomponents.client.version} + + com.google.cloud + google-cloud-bom + 0.115.0-alpha + pom + import + org.testcontainers testcontainers-bom @@ -137,7 +144,7 @@ @@ -576,6 +583,11 @@ opennlp-tools 1.9.1 + + com.google.cloud + google-cloud-storage + 1.97.0 + diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java index 66e8770a641..468e99f24c1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DuraCloudSubmitToArchiveCommand.java @@ -99,7 +99,12 @@ public void run() { } } }).start(); - + //Have seen Pipe Closed errors for other archivers when used as a workflow without this delay loop + int i=0; + while(digestInputStream.available()<=0 && i<100) { + Thread.sleep(10); + i++; + } String checksum = store.addContent(spaceName, "datacite.xml", digestInputStream, -1l, null, null, null); logger.fine("Content: datacite.xml added with checksum: " + checksum); @@ -133,7 +138,11 @@ public void run() { } } }).start(); - + i=0; + while(digestInputStream.available()<=0 && i<100) { + Thread.sleep(10); + i++; + } checksum = store.addContent(spaceName, fileName, digestInputStream2, -1l, null, null, null); logger.fine("Content: " + fileName + " added with checksum: " + checksum); @@ -174,6 +183,9 @@ public void run() { logger.severe(rte.getMessage()); return new Failure("Error in generating datacite.xml file", "DuraCloud Submission Failure: metadata file not created"); + } catch (InterruptedException e) { + logger.warning(e.getLocalizedMessage()); + e.printStackTrace(); } } catch (ContentStoreException e) { logger.warning(e.getMessage()); diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java new file mode 100644 index 00000000000..cb729a9807a --- /dev/null +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/GoogleCloudSubmitToArchiveCommand.java @@ -0,0 +1,228 @@ +package edu.harvard.iq.dataverse.engine.command.impl; + +import edu.harvard.iq.dataverse.DOIDataCiteRegisterService; +import edu.harvard.iq.dataverse.DataCitation; +import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.DatasetVersion; +import edu.harvard.iq.dataverse.DatasetLock.Reason; +import edu.harvard.iq.dataverse.authorization.Permission; +import edu.harvard.iq.dataverse.authorization.users.ApiToken; +import edu.harvard.iq.dataverse.engine.command.Command; +import edu.harvard.iq.dataverse.engine.command.DataverseRequest; +import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; +import edu.harvard.iq.dataverse.util.bagit.BagGenerator; +import edu.harvard.iq.dataverse.util.bagit.OREMap; +import edu.harvard.iq.dataverse.workflow.step.Failure; +import edu.harvard.iq.dataverse.workflow.step.WorkflowStepResult; + +import java.io.BufferedInputStream; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.PipedInputStream; +import java.io.PipedOutputStream; +import java.nio.charset.Charset; +import java.security.DigestInputStream; +import java.security.MessageDigest; +import 
java.security.NoSuchAlgorithmException; +import java.util.Map; +import java.util.logging.Logger; + +import org.apache.commons.codec.binary.Hex; +import com.google.auth.oauth2.ServiceAccountCredentials; +import com.google.cloud.storage.Blob; +import com.google.cloud.storage.Bucket; +import com.google.cloud.storage.Storage; +import com.google.cloud.storage.StorageOptions; + +@RequiredPermissions(Permission.PublishDataset) +public class GoogleCloudSubmitToArchiveCommand extends AbstractSubmitToArchiveCommand implements Command { + + private static final Logger logger = Logger.getLogger(GoogleCloudSubmitToArchiveCommand.class.getName()); + private static final String GOOGLECLOUD_BUCKET = ":GoogleCloudBucket"; + private static final String GOOGLECLOUD_PROJECT = ":GoogleCloudProject"; + + public GoogleCloudSubmitToArchiveCommand(DataverseRequest aRequest, DatasetVersion version) { + super(aRequest, version); + } + + @Override + public WorkflowStepResult performArchiveSubmission(DatasetVersion dv, ApiToken token, Map requestedSettings) { + logger.fine("In GoogleCloudSubmitToArchiveCommand..."); + String bucketName = requestedSettings.get(GOOGLECLOUD_BUCKET); + String projectName = requestedSettings.get(GOOGLECLOUD_PROJECT); + logger.fine("Project: " + projectName + " Bucket: " + bucketName); + if (bucketName != null && projectName != null) { + Storage storage; + try { + FileInputStream fis = new FileInputStream(System.getProperty("dataverse.files.directory") + System.getProperty("file.separator")+ "googlecloudkey.json"); + storage = StorageOptions.newBuilder() + .setCredentials(ServiceAccountCredentials.fromStream(fis)) + .setProjectId(projectName) + .build() + .getService(); + Bucket bucket = storage.get(bucketName); + + Dataset dataset = dv.getDataset(); + if (dataset.getLockFor(Reason.finalizePublication) == null) { + + String spaceName = dataset.getGlobalId().asString().replace(':', '-').replace('/', '-') + .replace('.', '-').toLowerCase(); + + DataCitation dc = new DataCitation(dv); + Map metadata = dc.getDataCiteMetadata(); + String dataciteXml = DOIDataCiteRegisterService.getMetadataFromDvObject( + dv.getDataset().getGlobalId().asString(), metadata, dv.getDataset()); + String blobIdString = null; + MessageDigest messageDigest = MessageDigest.getInstance("MD5"); + try (PipedInputStream dataciteIn = new PipedInputStream(); DigestInputStream digestInputStream = new DigestInputStream(dataciteIn, messageDigest)) { + // Add datacite.xml file + + new Thread(new Runnable() { + public void run() { + try (PipedOutputStream dataciteOut = new PipedOutputStream(dataciteIn)) { + + dataciteOut.write(dataciteXml.getBytes(Charset.forName("utf-8"))); + dataciteOut.close(); + } catch (Exception e) { + logger.severe("Error creating datacite.xml: " + e.getMessage()); + // TODO Auto-generated catch block + e.printStackTrace(); + throw new RuntimeException("Error creating datacite.xml: " + e.getMessage()); + } + } + }).start(); + //Have seen broken pipe in PostPublishDataset workflow without this delay + int i=0; + while(digestInputStream.available()<=0 && i<100) { + Thread.sleep(10); + i++; + } + Blob dcXml = bucket.create(spaceName + "/datacite.v" + dv.getFriendlyVersionNumber()+".xml", digestInputStream, "text/xml", Bucket.BlobWriteOption.doesNotExist()); + String checksum = dcXml.getMd5ToHexString(); + logger.fine("Content: datacite.xml added with checksum: " + checksum); + String localchecksum = Hex.encodeHexString(digestInputStream.getMessageDigest().digest()); + if (!checksum.equals(localchecksum)) { + 
logger.severe(checksum + " not equal to " + localchecksum); + return new Failure("Error in transferring DataCite.xml file to GoogleCloud", + "GoogleCloud Submission Failure: incomplete metadata transfer"); + } + + // Store BagIt file + String fileName = spaceName + ".v" + dv.getFriendlyVersionNumber() + ".zip"; + + // Add BagIt ZIP file + // Google uses MD5 as one way to verify the + // transfer + messageDigest = MessageDigest.getInstance("MD5"); + try (PipedInputStream in = new PipedInputStream(100000); DigestInputStream digestInputStream2 = new DigestInputStream(in, messageDigest);) { + Thread writeThread = new Thread(new Runnable() { + public void run() { + try (PipedOutputStream out = new PipedOutputStream(in)) { + // Generate bag + BagGenerator bagger = new BagGenerator(new OREMap(dv, false), dataciteXml); + bagger.setAuthenticationKey(token.getTokenString()); + bagger.generateBag(out); + } catch (Exception e) { + logger.severe("Error creating bag: " + e.getMessage()); + // TODO Auto-generated catch block + e.printStackTrace(); + try { + digestInputStream2.close(); + } catch(Exception ex) { + logger.warning(ex.getLocalizedMessage()); + } + throw new RuntimeException("Error creating bag: " + e.getMessage()); + } + } + }); + writeThread.start(); + /* + * The following loop handles two issues. First, with no delay, the + * bucket.create() call below can get started before the piped streams are set + * up, causing a failure (seen when triggered in a PostPublishDataset workflow). + * A minimal initial wait, e.g. until some bytes are available, would address + * this. Second, the BagGenerator class, due to it's use of parallel streaming + * creation of the zip file, has the characteristic that it makes a few bytes + * available - from setting up the directory structure for the zip file - + * significantly earlier than it is ready to stream file content (e.g. for + * thousands of files and GB of content). If, for these large datasets, + * bucket.create() is called as soon as bytes are available, the call can + * timeout before the bytes for all the zipped files are available. To manage + * this, the loop waits until 90K bytes are available, larger than any expected + * dir structure for the zip and implying that the main zipped content is + * available, or until the thread terminates, with all of its content written to + * the pipe. (Note the PipedInputStream buffer is set at 100K above - I didn't + * want to test whether that means that exactly 100K bytes will be available() + * for large datasets or not, so the test below is at 90K.) + * + * An additional sanity check limits the wait to 2K seconds. The BagGenerator + * has been used to archive >120K files, 2K directories, and ~600GB files on the + * SEAD project (streaming content to disk rather than over an internet + * connection) which would take longer than 2K seconds (10+ hours) and might + * produce an initial set of bytes for directories > 90K. If Dataverse ever + * needs to support datasets of this size, the numbers here would need to be + * increased, and/or a change in how archives are sent to google (e.g. as + * multiple blobs that get aggregated) would be required. 
+ */ + i=0; + while(digestInputStream2.available()<=90000 && i<2000 && writeThread.isAlive()) { + Thread.sleep(1000); + logger.fine("avail: " + digestInputStream2.available() + " : " + writeThread.getState().toString()); + i++; + } + logger.fine("Bag: transfer started, i=" + i + ", avail = " + digestInputStream2.available()); + if(i==2000) { + throw new IOException("Stream not available"); + } + Blob bag = bucket.create(spaceName + "/" + fileName, digestInputStream2, "application/zip", Bucket.BlobWriteOption.doesNotExist()); + if(bag.getSize()==0) { + throw new IOException("Empty Bag"); + } + blobIdString = bag.getBlobId().getBucket() + "/" + bag.getBlobId().getName(); + checksum = bag.getMd5ToHexString(); + logger.fine("Bag: " + fileName + " added with checksum: " + checksum); + localchecksum = Hex.encodeHexString(digestInputStream2.getMessageDigest().digest()); + if (!checksum.equals(localchecksum)) { + logger.severe(checksum + " not equal to " + localchecksum); + return new Failure("Error in transferring Zip file to GoogleCloud", + "GoogleCloud Submission Failure: incomplete archive transfer"); + } + } catch (RuntimeException rte) { + logger.severe("Error creating Bag during GoogleCloud archiving: " + rte.getMessage()); + return new Failure("Error in generating Bag", + "GoogleCloud Submission Failure: archive file not created"); + } + + logger.fine("GoogleCloud Submission step: Content Transferred"); + + // Document the location of dataset archival copy location (actually the URL + // where you can + // view it as an admin) + + StringBuffer sb = new StringBuffer("https://console.cloud.google.com/storage/browser/"); + sb.append(blobIdString); + dv.setArchivalCopyLocation(sb.toString()); + } catch (RuntimeException rte) { + logger.severe("Error creating datacite xml file during GoogleCloud Archiving: " + rte.getMessage()); + return new Failure("Error in generating datacite.xml file", + "GoogleCloud Submission Failure: metadata file not created"); + } + } else { + logger.warning("GoogleCloud Submision Workflow aborted: Dataset locked for pidRegister"); + return new Failure("Dataset locked"); + } + } catch (Exception e) { + logger.warning(e.getLocalizedMessage()); + e.printStackTrace(); + return new Failure("GoogleCloud Submission Failure", + e.getLocalizedMessage() + ": check log for details"); + + } + return WorkflowStepResult.OK; + } else { + return new Failure("GoogleCloud Submission not configured - no \":GoogleCloudBucket\" and/or \":GoogleCloudProject\"."); + } + } + +}
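After a successful submission, the archived objects can be checked directly in the bucket, for example with the Google Cloud SDK's gsutil tool (not part of Dataverse). The names below are illustrative: as in the code above, the command writes the Bag and the datacite.xml under a prefix derived from the lowercased dataset DOI with ':', '/', and '.' replaced by '-'::

    # List the archived objects for a dataset
    gsutil ls gs://my-archive-bucket/doi-10-5072-fk2-abcdef/

    # Show object metadata, including the MD5 hash recorded by Google Cloud Storage,
    # which the command compares against its locally computed digest
    gsutil stat gs://my-archive-bucket/doi-10-5072-fk2-abcdef/doi-10-5072-fk2-abcdef.v1.0.zip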