Merge remote-tracking branch 'IQSS/develop' into DANS/7564
qqmyers committed Feb 24, 2021
2 parents 97eeab8 + b2726b1 commit 7f602e3
Showing 25 changed files with 747 additions and 240 deletions.
29 changes: 29 additions & 0 deletions doc/release-notes/7084-crawlable-file-access.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
## Release Highlights

### A new file access API

A new API offers a *crawlable* view of the folders and files within a dataset:

```
/api/datasets/<dataset id>/dirindex/
```

will output a simple HTML listing, based on the standard Apache
directory index, with Access API download links for individual files
and recursive calls to the API above for sub-folders. (See the
documentation entry in the guides for more information.)

Using this API, ``wget --recursive`` (or a similar crawling client) can
download all the files in a dataset, preserving the file names and
folder structure, without having to use the download-as-zip API. In
addition to being faster (zipping is a relatively resource-intensive
operation on the server side), this process can be restarted if
interrupted (with ``wget --continue`` or equivalent), unlike zipped
multi-file downloads, which always have to start from the beginning.

On a system that uses S3 with download redirects, the individual file
downloads will be handled by S3 directly, without having to be proxied
through the Dataverse application.
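A client can also walk the listing itself instead of using ``wget``. Below is a minimal sketch (hypothetical code, not part of this commit, assuming only the listing format shown in the guides) that extracts the file and sub-folder links from a dirindex page:

```python
from html.parser import HTMLParser

class DirIndexParser(HTMLParser):
    """Collects links from a Dataverse dirindex HTML listing."""
    def __init__(self):
        super().__init__()
        self.files = []    # direct file download links
        self.folders = []  # sub-folder dirindex links

    def handle_starttag(self, tag, attrs):
        if tag != "a":
            return
        href = dict(attrs).get("href", "")
        if "/dirindex/" in href:
            self.folders.append(href)
        elif "/api/access/datafile/" in href:
            self.files.append(href)

# Sample rows shaped like the listing format documented in the guides.
sample = (
    '<a href="/api/datasets/24/dirindex/?folder=subfolder">subfolder/</a>'
    '<a href="/api/access/datafile/42">testfile.txt</a>'
)
parser = DirIndexParser()
parser.feed(sample)
print(parser.files)    # ['/api/access/datafile/42']
print(parser.folders)  # ['/api/datasets/24/dirindex/?folder=subfolder']
```

A real crawler would download each entry in ``parser.files`` and recurse into each entry in ``parser.folders``.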


@@ -94,10 +94,13 @@
<div class="search-widget input-group">
<input id="inputDataverseSearch" class="form-control" type="text" placeholder="" onkeydown="if (event.keyCode == 13) document.getElementById('btnDataverseSearch').click();">
<span class="input-group-btn">
<button id="btnDataverseSearch" class="btn btn-default" type="button" onclick="window.location = '/dataverse/root?q=' + document.getElementById('inputDataverseSearch').value;return false;"><span class="glyphicon glyphicon-search"></span> Find</button>
<button id="btnDataverseSearch" class="btn btn-default bootstrap-button-tooltip" type="button" data-original-title="Find" onclick="window.location = '/dataverse/root?q=' + document.getElementById('inputDataverseSearch').value;return false;"><span class="glyphicon glyphicon-search"></span></button>
</span>
</div>
</div>
<div class="col-md-5 col-sm-6 col-xs-11">
<p class="browse-all-data small text-muted"><a href= "/dataverse/root">VIEW ALL DATA</a> <span class="glyphicon glyphicon-chevron-right"></span></p>
</div>
</div>
</div>

2 changes: 1 addition & 1 deletion doc/sphinx-guides/source/api/client-libraries.rst
@@ -13,7 +13,7 @@ Python

There are two Python modules for interacting with Dataverse Software APIs.

`pyDataverse <https://github.com/AUSSDA/pyDataverse>`_ had its initial release in 2019 and can be installed with ``pip install pyDataverse``. The module is developed by `Stefan Kasberger <http://stefankasberger.at>`_ from `AUSSDA - The Austrian Social Science Data Archive <https://aussda.at>`_.
`pyDataverse <https://github.com/gdcc/pyDataverse>`_ primarily allows developers to manage Dataverse collections, datasets and datafiles. Its intention is to help with data migrations and DevOps activities such as testing and configuration management. The module is developed by `Stefan Kasberger <http://stefankasberger.at>`_ from `AUSSDA - The Austrian Social Science Data Archive <https://aussda.at>`_.

`dataverse-client-python <https://github.com/IQSS/dataverse-client-python>`_ had its initial release in 2015. `Robert Liebowitz <https://github.com/rliebz>`_ created this library while at the `Center for Open Science (COS) <https://centerforopenscience.org>`_ and the COS uses it to integrate the `Open Science Framework (OSF) <https://osf.io>`_ with a Dataverse installation via an add-on which itself is open source and listed on the :doc:`/api/apps` page.

(4 image files added under doc/sphinx-guides/source/api/img/; previews not available)
85 changes: 84 additions & 1 deletion doc/sphinx-guides/source/api/native-api.rst
@@ -783,9 +783,92 @@ List Files in a Dataset
The fully expanded example above (without environment variables) looks like this:

.. code-block:: bash

  curl https://demo.dataverse.org/api/datasets/24/versions/1.0/files

View Dataset Files and Folders as a Directory Index
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

|CORS| Provides a *crawlable* view of the files and folders within the given dataset and version:

.. code-block:: bash

  curl $SERVER_URL/api/datasets/$ID/dirindex/

Optional parameters:

* ``folder`` - A subfolder within the dataset (default: top-level view of the dataset)
* ``version`` - Specifies the version (default: latest published version)
* ``original=true`` - Download original versions of ingested tabular files.

This API outputs a simple HTML listing, based on the standard Apache
directory index, with Access API download links for individual files,
and recursive calls to the API above for sub-folders.

Using this API, ``wget --recursive`` (or a similar crawling client) can
download all the files in a dataset, preserving the file names and
folder structure, without having to use the download-as-zip API. In
addition to being faster (zipping is a relatively resource-intensive
operation on the server side), this process can be restarted if
interrupted (with ``wget --continue`` or equivalent), unlike zipped
multi-file downloads, which always have to start from the beginning.

On a system that uses S3 with download redirects, the individual file
downloads will be handled by S3 directly, without having to be proxied
through the Dataverse application.

For example, if you have a dataset version with 2 files, one of them in a folder named "subfolder":

.. |image1| image:: ./img/dataset_page_files_view.png

or, as viewed as a tree on the dataset page:

.. |image2| image:: ./img/dataset_page_tree_view.png

The output of the API for the top-level folder (``/api/datasets/{dataset}/dirindex/``) will be as follows:

.. |image3| image:: ./img/index_view_top.png

with the underlying HTML source:

.. code-block:: html

  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
  <html><head><title>Index of folder /</title></head>
  <body><h1>Index of folder / in dataset doi:XXX/YY/ZZZZ (v. MM)</h1>
  <table>
  <tr><th>Name</th><th>Last Modified</th><th>Size</th><th>Description</th></tr>
  <tr><th colspan="4"><hr></th></tr>
  <tr><td><a href="/api/datasets/NNNN/dirindex/?folder=subfolder">subfolder/</a></td><td align="right"> - </td><td align="right"> - </td><td align="right">&nbsp;</td></tr>
  <tr><td><a href="/api/access/datafile/KKKK">testfile.txt</a></td><td align="right">13-January-2021 22:35</td><td align="right">19 B</td><td align="right">&nbsp;</td></tr>
  </table></body></html>

The ``/dirindex/?folder=subfolder`` link above will produce the following view:

.. |image4| image:: ./img/index_view_subfolder.png

with the HTML source as follows:

.. code-block:: html

  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
  <html><head><title>Index of folder /subfolder</title></head>
  <body><h1>Index of folder /subfolder in dataset doi:XXX/YY/ZZZZ (v. MM)</h1>
  <table>
  <tr><th>Name</th><th>Last Modified</th><th>Size</th><th>Description</th></tr>
  <tr><th colspan="4"><hr></th></tr>
  <tr><td><a href="/api/access/datafile/subfolder/LLLL">50by1000.tab</a></td><td align="right">11-January-2021 09:31</td><td align="right">102.5 KB</td><td align="right">&nbsp;</td></tr>
  </table></body></html>

An example of a ``wget`` command line for crawling ("recursive downloading") the files and folders in a dataset:

.. code-block:: bash

  wget -r -e robots=off -nH --cut-dirs=3 --content-disposition https://demo.dataverse.org/api/datasets/24/dirindex/

.. note:: In addition to the files and folders in the dataset, the command line above will also save the directory index of each folder, in a separate folder named "dirindex".
List All Metadata Blocks for a Dataset
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 changes: 27 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/DatasetVersion.java
@@ -214,6 +214,33 @@ public List<FileMetadata> getFileMetadatasSortedByLabelAndFolder() {
Collections.sort(fileMetadatasCopy, FileMetadata.compareByLabelAndFolder);
return fileMetadatasCopy;
}

public List<FileMetadata> getFileMetadatasFolderListing(String folderName) {
ArrayList<FileMetadata> fileMetadatasCopy = new ArrayList<>();
HashSet<String> subFolders = new HashSet<>();

for (FileMetadata fileMetadata : fileMetadatas) {
String thisFolder = fileMetadata.getDirectoryLabel() == null ? "" : fileMetadata.getDirectoryLabel();

if (folderName.equals(thisFolder)) {
fileMetadatasCopy.add(fileMetadata);
} else if (thisFolder.startsWith(folderName)) {
String subFolder = "".equals(folderName) ? thisFolder : thisFolder.substring(folderName.length() + 1);
if (subFolder.indexOf('/') > 0) {
subFolder = subFolder.substring(0, subFolder.indexOf('/'));
}

if (!subFolders.contains(subFolder)) {
fileMetadatasCopy.add(fileMetadata);
subFolders.add(subFolder);
}

}
}
Collections.sort(fileMetadatasCopy, FileMetadata.compareByFullPath);

return fileMetadatasCopy;
}
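For illustration, the listing logic above can be sketched in Python (a hypothetical port, not part of the commit): files directly in the requested folder are returned, plus one representative entry per immediate sub-folder, sorted by case-insensitive full path.

```python
def folder_listing(file_metadatas, folder_name):
    # file_metadatas: (directory_label, label) pairs; directory_label may be "".
    listing, seen_subfolders = [], set()
    for directory_label, label in file_metadatas:
        this_folder = directory_label or ""
        if this_folder == folder_name:
            listing.append((directory_label, label))
        elif this_folder.startswith(folder_name):
            # Keep only the first file found in each immediate sub-folder.
            sub = this_folder if folder_name == "" else this_folder[len(folder_name) + 1:]
            sub = sub.split("/", 1)[0]
            if sub not in seen_subfolders:
                listing.append((directory_label, label))
                seen_subfolders.add(sub)
    # Sort by upper-cased full path, mirroring the Java comparator.
    return sorted(listing,
                  key=lambda fm: ((fm[0].upper() + "/") if fm[0] else "") + fm[1].upper())

files = [("", "readme.txt"), ("subfolder", "50by1000.tab"), ("subfolder/deep", "x.txt")]
print(folder_listing(files, ""))
# [('', 'readme.txt'), ('subfolder', '50by1000.tab')]
```

Requesting ``folder_listing(files, "subfolder")`` would return the file in that folder plus one entry standing in for ``subfolder/deep``.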

public void setFileMetadatas(List<FileMetadata> fileMetadatas) {
this.fileMetadatas = fileMetadatas;
14 changes: 1 addition & 13 deletions src/main/java/edu/harvard/iq/dataverse/DataversePage.java
@@ -511,18 +511,6 @@ public void hideDatasetFieldTypes(Long mdbId) {
}
setEditInputLevel(false);
}

public void toggleInputLevel( Long mdbId, long dsftId){
for (MetadataBlock mdb : allMetadataBlocks) {
if (mdb.getId().equals(mdbId)) {
for (DatasetFieldType dsftTest : mdb.getDatasetFieldTypes()) {
if (dsftTest.getId().equals(dsftId)) {
dsftTest.setRequiredDV(!dsftTest.isRequiredDV());
}
}
}
}
}

public void updateInclude(Long mdbId, long dsftId) {
List<DatasetFieldType> childDSFT = new ArrayList<>();
@@ -903,7 +891,7 @@ public String releaseDataverse() {

}
} else {
JsfHelper.addErrorMessage(BundleUtil.getStringFromBundle("dataverse.publish.not.authorized"));
JsfHelper.addErrorMessage(BundleUtil.getStringFromBundle("dataverse.release.authenticatedUsersOnly"));
}
return returnRedirect();

5 changes: 3 additions & 2 deletions src/main/java/edu/harvard/iq/dataverse/EditDatafilesPage.java
@@ -1679,11 +1679,12 @@ public void uploadFinished() {
setLabelForDeleteFilesPopup();
PrimeFaces.current().ajax().update("datasetForm:fileAlreadyExistsPopup");
PrimeFaces.current().executeScript("PF('fileAlreadyExistsPopup').show();");
} else {
//adding back warnings in non-replace situations
FacesContext.getCurrentInstance().addMessage(uploadComponentId, new FacesMessage(FacesMessage.SEVERITY_WARN, BundleUtil.getStringFromBundle("dataset.file.uploadWarning"), uploadWarningMessage));
}


//taking this out for now based on design feedback 7/8/2020
// FacesContext.getCurrentInstance().addMessage(uploadComponentId, new FacesMessage(FacesMessage.SEVERITY_WARN, BundleUtil.getStringFromBundle("dataset.file.uploadWarning"), uploadWarningMessage));

} else if (uploadSuccessMessage != null) {
FacesContext.getCurrentInstance().addMessage(uploadComponentId, new FacesMessage(FacesMessage.SEVERITY_INFO, BundleUtil.getStringFromBundle("dataset.file.uploadWorked"), uploadSuccessMessage));
10 changes: 10 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/FileMetadata.java
@@ -575,6 +575,16 @@ public int compare(FileMetadata o1, FileMetadata o2) {
}
};

public static final Comparator<FileMetadata> compareByFullPath = new Comparator<FileMetadata>() {
@Override
public int compare(FileMetadata o1, FileMetadata o2) {
String folder1 = StringUtil.isEmpty(o1.getDirectoryLabel()) ? "" : o1.getDirectoryLabel().toUpperCase() + "/";
String folder2 = StringUtil.isEmpty(o2.getDirectoryLabel()) ? "" : o2.getDirectoryLabel().toUpperCase() + "/";

return folder1.concat(o1.getLabel().toUpperCase()).compareTo(folder2.concat(o2.getLabel().toUpperCase()));
}
};
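The comparator's sort key can be read as follows (illustrative Python, not shipped code): a non-empty folder contributes ``FOLDER/`` before the upper-cased label, giving a case-insensitive full-path ordering.

```python
def full_path_key(directory_label, label):
    # Empty folder contributes nothing; otherwise FOLDER/ is prefixed
    # before the upper-cased file label, as in compareByFullPath.
    folder = (directory_label.upper() + "/") if directory_label else ""
    return folder + label.upper()

paths = [("b", "a.txt"), ("", "z.txt"), ("B", "A2.txt"), ("a", "q.txt")]
print(sorted(paths, key=lambda p: full_path_key(*p)))
# [('a', 'q.txt'), ('b', 'a.txt'), ('B', 'A2.txt'), ('', 'z.txt')]
```

Note that folders in differing case ("b" and "B") sort together, and top-level files sort by bare label.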


public String toPrettyJSON(){

22 changes: 20 additions & 2 deletions src/main/java/edu/harvard/iq/dataverse/api/Access.java
@@ -270,11 +270,21 @@ private DataFile findDataFileOrDieWrapper(String fileId){
}


@Path("datafile/{fileId}")
@Path("datafile/{fileId:.+}")
@GET
@Produces({"application/xml"})
public DownloadInstance datafile(@PathParam("fileId") String fileId, @QueryParam("gbrecs") boolean gbrecs, @QueryParam("key") String apiToken, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) /*throws NotFoundException, ServiceUnavailableException, PermissionDeniedException, AuthorizationRequiredException*/ {


if (fileId.indexOf('/') > -1) {
// This is for embedding folder names into the Access API URLs;
// something like /api/access/datafile/folder/subfolder/1234
// instead of the normal /api/access/datafile/1234 notation.
// this is supported only for recreating folders during recursive downloads -
// i.e. they are embedded into the URL for the remote client like wget,
// but can be safely ignored here.
fileId = fileId.substring(fileId.lastIndexOf('/') + 1);
}

DataFile df = findDataFileOrDieWrapper(fileId);
GuestbookResponse gbr = null;

@@ -331,6 +341,14 @@ public DownloadInstance datafile(@PathParam("fileId") String fileId, @QueryParam
// So we need to identify when a service is being called and then let checkIfServiceSupportedAndSetConverter see if the required one exists
if (key.equals("imageThumb") || key.equals("format") || key.equals("variables") || key.equals("noVarHeader")) {
serviceRequested = true;
//In the dataset file table context a user is allowed to select original as the format
//for download
// if the dataset has tabular files - it should not be applied to instances
// where the file selected is not tabular see #6972
if("format".equals(key) && "original".equals(value) && !df.isTabularData()) {
serviceRequested = false;
break;
}
//Only need to check if this key is associated with a service
if (downloadInstance.checkIfServiceSupportedAndSetConverter(key, value)) {
// this automatically sets the conversion parameters in
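The folder-embedding convention in the Access API change above amounts to keeping only the last path segment of the file ID; a sketch (hypothetical Python, for illustration):

```python
def resolve_file_id(file_id: str) -> str:
    # /api/access/datafile/folder/subfolder/1234 resolves to "1234";
    # the embedded folders exist only so that recursive downloaders
    # (e.g. wget) recreate the directory structure locally.
    if "/" in file_id:
        file_id = file_id.rsplit("/", 1)[1]
    return file_id

print(resolve_file_id("folder/subfolder/1234"))  # 1234
print(resolve_file_id("1234"))                   # 1234
```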
37 changes: 37 additions & 0 deletions src/main/java/edu/harvard/iq/dataverse/api/Datasets.java
@@ -158,6 +158,7 @@
import org.glassfish.jersey.media.multipart.FormDataParam;

import com.amazonaws.services.s3.model.PartETag;
import edu.harvard.iq.dataverse.FileMetadata;
import java.util.Map.Entry;

@Path("datasets")
@@ -468,6 +469,42 @@ public Response getVersionFiles( @PathParam("id") String datasetId, @PathParam(
getDatasetVersionOrDie(req, versionId, findDatasetOrDie(datasetId), uriInfo, headers).getFileMetadatas())));
}

@GET
@Path("{id}/dirindex")
@Produces("text/html")
public Response getFileAccessFolderView(@PathParam("id") String datasetId, @QueryParam("version") String versionId, @QueryParam("folder") String folderName, @QueryParam("original") Boolean originals, @Context UriInfo uriInfo, @Context HttpHeaders headers, @Context HttpServletResponse response) {

folderName = folderName == null ? "" : folderName;
versionId = versionId == null ? ":latest-published" : versionId;

DatasetVersion version;
try {
DataverseRequest req = createDataverseRequest(findUserOrDie());
version = getDatasetVersionOrDie(req, versionId, findDatasetOrDie(datasetId), uriInfo, headers);
} catch (WrappedResponse wr) {
return wr.getResponse();
}

String output = FileUtil.formatFolderListingHtml(folderName, version, "", originals != null && originals);

// return "NOT FOUND" if there is no such folder in the dataset version:

if ("".equals(output)) {
return notFound("Folder " + folderName + " does not exist");
}


String indexFileName = folderName.equals("") ? ".index.html"
: ".index-" + folderName.replace('/', '_') + ".html";
response.setHeader("Content-disposition", "attachment; filename=\"" + indexFileName + "\"");


return Response.ok()
.entity(output)
//.type("application/html").
.build();
}
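The ``Content-disposition`` filename logic in the endpoint above can be sketched as (illustrative Python port, not the shipped code):

```python
def index_file_name(folder_name: str) -> str:
    # Each folder's saved index gets a distinct name so that a crawler
    # honoring Content-disposition does not overwrite earlier indexes.
    if folder_name == "":
        return ".index.html"
    return ".index-" + folder_name.replace("/", "_") + ".html"

print(index_file_name(""))            # .index.html
print(index_file_name("sub/deeper"))  # .index-sub_deeper.html
```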

@GET
@Path("{id}/versions/{versionId}/metadata")
public Response getVersionMetadata( @PathParam("id") String datasetId, @PathParam("versionId") String versionId, @Context UriInfo uriInfo, @Context HttpHeaders headers) {
@@ -136,9 +136,9 @@ public class IngestServiceBean {
@EJB
SystemConfig systemConfig;

@Resource(name = "java:app/jms/queue/ingest")
@Resource(lookup = "java:app/jms/queue/ingest")
Queue queue;
@Resource(name = "java:app/jms/factory/ingest")
@Resource(lookup = "java:app/jms/factory/ingest")
QueueConnectionFactory factory;

