Skip to content

Commit

Permalink
File Detection - Add support for files without extensions
Browse files Browse the repository at this point in the history
  • Loading branch information
abujeda committed Jul 19, 2022
1 parent 567e506 commit 5d1ae86
Show file tree
Hide file tree
Showing 7 changed files with 54 additions and 20 deletions.
9 changes: 9 additions & 0 deletions doc/release-notes/8740-file-recognition-based-on-filename.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
### File type detection
File types are now detected based on the filename when the file has no extension.

The following filenames are now recognized and mapped to these MIME types:

- Makefile=text/x-makefile
- Snakemake=text/x-snakemake
- Dockerfile=application/x-docker-file
- Vagrantfile=application/x-vagrant-file
40 changes: 21 additions & 19 deletions src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
import edu.harvard.iq.dataverse.DatasetVersion;
import edu.harvard.iq.dataverse.Embargo;
import edu.harvard.iq.dataverse.FileMetadata;
import edu.harvard.iq.dataverse.TermsOfUseAndAccess;
import edu.harvard.iq.dataverse.dataaccess.DataAccess;
import edu.harvard.iq.dataverse.dataaccess.ImageThumbConverter;
import edu.harvard.iq.dataverse.dataaccess.S3AccessIO;
Expand All @@ -53,7 +52,7 @@
import static edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil.formatLink;
import static edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil.formatTableCellAlignRight;
import static edu.harvard.iq.dataverse.util.xml.html.HtmlFormatUtil.formatTableRow;
import java.awt.image.BufferedImage;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
Expand All @@ -76,7 +75,6 @@
import java.text.MessageFormat;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.Map;
import java.util.MissingResourceException;
import java.util.ArrayList;
Expand All @@ -90,11 +88,6 @@
import javax.activation.MimetypesFileTypeMap;
import javax.ejb.EJBException;
import javax.enterprise.inject.spi.CDI;
import javax.faces.application.FacesMessage;
import javax.faces.component.UIComponent;
import javax.faces.component.UIInput;
import javax.faces.context.FacesContext;
import javax.faces.validator.ValidatorException;
import javax.json.JsonArray;
import javax.json.JsonObject;
import javax.xml.stream.XMLStreamConstants;
Expand All @@ -108,7 +101,6 @@
import java.util.zip.ZipInputStream;
import org.apache.commons.io.FilenameUtils;

import com.amazonaws.AmazonServiceException;
import edu.harvard.iq.dataverse.dataaccess.DataAccessOption;
import edu.harvard.iq.dataverse.dataaccess.StorageIO;
import edu.harvard.iq.dataverse.datasetutility.FileSizeChecker;
Expand Down Expand Up @@ -487,22 +479,27 @@ public static String determineFileType(File f, String fileName) throws IOExcepti
// step 4:
// Additional processing; if we haven't gotten much useful information
// back from Jhove, we'll try and make an educated guess based on
// the file extension:
// the file name and extension:

if ( fileExtension != null) {
logger.fine("fileExtension="+fileExtension);

if (fileType == null || fileType.startsWith("text/plain") || "application/octet-stream".equals(fileType)) {
if (fileType != null && fileType.startsWith("text/plain") && STATISTICAL_FILE_EXTENSION.containsKey(fileExtension)) {
fileType = STATISTICAL_FILE_EXTENSION.get(fileExtension);
} else {
fileType = determineFileTypeByExtension(fileName);
fileType = determineFileTypeByNameAndExtension(fileName);
}

logger.fine("mime type recognized by extension: "+fileType);
}
} else {
logger.fine("fileExtension is null");
String fileTypeByName = lookupFileTypeFromPropertiesFile(fileName);
if(!StringUtil.isEmpty(fileTypeByName)) {
logger.fine(String.format("mime type: %s recognized by filename: %s", fileTypeByName, fileName));
fileType = fileTypeByName;
}
}

// step 5:
Expand Down Expand Up @@ -552,7 +549,7 @@ public static String determineFileType(File f, String fileName) throws IOExcepti
return fileType;
}

public static String determineFileTypeByExtension(String fileName) {
public static String determineFileTypeByNameAndExtension(String fileName) {
String mimetypesFileTypeMapResult = MIME_TYPE_MAP.getContentType(fileName);
logger.fine("MimetypesFileTypeMap type by extension, for " + fileName + ": " + mimetypesFileTypeMapResult);
if (mimetypesFileTypeMapResult != null) {
Expand All @@ -567,14 +564,19 @@ public static String determineFileTypeByExtension(String fileName) {
}

/**
 * Looks up a MIME type for {@code fileName} in one of two properties bundles,
 * using {@code BundleUtil.getStringFromPropertyFile}.
 *
 * <p>The lookup key is the file extension (as returned by
 * {@code FilenameUtils.getExtension}) and the bundle is
 * {@code MimeTypeDetectionByFileExtension}. When the extension is null or
 * empty, the whole {@code fileName} becomes the key and the bundle is
 * {@code MimeTypeDetectionByFileName} instead.
 *
 * <p>NOTE(review): this listing shows both the pre-change ({@code fileExtension})
 * and post-change ({@code fileKey}) lines of the commit side by side; the
 * description here follows the {@code fileKey} version.
 *
 * <p>NOTE(review): if {@code fileName} is null, {@code fileKey} is reassigned
 * to that same null and passed on to {@code BundleUtil} - confirm that callers
 * never pass null or that the helper tolerates it.
 *
 * <p>NOTE(review): presumably the key match is case-sensitive (e.g. "makefile"
 * would not match a "Makefile" entry) - verify against {@code BundleUtil}.
 *
 * @param fileName the file name to classify, with or without an extension
 * @return the value found in the selected bundle, or {@code null} when a
 *         {@code MissingResourceException} is caught
 */
public static String lookupFileTypeFromPropertiesFile(String fileName) {
String fileExtension = FilenameUtils.getExtension(fileName);
String fileKey = FilenameUtils.getExtension(fileName);
String propertyFileName = "MimeTypeDetectionByFileExtension";
if(fileKey == null || fileKey.isEmpty()) {
// No extension to key on: fall back to the full file name and the
// filename-based bundle (e.g. "Makefile", "Dockerfile").
fileKey = fileName;
propertyFileName = "MimeTypeDetectionByFileName";

}
// Only used in log messages so the operator sees the on-disk file to edit.
String propertyFileNameOnDisk = propertyFileName + ".properties";
try {
logger.fine("checking " + propertyFileNameOnDisk + " for file extension " + fileExtension);
return BundleUtil.getStringFromPropertyFile(fileExtension, propertyFileName);
logger.fine("checking " + propertyFileNameOnDisk + " for file key " + fileKey);
return BundleUtil.getStringFromPropertyFile(fileKey, propertyFileName);
} catch (MissingResourceException ex) {
// An unknown key is not an error: log a hint and report "no type found".
logger.info(fileExtension + " is a file extension Dataverse doesn't know about. Consider adding it to the " + propertyFileNameOnDisk + " file.");
logger.info(fileKey + " is a filename/extension Dataverse doesn't know about. Consider adding it to the " + propertyFileNameOnDisk + " file.");
return null;
}
}
Expand Down Expand Up @@ -1145,7 +1147,7 @@ public static CreateDataFileResult createDataFiles(DatasetVersion version, Input
} else {
// Default to suppliedContentType if set or the overall undetermined default if a contenttype isn't supplied
finalType = StringUtils.isBlank(suppliedContentType) ? FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT : suppliedContentType;
String type = determineFileTypeByExtension(fileName);
String type = determineFileTypeByNameAndExtension(fileName);
if (!StringUtils.isBlank(type)) {
//Use rules for deciding when to trust browser supplied type
if (useRecognizedType(finalType, type)) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Makefile=text/x-makefile
Snakemake=text/x-snakemake
Dockerfile=application/x-docker-file
Vagrantfile=application/x-vagrant-file
2 changes: 2 additions & 0 deletions src/main/java/propertyFiles/MimeTypeDisplay.properties
Original file line number Diff line number Diff line change
Expand Up @@ -219,5 +219,7 @@ video/webm=WebM Video
text/xml-graphml=GraphML Network Data
# Other
application/octet-stream=Unknown
application/x-docker-file=Docker Image File
application/x-vagrant-file=Vagrant Image File
# Dataverse-specific
application/vnd.dataverse.file-package=Dataverse Package
2 changes: 2 additions & 0 deletions src/main/java/propertyFiles/MimeTypeFacets.properties
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ text/x-ruby-script=Code
text/x-dagman=Code
text/x-makefile=Code
text/x-snakemake=Code
application/x-docker-file=Code
application/x-vagrant-file=Code
# Ingested
text/tab-separated-values=Tabular Data
# Data
Expand Down
16 changes: 15 additions & 1 deletion src/test/java/edu/harvard/iq/dataverse/util/FileUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ public void testRescaleImage() throws IOException {
}*/

@Test
public void testDetermineFileType() {
public void testDetermineFileTypeByExtension() {
File file = new File("src/main/webapp/resources/images/cc0.png");
if (file.exists()) {
try {
Expand All @@ -316,6 +316,20 @@ public void testDetermineFileType() {
}
}

/**
 * Checks that a file with no extension is typed from its bare file name:
 * the "Makefile" fixture must be reported as {@code text/x-makefile}.
 *
 * @throws IOException if {@code FileUtil.determineFileType} cannot read the
 *         fixture; propagated so the test fails instead of passing silently
 */
@Test
public void testDetermineFileTypeByName() throws IOException {
    File file = new File("src/test/resources/fileutil/Makefile");
    if (!file.exists()) {
        fail("File does not exist: " + file.toPath().toString());
    }
    // Previously an IOException was caught and only logged, which let the test
    // pass without the assertion ever being evaluated.
    assertEquals("text/x-makefile", FileUtil.determineFileType(file, "Makefile"));
}

// isThumbnailSuppported() has been moved from DataFileService to FileUtil:
/**
* Expect that {@code null}, a DataFile without content type and a DataFile
Expand Down
1 change: 1 addition & 0 deletions src/test/resources/fileutil/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
To test file type recognition from file name

0 comments on commit 5d1ae86

Please sign in to comment.