
Merge pull request #6087 from IQSS/3268-harvester-improvements
Harvester improvements.
kcondon authored Aug 16, 2019
2 parents aabd676 + 275fe37 commit a1a5733
Showing 4 changed files with 61 additions and 59 deletions.
37 changes: 32 additions & 5 deletions src/main/java/edu/harvard/iq/dataverse/DatasetServiceBean.java
@@ -12,18 +12,17 @@
import edu.harvard.iq.dataverse.engine.command.CommandContext;
import edu.harvard.iq.dataverse.engine.command.DataverseRequest;
import edu.harvard.iq.dataverse.engine.command.exception.CommandException;
import edu.harvard.iq.dataverse.engine.command.impl.DestroyDatasetCommand;
import edu.harvard.iq.dataverse.engine.command.impl.FinalizeDatasetPublicationCommand;
import edu.harvard.iq.dataverse.export.ExportService;
import edu.harvard.iq.dataverse.harvest.server.OAIRecordServiceBean;
import edu.harvard.iq.dataverse.search.IndexServiceBean;
import edu.harvard.iq.dataverse.settings.SettingsServiceBean;
import edu.harvard.iq.dataverse.util.SystemConfig;
import edu.harvard.iq.dataverse.workflows.WorkflowComment;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
@@ -48,9 +47,6 @@
import javax.persistence.Query;
import javax.persistence.StoredProcedureQuery;
import javax.persistence.TypedQuery;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import org.apache.commons.lang.RandomStringUtils;
import org.ocpsoft.common.util.Strings;

@@ -928,4 +924,35 @@ public long findStorageSize(Dataset dataset, boolean countCachedExtras) throws I

return total;
}

/**
* An optimized method for deleting a harvested dataset.
*
* @param dataset
* @param request DataverseRequest (for initializing the DestroyDatasetCommand)
* @param hdLogger logger object (in practice, this will be a separate log file created for a specific harvesting job)
*/
@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
public void deleteHarvestedDataset(Dataset dataset, DataverseRequest request, Logger hdLogger) {
// Purge all the SOLR documents associated with this client from the
// index server:
indexService.deleteHarvestedDocuments(dataset);

try {
// files from harvested datasets are removed unceremoniously,
// directly in the database. no need to bother calling the
// DeleteFileCommand on them.
for (DataFile harvestedFile : dataset.getFiles()) {
DataFile merged = em.merge(harvestedFile);
em.remove(merged);
harvestedFile = null;
}
dataset.setFiles(null);
Dataset merged = em.merge(dataset);
commandEngine.submit(new DestroyDatasetCommand(merged, request));
hdLogger.info("Successfully destroyed the dataset");
} catch (Exception ex) {
hdLogger.warning("Failed to destroy the dataset");
}
}
}
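
For context, the new deleteHarvestedDataset() service method takes over from a private helper that previously lived in the harvester (removed further down in this diff), and it destroys the dataset via DestroyDatasetCommand in its own REQUIRES_NEW transaction, so a single failed deletion does not roll back the rest of a harvesting batch. A minimal sketch of the call site, using the variables visible in the HarvesterServiceBean hunks below:

// Sketch of the harvester-side call (see the HarvesterServiceBean diff below).
Dataset dataset = datasetService.getDatasetByHarvestInfo(
        oaiHandler.getHarvestingClient().getDataverse(), identifier);
if (dataset != null) {
    hdLogger.info("Deleting dataset " + dataset.getGlobalIdString());
    // Runs in its own transaction; purges the SOLR documents and the harvested
    // files, then submits a DestroyDatasetCommand.
    datasetService.deleteHarvestedDataset(dataset, dataverseRequest, hdLogger);
    deletedIdentifiers.add(identifier);
}
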
src/main/java/edu/harvard/iq/dataverse/api/imports/ImportServiceBean.java
@@ -42,6 +42,7 @@
import java.io.StringReader;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Set;
import java.util.logging.Formatter;
@@ -197,7 +198,7 @@ public JsonObjectBuilder handleFile(DataverseRequest dataverseRequest, Dataverse
}

@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, HarvestingClient harvestingClient, String harvestIdentifier, String metadataFormat, File metadataFile, PrintWriter cleanupLog) throws ImportException, IOException {
public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, HarvestingClient harvestingClient, String harvestIdentifier, String metadataFormat, File metadataFile, Date oaiDateStamp, PrintWriter cleanupLog) throws ImportException, IOException {
if (harvestingClient == null || harvestingClient.getDataverse() == null) {
throw new ImportException("importHarvestedDataset called with a null harvestingClient, or an invalid harvestingClient.");
}
@@ -275,6 +276,10 @@ public Dataset doImportHarvestedDataset(DataverseRequest dataverseRequest, Harve
ds.setOwner(owner);
ds.getLatestVersion().setDatasetFields(ds.getLatestVersion().initDatasetFields());

if (ds.getVersions().get(0).getReleaseTime() == null) {
ds.getVersions().get(0).setReleaseTime(oaiDateStamp);
}

// Check data against required constraints
List<ConstraintViolation<DatasetField>> violations = ds.getVersions().get(0).validateRequired();
if (!violations.isEmpty()) {
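
The new oaiDateStamp parameter supplies a fallback release time for harvested versions whose metadata does not carry one. A rough sketch of how the datestamp is threaded through from the OAI record header; the call shown in the HarvesterServiceBean diff below is truncated and does not show the receiver, so the importService field name here is an assumption:

// Harvester side (sketch): take the datestamp from the OAI header and pass it along.
Header h = idIter.next();
String identifier = h.getIdentifier();
Date dateStamp = h.getDatestamp();
// "importService" is an assumed injected ImportServiceBean; the diff below truncates the receiver.
Dataset harvestedDataset = importService.doImportHarvestedDataset(
        dataverseRequest,
        oaiHandler.getHarvestingClient(),
        identifier,
        oaiHandler.getMetadataPrefix(),
        record.getMetadataFile(),
        dateStamp,
        importCleanupLog);
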
src/main/java/edu/harvard/iq/dataverse/engine/command/impl/DestroyDatasetCommand.java
@@ -80,7 +80,9 @@ protected void executeImpl(CommandContext ctxt) throws CommandException {
}

//also, lets delete the uploaded thumbnails!
deleteDatasetLogo(doomed);
if (!doomed.isHarvested()) {
deleteDatasetLogo(doomed);
}


// ASSIGNMENTS
@@ -92,17 +94,20 @@ protected void executeImpl(CommandContext ctxt) throws CommandException {
ctxt.em().remove(ra);
}

GlobalIdServiceBean idServiceBean = GlobalIdServiceBean.getBean(ctxt);
try{
if(idServiceBean.alreadyExists(doomed)){
idServiceBean.deleteIdentifier(doomed);
for (DataFile df : doomed.getFiles()) {
idServiceBean.deleteIdentifier(df);
if (!doomed.isHarvested()) {
GlobalIdServiceBean idServiceBean = GlobalIdServiceBean.getBean(ctxt);
try {
if (idServiceBean.alreadyExists(doomed)) {
idServiceBean.deleteIdentifier(doomed);
for (DataFile df : doomed.getFiles()) {
idServiceBean.deleteIdentifier(df);
}
}
} catch (Exception e) {
logger.log(Level.WARNING, "Identifier deletion was not successful:", e.getMessage());
}
} catch (Exception e) {
logger.log(Level.WARNING, "Identifier deletion was not successfull:", e.getMessage());
}
}

Dataverse toReIndex = managedDoomed.getOwner();

// dataset
src/main/java/edu/harvard/iq/dataverse/harvest/client/HarvesterServiceBean.java
@@ -27,8 +27,6 @@
import javax.ejb.EJBException;
import javax.ejb.Stateless;
import javax.ejb.Timer;
import javax.ejb.TransactionAttribute;
import javax.ejb.TransactionAttributeType;
import javax.faces.bean.ManagedBean;
import javax.inject.Named;
//import javax.xml.bind.Unmarshaller;
@@ -39,14 +37,9 @@
import org.xml.sax.SAXException;

import com.lyncode.xoai.model.oaipmh.Header;
import edu.harvard.iq.dataverse.DataFile;
import edu.harvard.iq.dataverse.EjbDataverseEngine;
import edu.harvard.iq.dataverse.api.imports.ImportServiceBean;
import edu.harvard.iq.dataverse.engine.command.DataverseRequest;
import edu.harvard.iq.dataverse.engine.command.exception.CommandException;
import edu.harvard.iq.dataverse.engine.command.exception.IllegalCommandException;
import edu.harvard.iq.dataverse.engine.command.exception.PermissionException;
import edu.harvard.iq.dataverse.engine.command.impl.DeleteDatasetCommand;
import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandler;
import edu.harvard.iq.dataverse.harvest.client.oai.OaiHandlerException;
import edu.harvard.iq.dataverse.search.IndexServiceBean;
@@ -263,13 +256,14 @@ private List<Long> harvestOAI(DataverseRequest dataverseRequest, HarvestingClien

Header h = idIter.next();
String identifier = h.getIdentifier();
Date dateStamp = h.getDatestamp();

hdLogger.info("processing identifier: " + identifier);
hdLogger.info("processing identifier: " + identifier + ", date: " + dateStamp);

MutableBoolean getRecordErrorOccurred = new MutableBoolean(false);

// Retrieve and process this record with a separate GetRecord call:
Long datasetId = processRecord(dataverseRequest, hdLogger, importCleanupLog, oaiHandler, identifier, getRecordErrorOccurred, processedSizeThisBatch, deletedIdentifiers);
Long datasetId = processRecord(dataverseRequest, hdLogger, importCleanupLog, oaiHandler, identifier, getRecordErrorOccurred, processedSizeThisBatch, deletedIdentifiers, dateStamp);

hdLogger.info("Total content processed in this batch so far: "+processedSizeThisBatch);
if (datasetId != null) {
Expand Down Expand Up @@ -315,8 +309,7 @@ private List<Long> harvestOAI(DataverseRequest dataverseRequest, HarvestingClien



@TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED)
public Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, PrintWriter importCleanupLog, OaiHandler oaiHandler, String identifier, MutableBoolean recordErrorOccurred, MutableLong processedSizeThisBatch, List<String> deletedIdentifiers) {
private Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, PrintWriter importCleanupLog, OaiHandler oaiHandler, String identifier, MutableBoolean recordErrorOccurred, MutableLong processedSizeThisBatch, List<String> deletedIdentifiers, Date dateStamp) {
String errMessage = null;
Dataset harvestedDataset = null;
logGetRecord(hdLogger, oaiHandler, identifier);
@@ -334,7 +327,7 @@ public Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, Pr
Dataset dataset = datasetService.getDatasetByHarvestInfo(oaiHandler.getHarvestingClient().getDataverse(), identifier);
if (dataset != null) {
hdLogger.info("Deleting dataset " + dataset.getGlobalIdString());
deleteHarvestedDataset(dataset, dataverseRequest, hdLogger);
datasetService.deleteHarvestedDataset(dataset, dataverseRequest, hdLogger);
// TODO:
// check the status of that Delete - see if it actually succeeded
deletedIdentifiers.add(identifier);
@@ -351,7 +344,8 @@ public Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, Pr
oaiHandler.getHarvestingClient(),
identifier,
oaiHandler.getMetadataPrefix(),
record.getMetadataFile(),
record.getMetadataFile(),
dateStamp,
importCleanupLog);

hdLogger.fine("Harvest Successful for identifier " + identifier);
@@ -388,36 +382,7 @@ public Long processRecord(DataverseRequest dataverseRequest, Logger hdLogger, Pr

return harvestedDataset != null ? harvestedDataset.getId() : null;
}

private void deleteHarvestedDataset(Dataset dataset, DataverseRequest request, Logger hdLogger) {
// Purge all the SOLR documents associated with this client from the
// index server:
indexService.deleteHarvestedDocuments(dataset);

try {
// files from harvested datasets are removed unceremoniously,
// directly in the database. no need to bother calling the
// DeleteFileCommand on them.
for (DataFile harvestedFile : dataset.getFiles()) {
DataFile merged = em.merge(harvestedFile);
em.remove(merged);
harvestedFile = null;
}
dataset.setFiles(null);
Dataset merged = em.merge(dataset);
engineService.submit(new DeleteDatasetCommand(request, merged));
} catch (IllegalCommandException ex) {
// TODO: log the result
} catch (PermissionException ex) {
// TODO: log the result
} catch (CommandException ex) {
// TODO: log the result
}

// TODO: log the success result
}



private void logBeginOaiHarvest(Logger hdLogger, HarvestingClient harvestingClient) {
hdLogger.log(Level.INFO, "BEGIN HARVEST, oaiUrl="
+harvestingClient.getHarvestingUrl()
