From 7d1e6feefe18611cc34b90784da5c79a82df8cc0 Mon Sep 17 00:00:00 2001 From: jo-pol Date: Tue, 1 Oct 2024 12:00:51 +0200 Subject: [PATCH 1/5] replace ZipInputStream with ZipFile --- .../impl/CreateNewDataFilesCommand.java | 331 ++++++++---------- .../ingest/IngestServiceShapefileHelper.java | 67 ++-- .../harvard/iq/dataverse/util/FileUtil.java | 13 +- .../iq/dataverse/util/ShapefileHandler.java | 149 +++----- .../command/impl/CreateNewDataFilesTest.java | 187 ++++++++++ .../util/shapefile/ShapefileHandlerTest.java | 38 +- .../own-cloud-downloads/greetings.zip | Bin 0 -> 679 bytes .../resources/own-cloud-downloads/shapes.zip | Bin 0 -> 4336 bytes 8 files changed, 429 insertions(+), 356 deletions(-) create mode 100644 src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java create mode 100644 src/test/resources/own-cloud-downloads/greetings.zip create mode 100644 src/test/resources/own-cloud-downloads/shapes.zip diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java index 3a21345448b..e543606e039 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java @@ -2,34 +2,29 @@ import edu.harvard.iq.dataverse.DataFile; import edu.harvard.iq.dataverse.DatasetVersion; +import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.authorization.Permission; import edu.harvard.iq.dataverse.datasetutility.FileExceedsMaxSizeException; import edu.harvard.iq.dataverse.datasetutility.FileSizeChecker; -import static edu.harvard.iq.dataverse.datasetutility.FileSizeChecker.bytesToHumanReadable; import edu.harvard.iq.dataverse.engine.command.AbstractCommand; import edu.harvard.iq.dataverse.engine.command.CommandContext; import edu.harvard.iq.dataverse.engine.command.DataverseRequest; -//import edu.harvard.iq.dataverse.engine.command.RequiredPermissions; import edu.harvard.iq.dataverse.engine.command.exception.CommandException; import edu.harvard.iq.dataverse.engine.command.exception.CommandExecutionException; import edu.harvard.iq.dataverse.ingest.IngestServiceShapefileHelper; -import edu.harvard.iq.dataverse.Dataverse; import edu.harvard.iq.dataverse.storageuse.UploadSessionQuotaLimit; -import edu.harvard.iq.dataverse.util.file.FileExceedsStorageQuotaException; import edu.harvard.iq.dataverse.util.BundleUtil; import edu.harvard.iq.dataverse.util.FileUtil; -import static edu.harvard.iq.dataverse.util.FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT; -import static edu.harvard.iq.dataverse.util.FileUtil.createIngestFailureReport; -import static edu.harvard.iq.dataverse.util.FileUtil.determineFileType; -import static edu.harvard.iq.dataverse.util.FileUtil.determineFileTypeByNameAndExtension; -import static edu.harvard.iq.dataverse.util.FileUtil.getFilesTempDirectory; -import static edu.harvard.iq.dataverse.util.FileUtil.saveInputStreamInTempFile; -import static edu.harvard.iq.dataverse.util.FileUtil.useRecognizedType; import edu.harvard.iq.dataverse.util.ShapefileHandler; import edu.harvard.iq.dataverse.util.StringUtil; import edu.harvard.iq.dataverse.util.file.BagItFileHandler; import edu.harvard.iq.dataverse.util.file.BagItFileHandlerFactory; import edu.harvard.iq.dataverse.util.file.CreateDataFileResult; +import edu.harvard.iq.dataverse.util.file.FileExceedsStorageQuotaException; +import jakarta.enterprise.inject.spi.CDI; +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.StringUtils; + import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -42,7 +37,7 @@ import java.text.MessageFormat; import java.util.ArrayList; import java.util.Arrays; -import java.util.Enumeration; +import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -51,12 +46,17 @@ import java.util.Set; import java.util.logging.Logger; import java.util.zip.GZIPInputStream; -import java.util.zip.ZipFile; import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; -import jakarta.enterprise.inject.spi.CDI; -import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.StringUtils; +import java.util.zip.ZipFile; + +import static edu.harvard.iq.dataverse.datasetutility.FileSizeChecker.bytesToHumanReadable; +import static edu.harvard.iq.dataverse.util.FileUtil.MIME_TYPE_UNDETERMINED_DEFAULT; +import static edu.harvard.iq.dataverse.util.FileUtil.createIngestFailureReport; +import static edu.harvard.iq.dataverse.util.FileUtil.determineFileType; +import static edu.harvard.iq.dataverse.util.FileUtil.determineFileTypeByNameAndExtension; +import static edu.harvard.iq.dataverse.util.FileUtil.getFilesTempDirectory; +import static edu.harvard.iq.dataverse.util.FileUtil.saveInputStreamInTempFile; +import static edu.harvard.iq.dataverse.util.FileUtil.useRecognizedType; /** * @@ -140,9 +140,10 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException if (newStorageIdentifier == null) { - if (getFilesTempDirectory() != null) { + var filesTempDirectory = getFilesTempDirectory(); + if (filesTempDirectory != null) { try { - tempFile = Files.createTempFile(Paths.get(getFilesTempDirectory()), "tmp", "upload"); + tempFile = Files.createTempFile(Paths.get(filesTempDirectory), "tmp", "upload"); // "temporary" location is the key here; this is why we are not using // the DataStore framework for this - the assumption is that // temp files will always be stored on the local filesystem. @@ -260,10 +261,6 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException // DataFile objects from its contents: } else if (finalType.equals("application/zip")) { - ZipFile zipFile = null; - ZipInputStream unZippedIn = null; - ZipEntry zipEntry = null; - int fileNumberLimit = ctxt.systemConfig().getZipUploadFilesLimit(); Long combinedUnzippedFileSize = 0L; @@ -271,14 +268,14 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException Charset charset = null; /* TODO: (?) - We may want to investigate somehow letting the user specify + We may want to investigate somehow letting the user specify the charset for the filenames in the zip file... - - otherwise, ZipInputStream bails out if it encounteres a file - name that's not valid in the current charest (i.e., UTF-8, in - our case). It would be a bit trickier than what we're doing for - SPSS tabular ingests - with the lang. encoding pulldown menu - + - otherwise, ZipInputStream bails out if it encounteres a file + name that's not valid in the current charest (i.e., UTF-8, in + our case). It would be a bit trickier than what we're doing for + SPSS tabular ingests - with the lang. encoding pulldown menu - because this encoding needs to be specified *before* we upload and - attempt to unzip the file. + attempt to unzip the file. -- L.A. 4.0 beta12 logger.info("default charset is "+Charset.defaultCharset().name()); if (Charset.isSupported("US-ASCII")) { @@ -287,25 +284,21 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException if (charset != null) { logger.info("was able to obtain charset for US-ASCII"); } - + } */ - /** - * Perform a quick check for how many individual files are - * inside this zip archive. If it's above the limit, we can - * give up right away, without doing any unpacking. + /** + * Perform a quick check for how many individual files are + * inside this zip archive. If it's above the limit, we can + * give up right away, without doing any unpacking. * This should be a fairly inexpensive operation, we just need - * to read the directory at the end of the file. + * to read the directory at the end of the file. */ - - if (charset != null) { - zipFile = new ZipFile(tempFile.toFile(), charset); - } else { - zipFile = new ZipFile(tempFile.toFile()); - } + + /** - * The ZipFile constructors above will throw ZipException - + * The ZipFile constructors in openZipFile will throw ZipException - * a type of IOException - if there's something wrong * with this file as a zip. There's no need to intercept it * here, it will be caught further below, with other IOExceptions, @@ -313,8 +306,8 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException * then attempt to save it as is. */ - int numberOfUnpackableFiles = 0; - + int numberOfUnpackableFiles = 0; + /** * Note that we can't just use zipFile.size(), * unfortunately, since that's the total number of entries, @@ -323,83 +316,48 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException * that are files. */ - for (Enumeration entries = zipFile.entries(); entries.hasMoreElements();) { - ZipEntry entry = entries.nextElement(); - logger.fine("inside first zip pass; this entry: "+entry.getName()); - if (!entry.isDirectory()) { - String shortName = entry.getName().replaceFirst("^.*[\\/]", ""); - // ... and, finally, check if it's a "fake" file - a zip archive entry - // created for a MacOS X filesystem element: (these - // start with "._") - if (!shortName.startsWith("._") && !shortName.startsWith(".DS_Store") && !"".equals(shortName)) { - numberOfUnpackableFiles++; - if (numberOfUnpackableFiles > fileNumberLimit) { - logger.warning("Zip upload - too many files in the zip to process individually."); - warningMessage = "The number of files in the zip archive is over the limit (" + fileNumberLimit - + "); please upload a zip archive with fewer files, if you want them to be ingested " - + "as individual DataFiles."; - throw new IOException(); - } - // In addition to counting the files, we can - // also check the file size while we're here, - // provided the size limit is defined; if a single - // file is above the individual size limit, unzipped, - // we give up on unpacking this zip archive as well: - if (fileSizeLimit != null && entry.getSize() > fileSizeLimit) { - throw new FileExceedsMaxSizeException(MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.file_exceeds_limit"), bytesToHumanReadable(entry.getSize()), bytesToHumanReadable(fileSizeLimit))); - } - // Similarly, we want to check if saving all these unpacked - // files is going to push the disk usage over the - // quota: - if (storageQuotaLimit != null) { - combinedUnzippedFileSize = combinedUnzippedFileSize + entry.getSize(); - if (combinedUnzippedFileSize > storageQuotaLimit) { - //throw new FileExceedsStorageQuotaException(MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.quota_exceeded"), bytesToHumanReadable(combinedUnzippedFileSize), bytesToHumanReadable(storageQuotaLimit))); - // change of plans: if the unzipped content inside exceeds the remaining quota, - // we reject the upload outright, rather than accepting the zip - // file as is. - throw new CommandExecutionException(MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.unzipped.quota_exceeded"), bytesToHumanReadable(storageQuotaLimit)), this); - } + try (var zipFile = openZipFile(tempFile, charset)) { + for (var entry : filteredZipEntries(zipFile)) { + logger.fine("inside first zip pass; this entry: " + entry.getName()); + numberOfUnpackableFiles++; + if (numberOfUnpackableFiles > fileNumberLimit) { + logger.warning("Zip upload - too many files in the zip to process individually."); + warningMessage = "The number of files in the zip archive is over the limit (" + fileNumberLimit + + "); please upload a zip archive with fewer files, if you want them to be ingested " + + "as individual DataFiles."; + throw new IOException(); + } + // In addition to counting the files, we can + // also check the file size while we're here, + // provided the size limit is defined; if a single + // file is above the individual size limit, unzipped, + // we give up on unpacking this zip archive as well: + if (fileSizeLimit != null && entry.getSize() > fileSizeLimit) { + throw new FileExceedsMaxSizeException(MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.file_exceeds_limit"), bytesToHumanReadable(entry.getSize()), bytesToHumanReadable(fileSizeLimit))); + } + // Similarly, we want to check if saving all these unpacked + // files is going to push the disk usage over the + // quota: + if (storageQuotaLimit != null) { + combinedUnzippedFileSize = combinedUnzippedFileSize + entry.getSize(); + if (combinedUnzippedFileSize > storageQuotaLimit) { + //throw new FileExceedsStorageQuotaException(MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.quota_exceeded"), bytesToHumanReadable(combinedUnzippedFileSize), bytesToHumanReadable(storageQuotaLimit))); + // change of plans: if the unzipped content inside exceeds the remaining quota, + // we reject the upload outright, rather than accepting the zip + // file as is. + throw new CommandExecutionException(MessageFormat.format(BundleUtil.getStringFromBundle("file.addreplace.error.unzipped.quota_exceeded"), bytesToHumanReadable(storageQuotaLimit)), this); } } } } - + // OK we're still here - that means we can proceed unzipping. - // Close the ZipFile, re-open as ZipInputStream: - zipFile.close(); // reset: combinedUnzippedFileSize = 0L; - if (charset != null) { - unZippedIn = new ZipInputStream(new FileInputStream(tempFile.toFile()), charset); - } else { - unZippedIn = new ZipInputStream(new FileInputStream(tempFile.toFile())); - } - - while (true) { - try { - zipEntry = unZippedIn.getNextEntry(); - } catch (IllegalArgumentException iaex) { - // Note: - // ZipInputStream documentation doesn't even mention that - // getNextEntry() throws an IllegalArgumentException! - // but that's what happens if the file name of the next - // entry is not valid in the current CharSet. - // -- L.A. - warningMessage = "Failed to unpack Zip file. (Unknown Character Set used in a file name?) Saving the file as is."; - logger.warning(warningMessage); - throw new IOException(); - } - - if (zipEntry == null) { - break; - } - // Note that some zip entries may be directories - we - // simply skip them: - - if (!zipEntry.isDirectory()) { + try (var zipFile = openZipFile(tempFile, charset)) { + for (var entry : filteredZipEntries(zipFile)) { if (datafiles.size() > fileNumberLimit) { logger.warning("Zip upload - too many files."); warningMessage = "The number of files in the zip archive is over the limit (" + fileNumberLimit @@ -407,72 +365,55 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException + "as individual DataFiles."; throw new IOException(); } - - String fileEntryName = zipEntry.getName(); + var fileEntryName = entry.getName(); + var shortName = getShortName(fileEntryName); logger.fine("ZipEntry, file: " + fileEntryName); + String storageIdentifier = FileUtil.generateStorageIdentifier(); + File unzippedFile = new File(getFilesTempDirectory() + "/" + storageIdentifier); + Files.copy(zipFile.getInputStream(entry), unzippedFile.toPath(), StandardCopyOption.REPLACE_EXISTING); + // No need to check the size of this unpacked file against the size limit, + // since we've already checked for that in the first pass. + DataFile datafile = FileUtil.createSingleDataFile(version, null, storageIdentifier, shortName, + MIME_TYPE_UNDETERMINED_DEFAULT, + ctxt.systemConfig().getFileFixityChecksumAlgorithm(), null, false); + + if (!fileEntryName.equals(shortName)) { + // If the filename looks like a hierarchical folder name (i.e., contains slashes and backslashes), + // we'll extract the directory name; then subject it to some "aggressive sanitizing" - strip all + // the leading, trailing and duplicate slashes; then replace all the characters that + // don't pass our validation rules. + String directoryName = fileEntryName.replaceFirst("[\\\\/][\\\\/]*[^\\\\/]*$", ""); + directoryName = StringUtil.sanitizeFileDirectory(directoryName, true); + // if (!"".equals(directoryName)) { + if (!StringUtil.isEmpty(directoryName)) { + logger.fine("setting the directory label to " + directoryName); + datafile.getFileMetadata().setDirectoryLabel(directoryName); + } + } - if (fileEntryName != null && !fileEntryName.equals("")) { - - String shortName = fileEntryName.replaceFirst("^.*[\\/]", ""); - - // Check if it's a "fake" file - a zip archive entry - // created for a MacOS X filesystem element: (these - // start with "._") - if (!shortName.startsWith("._") && !shortName.startsWith(".DS_Store") && !"".equals(shortName)) { - // OK, this seems like an OK file entry - we'll try - // to read it and create a DataFile with it: - - String storageIdentifier = FileUtil.generateStorageIdentifier(); - File unzippedFile = new File(getFilesTempDirectory() + "/" + storageIdentifier); - Files.copy(unZippedIn, unzippedFile.toPath(), StandardCopyOption.REPLACE_EXISTING); - // No need to check the size of this unpacked file against the size limit, - // since we've already checked for that in the first pass. - - DataFile datafile = FileUtil.createSingleDataFile(version, null, storageIdentifier, shortName, - MIME_TYPE_UNDETERMINED_DEFAULT, - ctxt.systemConfig().getFileFixityChecksumAlgorithm(), null, false); - - if (!fileEntryName.equals(shortName)) { - // If the filename looks like a hierarchical folder name (i.e., contains slashes and backslashes), - // we'll extract the directory name; then subject it to some "aggressive sanitizing" - strip all - // the leading, trailing and duplicate slashes; then replace all the characters that - // don't pass our validation rules. - String directoryName = fileEntryName.replaceFirst("[\\\\/][\\\\/]*[^\\\\/]*$", ""); - directoryName = StringUtil.sanitizeFileDirectory(directoryName, true); - // if (!"".equals(directoryName)) { - if (!StringUtil.isEmpty(directoryName)) { - logger.fine("setting the directory label to " + directoryName); - datafile.getFileMetadata().setDirectoryLabel(directoryName); - } - } + if (datafile != null) { + // We have created this datafile with the mime type "unknown"; + // Now that we have it saved in a temporary location, + // let's try and determine its real type: - if (datafile != null) { - // We have created this datafile with the mime type "unknown"; - // Now that we have it saved in a temporary location, - // let's try and determine its real type: - - String tempFileName = getFilesTempDirectory() + "/" + datafile.getStorageIdentifier(); - - try { - recognizedType = determineFileType(unzippedFile, shortName); - // null the File explicitly, to release any open FDs: - unzippedFile = null; - logger.fine("File utility recognized unzipped file as " + recognizedType); - if (recognizedType != null && !recognizedType.equals("")) { - datafile.setContentType(recognizedType); - } - } catch (Exception ex) { - logger.warning("Failed to run the file utility mime type check on file " + fileName); - } - - datafiles.add(datafile); - combinedUnzippedFileSize += datafile.getFilesize(); + String tempFileName = getFilesTempDirectory() + "/" + datafile.getStorageIdentifier(); + + try { + recognizedType = determineFileType(unzippedFile, shortName); + // null the File explicitly, to release any open FDs: + unzippedFile = null; + logger.fine("File utility recognized unzipped file as " + recognizedType); + if (recognizedType != null && !recognizedType.equals("")) { + datafile.setContentType(recognizedType); } + } catch (Exception ex) { + logger.warning("Failed to run the file utility mime type check on file " + fileName); } + + datafiles.add(datafile); + combinedUnzippedFileSize += datafile.getFilesize(); } } - unZippedIn.closeEntry(); - } } catch (IOException ioex) { @@ -494,18 +435,7 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException //warningMessage = BundleUtil.getStringFromBundle("file.addreplace.warning.unzip.failed.quota", Arrays.asList(FileSizeChecker.bytesToHumanReadable(storageQuotaLimit))); //datafiles.clear(); throw new CommandExecutionException(fesqx.getMessage(), fesqx, this); - }*/ finally { - if (zipFile != null) { - try { - zipFile.close(); - } catch (Exception zEx) {} - } - if (unZippedIn != null) { - try { - unZippedIn.close(); - } catch (Exception zEx) {} - } - } + }*/ if (!datafiles.isEmpty()) { // remove the uploaded zip file: try { @@ -591,7 +521,8 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException // The try-catch is due to error encountered in using NFS for stocking file, // cf. https://github.com/IQSS/dataverse/issues/5909 try { - FileUtils.deleteDirectory(rezipFolder); + if (rezipFolder!=null) + FileUtils.deleteDirectory(rezipFolder); } catch (IOException ioex) { // do nothing - it's a temp folder. logger.warning("Could not remove temp folder, error message : " + ioex.getMessage()); @@ -730,7 +661,37 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException return CreateDataFileResult.error(fileName, finalType); } // end createDataFiles - + + private static List filteredZipEntries(ZipFile zipFile) { + var entries = Collections.list(zipFile.entries()).stream().filter(e -> { + var entryName = e.getName(); + logger.fine("ZipEntry, file: " + entryName); + return !e.isDirectory() && !entryName.isEmpty() && !isFileToSkip(entryName); + }).toList(); + return entries; + } + + private static ZipFile openZipFile(Path tempFile, Charset charset) throws IOException { + if (charset != null) { + return new ZipFile(tempFile.toFile(), charset); + } + else { + return new ZipFile(tempFile.toFile()); + } + } + + private static boolean isFileToSkip(String fileName) { + // check if it's a "fake" file - a zip archive entry + // created for a MacOS X filesystem element: (these + // start with "._") + var shortName = getShortName(fileName); + return shortName.startsWith("._") || shortName.startsWith(".DS_Store") || "".equals(shortName); + } + + private static String getShortName(String fileName) { + return fileName.replaceFirst("^.*[\\/]", ""); + } + @Override public Map> getRequiredPermissions() { Map> ret = new HashMap<>(); diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceShapefileHelper.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceShapefileHelper.java index 8c5dad237b1..27a2ab99376 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceShapefileHelper.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceShapefileHelper.java @@ -100,71 +100,48 @@ public IngestServiceShapefileHelper(File zippedShapefile, File rezipFolder){ //this.processFile(zippedShapefile, rezipFolder); } - - private FileInputStream getFileInputStream(File fileObject){ - if (fileObject==null){ - return null; - } - try { + + private FileInputStream getFileInputStream(File fileObject){ + if (fileObject==null){ + return null; + } + try { return new FileInputStream(fileObject); } catch (FileNotFoundException ex) { logger.severe("Failed to create FileInputStream from File: " + fileObject.getAbsolutePath()); return null; } - } - - private void closeFileInputStream(FileInputStream fis){ - if (fis==null){ - return; - } + } + + private void closeFileInputStream(FileInputStream fis){ + if (fis==null){ + return; + } try { - fis.close(); + fis.close(); } catch (IOException ex) { logger.info("Failed to close FileInputStream"); } - } - + } + public boolean processFile() { if ((!isValidFile(this.zippedShapefile))||(!isValidFolder(this.rezipFolder))){ return false; } - - // (1) Use the ShapefileHandler to the .zip for a shapefile - // - FileInputStream shpfileInputStream = this.getFileInputStream(zippedShapefile); - if (shpfileInputStream==null){ - return false; - } - - this.shpHandler = new ShapefileHandler(shpfileInputStream); - if (!shpHandler.containsShapefile()){ - logger.severe("Shapefile was incorrectly detected upon Ingest (FileUtil) and passed here"); - return false; - } - - this.closeFileInputStream(shpfileInputStream); - - // (2) Rezip the shapefile pieces - logger.info("rezipFolder: " + rezipFolder.getAbsolutePath()); - shpfileInputStream = this.getFileInputStream(zippedShapefile); - if (shpfileInputStream==null){ - return false; - } - - boolean rezipSuccess; try { - rezipSuccess = shpHandler.rezipShapefileSets(shpfileInputStream, rezipFolder); + this.shpHandler = new ShapefileHandler(zippedShapefile); + if (!shpHandler.containsShapefile()){ + logger.severe("Shapefile was incorrectly detected upon Ingest (FileUtil) and passed here"); + return false; + } + logger.info("rezipFolder: " + rezipFolder.getAbsolutePath()); + return shpHandler.rezipShapefileSets(rezipFolder); } catch (IOException ex) { logger.severe("Shapefile was not correctly unpacked/repacked"); logger.severe("shpHandler message: " + shpHandler.errorMessage); return false; } - - this.closeFileInputStream(shpfileInputStream); - - return rezipSuccess; - // return createDataFiles(rezipFolder); } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java index a0c32d5c8ce..991682ec8e8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/FileUtil.java @@ -525,15 +525,18 @@ public static String determineFileType(File f, String fileName) throws IOExcepti // Check for shapefile extensions as described here: http://en.wikipedia.org/wiki/Shapefile //logger.info("Checking for shapefile"); - ShapefileHandler shp_handler = new ShapefileHandler(new FileInputStream(f)); + ShapefileHandler shp_handler = new ShapefileHandler(f); if (shp_handler.containsShapefile()){ // logger.info("------- shapefile FOUND ----------"); fileType = ShapefileHandler.SHAPEFILE_FILE_TYPE; //"application/zipped-shapefile"; } - - Optional bagItFileHandler = CDI.current().select(BagItFileHandlerFactory.class).get().getBagItFileHandler(); - if(bagItFileHandler.isPresent() && bagItFileHandler.get().isBagItPackage(fileName, f)) { - fileType = BagItFileHandler.FILE_TYPE; + try { + Optional bagItFileHandler = CDI.current().select(BagItFileHandlerFactory.class).get().getBagItFileHandler(); + if (bagItFileHandler.isPresent() && bagItFileHandler.get().isBagItPackage(fileName, f)) { + fileType = BagItFileHandler.FILE_TYPE; + } + } catch (Exception e) { + logger.warning("Error checking for BagIt package: " + e.getMessage()); } } diff --git a/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java b/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java index f1440cc3c02..20dc6d9c26a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/ShapefileHandler.java @@ -1,23 +1,21 @@ package edu.harvard.iq.dataverse.util; import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; import java.io.FileNotFoundException; import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.util.Date; import java.util.ArrayList; import java.util.List; -import java.util.zip.ZipEntry; -import java.util.zip.ZipInputStream; -import java.util.zip.ZipException; +import java.util.zip.ZipFile; import java.util.HashMap; import java.util.*; import java.nio.file.Files; import java.nio.file.Paths; import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; -import java.util.logging.Level; + import java.util.logging.Logger; import org.apache.commons.io.FileUtils; @@ -43,11 +41,10 @@ * "shape1.pdf", "README.md", "shape_notes.txt" * * Code Example: - * FileInputStream shp_file_input_stream = new FileInputStream(new File("zipped_shapefile.zip")) - * ShapefileHandler shp_handler = new ShapefileHandler(shp_file_input_stream); + * ShapefileHandler shp_handler = new ShapefileHandler(new File("zipped_shapefile.zip")); * if (shp_handler.containsShapefile()){ * File rezip_folder = new File("~/folder_for_rezipping"); - * boolean rezip_success = shp_handler.rezipShapefileSets(shp_file_input_stream, rezip_folder ); + * boolean rezip_success = shp_handler.rezipShapefileSets(rezip_folder ); * if (!rezip_success){ * // rezip failed, should be an error message (String) available System.out.println(shp_handler.error_message); @@ -74,7 +71,8 @@ public class ShapefileHandler{ public final static String SHP_XML_EXTENSION = "shp.xml"; public final static String BLANK_EXTENSION = "__PLACEHOLDER-FOR-BLANK-EXTENSION__"; public final static List SHAPEFILE_ALL_EXTENSIONS = Arrays.asList("shp", "shx", "dbf", "prj", "sbn", "sbx", "fbn", "fbx", "ain", "aih", "ixs", "mxs", "atx", "cpg", "qpj", "qmd", SHP_XML_EXTENSION); - + private final File zipFile; + public boolean DEBUG = false; private boolean zipFileProcessed = false; @@ -97,9 +95,6 @@ public class ShapefileHandler{ private Map> fileGroups = new HashMap<>(); private List finalRezippedFiles = new ArrayList<>(); - - private String outputFolder = "unzipped"; - private String rezippedFolder = "rezipped"; // Debug helper private void msg(String s){ @@ -116,40 +111,28 @@ private void msgt(String s){ } /* - Constructor, start with filename - */ - public ShapefileHandler(String filename){ - - if (filename==null){ - this.addErrorMessage("The filename was null"); - return; - } - - FileInputStream zip_file_stream; - try { - zip_file_stream = new FileInputStream(new File(filename)); - } catch (FileNotFoundException ex) { - this.addErrorMessage("The file was not found"); + Constructor, start with File + */ + public ShapefileHandler(File zip_file) throws IOException { + zipFile = zip_file; + if (zip_file == null) { + this.addErrorMessage("The file was null"); return; } - - this.examineZipfile(zip_file_stream); - } - - - /* - Constructor, start with FileInputStream - */ - public ShapefileHandler(FileInputStream zip_file_stream){ - - if (zip_file_stream==null){ - this.addErrorMessage("The zip_file_stream was null"); - return; + try (var zip_file_object = new ZipFile(zip_file)) { + this.examineZipfile(zip_file_object); + } + catch (FileNotFoundException ex) { + // While this constructor had a FileInputStream as argument: + // FileUtil.determineFileType threw this exception before calling the constructor with a FileInputStream + // IngestServiceShapefileHelper.processFile won´t call this constructor if the file is not valid hence does not exist. + // When the file would have disappeared in the meantime, it would have produced a slightly different error message. + logger.severe("File not found: " + zip_file.getAbsolutePath()); + throw ex; } - this.examineZipfile(zip_file_stream); } - + public List getFinalRezippedFiles(){ return this.finalRezippedFiles; } @@ -291,26 +274,19 @@ inside the uploaded zip file (issue #6873). To achieve this, we recreate subfolders in the FileMetadata of the newly created DataFiles. (-- L.A. 09/2020) */ - private boolean unzipFilesToDirectory(FileInputStream zipfile_input_stream, File target_directory){ + private boolean unzipFilesToDirectory(ZipFile zipfileInput, File target_directory){ logger.fine("unzipFilesToDirectory: " + target_directory.getAbsolutePath() ); - if (zipfile_input_stream== null){ - this.addErrorMessage("unzipFilesToDirectory. The zipfile_input_stream is null."); - return false; - } if (!target_directory.isDirectory()){ this.addErrorMessage("This directory does not exist: " + target_directory.getAbsolutePath()); return false; } - List unzippedFileNames = new ArrayList<>(); - - ZipInputStream zipStream = new ZipInputStream(zipfile_input_stream); + List unzippedFileNames = new ArrayList<>(); + - ZipEntry origEntry; - byte[] buffer = new byte[2048]; try { - while((origEntry = zipStream.getNextEntry())!=null){ + for(var origEntry : Collections.list(zipfileInput.entries())){ String zentryFileName = origEntry.getName(); logger.fine("\nOriginal entry name: " + origEntry); @@ -360,15 +336,10 @@ private boolean unzipFilesToDirectory(FileInputStream zipfile_input_stream, File unzippedFileNames.add(outpath); } logger.fine("Write zip file: " + outpath); - FileOutputStream fileOutputStream; - long fsize = 0; - fileOutputStream = new FileOutputStream(outpath); - int len;// = 0; - while ((len = zipStream.read(buffer)) > 0){ - fileOutputStream.write(buffer, 0, len); - fsize+=len; - } // end while - fileOutputStream.close(); + try(var inputStream = zipfileInput.getInputStream(origEntry)) { + Files.createDirectories(new File(outpath).getParentFile().toPath()); + Files.copy(inputStream, Path.of(outpath), StandardCopyOption.REPLACE_EXISTING); + } } // end outer while } catch (IOException ex) { for (StackTraceElement el : ex.getStackTrace()){ @@ -377,19 +348,13 @@ private boolean unzipFilesToDirectory(FileInputStream zipfile_input_stream, File this.addErrorMessage("Failed to open ZipInputStream entry" + ex.getMessage()); return false; } - - try { - zipStream.close(); - } catch (IOException ex) { - Logger.getLogger(ShapefileHandler.class.getName()).log(Level.SEVERE, null, ex); - } - return true; + return true; } /* Rezip the shapefile(s) into a given directory Assumes that the zipfile_input_stream has already been checked! */ - public boolean rezipShapefileSets(FileInputStream zipfile_input_stream, File rezippedFolder) throws IOException{ + public boolean rezipShapefileSets(File rezippedFolder) throws IOException{ logger.fine("rezipShapefileSets"); //msgt("rezipShapefileSets"); if (!this.zipFileProcessed){ @@ -400,10 +365,6 @@ public boolean rezipShapefileSets(FileInputStream zipfile_input_stream, File rez this.addErrorMessage("There are no shapefiles here!"); return false; } - if (zipfile_input_stream== null){ - this.addErrorMessage("The zipfile_input_stream is null."); - return false; - } if (rezippedFolder == null){ this.addErrorMessage("The rezippedFolder is null."); return false; @@ -433,9 +394,11 @@ public boolean rezipShapefileSets(FileInputStream zipfile_input_stream, File rez // Unzip files! - if (!this.unzipFilesToDirectory(zipfile_input_stream, dir_for_unzipping)){ - this.addErrorMessage("Failed to unzip files."); - return false; + try(var zipfileObject = new ZipFile(zipFile)) { + if (!this.unzipFilesToDirectory(zipfileObject, dir_for_unzipping)) { + this.addErrorMessage("Failed to unzip files."); + return false; + } } // Redistribute files! String target_dirname = rezippedFolder.getAbsolutePath(); @@ -681,27 +644,19 @@ private boolean isFileToSkip(String fname){ /************************************** * Iterate through the zip file contents. * Does it contain any shapefiles? - * - * @param FileInputStream zip_file_stream */ - private boolean examineZipfile(FileInputStream zip_file_stream){ + private boolean examineZipfile(ZipFile zip_file){ // msgt("examineZipfile"); - - if (zip_file_stream==null){ - this.addErrorMessage("The zip file stream was null"); - return false; - } - + // Clear out file lists this.filesListInDir.clear(); this.filesizeHash.clear(); this.fileGroups.clear(); - try{ - ZipInputStream zipStream = new ZipInputStream(zip_file_stream); - ZipEntry entry; - List hiddenDirectories = new ArrayList<>(); - while((entry = zipStream.getNextEntry())!=null){ + try{ + List hiddenDirectories = new ArrayList<>(); + for(var entry : Collections.list(zip_file.entries())){ + String zentryFileName = entry.getName(); boolean isDirectory = entry.isDirectory(); @@ -748,8 +703,6 @@ private boolean examineZipfile(FileInputStream zip_file_stream){ this.filesizeHash.put(unzipFilePath, entry.getSize()); } } // end while - - zipStream.close(); if (this.filesListInDir.isEmpty()){ errorMessage = "No files in zipStream"; @@ -759,13 +712,8 @@ private boolean examineZipfile(FileInputStream zip_file_stream){ this.zipFileProcessed = true; return true; - }catch(ZipException ex){ - this.addErrorMessage("ZipException"); - msgt("ZipException"); - return false; - }catch(IOException ex){ - //ex.printStackTrace(); + //ex.printStackTrace(); this.addErrorMessage("IOException File name"); msgt("IOException"); return false; @@ -773,9 +721,6 @@ private boolean examineZipfile(FileInputStream zip_file_stream){ this.addErrorMessage("IllegalArgumentException when parsing zipfile"); msgt("IllegalArgumentException when parsing zipfile"); return false; - - }finally{ - } } // end examineFile diff --git a/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java b/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java new file mode 100644 index 00000000000..1262984eb27 --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java @@ -0,0 +1,187 @@ +package edu.harvard.iq.dataverse.engine.command.impl; + +import edu.harvard.iq.dataverse.DataFile; +import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.DatasetVersion; +import edu.harvard.iq.dataverse.engine.command.CommandContext; +import edu.harvard.iq.dataverse.engine.command.DataverseRequest; +import edu.harvard.iq.dataverse.engine.command.exception.CommandException; +import edu.harvard.iq.dataverse.settings.JvmSettings; +import edu.harvard.iq.dataverse.storageuse.UploadSessionQuotaLimit; +import edu.harvard.iq.dataverse.util.JhoveFileType; +import edu.harvard.iq.dataverse.util.SystemConfig; +import edu.harvard.iq.dataverse.util.testing.JvmSetting; +import edu.harvard.iq.dataverse.util.testing.LocalJvmSettings; +import org.jetbrains.annotations.NotNull; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.MockedStatic; +import org.mockito.Mockito; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.PrintStream; +import java.nio.file.NoSuchFileException; +import java.nio.file.Path; + +import static edu.harvard.iq.dataverse.DataFile.ChecksumType.MD5; +import static org.apache.commons.io.file.FilesUncheck.createDirectories; +import static org.apache.commons.io.file.PathUtils.deleteDirectory; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; +import static org.mockito.ArgumentMatchers.any; + + +@LocalJvmSettings +public class CreateNewDataFilesTest { + // TODO keep constants for annotations in sync with class name + Path testDir = Path.of("target/test/").resolve(getClass().getSimpleName()); + PrintStream original_stderr; + + @BeforeEach + public void cleanTmpDir() throws IOException { + original_stderr = System.err; + if(testDir.toFile().exists()) + deleteDirectory(testDir); + } + + @AfterEach void restoreStderr() { + System.setErr(original_stderr); + } + + @Test + @JvmSetting(key = JvmSettings.FILES_DIRECTORY, value = "target/test/CreateNewDataFilesTest/tmp") + public void execute_fails_to_upload_when_tmp_does_not_exist() throws FileNotFoundException { + + mockTmpLookup(); + var cmd = createCmd("scripts/search/data/shape/shapefile.zip", mockDatasetVersion(), 1000L, 500L); + var ctxt = mockCommandContext(mockSysConfig(true, 0L, MD5, 10)); + + assertThatThrownBy(() -> cmd.execute(ctxt)) + .isInstanceOf(CommandException.class) + .hasMessageContaining("Failed to save the upload as a temp file (temp disk space?)") + .hasRootCauseInstanceOf(NoSuchFileException.class) + .getRootCause() + .hasMessageStartingWith("target/test/CreateNewDataFilesTest/tmp/temp/tmp"); + } + + @Test + @JvmSetting(key = JvmSettings.FILES_DIRECTORY, value = "target/test/CreateNewDataFilesTest/tmp") + public void execute_fails_on_size_limit() throws Exception { + createDirectories(Path.of("target/test/CreateNewDataFilesTest/tmp/temp")); + + mockTmpLookup(); + var cmd = createCmd("scripts/search/data/binary/3files.zip", mockDatasetVersion(), 1000L, 500L); + var ctxt = mockCommandContext(mockSysConfig(true, 50L, MD5, 0)); + try (var mockedStatic = Mockito.mockStatic(JhoveFileType.class)) { + mockedStatic.when(JhoveFileType::getJhoveConfigFile).thenReturn("conf/jhove/jhove.conf"); + + assertThatThrownBy(() -> cmd.execute(ctxt)) + .isInstanceOf(CommandException.class) + .hasMessage("This file size (462 B) exceeds the size limit of 50 B."); + } + } + + @Test + @JvmSetting(key = JvmSettings.FILES_DIRECTORY, value = "target/test/CreateNewDataFilesTest/tmp") + public void execute_loads_individual_files_from_uploaded_zip() throws Exception { + var tempDir = testDir.resolve("tmp/temp"); + createDirectories(tempDir); + + mockTmpLookup(); + var cmd = createCmd("src/test/resources/own-cloud-downloads/greetings.zip", mockDatasetVersion(), 1000L, 500L); + var ctxt = mockCommandContext(mockSysConfig(false, 1000000L, MD5, 10)); + try (MockedStatic mockedStatic = Mockito.mockStatic(JhoveFileType.class)) { + mockedStatic.when(JhoveFileType::getJhoveConfigFile).thenReturn("conf/jhove/jhove.conf"); + + // the test + var result = cmd.execute(ctxt); + + assertThat(result.getErrors()).hasSize(0); + assertThat(result.getDataFiles().stream().map(dataFile -> + dataFile.getFileMetadata().getDirectoryLabel() + "/" + dataFile.getDisplayName() + )).containsExactlyInAnyOrder( + "DD-1576/goodbye.txt", "DD-1576/hello.txt" + ); + var storageIds = result.getDataFiles().stream().map(DataFile::getStorageIdentifier).toList(); + assertThat(tempDir.toFile().list()) + .containsExactlyInAnyOrderElementsOf(storageIds); + } + } + + @Test + @JvmSetting(key = JvmSettings.FILES_DIRECTORY, value = "target/test/CreateNewDataFilesTest/tmp") + public void execute_rezips_sets_of_shape_files_from_uploaded_zip() throws Exception { + var tempDir = testDir.resolve("tmp/temp"); + createDirectories(tempDir); + + mockTmpLookup(); + var cmd = createCmd("src/test/resources/own-cloud-downloads/shapes.zip", mockDatasetVersion(), 1000L, 500L); + var ctxt = mockCommandContext(mockSysConfig(false, 100000000L, MD5, 10)); + try (var mockedJHoveFileType = Mockito.mockStatic(JhoveFileType.class)) { + mockedJHoveFileType.when(JhoveFileType::getJhoveConfigFile).thenReturn("conf/jhove/jhove.conf"); + + // the test + var result = cmd.execute(ctxt); + + assertThat(result.getErrors()).hasSize(0); + assertThat(result.getDataFiles().stream().map(dataFile -> + (dataFile.getFileMetadata().getDirectoryLabel() + "/" + dataFile.getDisplayName()) + .replaceAll(".*temp/shp_[-0-9]*/", "") + )).containsExactlyInAnyOrder( + "dataDir/shape1.zip", + "dataDir/shape2/shape2", + "dataDir/shape2/shape2.pdf", + "dataDir/shape2/shape2.txt", + "dataDir/shape2/shape2.zip", + "dataDir/extra/shp_dictionary.xls", + "dataDir/extra/notes", + "dataDir/extra/README.MD" + ); + var storageIds = result.getDataFiles().stream().map(DataFile::getStorageIdentifier).toList(); + assertThat(tempDir.toFile().list()) + .containsExactlyInAnyOrderElementsOf(storageIds); + } + } + + private static @NotNull CreateNewDataFilesCommand createCmd(String name, DatasetVersion dsVersion, long allocatedQuotaLimit, long usedQuotaLimit) throws FileNotFoundException { + return new CreateNewDataFilesCommand( + Mockito.mock(DataverseRequest.class), + dsVersion, + new FileInputStream(name), + "example.zip", + "application/zip", + null, + new UploadSessionQuotaLimit(allocatedQuotaLimit, usedQuotaLimit), + "sha"); + } + + private static @NotNull CommandContext mockCommandContext(SystemConfig sysCfg) { + var ctxt = Mockito.mock(CommandContext.class); + Mockito.when(ctxt.systemConfig()).thenReturn(sysCfg); + return ctxt; + } + + private static @NotNull SystemConfig mockSysConfig(boolean isStorageQuataEnforced, long maxFileUploadSizeForStore, DataFile.ChecksumType checksumType, int zipUploadFilesLimit) { + var sysCfg = Mockito.mock(SystemConfig.class); + Mockito.when(sysCfg.isStorageQuotasEnforced()).thenReturn(isStorageQuataEnforced); + Mockito.when(sysCfg.getMaxFileUploadSizeForStore(any())).thenReturn(maxFileUploadSizeForStore); + Mockito.when(sysCfg.getFileFixityChecksumAlgorithm()).thenReturn(checksumType); + Mockito.when(sysCfg.getZipUploadFilesLimit()).thenReturn(zipUploadFilesLimit); + return sysCfg; + } + + private static void mockTmpLookup() { + JvmSettings mockFilesDirectory = Mockito.mock(JvmSettings.class); + Mockito.when(mockFilesDirectory.lookup()).thenReturn("/mocked/path"); + } + + private static @NotNull DatasetVersion mockDatasetVersion() { + var dsVersion = Mockito.mock(DatasetVersion.class); + Mockito.when(dsVersion.getDataset()).thenReturn(Mockito.mock(Dataset.class)); + return dsVersion; + } + +} diff --git a/src/test/java/edu/harvard/iq/dataverse/util/shapefile/ShapefileHandlerTest.java b/src/test/java/edu/harvard/iq/dataverse/util/shapefile/ShapefileHandlerTest.java index 3c5b4797b0a..c4ee4547ed7 100644 --- a/src/test/java/edu/harvard/iq/dataverse/util/shapefile/ShapefileHandlerTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/util/shapefile/ShapefileHandlerTest.java @@ -63,22 +63,22 @@ private File createBlankFile(String filename) throws IOException { } return Files.createFile(tempFolder.resolve(filename)).toFile(); } - + private FileInputStream createZipReturnFilestream(List file_names, String zipfile_name) throws IOException{ - + File zip_file_obj = this.createAndZipFiles(file_names, zipfile_name); if (zip_file_obj == null){ return null; } - + FileInputStream file_input_stream = new FileInputStream(zip_file_obj); return file_input_stream; - + } - + /* - Convenience class to create .zip file and return a FileInputStream + Convenience method to create .zip file and return a File @param List file_names - List of filenames to add to .zip. These names will be used to create 0 length files @param String zipfile_name - Name of .zip file to create @@ -98,13 +98,13 @@ private File createAndZipFiles(List file_names, String zipfile_name) thr } Path zip_file_obj = this.tempFolder.resolve(zipfile_name); - ZipOutputStream zip_stream = new ZipOutputStream(new FileOutputStream(zip_file_obj.toFile())); + try (ZipOutputStream zip_stream = new ZipOutputStream(new FileOutputStream(zip_file_obj.toFile()))) { - // Iterate through File objects and add them to the ZipOutputStream - for (File file_obj : fileCollection) { - this.addToZipFile(file_obj.getName(), file_obj, zip_stream); + // Iterate through File objects and add them to the ZipOutputStream + for (File file_obj : fileCollection) { + this.addToZipFile(file_obj.getName(), file_obj, zip_stream); + } } - /* ----------------------------------- Cleanup: Delete single files that were added to .zip ----------------------------------- */ @@ -126,7 +126,7 @@ public void testCreateZippedNonShapefile() throws IOException{ File zipfile_obj = createAndZipFiles(file_names, "not-quite-a-shape.zip"); // Pass the .zip to the ShapefileHandler - ShapefileHandler shp_handler = new ShapefileHandler(new FileInputStream(zipfile_obj)); + ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj); shp_handler.DEBUG= true; // Contains shapefile? @@ -157,7 +157,7 @@ public void testShapefileWithQpjAndQmd() throws IOException { File zipFile = createAndZipFiles(fileNames, "testShapeWithNewExtensions.zip"); // Pass the zip to the ShapefileHandler - ShapefileHandler shpHandler = new ShapefileHandler(new FileInputStream(zipFile)); + ShapefileHandler shpHandler = new ShapefileHandler(zipFile); shpHandler.DEBUG = true; // Check if it is recognized as a shapefile @@ -191,7 +191,7 @@ public void testZippedTwoShapefiles() throws IOException{ File zipfile_obj = createAndZipFiles(file_names, "two-shapes.zip"); // Pass the .zip to the ShapefileHandler - ShapefileHandler shp_handler = new ShapefileHandler(new FileInputStream(zipfile_obj)); + ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj); shp_handler.DEBUG= true; assertTrue(shp_handler.containsShapefile(), "verify shapefile existance"); @@ -217,7 +217,7 @@ public void testZippedTwoShapefiles() throws IOException{ // Rezip/Reorder the files File test_unzip_folder = Files.createDirectory(this.tempFolder.resolve("test_unzip")).toFile(); //File test_unzip_folder = new File("/Users/rmp553/Desktop/blah"); - shp_handler.rezipShapefileSets(new FileInputStream(zipfile_obj), test_unzip_folder ); + shp_handler.rezipShapefileSets(test_unzip_folder ); // Does the re-ordering do what we wanted? @@ -244,7 +244,7 @@ public void testZippedShapefileWithExtraFiles() throws IOException{ File zipfile_obj = createAndZipFiles(file_names, "shape-plus.zip"); // Pass the .zip to the ShapefileHandler - ShapefileHandler shp_handler = new ShapefileHandler(new FileInputStream(zipfile_obj)); + ShapefileHandler shp_handler = new ShapefileHandler(zipfile_obj); shp_handler.DEBUG= true; assertTrue(shp_handler.containsShapefile(), "verify shapefile existance"); @@ -264,7 +264,7 @@ public void testZippedShapefileWithExtraFiles() throws IOException{ File unzip2Folder = Files.createDirectory(this.tempFolder.resolve("test_unzip2")).toFile(); // Rezip/Reorder the files - shp_handler.rezipShapefileSets(new FileInputStream(zipfile_obj), unzip2Folder); + shp_handler.rezipShapefileSets(unzip2Folder); //shp_handler.rezipShapefileSets(new FileInputStream(zipfile_obj), new File("/Users/rmp553/Desktop/blah")); @@ -284,9 +284,9 @@ public void testZippedShapefileWithExtraFiles() throws IOException{ } @Test - public void testHiddenFiles() { + public void testHiddenFiles() throws IOException { // test with shapefiles in hidden directory - ShapefileHandler shp_handler = new ShapefileHandler("src/test/resources/hiddenShapefiles.zip"); + ShapefileHandler shp_handler = new ShapefileHandler(new File("src/test/resources/hiddenShapefiles.zip")); shp_handler.DEBUG= true; assertFalse(shp_handler.containsShapefile()); } diff --git a/src/test/resources/own-cloud-downloads/greetings.zip b/src/test/resources/own-cloud-downloads/greetings.zip new file mode 100644 index 0000000000000000000000000000000000000000..6e166d385d18dce3b63eac9b742c40fea5dddae9 GIT binary patch literal 679 zcmWIWW@gc400HJ?!$=VQ9||}a6c}7wbPY|-&GZ==WI$3lK>*kk4xl`1BC09ENT#Ic z=cgo9rs|bclwdQ^4as}OAjq@OSojYL zK!~Xf3`-i3O+^b%Sg62g=vMe94rK2G;+uVc(bwT01W{F uSm=P*%pd{?V6+$`gbz$lU;xy|4q~`6LHHoy0B=?{kkgofx?oOZU;qG5YmhPk literal 0 HcmV?d00001 diff --git a/src/test/resources/own-cloud-downloads/shapes.zip b/src/test/resources/own-cloud-downloads/shapes.zip new file mode 100644 index 0000000000000000000000000000000000000000..99d5f36c8952950869f77b3316e3c9d5b65c72fe GIT binary patch literal 4336 zcmbVOJ%|%Q7~NcQxm=^s5cLj=Ac7!wIg&!fLWtz9Sxy!$L{KETY)Je`Lb65tDHIgJ zLR9cLEd)_38wCXm3vCn|wX(FZ6S1=s(fP9Tk}s2)u$yfj+1+{X&6oN1dyXBAEn0LP zcXKBQ|H&(1WvoicD;4YQrHHkY9vLsk$g+vCxLvPZ(sHdZW}hn|?z}xV>}dN4*U{xd zadkPnTGSc5%Sf@2{({U}v*Ec7&#`9`_a5bM&&ij8W*4au7mcRpc61JQL)K(nu{%vYyY zYPq>=rCik&B|9}*l)(z_;7#t*{dxH^FqDI8xuJ9L-0VQC;11lqpD#ig7}7yk=O7<7 z)}UJMat9B$q@O|=7|KDbeMaZtrP+a4!5z&0{OTnCTqsiy3PmR&WRJ?IT~I z3=HW&`!Bp6=V0hx`1WJ{Jd}aH1KNZ7<=m|HLSFmPJ2*WkfO$LVQL_$0(lNf1Lv|pM#W+P$+H*)#G zmHjXPoXCj2+2&`A6;Yb)!$#~63;-uFV(7WYW+O_^CH7o93j@H(jA&bLf#$oslrz$) z4YgkK`pGXa0Gz~#*Y$KA^U7>Qtl(#)`Tg$!7yvqUWNI;L&QFx4EGudKDp07xMs+?= z48n1&l&AEpLDQA*82O5_4-y~cE1%o4o?PXQeeQmEq75J@0eRZO1|UKa2O<=o#)G*m8}7$8qUDnF%;!2B}f zfEUO!j>=D&A_RZ0fx!#p2}b3o#1NQY<`eJ&c|KA3DGvnZmtR8g0(tsS`DxmR;MdlN zXAAI)gQCp}{`pbA68_u{kQ{~^nua^fQ+610IP*(`B!L_NRDmhU!^+4B+%J|mH+1m{?AY=6f`8k>T&15Wc@OBwLLZv-ABc6mJGYYl?}z}$o{p2n KlvoX`mh~UjzepMY literal 0 HcmV?d00001 From 7d3ab225af18650b67d9e018bb37806229f69bf1 Mon Sep 17 00:00:00 2001 From: jo-pol Date: Mon, 7 Oct 2024 09:51:24 +0200 Subject: [PATCH 2/5] open zip once and reuse list of entries --- .../command/impl/CreateNewDataFilesCommand.java | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java index e543606e039..76939751899 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java @@ -317,7 +317,8 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException */ try (var zipFile = openZipFile(tempFile, charset)) { - for (var entry : filteredZipEntries(zipFile)) { + var zipEntries = filteredZipEntries(zipFile); + for (var entry : zipEntries) { logger.fine("inside first zip pass; this entry: " + entry.getName()); numberOfUnpackableFiles++; if (numberOfUnpackableFiles > fileNumberLimit) { @@ -349,15 +350,12 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException } } } - } + // OK we're still here - that means we can proceed unzipping. - // OK we're still here - that means we can proceed unzipping. - - // reset: - combinedUnzippedFileSize = 0L; + // reset: + combinedUnzippedFileSize = 0L; - try (var zipFile = openZipFile(tempFile, charset)) { - for (var entry : filteredZipEntries(zipFile)) { + for (var entry : zipEntries) { if (datafiles.size() > fileNumberLimit) { logger.warning("Zip upload - too many files."); warningMessage = "The number of files in the zip archive is over the limit (" + fileNumberLimit From 2a62c0460c28a3704b26289a65cd00437684a7c6 Mon Sep 17 00:00:00 2001 From: jo-pol Date: Mon, 21 Oct 2024 15:15:44 +0200 Subject: [PATCH 3/5] performance test --- .../command/impl/CreateNewDataFilesTest.java | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java b/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java index 1262984eb27..a956b473a4b 100644 --- a/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java @@ -13,18 +13,26 @@ import edu.harvard.iq.dataverse.util.testing.JvmSetting; import edu.harvard.iq.dataverse.util.testing.LocalJvmSettings; import org.jetbrains.annotations.NotNull; +import org.joda.time.DateTime; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.mockito.MockedStatic; import org.mockito.Mockito; import java.io.FileInputStream; import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintStream; +import java.nio.file.Files; import java.nio.file.NoSuchFileException; import java.nio.file.Path; +import java.security.SecureRandom; +import java.text.MessageFormat; +import java.util.zip.ZipEntry; +import java.util.zip.ZipOutputStream; import static edu.harvard.iq.dataverse.DataFile.ChecksumType.MD5; import static org.apache.commons.io.file.FilesUncheck.createDirectories; @@ -146,6 +154,72 @@ public void execute_rezips_sets_of_shape_files_from_uploaded_zip() throws Except } } + @Disabled("Too slow. Intended for manual execution.") + @Test + @JvmSetting(key = JvmSettings.FILES_DIRECTORY, value = "/tmp/test/CreateNewDataFilesTest/tmp") + public void extract_zip_performance() throws Exception { + /* + Developed to test performance difference between the old implementation with ZipInputStream and the new ZipFile implementation. + Play with numbers depending on: + - the time you want to spend on this test + - how much system stress you want to examine + */ + var nrOfZipFiles = 20; + var avgNrOfFilesPerZip = 300; + var avgFileLength = 5000; + + var tmpUploadStorage = Path.of("/tmp/test/CreateNewDataFilesTest/tmp/temp"); + if(tmpUploadStorage.toFile().exists()) { + deleteDirectory(tmpUploadStorage); + } + createDirectories(tmpUploadStorage); // temp in target would choke intellij + + var chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; + var random = new SecureRandom(); + var totalNrOfFiles = 0; + var totalFileSize = 0; + var tmp = Path.of(Files.createTempDirectory(null).toString()); + var ctxt = mockCommandContext(mockSysConfig(false, 100000000L, MD5, 10000)); + try (var mockedJHoveFileType = Mockito.mockStatic(JhoveFileType.class)) { + mockedJHoveFileType.when(JhoveFileType::getJhoveConfigFile).thenReturn("conf/jhove/jhove.conf"); + var before = DateTime.now(); + for (var zipNr = 1; zipNr <= nrOfZipFiles; zipNr++) { + // build the zip + var zip = tmp.resolve(zipNr + "-data.zip"); + var nrOfFilesInZip = random.nextInt(avgNrOfFilesPerZip * 2); + try (var zipStream = new ZipOutputStream(new FileOutputStream(zip.toFile()))) { + for (var fileInZipNr = 1; fileInZipNr <= nrOfFilesInZip; fileInZipNr++) { + // build content for a file + var stringLength = random.nextInt(avgFileLength * 2 -5); + StringBuilder sb = new StringBuilder(stringLength); + for (int i = 1; i <= stringLength; i++) {// zero length causes buffer underflow + sb.append(chars.charAt(random.nextInt(chars.length()))); + } + // add the file to the zip + zipStream.putNextEntry(new ZipEntry(fileInZipNr + ".txt")); + zipStream.write((sb.toString()).getBytes()); + zipStream.closeEntry(); + totalFileSize += stringLength; + } + } + + // upload the zip + var result = createCmd(zip.toString(), mockDatasetVersion(), 1000L, 500L) + .execute(ctxt); + assertThat(result.getErrors()).hasSize(0); + assertThat(result.getDataFiles()).hasSize(nrOfFilesInZip); + totalNrOfFiles += nrOfFilesInZip; + + // report after each zip to have some data even when aborting a test that takes too long + System.out.println(MessageFormat.format( + "Total time: {0}ms; nr of zips {1} total nr of files {2}; total file size {3}", + DateTime.now().getMillis() - before.getMillis(), zipNr, totalNrOfFiles, totalFileSize + )); + } + assertThat(tmpUploadStorage.toFile().list()).hasSize(totalNrOfFiles); + } + } + private static @NotNull CreateNewDataFilesCommand createCmd(String name, DatasetVersion dsVersion, long allocatedQuotaLimit, long usedQuotaLimit) throws FileNotFoundException { return new CreateNewDataFilesCommand( Mockito.mock(DataverseRequest.class), From 255f4196ef053373205f632a460de392a592efb9 Mon Sep 17 00:00:00 2001 From: jo-pol Date: Tue, 22 Oct 2024 10:02:24 +0200 Subject: [PATCH 4/5] defensive assert; concise time measurement --- .../engine/command/impl/CreateNewDataFilesTest.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java b/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java index a956b473a4b..aa015dd68a6 100644 --- a/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java @@ -137,7 +137,7 @@ public void execute_rezips_sets_of_shape_files_from_uploaded_zip() throws Except assertThat(result.getErrors()).hasSize(0); assertThat(result.getDataFiles().stream().map(dataFile -> (dataFile.getFileMetadata().getDirectoryLabel() + "/" + dataFile.getDisplayName()) - .replaceAll(".*temp/shp_[-0-9]*/", "") + .replaceAll(".*temp/[-_shp0-9]*/", "") )).containsExactlyInAnyOrder( "dataDir/shape1.zip", "dataDir/shape2/shape2", @@ -178,11 +178,11 @@ public void extract_zip_performance() throws Exception { var random = new SecureRandom(); var totalNrOfFiles = 0; var totalFileSize = 0; + var totalTime = 0L; var tmp = Path.of(Files.createTempDirectory(null).toString()); var ctxt = mockCommandContext(mockSysConfig(false, 100000000L, MD5, 10000)); try (var mockedJHoveFileType = Mockito.mockStatic(JhoveFileType.class)) { mockedJHoveFileType.when(JhoveFileType::getJhoveConfigFile).thenReturn("conf/jhove/jhove.conf"); - var before = DateTime.now(); for (var zipNr = 1; zipNr <= nrOfZipFiles; zipNr++) { // build the zip var zip = tmp.resolve(zipNr + "-data.zip"); @@ -204,8 +204,11 @@ public void extract_zip_performance() throws Exception { } // upload the zip + var before = DateTime.now(); var result = createCmd(zip.toString(), mockDatasetVersion(), 1000L, 500L) .execute(ctxt); + totalTime += DateTime.now().getMillis() - before.getMillis(); + assertThat(result.getErrors()).hasSize(0); assertThat(result.getDataFiles()).hasSize(nrOfFilesInZip); totalNrOfFiles += nrOfFilesInZip; @@ -213,7 +216,7 @@ public void extract_zip_performance() throws Exception { // report after each zip to have some data even when aborting a test that takes too long System.out.println(MessageFormat.format( "Total time: {0}ms; nr of zips {1} total nr of files {2}; total file size {3}", - DateTime.now().getMillis() - before.getMillis(), zipNr, totalNrOfFiles, totalFileSize + totalTime, zipNr, totalNrOfFiles, totalFileSize )); } assertThat(tmpUploadStorage.toFile().list()).hasSize(totalNrOfFiles); From 580ca364305f8d97404a73724416e549553dcf05 Mon Sep 17 00:00:00 2001 From: jo-pol Date: Tue, 22 Oct 2024 14:10:17 +0200 Subject: [PATCH 5/5] more defensive assert Intellij shows directory labels like ewDataFilesTest/tmp/temp/shp_2024-10-22-01-57-21-833/dataDir/extra possibly different environments have different values --- .../command/impl/CreateNewDataFilesTest.java | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java b/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java index aa015dd68a6..f49ebcea39c 100644 --- a/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java +++ b/src/test/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesTest.java @@ -137,16 +137,16 @@ public void execute_rezips_sets_of_shape_files_from_uploaded_zip() throws Except assertThat(result.getErrors()).hasSize(0); assertThat(result.getDataFiles().stream().map(dataFile -> (dataFile.getFileMetadata().getDirectoryLabel() + "/" + dataFile.getDisplayName()) - .replaceAll(".*temp/[-_shp0-9]*/", "") + .replaceAll(".*/dataDir/", "") )).containsExactlyInAnyOrder( - "dataDir/shape1.zip", - "dataDir/shape2/shape2", - "dataDir/shape2/shape2.pdf", - "dataDir/shape2/shape2.txt", - "dataDir/shape2/shape2.zip", - "dataDir/extra/shp_dictionary.xls", - "dataDir/extra/notes", - "dataDir/extra/README.MD" + "shape1.zip", + "shape2/shape2", + "shape2/shape2.pdf", + "shape2/shape2.txt", + "shape2/shape2.zip", + "extra/shp_dictionary.xls", + "extra/notes", + "extra/README.MD" ); var storageIds = result.getDataFiles().stream().map(DataFile::getStorageIdentifier).toList(); assertThat(tempDir.toFile().list())