Merged FilesInDatasetCache
janvanmansum committed Dec 7, 2024
2 parents 286ca7c + ca639f7 commit 8560b6b
Showing 19 changed files with 492 additions and 83 deletions.
12 changes: 6 additions & 6 deletions docs/description.md
@@ -94,9 +94,9 @@ editFiles:
categories: [ 'Testlabel' ]
ignoreFiles:
- 'file7.txt' # This file will NOT be added to the dataset
-renameAtUploadFiles:
-  - from: 'file8.txt' # Local file name
-    to: 'file9.txt' # The file name assigned in the dataset
+autoRenameFiles:
+  - from: "Unsanitize'd/file?" # Local file name
+    to: "Sanitize_d/file_" # The file name assigned in the dataset
addEmbargoes:
- filePaths: [ 'file1.txt' ] # All other files will NOT be embargoed
dateAvailable: '2030-01-01'
@@ -106,10 +106,10 @@ editFiles:
reason: 'Pending publication'
```
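
The `autoRenameFiles` entries map local file names to the names they get in the dataset. A minimal illustration of how such a from/to pair acts as a lookup table at upload time (the class below is a hypothetical sketch, not part of dd-dataverse-ingest; the values are copied from the example above):

```java
import java.util.Map;

public class AutoRenameExample {
    public static void main(String[] args) {
        // from -> to, as declared under autoRenameFiles above
        Map<String, String> renameMap = Map.of("Unsanitize'd/file?", "Sanitize_d/file_");

        // At upload time the local name is swapped for the sanitized dataset name;
        // files without an entry keep their own name.
        String local = "Unsanitize'd/file?";
        String inDataset = renameMap.getOrDefault(local, local);
        System.out.println(local + " -> " + inDataset); // Unsanitize'd/file? -> Sanitize_d/file_
    }
}
```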
-The actions specified in this file correspond roughly to the action available in the dropdown menu in the file view of a dataset in Dataverse.
+The actions specified in this file correspond roughly to the actions available in the dropdown menu in the file view of a dataset in Dataverse.
-The replacement file is looked up in the bag, under the `data` directory under the same path as the original file has in the dataset. Note that files the
-replacement files will automatically be skipped in the add files step, the deleted files, however, will not. In other words, it is also possible to remove a
+The replacement file is looked up in the bag, under the `data` directory under the same path as the original file has in the dataset. Note that files in
+`replaceFiles` will automatically be skipped in the add files step, the deleted files, however, will not. In other words, it is also possible to remove a
file and add a file back to the same location in one deposit. In that case, there will be no continuous history of the file in the dataset.
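
A compact sketch of the lookup rule just described (the paths are made up; the resolution rule is the one stated above):

```java
import java.nio.file.Path;

public class ReplacementLookup {
    public static void main(String[] args) {
        // A replacement for <path-in-dataset> is expected at data/<path-in-dataset> in the bag.
        Path bagDir = Path.of("example-bag");      // hypothetical bag directory
        String pathInDataset = "subdir/file2.txt"; // hypothetical path of the file in the dataset
        Path replacement = bagDir.resolve("data").resolve(pathInDataset);
        System.out.println(replacement); // example-bag/data/subdir/file2.txt
    }
}
```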

The `addRestrictedFiles` action is included, because it allows you to add a large number of restricted files in a more efficient way than by updating the file
24 changes: 24 additions & 0 deletions docs/dev.md
@@ -0,0 +1,24 @@
Development
===========

## Local debugging

To debug locally, you need the following services running:

* A Dataverse instance. Internal DANS developers can use the Vagrant boxes with development versions of the Data Stations for this. You will need to configure
  access to the admin interface to use the unblock-key:

curl -X PUT -d s3kretKey http://localhost:8080/api/admin/settings/:BlockedApiKey
curl -X PUT -d unblock-key http://localhost:8080/api/admin/settings/:BlockedApiPolicy

# When done debugging, you can reset the policy to localhost-only:
curl -X PUT -d localhost-only http://localhost:8080/api/admin/settings/:BlockedApiPolicy/?unblock-key=s3kretKey
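
    # Optional: verify the current settings (assumes the standard admin settings
    # listing endpoint; adjust host and port to your instance)
    curl http://localhost:8080/api/admin/settings?unblock-key=s3kretKey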

* [dd-validate-dans-bag]{:target="_blank"}. Note that its `validation.baseFolder` configuration property should point to the deposit area or an ancestor of it.

Calling `dd-dataverse-ingest` is most conveniently done through the [dd-dataverse-ingest-cli]{:target="_blank"} command-line tool.



[dd-validate-dans-bag]: {{ validate_dans_bag_url }}
[dd-dataverse-ingest-cli]: {{ dataverse_ingest_cli_url }}
2 changes: 2 additions & 0 deletions mkdocs.yml
@@ -35,6 +35,8 @@ extra:
bagit_specs_url: https://www.rfc-editor.org/rfc/rfc8493.html
dans_bag_profile_url: https://dans-knaw.github.io/dans-bagit-profile/versions/1.2.0/
dataverse_api_url: https://guides.dataverse.org/en/latest/api
+validate_dans_bag_url: https://github.com/DANS-KNAW/dd-validate-dans-bag
+dataverse_ingest_cli_url: https://github.com/DANS-KNAW/dd-dataverse-ingest-cli

plugins:
- markdownextradata
2 changes: 1 addition & 1 deletion pom.xml
@@ -26,7 +26,7 @@
</parent>

<artifactId>dd-dataverse-ingest</artifactId>
-<version>0.9.1-SNAPSHOT</version>
+<version>0.9.2-SNAPSHOT</version>

<name>DD Dataverse Ingest</name>
<url>https://github.com/DANS-KNAW/dd-dataverse-ingest</url>
109 changes: 46 additions & 63 deletions src/main/java/nl/knaw/dans/dvingest/core/bagprocessor/FilesEditor.java
@@ -18,12 +18,12 @@
import lombok.AccessLevel;
import lombok.Getter;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import nl.knaw.dans.dvingest.core.service.DataverseService;
import nl.knaw.dans.dvingest.core.service.FilesInDataset;
import nl.knaw.dans.dvingest.core.service.UtilityServices;
import nl.knaw.dans.dvingest.core.yaml.EditFiles;
import nl.knaw.dans.dvingest.core.yaml.FromTo;
import nl.knaw.dans.lib.dataverse.DataverseException;
import nl.knaw.dans.lib.dataverse.model.dataset.Embargo;
import nl.knaw.dans.lib.dataverse.model.file.FileMeta;
@@ -36,39 +36,38 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.stream.Collectors;

@Slf4j

public class FilesEditor {
@NonNull
private final UUID depositId;
@NonNull
private final Path dataDir;
private final EditFiles editFiles;

@NonNull
private final DataverseService dataverseService;

@NonNull
private final UtilityServices utilityServices;
@Getter(AccessLevel.PACKAGE) // for testing
private final FilesInDatasetCache filesInDatasetCache;

private String pid;

@Getter(AccessLevel.PACKAGE) // Getter for unit testing
// private final Map<String, FileMeta> filesInDataset = new java.util.HashMap<>();
private boolean filesRetrieved = false;

private FilesInDataset filesInDataset;

public FilesEditor(UUID depositId, Path dataDir, EditFiles editFiles, DataverseService dataverseService, UtilityServices utilityServices) {
public FilesEditor(@NonNull UUID depositId, @NonNull Path dataDir, @NonNull EditFiles editFiles, @NonNull DataverseService dataverseService,
@NonNull UtilityServices utilityServices) {
this.depositId = depositId;
this.dataDir = dataDir;
this.editFiles = editFiles;
this.dataverseService = dataverseService;
this.utilityServices = utilityServices;
this.filesInDatasetCache = new FilesInDatasetCache(dataverseService, getRenameMap(editFiles.getAutoRenameFiles()));
}

private static Map<String, String> getRenameMap(List<FromTo> renames) {
return renames.stream()
.map(rename -> Map.entry(rename.getFrom(), rename.getTo()))
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
}
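// Note (a sketch, not part of the commit): since FromTo exposes getFrom and getTo,
// the same map can also be collected in one step:
//   return renames.stream().collect(Collectors.toMap(FromTo::getFrom, FromTo::getTo));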

public void editFiles(String pid) throws IOException, DataverseException {
@@ -80,18 +79,15 @@ public void editFiles(String pid) throws IOException, DataverseException {
* - updateFileMetas must exist in bag if first version deposit
*/
if (editFiles == null) {
try (var stream = Files.list(dataDir)) {
if (stream.findAny().isEmpty()) {
log.debug("No files to edit for deposit {}", depositId);
return;
}
if (isEmptyDir(dataDir)) {
log.debug("No files to edit for deposit {}", depositId);
return;
}
}

filesInDataset = dataverseService.getFilesInDataset(pid);

log.debug("Start editing files for deposit {}", depositId);
this.pid = pid;
filesInDatasetCache.downloadFromDataset(pid);
if (editFiles != null) {
deleteFiles();
replaceFiles();
@@ -106,26 +102,32 @@ public void editFiles(String pid) throws IOException, DataverseException {
log.debug("End editing files for deposit {}", depositId);
}

private boolean isEmptyDir(Path dir) throws IOException {
try (var stream = Files.list(dir)) {
return stream.findAny().isEmpty();
}
}

private void deleteFiles() throws IOException, DataverseException {
log.debug("Start deleting {} files for deposit {}", depositId, editFiles.getDeleteFiles().size());
for (var file : editFiles.getDeleteFiles()) {
log.debug("Deleting file: {}", file);
var fileToDelete = filesInDataset.get(file);
for (var filepath : editFiles.getDeleteFiles()) {
log.debug("Deleting file: {}", filepath);
var fileToDelete = filesInDatasetCache.get(filepath);
if (fileToDelete == null) {
throw new IllegalArgumentException("File to delete not found in dataset: " + file);
throw new IllegalArgumentException("File to delete not found in dataset: " + filepath);
}
dataverseService.deleteFile(fileToDelete.getDataFile().getId());
filesInDataset.remove(file);
filesInDatasetCache.remove(filepath);
}
log.debug("End deleting files for deposit {}", depositId);
}

private void replaceFiles() throws IOException, DataverseException {
log.debug("Start replacing {} files for deposit {}", depositId, editFiles.getReplaceFiles().size());
for (var file : editFiles.getReplaceFiles()) {
log.debug("Replacing file: {}", file);
var fileMeta = filesInDataset().get(file);
dataverseService.replaceFile(pid, fileMeta, dataDir.resolve(file));
for (var filepath : editFiles.getReplaceFiles()) {
log.debug("Replacing file: {}", filepath);
var fileMeta = filesInDatasetCache.get(filepath);
dataverseService.replaceFile(pid, fileMeta, dataDir.resolve(filepath));
}
log.debug("End replacing files for deposit {}", depositId);
}
Expand Down Expand Up @@ -160,16 +162,10 @@ private Iterator<File> getRestrictedFilesToUpload() {
new FileUploadInclusionPredicate(editFiles, dataDir, true));
}

private Map<String, String> getRenameMap() {
return editFiles.getRenameAtUploadFiles().stream()
.map(rename -> Map.entry(rename.getFrom(), rename.getTo()))
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
}

private void uploadFileBatch(PathIterator iterator, boolean restrict) throws IOException, DataverseException {
var tempZipFile = utilityServices.createTempZipFile();
try {
var zipFile = utilityServices.createPathIteratorZipperBuilder(getRenameMap())
var zipFile = utilityServices.createPathIteratorZipperBuilder(filesInDatasetCache.getAutoRenamedFiles())
.rootDir(dataDir)
.sourceIterator(iterator)
.targetZipFile(tempZipFile)
@@ -178,10 +174,10 @@ private void uploadFileBatch(PathIterator iterator, boolean restrict) throws IOE
var fileMeta = new FileMeta();
fileMeta.setRestricted(restrict);
log.debug("Start uploading zip file at {} for deposit {}", zipFile, depositId);
var fileList = dataverseService.addFile(pid, zipFile, fileMeta);
log.debug("Uploaded {} files, {} cumulative)", fileList.getFiles().size(), iterator.getIteratedCount());
for (var file : fileList.getFiles()) {
filesInDataset.put(getPath(file), file);
var addedFileMetaList = dataverseService.addFile(pid, zipFile, fileMeta);
log.debug("Uploaded {} files, {} cumulative)", addedFileMetaList.getFiles().size(), iterator.getIteratedCount());
for (var fm : addedFileMetaList.getFiles()) {
filesInDatasetCache.put(fm); // auto-rename is done by PathIteratorZipper
}
}
finally {
@@ -192,40 +188,24 @@ private void uploadFileBatch(PathIterator iterator, boolean restrict) throws IOE
private void moveFiles() throws IOException, DataverseException {
log.debug("Start moving files {} for deposit {}", editFiles.getMoveFiles().size(), depositId);
for (var move : editFiles.getMoveFiles()) {
var fileMeta = filesInDataset().get(move.getFrom());
var dvToPath = new DataversePath(move.getTo());
fileMeta.setDirectoryLabel(dvToPath.getDirectoryLabel());
fileMeta.setLabel(dvToPath.getLabel());
var fileMeta = filesInDatasetCache.get(move.getFrom());
fileMeta = filesInDatasetCache.createFileMetaForMovedFile(move.getTo(), fileMeta);
dataverseService.updateFileMetadata(fileMeta.getDataFile().getId(), fileMeta);
filesInDatasetCache.remove(move.getFrom());
filesInDatasetCache.put(fileMeta); // auto-rename is done by getMovedFile
}
log.debug("End moving files for deposit {}", depositId);
}

private void updateFileMetas() throws IOException, DataverseException {
log.debug("Start updating {} file metas for deposit {}", editFiles.getUpdateFileMetas().size(), depositId);
for (var fileMeta : editFiles.getUpdateFileMetas()) {
var id = filesInDataset().get(getPath(fileMeta)).getDataFile().getId();
var id = filesInDatasetCache.get(getPath(fileMeta)).getDataFile().getId();
dataverseService.updateFileMetadata(id, fileMeta);
}
log.debug("End updating file metadata for deposit {}", depositId);
}

private Map<String, FileMeta> filesInDataset() throws IOException, DataverseException {
if (!filesRetrieved) {
log.debug("Start getting files in dataset for deposit {}", depositId);
var files = dataverseService.getFiles(pid);
for (var file : files) {
filesInDataset.put(getPath(file), file);
}
filesRetrieved = true;
log.debug("End getting files in dataset for deposit {}", depositId);
}
else {
log.debug("Files in dataset already retrieved for deposit {}", depositId);
}
return filesInDataset;
}

private String getPath(FileMeta file) {
var dataversePath = new DataversePath(file.getDirectoryLabel(), file.getLabel());
return dataversePath.toString();
@@ -237,7 +217,10 @@ private void addEmbargoes() throws IOException, DataverseException {
var embargo = new Embargo();
embargo.setDateAvailable(addEmbargo.getDateAvailable());
embargo.setReason(addEmbargo.getReason());
var fileIds = addEmbargo.getFilePaths().stream().map(filesInDataset::get).mapToInt(file -> file.getDataFile().getId()).toArray();
var fileIds = addEmbargo.getFilePaths()
.stream()
.map(filesInDatasetCache::get)
.mapToInt(file -> file.getDataFile().getId()).toArray();
embargo.setFileIds(fileIds);
dataverseService.addEmbargo(pid, embargo);
}
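
The `FilesInDatasetCache` class itself, the subject of this merge, is not in the loaded portion of the diff. For orientation, here is a minimal sketch of its surface as inferred from the call sites in `FilesEditor` above; all names, signatures, and method bodies are assumptions, not the committed implementation:

```java
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import nl.knaw.dans.dvingest.core.service.DataverseService;
import nl.knaw.dans.lib.dataverse.DataverseException;
import nl.knaw.dans.lib.dataverse.model.file.FileMeta;

// DataversePath is the same helper used by FilesEditor above; its import is
// not visible in this part of the diff.
public class FilesInDatasetCache {
    private final DataverseService dataverseService;
    private final Map<String, FileMeta> filesByPath = new HashMap<>();
    private final Map<String, String> autoRenamedFiles; // local path -> path in dataset

    public FilesInDatasetCache(DataverseService dataverseService, Map<String, String> autoRenamedFiles) {
        this.dataverseService = dataverseService;
        this.autoRenamedFiles = Map.copyOf(autoRenamedFiles);
    }

    // Fills the cache once with the file metadata currently in the dataset.
    public void downloadFromDataset(String pid) throws IOException, DataverseException {
        for (var fileMeta : dataverseService.getFiles(pid)) {
            filesByPath.put(pathOf(fileMeta), fileMeta);
        }
    }

    // Lookup by dataset path; local paths that were auto-renamed are translated first.
    public FileMeta get(String filepath) {
        return filesByPath.get(autoRenamedFiles.getOrDefault(filepath, filepath));
    }

    public void put(FileMeta fileMeta) {
        filesByPath.put(pathOf(fileMeta), fileMeta);
    }

    public void remove(String filepath) {
        filesByPath.remove(autoRenamedFiles.getOrDefault(filepath, filepath));
    }

    public Map<String, String> getAutoRenamedFiles() {
        return autoRenamedFiles;
    }

    // Updates directoryLabel/label for a move, as used by moveFiles above.
    public FileMeta createFileMetaForMovedFile(String toPath, FileMeta fileMeta) {
        var dvPath = new DataversePath(toPath);
        fileMeta.setDirectoryLabel(dvPath.getDirectoryLabel());
        fileMeta.setLabel(dvPath.getLabel());
        return fileMeta;
    }

    private String pathOf(FileMeta fileMeta) {
        return new DataversePath(fileMeta.getDirectoryLabel(), fileMeta.getLabel()).toString();
    }
}
```

The point of the cache is visible in the call sites: the dataset's file metadata is downloaded once per deposit, and every subsequent edit step reads and updates the same in-memory map instead of re-querying Dataverse.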