Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP #5

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/description.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,9 @@ editFiles:
categories: [ 'Testlabel' ]
ignoreFiles:
- 'file7.txt' # This file will NOT be added to the dataset
renameAtUploadFiles:
- from: 'file8.txt' # Local file name
to: 'file9.txt' # The file name assigned in the dataset
autoRenameFiles:
- from: "Unsanitize'd/file?" # Local file name
to: "Sanitize_d/file_" # The file name assigned in the dataset
addEmbargoes:
- filePaths: [ 'file1.txt' ] # All other files will NOT be embargoed
dateAvailable: '2030-01-01'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
import lombok.AccessLevel;
import lombok.Getter;
import lombok.NonNull;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import nl.knaw.dans.dvingest.core.service.DataverseService;
import nl.knaw.dans.dvingest.core.service.UtilityServices;
import nl.knaw.dans.dvingest.core.yaml.EditFiles;
import nl.knaw.dans.dvingest.core.yaml.FromTo;
import nl.knaw.dans.lib.dataverse.DataverseException;
import nl.knaw.dans.lib.dataverse.model.dataset.Embargo;
import nl.knaw.dans.lib.dataverse.model.file.FileMeta;
Expand All @@ -35,32 +35,38 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.stream.Collectors;

@Slf4j
@RequiredArgsConstructor
public class FilesEditor {
@NonNull
private final UUID depositId;
@NonNull
private final Path dataDir;
private final EditFiles editFiles;

@NonNull
private final DataverseService dataverseService;

@NonNull
private final UtilityServices utilityServices;
@Getter(AccessLevel.PACKAGE) // for testing
private final FilesInDatasetCache filesInDatasetCache;

private String pid;

@Getter(AccessLevel.PACKAGE) // Getter for unit testing
private final Map<String, FileMeta> filesInDataset = new java.util.HashMap<>();
private boolean filesRetrieved = false;
    /**
     * Creates an editor for the files of one deposit.
     *
     * @param depositId        identifier of the deposit being processed (used for logging)
     * @param dataDir          directory containing the local files referenced by the edit actions
     * @param editFiles        the file edit actions to carry out (delete, replace, move, etc.)
     * @param dataverseService service used to talk to the Dataverse API
     * @param utilityServices  helper services (e.g. temp ZIP creation for batched uploads)
     */
    public FilesEditor(@NonNull UUID depositId, @NonNull Path dataDir, @NonNull EditFiles editFiles, @NonNull DataverseService dataverseService,
        @NonNull UtilityServices utilityServices) {
        this.depositId = depositId;
        this.dataDir = dataDir;
        this.editFiles = editFiles;
        this.dataverseService = dataverseService;
        this.utilityServices = utilityServices;
        // The cache translates local bag paths to their auto-renamed (sanitized) paths in the dataset.
        this.filesInDatasetCache = new FilesInDatasetCache(dataverseService, getRenameMap(editFiles.getAutoRenameFiles()));
    }

private Map<String, String> renameMap;
private static Map<String, String> getRenameMap(List<FromTo> renames) {
return renames.stream()
.map(rename -> Map.entry(rename.getFrom(), rename.getTo()))
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
}

public void editFiles(String pid) throws IOException, DataverseException {
/*
Expand All @@ -79,7 +85,7 @@ public void editFiles(String pid) throws IOException, DataverseException {

log.debug("Start editing files for deposit {}", depositId);
this.pid = pid;
this.renameMap = getRenameMap();
filesInDatasetCache.downloadFromDataset(pid);
if (editFiles != null) {
deleteFiles();
replaceFiles();
Expand All @@ -102,24 +108,24 @@ private boolean isEmptyDir(Path dir) throws IOException {

private void deleteFiles() throws IOException, DataverseException {
log.debug("Start deleting {} files for deposit {}", depositId, editFiles.getDeleteFiles().size());
for (var file : editFiles.getDeleteFiles()) {
log.debug("Deleting file: {}", file);
var fileToDelete = filesInDataset().get(file);
for (var filepath : editFiles.getDeleteFiles()) {
log.debug("Deleting file: {}", filepath);
var fileToDelete = filesInDatasetCache.get(filepath);
if (fileToDelete == null) {
throw new IllegalArgumentException("File to delete not found in dataset: " + file);
throw new IllegalArgumentException("File to delete not found in dataset: " + filepath);
}
dataverseService.deleteFile(fileToDelete.getDataFile().getId());
filesInDataset.remove(file);
filesInDatasetCache.remove(filepath);
}
log.debug("End deleting files for deposit {}", depositId);
}

private void replaceFiles() throws IOException, DataverseException {
log.debug("Start replacing {} files for deposit {}", depositId, editFiles.getReplaceFiles().size());
for (var file : editFiles.getReplaceFiles()) {
log.debug("Replacing file: {}", file);
var fileMeta = filesInDataset().get(file);
dataverseService.replaceFile(pid, fileMeta, dataDir.resolve(file));
for (var filepath : editFiles.getReplaceFiles()) {
log.debug("Replacing file: {}", filepath);
var fileMeta = filesInDatasetCache.get(filepath);
dataverseService.replaceFile(pid, fileMeta, dataDir.resolve(filepath));
}
log.debug("End replacing files for deposit {}", depositId);
}
Expand Down Expand Up @@ -154,16 +160,10 @@ private Iterator<File> getRestrictedFilesToUpload() {
new FileUploadInclusionPredicate(editFiles, dataDir, true));
}

private Map<String, String> getRenameMap() {
return editFiles.getRenameAtUploadFiles().stream()
.map(rename -> Map.entry(rename.getFrom(), rename.getTo()))
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
}

private void uploadFileBatch(PathIterator iterator, boolean restrict) throws IOException, DataverseException {
var tempZipFile = utilityServices.createTempZipFile();
try {
var zipFile = utilityServices.createPathIteratorZipperBuilder(renameMap)
var zipFile = utilityServices.createPathIteratorZipperBuilder(filesInDatasetCache.getAutoRenamedFiles())
.rootDir(dataDir)
.sourceIterator(iterator)
.targetZipFile(tempZipFile)
Expand All @@ -172,10 +172,10 @@ private void uploadFileBatch(PathIterator iterator, boolean restrict) throws IOE
var fileMeta = new FileMeta();
fileMeta.setRestricted(restrict);
log.debug("Start uploading zip file at {} for deposit {}", zipFile, depositId);
var fileList = dataverseService.addFile(pid, zipFile, fileMeta);
log.debug("Uploaded {} files, {} cumulative)", fileList.getFiles().size(), iterator.getIteratedCount());
for (var file : fileList.getFiles()) {
filesInDataset.put(getPath(file), file);
var addedFileMetaList = dataverseService.addFile(pid, zipFile, fileMeta);
log.debug("Uploaded {} files, {} cumulative)", addedFileMetaList.getFiles().size(), iterator.getIteratedCount());
for (var fm : addedFileMetaList.getFiles()) {
filesInDatasetCache.put(fm); // auto-rename is done by PathIteratorZipper
}
}
finally {
Expand All @@ -186,40 +186,24 @@ private void uploadFileBatch(PathIterator iterator, boolean restrict) throws IOE
private void moveFiles() throws IOException, DataverseException {
log.debug("Start moving files {} for deposit {}", editFiles.getMoveFiles().size(), depositId);
for (var move : editFiles.getMoveFiles()) {
var fileMeta = filesInDataset().get(move.getFrom());
var dvToPath = new DataversePath(move.getTo());
fileMeta.setDirectoryLabel(dvToPath.getDirectoryLabel());
fileMeta.setLabel(dvToPath.getLabel());
var fileMeta = filesInDatasetCache.get(move.getFrom());
fileMeta = filesInDatasetCache.createFileMetaForMovedFile(move.getTo(), fileMeta);
dataverseService.updateFileMetadata(fileMeta.getDataFile().getId(), fileMeta);
filesInDatasetCache.remove(move.getFrom());
filesInDatasetCache.put(fileMeta); // auto-rename is done by getMovedFile
}
log.debug("End moving files for deposit {}", depositId);
}

private void updateFileMetas() throws IOException, DataverseException {
log.debug("Start updating {} file metas for deposit {}", editFiles.getUpdateFileMetas().size(), depositId);
for (var fileMeta : editFiles.getUpdateFileMetas()) {
var id = filesInDataset().get(getPath(fileMeta)).getDataFile().getId();
var id = filesInDatasetCache.get(getPath(fileMeta)).getDataFile().getId();
dataverseService.updateFileMetadata(id, fileMeta);
}
log.debug("End updating file metadata for deposit {}", depositId);
}

private Map<String, FileMeta> filesInDataset() throws IOException, DataverseException {
if (!filesRetrieved) {
log.debug("Start getting files in dataset for deposit {}", depositId);
var files = dataverseService.getFiles(pid);
for (var file : files) {
filesInDataset.put(getPath(file), file);
}
filesRetrieved = true;
log.debug("End getting files in dataset for deposit {}", depositId);
}
else {
log.debug("Files in dataset already retrieved for deposit {}", depositId);
}
return filesInDataset;
}

private String getPath(FileMeta file) {
var dataversePath = new DataversePath(file.getDirectoryLabel(), file.getLabel());
return dataversePath.toString();
Expand All @@ -233,24 +217,11 @@ private void addEmbargoes() throws IOException, DataverseException {
embargo.setReason(addEmbargo.getReason());
var fileIds = addEmbargo.getFilePaths()
.stream()
.map(this::renameFile)
.map(this::throwIfFileNotInDataset)
.map(filesInDataset::get)
.map(filesInDatasetCache::get)
.mapToInt(file -> file.getDataFile().getId()).toArray();
embargo.setFileIds(fileIds);
dataverseService.addEmbargo(pid, embargo);
}
log.debug("End adding embargoes for deposit {}", depositId);
}

private String renameFile(String file) {
return renameMap.getOrDefault(file, file);
}

private String throwIfFileNotInDataset(String file) {
if (!filesInDataset.containsKey(file)) {
throw new IllegalArgumentException("File not found in dataset: " + file);
}
return file;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
* Copyright (C) 2024 DANS - Data Archiving and Networked Services ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nl.knaw.dans.dvingest.core.bagprocessor;

import lombok.Getter;
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;
import nl.knaw.dans.dvingest.core.service.DataverseService;
import nl.knaw.dans.lib.dataverse.DataverseException;
import nl.knaw.dans.lib.dataverse.model.file.FileMeta;

import java.io.IOException;
import java.util.Collections;
import java.util.Map;

/**
 * <p>
 * Caches the FileMeta objects of the files that are part of a dataset. The cache is filled once by downloading the file list from Dataverse.
 * </p>
 */
@Slf4j
public class FilesInDatasetCache {
    private final DataverseService dataverseService;
    /*
     * Maps the filepath as it exists in the dataset (i.e. after auto-rename) to its FileMeta.
     */
    private final Map<String, FileMeta> metaByDatasetPath = new java.util.HashMap<>();
    @Getter
    private final Map<String, String> autoRenamedFiles;
    private boolean downloaded = false;

    public FilesInDatasetCache(@NonNull DataverseService dataverseService, @NonNull Map<String, String> autoRenamedFiles) {
        this.dataverseService = dataverseService;
        // Expose a read-only view so the rename rules cannot be altered through the getter.
        this.autoRenamedFiles = Collections.unmodifiableMap(autoRenamedFiles);
    }

    /**
     * Looks up the FileMeta for a local (bag) filepath. The path is first translated through the auto-rename map, so the caller can use the local path from the bag.
     *
     * @param filepath the filepath before auto-rename
     * @return the cached FileMeta for the file in the dataset
     */
    public FileMeta get(@NonNull String filepath) {
        return metaByDatasetPath.get(autoRenamePath(filepath));
    }

    private String autoRenamePath(@NonNull String filepath) {
        var renamed = autoRenamedFiles.get(filepath);
        return renamed == null ? filepath : renamed;
    }

    /**
     * Inserts or replaces the FileMeta for a file. No auto-rename translation is applied; the FileMeta is assumed to come from the Dataverse API or to have been renamed by the client already.
     *
     * @param fileMeta the FileMeta to cache
     */
    public void put(@NonNull FileMeta fileMeta) {
        metaByDatasetPath.put(getPath(fileMeta), fileMeta);
    }

    /**
     * A move is in fact a metadata update in which the directory label and label change. This computes the FileMeta of the moved file; the target path is first translated through the auto-rename
     * map, so the caller can use the local path from the bag.
     *
     * @param toPath   the destination filepath before auto-rename
     * @param fileMeta the FileMeta of the file being moved
     * @return the FileMeta updated with the new directory label and label
     */
    public FileMeta createFileMetaForMovedFile(@NonNull String toPath, @NonNull FileMeta fileMeta) {
        var target = new DataversePath(autoRenamePath(toPath));
        fileMeta.setDirectoryLabel(target.getDirectoryLabel());
        fileMeta.setLabel(target.getLabel());
        return fileMeta;
    }

    /**
     * Evicts the FileMeta for a local (bag) filepath. The path is first translated through the auto-rename map, so the caller can use the local path from the bag.
     *
     * @param filepath the filepath before auto-rename
     */
    public void remove(@NonNull String filepath) {
        metaByDatasetPath.remove(autoRenamePath(filepath));
    }

    /**
     * Fills the cache with the file metadata of the dataset identified by pid. May be called only once per instance; subsequent calls throw.
     *
     * @param pid the persistent identifier of the dataset
     * @throws IOException           if an I/O error occurs
     * @throws DataverseException    if the Dataverse API returns an error
     * @throws IllegalStateException if the cache was already filled
     */
    public void downloadFromDataset(@NonNull String pid) throws IOException, DataverseException {
        if (downloaded) {
            throw new IllegalStateException("Cache already initialized");
        }
        dataverseService.getFiles(pid).forEach(this::put);
        downloaded = true;
    }

    private String getPath(@NonNull FileMeta file) {
        return new DataversePath(file.getDirectoryLabel(), file.getLabel()).toString();
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ public EditFiles getEditFilesFromDansDeposit(Deposit dansDeposit) {

pathFileInfoMap = removeIgnoredFiles(pathFileInfoMap, ignoredFiles);

editFiles.setRenameAtUploadFiles(getRenameAtUpload(pathFileInfoMap));
editFiles.setAutoRenameFiles(getAutoRenameFiles(pathFileInfoMap));

editFiles.setAddRestrictedFiles(pathFileInfoMap.entrySet().stream()
.filter(entry -> entry.getValue().getMetadata().getRestricted())
Expand All @@ -163,7 +163,7 @@ public EditFiles getEditFilesFromDansDeposit(Deposit dansDeposit) {
return editFiles;
}

private List<FromTo> getRenameAtUpload(Map<Path, FileInfo> files) {
private List<FromTo> getAutoRenameFiles(Map<Path, FileInfo> files) {
ArrayList<FromTo> fromTos = new ArrayList<>();
for (var entry : files.entrySet()) {
if (entry.getValue().isSanitized()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public class EditFiles {
private List<String> addRestrictedFiles = List.of();
private List<FromTo> moveFiles = List.of();
private List<String> ignoreFiles = List.of();
private List<FromTo> renameAtUploadFiles = List.of();
private List<FromTo> autoRenameFiles = List.of();
private List<FileMeta> updateFileMetas = List.of();
private List<AddEmbargo> addEmbargoes = List.of();
}
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ public void deleteFiles_deletes_files_from_dataset() throws Exception {
// Then
Mockito.verify(dataverseServiceMock).deleteFile(1);
Mockito.verify(dataverseServiceMock).deleteFile(3);
assertThat(filesEditor.getFilesInDataset()).containsOnlyKeys("file2");
assertThat(filesEditor.getFilesInDatasetCache().get("file1")).isNull();
}

@Test
Expand All @@ -97,4 +97,6 @@ public void deleteFiles_throws_exception_when_file_not_found() throws Exception
.hasMessage("File to delete not found in dataset: file4");
}



}
Loading
Loading