Transplanted moveFile algorithm from old code
janvanmansum committed Dec 7, 2024
1 parent 481a848 commit 06c80eb
Showing 8 changed files with 104 additions and 164 deletions.
@@ -20,7 +20,6 @@
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;
import nl.knaw.dans.dvingest.core.service.DataverseService;
import nl.knaw.dans.dvingest.core.service.FilesInDataset;
import nl.knaw.dans.dvingest.core.service.UtilityServices;
import nl.knaw.dans.dvingest.core.yaml.EditFiles;
import nl.knaw.dans.dvingest.core.yaml.FromTo;
@@ -37,6 +37,7 @@ public class FilesInDatasetCache {
/*
* Key: filepath after auto-rename / Value: FileMeta object
*/
@Getter
private final Map<String, FileMeta> filesInDataset = new java.util.HashMap<>();
@Getter
private final Map<String, String> autoRenamedFiles;
@@ -175,26 +175,6 @@ public String packageOriginalMetadata(Deposit dansDeposit) throws IOException {
return zipFile.toString();
}

private boolean hasAttributes(FileMeta fileMeta) {
return (fileMeta.getCategories() != null && !fileMeta.getCategories().isEmpty()) ||
(fileMeta.getDescription() != null && !fileMeta.getDescription().isBlank());
}

Map<Path, FileInfo> getFileInfo(Deposit dansDeposit) {
var files = FileElement.pathToFileInfo(dansDeposit, false); // TODO: handle migration case

return files.entrySet().stream()
.map(entry -> {
// relativize the path
var bagPath = entry.getKey();
var fileInfo = entry.getValue();
var newKey = Path.of("data").relativize(bagPath);

return Map.entry(newKey, fileInfo);
})
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
}

Optional<String> getDateOfDeposit(Deposit dansDeposit) {
if (dansDeposit.isUpdate()) {
return Optional.empty(); // See for implementation CIT025B in DatasetUpdater
@@ -25,8 +25,10 @@
import nl.knaw.dans.ingest.core.domain.Deposit;
import nl.knaw.dans.ingest.core.domain.FileInfo;
import nl.knaw.dans.ingest.core.service.XPathEvaluator;
import nl.knaw.dans.lib.dataverse.DataverseException;
import nl.knaw.dans.lib.dataverse.model.file.FileMeta;

import java.io.IOException;
import java.nio.file.Path;
import java.text.SimpleDateFormat;
import java.time.Instant;
@@ -35,7 +37,6 @@
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.format.DateTimeParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
@@ -55,19 +56,20 @@ public class EditFilesComposer {
private static final SimpleDateFormat yyyymmddFormat = new SimpleDateFormat("yyyy-MM-dd");

public EditFiles composeEditFiles() {
var editFiles = new EditFiles();
var pathFileInfoMap = getFileInfo(dansDeposit);

// TODO: in update also ignore any files that have not changed (content or metadata)
var renamedFiles = getAutoRenameMap(pathFileInfoMap);
init(renamedFiles);
var ignoredFiles = getFilesToIgnore(pathFileInfoMap);

var editFiles = new EditFiles();
editFiles.setIgnoreFiles(ignoredFiles);
pathFileInfoMap = removeIgnoredFiles(pathFileInfoMap, ignoredFiles);

editFiles.setAutoRenameFiles(getAutoRenamedFiles(pathFileInfoMap));
editFiles.setAutoRenameFiles(getAutoRenamedFiles(renamedFiles));
editFiles.setAddRestrictedFiles(getRestrictedFilesToAdd(pathFileInfoMap));
editFiles.setUpdateFileMetas(getUpdatedFileMetas(pathFileInfoMap));
editFiles.setDeleteFiles(getDeleteFiles(pathFileInfoMap));
editFiles.setMoveFiles(getFileMovements());
editFiles.setMoveFiles(getFileMovements(pathFileInfoMap));

var dateAvailable = getDateAvailable(dansDeposit);
var filePathsToEmbargo = getEmbargoedFiles(pathFileInfoMap, dateAvailable);
@@ -80,13 +82,17 @@ public EditFiles composeEditFiles() {
return editFiles;
}

protected void init(Map<String, String> renamedFiles) {
// do nothing
}


/**
* Get the files that should not be processed by the ingest service.
*
* @param files the file infos found in files.xml
* @return a list of file paths that should be ignored
*/
// TODO: add parameter for current FileMetas (for update)
protected List<String> getFilesToIgnore(Map<Path, FileInfo> files) {
if (fileExclusionPattern == null) {
return List.of();
@@ -135,9 +141,10 @@ protected List<String> getDeleteFiles(Map<Path, FileInfo> files) {
/**
* Get the files that should be moved.
*
* @param files the file infos found in files.xml
* @return a list of FromTo objects that specify the files to move and their new location
*/
protected List<FromTo> getFileMovements() {
protected List<FromTo> getFileMovements(Map<Path, FileInfo> files) {
return List.of();
}

@@ -179,16 +186,17 @@ private Map<Path, FileInfo> removeIgnoredFiles(Map<Path, FileInfo> files, List<S
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
}

private List<FromTo> getAutoRenamedFiles(Map<Path, FileInfo> files) {
ArrayList<FromTo> fromTos = new ArrayList<>();
for (var entry : files.entrySet()) {
if (entry.getValue().isSanitized()) {
var from = entry.getKey().toString();
var to = new DataversePath(entry.getValue().getMetadata().getDirectoryLabel(), entry.getValue().getMetadata().getLabel()).toString();
fromTos.add(new FromTo(from, to));
}
}
return fromTos;
private List<FromTo> getAutoRenamedFiles(Map<String, String> renamedFiles) {
return renamedFiles.entrySet().stream()
.map(entry -> new FromTo(entry.getKey(), entry.getValue()))
.toList();
}

protected Map<String, String> getAutoRenameMap(Map<Path, FileInfo> files) {
return files.entrySet().stream()
.filter(entry -> entry.getValue().isSanitized())
.collect(Collectors.toMap(entry -> entry.getKey().toString(),
entry -> new DataversePath(entry.getValue().getMetadata().getDirectoryLabel(), entry.getValue().getMetadata().getLabel()).toString()));
}

// TODO: move to mapping package
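
The refactor above splits auto-renaming into two steps: getAutoRenameMap builds an original-path-to-sanitized-path map once (also handed to subclasses via the new init hook), and getAutoRenamedFiles merely converts that map into FromTo entries. A minimal, self-contained sketch of the same idea, using a simplified stand-in record instead of the project's FileInfo and DataversePath types:

import java.util.Map;
import java.util.stream.Collectors;

public class AutoRenameSketch {
    // Simplified stand-in for FileInfo: a sanitized flag plus the sanitized target path.
    record Info(boolean sanitized, String sanitizedPath) {
    }

    // Build the original-path -> sanitized-path map once, as getAutoRenameMap does.
    static Map<String, String> autoRenameMap(Map<String, Info> files) {
        return files.entrySet().stream()
            .filter(e -> e.getValue().sanitized())
            .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().sanitizedPath()));
    }

    public static void main(String[] args) {
        var files = Map.of(
            "data/weird:name.txt", new Info(true, "data/weird_name.txt"),
            "data/ok.txt", new Info(false, "data/ok.txt"));
        // Only the sanitized entry ends up in the rename map.
        System.out.println(autoRenameMap(files)); // {data/weird:name.txt=data/weird_name.txt}
    }
}
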
@@ -15,34 +15,109 @@
*/
package nl.knaw.dans.dvingest.core.dansbag;

import lombok.extern.slf4j.Slf4j;
import nl.knaw.dans.dvingest.core.bagprocessor.FilesInDatasetCache;
import nl.knaw.dans.dvingest.core.service.DataverseService;
import nl.knaw.dans.dvingest.core.yaml.FromTo;
import nl.knaw.dans.ingest.core.domain.Deposit;
import nl.knaw.dans.ingest.core.domain.FileInfo;
import nl.knaw.dans.lib.dataverse.DataverseException;
import nl.knaw.dans.lib.dataverse.model.file.FileMeta;

import java.io.IOException;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
* Overrides the methods in EditFilesComposer to handle the case of an update to an existing dataset.
*/
@Slf4j
public class EditFilesComposerForUpdate extends EditFilesComposer {
private final String updatesDatasetPid;
private final DataverseService dataverseService;
// TODO: this should be a read-only variant of the cache
private FilesInDatasetCache filesInDatasetCache;

public EditFilesComposerForUpdate(Deposit dansDeposit, Pattern fileExclusionPattern, List<String> embargoExclusions, DataverseService dataverseService) {
public EditFilesComposerForUpdate(Deposit dansDeposit, String updatesDatasetPid, Pattern fileExclusionPattern, List<String> embargoExclusions, DataverseService dataverseService) {
super(dansDeposit, fileExclusionPattern, embargoExclusions);
this.updatesDatasetPid = updatesDatasetPid;
this.dataverseService = dataverseService;
}

protected void init(Map<String, String> renamedFiles) {
filesInDatasetCache = new FilesInDatasetCache(dataverseService, renamedFiles);
try {
filesInDatasetCache.downloadFromDataset(updatesDatasetPid);
}
catch (IOException | DataverseException e) {
log.error("Could not download files from dataset with pid {}", updatesDatasetPid, e);
}
}

@Override
protected List<String> getFilesToIgnore(Map<Path, FileInfo> files) {
super.getFilesToIgnore(files);

return List.of();
}

@Override
protected List<FromTo> getFileMovements(Map<Path, FileInfo> files) {
// Convert from Path to String
var filesInDataset = filesInDatasetCache.getFilesInDataset().entrySet().stream()
.collect(Collectors.toMap(e -> Path.of(e.getKey()), Map.Entry::getValue));
var oldToNewPath = getOldToNewPathOfFilesToMove(filesInDataset, files);
return oldToNewPath.entrySet().stream()
.map(e -> new FromTo(e.getKey().toString(), e.getValue().toString()))
.collect(Collectors.toList());
}

/**
* Creates a mapping for moving files to a new location. To determine this, the file needs to be unique in the old and the new version, because its checksum is used to locate it. Files that occur
* multiple times in either the old or the new version cannot be moved in this way. They will appear to have been deleted in the old version and added in the new. This has the same net result,
* except that the "Changes" overview in Dataverse does not record that the file was effectively moved.
*
* @param pathToFileMetaInLatestVersion map from path to file metadata in the old version
* @param pathToFileInfo map from path to file info in the new version (i.e. the deposit).
* @return map from old path to new path for files that can be moved.
*/
private Map<Path, Path> getOldToNewPathOfFilesToMove(Map<Path, FileMeta> pathToFileMetaInLatestVersion, Map<Path, FileInfo> pathToFileInfo) {

var depositChecksums = pathToFileInfo.entrySet().stream()
.map(e -> Map.entry(e.getKey(), e.getValue().getChecksum()))
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

var latestFileChecksums = pathToFileMetaInLatestVersion.entrySet().stream()
.map(e -> Map.entry(e.getKey(), e.getValue().getDataFile().getChecksum().getValue()))
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

var checksumsToPathNonDuplicatedFilesInDeposit = getChecksumsToPathOfNonDuplicateFiles(depositChecksums);
var checksumsToPathNonDuplicatedFilesInLatestVersion = getChecksumsToPathOfNonDuplicateFiles(latestFileChecksums);

var intersects = checksumsToPathNonDuplicatedFilesInDeposit.keySet().stream()
.filter(checksumsToPathNonDuplicatedFilesInLatestVersion::containsKey)
.collect(Collectors.toSet());

return intersects.stream()
.map(c -> Map.entry(checksumsToPathNonDuplicatedFilesInLatestVersion.get(c), checksumsToPathNonDuplicatedFilesInDeposit.get(c)))
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));

}

private Map<String, Path> getChecksumsToPathOfNonDuplicateFiles(Map<Path, String> pathToChecksum) {
// inverse map first
var inversed = pathToChecksum.entrySet().stream()
.collect(Collectors.groupingBy(Map.Entry::getValue, Collectors.mapping(Map.Entry::getKey, Collectors.toList())));

// filter out items with 0 or more than 1 item
return inversed.entrySet().stream()
.filter(item -> item.getValue().size() == 1)
.collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().get(0)));
}

@Override
protected List<String> getRestrictedFilesToAdd(Map<Path, FileInfo> files) {
return super.getRestrictedFilesToAdd(files);
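
The transplanted move detection reduces to three steps: index each dataset version by checksum, drop any checksum shared by more than one file, and intersect the two indexes. A self-contained sketch under simplified assumptions (String paths and checksums instead of Path, FileMeta and FileInfo; the final no-op filter is an extra guard, not visible in the hunk above):

import java.util.Map;
import java.util.stream.Collectors;

public class MoveDetectionSketch {
    // Keep only checksums that occur exactly once, inverted to checksum -> path,
    // mirroring getChecksumsToPathOfNonDuplicateFiles.
    static Map<String, String> uniqueByChecksum(Map<String, String> pathToChecksum) {
        var byChecksum = pathToChecksum.entrySet().stream()
            .collect(Collectors.groupingBy(Map.Entry::getValue,
                Collectors.mapping(Map.Entry::getKey, Collectors.toList())));
        return byChecksum.entrySet().stream()
            .filter(e -> e.getValue().size() == 1)
            .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().get(0)));
    }

    // Old path -> new path for files whose unique checksum appears in both versions.
    static Map<String, String> moves(Map<String, String> oldVersion, Map<String, String> deposit) {
        var oldUnique = uniqueByChecksum(oldVersion);
        var newUnique = uniqueByChecksum(deposit);
        return oldUnique.entrySet().stream()
            .filter(e -> newUnique.containsKey(e.getKey()))
            .map(e -> Map.entry(e.getValue(), newUnique.get(e.getKey())))
            .filter(e -> !e.getKey().equals(e.getValue())) // skip files that did not move
            .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    }

    public static void main(String[] args) {
        var oldVersion = Map.of("a/readme.txt", "c1", "b/dup.txt", "c2", "c/dup2.txt", "c2");
        var deposit = Map.of("docs/readme.txt", "c1", "b/dup.txt", "c2", "c/dup2.txt", "c2");
        // The duplicated checksum c2 is ignored; only the unique file is reported as moved.
        System.out.println(moves(oldVersion, deposit)); // {a/readme.txt=docs/readme.txt}
    }
}
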

This file was deleted.

This file was deleted.

@@ -403,7 +403,7 @@ private void validateFileMetas(Map<Path, FileMeta> pathToFileInfoInLatestVersion
}
}

// check if any of them have a checksum that is not SHA-1restrictedFilesPresent
// check if any of them have a checksum that is not SHA-1
for (var fileMeta : pathToFileInfoInLatestVersion.values()) {
var checksumType = fileMeta.getDataFile().getChecksum().getType();
log.trace("Filemeta checksum type for file {}: {}", fileMeta.getLabel(), checksumType);
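
The hunk above only fixes a comment typo, but the check it documents matters for the move detection: matching by checksum only works if every file in the latest version carries a SHA-1 checksum. A hedged sketch of such a validation loop, built from the two lines visible above (the exception type and message are assumptions, not shown in the diff):

import nl.knaw.dans.lib.dataverse.model.file.FileMeta;

import java.nio.file.Path;
import java.util.Map;

class ChecksumTypeCheckSketch {
    // Reject any file whose checksum is not SHA-1, so checksum-based matching stays reliable.
    static void validateChecksumTypes(Map<Path, FileMeta> pathToFileMetaInLatestVersion) {
        for (var fileMeta : pathToFileMetaInLatestVersion.values()) {
            var checksumType = fileMeta.getDataFile().getChecksum().getType();
            if (!"SHA-1".equals(checksumType)) {
                throw new IllegalArgumentException("File " + fileMeta.getLabel() + " does not have a SHA-1 checksum");
            }
        }
    }
}
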
