From 2a8eb192abed187584ba35f86de8684cfc7c51a8 Mon Sep 17 00:00:00 2001
From: LTLA
Date: Thu, 31 Oct 2024 13:05:23 -0700
Subject: [PATCH] Improved synchronization of cache with remote in retrieve*
 functions.

- Delete files that are no longer present in the remote directory.

- Always perform HEAD requests to check for updates to individual files.
  This allows us to simplify the code by removing the updateDelay for
  files, which was incorrect anyway as the delay was being computed from
  the file's modification time rather than its last HEAD check.

- Wrap the HEAD in a try() to ensure we can still use the cache if the
  request fails, e.g., due to lack of connectivity.
---
 DESCRIPTION                    |  4 +--
 R/listFiles.R                  |  4 ++-
 R/retrieveDirectory.R          | 51 +++++++++++++++++++++-------------
 R/retrieveFile.R               |  8 ++++--
 man/retrieveDirectory.Rd       | 13 +++++++--
 man/retrieveFile.Rd            | 17 ++++--------
 tests/testthat/test-retrieve.R | 31 ++++++++++++++++++++-
 7 files changed, 88 insertions(+), 40 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 50b118f..b0115ea 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: SewerRat
-Version: 0.2.11
-Date: 2024-10-27
+Version: 0.2.12
+Date: 2024-10-31
 Title: Client for the SewerRat API
 Description: Search metadata files across a shared filesystem
     via the SewerRat API.
diff --git a/R/listFiles.R b/R/listFiles.R
index a015de8..79bdc73 100644
--- a/R/listFiles.R
+++ b/R/listFiles.R
@@ -38,7 +38,7 @@
 #' @import httr2
 listFiles <- function(path, url, forceRemote=FALSE) {
     if (!forceRemote && file.exists(path)) {
-        list.files(path, recursive=TRUE, all.files=TRUE)
+        .quick_list(path)
     } else {
         req <- request(paste0(url, "/list?path=", URLencode(path, reserved=TRUE), "&recursive=true"))
         req <- handle_error(req)
@@ -46,3 +46,5 @@ listFiles <- function(path, url, forceRemote=FALSE) {
         unlist(resp_body_json(res))
     }
 }
+
+.quick_list <- function(path) list.files(path, recursive=TRUE, all.files=TRUE)
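For illustration, the cache-pruning step added to retrieveDirectory() below boils down to the following standalone sketch. This is not the package's code: prune_stale(), cache_dir, and remote_listing are hypothetical stand-ins for the local cache of the registered directory and the relative paths reported by the SewerRat /list endpoint.

    # Sketch of the pruning step: remove cached files that the remote no
    # longer reports. 'remote_listing' is assumed to hold relative paths
    # as returned by the /list endpoint.
    prune_stale <- function(cache_dir, remote_listing) {
        cached <- list.files(cache_dir, recursive=TRUE, all.files=TRUE)
        stale <- setdiff(cached, unlist(remote_listing)) # deleted on the remote.
        unlink(file.path(cache_dir, stale))
        invisible(stale)
    }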
diff --git a/R/retrieveDirectory.R b/R/retrieveDirectory.R
index 7714ae3..eb9bf8c 100644
--- a/R/retrieveDirectory.R
+++ b/R/retrieveDirectory.R
@@ -15,14 +15,23 @@
 #' Only used for remote access.
 #' @param concurrent Integer scalar specifying the number of concurrent downloads.
 #' Only used for remote access.
-#' @param updateDelay Integer scalar specifying the maximum age of a cached file, in seconds.
-#' Older files will be automatically checked for updates.
+#' @param updateDelay Integer scalar specifying the interval before checking for updates in a cached directory, in seconds.
 #' Only used for remote access.
 #'
 #' @return Path to the subdirectory on the caller's filesystem.
 #' This is either a path to the registered (sub)directory if it is accessible,
 #' or a path to a local cache of the directory's contents otherwise.
 #'
+#' @details
+#' During remote access, this function exhibits the following behavior:
+#' \itemize{
+#' \item It will only check for updates to the directory contents after \code{updateDelay} seconds have passed since the previous check.
+#' This avoids unnecessarily frequent requests to the SewerRat API.
+#' \item If a file in \code{path} has already been locally cached, \code{retrieveDirectory} will automatically check the SewerRat API for updates.
+#' Any updates on the remote will cause the new file to be re-downloaded to the cache.
+#' \item Any cached files that are no longer in the remote \code{path} will be deleted from the cache.
+#' }
+#'
 #' @author Aaron Lun
 #'
 #' @examples
@@ -45,7 +54,7 @@
 #'
 #' @export
 #' @import httr2
-retrieveDirectory <- function(path, url, cache=NULL, forceRemote=FALSE, overwrite=FALSE, concurrent=1, updateDelay=3600) { 
+retrieveDirectory <- function(path, url, cache=NULL, forceRemote=FALSE, overwrite=FALSE, concurrent=1, updateDelay=3600) {
     if (!forceRemote && file.exists(path)) {
         return(path)
     }
@@ -67,12 +76,16 @@ retrieveDirectory <- function(path, url, cache=NULL, forceRemote=FALSE, overwrit
     res <- req_perform(req)
     listing <- resp_body_json(res)
 
+    # Removing files that no longer exist.
+    existing <- .quick_list(final)
+    unlink(file.path(final, setdiff(existing, listing)))
+
     if (concurrent == 1L) {
-        lapply(listing, acquire_file, cache=cache, path=path, url=url, overwrite=overwrite, updateDelay=updateDelay)
+        lapply(listing, acquire_file, cache=cache, path=path, url=url, overwrite=overwrite)
     } else {
         cl <- parallel::makeCluster(concurrent)
         on.exit(parallel::stopCluster(cl), add=TRUE, after=FALSE)
-        parallel::parLapply(cl, listing, acquire_file, cache=cache, path=path, url=url, overwrite=overwrite, updateDelay=updateDelay)
+        parallel::parLapply(cl, listing, acquire_file, cache=cache, path=path, url=url, overwrite=overwrite)
     }
 
     # We use a directory-level OK file to avoid having to scan through all
@@ -82,24 +95,22 @@
     final
 }
 
-#' @importFrom utils URLencode
-full_file_url <- function(url, path) {
-    paste0(url, "/retrieve/file?path=", URLencode(path, reserved=TRUE))
-}
-
 #' @import httr2
-acquire_file_raw <- function(cache, path, url, overwrite, updateDelay) {
+#' @importFrom utils URLencode
+acquire_file_raw <- function(cache, path, url, overwrite) {
     target <- file.path(cache, "LOCAL", path)
+    url <- paste0(url, "/retrieve/file?path=", URLencode(path, reserved=TRUE))
 
     if (!file.exists(target)) {
        overwrite <- TRUE
     } else if (!overwrite) {
-        last_mod <- file.info(target)$mtime
-        if (last_mod + updateDelay < Sys.time()) { # only check older files for updates, to avoid excessive queries.
-            req <- request(full_file_url(url, path))
-            req <- req_method(req, "HEAD")
-            req <- handle_error(req)
-            res <- req_perform(req)
+        req <- request(url)
+        req <- req_method(req, "HEAD")
+        req <- handle_error(req)
+        res <- try(req_perform(req), silent=TRUE)
+
+        if (!is(res, "try-error")) { # don't fail if the HEAD didn't work, e.g., no internet but we already have a cached file.
+            last_mod <- file.info(target)$mtime
             remote_mod <- parse_remote_last_modified(res)
             if (!is.null(remote_mod) && remote_mod > last_mod) {
                 overwrite <- TRUE
@@ -113,7 +124,7 @@ acquire_file_raw <- function(cache, path, url, overwrite, updateDelay) {
         tempf <- tempfile(tmpdir=tempdir)
         on.exit(unlink(tempf), add=TRUE, after=FALSE)
 
-        download_file(full_file_url(url, path), tempf)
+        download_file(url, tempf)
         dir.create(dirname(target), recursive=TRUE, showWarnings=FALSE)
         file.rename(tempf, target) # this should be more or less atomic, so no need for locks.
     }
@@ -121,8 +132,8 @@ acquire_file_raw <- function(cache, path, url, overwrite, updateDelay) {
     target
 }
 
-acquire_file <- function(cache, path, name, url, overwrite, updateDelay) {
-    acquire_file_raw(cache, paste0(path, "/", name), url, overwrite, updateDelay)
+acquire_file <- function(cache, path, name, url, overwrite) {
+    acquire_file_raw(cache, paste0(path, "/", name), url, overwrite)
 }
 
 #' @importFrom utils URLencode
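The per-file freshness check above now always issues a HEAD request but tolerates failure. A minimal sketch of that pattern, using plain httr2 without the package's handle_error() and parse_remote_last_modified() helpers; needs_refresh() is a hypothetical name and the Last-Modified parsing is a simplified assumption (English locale, GMT timestamps):

    library(httr2)

    # Returns TRUE if the cached copy at 'target' is older than the remote
    # copy at 'file_url'. A failed HEAD (e.g., no connectivity) returns
    # FALSE so that the cached copy remains usable.
    needs_refresh <- function(target, file_url) {
        req <- req_method(request(file_url), "HEAD")
        res <- try(req_perform(req), silent=TRUE)
        if (inherits(res, "try-error")) {
            return(FALSE) # fall back to the cache when the request fails.
        }
        remote_mod <- resp_header(res, "last-modified")
        if (is.null(remote_mod)) {
            return(FALSE)
        }
        remote_mod <- as.POSIXct(remote_mod, format="%a, %d %b %Y %H:%M:%S", tz="GMT")
        !is.na(remote_mod) && remote_mod > file.info(target)$mtime
    }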
diff --git a/R/retrieveFile.R b/R/retrieveFile.R
index 9e046aa..4f44568 100644
--- a/R/retrieveFile.R
+++ b/R/retrieveFile.R
@@ -10,6 +10,10 @@
 #' @inheritParams retrieveDirectory
 #'
 #' @author Aaron Lun
+#'
+#' @details
+#' During remote access, if a file in \code{path} has already been locally cached, \code{retrieveFile} will automatically check the SewerRat API for updates.
+#' Any updates on the remote will cause the new file to be re-downloaded to the cache.
 #'
 #' @return String containing the path to the file on the caller's filesystem.
 #'
@@ -32,11 +36,11 @@
 #' retrieveFile(paste0(mydir, "/diet/bar"), url=info$url, forceRemote=TRUE)
 #'
 #' @export
-retrieveFile <- function(path, url, cache=NULL, forceRemote=FALSE, overwrite=FALSE, updateDelay=3600) {
+retrieveFile <- function(path, url, cache=NULL, forceRemote=FALSE, overwrite=FALSE) {
     if (!forceRemote && file.exists(path)) {
         path
     } else {
         cache <- local_root(cache, url)
-        acquire_file_raw(cache, path, url=url, overwrite=overwrite, updateDelay=updateDelay)
+        acquire_file_raw(cache, path, url=url, overwrite=overwrite)
     }
 }
diff --git a/man/retrieveDirectory.Rd b/man/retrieveDirectory.Rd
index 9b06dc8..8be4555 100644
--- a/man/retrieveDirectory.Rd
+++ b/man/retrieveDirectory.Rd
@@ -33,8 +33,7 @@ Only used for remote access.}
 \item{concurrent}{Integer scalar specifying the number of concurrent downloads.
 Only used for remote access.}
 
-\item{updateDelay}{Integer scalar specifying the maximum age of a cached file, in seconds.
-Older files will be automatically checked for updates.
+\item{updateDelay}{Integer scalar specifying the interval before checking for updates in a cached directory, in seconds.
 Only used for remote access.}
 }
 \value{
@@ -46,6 +45,16 @@ or a path to a local cache of the directory's contents otherwise.
 Retrieve the path to a registered directory (or its subdirectories),
 possibly creating a local copy of the directory's contents if the caller is not on the same filesystem.
 }
+\details{
+During remote access, this function exhibits the following behavior:
+\itemize{
+\item It will only check for updates to the directory contents after \code{updateDelay} seconds have passed since the previous check.
+This avoids unnecessarily frequent requests to the SewerRat API.
+\item If a file in \code{path} has already been locally cached, \code{retrieveDirectory} will automatically check the SewerRat API for updates.
+Any updates on the remote will cause the new file to be re-downloaded to the cache.
+\item Any cached files that are no longer in the remote \code{path} will be deleted from the cache.
+}
+}
 \examples{
 info <- startSewerRat()
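As a usage sketch of the documented behavior, mirroring the new unit test further below; mydir and info are assumed to come from the usual startSewerRat()/register() setup in the examples:

    cache <- tempfile()
    # First call populates the cache from the remote.
    dir <- retrieveDirectory(mydir, url=info$url, cache=cache, forceRemote=TRUE)
    # updateDelay=0 forces an immediate re-listing of the remote, so modified
    # files are re-downloaded and files deleted on the remote are pruned.
    dir <- retrieveDirectory(mydir, url=info$url, cache=cache, forceRemote=TRUE, updateDelay=0)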
diff --git a/man/retrieveFile.Rd b/man/retrieveFile.Rd
index 4089cb7..368c253 100644
--- a/man/retrieveFile.Rd
+++ b/man/retrieveFile.Rd
@@ -4,14 +4,7 @@
 \alias{retrieveFile}
 \title{Retrieve a single file}
 \usage{
-retrieveFile(
-  path,
-  url,
-  cache = NULL,
-  forceRemote = FALSE,
-  overwrite = FALSE,
-  updateDelay = 3600
-)
+retrieveFile(path, url, cache = NULL, forceRemote = FALSE, overwrite = FALSE)
 }
 \arguments{
 \item{path}{String containing the absolute path to a file in a registered directory.}
@@ -28,10 +21,6 @@ even if \code{path} is on the same filesystem as the caller.}
 
 \item{overwrite}{Logical scalar indicating whether to overwrite the existing cache.
 Only used for remote access.}
-
-\item{updateDelay}{Integer scalar specifying the maximum age of a cached file, in seconds.
-Older files will be automatically checked for updates.
-Only used for remote access.}
 }
 \value{
 String containing the path to the file on the caller's filesystem.
@@ -40,6 +29,10 @@ String containing the path to the file on the caller's filesystem.
 }
 \description{
 Retrieve the path to a single file in a registered directory.
 This will call the REST API to obtain and cache a copy of the file if the caller is not on the same filesystem.
 }
+\details{
+During remote access, if a file in \code{path} has already been locally cached, \code{retrieveFile} will automatically check the SewerRat API for updates.
+Any updates on the remote will cause the new file to be re-downloaded to the cache.
+}
 \examples{
 info <- startSewerRat()
diff --git a/tests/testthat/test-retrieve.R b/tests/testthat/test-retrieve.R
index b237183..73e6a18 100644
--- a/tests/testthat/test-retrieve.R
+++ b/tests/testthat/test-retrieve.R
@@ -2,7 +2,7 @@
 # library(testthat); library(SewerRat); source("test-retrieve.R")
 
 info <- startSewerRat()
- 
+
 mydir <- tempfile()
 dir.create(mydir)
 write(file=file.path(mydir, "metadata.json"), '{ "first": "Aaron", "last": "Lun" }')
@@ -80,4 +80,33 @@ test_that("retrieveDirectory works as expected", {
     expect_identical(jsonlite::fromJSON(file.path(rdir2, "diet", "metadata.json"))$meal, "lunch")
 })
 
+test_that("retrieveDirectory works with remote updates", {
+    mydir2 <- tempfile()
+    dir.create(mydir2)
+    write(file=file.path(mydir2, "metadata.json"), '{ "first": "Kanon", "last": "Shibuya" }')
+    dir.create(file.path(mydir2, "2"))
+    write(file=file.path(mydir2, "2", "metadata.json"), '{ "first": "Kinako", "last": "Sakurakouji" }')
+    dir.create(file.path(mydir2, "3"))
+    write(file=file.path(mydir2, "3", "metadata.json"), '{ "first": "Margarete", "last": "Wien" }')
+
+    register(mydir2, "metadata.json", url=info$url)
+    on.exit(deregister(mydir2, url=info$url))
+
+    cache <- tempfile()
+    dir <- retrieveDirectory(mydir2, url=info$url, cache=cache, forceRemote=TRUE)
+    expect_identical(jsonlite::fromJSON(file.path(dir, "2", "metadata.json"))$first, "Kinako")
+    expect_true(file.exists(file.path(dir, "3", "metadata.json")))
+
+    # Checking if it responds correctly to remote updates.
+    Sys.sleep(1.5)
+    unlink(file.path(mydir2, "3"), recursive=TRUE)
+    write(file=file.path(mydir2, "2", "metadata.json"), '{ "first": "Mei", "last": "Yoneme" }')
+    Sys.sleep(1.5)
+
+    dir2 <- retrieveDirectory(mydir2, url=info$url, cache=cache, forceRemote=TRUE, updateDelay=0)
+    expect_identical(dir, dir2)
+    expect_identical(jsonlite::fromJSON(file.path(dir, "2", "metadata.json"))$first, "Mei")
+    expect_false(file.exists(file.path(dir, "3", "metadata.json")))
+})
+
 deregister(mydir, url=info$url)
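Finally, a sketch of the connectivity fallback enabled by the try() wrapper. This is illustrative only, assuming the same setup as in the examples and that the file was cached before the network went away:

    f <- retrieveFile(paste0(mydir, "/metadata.json"), url=info$url, forceRemote=TRUE)
    # If a later call cannot reach the SewerRat API, the failed HEAD request
    # is swallowed and the previously cached path is returned unchanged.
    f2 <- retrieveFile(paste0(mydir, "/metadata.json"), url=info$url, forceRemote=TRUE)
    stopifnot(identical(f, f2))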