|
| 1 | +# SPDX-License-Identifier: BSD-3-Clause |
| 2 | +# Copyright (c) 2025 SciCat Project (https://github.com/SciCatProject/scitacean) |
| 3 | +"""File transfer that copies files between locations on the same filesystem.""" |
| 4 | + |
| 5 | +import os |
| 6 | +import shutil |
| 7 | +from collections.abc import Iterator |
| 8 | +from contextlib import contextmanager |
| 9 | +from datetime import datetime, timezone |
| 10 | +from pathlib import Path |
| 11 | + |
| 12 | +from ..dataset import Dataset |
| 13 | +from ..error import FileNotAccessibleError, FileUploadError |
| 14 | +from ..file import File |
| 15 | +from ..filesystem import RemotePath |
| 16 | +from ..logging import get_logger |
| 17 | +from ._util import source_folder_for |
| 18 | + |
| 19 | + |
class CopyDownloadConnection:
    """Connection for 'downloading' files by copying them.

    Should be created using
    :meth:`scitacean.transfer.copy.CopyFileTransfer.connect_for_download`.
    """

    def __init__(self, hard_link: bool) -> None:
        # When True, files are hard-linked into place instead of copied.
        self._hard_link = hard_link

    def download_files(self, *, remote: list[RemotePath], local: list[Path]) -> None:
        """Download files from the given remote paths.

        ``remote`` and ``local`` are paired element by element and must have
        the same length; a mismatch raises ``ValueError`` (``strict=True``).
        """
        for remote_file, local_file in zip(remote, local, strict=True):
            self.download_file(remote=remote_file, local=local_file)

    def download_file(self, *, remote: RemotePath, local: Path) -> None:
        """Download a file from the given remote path.

        Parameters
        ----------
        remote:
            Path of the file on the 'remote' filesystem.
            It is interpreted as a path on this machine's filesystem.
        local:
            Destination of the copy or hard link.

        Raises
        ------
        FileNotAccessibleError
            If ``remote`` does not exist on this machine's filesystem.
        """
        get_logger().info(
            "Copying file %s to %s",
            remote,
            local,
        )
        remote_path = Path(remote.posix)
        if not remote_path.exists():
            # This is a download, i.e. we copy *from* the remote file.
            raise FileNotAccessibleError(
                f"Unable to copy remote file {remote_path}: File does not exist. "
                "This might mean that your machine does not have direct filesystem "
                "access to the file server. Consider using a different file transfer.",
                remote_path=remote,
            )
        if self._hard_link:
            os.link(src=remote_path, dst=local)
        else:
            shutil.copy(src=remote_path, dst=local)
| 54 | + |
| 55 | + |
class CopyUploadConnection:
    """Connection for 'uploading' files by copying.

    Should be created using
    :meth:`scitacean.transfer.copy.CopyFileTransfer.connect_for_upload`.
    """

    def __init__(self, *, source_folder: RemotePath, hard_link: bool) -> None:
        self._source_folder = source_folder
        # When True, files are hard-linked into place instead of copied.
        self._hard_link = hard_link

    @property
    def source_folder(self) -> RemotePath:
        """The source folder this connection uploads to."""
        return self._source_folder

    def remote_path(self, filename: str | RemotePath) -> RemotePath:
        """Return the complete remote path for a given path."""
        return self.source_folder / filename

    def _make_source_folder(self) -> None:
        """Create the source folder, including missing parents.

        Raises
        ------
        FileUploadError
            If the folder cannot be created.
        """
        try:
            Path(self.source_folder.posix).mkdir(parents=True, exist_ok=True)
        except OSError as exc:
            raise FileUploadError(
                f"Failed to create source folder {self.source_folder}: {exc.args}"
            ) from None

    def upload_files(self, *files: File) -> list[File]:
        """Upload files to the remote folder.

        If any upload fails, the files that were already uploaded by this
        call are reverted and the original exception is re-raised.

        Returns
        -------
        :
            The uploaded files, in the order they were given.
        """
        self._make_source_folder()
        uploaded: list[File] = []
        try:
            # Append one result at a time (instead of ``extend`` with a generator)
            # so that ``uploaded`` is guaranteed to contain every completed
            # upload when a later one raises.
            for file in files:
                uploaded.append(self._upload_file(file))  # noqa: PERF401
        except Exception:
            self.revert_upload(*uploaded)
            raise
        return uploaded

    def _upload_file(self, file: File) -> File:
        """Copy or hard-link a single file and return its uploaded counterpart.

        Raises
        ------
        ValueError
            If ``file`` has no local path.
        """
        if file.local_path is None:
            raise ValueError(
                f"Cannot upload file to {file.remote_path}, the file has no local path"
            )
        remote_path = self.remote_path(file.remote_path)
        get_logger().info(
            "Copying file %s to %s",
            file.local_path,
            remote_path,
        )
        destination = Path(remote_path.posix)
        if self._hard_link:
            os.link(src=file.local_path, dst=destination)
        else:
            shutil.copy(src=file.local_path, dst=destination)
        # Describe the file that now exists on the 'remote' side.
        # A copy is owned by the current process, so its uid / gid can differ
        # from those of the local file. A hard link shares the same metadata.
        st = destination.stat()
        return file.uploaded(
            remote_gid=str(st.st_gid),
            remote_uid=str(st.st_uid),
            remote_creation_time=datetime.now(tz=timezone.utc),
            remote_perm=str(st.st_mode),
            remote_size=st.st_size,
        )

    def revert_upload(self, *files: File) -> None:
        """Remove uploaded files from the remote folder.

        This is a best-effort cleanup: filesystem errors are logged as
        warnings and not raised. The source folder is removed as well if it
        is empty afterwards.
        """
        for file in files:
            self._revert_upload_single(remote=file.remote_path, local=file.local_path)

        try:
            # The emptiness check is inside the ``try`` because listing the
            # folder can itself fail, e.g. if the folder no longer exists.
            if not _remote_folder_is_empty(self.source_folder):
                return
            get_logger().info(
                "Removing empty remote directory %s",
                self.source_folder,
            )
            Path(self.source_folder.posix).rmdir()
        except OSError as exc:
            get_logger().warning(
                "Failed to remove empty remote directory %s:\n%s",
                self.source_folder,
                exc,
            )

    def _revert_upload_single(self, *, remote: RemotePath, local: Path | None) -> None:
        """Delete a single remote file; log a warning instead of raising."""
        remote_path = self.remote_path(remote)
        get_logger().info(
            "Reverting upload of file %s to %s",
            local,
            remote_path,
        )

        try:
            Path(remote_path.posix).unlink(missing_ok=True)
        except OSError as exc:
            get_logger().warning("Error reverting file %s:\n%s", remote_path, exc)
| 151 | + |
| 152 | + |
class CopyFileTransfer:
    """Upload / download files by copying files on the same filesystem.

    This file transfer requires that the 'remote' file system is directly
    accessible from the 'local' file system.
    It copies the 'remote' files directly to the local download folder.

    Note
    ----
    A note on terminology:
    In Scitacean, 'remote' refers to the file server where the data files
    are stored that belong to SciCat datasets.
    In contrast, 'local' refers to the file system of the machine that
    runs the Python process.
    The two filesystems can be the same.
    However, Scitacean maintains a strict separation between the two and
    uses 'downloaders' and 'uploaders' to transfer between them even if that
    transfer is a simple copy.

    See also the documentation of :class:`scitacean.File`.

    Warning
    -------
    This file transfer does not work on Windows because it converts between
    :class:`RemotePath` and :class:`pathlib.Path`.
    This requires that both use the same directory separators.
    Since :class:`RemotePath` uses UNIX-style forward slashes, it is
    incompatible with Windows paths.
    In practice, this should not be a problem because SciCat file storage
    should never be a Windows server.

    Examples
    --------
    Given a dataset with ``source_folder="/dataset/source"`` and a file with path
    ``"file1.dat"``, this

    .. code-block:: python

        client = Client.from_token(
            url="...",
            token="...",
            file_transfer=CopyFileTransfer()
        )
        ds = client.get_dataset(pid="...")
        ds = client.download_files(ds, target="/downloads")

    copies the file from ``/dataset/source/file1.dat`` to ``/downloads/file1.dat``.
    """

    def __init__(
        self,
        *,
        source_folder: str | RemotePath | None = None,
        hard_link: bool = False,
    ) -> None:
        """Construct a new Copy file transfer.

        Warning
        -------
        When using hard links (with ``hard_link = True``), the downloaded
        or uploaded files will refer to the same bytes.
        So if one is modified, the other will be modified as well.
        Use this feature with care!

        Parameters
        ----------
        source_folder:
            Upload files to this folder if set.
            Otherwise, upload to the dataset's source_folder.
            Ignored when downloading files.
        hard_link:
            If True, try to use hard links instead of copies.
        """
        self._source_folder_pattern = (
            RemotePath(source_folder) if source_folder is not None else None
        )
        self._hard_link = hard_link

    def source_folder_for(self, dataset: Dataset) -> RemotePath:
        """Return the source folder used for the given dataset."""
        return source_folder_for(dataset, self._source_folder_pattern)

    @contextmanager
    def connect_for_download(
        self, dataset: Dataset, representative_file_path: RemotePath
    ) -> Iterator[CopyDownloadConnection]:
        """Create a connection for downloads, use as a context manager.

        Parameters
        ----------
        dataset:
            The dataset for which to download files.
        representative_file_path:
            A path to a file that can be used to check whether files for this
            dataset are accessible.
            The transfer assumes that, if this path is accessible,
            all files for this dataset are.

        Returns
        -------
        :
            A connection object that can download files.

        Raises
        ------
        FileNotAccessibleError
            If files for the given dataset cannot be accessed
            based on ``representative_file_path``.
        """
        source_folder = self.source_folder_for(dataset)
        if not Path(source_folder.posix).exists():
            raise FileNotAccessibleError(
                "Cannot directly access the source folder",
                remote_path=source_folder,
            )
        if not Path((source_folder / representative_file_path).posix).exists():
            raise FileNotAccessibleError(
                "Cannot directly access the file", remote_path=representative_file_path
            )
        yield CopyDownloadConnection(self._hard_link)

    @contextmanager
    def connect_for_upload(
        self, dataset: Dataset, representative_file_path: RemotePath
    ) -> Iterator[CopyUploadConnection]:
        """Create a connection for uploads, use as a context manager.

        Parameters
        ----------
        dataset:
            The connection will be used to upload files of this dataset.
            Used to determine the target folder.
        representative_file_path:
            A path on the remote to check whether files for this
            dataset can be written.
            Currently not used by this transfer; accessibility is judged
            from the dataset's source folder instead.

        Returns
        -------
        :
            An open :class:`CopyUploadConnection` object.

        Raises
        ------
        FileNotAccessibleError
            If the top-level directory of the source folder does not exist
            on this machine.
        """
        remote_folder = self.source_folder_for(dataset)
        ancestors = Path(remote_folder.posix).parents
        # This check may have a lot of false negatives.
        # But we cannot check whether the source folder exists because the user
        # may intend for the upload to create that folder.
        # Checking the top level parent after the root should still catch many
        # cases as long as the remote uses paths that are uncommon on user
        # machines.
        # E.g., for /ess/data/2025/... we get parents[-2] = /ess which should
        # not exist on non-ess machines.
        # Folders directly below the root (e.g. /data) have no such parent,
        # so the check is skipped for them instead of failing on the index.
        if len(ancestors) >= 2 and not ancestors[-2].exists():
            raise FileNotAccessibleError(
                "Cannot directly access the source folder",
                remote_path=remote_folder,
            )
        yield CopyUploadConnection(
            source_folder=remote_folder, hard_link=self._hard_link
        )
| 318 | + |
| 319 | + |
def _remote_folder_is_empty(path: RemotePath) -> bool:
    """Return True if the directory at ``path`` has no entries.

    Only the first directory entry is requested, so the cost does not grow
    with the number of files. Errors from listing the directory propagate.
    """
    nothing = object()
    first_entry = next(Path(path.posix).iterdir(), nothing)
    return first_entry is nothing
| 326 | + |
| 327 | + |
| 328 | +__all__ = ["CopyDownloadConnection", "CopyFileTransfer", "CopyUploadConnection"] |
0 commit comments