Skip to content

Commit 1758732

Browse files
authored
Merge pull request #277 from SciCatProject/copy-file-transfer
Add CopyFileTransfer
2 parents 6c27a48 + bd95168 commit 1758732

File tree

5 files changed

+681
-57
lines changed

5 files changed

+681
-57
lines changed

docs/reference/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ File transfer
2828
:template: scitacean-class-template.rst
2929
:recursive:
3030

31+
transfer.copy.CopyFileTransfer
3132
transfer.link.LinkFileTransfer
3233
transfer.select.SelectFileTransfer
3334
transfer.sftp.SFTPFileTransfer

src/scitacean/transfer/copy.py

Lines changed: 328 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,328 @@
1+
# SPDX-License-Identifier: BSD-3-Clause
2+
# Copyright (c) 2025 SciCat Project (https://github.com/SciCatProject/scitacean)
3+
"""File transfer that copies files between locations on the same filesystem."""
4+
5+
import os
6+
import shutil
7+
from collections.abc import Iterator
8+
from contextlib import contextmanager
9+
from datetime import datetime, timezone
10+
from pathlib import Path
11+
12+
from ..dataset import Dataset
13+
from ..error import FileNotAccessibleError, FileUploadError
14+
from ..file import File
15+
from ..filesystem import RemotePath
16+
from ..logging import get_logger
17+
from ._util import source_folder_for
18+
19+
20+
class CopyDownloadConnection:
21+
"""Connection for 'downloading' files by copying them.
22+
23+
Should be created using
24+
:meth:`scitacean.transfer.copy.CopyFileTransfer.connect_for_download`.
25+
"""
26+
27+
def __init__(self, hard_link: bool) -> None:
28+
self._hard_link = hard_link
29+
30+
def download_files(self, *, remote: list[RemotePath], local: list[Path]) -> None:
31+
"""Download files from the given remote path."""
32+
for r, l in zip(remote, local, strict=True):
33+
self.download_file(remote=r, local=l)
34+
35+
def download_file(self, *, remote: RemotePath, local: Path) -> None:
36+
"""Download a file from the given remote path."""
37+
get_logger().info(
38+
"Copying file %s to %s",
39+
remote,
40+
local,
41+
)
42+
remote_path = Path(remote.posix)
43+
if not remote_path.exists():
44+
raise FileNotAccessibleError(
45+
f"Unable to copy to remote file {remote_path}: File does not exist. "
46+
"This might mean that your machine does not have direct filesystem "
47+
"access to the file server. Consider using a different file transfer.",
48+
remote_path=remote,
49+
)
50+
if self._hard_link:
51+
os.link(src=remote_path, dst=local)
52+
else:
53+
shutil.copy(src=remote_path, dst=local)
54+
55+
56+
class CopyUploadConnection:
    """Connection for 'uploading' files by copying.

    Should be created using
    :meth:`scitacean.transfer.copy.CopyFileTransfer.connect_for_upload`.
    """

    def __init__(self, *, source_folder: RemotePath, hard_link: bool) -> None:
        """Initialize the connection.

        Parameters
        ----------
        source_folder:
            Folder on the remote that files are uploaded to.
        hard_link:
            If True, create hard links to the local files instead of
            copying their contents.
        """
        self._source_folder = source_folder
        self._hard_link = hard_link

    @property
    def source_folder(self) -> RemotePath:
        """The source folder this connection uploads to."""
        return self._source_folder

    def remote_path(self, filename: str | RemotePath) -> RemotePath:
        """Return the complete remote path for a given path."""
        return self.source_folder / filename

    def _make_source_folder(self) -> None:
        """Create the source folder and any missing parents.

        Raises
        ------
        FileUploadError
            If the folder cannot be created.
        """
        try:
            Path(self.source_folder.posix).mkdir(parents=True, exist_ok=True)
        except OSError as exc:
            raise FileUploadError(
                f"Failed to create source folder {self.source_folder}: {exc.args}"
            ) from None

    def upload_files(self, *files: File) -> list[File]:
        """Upload files to the remote folder.

        If any upload fails, the files uploaded so far by this call are
        reverted and the exception is re-raised.

        Parameters
        ----------
        files:
            Files to upload. Each must have a local path.

        Returns
        -------
        :
            The uploaded files with their remote metadata filled in.
        """
        self._make_source_folder()
        uploaded: list[File] = []
        try:
            # Append one by one instead of ``extend(generator)`` so that the
            # list of already uploaded files is well defined when an upload
            # raises part way through.
            for file in files:
                uploaded.append(self._upload_file(file))
        except Exception:
            self.revert_upload(*uploaded)
            raise
        return uploaded

    def _upload_file(self, file: File) -> File:
        """Copy or hard-link a single file into the source folder.

        Raises
        ------
        ValueError
            If ``file`` has no local path.
        """
        if file.local_path is None:
            raise ValueError(
                f"Cannot upload file to {file.remote_path}, the file has no local path"
            )
        remote_path = self.remote_path(file.remote_path)
        get_logger().info(
            "Copying file %s to %s",
            file.local_path,
            remote_path,
        )
        destination = Path(remote_path.posix)
        if self._hard_link:
            os.link(src=file.local_path, dst=destination)
        else:
            shutil.copy(src=file.local_path, dst=destination)
        # Report the metadata of the file that now exists on the remote.
        # A copy is owned by the copying process, so its uid / gid can differ
        # from those of the local file.
        st = destination.stat()
        return file.uploaded(
            remote_gid=str(st.st_gid),
            remote_uid=str(st.st_uid),
            remote_creation_time=datetime.now(tz=timezone.utc),
            remote_perm=str(st.st_mode),
            remote_size=st.st_size,
        )

    def revert_upload(self, *files: File) -> None:
        """Remove uploaded files from the remote folder.

        Also removes the source folder if it is empty afterwards.
        Failures to remove a file or the folder are logged as warnings
        and do not raise.
        """
        for file in files:
            self._revert_upload_single(remote=file.remote_path, local=file.local_path)

        if _remote_folder_is_empty(self.source_folder):
            try:
                get_logger().info(
                    "Removing empty remote directory %s",
                    self.source_folder,
                )
                Path(self.source_folder.posix).rmdir()
            except OSError as exc:
                get_logger().warning(
                    "Failed to remove empty remote directory %s:\n%s",
                    self.source_folder,
                    exc,
                )

    def _revert_upload_single(self, *, remote: RemotePath, local: Path | None) -> None:
        """Delete a single uploaded file; ``local`` is only used for logging."""
        remote_path = self.remote_path(remote)
        get_logger().info(
            "Reverting upload of file %s to %s",
            local,
            remote_path,
        )

        try:
            Path(remote_path.posix).unlink(missing_ok=True)
        except OSError as exc:
            # Best effort: keep reverting the remaining files.
            get_logger().warning("Error reverting file %s:\n%s", remote_path, exc)
151+
152+
153+
class CopyFileTransfer:
154+
"""Upload / download files by copying files on the same filesystem.
155+
156+
This file transfer requires that the 'remote' file system is directly
157+
accessible from the 'local' file system.
158+
It copies the 'remote' files directly to the local download folder.
159+
160+
Note
161+
----
162+
A note on terminology:
163+
In Scitacean, 'remote' refers to the file server where the data files
164+
are stored that belong to SciCat datasets.
165+
In contrast, 'local' refers to the file system of the machine that
166+
runs the Python process.
167+
The two filesystems can be the same.
168+
However, Scitacean maintains a strict separation between the two and
169+
uses 'downloaders' and 'uploaders' to transfer between them even if that
170+
transfer is a simple copy.
171+
172+
See also the documentation of :class:`scitacean.File`.
173+
174+
Warning
175+
-------
176+
This file transfer does not work on Windows because it converts between
177+
:class:`RemotePath` and :class:`pathlib.Path`.
178+
This requires that both use the same directory separators.
179+
Since :class:`RemotePath` uses UNIX-style forward slashes, it is
180+
incompatible with Windows paths.
181+
In practice, this should not be a problem because SciCat file storage
182+
should never be a Windows server.
183+
184+
Examples
185+
--------
186+
Given a dataset with ``source_folder="/dataset/source"`` and a file with path
187+
``"file1.dat"``, this
188+
189+
.. code-block:: python
190+
191+
client = Client.from_token(
192+
url="...",
193+
token="...",
194+
file_transfer=CopyFileTransfer()
195+
)
196+
ds = client.get_dataset(pid="...")
197+
ds = client.download_files(ds, target="/downloads")
198+
199+
copies the file from ``/dataset/source/file1.dat`` to ``/downloads/file1.dat``.
200+
"""
201+
202+
def __init__(
203+
self,
204+
*,
205+
source_folder: str | RemotePath | None = None,
206+
hard_link: bool = False,
207+
) -> None:
208+
"""Construct a new Copy file transfer.
209+
210+
Warning
211+
-------
212+
When using hard links (with ``hard_link = True``), the downloaded
213+
or uploaded files will refer to the same bytes.
214+
So if one is modified, the other will be modified as well.
215+
Use this feature with care!
216+
217+
Parameters
218+
----------
219+
source_folder:
220+
Upload files to this folder if set.
221+
Otherwise, upload to the dataset's source_folder.
222+
Ignored when downloading files.
223+
hard_link:
224+
If True, try to use hard links instead of copies.
225+
"""
226+
self._source_folder_pattern = (
227+
RemotePath(source_folder) if source_folder is not None else None
228+
)
229+
self._hard_link = hard_link
230+
231+
def source_folder_for(self, dataset: Dataset) -> RemotePath:
232+
"""Return the source folder used for the given dataset."""
233+
return source_folder_for(dataset, self._source_folder_pattern)
234+
235+
@contextmanager
236+
def connect_for_download(
237+
self, dataset: Dataset, representative_file_path: RemotePath
238+
) -> Iterator[CopyDownloadConnection]:
239+
"""Create a connection for downloads, use as a context manager.
240+
241+
Parameters
242+
----------
243+
dataset:
244+
The dataset for which to download files.
245+
representative_file_path:
246+
A path to a file that can be used to check whether files for this
247+
dataset are accessible.
248+
The transfer assumes that, if this path is accessible,
249+
all files for this dataset are.
250+
251+
Returns
252+
-------
253+
:
254+
A connection object that can download files.
255+
256+
Raises
257+
------
258+
FileNotAccessibleError
259+
If files for the given dataset cannot be accessed
260+
based on ``representative_file_path``.
261+
"""
262+
source_folder = self.source_folder_for(dataset)
263+
if not Path(source_folder.posix).exists():
264+
raise FileNotAccessibleError(
265+
"Cannot directly access the source folder",
266+
remote_path=source_folder,
267+
)
268+
if not Path((source_folder / representative_file_path).posix).exists():
269+
raise FileNotAccessibleError(
270+
"Cannot directly access the file", remote_path=representative_file_path
271+
)
272+
yield CopyDownloadConnection(self._hard_link)
273+
274+
@contextmanager
275+
def connect_for_upload(
276+
self, dataset: Dataset, representative_file_path: RemotePath
277+
) -> Iterator[CopyUploadConnection]:
278+
"""Create a connection for uploads, use as a context manager.
279+
280+
Parameters
281+
----------
282+
dataset:
283+
The connection will be used to upload files of this dataset.
284+
Used to determine the target folder.
285+
representative_file_path:
286+
A path on the remote to check whether files for this
287+
dataset can be written.
288+
The transfer assumes that, if it is possible to write to this path,
289+
it is possible to write to the paths of all files to be uploaded.
290+
291+
Returns
292+
-------
293+
:
294+
An open :class:`CopyUploadConnection` object.
295+
296+
Raises
297+
------
298+
FileNotAccessibleError
299+
If the remote folder cannot be accessed
300+
based on ``representative_file_path``.
301+
"""
302+
source_folder = Path(self.source_folder_for(dataset).posix)
303+
if not source_folder.parents[-2].exists():
304+
# This check may have a lot of false negatives.
305+
# But we cannot check whether `source_folder` exists because the user
306+
# may intend for the upload to create that folder.
307+
# Checking the top level parent after the root should still catch many
308+
# cases as long as the remote uses paths are uncommon on user machines.
309+
# E.g., for /ess/data/2025/... we get parents[-2] = /ess which should
310+
# not exist on non-ess machines.
311+
raise FileNotAccessibleError(
312+
"Cannot directly access the source folder",
313+
remote_path=self.source_folder_for(dataset),
314+
)
315+
yield CopyUploadConnection(
316+
source_folder=self.source_folder_for(dataset), hard_link=self._hard_link
317+
)
318+
319+
320+
def _remote_folder_is_empty(path: RemotePath) -> bool:
    """Return True if the folder at ``path`` exists and has no entries.

    A folder that does not exist is reported as *not* empty.
    Callers use this function to decide whether to remove the folder,
    and a missing folder must not make that best-effort cleanup raise.

    Parameters
    ----------
    path:
        Folder to inspect. Its ``posix`` string is interpreted as a path
        on the local filesystem.
    """
    try:
        # ``Path`` objects are always truthy, so ``any`` stops at the first entry.
        return not any(Path(path.posix).iterdir())
    except FileNotFoundError:
        return False
326+
327+
328+
# Names exported by ``from <this module> import *``; the private helper
# ``_remote_folder_is_empty`` is intentionally not listed.
__all__ = ["CopyDownloadConnection", "CopyFileTransfer", "CopyUploadConnection"]

src/scitacean/transfer/link.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,16 +99,19 @@ class LinkFileTransfer:
9999
100100
Warning
101101
-------
102-
This file transfer does not work on Windows.
103-
This is due to :class:`RemotePath` not supporting backslashes as
104-
path separators.
102+
This file transfer does not work on Windows because it converts between
103+
:class:`RemotePath` and :class:`pathlib.Path`.
104+
This requires that both use the same directory separators.
105+
Since :class:`RemotePath` uses UNIX-style forward slashes, it is
106+
incompatible with Windows paths.
105107
In practice, this should not be a problem because SciCat file storage
106108
should never be a Windows server.
107109
108110
Warning
109111
-------
110112
This file transfer cannot upload files.
111-
Instead, consider copying or moving the files to the SciCat source folder
113+
Instead, consider copying or moving the files to the SciCat source folder,
114+
e.g., by using :class:`scitacean.transfer.copy.CopyFileTransfer`,
112115
or writing the files there directly from your workflow.
113116
114117
Attempting to upload files will raise ``NotImplementedError``.

0 commit comments

Comments
 (0)