-
Notifications
You must be signed in to change notification settings - Fork 19
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Collect functionality is going to be useful for push/gc/etc in the next PRs.
- Loading branch information
Showing
3 changed files
with
172 additions
and
147 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
import logging | ||
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple | ||
|
||
from dvc_objects.fs.callbacks import DEFAULT_CALLBACK | ||
|
||
from .index import ( | ||
DataIndex, | ||
DataIndexEntry, | ||
FileStorage, | ||
ObjectStorage, | ||
StorageInfo, | ||
) | ||
|
||
if TYPE_CHECKING: | ||
from dvc_objects.fs.callbacks import Callback | ||
|
||
from .index import Storage | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def _collect_from_index(
    cache,
    cache_prefix,
    index,
    prefix,
    storage,
    callback: "Callback" = DEFAULT_CALLBACK,
):
    """Copy entries under *prefix* from *index* into *cache*.

    Each entry's key is remapped through ``storage.get_key`` and stored in
    *cache* under ``(*cache_prefix, *remapped_key)``. Entries are staged in a
    local dict first and only written to *cache* after iteration completes,
    because idx and cache might share the same table and writing
    mid-iteration could hit a locked database.
    """
    staged = {}

    try:
        for _, entry in index.iteritems(prefix):
            callback.relative_update()
            try:
                dest_key = storage.get_key(entry)
            except ValueError:
                # Storage can't derive a key for this entry; skip it.
                continue

            # NOTE: at this point the entry might not be loaded yet, so we
            # can't rely on entry.loaded.
            is_loaded = bool(entry.meta and entry.meta.isdir)

            meta = entry.meta
            hash_info = entry.hash_info
            unversioned_file_on_versioned_fs = (
                isinstance(storage, FileStorage)
                and storage.fs.version_aware
                and entry.meta
                and not entry.meta.isdir
                and entry.meta.version_id is None
            )
            if unversioned_file_on_versioned_fs:
                # NOTE(review): `meta` aliases `entry.meta`, so this clears
                # md5 on the source index's entry in place — confirm that is
                # intentional.
                meta.md5 = None
                hash_info = None

            staged[dest_key] = DataIndexEntry(
                key=dest_key,
                meta=meta,
                hash_info=hash_info,
                loaded=is_loaded,
            )
    except KeyError:
        # Nothing under prefix in the index; leave cache untouched.
        return

    for dest_key, new_entry in staged.items():
        cache[(*cache_prefix, *dest_key)] = new_entry
|
||
|
||
def _as_fs_storage(storage: Optional["Storage"]) -> Optional["Storage"]:
    """Re-root *storage* at the empty key, preserving its backend.

    Returns ``None`` when *storage* is falsy; otherwise wraps the same
    odb (for ``ObjectStorage``) or the same fs/path (for ``FileStorage``)
    under ``key=()``.
    """
    if not storage:
        return None
    if isinstance(storage, ObjectStorage):
        return ObjectStorage(key=(), odb=storage.odb)
    return FileStorage(key=(), fs=storage.fs, path=storage.path)


def collect(  # noqa: C901
    idxs,
    storage,
    callback: "Callback" = DEFAULT_CALLBACK,
    cache_index=None,
    cache_key=None,
) -> List["DataIndex"]:
    """Group entries from *idxs* by filesystem for the given *storage* kind.

    For every index in *idxs*, entries reachable through the storage named by
    *storage* (e.g. ``"data"``, ``"cache"``, ``"remote"``) are collected into
    *cache_index* (a fresh ``DataIndex`` if not given), keyed per underlying
    filesystem. Returns one ``DataIndex`` view per distinct filesystem, each
    with a storage map re-rooted at ``()``.
    """
    from fsspec.utils import tokenize

    storage_by_fs: Dict[Tuple[str, str], StorageInfo] = {}
    skip = set()

    if cache_index is None:
        cache_index = DataIndex()
        cache_key = ()

    for idx in idxs:
        for prefix, storage_info in idx.storage_map.items():
            data = getattr(storage_info, storage)
            # Don't re-attach the storage kind being collected to itself.
            cache = storage_info.cache if storage != "cache" else None
            remote = storage_info.remote if storage != "remote" else None

            if not data:
                continue

            # FIXME should use fsid instead of protocol
            key = (data.fs.protocol, tokenize(data.path))
            if key not in storage_by_fs:
                # A pre-populated cache_index already has this subtree;
                # don't re-collect it.
                if cache_index.has_node((*cache_key, *key)):
                    skip.add(key)

            if key not in skip:
                _collect_from_index(
                    cache_index,
                    (*cache_key, *key),
                    idx,
                    prefix,
                    data,
                    callback=callback,
                )
                cache_index.commit()

            if key not in storage_by_fs:
                # `data` is guaranteed truthy here, so _as_fs_storage never
                # returns None for it.
                storage_by_fs[key] = StorageInfo(
                    data=_as_fs_storage(data),
                    cache=_as_fs_storage(cache),
                    remote=_as_fs_storage(remote),
                )

    storage_indexes = []
    for key, storage_info in storage_by_fs.items():
        idx = cache_index.view((*cache_key, *key))
        idx.storage_map[()] = storage_info
        storage_indexes.append(idx)

    return storage_indexes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters