diff --git a/README.md b/README.md
index 0ef613103..0bfbe0634 100644
--- a/README.md
+++ b/README.md
@@ -140,11 +140,15 @@ As a dependency it is included in [bioimageio.core](https://github.com/bioimage-
 
 ## Environment variables
 
-| Name | Default | Description |
-|---|---|---|
-| BIOIMAGEIO_USE_CACHE | "true" | Enables simple URL to file cache. possible, case-insensitive, positive values are: "true", "yes", "1". Any other value is interpreted as "false" |
-| BIOIMAGEIO_CACHE_PATH | generated tmp folder | File path for simple URL to file cache; changes of URL source are not detected. |
-| BIOIMAGEIO_CACHE_WARNINGS_LIMIT | "3" | Maximum number of warnings generated for simple cache hits. |
+| Name | Default | Description |
+|---------------------------------|-----------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| BIOIMAGEIO_USE_CACHE | "true" | Enables simple URL to file cache. possible, case-insensitive, positive values are: "true", "yes", "1". Any other value is interpreted as "false" |
+| BIOIMAGEIO_CACHE_PATH | generated tmp folder | File path for simple URL to file cache; changes of URL source are not detected. |
+| BIOIMAGEIO_CACHE_WARNINGS_LIMIT | "3" | Maximum number of warnings generated for simple cache hits. |
+| BIOIMAGEIO_COUNT_RDF_DOWNLOADS | "true" | Enables reporting of RDF downloads. No personal data is uploaded; only the model id and the values of 'BIOIMAGEIO_CONSUMER_ID' and 'BIOIMAGEIO_CONSUMER_VERSION' are sent. |
+| BIOIMAGEIO_CONSUMER_ID | "python" | ID of the consumer software in whose context bioimageio.spec is being used, e.g. "ilastik" |
+| BIOIMAGEIO_CONSUMER_VERSION | null | Version of the consumer software in whose context bioimageio.spec is being used, e.g. "1.4.0". For 'BIOIMAGEIO_CONSUMER_ID' "python" the major.minor version of the active Python interpreter is used. |
+
 
 ## Changelog
 #### bioimageio.spec 0.4.9
diff --git a/bioimageio/spec/shared/_resolve_source.py b/bioimageio/spec/shared/_resolve_source.py
index 521c97d3f..2448fd810 100644
--- a/bioimageio/spec/shared/_resolve_source.py
+++ b/bioimageio/spec/shared/_resolve_source.py
@@ -1,4 +1,5 @@
 import json
+import logging
 import os
 import pathlib
 import re
@@ -20,15 +21,19 @@
     BIOIMAGEIO_COLLECTION_URL,
     BIOIMAGEIO_SITE_CONFIG_URL,
     BIOIMAGEIO_USE_CACHE,
+    COUNT_RDF_DOWNLOADS,
+    CacheWarning,
     DOI_REGEX,
     RDF_NAMES,
-    CacheWarning,
     get_spec_type_from_type,
     no_cache_tmp_list,
     tqdm,
     yaml,
 )
 from .raw_nodes import URI
+from .tracking import report_resource_download
+
+logger = logging.getLogger(__name__)
 
 
 class DownloadCancelled(Exception):
@@ -70,11 +75,12 @@ def resolve_rdf_source(
     if not isinstance(source, (dict, pathlib.Path, str, bytes)):
         raise TypeError(f"Unexpected source type {type(source)}")
 
+    # check for local/in-memory sources
     if isinstance(source, pathlib.Path):
         source_name = str(source)
         root: typing.Union[pathlib.Path, raw_nodes.URI] = source.parent
     elif isinstance(source, dict):
-        source_name = f"{{name: {source.get('name', '')}, ...}}"
+        source_name = f"{{name: {source.get('name', '')}, " + str(source)[1:100]
         source = dict(source)
         given_root = source.pop("root_path", pathlib.Path())
         if _is_path(given_root):
@@ -97,6 +103,7 @@ def resolve_rdf_source(
     else:
         raise TypeError(source)
 
+    # source might be remote
     if isinstance(source, str):
         # source might be bioimageio nickname, id, doi, url or file path -> resolve to pathlib.Path
 
@@ -455,10 +462,11 @@ def _download_url(uri: raw_nodes.URI, output: typing.Optional[os.PathLike] = Non
             # Total size in bytes.
             total_size = int(r.headers.get("content-length", 0))
             block_size = 1024  # 1 Kibibyte
+            file_name = uri.path.split("/")[-1]
             if pbar:
-                t = pbar(total=total_size, unit="iB", unit_scale=True, desc=uri.path.split("/")[-1])
+                t = pbar(total=total_size, unit="iB", unit_scale=True, desc=file_name)
             else:
-                t = tqdm(total=total_size, unit="iB", unit_scale=True, desc=uri.path.split("/")[-1])
+                t = tqdm(total=total_size, unit="iB", unit_scale=True, desc=file_name)
             tmp_path = local_path.with_suffix(f"{local_path.suffix}.part")
             with tmp_path.open("wb") as f:
                 for data in r.iter_content(block_size):
@@ -471,6 +479,20 @@ def _download_url(uri: raw_nodes.URI, output: typing.Optional[os.PathLike] = Non
                 warnings.warn(f"Download ({t.n}) does not have expected size ({total_size}).")
 
             shutil.move(f.name, str(local_path))
+            if file_name == "rdf.yaml" and COUNT_RDF_DOWNLOADS:
+                try:
+                    assert yaml
+                    data = yaml.load(local_path)
+                    if data["type"] == "model":  # only report model downloads for now
+                        id_ = data["id"]
+                        if id_.count("/") > 1:  # a version id has additional forward slashes
+                            id_ = "/".join(id_.split("/")[:2])  # only use resource id/'concept' part
+
+                        report_resource_download(id_)
+
+                except Exception as e:
+                    logger.debug(f"failed to report download: {e}")
+
         except DownloadCancelled as e:
             # let calling code handle this exception specifically -> allow for cancellation of
             # long running downloads per user request
@@ -518,12 +540,14 @@ def _resolve_json_from_url(
 else:
     BIOIMAGEIO_COLLECTION_ENTRIES = {}
     for cr in BIOIMAGEIO_COLLECTION.get("collection", []):
-        if "id" in cr and "rdf_source" in cr and "type" in cr:
-            entry = (cr["type"], cr["rdf_source"])
-            BIOIMAGEIO_COLLECTION_ENTRIES[cr["id"]] = entry
+        if not ("id" in cr and "rdf_source" in cr and "type" in cr):
+            continue
+
+        entry = (cr["type"], cr["rdf_source"])
+        BIOIMAGEIO_COLLECTION_ENTRIES[cr["id"]] = entry
 
-            if "nickname" in cr:
-                BIOIMAGEIO_COLLECTION_ENTRIES[cr["nickname"]] = entry
+        if "nickname" in cr:
+            BIOIMAGEIO_COLLECTION_ENTRIES[cr["nickname"]] = entry
 
         # add resource versions explicitly
         for cv in cr.get("versions", []):
diff --git a/bioimageio/spec/shared/common.py b/bioimageio/spec/shared/common.py
index ddba5ac59..89461eae8 100644
--- a/bioimageio/spec/shared/common.py
+++ b/bioimageio/spec/shared/common.py
@@ -1,6 +1,7 @@
 import getpass
 import os
 import pathlib
+import sys
 import tempfile
 import warnings
 from typing import Any, Dict, Generic, Iterable, List, Optional, Sequence, Union
@@ -75,6 +76,13 @@ class CacheWarning(RuntimeWarning):
 
 BIOIMAGEIO_COLLECTION_URL = "https://bioimage-io.github.io/collection-bioimage-io/collection.json"
 
+COUNT_RDF_DOWNLOADS = os.getenv("BIOIMAGEIO_COUNT_RDF_DOWNLOADS", "true").lower() in ("true", "yes", "1")
+CONSUMER_ID = os.getenv("BIOIMAGEIO_CONSUMER_ID", "python").lower()
+CONSUMER_VERSION = os.getenv("BIOIMAGEIO_CONSUMER_VERSION")
+if CONSUMER_ID == "python" and CONSUMER_VERSION is None:
+    CONSUMER_VERSION = ".".join([str(v) for v in [sys.version_info.major, sys.version_info.minor]])
+
+
 DOI_REGEX = r"^10[.][0-9]{4,9}\/[-._;()\/:A-Za-z0-9]+$"
 RDF_NAMES = ("rdf.yaml", "model.yaml")
 
diff --git a/bioimageio/spec/shared/tracking.py b/bioimageio/spec/shared/tracking.py
new file mode 100644
index 000000000..24f0bf9ae
--- /dev/null
+++ b/bioimageio/spec/shared/tracking.py
@@ -0,0 +1,39 @@
+import json
+
+from .common import CONSUMER_ID, CONSUMER_VERSION
+
+_MATOMO_URL = "https://bioimage.matomo.cloud/matomo.php"
+_REPORT_TIMEOUT = 3.0  # seconds; usage reporting must never stall a download
+
+
+def report_resource_download(resource_id: str):
+    """Report the download of `resource_id` to the bioimage.io Matomo instance.
+
+    Only the resource id and the consumer id/version are transmitted.
+    All values are passed via `params` so that they get URL-encoded; a literal
+    '#' (as in the reported page url) would otherwise start a URL fragment,
+    which is never sent to the server.
+
+    Raises:
+        requests.RequestException: on connection errors, timeouts or error status codes.
+    """
+    import requests  # not available in pyodide
+
+    uadata = json.dumps(
+        {"brands": [{"brand": CONSUMER_ID, "version": CONSUMER_VERSION or "unknown"}]},
+        separators=(",", ":"),
+    )
+    params = {
+        "download": f"https://doi.org/{resource_id}",
+        "idsite": 1,
+        "rec": 1,
+        "r": 646242,
+        "h": 13,
+        "m": 35,
+        "s": 20,
+        "url": f"http://bioimage.io/#/?id={resource_id}",
+        "uadata": uadata,
+    }
+
+    r = requests.get(_MATOMO_URL, params=params, timeout=_REPORT_TIMEOUT)
+    r.raise_for_status()
diff --git a/tests/conftest.py b/tests/conftest.py
index 92afd6d74..9785101c0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,7 +1,10 @@
+import os
 import pathlib
 
 import pytest
 
+os.environ["BIOIMAGEIO_COUNT_RDF_DOWNLOADS"] = "false"
+
 
 @pytest.fixture
 def unet2d_nuclei_broad_base_path():