Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tracking of model RDF downloads #504

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,15 @@ As a dependency it is included in [bioimageio.core](https://github.com/bioimage-

## Environment variables

| Name | Default | Description |
|---|---|---|
| BIOIMAGEIO_USE_CACHE | "true" | Enables simple URL to file cache. possible, case-insensitive, positive values are: "true", "yes", "1". Any other value is interpreted as "false" |
| BIOIMAGEIO_CACHE_PATH | generated tmp folder | File path for simple URL to file cache; changes of URL source are not detected. |
| BIOIMAGEIO_CACHE_WARNINGS_LIMIT | "3" | Maximum number of warnings generated for simple cache hits. |
| Name | Default | Description |
|---------------------------------|-----------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| BIOIMAGEIO_USE_CACHE            | "true"                | Enables simple URL to file cache. Possible case-insensitive positive values are: "true", "yes", "1". Any other value is interpreted as "false" |
| BIOIMAGEIO_CACHE_PATH | generated tmp folder | File path for simple URL to file cache; changes of URL source are not detected. |
| BIOIMAGEIO_CACHE_WARNINGS_LIMIT | "3" | Maximum number of warnings generated for simple cache hits. |
| BIOIMAGEIO_COUNT_RDF_DOWNLOADS  | "true"                | Enables reporting of RDF downloads. No personal data is uploaded; only the model id and, if set, 'BIOIMAGEIO_CONSUMER_ID' and 'BIOIMAGEIO_CONSUMER_VERSION' are sent. |
| BIOIMAGEIO_CONSUMER_ID | "python" | Software consumer ID in whose context bioimageio.spec is being used, e.g. "ilastik" |
| BIOIMAGEIO_CONSUMER_VERSION | null | Version of the software consumer ID in whose context bioimageio.spec is being used, e.g. "1.4.0". For 'BIOIMAGEIO_CONSUMER_ID' "python" the active Python version's major.minor version is used. |


## Changelog
#### bioimageio.spec 0.4.9
Expand Down
42 changes: 33 additions & 9 deletions bioimageio/spec/shared/_resolve_source.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import logging
import os
import pathlib
import re
Expand All @@ -20,15 +21,19 @@
BIOIMAGEIO_COLLECTION_URL,
BIOIMAGEIO_SITE_CONFIG_URL,
BIOIMAGEIO_USE_CACHE,
COUNT_RDF_DOWNLOADS,
CacheWarning,
DOI_REGEX,
RDF_NAMES,
CacheWarning,
get_spec_type_from_type,
no_cache_tmp_list,
tqdm,
yaml,
)
from .raw_nodes import URI
from .tracking import report_resource_download

logger = logging.getLogger(__name__)


class DownloadCancelled(Exception):
Expand Down Expand Up @@ -70,11 +75,12 @@ def resolve_rdf_source(
if not isinstance(source, (dict, pathlib.Path, str, bytes)):
raise TypeError(f"Unexpected source type {type(source)}")

# check for local/in-memory sources
if isinstance(source, pathlib.Path):
source_name = str(source)
root: typing.Union[pathlib.Path, raw_nodes.URI] = source.parent
elif isinstance(source, dict):
source_name = f"{{name: {source.get('name', '<unknown>')}, ...}}"
source_name = f"{{name: {source.get('name', '<unknown>')}, " + str(source)[1:100]
source = dict(source)
given_root = source.pop("root_path", pathlib.Path())
if _is_path(given_root):
Expand All @@ -97,6 +103,7 @@ def resolve_rdf_source(
else:
raise TypeError(source)

# source might be remote
if isinstance(source, str):
# source might be bioimageio nickname, id, doi, url or file path -> resolve to pathlib.Path

Expand Down Expand Up @@ -455,10 +462,11 @@ def _download_url(uri: raw_nodes.URI, output: typing.Optional[os.PathLike] = Non
# Total size in bytes.
total_size = int(r.headers.get("content-length", 0))
block_size = 1024 # 1 Kibibyte
file_name = uri.path.split("/")[-1]
if pbar:
t = pbar(total=total_size, unit="iB", unit_scale=True, desc=uri.path.split("/")[-1])
t = pbar(total=total_size, unit="iB", unit_scale=True, desc=file_name)
else:
t = tqdm(total=total_size, unit="iB", unit_scale=True, desc=uri.path.split("/")[-1])
t = tqdm(total=total_size, unit="iB", unit_scale=True, desc=file_name)
tmp_path = local_path.with_suffix(f"{local_path.suffix}.part")
with tmp_path.open("wb") as f:
for data in r.iter_content(block_size):
Expand All @@ -471,6 +479,20 @@ def _download_url(uri: raw_nodes.URI, output: typing.Optional[os.PathLike] = Non
warnings.warn(f"Download ({t.n}) does not have expected size ({total_size}).")

shutil.move(f.name, str(local_path))
if file_name == "rdf.yaml" and COUNT_RDF_DOWNLOADS:
try:
assert yaml
data = yaml.load(local_path)
if data["type"] == "model": # only report model downloads for now
id_ = data["id"]
if id_.count("/") > 1: # a version id has additional forward slashes
id_ = "/".join(id_.split("/")[:2]) # only use resource id/'concept' part

report_resource_download(id_)

except Exception as e:
logger.debug(f"failed to report download: {e}")

except DownloadCancelled as e:
# let calling code handle this exception specifically -> allow for cancellation of
# long running downloads per user request
Expand Down Expand Up @@ -518,12 +540,14 @@ def _resolve_json_from_url(
else:
BIOIMAGEIO_COLLECTION_ENTRIES = {}
for cr in BIOIMAGEIO_COLLECTION.get("collection", []):
if "id" in cr and "rdf_source" in cr and "type" in cr:
entry = (cr["type"], cr["rdf_source"])
BIOIMAGEIO_COLLECTION_ENTRIES[cr["id"]] = entry
if not ("id" in cr and "rdf_source" in cr and "type" in cr):
continue

entry = (cr["type"], cr["rdf_source"])
BIOIMAGEIO_COLLECTION_ENTRIES[cr["id"]] = entry

if "nickname" in cr:
BIOIMAGEIO_COLLECTION_ENTRIES[cr["nickname"]] = entry
if "nickname" in cr:
BIOIMAGEIO_COLLECTION_ENTRIES[cr["nickname"]] = entry

# add resource versions explicitly
for cv in cr.get("versions", []):
Expand Down
8 changes: 8 additions & 0 deletions bioimageio/spec/shared/common.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import getpass
import os
import pathlib
import sys
import tempfile
import warnings
from typing import Any, Dict, Generic, Iterable, List, Optional, Sequence, Union
Expand Down Expand Up @@ -75,6 +76,13 @@ class CacheWarning(RuntimeWarning):
BIOIMAGEIO_COLLECTION_URL = "https://bioimage-io.github.io/collection-bioimage-io/collection.json"


# Opt-out switch for reporting RDF downloads; any value other than a positive
# "true"/"yes"/"1" (case-insensitive) disables reporting.
COUNT_RDF_DOWNLOADS = os.getenv("BIOIMAGEIO_COUNT_RDF_DOWNLOADS", "true").lower() in ("true", "yes", "1")
# Identifier of the software consumer using bioimageio.spec, e.g. "ilastik".
CONSUMER_ID = os.getenv("BIOIMAGEIO_CONSUMER_ID", "python").lower()
# Consumer version; for the default "python" consumer it falls back to the
# running interpreter's major.minor version.
CONSUMER_VERSION = os.getenv("BIOIMAGEIO_CONSUMER_VERSION")
if CONSUMER_VERSION is None and CONSUMER_ID == "python":
    CONSUMER_VERSION = f"{sys.version_info.major}.{sys.version_info.minor}"


DOI_REGEX = r"^10[.][0-9]{4,9}\/[-._;()\/:A-Za-z0-9]+$"
RDF_NAMES = ("rdf.yaml", "model.yaml")

Expand Down
19 changes: 19 additions & 0 deletions bioimageio/spec/shared/tracking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from bioimageio.spec.shared.common import CONSUMER_ID, CONSUMER_VERSION


def report_resource_download(resource_id: str):
    """Report the download of a bioimage.io resource RDF to the bioimage.io Matomo instance.

    Only the resource id and the configured consumer id/version are transmitted;
    no personal data is sent.

    Args:
        resource_id: bioimage.io resource id, e.g. a DOI like "10.5281/zenodo.7614645".

    Raises:
        requests.HTTPError: if the tracking endpoint responds with an error status.
    """
    import requests  # local import: requests is not available in pyodide

    from urllib.parse import quote

    # User agent data identifying the software consumer (e.g. "ilastik").
    # Percent-encode the JSON payload: braces/quotes are not valid raw query-string
    # characters and an unencoded value may be dropped or mangled by the server.
    uadata = quote(f'{{"brands":[{{"brand":"{CONSUMER_ID}","version":"{CONSUMER_VERSION or "unknown"}"}}]}}')
    url = (
        "https://bioimage.matomo.cloud/matomo.php"
        f"?download=https://doi.org/{resource_id}"
        "&idsite=1&rec=1&r=646242&h=13&m=35&s=20"
        f"&url=http://bioimage.io/#/?id={resource_id}"
        f"&uadata={uadata}"
    )

    # timeout so a slow/unreachable tracking server can never hang the caller's download
    r = requests.get(url, timeout=10)
    r.raise_for_status()


if __name__ == "__main__":
    # Manual smoke test: report one model download, then print the Matomo report URL
    # that should show the tracked download.
    model_doi = "10.5281/zenodo.7614645"
    report_resource_download(model_doi)
    # NOTE: Matomo aggregates reports only ~every 15 minutes, and the queried period must
    # contain the tracking date. "period=day&date=yesterday" never includes a download
    # tracked today, so query the whole year from a fixed start date instead.
    print(
        f"https://bioimage.matomo.cloud/?module=API&method=Actions.getDownload&downloadUrl=https://doi.org/{model_doi}&idSite=1&idCustomReport=1&period=year&date=2023-03-01&format=JSON&token_auth=anonymous"
    )
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could not yet see the tracking working...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@oeway any idea what I'm doing wrong?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are two things we need to set here:

  1. we need to configure the date properly, it was configured to start from yesterday and the period is a day, so there won't be any traffic for that model. I would change period to year and date to 2023-03-01 (see here: https://developer.matomo.org/api-reference/Piwik/Period)
  2. the report will only be generated every 15 minutes: https://matomo.org/faq/general/faq_41/ so we won't see the report immediately.

You can see an example here (I triggered the report yesterday so it shows up today):
https://bioimage.matomo.cloud/?module=API&method=Actions.getDownload&downloadUrl=https://doi.org/test&idSite=1&idCustomReport=1&period=year&date=2023-03-01&format=JSON&token_auth=anonymous

3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import os
import pathlib

import pytest

os.environ["BIOIMAGEIO_COUNT_RDF_DOWNLOADS"] = "false"


@pytest.fixture
def unet2d_nuclei_broad_base_path():
Expand Down