Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tracking of model RDF downloads #504

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,15 @@ As a dependency it is included in [bioimageio.core](https://github.com/bioimage-

## Environment variables

| Name | Default | Description |
|---|---|---|
| BIOIMAGEIO_USE_CACHE | "true" | Enables simple URL to file cache. possible, case-insensitive, positive values are: "true", "yes", "1". Any other value is interpreted as "false" |
| BIOIMAGEIO_CACHE_PATH | generated tmp folder | File path for simple URL to file cache; changes of URL source are not detected. |
| BIOIMAGEIO_CACHE_WARNINGS_LIMIT | "3" | Maximum number of warnings generated for simple cache hits. |
| Name | Default | Description |
|---------------------------------|-----------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| BIOIMAGEIO_USE_CACHE            | "true"                | Enables simple URL to file cache. Possible case-insensitive positive values are: "true", "yes", "1". Any other value is interpreted as "false" |
| BIOIMAGEIO_CACHE_PATH | generated tmp folder | File path for simple URL to file cache; changes of URL source are not detected. |
| BIOIMAGEIO_CACHE_WARNINGS_LIMIT | "3" | Maximum number of warnings generated for simple cache hits. |
| BIOIMAGEIO_COUNT_RDF_DOWNLOADS  | "true"                | Enables reporting of RDF downloads. No personal data is uploaded; only the model id and, if set, 'BIOIMAGEIO_CONSUMER_ID' and 'BIOIMAGEIO_CONSUMER_VERSION' are sent. |
| BIOIMAGEIO_CONSUMER_ID | "python" | Software consumer ID in whose context bioimageio.spec is being used, e.g. "ilastik" |
| BIOIMAGEIO_CONSUMER_VERSION | null | Version of the software consumer ID in whose context bioimageio.spec is being used, e.g. "1.4.0". For 'BIOIMAGEIO_CONSUMER_ID' "python" the active Python version's major.minor version is used. |


## Changelog
#### bioimageio.spec 0.4.9
Expand Down
42 changes: 33 additions & 9 deletions bioimageio/spec/shared/_resolve_source.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import logging
import os
import pathlib
import re
Expand All @@ -20,15 +21,19 @@
BIOIMAGEIO_COLLECTION_URL,
BIOIMAGEIO_SITE_CONFIG_URL,
BIOIMAGEIO_USE_CACHE,
COUNT_RDF_DOWNLOADS,
CacheWarning,
DOI_REGEX,
RDF_NAMES,
CacheWarning,
get_spec_type_from_type,
no_cache_tmp_list,
tqdm,
yaml,
)
from .raw_nodes import URI
from .tracking import report_resource_download

logger = logging.getLogger(__name__)


class DownloadCancelled(Exception):
Expand Down Expand Up @@ -70,11 +75,12 @@ def resolve_rdf_source(
if not isinstance(source, (dict, pathlib.Path, str, bytes)):
raise TypeError(f"Unexpected source type {type(source)}")

# check for local/in-memory sources
if isinstance(source, pathlib.Path):
source_name = str(source)
root: typing.Union[pathlib.Path, raw_nodes.URI] = source.parent
elif isinstance(source, dict):
source_name = f"{{name: {source.get('name', '<unknown>')}, ...}}"
source_name = f"{{name: {source.get('name', '<unknown>')}, " + str(source)[1:100]
source = dict(source)
given_root = source.pop("root_path", pathlib.Path())
if _is_path(given_root):
Expand All @@ -97,6 +103,7 @@ def resolve_rdf_source(
else:
raise TypeError(source)

# source might be remote
if isinstance(source, str):
# source might be bioimageio nickname, id, doi, url or file path -> resolve to pathlib.Path

Expand Down Expand Up @@ -455,10 +462,11 @@ def _download_url(uri: raw_nodes.URI, output: typing.Optional[os.PathLike] = Non
# Total size in bytes.
total_size = int(r.headers.get("content-length", 0))
block_size = 1024 # 1 Kibibyte
file_name = uri.path.split("/")[-1]
if pbar:
t = pbar(total=total_size, unit="iB", unit_scale=True, desc=uri.path.split("/")[-1])
t = pbar(total=total_size, unit="iB", unit_scale=True, desc=file_name)
else:
t = tqdm(total=total_size, unit="iB", unit_scale=True, desc=uri.path.split("/")[-1])
t = tqdm(total=total_size, unit="iB", unit_scale=True, desc=file_name)
tmp_path = local_path.with_suffix(f"{local_path.suffix}.part")
with tmp_path.open("wb") as f:
for data in r.iter_content(block_size):
Expand All @@ -471,6 +479,20 @@ def _download_url(uri: raw_nodes.URI, output: typing.Optional[os.PathLike] = Non
warnings.warn(f"Download ({t.n}) does not have expected size ({total_size}).")

shutil.move(f.name, str(local_path))
if file_name == "rdf.yaml" and COUNT_RDF_DOWNLOADS:
try:
assert yaml
data = yaml.load(local_path)
if data["type"] == "model": # only report model downloads for now
id_ = data["id"]
if id_.count("/") > 1: # a version id has additional forward slashes
id_ = "/".join(id_.split("/")[:2]) # only use resource id/'concept' part

report_resource_download(id_)

except Exception as e:
logger.debug(f"failed to report download: {e}")

except DownloadCancelled as e:
# let calling code handle this exception specifically -> allow for cancellation of
# long running downloads per user request
Expand Down Expand Up @@ -518,12 +540,14 @@ def _resolve_json_from_url(
else:
BIOIMAGEIO_COLLECTION_ENTRIES = {}
for cr in BIOIMAGEIO_COLLECTION.get("collection", []):
if "id" in cr and "rdf_source" in cr and "type" in cr:
entry = (cr["type"], cr["rdf_source"])
BIOIMAGEIO_COLLECTION_ENTRIES[cr["id"]] = entry
if not ("id" in cr and "rdf_source" in cr and "type" in cr):
continue

entry = (cr["type"], cr["rdf_source"])
BIOIMAGEIO_COLLECTION_ENTRIES[cr["id"]] = entry

if "nickname" in cr:
BIOIMAGEIO_COLLECTION_ENTRIES[cr["nickname"]] = entry
if "nickname" in cr:
BIOIMAGEIO_COLLECTION_ENTRIES[cr["nickname"]] = entry

# add resource versions explicitly
for cv in cr.get("versions", []):
Expand Down
8 changes: 8 additions & 0 deletions bioimageio/spec/shared/common.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import getpass
import os
import pathlib
import sys
import tempfile
import warnings
from typing import Any, Dict, Generic, Iterable, List, Optional, Sequence, Union
Expand Down Expand Up @@ -75,6 +76,13 @@ class CacheWarning(RuntimeWarning):
BIOIMAGEIO_COLLECTION_URL = "https://bioimage-io.github.io/collection-bioimage-io/collection.json"


# Opt-out switch for reporting RDF downloads; any value other than a positive
# "true"/"yes"/"1" (case-insensitive) disables reporting.
COUNT_RDF_DOWNLOADS = os.getenv("BIOIMAGEIO_COUNT_RDF_DOWNLOADS", "true").lower() in ("true", "yes", "1")
# Identifier of the software consumer using bioimageio.spec, e.g. "ilastik".
CONSUMER_ID = os.getenv("BIOIMAGEIO_CONSUMER_ID", "python").lower()
# Consumer version; for the default "python" consumer it falls back to the
# running interpreter's major.minor version.
CONSUMER_VERSION = os.getenv("BIOIMAGEIO_CONSUMER_VERSION")
if CONSUMER_VERSION is None and CONSUMER_ID == "python":
    CONSUMER_VERSION = f"{sys.version_info.major}.{sys.version_info.minor}"


DOI_REGEX = r"^10[.][0-9]{4,9}\/[-._;()\/:A-Za-z0-9]+$"
RDF_NAMES = ("rdf.yaml", "model.yaml")

Expand Down
19 changes: 19 additions & 0 deletions bioimageio/spec/shared/tracking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from bioimageio.spec.shared.common import CONSUMER_ID, CONSUMER_VERSION


def report_resource_download(resource_id: str):
    """Report the download of a bioimage.io resource RDF to the bioimage.io Matomo instance.

    Only the resource id and the configured consumer id/version are transmitted;
    no personal data is sent.

    Args:
        resource_id: bioimage.io resource id, e.g. a DOI like "10.5281/zenodo.7614645".

    Raises:
        requests.HTTPError: if the tracking endpoint responds with an error status.
    """
    import requests  # local import: requests is not available in pyodide

    from urllib.parse import quote

    # User agent data identifying the software consumer (e.g. "ilastik").
    # Percent-encode the JSON payload: braces/quotes are not valid raw query-string
    # characters and an unencoded value may be dropped or mangled by the server.
    uadata = quote(f'{{"brands":[{{"brand":"{CONSUMER_ID}","version":"{CONSUMER_VERSION or "unknown"}"}}]}}')
    url = (
        "https://bioimage.matomo.cloud/matomo.php"
        f"?download=https://doi.org/{resource_id}"
        "&idsite=1&rec=1&r=646242&h=13&m=35&s=20"
        f"&url=http://bioimage.io/#/?id={resource_id}"
        f"&uadata={uadata}"
    )

    # timeout so a slow/unreachable tracking server can never hang the caller's download
    r = requests.get(url, timeout=10)
    r.raise_for_status()


if __name__ == "__main__":
    # Manual smoke test: report one model download, then print the Matomo report URL
    # that should show the tracked download.
    model_doi = "10.5281/zenodo.7614645"
    report_resource_download(model_doi)
    # NOTE: Matomo aggregates reports only ~every 15 minutes, and the queried period must
    # contain the tracking date. "period=day&date=yesterday" never includes a download
    # tracked today, so query the whole year from a fixed start date instead.
    print(
        f"https://bioimage.matomo.cloud/?module=API&method=Actions.getDownload&downloadUrl=https://doi.org/{model_doi}&idSite=1&idCustomReport=1&period=year&date=2023-03-01&format=JSON&token_auth=anonymous"
    )
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could not yet see the tracking working...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@oeway any idea what I'm doing wrong?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are two things we need to set here:

  1. we need to configure the date properly, it was configured to start from yesterday and the period is a day, so there won't be any traffic for that model. I would change period to year and date to 2023-03-01 (see here: https://developer.matomo.org/api-reference/Piwik/Period)
  2. the report will only be generated every 15 minutes: https://matomo.org/faq/general/faq_41/ so we won't see the report immediately.

You can see an example here (I triggered the report yesterday so it shows up today):
https://bioimage.matomo.cloud/?module=API&method=Actions.getDownload&downloadUrl=https://doi.org/test&idSite=1&idCustomReport=1&period=year&date=2023-03-01&format=JSON&token_auth=anonymous

3 changes: 3 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import os
import pathlib

import pytest

os.environ["BIOIMAGEIO_COUNT_RDF_DOWNLOADS"] = "false"


@pytest.fixture
def unet2d_nuclei_broad_base_path():
Expand Down