diff --git a/requirements.workflow.in b/requirements.workflow.in
index 9ad9bdd..3d16a52 100644
--- a/requirements.workflow.in
+++ b/requirements.workflow.in
@@ -1 +1,3 @@
-rdflib==7.4.0
\ No newline at end of file
+rdflib==7.4.0
+validators==0.35.0
+requests==2.32.5
diff --git a/requirements.workflow.txt b/requirements.workflow.txt
index cf20d26..809293b 100644
--- a/requirements.workflow.txt
+++ b/requirements.workflow.txt
@@ -4,7 +4,19 @@
 #
 # pip-compile --output-file=requirements.workflow.txt requirements.workflow.in
 #
+certifi==2025.11.12
+    # via requests
+charset-normalizer==3.4.4
+    # via requests
+idna==3.11
+    # via requests
 pyparsing==3.2.5
     # via rdflib
 rdflib==7.4.0
     # via -r requirements.workflow.in
+requests==2.32.5
+    # via -r requirements.workflow.in
+urllib3==2.6.2
+    # via requests
+validators==0.35.0
+    # via -r requirements.workflow.in
diff --git a/scripts/check_profiles.py b/scripts/check_profiles.py
index acd6bf1..8fd7bb7 100644
--- a/scripts/check_profiles.py
+++ b/scripts/check_profiles.py
@@ -1,4 +1,6 @@
 import re
+import validators
+import requests
 from hashlib import md5
 from rdflib import Graph, URIRef, Literal, XSD, RDF, OWL
 
@@ -11,8 +13,34 @@
 with open("profile_urls.txt", "r") as file:
     profile_urls = [line.strip() for line in file if line.strip()]
 
-for url in profile_urls:
-    g.parse(url, format="json-ld", publicID=urljoin(url, '.'))
+for url in profile_urls:
+    if not validators.url(url):
+        raise ValueError(f"{url} is not an URL")
+
+    headers = {
+        "Accept": "application/ld+json, application/json"
+    }
+
+    try:
+        response = requests.head(url, headers=headers, allow_redirects=True)
+    except requests.RequestException as e:
+        raise ValueError(f"Unable to reach {url} (Error: {e})")
+
+    # Fallback
+    if response.status_code >= 400:
+        response = requests.get(url, headers=headers)
+
+    content_type = response.headers.get("Content-Type", "").lower()
+    if "json" not in content_type:
+        raise ValueError(f"{url} does not return JSON-LD (Content type: {content_type})")
+
+    temp_g = Graph()
+    base_iri = urljoin(url, '.') if url.endswith("ro-crate-metadata.json") else f"{url.rstrip('/')}/"
+    temp_g.parse(url, format="json-ld", publicID=base_iri)
+    if any(temp_g.subjects(RDF.type, profile_class)):
+        g += temp_g
+    else:
+        raise ValueError(f"No profile entity found in {url}")
 
 datetime_pattern = re.compile(r"^-?\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(Z|[+-]\d{2}:\d{2})?$")
 date_pattern = re.compile(r"^-?\d{4}-\d{2}-\d{2}$")
diff --git a/scripts/profile_urls.txt b/scripts/profile_urls.txt
index c7a5e55..b818ef4 100644
--- a/scripts/profile_urls.txt
+++ b/scripts/profile_urls.txt
@@ -1,5 +1,4 @@
 https://trefx.uk/5s-crate/0.4/ro-crate-metadata.json
 https://www.researchobject.org/workflow-run-crate/profiles/0.5/process_run_crate/ro-crate-metadata.json
 https://www.researchobject.org/workflow-run-crate/profiles/0.5/workflow_run_crate/ro-crate-metadata.json
-https://about.workflowhub.eu/Workflow-RO-Crate/ro-crate-metadata.json
 https://www.researchobject.org/workflow-run-crate/profiles/0.5/provenance_run_crate/ro-crate-metadata.json