diff --git a/.github/workflows/check-profile-pr.yml b/.github/workflows/check-profile-pr.yml
new file mode 100644
index 0000000..c9e4ef1
--- /dev/null
+++ b/.github/workflows/check-profile-pr.yml
@@ -0,0 +1,20 @@
+name: Check added profile link
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths:
+      - 'scripts/profile_urls.txt'
+  workflow_dispatch:
+jobs:
+  check:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Install dependencies
+        run: python3 -m pip install -r requirements.workflow.txt
+      - name: Run script
+        run: |
+          cd scripts
+          python check_profiles.py
diff --git a/.gitignore b/.gitignore
index 53fa7ed..a8a1213 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 # Environments
 .env
+venv/
diff --git a/requirements.workflow.in b/requirements.workflow.in
new file mode 100644
index 0000000..9ad9bdd
--- /dev/null
+++ b/requirements.workflow.in
@@ -0,0 +1 @@
+rdflib==7.4.0
\ No newline at end of file
diff --git a/requirements.workflow.txt b/requirements.workflow.txt
new file mode 100644
index 0000000..cf20d26
--- /dev/null
+++ b/requirements.workflow.txt
@@ -0,0 +1,10 @@
+#
+# This file is autogenerated by pip-compile with Python 3.12
+# by the following command:
+#
+#    pip-compile --output-file=requirements.workflow.txt requirements.workflow.in
+#
+pyparsing==3.2.5
+    # via rdflib
+rdflib==7.4.0
+    # via -r requirements.workflow.in
diff --git a/scripts/check_profiles.py b/scripts/check_profiles.py
new file mode 100644
index 0000000..acd6bf1
--- /dev/null
+++ b/scripts/check_profiles.py
@@ -0,0 +1,60 @@
+"""Build one RDF graph from the profile documents listed in profile_urls.txt.
+
+Each URL is parsed as JSON-LD. Plain literals that look like xsd:dateTime,
+xsd:date or xsd:time values get that datatype, and every resource typed as
+prof:Profile is mirrored by a hash-derived URI linked to it with owl:sameAs.
+Run from the scripts/ directory: profile_urls.txt is opened by relative path.
+"""
+
+import re
+
+from hashlib import md5
+from rdflib import Graph, URIRef, Literal, XSD, RDF, OWL
+from urllib.parse import urljoin
+
+
+g = Graph()
+profile_class = URIRef("http://www.w3.org/ns/dx/prof/Profile")
+
+# One URL per line; blank lines are skipped.
+with open("profile_urls.txt", "r", encoding="utf-8") as file:
+    profile_urls = [line.strip() for line in file if line.strip()]
+
+for url in profile_urls:
+    # Relative IRIs resolve against the directory holding the document.
+    g.parse(url, format="json-ld", publicID=urljoin(url, '.'))
+
+datetime_pattern = re.compile(r"^-?\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(Z|[+-]\d{2}:\d{2})?$")
+date_pattern = re.compile(r"^-?\d{4}-\d{2}-\d{2}$")
+time_pattern = re.compile(r"^\d{2}:\d{2}:\d{2}(Z|[+-]\d{2}:\d{2})?$")
+
+# Lexical pattern -> datatype, tried in this order.
+typed_patterns = (
+    (datetime_pattern, XSD.dateTime),
+    (date_pattern, XSD.date),
+    (time_pattern, XSD.time),
+)
+
+# Add type to datetime, date and time data.
+# Walk a snapshot: the loop adds and removes triples, and a live iterator
+# could break or revisit the profile triples created further down.
+for s, p, o in list(g.triples((None, None, None))):
+    # Only plain literals qualify; values that already carry a datatype or a
+    # language tag are left untouched so no existing statement is lost.
+    if isinstance(o, Literal) and o.datatype is None and o.language is None:
+        lexical = str(o)
+        for pattern, datatype in typed_patterns:
+            if pattern.match(lexical):
+                g.remove((s, p, o))
+                g.add((s, p, Literal(lexical, datatype=datatype)))
+                break
+
+    if p == RDF.type and o == profile_class:
+        digest = md5(s.encode('utf-8')).hexdigest()  # stable id, not security
+        new_s = URIRef(f"http://example.org/data/profile/{digest}")  # TODO: Change URI
+        g.add((new_s, RDF.type, o))
+        g.add((new_s, OWL.sameAs, s))
+
+# NOTE(review): ttl_data is built but not used in this file; confirm whether
+# it should be written out or whether serialising is itself the check.
+ttl_data = g.serialize(format="turtle")
diff --git a/scripts/profile_urls.txt b/scripts/profile_urls.txt
new file mode 100644
index 0000000..c7a5e55
--- /dev/null
+++ b/scripts/profile_urls.txt
@@ -0,0 +1,5 @@
+https://trefx.uk/5s-crate/0.4/ro-crate-metadata.json
+https://www.researchobject.org/workflow-run-crate/profiles/0.5/process_run_crate/ro-crate-metadata.json
+https://www.researchobject.org/workflow-run-crate/profiles/0.5/workflow_run_crate/ro-crate-metadata.json
+https://about.workflowhub.eu/Workflow-RO-Crate/ro-crate-metadata.json
+https://www.researchobject.org/workflow-run-crate/profiles/0.5/provenance_run_crate/ro-crate-metadata.json