From c216c2c2dfd92c83bea682523d2d2cbcd4a762d7 Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Mon, 25 Nov 2024 08:25:43 +0000 Subject: [PATCH 01/23] Begin refactoring to allow multiple converter types by moving get_workflows to new CWL converter class --- src/runcrate/cli.py | 27 +++++++- src/runcrate/convert.py | 63 +++--------------- src/runcrate/converters/__init__.py | 7 ++ .../__pycache__/__init__.cpython-312.pyc | Bin 0 -> 322 bytes .../__pycache__/base.cpython-312.pyc | Bin 0 -> 724 bytes .../__pycache__/cwl.cpython-312.pyc | Bin 0 -> 3177 bytes src/runcrate/converters/base.py | 10 +++ src/runcrate/converters/cwl.py | 60 +++++++++++++++++ tests/test_cli.py | 2 +- tests/test_step_mapping.py | 19 ++++-- 10 files changed, 128 insertions(+), 60 deletions(-) create mode 100644 src/runcrate/converters/__init__.py create mode 100644 src/runcrate/converters/__pycache__/__init__.cpython-312.pyc create mode 100644 src/runcrate/converters/__pycache__/base.cpython-312.pyc create mode 100644 src/runcrate/converters/__pycache__/cwl.cpython-312.pyc create mode 100644 src/runcrate/converters/base.py create mode 100644 src/runcrate/converters/cwl.py diff --git a/src/runcrate/cli.py b/src/runcrate/cli.py index b35ab86..1bfb1bc 100644 --- a/src/runcrate/cli.py +++ b/src/runcrate/cli.py @@ -21,6 +21,7 @@ from .convert import ProvCrateBuilder from .report import dump_crate_actions from .run import run_crate +from .converters import CONVERTERS @click.group() @@ -34,6 +35,13 @@ def cli(): metavar="RO_DIR", type=click.Path(exists=True, file_okay=False, readable=True, path_type=Path), ) +@click.option( + "-c", + "--converter", + type=click.Choice(CONVERTERS.keys()), + default="cwl", + help="converter to use", +) @click.option( "-o", "--output", @@ -56,15 +64,30 @@ def cli(): type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path), help="path to a README file (should be README.md in Markdown format)", ) -def convert(root, output, license, workflow_name, readme): +def convert(root, converter, output, license, workflow_name, readme): """\ Convert a CWLProv RO bundle into a Workflow Run RO-Crate. RO_DIR: top-level directory of the CWLProv RO """ + if not output: output = Path(f"{root.name}.crate.zip") - builder = ProvCrateBuilder(root, workflow_name, license, readme) + + if converter not in CONVERTERS: + sys.stderr.write(f"Unknown converter: {converter}\n") + sys.exit(1) + + converter_instance = CONVERTERS[converter] + sys.stdout.write(f"Using converter: {converter_instance}\n") + + builder = ProvCrateBuilder( + root, + converter_instance, + workflow_name, + license, + readme + ) crate = builder.build() if output.suffix == ".zip": crate.write_zip(output) diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index cc20fc6..bbe355e 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -27,7 +27,6 @@ import networkx as nx import prov.model from bdbag.bdbagit import BDBag -from cwl_utils.parser import load_document_by_yaml from cwlprov.prov import Entity, Provenance from cwlprov.ro import ResearchObject from cwlprov.utils import first @@ -37,6 +36,7 @@ from .constants import PROFILES_BASE, PROFILES_VERSION, TERMS_NAMESPACE from .utils import as_list, parse_img +from .converters import CONVERTERS WORKFLOW_BASENAME = "packed.cwl" @@ -114,10 +114,6 @@ def is_structured(cwl_type): return properties -def get_fragment(uri): - return uri.rsplit("#", 1)[-1] - - def get_relative_uri(uri): doc, fragment = uri.rsplit("#", 1) return f"{doc.rsplit('/', 1)[-1]}#{fragment}" @@ -148,62 +144,25 @@ def build_step_graph(cwl_wf): return graph -def normalize_cwl_defs(cwl_defs): - inline_tools = {} - for d in cwl_defs.values(): - if not hasattr(d, "steps") or not d.steps: - continue - for s in d.steps: - if hasattr(s, "run") and s.run: - if hasattr(s.run, "id"): - tool = s.run - if tool.id.startswith("_:"): # CWL > 1.0 - tool.id = f"{s.id}/run" - inline_tools[get_fragment(tool.id)] = tool - s.run = tool.id - cwl_defs.update(inline_tools) - - -def get_workflow(wf_path): - """\ - Read the packed CWL workflow. - - Returns a dictionary where tools / workflows are mapped by their ids. - - Does not use load_document_by_uri, so we can hack the json to work around - issues. - """ - wf_path = Path(wf_path) - with open(wf_path, "rt") as f: - json_wf = json.load(f) - graph = json_wf.get("$graph", [json_wf]) - # https://github.com/common-workflow-language/cwltool/pull/1506 - for n in graph: - ns = n.pop("$namespaces", {}) - if ns: - json_wf.setdefault("$namespaces", {}).update(ns) - defs = load_document_by_yaml(json_wf, wf_path.absolute().as_uri(), load_all=True) - if not isinstance(defs, list): - defs = [defs] - def_map = {} - for d in defs: - k = get_fragment(d.id) - if k == "main": - k = wf_path.name - def_map[k] = d - normalize_cwl_defs(def_map) - return def_map +def get_fragment(uri): + return uri.rsplit("#", 1)[-1] class ProvCrateBuilder: - def __init__(self, root, workflow_name=None, license=None, readme=None): + def __init__(self, + root, + converter=CONVERTERS["cwl"], + workflow_name=None, + license=None, + readme=None): self.root = Path(root) + self.converter = converter self.workflow_name = workflow_name self.license = license self.readme = Path(readme) if readme else readme self.wf_path = self.root / "workflow" / WORKFLOW_BASENAME - self.cwl_defs = get_workflow(self.wf_path) + self.cwl_defs = self.converter.get_workflow(self.wf_path) self.step_maps = self._get_step_maps(self.cwl_defs) self.ro = ResearchObject(BDBag(str(root))) self.with_prov = set(str(_) for _ in self.ro.resources_with_provenance()) diff --git a/src/runcrate/converters/__init__.py b/src/runcrate/converters/__init__.py new file mode 100644 index 0000000..7460165 --- /dev/null +++ b/src/runcrate/converters/__init__.py @@ -0,0 +1,7 @@ +from .base import converter +from .cwl import cwlConverter + +CONVERTERS = { + "base": converter(), + "cwl": cwlConverter(), +} diff --git a/src/runcrate/converters/__pycache__/__init__.cpython-312.pyc b/src/runcrate/converters/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bd230982ec500f71cd7ebb8e40ff0b25844105c GIT binary patch literal 322 zcmX@j%ge<81obN%(i?#EV-N=hn4yf%RzSvdh7^Vr#vF!R#wbQc5SuB7DVI5lnUR5s zA(e48R4I_n3}dG-&tZyUsbtY)eaQ%v)?~cJnVg?jmReMjS_I_qB$wwn!}(0NSdtQp zQ*SW?Mg25ci&%j=ia-u1VgnH@K;jmcv%g=MYfy-5Q1D8I&p;7|UvByt`MIh3`Q`cf zIr;%b`B|ySCB^zi21X|OMWuPkMTsS;`o%@b2oBr@#rpB_nR%Hd@$q^EmA5!-a`RJ4 zb5iY!xPiukoLejhBt9@RGBVy}kiN?x`;m=-SFGFnCbxLE-A7htKBh*lB2J(Z097_m AxBvhE literal 0 HcmV?d00001 diff --git a/src/runcrate/converters/__pycache__/base.cpython-312.pyc b/src/runcrate/converters/__pycache__/base.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bf58f88f68c43cebf9bf26c83435d198f7add58e GIT binary patch literal 724 zcmY*Wy>8nu5GE-ru?-n5G87%TYO#u-V^I`cvKd83FN7k}wo*xSk8}iiY60g7Is|=$ zAdiw4D75wk3VSHfsXYBS$qmQvj^BNEM?Op@W8&<`&kxJJJAY%)+n@u7*ACngMlOgX zJ0j^lYj5VHVxgDxd^~9vbTx|1sU&kI>5j}j$($clUO1>ni7u`cSOrO6(#v=b9I*E- zsYpm?ch1!Vqq2oTD*yK}TIpOxy3{(0K7qbci8ayroAb93Y>EV8t0Du*V{q(cqRZH* z>AI?iLZn4%h1i}QZ>N0@*E7Cz-^j0FaBzSm-O7j?dpacz+4hEacA+e{E5%C6KtLb>$(?2V_fx_^Z%QUg6E@G&7xl*9Gb^93}9hDr(W%GEwR+3*< zsF?y!rI{YpW`JXd3{KoppOy=O?F()kA=v!++>IYA--_U*UpAt_o;cU&{0D6AsRCep4-%KwjE&XLnXV zUU~=4oO|!N-#Pd1JNK_WJ)H>3-HWrSvV_oMRtbvGRQBHlrGhl1VH0KW0>(%SX`C6# za0{HT^CrK*Lz`d<8F4|xD1_#aCf-Gwq~i^)fwLg3@bT~y47E7+I+C_^M<p0I9c z;onuA<#|vBOk33y%}(Yry5%T~E6R$RF_R6HhGZbD>`#EGARR4W4K0K;d>1WnS_mYs zaUcba2Pv)yanb8e=FJ}lPLlzG4+DY)xzGD(D-g~6=YB|3HtTDCL#l%2P2 zbLw4Ur}d;mr!KsDVS0+>tRzt#eTtIgL8Uc8It88+*%dFWD28P?ic%Oj97`f-W?Mno zs{aDj!`=I!pk#>2W)b1<$N3txR*=)0gNrwsdJ#L#~qeF^CnT)z^9C!~z?ab8)&as2%(_nreFBpe@jXK27+Y7#!B3~uRd>DEgKk#30n#i0M- z4;CG6*zLRmO{R&dW>0&iE}UAI>i^{638A->mEtRau`DLBx zvIDn*%2O>QFoM>Mn$7B(ytu+J4I&#FJp?yv>r}REN6t}Qe(uiai1CU{Z8@*YN!5~< z)#NQdvNW|VaOZn!MWzxvXKAuQX-)?hLBL*!IP(Q@B1P2fa-nO&QZqWus!5$*2c8N0 zK2_87_>5{;@fcwj#N+L(Zh1WOZn#y2N;7oUXKDe$`VmijMu!7p?N%u$m>%qn`BheDCi0h zg909Cyo<0a8d}mpqBZq~w=8!Jm+aEiYtIk7xFt1X6(o_ps*STa*nBmI3~x zn}Kz+0D{dfj2t1M!(Hr3QlNGqqG$c6Z>XIA_*_Za?TMAYQ=Q)Kk!v%%gC{G-no+|a zo29vLICSFlKchpHuC=b}TeW=6*owYd5+26}OEdLYZ)IR@pj@miZpGrIH@rx!9v#|= zj&DZCYd`p6d@FisCwgTwdgaUa9!0P1NvL<|Sr|npYd6+YJBh2CiK|ksiBh`uD@Y|8edjD|wd!^ZWZ+}VpDmDNPqy0_v^Pf$BI=vNr3ECpj z@|z$2tlmFbd1vjNGOmxDEX|c?KK#*M2kIT%>qH$rJ7IY^Ntn~a z9{F2;C98w8QSMJsVRl41!pJOP_>BAtC>3Dbbc>Urp;Z*N6T z*Ty#^lMlm_gL=aG}~heFX$d`T?!Y zs9zyXxEFE-rL_jG;D&&%+oU*1k!=tH4~**DTe!T^O7N@qd9?Z$NC>!Xl58b(jNnNl z&U*rN^epu_c(0H#@K6XTuNQdsVu^=6@s4T&d3w^aN>#@p1Tq>T(@Zd1pgu<>4$T|R zvVRl_JIHadaah9vZF${>Wg3>Q_z8i$%tm6MD8I^>vcX9{9TAqr3i8nt{X7#+fffBd z2sqb6Bc<+oY@{-_HuksJcxh(0?_~9SHMQM$c3pjt`faKn9j*@yR&M#+v|8w0yTFII2g&wY}sX^)1!TOX5|UghhfWA`tAa(QR;+~(-Hb!~g}Vm 1.0 + tool.id = f"{s.id}/run" + inline_tools[self._get_fragment(tool.id)] = tool + s.run = tool.id + cwl_defs.update(inline_tools) diff --git a/tests/test_cli.py b/tests/test_cli.py index 8d8d357..2428ec6 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -26,7 +26,7 @@ def test_cli_convert(data_dir, tmpdir, monkeypatch): monkeypatch.chdir(str(tmpdir)) root = data_dir / "revsort-run-1" runner = CliRunner() - args = ["convert", str(root)] + args = ["convert", "-c", "cwl", str(root)] result = runner.invoke(cli, args) assert result.exit_code == 0, result.exception crate_zip = tmpdir / f"{root.name}.crate.zip" diff --git a/tests/test_step_mapping.py b/tests/test_step_mapping.py index 0a106c7..b5ecac9 100644 --- a/tests/test_step_mapping.py +++ b/tests/test_step_mapping.py @@ -12,13 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -from runcrate.convert import ProvCrateBuilder, get_workflow +import pytest +from runcrate.convert import ProvCrateBuilder +from runcrate.converters.cwl import cwlConverter -def test_step_maps(data_dir): + +@pytest.fixture +def converter(): + converter = cwlConverter() + return converter + + +def test_step_maps_cwl(data_dir, converter): wf_basename = "exome-alignment-packed.cwl" wf_path = data_dir / wf_basename - cwl_defs = get_workflow(wf_path) + cwl_defs = converter.get_workflow(wf_path) step_maps = ProvCrateBuilder._get_step_maps(cwl_defs) assert set(step_maps) == {wf_basename} sm = step_maps[wf_basename] @@ -39,9 +48,9 @@ def test_step_maps(data_dir): assert sm["main/samtools_sort"]["pos"] < sm["main/picard_markduplicates"]["pos"] -def test_step_maps_disconnected(data_dir): +def test_step_maps_disconnected_cwl(data_dir, converter): wf_path = data_dir / "no-output-run-1/workflow/packed.cwl" - cwl_defs = get_workflow(wf_path) + cwl_defs = converter.get_workflow(wf_path) step_maps = ProvCrateBuilder._get_step_maps(cwl_defs) assert set(step_maps) == {"packed.cwl"} sm = step_maps["packed.cwl"] From a26fc10d40c6b61ba7215dd44e40a8ac3b1af08d Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Mon, 25 Nov 2024 08:51:49 +0000 Subject: [PATCH 02/23] Add standard python gitignore from github. Remove files which fall foul of it --- .gitignore | 166 +++++++++++++++++- .../__pycache__/cwl.cpython-312.pyc | Bin 3177 -> 0 bytes 2 files changed, 162 insertions(+), 4 deletions(-) delete mode 100644 src/runcrate/converters/__pycache__/cwl.cpython-312.pyc diff --git a/.gitignore b/.gitignore index 2ca8682..efa407c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,162 @@ -_site/ -.sass-cache/ -.jekyll-cache/ -.jekyll-metadata +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ \ No newline at end of file diff --git a/src/runcrate/converters/__pycache__/cwl.cpython-312.pyc b/src/runcrate/converters/__pycache__/cwl.cpython-312.pyc deleted file mode 100644 index 4059217289c3814aab0c3c56eeede7c10413dc52..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3177 zcmaJ@U1%KF6~1@pZ}sQh)xWGb8q1QKP2*K;%XJ!DW2GWWN)RD2xE7RQv@=@GdUj@g z@2t1l&eA^kVG{&dk-~0hjWRCep4-%KwjE&XLnXV zUU~=4oO|!N-#Pd1JNK_WJ)H>3-HWrSvV_oMRtbvGRQBHlrGhl1VH0KW0>(%SX`C6# za0{HT^CrK*Lz`d<8F4|xD1_#aCf-Gwq~i^)fwLg3@bT~y47E7+I+C_^M<p0I9c z;onuA<#|vBOk33y%}(Yry5%T~E6R$RF_R6HhGZbD>`#EGARR4W4K0K;d>1WnS_mYs zaUcba2Pv)yanb8e=FJ}lPLlzG4+DY)xzGD(D-g~6=YB|3HtTDCL#l%2P2 zbLw4Ur}d;mr!KsDVS0+>tRzt#eTtIgL8Uc8It88+*%dFWD28P?ic%Oj97`f-W?Mno zs{aDj!`=I!pk#>2W)b1<$N3txR*=)0gNrwsdJ#L#~qeF^CnT)z^9C!~z?ab8)&as2%(_nreFBpe@jXK27+Y7#!B3~uRd>DEgKk#30n#i0M- z4;CG6*zLRmO{R&dW>0&iE}UAI>i^{638A->mEtRau`DLBx zvIDn*%2O>QFoM>Mn$7B(ytu+J4I&#FJp?yv>r}REN6t}Qe(uiai1CU{Z8@*YN!5~< z)#NQdvNW|VaOZn!MWzxvXKAuQX-)?hLBL*!IP(Q@B1P2fa-nO&QZqWus!5$*2c8N0 zK2_87_>5{;@fcwj#N+L(Zh1WOZn#y2N;7oUXKDe$`VmijMu!7p?N%u$m>%qn`BheDCi0h zg909Cyo<0a8d}mpqBZq~w=8!Jm+aEiYtIk7xFt1X6(o_ps*STa*nBmI3~x zn}Kz+0D{dfj2t1M!(Hr3QlNGqqG$c6Z>XIA_*_Za?TMAYQ=Q)Kk!v%%gC{G-no+|a zo29vLICSFlKchpHuC=b}TeW=6*owYd5+26}OEdLYZ)IR@pj@miZpGrIH@rx!9v#|= zj&DZCYd`p6d@FisCwgTwdgaUa9!0P1NvL<|Sr|npYd6+YJBh2CiK|ksiBh`uD@Y|8edjD|wd!^ZWZ+}VpDmDNPqy0_v^Pf$BI=vNr3ECpj z@|z$2tlmFbd1vjNGOmxDEX|c?KK#*M2kIT%>qH$rJ7IY^Ntn~a z9{F2;C98w8QSMJsVRl41!pJOP_>BAtC>3Dbbc>Urp;Z*N6T z*Ty#^lMlm_gL=aG}~heFX$d`T?!Y zs9zyXxEFE-rL_jG;D&&%+oU*1k!=tH4~**DTe!T^O7N@qd9?Z$NC>!Xl58b(jNnNl z&U*rN^epu_c(0H#@K6XTuNQdsVu^=6@s4T&d3w^aN>#@p1Tq>T(@Zd1pgu<>4$T|R zvVRl_JIHadaah9vZF${>Wg3>Q_z8i$%tm6MD8I^>vcX9{9TAqr3i8nt{X7#+fffBd z2sqb6Bc<+oY@{-_HuksJcxh(0?_~9SHMQM$c3pjt`faKn9j*@yR&M#+v|8w0yTFII2g&wY}sX^)1!TOX5|UghhfWA`tAa(QR;+~(-Hb!~g}Vm Date: Mon, 25 Nov 2024 08:53:47 +0000 Subject: [PATCH 03/23] Move helper functions out of cwl class --- src/runcrate/converters/cwl.py | 39 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/src/runcrate/converters/cwl.py b/src/runcrate/converters/cwl.py index 2d9d975..15f4f56 100644 --- a/src/runcrate/converters/cwl.py +++ b/src/runcrate/converters/cwl.py @@ -5,6 +5,24 @@ from cwl_utils.parser import load_document_by_yaml +def _get_fragment(uri): + return uri.rsplit("#", 1)[-1] + +def _normalize_cwl_defs(cwl_defs): + inline_tools = {} + for d in cwl_defs.values(): + if not hasattr(d, "steps") or not d.steps: + continue + for s in d.steps: + if hasattr(s, "run") and s.run: + if hasattr(s.run, "id"): + tool = s.run + if tool.id.startswith("_:"): # CWL > 1.0 + tool.id = f"{s.id}/run" + inline_tools[_get_fragment(tool.id)] = tool + s.run = tool.id + cwl_defs.update(inline_tools) + class cwlConverter(converter): def __init__(self): @@ -34,27 +52,10 @@ def get_workflow(self, wf_path): defs = [defs] def_map = {} for d in defs: - k = self._get_fragment(d.id) + k = _get_fragment(d.id) if k == "main": k = wf_path.name def_map[k] = d - self._normalize_cwl_defs(def_map) + _normalize_cwl_defs(def_map) return def_map - def _get_fragment(self, uri): - return uri.rsplit("#", 1)[-1] - - def _normalize_cwl_defs(self, cwl_defs): - inline_tools = {} - for d in cwl_defs.values(): - if not hasattr(d, "steps") or not d.steps: - continue - for s in d.steps: - if hasattr(s, "run") and s.run: - if hasattr(s.run, "id"): - tool = s.run - if tool.id.startswith("_:"): # CWL > 1.0 - tool.id = f"{s.id}/run" - inline_tools[self._get_fragment(tool.id)] = tool - s.run = tool.id - cwl_defs.update(inline_tools) From 47788cc8731d94d1131ebd6ee2af47614b0fe8cd Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Mon, 25 Nov 2024 09:43:54 +0000 Subject: [PATCH 04/23] Ignore pycs --- .gitignore | 1 + .../__pycache__/__init__.cpython-312.pyc | Bin 322 -> 0 bytes .../converters/__pycache__/base.cpython-312.pyc | Bin 724 -> 0 bytes 3 files changed, 1 insertion(+) delete mode 100644 src/runcrate/converters/__pycache__/__init__.cpython-312.pyc delete mode 100644 src/runcrate/converters/__pycache__/base.cpython-312.pyc diff --git a/.gitignore b/.gitignore index efa407c..522376a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ __pycache__/ *.py[cod] *$py.class +*.pyc # C extensions *.so diff --git a/src/runcrate/converters/__pycache__/__init__.cpython-312.pyc b/src/runcrate/converters/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 4bd230982ec500f71cd7ebb8e40ff0b25844105c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 322 zcmX@j%ge<81obN%(i?#EV-N=hn4yf%RzSvdh7^Vr#vF!R#wbQc5SuB7DVI5lnUR5s zA(e48R4I_n3}dG-&tZyUsbtY)eaQ%v)?~cJnVg?jmReMjS_I_qB$wwn!}(0NSdtQp zQ*SW?Mg25ci&%j=ia-u1VgnH@K;jmcv%g=MYfy-5Q1D8I&p;7|UvByt`MIh3`Q`cf zIr;%b`B|ySCB^zi21X|OMWuPkMTsS;`o%@b2oBr@#rpB_nR%Hd@$q^EmA5!-a`RJ4 zb5iY!xPiukoLejhBt9@RGBVy}kiN?x`;m=-SFGFnCbxLE-A7htKBh*lB2J(Z097_m AxBvhE diff --git a/src/runcrate/converters/__pycache__/base.cpython-312.pyc b/src/runcrate/converters/__pycache__/base.cpython-312.pyc deleted file mode 100644 index bf58f88f68c43cebf9bf26c83435d198f7add58e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 724 zcmY*Wy>8nu5GE-ru?-n5G87%TYO#u-V^I`cvKd83FN7k}wo*xSk8}iiY60g7Is|=$ zAdiw4D75wk3VSHfsXYBS$qmQvj^BNEM?Op@W8&<`&kxJJJAY%)+n@u7*ACngMlOgX zJ0j^lYj5VHVxgDxd^~9vbTx|1sU&kI>5j}j$($clUO1>ni7u`cSOrO6(#v=b9I*E- zsYpm?ch1!Vqq2oTD*yK}TIpOxy3{(0K7qbci8ayroAb93Y>EV8t0Du*V{q(cqRZH* z>AI?iLZn4%h1i}QZ>N0@*E7Cz-^j0FaBzSm-O7j?dpacz+4hEacA+e{E5%C6KtLb>$(?2V_fx_^Z%QUg6E@G&7xl*9Gb^93}9hDr(W%GEwR+3*< zsF?y!rI{YpW`JXd3{KoppOy=O?F()kA=v!++>IYA--_U*UpAt_o;cU&{0D6As Date: Mon, 25 Nov 2024 09:44:26 +0000 Subject: [PATCH 05/23] Move get_step_maps to cwl class --- src/runcrate/convert.py | 31 +------------------------------ src/runcrate/converters/base.py | 13 +++++++++++++ src/runcrate/converters/cwl.py | 30 ++++++++++++++++++++++++++++++ tests/test_step_mapping.py | 4 ++-- 4 files changed, 46 insertions(+), 32 deletions(-) diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index bbe355e..ed7c61c 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -24,7 +24,6 @@ from io import StringIO from pathlib import Path -import networkx as nx import prov.model from bdbag.bdbagit import BDBag from cwlprov.prov import Entity, Provenance @@ -126,22 +125,6 @@ def cut_step_part(relative_uri): return relative_uri -def build_step_graph(cwl_wf): - out_map = {} - for s in cwl_wf.steps: - for o in s.out: - out_map[o] = get_fragment(s.id) - graph = nx.DiGraph() - for s in cwl_wf.steps: - fragment = get_fragment(s.id) - graph.add_node(fragment) - for i in s.in_: - sources = [i.source] if not isinstance(i.source, list) else i.source - for s in sources: - source_fragment = out_map.get(s) - if source_fragment: - graph.add_edge(source_fragment, fragment) - return graph def get_fragment(uri): @@ -163,7 +146,7 @@ def __init__(self, self.readme = Path(readme) if readme else readme self.wf_path = self.root / "workflow" / WORKFLOW_BASENAME self.cwl_defs = self.converter.get_workflow(self.wf_path) - self.step_maps = self._get_step_maps(self.cwl_defs) + self.step_maps = self.converter.get_step_maps(self.cwl_defs) self.ro = ResearchObject(BDBag(str(root))) self.with_prov = set(str(_) for _ in self.ro.resources_with_provenance()) self.workflow_run = Provenance(self.ro).activity() @@ -178,18 +161,6 @@ def __init__(self, self.file_map = {} self.manifest = self._get_manifest() - @staticmethod - def _get_step_maps(cwl_defs): - rval = {} - for k, v in cwl_defs.items(): - if hasattr(v, "steps"): - graph = build_step_graph(v) - pos_map = {f: i for i, f in enumerate(nx.topological_sort(graph))} - rval[k] = {} - for s in v.steps: - f = get_fragment(s.id) - rval[k][f] = {"tool": get_fragment(s.run), "pos": pos_map[f]} - return rval def _get_manifest(self): manifest = {} diff --git a/src/runcrate/converters/base.py b/src/runcrate/converters/base.py index b5320dc..5321aa6 100644 --- a/src/runcrate/converters/base.py +++ b/src/runcrate/converters/base.py @@ -8,3 +8,16 @@ def get_workflow(self, wf_path): Returns a dictionary where tools / workflows are mapped by their ids. """ + raise NotImplementedError("get_workflow") + + def get_step_maps(self, wf_defs): + """\ + Get a mapping of step names to their tool names and positions. + """ + raise NotImplementedError("get_step_maps") + + def build_step_graph(self, wf): + """\ + Build a graph of steps in the workflow. + """ + raise NotImplementedError("build_step_graph") diff --git a/src/runcrate/converters/cwl.py b/src/runcrate/converters/cwl.py index 15f4f56..fbfe967 100644 --- a/src/runcrate/converters/cwl.py +++ b/src/runcrate/converters/cwl.py @@ -2,6 +2,7 @@ from pathlib import Path import json +import networkx as nx from cwl_utils.parser import load_document_by_yaml @@ -59,3 +60,32 @@ def get_workflow(self, wf_path): _normalize_cwl_defs(def_map) return def_map + def get_step_maps(self, cwl_defs): + rval = {} + for k, v in cwl_defs.items(): + if hasattr(v, "steps"): + graph = self.build_step_graph(v) + pos_map = {f: i for i, f in enumerate(nx.topological_sort(graph))} + rval[k] = {} + for s in v.steps: + f = _get_fragment(s.id) + rval[k][f] = {"tool": _get_fragment(s.run), "pos": pos_map[f]} + return rval + + def build_step_graph(self, cwl_wf): + out_map = {} + for s in cwl_wf.steps: + for o in s.out: + out_map[o] = _get_fragment(s.id) + graph = nx.DiGraph() + for s in cwl_wf.steps: + fragment = _get_fragment(s.id) + graph.add_node(fragment) + for i in s.in_: + sources = [i.source] if not isinstance(i.source, list) else i.source + for s in sources: + source_fragment = out_map.get(s) + if source_fragment: + graph.add_edge(source_fragment, fragment) + return graph + diff --git a/tests/test_step_mapping.py b/tests/test_step_mapping.py index b5ecac9..79f879e 100644 --- a/tests/test_step_mapping.py +++ b/tests/test_step_mapping.py @@ -28,7 +28,7 @@ def test_step_maps_cwl(data_dir, converter): wf_basename = "exome-alignment-packed.cwl" wf_path = data_dir / wf_basename cwl_defs = converter.get_workflow(wf_path) - step_maps = ProvCrateBuilder._get_step_maps(cwl_defs) + step_maps = converter.get_step_maps(cwl_defs) assert set(step_maps) == {wf_basename} sm = step_maps[wf_basename] assert len(sm) == 8 @@ -51,7 +51,7 @@ def test_step_maps_cwl(data_dir, converter): def test_step_maps_disconnected_cwl(data_dir, converter): wf_path = data_dir / "no-output-run-1/workflow/packed.cwl" cwl_defs = converter.get_workflow(wf_path) - step_maps = ProvCrateBuilder._get_step_maps(cwl_defs) + step_maps = converter.get_step_maps(cwl_defs) assert set(step_maps) == {"packed.cwl"} sm = step_maps["packed.cwl"] assert set(sm) == {"main/date_step", "main/echo_step", "main/date2_step"} From 3ced681c139ed06ca9c1d0975d75b537f50dbe27 Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Mon, 25 Nov 2024 09:48:53 +0000 Subject: [PATCH 06/23] Apply linting --- src/runcrate/cli.py | 2 +- src/runcrate/convert.py | 5 +---- src/runcrate/converters/__init__.py | 1 + src/runcrate/converters/cwl.py | 11 ++++++----- tests/test_step_mapping.py | 1 - 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/runcrate/cli.py b/src/runcrate/cli.py index 1bfb1bc..7e5cbd2 100644 --- a/src/runcrate/cli.py +++ b/src/runcrate/cli.py @@ -19,9 +19,9 @@ from . import __version__ from .convert import ProvCrateBuilder +from .converters import CONVERTERS from .report import dump_crate_actions from .run import run_crate -from .converters import CONVERTERS @click.group() diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index ed7c61c..9037809 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -34,8 +34,8 @@ from rocrate.rocrate import ROCrate from .constants import PROFILES_BASE, PROFILES_VERSION, TERMS_NAMESPACE -from .utils import as_list, parse_img from .converters import CONVERTERS +from .utils import as_list, parse_img WORKFLOW_BASENAME = "packed.cwl" @@ -125,8 +125,6 @@ def cut_step_part(relative_uri): return relative_uri - - def get_fragment(uri): return uri.rsplit("#", 1)[-1] @@ -161,7 +159,6 @@ def __init__(self, self.file_map = {} self.manifest = self._get_manifest() - def _get_manifest(self): manifest = {} with open(self.root / Path(MANIFEST_FILE)) as f: diff --git a/src/runcrate/converters/__init__.py b/src/runcrate/converters/__init__.py index 7460165..d4d1d78 100644 --- a/src/runcrate/converters/__init__.py +++ b/src/runcrate/converters/__init__.py @@ -1,6 +1,7 @@ from .base import converter from .cwl import cwlConverter + CONVERTERS = { "base": converter(), "cwl": cwlConverter(), diff --git a/src/runcrate/converters/cwl.py b/src/runcrate/converters/cwl.py index fbfe967..30c328c 100644 --- a/src/runcrate/converters/cwl.py +++ b/src/runcrate/converters/cwl.py @@ -1,14 +1,16 @@ -from .base import converter - -from pathlib import Path import json -import networkx as nx +from pathlib import Path +import networkx as nx from cwl_utils.parser import load_document_by_yaml +from .base import converter + + def _get_fragment(uri): return uri.rsplit("#", 1)[-1] + def _normalize_cwl_defs(cwl_defs): inline_tools = {} for d in cwl_defs.values(): @@ -88,4 +90,3 @@ def build_step_graph(self, cwl_wf): if source_fragment: graph.add_edge(source_fragment, fragment) return graph - diff --git a/tests/test_step_mapping.py b/tests/test_step_mapping.py index 79f879e..ac4fed1 100644 --- a/tests/test_step_mapping.py +++ b/tests/test_step_mapping.py @@ -14,7 +14,6 @@ import pytest -from runcrate.convert import ProvCrateBuilder from runcrate.converters.cwl import cwlConverter From 82473285c0ef5421ddd351479d05c385fa3cd8be Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Mon, 25 Nov 2024 13:31:11 +0000 Subject: [PATCH 07/23] Apply linting --- src/runcrate/convert.py | 461 ++++++++++++++++++---------------------- 1 file changed, 201 insertions(+), 260 deletions(-) diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index 9037809..c60ae36 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -131,6 +131,9 @@ def get_fragment(uri): class ProvCrateBuilder: + # -------------------------------------------------------------------------- + # Public methods, called by the CLI + def __init__(self, root, converter=CONVERTERS["cwl"], @@ -159,61 +162,6 @@ def __init__(self, self.file_map = {} self.manifest = self._get_manifest() - def _get_manifest(self): - manifest = {} - with open(self.root / Path(MANIFEST_FILE)) as f: - for line in f: - hash_, relpath = line.strip().split(None, 1) - manifest[hash_] = self.root / relpath - return manifest - - def _resolve_plan(self, activity): - job_qname = activity.plan() - plan = activity.provenance.entity(job_qname) - if not plan: - m = SCATTER_JOB_PATTERN.match(str(job_qname)) - if m: - plan = activity.provenance.entity(m.groups()[0]) - return plan - - def _get_hash(self, prov_param): - k = prov_param.id.localpart - try: - return self.hashes[k] - except KeyError: - type_names = frozenset(str(_) for _ in prov_param.types()) - if "wf4ever:File" in type_names: - hash_ = next(prov_param.specializationOf()).id.localpart - self.hashes[k] = hash_ - return hash_ - elif "ro:Folder" in type_names: - m = hashlib.sha1() - m.update("".join(sorted( - self._get_hash(_) for _ in self.get_dict(prov_param).values() - )).encode()) - self.hashes[k] = hash_ = m.hexdigest() - return hash_ - - def _get_hashes(self, provenance): - for r in provenance.prov_doc.get_records(prov.model.ProvEntity): - self._get_hash(Entity(provenance, r)) - - def get_members(self, entity): - membership = entity.provenance.record_with_attr( - prov.model.ProvMembership, entity.id, prov.model.PROV_ATTR_COLLECTION - ) - member_ids = (_.get_attribute(prov.model.PROV_ATTR_ENTITY) for _ in membership) - return (entity.provenance.entity(first(_)) for _ in member_ids) - - def get_dict(self, entity): - d = {} - for qname in entity.record.get_attribute("prov:hadDictionaryMember"): - kvp = entity.provenance.entity(qname) - key = first(kvp.record.get_attribute("prov:pairKey")) - entity_id = first(kvp.record.get_attribute("prov:pairEntity")) - d[key] = entity.provenance.entity(entity_id) - return d - def build(self): crate = ROCrate(gen_preview=False) crate.metadata.extra_contexts.append(TERMS_NAMESPACE) @@ -227,6 +175,9 @@ def build(self): self.add_output_formats(crate) return crate + # -------------------------------------------------------------------------- + # Top level methods, called by build() + def add_root_metadata(self, crate): if self.license: crate.root_dataset["license"] = self.license @@ -280,6 +231,144 @@ def add_workflow(self, crate): self.add_param_connections(crate, workflow) return workflow + def add_engine_run(self, crate): + engine = self.workflow_run.start().starter_activity() + roc_engine = crate.add(SoftwareApplication(crate, properties={ + "name": engine.label or "workflow engine" + })) + roc_engine_run = crate.add(ContextEntity(crate, properties={ + "@type": "OrganizeAction", + "name": f"Run of {roc_engine['name']}", + "startTime": engine.start().time.isoformat(), + })) + roc_engine_run["instrument"] = roc_engine + self.add_agent(crate, roc_engine_run, engine) + self.roc_engine_run = roc_engine_run + + def add_action(self, crate, activity, parent_instrument=None): + workflow = crate.mainEntity + action = crate.add(ContextEntity(crate, properties={ + "@type": "CreateAction", + "name": activity.label, + })) + plan = self._resolve_plan(activity) + plan_tag = plan.id.localpart + if plan_tag == "main": + assert str(activity.type) == "wfprov:WorkflowRun" + instrument = workflow + self.roc_engine_run["result"] = action + crate.root_dataset["mentions"] = [action] + + def to_wf_p(k): + return k + else: + parent_instrument_fragment = get_fragment(parent_instrument.id) + if parent_instrument_fragment != WORKFLOW_BASENAME: + parts = plan_tag.split("/", 1) + if parts[0] == "main": + parts[0] = parent_instrument_fragment + plan_tag = "/".join(parts) + tool_name = self.step_maps[parent_instrument_fragment][plan_tag]["tool"] + instrument = crate.dereference(f"{workflow.id}#{tool_name}") + control_action = self.control_actions.get(plan_tag) + if not control_action: + control_action = crate.add(ContextEntity(crate, properties={ + "@type": "ControlAction", + "name": f"orchestrate {tool_name}", + })) + step = crate.dereference(f"{workflow.id}#{plan_tag}") + control_action["instrument"] = step + self.roc_engine_run.append_to("object", control_action, compact=True) + self.control_actions[plan_tag] = control_action + control_action.append_to("object", action, compact=True) + if activity.uri in self.with_prov: + nested_prov = Provenance(self.ro, activity.uri) + activity = nested_prov.activity() + + def to_wf_p(k): + return k.replace(activity.plan().localpart, tool_name) + self._get_hashes(activity.provenance) + action["instrument"] = instrument + action["startTime"] = activity.start().time.isoformat() + action["endTime"] = activity.end().time.isoformat() + action["object"] = self.add_action_params(crate, activity, to_wf_p, "usage") + action["result"] = self.add_action_params(crate, activity, to_wf_p, "generation") + self.add_container_images(crate, action, activity) + for job in activity.steps(): + self.add_action(crate, job, parent_instrument=instrument) + + def patch_workflow_input_collection(self, crate, wf=None): + """\ + CWLProv records secondary files only in step runs, not in the workflow + run. Thus, when the conversion of parameter values is completed, + workflow-level parameters with secondary files get mapped to the main + entity of the collection alone (a File). This method fixes the mapping + by retrieving the correct Collection entity from the relevant tool + execution. + """ + if wf is None: + wf = crate.mainEntity + sel = [_ for _ in crate.contextual_entities + if "CreateAction" in as_list(_.type) and _.get("instrument") is wf] + if not sel: + return # skipped subworkflow + wf_action = sel[0] + connections = [_ for _ in crate.contextual_entities + if "ParameterConnection" in as_list(_.type)] + for param in wf.get("input", []): + if param.get("additionalType") == "Collection": + src_sel = [_ for _ in wf_action.get("object", []) + if param in as_list(_.get("exampleOfWork"))] + if not src_sel: + raise RuntimeError(f"object for param {param.id} not found") + obj = src_sel[0] + if obj.type != "Collection": + param_connections = [_ for _ in connections if _["sourceParameter"] is param] + if not param_connections: + continue + pc = param_connections[0] + tgt_param = pc["targetParameter"] + tgt_sel = [_ for _ in crate.get_entities() + if tgt_param in as_list(_.get("exampleOfWork"))] + if not tgt_sel: + raise RuntimeError(f"object for param {tgt_param.id} not found") + tgt_obj = tgt_sel[0] + wf_action["object"] = [ + _ for _ in as_list(wf_action["object"]) if _ is not obj + ] + [tgt_obj] + tgt_obj.append_to("exampleOfWork", param) + obj["exampleOfWork"] = [_ for _ in as_list(obj["exampleOfWork"]) + if _ is not param] + if len(obj["exampleOfWork"]) == 1: + obj["exampleOfWork"] = obj["exampleOfWork"][0] + if len(obj["exampleOfWork"]) == 0: + del obj["exampleOfWork"] + for tool in wf.get("hasPart", []): + if "ComputationalWorkflow" in as_list(tool.type): + self.patch_workflow_input_collection(crate, wf=tool) + + def add_inputs_file(self, crate): + path = self.root / "workflow" / INPUTS_FILE_BASENAME + if path.is_file(): + with open(path) as f: + data = json.load(f) + data = self._map_input_data(crate, data) + source = StringIO(json.dumps(data, indent=4)) + crate.add_file(source, path.name, properties={ + "name": "input object document", + "encodingFormat": "application/json", + }) + + def add_output_formats(self, crate): + path = self.root / "workflow" / OUTPUTS_FILE_BASENAME + if path.is_file(): + with open(path) as f: + data = json.load(f) + self._map_input_data(crate, data) + + # -------------------------------------------------------------------------- + # Internal methods, called by the top level methods + def add_step(self, crate, workflow, cwl_step): step_fragment = get_fragment(cwl_step.id) step_id = f"{self.wf_path.name}#{step_fragment}" @@ -370,20 +459,6 @@ def add_params(self, crate, cwl_params): params.append(p) return params - def add_engine_run(self, crate): - engine = self.workflow_run.start().starter_activity() - roc_engine = crate.add(SoftwareApplication(crate, properties={ - "name": engine.label or "workflow engine" - })) - roc_engine_run = crate.add(ContextEntity(crate, properties={ - "@type": "OrganizeAction", - "name": f"Run of {roc_engine['name']}", - "startTime": engine.start().time.isoformat(), - })) - roc_engine_run["instrument"] = roc_engine - self.add_agent(crate, roc_engine_run, engine) - self.roc_engine_run = roc_engine_run - def add_agent(self, crate, roc_engine_run, engine): delegate = engine.start().starter_activity() try: @@ -408,58 +483,6 @@ def add_agent(self, crate, roc_engine_run, engine): ro_a = crate.add(ContextEntity(crate, agent_id, properties=properties)) roc_engine_run.append_to("agent", ro_a, compact=True) - def add_action(self, crate, activity, parent_instrument=None): - workflow = crate.mainEntity - action = crate.add(ContextEntity(crate, properties={ - "@type": "CreateAction", - "name": activity.label, - })) - plan = self._resolve_plan(activity) - plan_tag = plan.id.localpart - if plan_tag == "main": - assert str(activity.type) == "wfprov:WorkflowRun" - instrument = workflow - self.roc_engine_run["result"] = action - crate.root_dataset["mentions"] = [action] - - def to_wf_p(k): - return k - else: - parent_instrument_fragment = get_fragment(parent_instrument.id) - if parent_instrument_fragment != WORKFLOW_BASENAME: - parts = plan_tag.split("/", 1) - if parts[0] == "main": - parts[0] = parent_instrument_fragment - plan_tag = "/".join(parts) - tool_name = self.step_maps[parent_instrument_fragment][plan_tag]["tool"] - instrument = crate.dereference(f"{workflow.id}#{tool_name}") - control_action = self.control_actions.get(plan_tag) - if not control_action: - control_action = crate.add(ContextEntity(crate, properties={ - "@type": "ControlAction", - "name": f"orchestrate {tool_name}", - })) - step = crate.dereference(f"{workflow.id}#{plan_tag}") - control_action["instrument"] = step - self.roc_engine_run.append_to("object", control_action, compact=True) - self.control_actions[plan_tag] = control_action - control_action.append_to("object", action, compact=True) - if activity.uri in self.with_prov: - nested_prov = Provenance(self.ro, activity.uri) - activity = nested_prov.activity() - - def to_wf_p(k): - return k.replace(activity.plan().localpart, tool_name) - self._get_hashes(activity.provenance) - action["instrument"] = instrument - action["startTime"] = activity.start().time.isoformat() - action["endTime"] = activity.end().time.isoformat() - action["object"] = self.add_action_params(crate, activity, to_wf_p, "usage") - action["result"] = self.add_action_params(crate, activity, to_wf_p, "generation") - self.add_container_images(crate, action, activity) - for job in activity.steps(): - self.add_action(crate, job, parent_instrument=instrument) - def add_container_images(self, crate, action, activity): images = set() for assoc in activity.association(): @@ -522,77 +545,6 @@ def add_action_params(self, crate, activity, to_wf_p, ptype="usage"): action_params.append(action_p) return action_params - @staticmethod - def _set_alternate_name(prov_param, action_p, parent=None): - basename = getattr(prov_param, "basename", None) - if not basename: - return - if not parent: - action_p["alternateName"] = basename - return - if "alternateName" in parent: - action_p["alternateName"] = (Path(parent["alternateName"]) / basename).as_posix() - - def convert_param(self, prov_param, crate, convert_secondary=True, parent=None): - type_names = frozenset(str(_) for _ in prov_param.types()) - secondary_files = [_.generated_entity() for _ in prov_param.derivations() - if str(_.type) == "cwlprov:SecondaryFile"] - if convert_secondary and secondary_files: - main_entity = self.convert_param(prov_param, crate, convert_secondary=False) - action_p = self.collections.get(main_entity.id) - if not action_p: - action_p = crate.add(ContextEntity(crate, properties={ - "@type": "Collection" - })) - action_p["mainEntity"] = main_entity - action_p["hasPart"] = [main_entity] + [ - self.convert_param(_, crate) for _ in secondary_files - ] - crate.root_dataset.append_to("mentions", action_p) - self.collections[main_entity.id] = action_p - return action_p - if "wf4ever:File" in type_names: - hash_ = self.hashes[prov_param.id.localpart] - dest = Path(parent.id if parent else "") / hash_ - action_p = crate.dereference(dest.as_posix()) - if not action_p: - source = self.manifest[hash_] - action_p = crate.add_file(source, dest, properties={ - "sha1": hash_, - "contentSize": str(Path(source).stat().st_size) - }) - self._set_alternate_name(prov_param, action_p, parent=parent) - try: - source_k = str(source.resolve(strict=False)) - except RuntimeError: - source_k = str(source) - self.file_map[source_k] = dest - return action_p - if "ro:Folder" in type_names: - hash_ = self.hashes[prov_param.id.localpart] - dest = Path(parent.id if parent else "") / hash_ - action_p = crate.dereference(dest.as_posix()) - if not action_p: - action_p = crate.add_directory(dest_path=dest) - self._set_alternate_name(prov_param, action_p, parent=parent) - for child in self.get_dict(prov_param).values(): - part = self.convert_param(child, crate, parent=action_p) - action_p.append_to("hasPart", part) - return action_p - if prov_param.value is not None: - return str(prov_param.value) - if "prov:Dictionary" in type_names: - return dict( - (k, self.convert_param(v, crate)) - for k, v in self.get_dict(prov_param).items() - if k != "@id" - ) - if "prov:Collection" in type_names: - return [self.convert_param(_, crate) for _ in self.get_members(prov_param)] - if prov_param.id.uri == CWLPROV_NONE: - return None - raise RuntimeError(f"No value to convert for {prov_param}") - def add_param_connections(self, crate, workflow): def connect(source, target, entity): connection = crate.add(ContextEntity(crate, properties={ @@ -643,55 +595,63 @@ def connect(source, target, entity): to_param = get_fragment(out.id) connect(from_param, to_param, workflow) - def patch_workflow_input_collection(self, crate, wf=None): - """\ - CWLProv records secondary files only in step runs, not in the workflow - run. Thus, when the conversion of parameter values is completed, - workflow-level parameters with secondary files get mapped to the main - entity of the collection alone (a File). This method fixes the mapping - by retrieving the correct Collection entity from the relevant tool - execution. - """ - if wf is None: - wf = crate.mainEntity - sel = [_ for _ in crate.contextual_entities - if "CreateAction" in as_list(_.type) and _.get("instrument") is wf] - if not sel: - return # skipped subworkflow - wf_action = sel[0] - connections = [_ for _ in crate.contextual_entities - if "ParameterConnection" in as_list(_.type)] - for param in wf.get("input", []): - if param.get("additionalType") == "Collection": - src_sel = [_ for _ in wf_action.get("object", []) - if param in as_list(_.get("exampleOfWork"))] - if not src_sel: - raise RuntimeError(f"object for param {param.id} not found") - obj = src_sel[0] - if obj.type != "Collection": - param_connections = [_ for _ in connections if _["sourceParameter"] is param] - if not param_connections: - continue - pc = param_connections[0] - tgt_param = pc["targetParameter"] - tgt_sel = [_ for _ in crate.get_entities() - if tgt_param in as_list(_.get("exampleOfWork"))] - if not tgt_sel: - raise RuntimeError(f"object for param {tgt_param.id} not found") - tgt_obj = tgt_sel[0] - wf_action["object"] = [ - _ for _ in as_list(wf_action["object"]) if _ is not obj - ] + [tgt_obj] - tgt_obj.append_to("exampleOfWork", param) - obj["exampleOfWork"] = [_ for _ in as_list(obj["exampleOfWork"]) - if _ is not param] - if len(obj["exampleOfWork"]) == 1: - obj["exampleOfWork"] = obj["exampleOfWork"][0] - if len(obj["exampleOfWork"]) == 0: - del obj["exampleOfWork"] - for tool in wf.get("hasPart", []): - if "ComputationalWorkflow" in as_list(tool.type): - self.patch_workflow_input_collection(crate, wf=tool) + # -------------------------------------------------------------------------- + # Utility methods, called by the other methods + + def _get_manifest(self): + manifest = {} + with open(self.root / Path(MANIFEST_FILE)) as f: + for line in f: + hash_, relpath = line.strip().split(None, 1) + manifest[hash_] = self.root / relpath + return manifest + + def _resolve_plan(self, activity): + job_qname = activity.plan() + plan = activity.provenance.entity(job_qname) + if not plan: + m = SCATTER_JOB_PATTERN.match(str(job_qname)) + if m: + plan = activity.provenance.entity(m.groups()[0]) + return plan + + def _get_hash(self, prov_param): + k = prov_param.id.localpart + try: + return self.hashes[k] + except KeyError: + type_names = frozenset(str(_) for _ in prov_param.types()) + if "wf4ever:File" in type_names: + hash_ = next(prov_param.specializationOf()).id.localpart + self.hashes[k] = hash_ + return hash_ + elif "ro:Folder" in type_names: + m = hashlib.sha1() + m.update("".join(sorted( + self._get_hash(_) for _ in self.get_dict(prov_param).values() + )).encode()) + self.hashes[k] = hash_ = m.hexdigest() + return hash_ + + def _get_hashes(self, provenance): + for r in provenance.prov_doc.get_records(prov.model.ProvEntity): + self._get_hash(Entity(provenance, r)) + + def get_members(self, entity): + membership = entity.provenance.record_with_attr( + prov.model.ProvMembership, entity.id, prov.model.PROV_ATTR_COLLECTION + ) + member_ids = (_.get_attribute(prov.model.PROV_ATTR_ENTITY) for _ in membership) + return (entity.provenance.entity(first(_)) for _ in member_ids) + + def get_dict(self, entity): + d = {} + for qname in entity.record.get_attribute("prov:hadDictionaryMember"): + kvp = entity.provenance.entity(qname) + key = first(kvp.record.get_attribute("prov:pairKey")) + entity_id = first(kvp.record.get_attribute("prov:pairEntity")) + d[key] = entity.provenance.entity(entity_id) + return d def _map_input_data(self, crate, data): if isinstance(data, list): @@ -716,22 +676,3 @@ def _map_input_data(self, crate, data): rval[k] = self._map_input_data(crate, v) return rval return data - - def add_inputs_file(self, crate): - path = self.root / "workflow" / INPUTS_FILE_BASENAME - if path.is_file(): - with open(path) as f: - data = json.load(f) - data = self._map_input_data(crate, data) - source = StringIO(json.dumps(data, indent=4)) - crate.add_file(source, path.name, properties={ - "name": "input object document", - "encodingFormat": "application/json", - }) - - def add_output_formats(self, crate): - path = self.root / "workflow" / OUTPUTS_FILE_BASENAME - if path.is_file(): - with open(path) as f: - data = json.load(f) - self._map_input_data(crate, data) From 40f52dfadd0c26633b444c7a947a76c03a20b56e Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Mon, 2 Dec 2024 13:46:26 +0000 Subject: [PATCH 08/23] Add convert_param to the cwl object --- src/runcrate/convert.py | 53 ++------- src/runcrate/converters/base.py | 6 + src/runcrate/converters/cwl.py | 187 +++++++++++++++++++++++++++++--- 3 files changed, 183 insertions(+), 63 deletions(-) diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index c60ae36..3df23cf 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -18,7 +18,6 @@ Generate a Workflow Run RO-Crate from a CWLProv RO bundle. """ -import hashlib import json import re from io import StringIO @@ -26,9 +25,8 @@ import prov.model from bdbag.bdbagit import BDBag -from cwlprov.prov import Entity, Provenance +from cwlprov.prov import Provenance from cwlprov.ro import ResearchObject -from cwlprov.utils import first from rocrate.model.contextentity import ContextEntity from rocrate.model.softwareapplication import SoftwareApplication from rocrate.rocrate import ROCrate @@ -58,8 +56,6 @@ SCATTER_JOB_PATTERN = re.compile(r"^(.+)_\d+$") -CWLPROV_NONE = "https://w3id.org/cwl/prov#None" - WROC_PROFILE_VERSION = "1.0" DOCKER_IMG_TYPE = "https://w3id.org/ro/terms/workflow-run#DockerImage" @@ -287,7 +283,7 @@ def to_wf_p(k): def to_wf_p(k): return k.replace(activity.plan().localpart, tool_name) - self._get_hashes(activity.provenance) + self.converter.get_hashes(activity.provenance) action["instrument"] = instrument action["startTime"] = activity.start().time.isoformat() action["endTime"] = activity.end().time.isoformat() @@ -517,7 +513,12 @@ def add_action_params(self, crate, activity, to_wf_p, ptype="usage"): wf_p = crate.dereference(to_wf_p(k)) k = get_fragment(k) v = rel.entity() - value = self.convert_param(v, crate) + value = self.converter.convert_param(v, + crate, + hashes=self.hashes, + manifest=self.manifest, + file_map=self.file_map + ) if value is None: continue # param is optional with no default and was not set if {"ro:Folder", "wf4ever:File"} & set(str(_) for _ in v.types()): @@ -615,44 +616,6 @@ def _resolve_plan(self, activity): plan = activity.provenance.entity(m.groups()[0]) return plan - def _get_hash(self, prov_param): - k = prov_param.id.localpart - try: - return self.hashes[k] - except KeyError: - type_names = frozenset(str(_) for _ in prov_param.types()) - if "wf4ever:File" in type_names: - hash_ = next(prov_param.specializationOf()).id.localpart - self.hashes[k] = hash_ - return hash_ - elif "ro:Folder" in type_names: - m = hashlib.sha1() - m.update("".join(sorted( - self._get_hash(_) for _ in self.get_dict(prov_param).values() - )).encode()) - self.hashes[k] = hash_ = m.hexdigest() - return hash_ - - def _get_hashes(self, provenance): - for r in provenance.prov_doc.get_records(prov.model.ProvEntity): - self._get_hash(Entity(provenance, r)) - - def get_members(self, entity): - membership = entity.provenance.record_with_attr( - prov.model.ProvMembership, entity.id, prov.model.PROV_ATTR_COLLECTION - ) - member_ids = (_.get_attribute(prov.model.PROV_ATTR_ENTITY) for _ in membership) - return (entity.provenance.entity(first(_)) for _ in member_ids) - - def get_dict(self, entity): - d = {} - for qname in entity.record.get_attribute("prov:hadDictionaryMember"): - kvp = entity.provenance.entity(qname) - key = first(kvp.record.get_attribute("prov:pairKey")) - entity_id = first(kvp.record.get_attribute("prov:pairEntity")) - d[key] = entity.provenance.entity(entity_id) - return d - def _map_input_data(self, crate, data): if isinstance(data, list): return [self._map_input_data(crate, _) for _ in data] diff --git a/src/runcrate/converters/base.py b/src/runcrate/converters/base.py index 5321aa6..14a2b07 100644 --- a/src/runcrate/converters/base.py +++ b/src/runcrate/converters/base.py @@ -21,3 +21,9 @@ def build_step_graph(self, wf): Build a graph of steps in the workflow. """ raise NotImplementedError("build_step_graph") + + def convert_param(self, prov_param, crate, convert_secondary=True, parent=None): + """\ + Convert a CWLProv parameter to a RO-Crate entity. + """ + raise NotImplementedError("convert_param") diff --git a/src/runcrate/converters/cwl.py b/src/runcrate/converters/cwl.py index 30c328c..5be588b 100644 --- a/src/runcrate/converters/cwl.py +++ b/src/runcrate/converters/cwl.py @@ -1,33 +1,24 @@ +import hashlib import json from pathlib import Path import networkx as nx +import prov.model from cwl_utils.parser import load_document_by_yaml +from cwlprov.prov import Entity +from cwlprov.utils import first +from rocrate.model.contextentity import ContextEntity from .base import converter -def _get_fragment(uri): - return uri.rsplit("#", 1)[-1] - - -def _normalize_cwl_defs(cwl_defs): - inline_tools = {} - for d in cwl_defs.values(): - if not hasattr(d, "steps") or not d.steps: - continue - for s in d.steps: - if hasattr(s, "run") and s.run: - if hasattr(s.run, "id"): - tool = s.run - if tool.id.startswith("_:"): # CWL > 1.0 - tool.id = f"{s.id}/run" - inline_tools[_get_fragment(tool.id)] = tool - s.run = tool.id - cwl_defs.update(inline_tools) +CWLPROV_NONE = "https://w3id.org/cwl/prov#None" class cwlConverter(converter): + hashes = {} + collections = {} + def __init__(self): pass @@ -90,3 +81,163 @@ def build_step_graph(self, cwl_wf): if source_fragment: graph.add_edge(source_fragment, fragment) return graph + + def convert_param(self, + prov_param, + crate, + convert_secondary=True, + parent=None, + hashes=None, + manifest=None, + file_map=None + ): + type_names = frozenset(str(_) for _ in prov_param.types()) + secondary_files = [_.generated_entity() for _ in prov_param.derivations() + if str(_.type) == "cwlprov:SecondaryFile"] + if convert_secondary and secondary_files: + main_entity = self.convert_param(prov_param, + crate, + convert_secondary=False, + manifest=manifest, + file_map=file_map) + action_p = self.collections.get(main_entity.id) + if not action_p: + action_p = crate.add(ContextEntity(crate, properties={ + "@type": "Collection" + })) + action_p["mainEntity"] = main_entity + action_p["hasPart"] = [main_entity] + [ + self.convert_param(_, + crate, + manifest=manifest, + file_map=file_map + ) for _ in secondary_files + ] + crate.root_dataset.append_to("mentions", action_p) + self.collections[main_entity.id] = action_p + return action_p + if "wf4ever:File" in type_names: + hash_ = self.hashes[prov_param.id.localpart] + dest = Path(parent.id if parent else "") / hash_ + action_p = crate.dereference(dest.as_posix()) + if not action_p: + source = manifest[hash_] + action_p = crate.add_file(source, dest, properties={ + "sha1": hash_, + "contentSize": str(Path(source).stat().st_size) + }) + _set_alternate_name(prov_param, action_p, parent=parent) + try: + source_k = str(source.resolve(strict=False)) + except RuntimeError: + source_k = str(source) + file_map[source_k] = dest + return action_p + if "ro:Folder" in type_names: + hash_ = self.hashes[prov_param.id.localpart] + dest = Path(parent.id if parent else "") / hash_ + action_p = crate.dereference(dest.as_posix()) + if not action_p: + action_p = crate.add_directory(dest_path=dest) + _set_alternate_name(prov_param, action_p, parent=parent) + for child in _get_dict(prov_param).values(): + part = self.convert_param(child, + crate, + parent=action_p, + manifest=manifest, + file_map=file_map + ) + action_p.append_to("hasPart", part) + return action_p + if prov_param.value is not None: + return str(prov_param.value) + if "prov:Dictionary" in type_names: + return dict( + (k, self.convert_param(v, + crate, + manifest=manifest, + file_map=file_map + )) + for k, v in _get_dict(prov_param).items() + if k != "@id" + ) + if "prov:Collection" in type_names: + return [self.convert_param(_, + crate, + manifest=manifest, + file_map=file_map + ) for _ in _get_members(prov_param)] + if prov_param.id.uri == CWLPROV_NONE: + return None + raise RuntimeError(f"No value to convert for {prov_param}") + + def get_hashes(self, provenance): + for r in provenance.prov_doc.get_records(prov.model.ProvEntity): + self._get_hash(self.hashes, Entity(provenance, r)) + + def _get_hash(self, hashes, prov_param): + k = prov_param.id.localpart + try: + return hashes[k] + except KeyError: + type_names = frozenset(str(_) for _ in prov_param.types()) + if "wf4ever:File" in type_names: + hash_ = next(prov_param.specializationOf()).id.localpart + self.hashes[k] = hash_ + return hash_ + elif "ro:Folder" in type_names: + m = hashlib.sha1() + m.update("".join(sorted( + self._get_hash(hashes, _) for _ in _get_dict(prov_param).values() + )).encode()) + self.hashes[k] = hash_ = m.hexdigest() + return hash_ + + +def _get_members(entity): + membership = entity.provenance.record_with_attr( + prov.model.ProvMembership, entity.id, prov.model.PROV_ATTR_COLLECTION + ) + member_ids = (_.get_attribute(prov.model.PROV_ATTR_ENTITY) for _ in membership) + return (entity.provenance.entity(first(_)) for _ in member_ids) + + +def _get_fragment(uri): + return uri.rsplit("#", 1)[-1] + + +def _normalize_cwl_defs(cwl_defs): + inline_tools = {} + for d in cwl_defs.values(): + if not hasattr(d, "steps") or not d.steps: + continue + for s in d.steps: + if hasattr(s, "run") and s.run: + if hasattr(s.run, "id"): + tool = s.run + if tool.id.startswith("_:"): # CWL > 1.0 + tool.id = f"{s.id}/run" + inline_tools[_get_fragment(tool.id)] = tool + s.run = tool.id + cwl_defs.update(inline_tools) + + +def _set_alternate_name(prov_param, action_p, parent=None): + basename = getattr(prov_param, "basename", None) + if not basename: + return + if not parent: + action_p["alternateName"] = basename + return + if "alternateName" in parent: + action_p["alternateName"] = (Path(parent["alternateName"]) / basename).as_posix() + + +def _get_dict(entity): + d = {} + for qname in entity.record.get_attribute("prov:hadDictionaryMember"): + kvp = entity.provenance.entity(qname) + key = first(kvp.record.get_attribute("prov:pairKey")) + entity_id = first(kvp.record.get_attribute("prov:pairEntity")) + d[key] = entity.provenance.entity(entity_id) + return d From 0458fd59f509ddc9e9ddc263abc62e10ffb3cb70 Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Tue, 10 Dec 2024 13:16:47 +0000 Subject: [PATCH 09/23] Complete first go at refactoring to language agnostic tool --- src/runcrate/constants.py | 5 + src/runcrate/convert.py | 617 ++------------------------------ src/runcrate/converters/base.py | 79 +++- src/runcrate/converters/cwl.py | 605 +++++++++++++++++++++++++++++-- tests/test_step_mapping.py | 13 +- 5 files changed, 687 insertions(+), 632 deletions(-) diff --git a/src/runcrate/constants.py b/src/runcrate/constants.py index b0498e1..e011ec3 100644 --- a/src/runcrate/constants.py +++ b/src/runcrate/constants.py @@ -24,3 +24,8 @@ PROVENANCE_PROFILE = f"{PROVENANCE_PROFILE_BASE}/{PROFILES_VERSION}" TERMS_NAMESPACE = "https://w3id.org/ro/terms/workflow-run" + +WROC_PROFILE_VERSION = "1.0" +DOCKER_IMG_TYPE = "https://w3id.org/ro/terms/workflow-run#DockerImage" + + diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index 3df23cf..fc5f3eb 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -19,8 +19,6 @@ """ import json -import re -from io import StringIO from pathlib import Path import prov.model @@ -31,611 +29,46 @@ from rocrate.model.softwareapplication import SoftwareApplication from rocrate.rocrate import ROCrate -from .constants import PROFILES_BASE, PROFILES_VERSION, TERMS_NAMESPACE +from .constants import TERMS_NAMESPACE from .converters import CONVERTERS -from .utils import as_list, parse_img - -WORKFLOW_BASENAME = "packed.cwl" -INPUTS_FILE_BASENAME = "primary-job.json" -OUTPUTS_FILE_BASENAME = "primary-output.json" MANIFEST_FILE = "manifest-sha1.txt" -CWL_TYPE_MAP = { - "string": "Text", - "int": "Integer", - "long": "Integer", - "float": "Float", - "double": "Float", - "Any": "DataType", - "boolean": "Boolean", - "File": "File", - "Directory": "Dataset", - "null": None, -} - -SCATTER_JOB_PATTERN = re.compile(r"^(.+)_\d+$") - -WROC_PROFILE_VERSION = "1.0" - -DOCKER_IMG_TYPE = "https://w3id.org/ro/terms/workflow-run#DockerImage" - - -def convert_cwl_type(cwl_type): - if isinstance(cwl_type, list): - s = set(convert_cwl_type(_) for _ in cwl_type) - s.discard(None) - return s.pop() if len(s) == 1 else sorted(s) - if isinstance(cwl_type, str): - return CWL_TYPE_MAP[cwl_type] - if cwl_type.type_ == "enum": - return "Text" # use actionOption to represent choices? - if cwl_type.type_ == "array": - return convert_cwl_type(cwl_type.items) - if cwl_type.type_ == "record": - return "PropertyValue" - - -def properties_from_cwl_param(cwl_p): - def is_structured(cwl_type): - return getattr(cwl_type, "type_", None) in ("array", "record") - additional_type = "Collection" if cwl_p.secondaryFiles else convert_cwl_type(cwl_p.type_) - properties = { - "@type": "FormalParameter", - "additionalType": additional_type - } - if hasattr(cwl_p, "doc") and cwl_p.doc: - properties["description"] = cwl_p.doc - elif hasattr(cwl_p, "label") and cwl_p.label: - # name is used for the parameter's id to support reproducibility - properties["description"] = cwl_p.label - if cwl_p.format: - properties["encodingFormat"] = cwl_p.format - if isinstance(cwl_p.type_, list) and "null" in cwl_p.type_: - properties["valueRequired"] = "False" - if is_structured(cwl_p.type_): - properties["multipleValues"] = "True" - if hasattr(cwl_p, "default"): - if isinstance(cwl_p.default, dict): - if cwl_p.default.get("class") in ("File", "Directory"): - default = cwl_p.default.get("location", cwl_p.default.get("path")) - if default: - properties["defaultValue"] = default - elif not is_structured(cwl_p.type_) and cwl_p.default is not None: - properties["defaultValue"] = str(cwl_p.default) - # TODO: support more cases - if getattr(cwl_p.type_, "type_", None) == "enum": - properties["valuePattern"] = "|".join(_.rsplit("/", 1)[-1] for _ in cwl_p.type_.symbols) - return properties - - -def get_relative_uri(uri): - doc, fragment = uri.rsplit("#", 1) - return f"{doc.rsplit('/', 1)[-1]}#{fragment}" - - -def cut_step_part(relative_uri): - parts = relative_uri.split("/", 2) - if len(parts) > 2: - relative_uri = parts[0] + "/" + parts[2] - return relative_uri - - -def get_fragment(uri): - return uri.rsplit("#", 1)[-1] - - class ProvCrateBuilder: - - # -------------------------------------------------------------------------- - # Public methods, called by the CLI - def __init__(self, root, converter=CONVERTERS["cwl"], workflow_name=None, license=None, readme=None): - self.root = Path(root) self.converter = converter - self.workflow_name = workflow_name - self.license = license - self.readme = Path(readme) if readme else readme - self.wf_path = self.root / "workflow" / WORKFLOW_BASENAME - self.cwl_defs = self.converter.get_workflow(self.wf_path) - self.step_maps = self.converter.get_step_maps(self.cwl_defs) - self.ro = ResearchObject(BDBag(str(root))) - self.with_prov = set(str(_) for _ in self.ro.resources_with_provenance()) - self.workflow_run = Provenance(self.ro).activity() - self.roc_engine_run = None - # avoid duplicates - not handled by ro-crate-py, see - # https://github.com/ResearchObject/ro-crate-py/issues/132 - self.control_actions = {} - # index collections by their main entity's id - self.collections = {} - self.hashes = {} - # map source files to destination files - self.file_map = {} - self.manifest = self._get_manifest() + self.converter.root = Path(root) + self.converter.workflow_name = workflow_name + self.converter.license = license + self.converter.readme = Path(readme) if readme else readme + self.converter.wf_path = self.converter.root / "workflow" / self.converter.WORKFLOW_BASENAME + self.converter.workflow_definition = self.converter.get_workflow() + self.converter.step_maps = self.converter.get_step_maps() + self.converter.ro = ResearchObject(BDBag(str(root))) + self.converter.with_prov = set(str(_) for _ in self.converter.ro.resources_with_provenance()) + self.converter.workflow_run = Provenance(self.converter.ro).activity() + self.converter.roc_engine_run = None + self.converter.control_actions = {} + self.converter.collection = {} + self.converter.hashes = {} + self.converter.file_map = {} + self.converter.manifest = self.converter.get_manifest(self.converter.root, MANIFEST_FILE) def build(self): crate = ROCrate(gen_preview=False) crate.metadata.extra_contexts.append(TERMS_NAMESPACE) - self.add_root_metadata(crate) - self.add_profiles(crate) - self.add_workflow(crate) - self.add_engine_run(crate) - self.add_action(crate, self.workflow_run) - self.patch_workflow_input_collection(crate) - self.add_inputs_file(crate) - self.add_output_formats(crate) + self.converter.add_root_metadata(crate) + self.converter.add_profiles(crate) + self.converter.add_workflow(crate) + self.converter.add_engine_run(crate) + self.converter.add_action(crate, self.converter.workflow_run) + self.converter.patch_workflow_input_collection(crate) + self.converter.add_inputs_file(crate) + self.converter.add_output_formats(crate) return crate - # -------------------------------------------------------------------------- - # Top level methods, called by build() - - def add_root_metadata(self, crate): - if self.license: - crate.root_dataset["license"] = self.license - if self.readme: - readme = crate.add_file(self.readme) - readme["about"] = crate.root_dataset - if self.readme.suffix.lower() == ".md": - readme["encodingFormat"] = "text/markdown" - - def add_profiles(self, crate): - profiles = [] - for p in "process", "workflow", "provenance": - id_ = f"{PROFILES_BASE}/{p}/{PROFILES_VERSION}" - profiles.append(crate.add(ContextEntity(crate, id_, properties={ - "@type": "CreativeWork", - "name": f"{p.title()} Run Crate", - "version": PROFILES_VERSION, - }))) - # FIXME: in the future, this could go out of sync with the wroc - # profile added by ro-crate-py to the metadata descriptor - wroc_profile_id = f"https://w3id.org/workflowhub/workflow-ro-crate/{WROC_PROFILE_VERSION}" - profiles.append(crate.add(ContextEntity(crate, wroc_profile_id, properties={ - "@type": "CreativeWork", - "name": "Workflow RO-Crate", - "version": WROC_PROFILE_VERSION, - }))) - crate.root_dataset["conformsTo"] = profiles - - def add_workflow(self, crate): - lang_version = self.cwl_defs[WORKFLOW_BASENAME].cwlVersion - properties = { - "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow", "HowTo"], - } - workflow = crate.add_workflow( - self.wf_path, self.wf_path.name, main=True, lang="cwl", - lang_version=lang_version, gen_cwl=False, properties=properties - ) - cwl_workflow = self.cwl_defs[workflow.id] - wf_name = self.wf_path.name - if hasattr(cwl_workflow, "label") and cwl_workflow.label: - wf_name = cwl_workflow.label - workflow["name"] = self.workflow_name or wf_name - if hasattr(cwl_workflow, "doc") and cwl_workflow.doc: - workflow["description"] = cwl_workflow.doc - # cannot convert "intent" to featureList: workflow is not a SoftwareApplication - workflow["input"] = self.add_params(crate, cwl_workflow.inputs) - workflow["output"] = self.add_params(crate, cwl_workflow.outputs) - if hasattr(cwl_workflow, "steps"): - for s in cwl_workflow.steps: - self.add_step(crate, workflow, s) - self.add_param_connections(crate, workflow) - return workflow - - def add_engine_run(self, crate): - engine = self.workflow_run.start().starter_activity() - roc_engine = crate.add(SoftwareApplication(crate, properties={ - "name": engine.label or "workflow engine" - })) - roc_engine_run = crate.add(ContextEntity(crate, properties={ - "@type": "OrganizeAction", - "name": f"Run of {roc_engine['name']}", - "startTime": engine.start().time.isoformat(), - })) - roc_engine_run["instrument"] = roc_engine - self.add_agent(crate, roc_engine_run, engine) - self.roc_engine_run = roc_engine_run - - def add_action(self, crate, activity, parent_instrument=None): - workflow = crate.mainEntity - action = crate.add(ContextEntity(crate, properties={ - "@type": "CreateAction", - "name": activity.label, - })) - plan = self._resolve_plan(activity) - plan_tag = plan.id.localpart - if plan_tag == "main": - assert str(activity.type) == "wfprov:WorkflowRun" - instrument = workflow - self.roc_engine_run["result"] = action - crate.root_dataset["mentions"] = [action] - - def to_wf_p(k): - return k - else: - parent_instrument_fragment = get_fragment(parent_instrument.id) - if parent_instrument_fragment != WORKFLOW_BASENAME: - parts = plan_tag.split("/", 1) - if parts[0] == "main": - parts[0] = parent_instrument_fragment - plan_tag = "/".join(parts) - tool_name = self.step_maps[parent_instrument_fragment][plan_tag]["tool"] - instrument = crate.dereference(f"{workflow.id}#{tool_name}") - control_action = self.control_actions.get(plan_tag) - if not control_action: - control_action = crate.add(ContextEntity(crate, properties={ - "@type": "ControlAction", - "name": f"orchestrate {tool_name}", - })) - step = crate.dereference(f"{workflow.id}#{plan_tag}") - control_action["instrument"] = step - self.roc_engine_run.append_to("object", control_action, compact=True) - self.control_actions[plan_tag] = control_action - control_action.append_to("object", action, compact=True) - if activity.uri in self.with_prov: - nested_prov = Provenance(self.ro, activity.uri) - activity = nested_prov.activity() - - def to_wf_p(k): - return k.replace(activity.plan().localpart, tool_name) - self.converter.get_hashes(activity.provenance) - action["instrument"] = instrument - action["startTime"] = activity.start().time.isoformat() - action["endTime"] = activity.end().time.isoformat() - action["object"] = self.add_action_params(crate, activity, to_wf_p, "usage") - action["result"] = self.add_action_params(crate, activity, to_wf_p, "generation") - self.add_container_images(crate, action, activity) - for job in activity.steps(): - self.add_action(crate, job, parent_instrument=instrument) - - def patch_workflow_input_collection(self, crate, wf=None): - """\ - CWLProv records secondary files only in step runs, not in the workflow - run. Thus, when the conversion of parameter values is completed, - workflow-level parameters with secondary files get mapped to the main - entity of the collection alone (a File). This method fixes the mapping - by retrieving the correct Collection entity from the relevant tool - execution. - """ - if wf is None: - wf = crate.mainEntity - sel = [_ for _ in crate.contextual_entities - if "CreateAction" in as_list(_.type) and _.get("instrument") is wf] - if not sel: - return # skipped subworkflow - wf_action = sel[0] - connections = [_ for _ in crate.contextual_entities - if "ParameterConnection" in as_list(_.type)] - for param in wf.get("input", []): - if param.get("additionalType") == "Collection": - src_sel = [_ for _ in wf_action.get("object", []) - if param in as_list(_.get("exampleOfWork"))] - if not src_sel: - raise RuntimeError(f"object for param {param.id} not found") - obj = src_sel[0] - if obj.type != "Collection": - param_connections = [_ for _ in connections if _["sourceParameter"] is param] - if not param_connections: - continue - pc = param_connections[0] - tgt_param = pc["targetParameter"] - tgt_sel = [_ for _ in crate.get_entities() - if tgt_param in as_list(_.get("exampleOfWork"))] - if not tgt_sel: - raise RuntimeError(f"object for param {tgt_param.id} not found") - tgt_obj = tgt_sel[0] - wf_action["object"] = [ - _ for _ in as_list(wf_action["object"]) if _ is not obj - ] + [tgt_obj] - tgt_obj.append_to("exampleOfWork", param) - obj["exampleOfWork"] = [_ for _ in as_list(obj["exampleOfWork"]) - if _ is not param] - if len(obj["exampleOfWork"]) == 1: - obj["exampleOfWork"] = obj["exampleOfWork"][0] - if len(obj["exampleOfWork"]) == 0: - del obj["exampleOfWork"] - for tool in wf.get("hasPart", []): - if "ComputationalWorkflow" in as_list(tool.type): - self.patch_workflow_input_collection(crate, wf=tool) - - def add_inputs_file(self, crate): - path = self.root / "workflow" / INPUTS_FILE_BASENAME - if path.is_file(): - with open(path) as f: - data = json.load(f) - data = self._map_input_data(crate, data) - source = StringIO(json.dumps(data, indent=4)) - crate.add_file(source, path.name, properties={ - "name": "input object document", - "encodingFormat": "application/json", - }) - - def add_output_formats(self, crate): - path = self.root / "workflow" / OUTPUTS_FILE_BASENAME - if path.is_file(): - with open(path) as f: - data = json.load(f) - self._map_input_data(crate, data) - - # -------------------------------------------------------------------------- - # Internal methods, called by the top level methods - - def add_step(self, crate, workflow, cwl_step): - step_fragment = get_fragment(cwl_step.id) - step_id = f"{self.wf_path.name}#{step_fragment}" - pos = self.step_maps[get_fragment(workflow.id)][step_fragment]["pos"] - step = crate.add(ContextEntity(crate, step_id, properties={ - "@type": "HowToStep", - "position": str(pos), - })) - tool = self.add_tool(crate, workflow, cwl_step.run) - step["workExample"] = tool - if hasattr(cwl_step, "label") and cwl_step.label: - step["name"] = cwl_step.label - if hasattr(cwl_step, "doc") and cwl_step.doc: - step["description"] = cwl_step.doc - workflow.append_to("step", step) - - def add_tool(self, crate, workflow, cwl_tool): - if isinstance(cwl_tool, str): - tool_fragment = get_fragment(cwl_tool) - cwl_tool = self.cwl_defs[tool_fragment] - else: - tool_fragment = get_fragment(cwl_tool.id) - if hasattr(cwl_tool, "expression"): - raise RuntimeError("ExpressionTool not supported yet") - tool_id = f"{self.wf_path.name}#{tool_fragment}" - tool = crate.dereference(tool_id) - if tool: - return tool - properties = {"name": tool_fragment} - if cwl_tool.doc: - properties["description"] = cwl_tool.doc - if cwl_tool.label: - properties["name"] = cwl_tool.label - if hasattr(cwl_tool, "steps"): - properties["@type"] = ["SoftwareSourceCode", "ComputationalWorkflow", "HowTo"] - else: - properties["@type"] = "SoftwareApplication" - if hasattr(cwl_tool, "intent") and cwl_tool.intent: - properties["featureList"] = cwl_tool.intent - if hasattr(cwl_tool, "requirements") and cwl_tool.requirements: - for req in cwl_tool.requirements: - if req.class_ == "ResourceRequirement": - ramMin = req.ramMin - if ramMin: - properties["memoryRequirements"] = f"{int(ramMin)} MiB" - deps = [] - if hasattr(cwl_tool, "hints") and cwl_tool.hints: - for req in cwl_tool.hints: - if hasattr(req, "class_") and req.class_ == "ResourceRequirement": - ramMin = req.ramMin - if ramMin: - properties["memoryRequirements"] = f"{int(ramMin)} MiB" - if hasattr(req, "class_") and req.class_ == "SoftwareRequirement": - for p in req.packages: - if hasattr(p, "specs") and p.specs: - dep_id = p.specs[0] - dep_properties = { - "@type": "SoftwareApplication", - "name": p.package - } - if p.version: - dep_properties["softwareVersion"] = p.version - deps.append( - crate.add(ContextEntity(crate, dep_id, properties=dep_properties)) - ) - tool = crate.add(ContextEntity(crate, tool_id, properties=properties)) - if deps: - tool["softwareRequirements"] = deps - if len(deps) == 1: - tool["mainEntity"] = deps[0] - tool["input"] = self.add_params(crate, cwl_tool.inputs) - tool["output"] = self.add_params(crate, cwl_tool.outputs) - workflow.append_to("hasPart", tool) - if hasattr(cwl_tool, "steps"): - tool["programmingLanguage"] = workflow["programmingLanguage"] - for s in cwl_tool.steps: - self.add_step(crate, tool, s) - self.add_param_connections(crate, tool) - return tool - - def add_params(self, crate, cwl_params): - params = [] - for cwl_p in cwl_params: - p_id = get_relative_uri(cwl_p.id) - properties = properties_from_cwl_param(cwl_p) - properties["name"] = p_id.rsplit("/", 1)[-1] - p = crate.add(ContextEntity(crate, p_id, properties=properties)) - params.append(p) - return params - - def add_agent(self, crate, roc_engine_run, engine): - delegate = engine.start().starter_activity() - try: - delegation = next(engine.provenance.record_with_attr( - prov.model.ProvDelegation, delegate.id, prov.model.PROV_ATTR_DELEGATE - )) - except StopIteration: - return - responsible = delegation.get_attribute(prov.model.PROV_ATTR_RESPONSIBLE) - agent = sum((engine.provenance.prov_doc.get_record(_) for _ in responsible), []) - for a in agent: - if "prov:Person" not in set(str(_) for _ in a.get_asserted_types()): - continue - agent_id = a.identifier.uri - if not agent_id.startswith("http"): - agent_id = "#" + agent_id.rsplit(":", 1)[-1] - properties = { - "@type": "Person" - } - if isinstance(a.label, str): - properties["name"] = a.label - ro_a = crate.add(ContextEntity(crate, agent_id, properties=properties)) - roc_engine_run.append_to("agent", ro_a, compact=True) - - def add_container_images(self, crate, action, activity): - images = set() - for assoc in activity.association(): - for agent in activity.provenance.prov_doc.get_record(assoc.agent_id): - images |= agent.get_attribute("cwlprov:image") - for im in images: - properties = parse_img(im) - properties.update({ - "@type": "ContainerImage", - "additionalType": {"@id": DOCKER_IMG_TYPE} - }) - roc_img = crate.add(ContextEntity(crate, properties=properties)) - action.append_to("containerImage", roc_img, compact=True) - - def add_action_params(self, crate, activity, to_wf_p, ptype="usage"): - action_params = [] - all_roles = set() - for rel in getattr(activity, ptype)(): - k = get_relative_uri(rel.role.uri) - if str(activity.type) == "wfprov:WorkflowRun": - # workflow output roles have a phantom step part - if ptype == "generation": - k = cut_step_part(k) - # In the case of a single tool run, cwltool reports one WorkflowRun - # and no ProcessRun; some parameters are duplicated, appearing both - # with role main/PARAM_NAME and main/ORIGINAL_WF_NAME/PARAM_NAME - if not list(activity.steps()): - k = cut_step_part(k) - if k in all_roles: - continue - all_roles.add(k) - wf_p = crate.dereference(to_wf_p(k)) - k = get_fragment(k) - v = rel.entity() - value = self.converter.convert_param(v, - crate, - hashes=self.hashes, - manifest=self.manifest, - file_map=self.file_map - ) - if value is None: - continue # param is optional with no default and was not set - if {"ro:Folder", "wf4ever:File"} & set(str(_) for _ in v.types()): - action_p = value - else: - # FIXME: assuming arrays and records don't have nested structured types - if isinstance(value, dict): - value = [crate.add(ContextEntity(crate, f"#pv-{k}/{nk}", properties={ - "@type": "PropertyValue", - "name": nk, - "value": nv, - })) for nk, nv in value.items()] - action_p = crate.add(ContextEntity(crate, f"#pv-{k}", properties={ - "@type": "PropertyValue", - "name": k.rsplit("/", 1)[-1], - })) - action_p["value"] = value - action_p["exampleOfWork"] = list(set( - as_list(action_p.get("exampleOfWork", [])) + [wf_p] - )) - if len(action_p["exampleOfWork"]) == 1: - action_p["exampleOfWork"] = action_p["exampleOfWork"][0] - if ptype == "generation": - action_p["dateCreated"] = rel.time.isoformat() - action_params.append(action_p) - return action_params - - def add_param_connections(self, crate, workflow): - def connect(source, target, entity): - connection = crate.add(ContextEntity(crate, properties={ - "@type": "ParameterConnection" - })) - connection["sourceParameter"] = crate.get(f"{WORKFLOW_BASENAME}#{source}") - connection["targetParameter"] = crate.get(f"{WORKFLOW_BASENAME}#{target}") - entity.append_to("connection", connection) - wf_name = get_fragment(workflow.id) - wf_def = self.cwl_defs[wf_name] - step_map = self.step_maps[wf_name] - out_map = {} - for step in wf_def.steps: - step_name = get_fragment(step.id) - tool_name = step_map[step_name]["tool"] - for o in step.out: - o_name = get_fragment(o) - out_map[o_name] = o_name.replace(step_name, tool_name) - for step in wf_def.steps: - step_name = get_fragment(step.id) - ro_step = crate.get(f"{self.wf_path.name}#{step_name}") - tool_name = step_map[step_name]["tool"] - for mapping in getattr(step, "in_", []): - if not mapping.source: - continue - sources = [mapping.source] if not isinstance( - mapping.source, list - ) else mapping.source - for s in sources: - from_param = get_fragment(s) - try: - from_param = out_map[from_param] - except KeyError: - pass # only needed if source is from another step - to_param = get_fragment(mapping.id).replace(step_name, tool_name) - connect(from_param, to_param, ro_step) - for out in getattr(wf_def, "outputs", []): - out_sources = [out.outputSource] if not isinstance( - out.outputSource, list - ) else out.outputSource - for out_s in out_sources: - from_param = get_fragment(out_s) - try: - from_param = out_map[from_param] - except KeyError: - # assuming this is a passthrough for a workflow input parameter - pass - to_param = get_fragment(out.id) - connect(from_param, to_param, workflow) - - # -------------------------------------------------------------------------- - # Utility methods, called by the other methods - - def _get_manifest(self): - manifest = {} - with open(self.root / Path(MANIFEST_FILE)) as f: - for line in f: - hash_, relpath = line.strip().split(None, 1) - manifest[hash_] = self.root / relpath - return manifest - - def _resolve_plan(self, activity): - job_qname = activity.plan() - plan = activity.provenance.entity(job_qname) - if not plan: - m = SCATTER_JOB_PATTERN.match(str(job_qname)) - if m: - plan = activity.provenance.entity(m.groups()[0]) - return plan - - def _map_input_data(self, crate, data): - if isinstance(data, list): - return [self._map_input_data(crate, _) for _ in data] - if isinstance(data, dict): - rval = {} - for k, v in data.items(): - if k == "location": - source = self.root / "workflow" / v - try: - source_k = str(source.resolve(strict=False)) - except RuntimeError: - source_k = str(source) - dest = self.file_map.get(source_k) - rval[k] = str(dest) if dest else v - fmt = data.get("format") - if fmt: - entity = crate.get(str(dest)) - if entity: - entity["encodingFormat"] = fmt - else: - rval[k] = self._map_input_data(crate, v) - return rval - return data diff --git a/src/runcrate/converters/base.py b/src/runcrate/converters/base.py index 14a2b07..98d8bdf 100644 --- a/src/runcrate/converters/base.py +++ b/src/runcrate/converters/base.py @@ -1,9 +1,78 @@ class converter: def __init__(self): - pass + self.root = None + self.workflow_name = None + self.license = None + self.readme = None + self.wf_path = None + self.workflow_definition = {} + self.step_maps = {} + self.ro = None + self.with_prov = set() + self.workflow_run = None + self.roc_engine_run = None + self.control_actions = {} + self.collections = {} + self.hashes = {} + self.file_map = {} + self.manifest = None + + # -------------------------------------------------------------------------- + # Top level functions - called by the build() function + + def add_root_metadata(self, crate): + """ + Add metadata to the root of the crate. + """ + raise NotImplementedError("add_root_metadata") + + def add_profiles(self, crate): + """ + Add profiles to the crate. + """ + raise NotImplementedError("add_profiles") + + def add_workflow(self, crate): + """ + Add the workflow to the crate. + """ + raise NotImplementedError("add_workflow") + + def add_engine_run(self, crate): + """ + Add the engine run to the crate. + """ + raise NotImplementedError("add_engine_run") + + def add_action(self, crate, workflow_run): + """ + Add the action to the crate. + """ + raise NotImplementedError("add_action") + + def patch_workflow_input_collection(self, crate): + """ + Patch the workflow input collection. + """ + raise NotImplementedError("patch_workflow_input_collection") + + def add_inputs_files(self, crate): + """ + Add input files to the crate. + """ + raise NotImplementedError("add_inputs_files") + + def add_output_formats(self, crate): + """ + Add output formats to the crate. + """ + raise NotImplementedError("add_output_formats") + + # -------------------------------------------------------------------------- + # Helper functions - called by the top level functions def get_workflow(self, wf_path): - """\ + """ Get the workflow from the given path. Returns a dictionary where tools / workflows are mapped by their ids. @@ -11,19 +80,19 @@ def get_workflow(self, wf_path): raise NotImplementedError("get_workflow") def get_step_maps(self, wf_defs): - """\ + """ Get a mapping of step names to their tool names and positions. """ raise NotImplementedError("get_step_maps") def build_step_graph(self, wf): - """\ + """ Build a graph of steps in the workflow. """ raise NotImplementedError("build_step_graph") def convert_param(self, prov_param, crate, convert_secondary=True, parent=None): - """\ + """ Convert a CWLProv parameter to a RO-Crate entity. """ raise NotImplementedError("convert_param") diff --git a/src/runcrate/converters/cwl.py b/src/runcrate/converters/cwl.py index 5be588b..f56134a 100644 --- a/src/runcrate/converters/cwl.py +++ b/src/runcrate/converters/cwl.py @@ -1,28 +1,480 @@ import hashlib import json +import re from pathlib import Path +from io import StringIO import networkx as nx import prov.model from cwl_utils.parser import load_document_by_yaml from cwlprov.prov import Entity from cwlprov.utils import first +from cwlprov.prov import Provenance + from rocrate.model.contextentity import ContextEntity +from rocrate.model.softwareapplication import SoftwareApplication +from rocrate.rocrate import ROCrate from .base import converter - +from ..constants import PROFILES_BASE, PROFILES_VERSION, WROC_PROFILE_VERSION, DOCKER_IMG_TYPE +from ..utils import as_list, parse_img CWLPROV_NONE = "https://w3id.org/cwl/prov#None" +CWL_TYPE_MAP = { + "string": "Text", + "int": "Integer", + "long": "Integer", + "float": "Float", + "double": "Float", + "Any": "DataType", + "boolean": "Boolean", + "File": "File", + "Directory": "Dataset", + "null": None, +} + +INPUTS_FILE_BASENAME = "primary-job.json" +OUTPUTS_FILE_BASENAME = "primary-output.json" + +SCATTER_JOB_PATTERN = re.compile(r"^(.+)_\d+$") + +class cwlConverter(converter): + + WORKFLOW_BASENAME = "packed.cwl" + + # -------------------------------------------------------------------------- + # Top level methods, called by build() + + def add_root_metadata(self, crate): + if self.license: + crate.root_dataset["license"] = self.license + if self.readme: + readme = crate.add_file(self.readme) + readme["about"] = crate.root_dataset + if self.readme.suffix.lower() == ".md": + readme["encodingFormat"] = "text/markdown" + + def add_profiles(self, crate): + profiles = [] + for p in "process", "workflow", "provenance": + id_ = f"{PROFILES_BASE}/{p}/{PROFILES_VERSION}" + profiles.append(crate.add(ContextEntity(crate, id_, properties={ + "@type": "CreativeWork", + "name": f"{p.title()} Run Crate", + "version": PROFILES_VERSION, + }))) + # FIXME: in the future, this could go out of sync with the wroc + # profile added by ro-crate-py to the metadata descriptor + wroc_profile_id = f"https://w3id.org/workflowhub/workflow-ro-crate/{WROC_PROFILE_VERSION}" + profiles.append(crate.add(ContextEntity(crate, wroc_profile_id, properties={ + "@type": "CreativeWork", + "name": "Workflow RO-Crate", + "version": WROC_PROFILE_VERSION, + }))) + crate.root_dataset["conformsTo"] = profiles + + def add_workflow(self, crate): + lang_version = self.workflow_definition[self.WORKFLOW_BASENAME].cwlVersion + properties = { + "@type": ["File", "SoftwareSourceCode", "ComputationalWorkflow", "HowTo"], + } + workflow = crate.add_workflow( + self.wf_path, self.wf_path.name, main=True, lang="cwl", + lang_version=lang_version, gen_cwl=False, properties=properties + ) + cwl_workflow = self.workflow_definition[workflow.id] + wf_name = self.wf_path.name + if hasattr(cwl_workflow, "label") and cwl_workflow.label: + wf_name = cwl_workflow.label + workflow["name"] = self.workflow_name or wf_name + if hasattr(cwl_workflow, "doc") and cwl_workflow.doc: + workflow["description"] = cwl_workflow.doc + # cannot convert "intent" to featureList: workflow is not a SoftwareApplication + workflow["input"] = self.add_params(crate, cwl_workflow.inputs) + workflow["output"] = self.add_params(crate, cwl_workflow.outputs) + if hasattr(cwl_workflow, "steps"): + for s in cwl_workflow.steps: + self.add_step(crate, workflow, s) + self.add_param_connections(crate, workflow) + return workflow + + def add_engine_run(self, crate): + engine = self.workflow_run.start().starter_activity() + roc_engine = crate.add(SoftwareApplication(crate, properties={ + "name": engine.label or "workflow engine" + })) + roc_engine_run = crate.add(ContextEntity(crate, properties={ + "@type": "OrganizeAction", + "name": f"Run of {roc_engine['name']}", + "startTime": engine.start().time.isoformat(), + })) + roc_engine_run["instrument"] = roc_engine + self.add_agent(crate, roc_engine_run, engine) + self.roc_engine_run = roc_engine_run + + def add_action(self, crate, activity, parent_instrument=None): + workflow = crate.mainEntity + action = crate.add(ContextEntity(crate, properties={ + "@type": "CreateAction", + "name": activity.label, + })) + plan = _resolve_plan(activity) + plan_tag = plan.id.localpart + if plan_tag == "main": + assert str(activity.type) == "wfprov:WorkflowRun" + instrument = workflow + self.roc_engine_run["result"] = action + crate.root_dataset["mentions"] = [action] -class cwlConverter(converter): - hashes = {} - collections = {} + def to_wf_p(k): + return k + else: + parent_instrument_fragment = _get_fragment(parent_instrument.id) + if parent_instrument_fragment != self.WORKFLOW_BASENAME: + parts = plan_tag.split("/", 1) + if parts[0] == "main": + parts[0] = parent_instrument_fragment + plan_tag = "/".join(parts) + tool_name = self.step_maps[parent_instrument_fragment][plan_tag]["tool"] + instrument = crate.dereference(f"{workflow.id}#{tool_name}") + control_action = self.control_actions.get(plan_tag) + if not control_action: + control_action = crate.add(ContextEntity(crate, properties={ + "@type": "ControlAction", + "name": f"orchestrate {tool_name}", + })) + step = crate.dereference(f"{workflow.id}#{plan_tag}") + control_action["instrument"] = step + self.roc_engine_run.append_to("object", control_action, compact=True) + self.control_actions[plan_tag] = control_action + control_action.append_to("object", action, compact=True) + if activity.uri in self.with_prov: + nested_prov = Provenance(self.ro, activity.uri) + activity = nested_prov.activity() - def __init__(self): - pass + def to_wf_p(k): + return k.replace(activity.plan().localpart, tool_name) + self.get_hashes(activity.provenance) + action["instrument"] = instrument + action["startTime"] = activity.start().time.isoformat() + action["endTime"] = activity.end().time.isoformat() + action["object"] = self.add_action_params(crate, activity, to_wf_p, "usage") + action["result"] = self.add_action_params(crate, activity, to_wf_p, "generation") + self.add_container_images(crate, action, activity) + for job in activity.steps(): + self.add_action(crate, job, parent_instrument=instrument) - def get_workflow(self, wf_path): + def patch_workflow_input_collection(self, crate, wf=None): + """\ + CWLProv records secondary files only in step runs, not in the workflow + run. Thus, when the conversion of parameter values is completed, + workflow-level parameters with secondary files get mapped to the main + entity of the collection alone (a File). This method fixes the mapping + by retrieving the correct Collection entity from the relevant tool + execution. + """ + if wf is None: + wf = crate.mainEntity + sel = [_ for _ in crate.contextual_entities + if "CreateAction" in as_list(_.type) and _.get("instrument") is wf] + if not sel: + return # skipped subworkflow + wf_action = sel[0] + connections = [_ for _ in crate.contextual_entities + if "ParameterConnection" in as_list(_.type)] + for param in wf.get("input", []): + if param.get("additionalType") == "Collection": + src_sel = [_ for _ in wf_action.get("object", []) + if param in as_list(_.get("exampleOfWork"))] + if not src_sel: + raise RuntimeError(f"object for param {param.id} not found") + obj = src_sel[0] + if obj.type != "Collection": + param_connections = [_ for _ in connections if _["sourceParameter"] is param] + if not param_connections: + continue + pc = param_connections[0] + tgt_param = pc["targetParameter"] + tgt_sel = [_ for _ in crate.get_entities() + if tgt_param in as_list(_.get("exampleOfWork"))] + if not tgt_sel: + raise RuntimeError(f"object for param {tgt_param.id} not found") + tgt_obj = tgt_sel[0] + wf_action["object"] = [ + _ for _ in as_list(wf_action["object"]) if _ is not obj + ] + [tgt_obj] + tgt_obj.append_to("exampleOfWork", param) + obj["exampleOfWork"] = [_ for _ in as_list(obj["exampleOfWork"]) + if _ is not param] + if len(obj["exampleOfWork"]) == 1: + obj["exampleOfWork"] = obj["exampleOfWork"][0] + if len(obj["exampleOfWork"]) == 0: + del obj["exampleOfWork"] + for tool in wf.get("hasPart", []): + if "ComputationalWorkflow" in as_list(tool.type): + self.patch_workflow_input_collection(crate, wf=tool) + + def add_inputs_file(self, crate): + path = self.root / "workflow" / INPUTS_FILE_BASENAME + if path.is_file(): + with open(path) as f: + data = json.load(f) + data = self.map_input_data(crate, data) + source = StringIO(json.dumps(data, indent=4)) + crate.add_file(source, path.name, properties={ + "name": "input object document", + "encodingFormat": "application/json", + }) + + def add_output_formats(self, crate): + path = self.root / "workflow" / OUTPUTS_FILE_BASENAME + if path.is_file(): + with open(path) as f: + data = json.load(f) + self.map_input_data(crate, data) + + # -------------------------------------------------------------------------- + # Internal methods, called by the top level methods + + def add_step(self, crate, workflow, cwl_step): + step_fragment = _get_fragment(cwl_step.id) + step_id = f"{self.wf_path.name}#{step_fragment}" + pos = self.step_maps[_get_fragment(workflow.id)][step_fragment]["pos"] + step = crate.add(ContextEntity(crate, step_id, properties={ + "@type": "HowToStep", + "position": str(pos), + })) + tool = self.add_tool(crate, workflow, cwl_step.run) + step["workExample"] = tool + if hasattr(cwl_step, "label") and cwl_step.label: + step["name"] = cwl_step.label + if hasattr(cwl_step, "doc") and cwl_step.doc: + step["description"] = cwl_step.doc + workflow.append_to("step", step) + + def add_tool(self, crate, workflow, cwl_tool): + if isinstance(cwl_tool, str): + tool_fragment = _get_fragment(cwl_tool) + cwl_tool = self.workflow_definition[tool_fragment] + else: + tool_fragment = _get_fragment(cwl_tool.id) + if hasattr(cwl_tool, "expression"): + raise RuntimeError("ExpressionTool not supported yet") + tool_id = f"{self.wf_path.name}#{tool_fragment}" + tool = crate.dereference(tool_id) + if tool: + return tool + properties = {"name": tool_fragment} + if cwl_tool.doc: + properties["description"] = cwl_tool.doc + if cwl_tool.label: + properties["name"] = cwl_tool.label + if hasattr(cwl_tool, "steps"): + properties["@type"] = ["SoftwareSourceCode", "ComputationalWorkflow", "HowTo"] + else: + properties["@type"] = "SoftwareApplication" + if hasattr(cwl_tool, "intent") and cwl_tool.intent: + properties["featureList"] = cwl_tool.intent + if hasattr(cwl_tool, "requirements") and cwl_tool.requirements: + for req in cwl_tool.requirements: + if req.class_ == "ResourceRequirement": + ramMin = req.ramMin + if ramMin: + properties["memoryRequirements"] = f"{int(ramMin)} MiB" + deps = [] + if hasattr(cwl_tool, "hints") and cwl_tool.hints: + for req in cwl_tool.hints: + if hasattr(req, "class_") and req.class_ == "ResourceRequirement": + ramMin = req.ramMin + if ramMin: + properties["memoryRequirements"] = f"{int(ramMin)} MiB" + if hasattr(req, "class_") and req.class_ == "SoftwareRequirement": + for p in req.packages: + if hasattr(p, "specs") and p.specs: + dep_id = p.specs[0] + dep_properties = { + "@type": "SoftwareApplication", + "name": p.package + } + if p.version: + dep_properties["softwareVersion"] = p.version + deps.append( + crate.add(ContextEntity(crate, dep_id, properties=dep_properties)) + ) + tool = crate.add(ContextEntity(crate, tool_id, properties=properties)) + if deps: + tool["softwareRequirements"] = deps + if len(deps) == 1: + tool["mainEntity"] = deps[0] + tool["input"] = self.add_params(crate, cwl_tool.inputs) + tool["output"] = self.add_params(crate, cwl_tool.outputs) + workflow.append_to("hasPart", tool) + if hasattr(cwl_tool, "steps"): + tool["programmingLanguage"] = workflow["programmingLanguage"] + for s in cwl_tool.steps: + self.add_step(crate, tool, s) + self.add_param_connections(crate, tool) + return tool + + def add_params(self, crate, cwl_params): + params = [] + for cwl_p in cwl_params: + p_id = _get_relative_uri(cwl_p.id) + properties = _properties_from_cwl_param(cwl_p) + properties["name"] = p_id.rsplit("/", 1)[-1] + p = crate.add(ContextEntity(crate, p_id, properties=properties)) + params.append(p) + return params + + def add_agent(self, crate, roc_engine_run, engine): + delegate = engine.start().starter_activity() + try: + delegation = next(engine.provenance.record_with_attr( + prov.model.ProvDelegation, delegate.id, prov.model.PROV_ATTR_DELEGATE + )) + except StopIteration: + return + responsible = delegation.get_attribute(prov.model.PROV_ATTR_RESPONSIBLE) + agent = sum((engine.provenance.prov_doc.get_record(_) for _ in responsible), []) + for a in agent: + if "prov:Person" not in set(str(_) for _ in a.get_asserted_types()): + continue + agent_id = a.identifier.uri + if not agent_id.startswith("http"): + agent_id = "#" + agent_id.rsplit(":", 1)[-1] + properties = { + "@type": "Person" + } + if isinstance(a.label, str): + properties["name"] = a.label + ro_a = crate.add(ContextEntity(crate, agent_id, properties=properties)) + roc_engine_run.append_to("agent", ro_a, compact=True) + + def add_container_images(self, crate, action, activity): + images = set() + for assoc in activity.association(): + for agent in activity.provenance.prov_doc.get_record(assoc.agent_id): + images |= agent.get_attribute("cwlprov:image") + for im in images: + properties = parse_img(im) + properties.update({ + "@type": "ContainerImage", + "additionalType": {"@id": DOCKER_IMG_TYPE} + }) + roc_img = crate.add(ContextEntity(crate, properties=properties)) + action.append_to("containerImage", roc_img, compact=True) + + def add_action_params(self, crate, activity, to_wf_p, ptype="usage"): + action_params = [] + all_roles = set() + for rel in getattr(activity, ptype)(): + k = _get_relative_uri(rel.role.uri) + if str(activity.type) == "wfprov:WorkflowRun": + # workflow output roles have a phantom step part + if ptype == "generation": + k = _cut_step_part(k) + # In the case of a single tool run, cwltool reports one WorkflowRun + # and no ProcessRun; some parameters are duplicated, appearing both + # with role main/PARAM_NAME and main/ORIGINAL_WF_NAME/PARAM_NAME + if not list(activity.steps()): + k = _cut_step_part(k) + if k in all_roles: + continue + all_roles.add(k) + wf_p = crate.dereference(to_wf_p(k)) + k = _get_fragment(k) + v = rel.entity() + value = self.convert_param(v, + crate, + manifest=self.manifest + ) + if value is None: + continue # param is optional with no default and was not set + if {"ro:Folder", "wf4ever:File"} & set(str(_) for _ in v.types()): + action_p = value + else: + # FIXME: assuming arrays and records don't have nested structured types + if isinstance(value, dict): + value = [crate.add(ContextEntity(crate, f"#pv-{k}/{nk}", properties={ + "@type": "PropertyValue", + "name": nk, + "value": nv, + })) for nk, nv in value.items()] + action_p = crate.add(ContextEntity(crate, f"#pv-{k}", properties={ + "@type": "PropertyValue", + "name": k.rsplit("/", 1)[-1], + })) + action_p["value"] = value + action_p["exampleOfWork"] = list(set( + as_list(action_p.get("exampleOfWork", [])) + [wf_p] + )) + if len(action_p["exampleOfWork"]) == 1: + action_p["exampleOfWork"] = action_p["exampleOfWork"][0] + if ptype == "generation": + action_p["dateCreated"] = rel.time.isoformat() + action_params.append(action_p) + return action_params + + def add_param_connections(self, crate, workflow): + def connect(source, target, entity): + connection = crate.add(ContextEntity(crate, properties={ + "@type": "ParameterConnection" + })) + connection["sourceParameter"] = crate.get(f"{self.WORKFLOW_BASENAME}#{source}") + connection["targetParameter"] = crate.get(f"{self.WORKFLOW_BASENAME}#{target}") + entity.append_to("connection", connection) + wf_name = _get_fragment(workflow.id) + wf_def = self.workflow_definition[wf_name] + step_map = self.step_maps[wf_name] + out_map = {} + for step in wf_def.steps: + step_name = _get_fragment(step.id) + tool_name = step_map[step_name]["tool"] + for o in step.out: + o_name = _get_fragment(o) + out_map[o_name] = o_name.replace(step_name, tool_name) + for step in wf_def.steps: + step_name = _get_fragment(step.id) + ro_step = crate.get(f"{self.wf_path.name}#{step_name}") + tool_name = step_map[step_name]["tool"] + for mapping in getattr(step, "in_", []): + if not mapping.source: + continue + sources = [mapping.source] if not isinstance( + mapping.source, list + ) else mapping.source + for s in sources: + from_param = _get_fragment(s) + try: + from_param = out_map[from_param] + except KeyError: + pass # only needed if source is from another step + to_param = _get_fragment(mapping.id).replace(step_name, tool_name) + connect(from_param, to_param, ro_step) + for out in getattr(wf_def, "outputs", []): + out_sources = [out.outputSource] if not isinstance( + out.outputSource, list + ) else out.outputSource + for out_s in out_sources: + from_param = _get_fragment(out_s) + try: + from_param = out_map[from_param] + except KeyError: + # assuming this is a passthrough for a workflow input parameter + pass + to_param = _get_fragment(out.id) + connect(from_param, to_param, workflow) + + def get_manifest(self, root=None, MANIFEST_FILE=None): + manifest = {} + with open(root / Path(MANIFEST_FILE)) as f: + for line in f: + hash_, relpath = line.strip().split(None, 1) + manifest[hash_] = root / relpath + return manifest + + def get_workflow(self): """\ Get the workflow from the given path. @@ -32,8 +484,7 @@ def get_workflow(self, wf_path): around issues. """ - wf_path = Path(wf_path) - with open(wf_path, "rt") as f: + with open(self.wf_path, "rt") as f: json_wf = json.load(f) graph = json_wf.get("$graph", [json_wf]) # https://github.com/common-workflow-language/cwltool/pull/1506 @@ -41,21 +492,25 @@ def get_workflow(self, wf_path): ns = n.pop("$namespaces", {}) if ns: json_wf.setdefault("$namespaces", {}).update(ns) - defs = load_document_by_yaml(json_wf, wf_path.absolute().as_uri(), load_all=True) + defs = load_document_by_yaml(json_wf, self.wf_path.absolute().as_uri(), load_all=True) if not isinstance(defs, list): defs = [defs] def_map = {} for d in defs: k = _get_fragment(d.id) if k == "main": - k = wf_path.name + k = self.wf_path.name def_map[k] = d - _normalize_cwl_defs(def_map) + self.workflow_definition = _normalize_workflow_definition(def_map) return def_map - def get_step_maps(self, cwl_defs): + def get_step_maps(self): + """ + Get a mapping of step names to their tool names and positions. + """ + rval = {} - for k, v in cwl_defs.items(): + for k, v in self.workflow_definition.items(): if hasattr(v, "steps"): graph = self.build_step_graph(v) pos_map = {f: i for i, f in enumerate(nx.topological_sort(graph))} @@ -66,6 +521,10 @@ def get_step_maps(self, cwl_defs): return rval def build_step_graph(self, cwl_wf): + """ + Build a graph of steps in the workflow. + """ + out_map = {} for s in cwl_wf.steps: for o in s.out: @@ -87,9 +546,7 @@ def convert_param(self, crate, convert_secondary=True, parent=None, - hashes=None, manifest=None, - file_map=None ): type_names = frozenset(str(_) for _ in prov_param.types()) secondary_files = [_.generated_entity() for _ in prov_param.derivations() @@ -98,8 +555,7 @@ def convert_param(self, main_entity = self.convert_param(prov_param, crate, convert_secondary=False, - manifest=manifest, - file_map=file_map) + manifest=manifest) action_p = self.collections.get(main_entity.id) if not action_p: action_p = crate.add(ContextEntity(crate, properties={ @@ -110,7 +566,6 @@ def convert_param(self, self.convert_param(_, crate, manifest=manifest, - file_map=file_map ) for _ in secondary_files ] crate.root_dataset.append_to("mentions", action_p) @@ -131,7 +586,7 @@ def convert_param(self, source_k = str(source.resolve(strict=False)) except RuntimeError: source_k = str(source) - file_map[source_k] = dest + self.file_map[source_k] = dest return action_p if "ro:Folder" in type_names: hash_ = self.hashes[prov_param.id.localpart] @@ -145,7 +600,6 @@ def convert_param(self, crate, parent=action_p, manifest=manifest, - file_map=file_map ) action_p.append_to("hasPart", part) return action_p @@ -156,7 +610,6 @@ def convert_param(self, (k, self.convert_param(v, crate, manifest=manifest, - file_map=file_map )) for k, v in _get_dict(prov_param).items() if k != "@id" @@ -165,7 +618,6 @@ def convert_param(self, return [self.convert_param(_, crate, manifest=manifest, - file_map=file_map ) for _ in _get_members(prov_param)] if prov_param.id.uri == CWLPROV_NONE: return None @@ -173,12 +625,12 @@ def convert_param(self, def get_hashes(self, provenance): for r in provenance.prov_doc.get_records(prov.model.ProvEntity): - self._get_hash(self.hashes, Entity(provenance, r)) + self.get_hash(Entity(provenance, r)) - def _get_hash(self, hashes, prov_param): + def get_hash(self, prov_param): k = prov_param.id.localpart try: - return hashes[k] + return self.hashes[k] except KeyError: type_names = frozenset(str(_) for _ in prov_param.types()) if "wf4ever:File" in type_names: @@ -188,11 +640,36 @@ def _get_hash(self, hashes, prov_param): elif "ro:Folder" in type_names: m = hashlib.sha1() m.update("".join(sorted( - self._get_hash(hashes, _) for _ in _get_dict(prov_param).values() + self.get_hash(_) for _ in _get_dict(prov_param).values() )).encode()) self.hashes[k] = hash_ = m.hexdigest() return hash_ + def map_input_data(self, crate, data): + if isinstance(data, list): + return [self.map_input_data(crate, _) for _ in data] + if isinstance(data, dict): + rval = {} + for k, v in data.items(): + if k == "location": + source = self.root / "workflow" / v + try: + source_k = str(source.resolve(strict=False)) + except RuntimeError: + source_k = str(source) + dest = self.file_map.get(source_k) + rval[k] = str(dest) if dest else v + fmt = data.get("format") + if fmt: + entity = crate.get(str(dest)) + if entity: + entity["encodingFormat"] = fmt + else: + rval[k] = self.map_input_data(crate, v) + return rval + return data + + def _get_members(entity): membership = entity.provenance.record_with_attr( @@ -206,9 +683,9 @@ def _get_fragment(uri): return uri.rsplit("#", 1)[-1] -def _normalize_cwl_defs(cwl_defs): +def _normalize_workflow_definition(workflow_definition): inline_tools = {} - for d in cwl_defs.values(): + for d in workflow_definition.values(): if not hasattr(d, "steps") or not d.steps: continue for s in d.steps: @@ -219,7 +696,7 @@ def _normalize_cwl_defs(cwl_defs): tool.id = f"{s.id}/run" inline_tools[_get_fragment(tool.id)] = tool s.run = tool.id - cwl_defs.update(inline_tools) + return workflow_definition.update(inline_tools) def _set_alternate_name(prov_param, action_p, parent=None): @@ -241,3 +718,71 @@ def _get_dict(entity): entity_id = first(kvp.record.get_attribute("prov:pairEntity")) d[key] = entity.provenance.entity(entity_id) return d + +def _resolve_plan(activity): + job_qname = activity.plan() + plan = activity.provenance.entity(job_qname) + if not plan: + m = SCATTER_JOB_PATTERN.match(str(job_qname)) + if m: + plan = activity.provenance.entity(m.groups()[0]) + return plan + +def _get_relative_uri(uri): + doc, fragment = uri.rsplit("#", 1) + return f"{doc.rsplit('/', 1)[-1]}#{fragment}" + +def _properties_from_cwl_param(cwl_p): + def is_structured(cwl_type): + return getattr(cwl_type, "type_", None) in ("array", "record") + additional_type = "Collection" if cwl_p.secondaryFiles else _convert_cwl_type(cwl_p.type_) + properties = { + "@type": "FormalParameter", + "additionalType": additional_type + } + if hasattr(cwl_p, "doc") and cwl_p.doc: + properties["description"] = cwl_p.doc + elif hasattr(cwl_p, "label") and cwl_p.label: + # name is used for the parameter's id to support reproducibility + properties["description"] = cwl_p.label + if cwl_p.format: + properties["encodingFormat"] = cwl_p.format + if isinstance(cwl_p.type_, list) and "null" in cwl_p.type_: + properties["valueRequired"] = "False" + if is_structured(cwl_p.type_): + properties["multipleValues"] = "True" + if hasattr(cwl_p, "default"): + if isinstance(cwl_p.default, dict): + if cwl_p.default.get("class") in ("File", "Directory"): + default = cwl_p.default.get("location", cwl_p.default.get("path")) + if default: + properties["defaultValue"] = default + elif not is_structured(cwl_p.type_) and cwl_p.default is not None: + properties["defaultValue"] = str(cwl_p.default) + # TODO: support more cases + if getattr(cwl_p.type_, "type_", None) == "enum": + properties["valuePattern"] = "|".join(_.rsplit("/", 1)[-1] for _ in cwl_p.type_.symbols) + return properties + +def _convert_cwl_type(cwl_type): + if isinstance(cwl_type, list): + s = set(_convert_cwl_type(_) for _ in cwl_type) + s.discard(None) + return s.pop() if len(s) == 1 else sorted(s) + if isinstance(cwl_type, str): + return CWL_TYPE_MAP[cwl_type] + if cwl_type.type_ == "enum": + return "Text" # use actionOption to represent choices? + if cwl_type.type_ == "array": + return _convert_cwl_type(cwl_type.items) + if cwl_type.type_ == "record": + return "PropertyValue" + +def _get_fragment(uri): + return uri.rsplit("#", 1)[-1] + +def _cut_step_part(relative_uri): + parts = relative_uri.split("/", 2) + if len(parts) > 2: + relative_uri = parts[0] + "/" + parts[2] + return relative_uri diff --git a/tests/test_step_mapping.py b/tests/test_step_mapping.py index ac4fed1..1c3191b 100644 --- a/tests/test_step_mapping.py +++ b/tests/test_step_mapping.py @@ -26,8 +26,10 @@ def converter(): def test_step_maps_cwl(data_dir, converter): wf_basename = "exome-alignment-packed.cwl" wf_path = data_dir / wf_basename - cwl_defs = converter.get_workflow(wf_path) - step_maps = converter.get_step_maps(cwl_defs) + converter.wf_path = wf_path + cwl_defs = converter.get_workflow() + converter.workflow_definition = cwl_defs + step_maps = converter.get_step_maps() assert set(step_maps) == {wf_basename} sm = step_maps[wf_basename] assert len(sm) == 8 @@ -48,9 +50,10 @@ def test_step_maps_cwl(data_dir, converter): def test_step_maps_disconnected_cwl(data_dir, converter): - wf_path = data_dir / "no-output-run-1/workflow/packed.cwl" - cwl_defs = converter.get_workflow(wf_path) - step_maps = converter.get_step_maps(cwl_defs) + converter.wf_path = data_dir / "no-output-run-1/workflow/packed.cwl" + cwl_defs = converter.get_workflow() + converter.workflow_definition = cwl_defs + step_maps = converter.get_step_maps() assert set(step_maps) == {"packed.cwl"} sm = step_maps["packed.cwl"] assert set(sm) == {"main/date_step", "main/echo_step", "main/date2_step"} From 3608552097dae0fdc4790245e2d0ac94a20cc66e Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Tue, 10 Dec 2024 13:56:08 +0000 Subject: [PATCH 10/23] Move generic root metadata creation to base class --- src/runcrate/convert.py | 1 - src/runcrate/converters/base.py | 14 ++++++++++++-- src/runcrate/converters/cwl.py | 9 --------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index fc5f3eb..a2996a2 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -62,7 +62,6 @@ def __init__(self, def build(self): crate = ROCrate(gen_preview=False) crate.metadata.extra_contexts.append(TERMS_NAMESPACE) - self.converter.add_root_metadata(crate) self.converter.add_profiles(crate) self.converter.add_workflow(crate) self.converter.add_engine_run(crate) diff --git a/src/runcrate/converters/base.py b/src/runcrate/converters/base.py index 98d8bdf..587039d 100644 --- a/src/runcrate/converters/base.py +++ b/src/runcrate/converters/base.py @@ -17,14 +17,24 @@ def __init__(self): self.file_map = {} self.manifest = None + add_root_metadata(self, crate) + # -------------------------------------------------------------------------- # Top level functions - called by the build() function def add_root_metadata(self, crate): """ - Add metadata to the root of the crate. + Add license and readme to the root of the crate, if provided. """ - raise NotImplementedError("add_root_metadata") + if self.license: + crate.root_dataset["license"] = self.license + if self.readme: + readme = crate.add_file(self.readme) + readme["about"] = crate.root_dataset + if self.readme.suffix.lower() == ".md": + readme["encodingFormat"] = "text/markdown" + + return def add_profiles(self, crate): """ diff --git a/src/runcrate/converters/cwl.py b/src/runcrate/converters/cwl.py index f56134a..6c31d16 100644 --- a/src/runcrate/converters/cwl.py +++ b/src/runcrate/converters/cwl.py @@ -46,15 +46,6 @@ class cwlConverter(converter): # -------------------------------------------------------------------------- # Top level methods, called by build() - def add_root_metadata(self, crate): - if self.license: - crate.root_dataset["license"] = self.license - if self.readme: - readme = crate.add_file(self.readme) - readme["about"] = crate.root_dataset - if self.readme.suffix.lower() == ".md": - readme["encodingFormat"] = "text/markdown" - def add_profiles(self, crate): profiles = [] for p in "process", "workflow", "provenance": From ffefec6d6304cfce750744b65064a3c1856381b0 Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Wed, 11 Dec 2024 08:09:28 +0000 Subject: [PATCH 11/23] Add test of CLI option --- tests/test_cli.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 2428ec6..d17bb8a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -26,7 +26,22 @@ def test_cli_convert(data_dir, tmpdir, monkeypatch): monkeypatch.chdir(str(tmpdir)) root = data_dir / "revsort-run-1" runner = CliRunner() - args = ["convert", "-c", "cwl", str(root)] + args = ["convert", str(root)] + result = runner.invoke(cli, args) + assert result.exit_code == 0, result.exception + crate_zip = tmpdir / f"{root.name}.crate.zip" + assert crate_zip.is_file() + crate = ROCrate(crate_zip) + assert not crate.root_dataset.get("license") + workflow = crate.mainEntity + assert workflow["name"] == "packed.cwl" + + +def test_cli_convert_with_cwl_converter_set_explictly(data_dir, tmpdir, monkeypatch): + monkeypatch.chdir(str(tmpdir)) + root = data_dir / "revsort-run-1" + runner = CliRunner() + args = ["convert", "--converter", "cwl", str(root)] result = runner.invoke(cli, args) assert result.exit_code == 0, result.exception crate_zip = tmpdir / f"{root.name}.crate.zip" From 4687ca9209e76e49aacfae481e585b3154820bb0 Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Wed, 11 Dec 2024 08:09:36 +0000 Subject: [PATCH 12/23] Apply linting --- src/runcrate/constants.py | 2 -- src/runcrate/convert.py | 7 ++----- src/runcrate/converters/base.py | 8 +++----- src/runcrate/converters/cwl.py | 19 ++++++++++--------- 4 files changed, 15 insertions(+), 21 deletions(-) diff --git a/src/runcrate/constants.py b/src/runcrate/constants.py index e011ec3..68f0992 100644 --- a/src/runcrate/constants.py +++ b/src/runcrate/constants.py @@ -27,5 +27,3 @@ WROC_PROFILE_VERSION = "1.0" DOCKER_IMG_TYPE = "https://w3id.org/ro/terms/workflow-run#DockerImage" - - diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index a2996a2..03b9c13 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -18,15 +18,11 @@ Generate a Workflow Run RO-Crate from a CWLProv RO bundle. """ -import json from pathlib import Path -import prov.model from bdbag.bdbagit import BDBag from cwlprov.prov import Provenance from cwlprov.ro import ResearchObject -from rocrate.model.contextentity import ContextEntity -from rocrate.model.softwareapplication import SoftwareApplication from rocrate.rocrate import ROCrate from .constants import TERMS_NAMESPACE @@ -34,6 +30,7 @@ MANIFEST_FILE = "manifest-sha1.txt" + class ProvCrateBuilder: def __init__(self, root, @@ -62,6 +59,7 @@ def __init__(self, def build(self): crate = ROCrate(gen_preview=False) crate.metadata.extra_contexts.append(TERMS_NAMESPACE) + self.converter.add_root_metadata(crate) self.converter.add_profiles(crate) self.converter.add_workflow(crate) self.converter.add_engine_run(crate) @@ -70,4 +68,3 @@ def build(self): self.converter.add_inputs_file(crate) self.converter.add_output_formats(crate) return crate - diff --git a/src/runcrate/converters/base.py b/src/runcrate/converters/base.py index 587039d..8491deb 100644 --- a/src/runcrate/converters/base.py +++ b/src/runcrate/converters/base.py @@ -11,14 +11,12 @@ def __init__(self): self.with_prov = set() self.workflow_run = None self.roc_engine_run = None - self.control_actions = {} + self.control_actions = {} self.collections = {} self.hashes = {} self.file_map = {} self.manifest = None - add_root_metadata(self, crate) - # -------------------------------------------------------------------------- # Top level functions - called by the build() function @@ -34,7 +32,7 @@ def add_root_metadata(self, crate): if self.readme.suffix.lower() == ".md": readme["encodingFormat"] = "text/markdown" - return + return def add_profiles(self, crate): """ @@ -77,7 +75,7 @@ def add_output_formats(self, crate): Add output formats to the crate. """ raise NotImplementedError("add_output_formats") - + # -------------------------------------------------------------------------- # Helper functions - called by the top level functions diff --git a/src/runcrate/converters/cwl.py b/src/runcrate/converters/cwl.py index 6c31d16..af43e2a 100644 --- a/src/runcrate/converters/cwl.py +++ b/src/runcrate/converters/cwl.py @@ -13,7 +13,6 @@ from rocrate.model.contextentity import ContextEntity from rocrate.model.softwareapplication import SoftwareApplication -from rocrate.rocrate import ROCrate from .base import converter from ..constants import PROFILES_BASE, PROFILES_VERSION, WROC_PROFILE_VERSION, DOCKER_IMG_TYPE @@ -39,7 +38,8 @@ SCATTER_JOB_PATTERN = re.compile(r"^(.+)_\d+$") -class cwlConverter(converter): + +class cwlConverter(converter): WORKFLOW_BASENAME = "packed.cwl" @@ -499,7 +499,7 @@ def get_step_maps(self): """ Get a mapping of step names to their tool names and positions. """ - + rval = {} for k, v in self.workflow_definition.items(): if hasattr(v, "steps"): @@ -515,7 +515,7 @@ def build_step_graph(self, cwl_wf): """ Build a graph of steps in the workflow. """ - + out_map = {} for s in cwl_wf.steps: for o in s.out: @@ -661,7 +661,6 @@ def map_input_data(self, crate, data): return data - def _get_members(entity): membership = entity.provenance.record_with_attr( prov.model.ProvMembership, entity.id, prov.model.PROV_ATTR_COLLECTION @@ -670,10 +669,6 @@ def _get_members(entity): return (entity.provenance.entity(first(_)) for _ in member_ids) -def _get_fragment(uri): - return uri.rsplit("#", 1)[-1] - - def _normalize_workflow_definition(workflow_definition): inline_tools = {} for d in workflow_definition.values(): @@ -710,6 +705,7 @@ def _get_dict(entity): d[key] = entity.provenance.entity(entity_id) return d + def _resolve_plan(activity): job_qname = activity.plan() plan = activity.provenance.entity(job_qname) @@ -719,10 +715,12 @@ def _resolve_plan(activity): plan = activity.provenance.entity(m.groups()[0]) return plan + def _get_relative_uri(uri): doc, fragment = uri.rsplit("#", 1) return f"{doc.rsplit('/', 1)[-1]}#{fragment}" + def _properties_from_cwl_param(cwl_p): def is_structured(cwl_type): return getattr(cwl_type, "type_", None) in ("array", "record") @@ -755,6 +753,7 @@ def is_structured(cwl_type): properties["valuePattern"] = "|".join(_.rsplit("/", 1)[-1] for _ in cwl_p.type_.symbols) return properties + def _convert_cwl_type(cwl_type): if isinstance(cwl_type, list): s = set(_convert_cwl_type(_) for _ in cwl_type) @@ -769,9 +768,11 @@ def _convert_cwl_type(cwl_type): if cwl_type.type_ == "record": return "PropertyValue" + def _get_fragment(uri): return uri.rsplit("#", 1)[-1] + def _cut_step_part(relative_uri): parts = relative_uri.split("/", 2) if len(parts) > 2: From 6f9cc68e21b2447f7bd31810e75ead080940c1a8 Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Wed, 11 Dec 2024 08:33:09 +0000 Subject: [PATCH 13/23] Remove tab on newline --- tests/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index d17bb8a..c8d09bb 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -36,7 +36,7 @@ def test_cli_convert(data_dir, tmpdir, monkeypatch): workflow = crate.mainEntity assert workflow["name"] == "packed.cwl" - + def test_cli_convert_with_cwl_converter_set_explictly(data_dir, tmpdir, monkeypatch): monkeypatch.chdir(str(tmpdir)) root = data_dir / "revsort-run-1" From c71f73bde33baad679029da48f0f6b47c1b2a5d5 Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Wed, 11 Dec 2024 08:35:27 +0000 Subject: [PATCH 14/23] Fix import order --- src/runcrate/convert.py | 1 + src/runcrate/converters/cwl.py | 11 +++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index 03b9c13..b4ad996 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -28,6 +28,7 @@ from .constants import TERMS_NAMESPACE from .converters import CONVERTERS + MANIFEST_FILE = "manifest-sha1.txt" diff --git a/src/runcrate/converters/cwl.py b/src/runcrate/converters/cwl.py index af43e2a..5b46819 100644 --- a/src/runcrate/converters/cwl.py +++ b/src/runcrate/converters/cwl.py @@ -1,22 +1,21 @@ import hashlib import json import re -from pathlib import Path from io import StringIO +from pathlib import Path import networkx as nx import prov.model from cwl_utils.parser import load_document_by_yaml -from cwlprov.prov import Entity +from cwlprov.prov import Entity, Provenance from cwlprov.utils import first -from cwlprov.prov import Provenance - from rocrate.model.contextentity import ContextEntity from rocrate.model.softwareapplication import SoftwareApplication -from .base import converter -from ..constants import PROFILES_BASE, PROFILES_VERSION, WROC_PROFILE_VERSION, DOCKER_IMG_TYPE +from ..constants import DOCKER_IMG_TYPE, PROFILES_BASE, PROFILES_VERSION, WROC_PROFILE_VERSION from ..utils import as_list, parse_img +from .base import converter + CWLPROV_NONE = "https://w3id.org/cwl/prov#None" From 9ef32e530ef0ed98d2e99dc62a6e23fa30f080ca Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Wed, 11 Dec 2024 09:18:16 +0000 Subject: [PATCH 15/23] Add tests covering non-implemented methods in base converter class --- tests/test_converter_base.py | 82 ++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 tests/test_converter_base.py diff --git a/tests/test_converter_base.py b/tests/test_converter_base.py new file mode 100644 index 0000000..689d28c --- /dev/null +++ b/tests/test_converter_base.py @@ -0,0 +1,82 @@ +# Copyright 2022-2024 CRS4. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from runcrate.converters.base import converter + + +@pytest.fixture +def converter_instance(): + converter_instance = converter() + return converter_instance + + +def test_initialization(converter_instance): + assert isinstance(converter_instance, converter) + + +def test_add_profiles(converter_instance): + with pytest.raises(NotImplementedError): + converter_instance.add_profiles(None) + + +def test_add_workflow(converter_instance): + with pytest.raises(NotImplementedError): + converter_instance.add_workflow(None) + + +def test_add_engine_run(converter_instance): + with pytest.raises(NotImplementedError): + converter_instance.add_engine_run(None) + + +def test_add_action(converter_instance): + with pytest.raises(NotImplementedError): + converter_instance.add_action(None, None) + + +def test_patch_workflow_input_collection(converter_instance): + with pytest.raises(NotImplementedError): + converter_instance.patch_workflow_input_collection(None) + + +def test_add_inputs_files(converter_instance): + with pytest.raises(NotImplementedError): + converter_instance.add_inputs_files(None) + + +def test_add_output_formats(converter_instance): + with pytest.raises(NotImplementedError): + converter_instance.add_output_formats(None) + + +def test_get_workflow(converter_instance): + with pytest.raises(NotImplementedError): + converter_instance.get_workflow(None) + + +def test_get_step_maps(converter_instance): + with pytest.raises(NotImplementedError): + converter_instance.get_step_maps(None) + + +def test_build_step_graph(converter_instance): + with pytest.raises(NotImplementedError): + converter_instance.build_step_graph(None) + + +def test_convert_param(converter_instance): + with pytest.raises(NotImplementedError): + converter_instance.convert_param(None, None) From 925f67da21263d082bd59f9fb03aeb12988e59f5 Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Wed, 11 Dec 2024 09:46:09 +0000 Subject: [PATCH 16/23] Move profile addition to base converter class --- src/runcrate/converters/base.py | 25 ++++++++++++++++++++++++- src/runcrate/converters/cwl.py | 21 +-------------------- tests/test_converter_base.py | 5 ----- 3 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/runcrate/converters/base.py b/src/runcrate/converters/base.py index 8491deb..15b9fd8 100644 --- a/src/runcrate/converters/base.py +++ b/src/runcrate/converters/base.py @@ -1,3 +1,8 @@ +from rocrate.model.contextentity import ContextEntity + +from ..constants import PROFILES_BASE, PROFILES_VERSION, WROC_PROFILE_VERSION + + class converter: def __init__(self): self.root = None @@ -38,7 +43,25 @@ def add_profiles(self, crate): """ Add profiles to the crate. """ - raise NotImplementedError("add_profiles") + profiles = [] + for p in "process", "workflow", "provenance": + id_ = f"{PROFILES_BASE}/{p}/{PROFILES_VERSION}" + profiles.append(crate.add(ContextEntity(crate, id_, properties={ + "@type": "CreativeWork", + "name": f"{p.title()} Run Crate", + "version": PROFILES_VERSION, + }))) + # FIXME: in the future, this could go out of sync with the wroc + # profile added by ro-crate-py to the metadata descriptor + wroc_profile_id = f"https://w3id.org/workflowhub/workflow-ro-crate/{WROC_PROFILE_VERSION}" + profiles.append(crate.add(ContextEntity(crate, wroc_profile_id, properties={ + "@type": "CreativeWork", + "name": "Workflow RO-Crate", + "version": WROC_PROFILE_VERSION, + }))) + crate.root_dataset["conformsTo"] = profiles + + return def add_workflow(self, crate): """ diff --git a/src/runcrate/converters/cwl.py b/src/runcrate/converters/cwl.py index 5b46819..b214b89 100644 --- a/src/runcrate/converters/cwl.py +++ b/src/runcrate/converters/cwl.py @@ -12,7 +12,7 @@ from rocrate.model.contextentity import ContextEntity from rocrate.model.softwareapplication import SoftwareApplication -from ..constants import DOCKER_IMG_TYPE, PROFILES_BASE, PROFILES_VERSION, WROC_PROFILE_VERSION +from ..constants import DOCKER_IMG_TYPE from ..utils import as_list, parse_img from .base import converter @@ -45,25 +45,6 @@ class cwlConverter(converter): # -------------------------------------------------------------------------- # Top level methods, called by build() - def add_profiles(self, crate): - profiles = [] - for p in "process", "workflow", "provenance": - id_ = f"{PROFILES_BASE}/{p}/{PROFILES_VERSION}" - profiles.append(crate.add(ContextEntity(crate, id_, properties={ - "@type": "CreativeWork", - "name": f"{p.title()} Run Crate", - "version": PROFILES_VERSION, - }))) - # FIXME: in the future, this could go out of sync with the wroc - # profile added by ro-crate-py to the metadata descriptor - wroc_profile_id = f"https://w3id.org/workflowhub/workflow-ro-crate/{WROC_PROFILE_VERSION}" - profiles.append(crate.add(ContextEntity(crate, wroc_profile_id, properties={ - "@type": "CreativeWork", - "name": "Workflow RO-Crate", - "version": WROC_PROFILE_VERSION, - }))) - crate.root_dataset["conformsTo"] = profiles - def add_workflow(self, crate): lang_version = self.workflow_definition[self.WORKFLOW_BASENAME].cwlVersion properties = { diff --git a/tests/test_converter_base.py b/tests/test_converter_base.py index 689d28c..c75d325 100644 --- a/tests/test_converter_base.py +++ b/tests/test_converter_base.py @@ -27,11 +27,6 @@ def test_initialization(converter_instance): assert isinstance(converter_instance, converter) -def test_add_profiles(converter_instance): - with pytest.raises(NotImplementedError): - converter_instance.add_profiles(None) - - def test_add_workflow(converter_instance): with pytest.raises(NotImplementedError): converter_instance.add_workflow(None) From 5f5e2309bbde121103e9b6780a7fadfd5e584569 Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Thu, 12 Dec 2024 07:26:01 +0000 Subject: [PATCH 17/23] Add unneeded converters protection --- src/runcrate/cli.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/runcrate/cli.py b/src/runcrate/cli.py index 7e5cbd2..1bea861 100644 --- a/src/runcrate/cli.py +++ b/src/runcrate/cli.py @@ -74,10 +74,6 @@ def convert(root, converter, output, license, workflow_name, readme): if not output: output = Path(f"{root.name}.crate.zip") - if converter not in CONVERTERS: - sys.stderr.write(f"Unknown converter: {converter}\n") - sys.exit(1) - converter_instance = CONVERTERS[converter] sys.stdout.write(f"Using converter: {converter_instance}\n") From ef44044988c054bd2f65dc6da19de0e6768a803c Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Thu, 12 Dec 2024 07:32:21 +0000 Subject: [PATCH 18/23] Make class name capital letter --- src/runcrate/converters/__init__.py | 8 ++++---- src/runcrate/converters/base.py | 2 +- src/runcrate/converters/cwl.py | 4 ++-- tests/test_converter_base.py | 6 +++--- tests/test_step_mapping.py | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/runcrate/converters/__init__.py b/src/runcrate/converters/__init__.py index d4d1d78..f13f34b 100644 --- a/src/runcrate/converters/__init__.py +++ b/src/runcrate/converters/__init__.py @@ -1,8 +1,8 @@ -from .base import converter -from .cwl import cwlConverter +from .base import Converter +from .cwl import CwlConverter CONVERTERS = { - "base": converter(), - "cwl": cwlConverter(), + "base": Converter(), + "cwl": CwlConverter(), } diff --git a/src/runcrate/converters/base.py b/src/runcrate/converters/base.py index 15b9fd8..d513229 100644 --- a/src/runcrate/converters/base.py +++ b/src/runcrate/converters/base.py @@ -3,7 +3,7 @@ from ..constants import PROFILES_BASE, PROFILES_VERSION, WROC_PROFILE_VERSION -class converter: +class Converter: def __init__(self): self.root = None self.workflow_name = None diff --git a/src/runcrate/converters/cwl.py b/src/runcrate/converters/cwl.py index b214b89..fea6dd8 100644 --- a/src/runcrate/converters/cwl.py +++ b/src/runcrate/converters/cwl.py @@ -14,7 +14,7 @@ from ..constants import DOCKER_IMG_TYPE from ..utils import as_list, parse_img -from .base import converter +from .base import Converter CWLPROV_NONE = "https://w3id.org/cwl/prov#None" @@ -38,7 +38,7 @@ SCATTER_JOB_PATTERN = re.compile(r"^(.+)_\d+$") -class cwlConverter(converter): +class CwlConverter(Converter): WORKFLOW_BASENAME = "packed.cwl" diff --git a/tests/test_converter_base.py b/tests/test_converter_base.py index c75d325..511acc1 100644 --- a/tests/test_converter_base.py +++ b/tests/test_converter_base.py @@ -14,17 +14,17 @@ import pytest -from runcrate.converters.base import converter +from runcrate.converters.base import Converter @pytest.fixture def converter_instance(): - converter_instance = converter() + converter_instance = Converter() return converter_instance def test_initialization(converter_instance): - assert isinstance(converter_instance, converter) + assert isinstance(converter_instance, Converter) def test_add_workflow(converter_instance): diff --git a/tests/test_step_mapping.py b/tests/test_step_mapping.py index 1c3191b..6785421 100644 --- a/tests/test_step_mapping.py +++ b/tests/test_step_mapping.py @@ -14,12 +14,12 @@ import pytest -from runcrate.converters.cwl import cwlConverter +from runcrate.converters.cwl import CwlConverter @pytest.fixture def converter(): - converter = cwlConverter() + converter = CwlConverter() return converter From a0383affd88fc28dbdaa855bbf3e460606cdd54e Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Thu, 12 Dec 2024 08:53:56 +0000 Subject: [PATCH 19/23] Make base an abstract base class --- revsort-run-1.crate.zip | Bin 0 -> 6056 bytes src/runcrate/convert.py | 2 +- src/runcrate/converters/__init__.py | 1 - src/runcrate/converters/base.py | 34 +++++++++++++++++++--------- src/runcrate/converters/cwl.py | 2 +- tests/test_converter_base.py | 33 ++++++++++++++++++++++++++- 6 files changed, 57 insertions(+), 15 deletions(-) create mode 100644 revsort-run-1.crate.zip diff --git a/revsort-run-1.crate.zip b/revsort-run-1.crate.zip new file mode 100644 index 0000000000000000000000000000000000000000..f69f19a678695f121883753cdd0d5ea116e7f8c6 GIT binary patch literal 6056 zcma)AWl$W;BQzU474wTj$<7_h>;usKfvO00Y4G9AT^nab+b20RVCY008EnS2q~k z(bk3s?(5WV2-W08XM$ewhk>IRfse?yS_)AN_KSvEhnvl1Ii&Av|^O9-B z`;$AwI6ZkrcK39AO!%z?4a2=WD>ST%u~lCBcx~6;Ws*=f7(D?DgXzABC#+KUJe*-TYPu8Y6t&sW7|zQq?icOuymi|*BaUx) zQqBTIjON?Z)so&B=sL{u8{v=ftSOoHR871JxeO&Vm^)8=KZuDKoQ&+jfG~L!r@W*o zG{a)*p8{$z`(f(H&TB52h<|_AH$u*q`hMFsSx}wm@WgGbpMNTj5o)KyltRcKK8X#^ zV)N5dQQ(8g7$t6HQ;zo585emoD*_wBQ_GDACJ@zenpEcS9F-}U5D zD4kMR(X>Y3mxWD-Hu?Ghkzqv2*7RcIQyRAOrh^0^k05^+QF5s~&Q{bzvN!{H>6d?s z3?>(o)9-l*bSRF_sOolkWU_?s;~ey4F{^#IQ5goB1w?YWKVxkOC6dR{juJLpaubL& zj?NvnQo6QxSeZ~u)YBSQ#e+pmZ<$c}wx^0tN9PQ;93?aKUA&rfI2r%w#;yc9yNl+R zu(O!;!KHQ#o;j`iaTk~SN`sTGN!j8^fMd?5x5he(dqSy&g6Q16>EJQTU?r)JtR4|3 zI_tsnVhu!vb)T~vxj|?Xb@_|-TVc}IFZInnWb03Il`_AuUnOOf(&k%UinLF~eRZEg z(ZCW9(B{6h_>^*OP`^Q1I-z0tXsf~GcnPX&n<{Z19J$(6aXar^wY%8F@a;-Ii{ksP z_e(0|PVJ}}L}nyAN&%l5Tm8VD)e`-y8`n2hdJj)?$?-gF;|J>74`roz$_o&}SGCoH z@Iq1|*FmY_-t7t`_?RGA%EQqR=y7HxQIHp}adNyb3D3dZh99|oty*I(-L!UaigCFw zB1WvWd=%o!U$_Mg_U7*XJla&hqVvyZx<6>!wZ9GXdqvyn7_Y#}&t2HJ(xdhe8Tm=) zZJo};US@d^tdVB1c+OJm-)g6LWYN_P2mlzO0sw4(YDa)y*bXiXv$e4kw1ZefL}7yb zFk!fewFs|>4O|E&470O?@CgV*^mWA(mU;1yy5x`(cD|K4*1=OnX;fGAw)fkR77Kw+qzh0|4MQPfNIf^^l(bELbp=A|rbj%LO=VTlrz8Lw}| zg`NeU<7l>BAQM~hhb2pdJuWO>=%!ILsB=K*+Q{7aMeIoTqgFF z-HBW7H1{Tqp0rGyuFn?`8F^zx6IIVI<&PxVz0mTjTk?4+c@0c$%ozeHHQRkxc!k0z)VG*EchqsYRd0Mm!4!s&V=V zinc23%hXHa2W@S*Z9BFs$&YMQ$U|Y{iy9tlO=}mr;)upBnEcjt3~1<1Ti$*PVqYN1 z!B@9FWQzBMMP%W#<3Rn{Pg*OkMTe^NY;aQ{|emMHKxynox{#&W%;oc z2SZXgHVM!Qu8w-(s@cbA6p08UVLAg)tH173gVdOCS>XR~r(RBFIML{Fje zP3)NrUf<6w|AO#1l9cs;yeo-27$^mOAm4IxKuU5qK z^@h$WF(9cyVK8nCA#zkEv$=eM{_m_!0BI+?{x8;Ai}LdcLWD%%a6usf0RcX12tS;k zUxeRUSePFs1m}ea*jU@}iwNrLw$&{2Vz1njd}{^&4nz2kOqZgGapF2ZO;XS=A-oO= z#!LH#MT|i$t(;HRC>)|SJ1a8b)ioEHjqf__Mo4sW@U0`nH0+}&@MDS#WN6vdaa zzG8wQN0mnY)uSwK!HzO)>zoEOyi1QhQKY4BBDj?-92M2LjoJ zk$4hN8x%s^byVEYH&zxhC|&M9#7HO%iXv9D)tniwM2$fw8SD`(j)!xl#ZKRW5<#cq zLw4+%6~U2y5{#HpbmMKzGrEjRGt0GG+6G@D$l9LGJQK6UHJ!z3_u1rxXH_N2fUNh%>ESJRBJrD7?zVhg<|9pJQ{=R4~sQ=i|VWGfH7YAHUI zB{h_XA;sw5>o3p~c1}!kxymXTflh7S(6R+7Bnll> z0tpXn;AK=iaGtWvGjh*)@l>a?v`GwAW9w#>>&8+5{|!QIH+>iO<0&i*YhLJt3!c4N z+o4sSnU-~b7iH)asJLW%^A~AR3>u@{{}*XRh3#zltc7^31%*WT;kMRrVH*)40bx;y zt+fq=-&Vku4-|g2A{OUc_AxYC7V^r55CVH2g}FB_ECgbzUsXSln z6+5)^!@!6Tda~+)wW+NB0_zh?W_V#SR!)+bQvhe15BL;Z=Tc#Zu7@dfvZ`I)?c~{BUo4tZtFK83kxrAY<6W4G(Uh z0rb8OJIHf!)T3CO$w4=w3}$t}Y`yxGvEC{p=mac+Ajoy<9_5<3KtWTr2=+&^&wDu4 z$GV^$(No=!X#!~}v2v)2%l$TCM_}2s7*~i`fh;AJfgPGVVhvS`gKDyU%p`;NiMLQQ zo9241S^0=~R%);BwW}<;TI6lN-&jZ$w(w8lJP#LB;jLkFF7Wd~MPBmoXf!X?XAz1+ z@dk%|#&WlBWWRjG9Wm#{9?v{NF|K{edpGTLfP@-tP?vjh(qFNgyX~v=@R#zg(if6= z?c)2Ff?MOeZ1HFKMdeXbZ#?phG1Km7@2nDYxZWHNcfG$1IDVk=OS|rW+t8fMf zR#C3g8Rao5Rgz+EK5bZXsDL!XsK@HoU-ddPTQxAifY5PR>4*{Cvo(W~p#T~w>-`_k zdm|>vp+``9|`Dx*T}{g{5pSgJ?M2y6X;YR8K7b zq{}R?)g2UZFH3f3Nb%wWR%C=zZM1hxA?0;$!6Z?9F&Dd!bdOgr=U#ZGsD;f~Gf5pN zHcZauhXs=8ckGaI{sYRt_u6#%SM&3Kl))Pm0094A>e|EJ8Rp^7?ci$70#}8aQnxye!cZ?QZS9 zWRc;W)Bt+N6=!&;CGmhlCR7_=RC}*M@c|I%$*aCVy-^D)U9Jgs3A_hlBJ2;^9p*!b zQet#ip6n)R=uDiO8DnVmd!@I`sNE?7UA#WJQE-`US@$uZlgmD`KZn>-dvavUw`yIU0Kz%i>T_^wO-T9Ge}QiI6F zW$oW{uuy`^`TQvWP%Q%hkp9VmhbuSS1LkGR?QH7>vw?ZR{yP)Lrlzi2NRp5>O9t>W zZm|Y=ZQ@D^@0=9{G9#}iDTU`+*sWlNui0tLa>-jNWan#vV9SAd^QI#jVjRM*x|e5W zo##n05BK7rkRK24lSLK^&fCzpEO*3r+TKW5+TxY9Fce8cr(3dE%2X9<)Fpa%ADCQQ zr4tXmva)i(_uR3`II0V$=D)+|1>6r-8|Y&X*2Z}0 z`Z%wnhF0SQrI{EFWbe!FvzFwEGGiCHwA5Zoy8#Vb^WB`fi8H=!cDxIZqj%$j{Xm7P zOb#SKTXb-lBdzwAMw(i(}egjdc}%;^Dy6yR$+%aHPw$Y7}_57$9`+$1pnr|LBGy~_(9o-gD$c>JN()H z#grbmup?H5W5NyTBbL_6i!@dLqElbp71@Y~uDk&UBYduq=RysJdXR!>-b{YlpTk>H^6a)hJ|YT^|ch7LW37cC|++GN1ETit-&%f zLf~i6SMn+0YN7QNxiG1iABgvPinz$OxNMdPq(xWdDf|BP-k@~e#&VX2RSPg7aA4PQ zCS)YX>ZkOOPvG$l{p8~(<25qxf9T#Pv#JOZbc}f(lhpF%<}82&yUaBV`Wb<`ou;a+247KvV`&#eri!pUm82(=g`&nD-1OiSTN2< z@#H5Hd!20j(86L-N+~Hv!ZFADIxAj+uLR=bM5gy4#AJ};6opE4JJNE)T@GO7vY$FygHqczYA);f*jiVU$!uhe*mqHLVUd%*RcUlZ z+~QSoahZ7O-zNLW+WmrfmZGa@`x>$06S{X522(uWla(kNbw*@zKlTdhCU(F@(LFk$l0k2PA~LX-wPWh?9ps;Ug3bpj zN0SgRRm<-s$$IVsvymfa3_%$j>LW; z^rG%b&s0LGPteYS;QWuG$MfdfBVg`LMnBL0{J__GP3jF_Ei>hI#Q@W}vaTuRE^8dNj8mAYMab;leA~6CeTCz03Ei0W>a2dh7n5 zwp%wYKi|$_wIb63hsx_-TeykV>K39@#_-Xk1Vormx3W4cZq8CBWtXu7GVTsZClNC{8~TwZhjsf}*N|mTd5xAQ9cvaf5xMF-DOrhV2d)GPgfD+5wfg z%2%jWmyEM&-e($gr&(__V;qUSNS&w8p2?FjM?@h7CT$}F$OO_8Tz34AN-H_Dqz1;ys{i6*h#JDt;8Ma|Liw0NIr8)Gg+S%(*VX6 zRnr2~$0e1}Zq@wcFAM1S!YgBMNIS^m`P5p?bm(bUGf)@(X$&wIiQTr;QP=pgGa}0diZ>Yjzo0OJl|e(}F7MxQe|uX069oX2gh~8y ziT_iF>u=P*J%@i#&(HvX|I~r_8};v@@E_E23;^Ii9TERV{adsDgKB~Ge--@S(SNJm ef1<6i|3B@k1x0)EuYXVe?9YEDg?W;H-Tn)QLdVnq literal 0 HcmV?d00001 diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index b4ad996..392ee49 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -66,6 +66,6 @@ def build(self): self.converter.add_engine_run(crate) self.converter.add_action(crate, self.converter.workflow_run) self.converter.patch_workflow_input_collection(crate) - self.converter.add_inputs_file(crate) + self.converter.add_inputs_files(crate) self.converter.add_output_formats(crate) return crate diff --git a/src/runcrate/converters/__init__.py b/src/runcrate/converters/__init__.py index f13f34b..8d05b1d 100644 --- a/src/runcrate/converters/__init__.py +++ b/src/runcrate/converters/__init__.py @@ -3,6 +3,5 @@ CONVERTERS = { - "base": Converter(), "cwl": CwlConverter(), } diff --git a/src/runcrate/converters/base.py b/src/runcrate/converters/base.py index d513229..de72f97 100644 --- a/src/runcrate/converters/base.py +++ b/src/runcrate/converters/base.py @@ -1,9 +1,11 @@ +from abc import ABC, abstractmethod + from rocrate.model.contextentity import ContextEntity from ..constants import PROFILES_BASE, PROFILES_VERSION, WROC_PROFILE_VERSION -class Converter: +class Converter(ABC): def __init__(self): self.root = None self.workflow_name = None @@ -63,67 +65,77 @@ def add_profiles(self, crate): return + @abstractmethod def add_workflow(self, crate): """ Add the workflow to the crate. """ - raise NotImplementedError("add_workflow") + pass + @abstractmethod def add_engine_run(self, crate): """ Add the engine run to the crate. """ - raise NotImplementedError("add_engine_run") + pass + @abstractmethod def add_action(self, crate, workflow_run): """ Add the action to the crate. """ - raise NotImplementedError("add_action") + pass + @abstractmethod def patch_workflow_input_collection(self, crate): """ Patch the workflow input collection. """ - raise NotImplementedError("patch_workflow_input_collection") + pass + @abstractmethod def add_inputs_files(self, crate): """ Add input files to the crate. """ - raise NotImplementedError("add_inputs_files") + pass + @abstractmethod def add_output_formats(self, crate): """ Add output formats to the crate. """ - raise NotImplementedError("add_output_formats") + pass # -------------------------------------------------------------------------- # Helper functions - called by the top level functions + @abstractmethod def get_workflow(self, wf_path): """ Get the workflow from the given path. Returns a dictionary where tools / workflows are mapped by their ids. """ - raise NotImplementedError("get_workflow") + pass + @abstractmethod def get_step_maps(self, wf_defs): """ Get a mapping of step names to their tool names and positions. """ - raise NotImplementedError("get_step_maps") + pass + @abstractmethod def build_step_graph(self, wf): """ Build a graph of steps in the workflow. """ - raise NotImplementedError("build_step_graph") + pass + @abstractmethod def convert_param(self, prov_param, crate, convert_secondary=True, parent=None): """ Convert a CWLProv parameter to a RO-Crate entity. """ - raise NotImplementedError("convert_param") + pass diff --git a/src/runcrate/converters/cwl.py b/src/runcrate/converters/cwl.py index fea6dd8..0894f21 100644 --- a/src/runcrate/converters/cwl.py +++ b/src/runcrate/converters/cwl.py @@ -186,7 +186,7 @@ def patch_workflow_input_collection(self, crate, wf=None): if "ComputationalWorkflow" in as_list(tool.type): self.patch_workflow_input_collection(crate, wf=tool) - def add_inputs_file(self, crate): + def add_inputs_files(self, crate): path = self.root / "workflow" / INPUTS_FILE_BASENAME if path.is_file(): with open(path) as f: diff --git a/tests/test_converter_base.py b/tests/test_converter_base.py index 511acc1..4ae37a0 100644 --- a/tests/test_converter_base.py +++ b/tests/test_converter_base.py @@ -16,10 +16,41 @@ from runcrate.converters.base import Converter +class TestConverter(Converter): + def add_workflow(self, workflow): + raise NotImplementedError + + def add_engine_run(self, engine_run): + raise NotImplementedError + + def add_action(self, action, step): + raise NotImplementedError + + def patch_workflow_input_collection(self, input_collection): + raise NotImplementedError + + def add_inputs_files(self, inputs_files): + raise NotImplementedError + + def add_output_formats(self, output_formats): + raise NotImplementedError + + def get_workflow(self, workflow): + raise NotImplementedError + + def get_step_maps(self, step_maps): + raise NotImplementedError + + def build_step_graph(self, step_graph): + raise NotImplementedError + + def convert_param(self, param, step): + raise NotImplementedError + @pytest.fixture def converter_instance(): - converter_instance = Converter() + converter_instance = TestConverter() return converter_instance From f49369e8c1c92985755bf9e5b9c2d535b2aa3425 Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Thu, 12 Dec 2024 08:58:18 +0000 Subject: [PATCH 20/23] Make abstract base class not need tests --- src/runcrate/converters/__init__.py | 1 - src/runcrate/converters/base.py | 10 --- tests/test_converter_base.py | 108 ---------------------------- 3 files changed, 119 deletions(-) delete mode 100644 tests/test_converter_base.py diff --git a/src/runcrate/converters/__init__.py b/src/runcrate/converters/__init__.py index 8d05b1d..09a4250 100644 --- a/src/runcrate/converters/__init__.py +++ b/src/runcrate/converters/__init__.py @@ -1,4 +1,3 @@ -from .base import Converter from .cwl import CwlConverter diff --git a/src/runcrate/converters/base.py b/src/runcrate/converters/base.py index de72f97..16af8fa 100644 --- a/src/runcrate/converters/base.py +++ b/src/runcrate/converters/base.py @@ -70,42 +70,36 @@ def add_workflow(self, crate): """ Add the workflow to the crate. """ - pass @abstractmethod def add_engine_run(self, crate): """ Add the engine run to the crate. """ - pass @abstractmethod def add_action(self, crate, workflow_run): """ Add the action to the crate. """ - pass @abstractmethod def patch_workflow_input_collection(self, crate): """ Patch the workflow input collection. """ - pass @abstractmethod def add_inputs_files(self, crate): """ Add input files to the crate. """ - pass @abstractmethod def add_output_formats(self, crate): """ Add output formats to the crate. """ - pass # -------------------------------------------------------------------------- # Helper functions - called by the top level functions @@ -117,25 +111,21 @@ def get_workflow(self, wf_path): Returns a dictionary where tools / workflows are mapped by their ids. """ - pass @abstractmethod def get_step_maps(self, wf_defs): """ Get a mapping of step names to their tool names and positions. """ - pass @abstractmethod def build_step_graph(self, wf): """ Build a graph of steps in the workflow. """ - pass @abstractmethod def convert_param(self, prov_param, crate, convert_secondary=True, parent=None): """ Convert a CWLProv parameter to a RO-Crate entity. """ - pass diff --git a/tests/test_converter_base.py b/tests/test_converter_base.py deleted file mode 100644 index 4ae37a0..0000000 --- a/tests/test_converter_base.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2022-2024 CRS4. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest - -from runcrate.converters.base import Converter - -class TestConverter(Converter): - def add_workflow(self, workflow): - raise NotImplementedError - - def add_engine_run(self, engine_run): - raise NotImplementedError - - def add_action(self, action, step): - raise NotImplementedError - - def patch_workflow_input_collection(self, input_collection): - raise NotImplementedError - - def add_inputs_files(self, inputs_files): - raise NotImplementedError - - def add_output_formats(self, output_formats): - raise NotImplementedError - - def get_workflow(self, workflow): - raise NotImplementedError - - def get_step_maps(self, step_maps): - raise NotImplementedError - - def build_step_graph(self, step_graph): - raise NotImplementedError - - def convert_param(self, param, step): - raise NotImplementedError - - -@pytest.fixture -def converter_instance(): - converter_instance = TestConverter() - return converter_instance - - -def test_initialization(converter_instance): - assert isinstance(converter_instance, Converter) - - -def test_add_workflow(converter_instance): - with pytest.raises(NotImplementedError): - converter_instance.add_workflow(None) - - -def test_add_engine_run(converter_instance): - with pytest.raises(NotImplementedError): - converter_instance.add_engine_run(None) - - -def test_add_action(converter_instance): - with pytest.raises(NotImplementedError): - converter_instance.add_action(None, None) - - -def test_patch_workflow_input_collection(converter_instance): - with pytest.raises(NotImplementedError): - converter_instance.patch_workflow_input_collection(None) - - -def test_add_inputs_files(converter_instance): - with pytest.raises(NotImplementedError): - converter_instance.add_inputs_files(None) - - -def test_add_output_formats(converter_instance): - with pytest.raises(NotImplementedError): - converter_instance.add_output_formats(None) - - -def test_get_workflow(converter_instance): - with pytest.raises(NotImplementedError): - converter_instance.get_workflow(None) - - -def test_get_step_maps(converter_instance): - with pytest.raises(NotImplementedError): - converter_instance.get_step_maps(None) - - -def test_build_step_graph(converter_instance): - with pytest.raises(NotImplementedError): - converter_instance.build_step_graph(None) - - -def test_convert_param(converter_instance): - with pytest.raises(NotImplementedError): - converter_instance.convert_param(None, None) From 64c709cdf3eccb0b7328e7e31a86dc404a30f2a7 Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Thu, 12 Dec 2024 09:07:14 +0000 Subject: [PATCH 21/23] Remove accidentally added zip --- revsort-run-1.crate.zip | Bin 6056 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 revsort-run-1.crate.zip diff --git a/revsort-run-1.crate.zip b/revsort-run-1.crate.zip deleted file mode 100644 index f69f19a678695f121883753cdd0d5ea116e7f8c6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6056 zcma)AWl$W;BQzU474wTj$<7_h>;usKfvO00Y4G9AT^nab+b20RVCY008EnS2q~k z(bk3s?(5WV2-W08XM$ewhk>IRfse?yS_)AN_KSvEhnvl1Ii&Av|^O9-B z`;$AwI6ZkrcK39AO!%z?4a2=WD>ST%u~lCBcx~6;Ws*=f7(D?DgXzABC#+KUJe*-TYPu8Y6t&sW7|zQq?icOuymi|*BaUx) zQqBTIjON?Z)so&B=sL{u8{v=ftSOoHR871JxeO&Vm^)8=KZuDKoQ&+jfG~L!r@W*o zG{a)*p8{$z`(f(H&TB52h<|_AH$u*q`hMFsSx}wm@WgGbpMNTj5o)KyltRcKK8X#^ zV)N5dQQ(8g7$t6HQ;zo585emoD*_wBQ_GDACJ@zenpEcS9F-}U5D zD4kMR(X>Y3mxWD-Hu?Ghkzqv2*7RcIQyRAOrh^0^k05^+QF5s~&Q{bzvN!{H>6d?s z3?>(o)9-l*bSRF_sOolkWU_?s;~ey4F{^#IQ5goB1w?YWKVxkOC6dR{juJLpaubL& zj?NvnQo6QxSeZ~u)YBSQ#e+pmZ<$c}wx^0tN9PQ;93?aKUA&rfI2r%w#;yc9yNl+R zu(O!;!KHQ#o;j`iaTk~SN`sTGN!j8^fMd?5x5he(dqSy&g6Q16>EJQTU?r)JtR4|3 zI_tsnVhu!vb)T~vxj|?Xb@_|-TVc}IFZInnWb03Il`_AuUnOOf(&k%UinLF~eRZEg z(ZCW9(B{6h_>^*OP`^Q1I-z0tXsf~GcnPX&n<{Z19J$(6aXar^wY%8F@a;-Ii{ksP z_e(0|PVJ}}L}nyAN&%l5Tm8VD)e`-y8`n2hdJj)?$?-gF;|J>74`roz$_o&}SGCoH z@Iq1|*FmY_-t7t`_?RGA%EQqR=y7HxQIHp}adNyb3D3dZh99|oty*I(-L!UaigCFw zB1WvWd=%o!U$_Mg_U7*XJla&hqVvyZx<6>!wZ9GXdqvyn7_Y#}&t2HJ(xdhe8Tm=) zZJo};US@d^tdVB1c+OJm-)g6LWYN_P2mlzO0sw4(YDa)y*bXiXv$e4kw1ZefL}7yb zFk!fewFs|>4O|E&470O?@CgV*^mWA(mU;1yy5x`(cD|K4*1=OnX;fGAw)fkR77Kw+qzh0|4MQPfNIf^^l(bELbp=A|rbj%LO=VTlrz8Lw}| zg`NeU<7l>BAQM~hhb2pdJuWO>=%!ILsB=K*+Q{7aMeIoTqgFF z-HBW7H1{Tqp0rGyuFn?`8F^zx6IIVI<&PxVz0mTjTk?4+c@0c$%ozeHHQRkxc!k0z)VG*EchqsYRd0Mm!4!s&V=V zinc23%hXHa2W@S*Z9BFs$&YMQ$U|Y{iy9tlO=}mr;)upBnEcjt3~1<1Ti$*PVqYN1 z!B@9FWQzBMMP%W#<3Rn{Pg*OkMTe^NY;aQ{|emMHKxynox{#&W%;oc z2SZXgHVM!Qu8w-(s@cbA6p08UVLAg)tH173gVdOCS>XR~r(RBFIML{Fje zP3)NrUf<6w|AO#1l9cs;yeo-27$^mOAm4IxKuU5qK z^@h$WF(9cyVK8nCA#zkEv$=eM{_m_!0BI+?{x8;Ai}LdcLWD%%a6usf0RcX12tS;k zUxeRUSePFs1m}ea*jU@}iwNrLw$&{2Vz1njd}{^&4nz2kOqZgGapF2ZO;XS=A-oO= z#!LH#MT|i$t(;HRC>)|SJ1a8b)ioEHjqf__Mo4sW@U0`nH0+}&@MDS#WN6vdaa zzG8wQN0mnY)uSwK!HzO)>zoEOyi1QhQKY4BBDj?-92M2LjoJ zk$4hN8x%s^byVEYH&zxhC|&M9#7HO%iXv9D)tniwM2$fw8SD`(j)!xl#ZKRW5<#cq zLw4+%6~U2y5{#HpbmMKzGrEjRGt0GG+6G@D$l9LGJQK6UHJ!z3_u1rxXH_N2fUNh%>ESJRBJrD7?zVhg<|9pJQ{=R4~sQ=i|VWGfH7YAHUI zB{h_XA;sw5>o3p~c1}!kxymXTflh7S(6R+7Bnll> z0tpXn;AK=iaGtWvGjh*)@l>a?v`GwAW9w#>>&8+5{|!QIH+>iO<0&i*YhLJt3!c4N z+o4sSnU-~b7iH)asJLW%^A~AR3>u@{{}*XRh3#zltc7^31%*WT;kMRrVH*)40bx;y zt+fq=-&Vku4-|g2A{OUc_AxYC7V^r55CVH2g}FB_ECgbzUsXSln z6+5)^!@!6Tda~+)wW+NB0_zh?W_V#SR!)+bQvhe15BL;Z=Tc#Zu7@dfvZ`I)?c~{BUo4tZtFK83kxrAY<6W4G(Uh z0rb8OJIHf!)T3CO$w4=w3}$t}Y`yxGvEC{p=mac+Ajoy<9_5<3KtWTr2=+&^&wDu4 z$GV^$(No=!X#!~}v2v)2%l$TCM_}2s7*~i`fh;AJfgPGVVhvS`gKDyU%p`;NiMLQQ zo9241S^0=~R%);BwW}<;TI6lN-&jZ$w(w8lJP#LB;jLkFF7Wd~MPBmoXf!X?XAz1+ z@dk%|#&WlBWWRjG9Wm#{9?v{NF|K{edpGTLfP@-tP?vjh(qFNgyX~v=@R#zg(if6= z?c)2Ff?MOeZ1HFKMdeXbZ#?phG1Km7@2nDYxZWHNcfG$1IDVk=OS|rW+t8fMf zR#C3g8Rao5Rgz+EK5bZXsDL!XsK@HoU-ddPTQxAifY5PR>4*{Cvo(W~p#T~w>-`_k zdm|>vp+``9|`Dx*T}{g{5pSgJ?M2y6X;YR8K7b zq{}R?)g2UZFH3f3Nb%wWR%C=zZM1hxA?0;$!6Z?9F&Dd!bdOgr=U#ZGsD;f~Gf5pN zHcZauhXs=8ckGaI{sYRt_u6#%SM&3Kl))Pm0094A>e|EJ8Rp^7?ci$70#}8aQnxye!cZ?QZS9 zWRc;W)Bt+N6=!&;CGmhlCR7_=RC}*M@c|I%$*aCVy-^D)U9Jgs3A_hlBJ2;^9p*!b zQet#ip6n)R=uDiO8DnVmd!@I`sNE?7UA#WJQE-`US@$uZlgmD`KZn>-dvavUw`yIU0Kz%i>T_^wO-T9Ge}QiI6F zW$oW{uuy`^`TQvWP%Q%hkp9VmhbuSS1LkGR?QH7>vw?ZR{yP)Lrlzi2NRp5>O9t>W zZm|Y=ZQ@D^@0=9{G9#}iDTU`+*sWlNui0tLa>-jNWan#vV9SAd^QI#jVjRM*x|e5W zo##n05BK7rkRK24lSLK^&fCzpEO*3r+TKW5+TxY9Fce8cr(3dE%2X9<)Fpa%ADCQQ zr4tXmva)i(_uR3`II0V$=D)+|1>6r-8|Y&X*2Z}0 z`Z%wnhF0SQrI{EFWbe!FvzFwEGGiCHwA5Zoy8#Vb^WB`fi8H=!cDxIZqj%$j{Xm7P zOb#SKTXb-lBdzwAMw(i(}egjdc}%;^Dy6yR$+%aHPw$Y7}_57$9`+$1pnr|LBGy~_(9o-gD$c>JN()H z#grbmup?H5W5NyTBbL_6i!@dLqElbp71@Y~uDk&UBYduq=RysJdXR!>-b{YlpTk>H^6a)hJ|YT^|ch7LW37cC|++GN1ETit-&%f zLf~i6SMn+0YN7QNxiG1iABgvPinz$OxNMdPq(xWdDf|BP-k@~e#&VX2RSPg7aA4PQ zCS)YX>ZkOOPvG$l{p8~(<25qxf9T#Pv#JOZbc}f(lhpF%<}82&yUaBV`Wb<`ou;a+247KvV`&#eri!pUm82(=g`&nD-1OiSTN2< z@#H5Hd!20j(86L-N+~Hv!ZFADIxAj+uLR=bM5gy4#AJ};6opE4JJNE)T@GO7vY$FygHqczYA);f*jiVU$!uhe*mqHLVUd%*RcUlZ z+~QSoahZ7O-zNLW+WmrfmZGa@`x>$06S{X522(uWla(kNbw*@zKlTdhCU(F@(LFk$l0k2PA~LX-wPWh?9ps;Ug3bpj zN0SgRRm<-s$$IVsvymfa3_%$j>LW; z^rG%b&s0LGPteYS;QWuG$MfdfBVg`LMnBL0{J__GP3jF_Ei>hI#Q@W}vaTuRE^8dNj8mAYMab;leA~6CeTCz03Ei0W>a2dh7n5 zwp%wYKi|$_wIb63hsx_-TeykV>K39@#_-Xk1Vormx3W4cZq8CBWtXu7GVTsZClNC{8~TwZhjsf}*N|mTd5xAQ9cvaf5xMF-DOrhV2d)GPgfD+5wfg z%2%jWmyEM&-e($gr&(__V;qUSNS&w8p2?FjM?@h7CT$}F$OO_8Tz34AN-H_Dqz1;ys{i6*h#JDt;8Ma|Liw0NIr8)Gg+S%(*VX6 zRnr2~$0e1}Zq@wcFAM1S!YgBMNIS^m`P5p?bm(bUGf)@(X$&wIiQTr;QP=pgGa}0diZ>Yjzo0OJl|e(}F7MxQe|uX069oX2gh~8y ziT_iF>u=P*J%@i#&(HvX|I~r_8};v@@E_E23;^Ii9TERV{adsDgKB~Ge--@S(SNJm ef1<6i|3B@k1x0)EuYXVe?9YEDg?W;H-Tn)QLdVnq From 47482b93ebdfd011a0ee824d46530f7fac1e2d6c Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Thu, 12 Dec 2024 09:50:53 +0000 Subject: [PATCH 22/23] Update docstring --- src/runcrate/cli.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/runcrate/cli.py b/src/runcrate/cli.py index 1bea861..589e5f7 100644 --- a/src/runcrate/cli.py +++ b/src/runcrate/cli.py @@ -66,9 +66,12 @@ def cli(): ) def convert(root, converter, output, license, workflow_name, readme): """\ - Convert a CWLProv RO bundle into a Workflow Run RO-Crate. + Convert a provenance bundle into a Workflow Run RO-Crate. - RO_DIR: top-level directory of the CWLProv RO + Supported (see: converters): + - CWLProv RO. + + RO_DIR: top-level directory of the provenance bundle. """ if not output: From 5985eef0110839ce6566ad1bfd10c2ebb39be9ff Mon Sep 17 00:00:00 2001 From: Oliver Woolland Date: Fri, 13 Dec 2024 07:57:24 +0000 Subject: [PATCH 23/23] Refactor converter initialisation to be lazy and bespoke --- src/runcrate/convert.py | 30 ++++++------------------------ src/runcrate/converters/base.py | 17 ++++++++++++++--- src/runcrate/converters/cwl.py | 22 ++++++++++++++++++++++ 3 files changed, 42 insertions(+), 27 deletions(-) diff --git a/src/runcrate/convert.py b/src/runcrate/convert.py index 392ee49..908ef33 100644 --- a/src/runcrate/convert.py +++ b/src/runcrate/convert.py @@ -18,20 +18,12 @@ Generate a Workflow Run RO-Crate from a CWLProv RO bundle. """ -from pathlib import Path - -from bdbag.bdbagit import BDBag -from cwlprov.prov import Provenance -from cwlprov.ro import ResearchObject from rocrate.rocrate import ROCrate from .constants import TERMS_NAMESPACE from .converters import CONVERTERS -MANIFEST_FILE = "manifest-sha1.txt" - - class ProvCrateBuilder: def __init__(self, root, @@ -40,22 +32,12 @@ def __init__(self, license=None, readme=None): self.converter = converter - self.converter.root = Path(root) - self.converter.workflow_name = workflow_name - self.converter.license = license - self.converter.readme = Path(readme) if readme else readme - self.converter.wf_path = self.converter.root / "workflow" / self.converter.WORKFLOW_BASENAME - self.converter.workflow_definition = self.converter.get_workflow() - self.converter.step_maps = self.converter.get_step_maps() - self.converter.ro = ResearchObject(BDBag(str(root))) - self.converter.with_prov = set(str(_) for _ in self.converter.ro.resources_with_provenance()) - self.converter.workflow_run = Provenance(self.converter.ro).activity() - self.converter.roc_engine_run = None - self.converter.control_actions = {} - self.converter.collection = {} - self.converter.hashes = {} - self.converter.file_map = {} - self.converter.manifest = self.converter.get_manifest(self.converter.root, MANIFEST_FILE) + self.converter.populate( + root, + workflow_name=workflow_name, + license=license, + readme=readme + ) def build(self): crate = ROCrate(gen_preview=False) diff --git a/src/runcrate/converters/base.py b/src/runcrate/converters/base.py index 16af8fa..fa6d54b 100644 --- a/src/runcrate/converters/base.py +++ b/src/runcrate/converters/base.py @@ -65,6 +65,12 @@ def add_profiles(self, crate): return + @abstractmethod + def populate(self, root, workflow_name=None, license=None, readme=None): + """ + Populate the converter with the given root directory and optional metadata. + """ + @abstractmethod def add_workflow(self, crate): """ @@ -105,11 +111,16 @@ def add_output_formats(self, crate): # Helper functions - called by the top level functions @abstractmethod - def get_workflow(self, wf_path): + def get_workflow(self): """ - Get the workflow from the given path. + Should return a dictionary describing the workflow + Fetched from e.g. a file at self.wf_path - Returns a dictionary where tools / workflows are mapped by their ids. + The definition should contain: + - name: the workflow name + - inputs: a list of inputs + - outputs: a list of outputs + - steps: a list of steps """ @abstractmethod diff --git a/src/runcrate/converters/cwl.py b/src/runcrate/converters/cwl.py index 0894f21..3f33494 100644 --- a/src/runcrate/converters/cwl.py +++ b/src/runcrate/converters/cwl.py @@ -6,8 +6,10 @@ import networkx as nx import prov.model +from bdbag.bdbagit import BDBag from cwl_utils.parser import load_document_by_yaml from cwlprov.prov import Entity, Provenance +from cwlprov.ro import ResearchObject from cwlprov.utils import first from rocrate.model.contextentity import ContextEntity from rocrate.model.softwareapplication import SoftwareApplication @@ -17,6 +19,8 @@ from .base import Converter +MANIFEST_FILE = "manifest-sha1.txt" + CWLPROV_NONE = "https://w3id.org/cwl/prov#None" CWL_TYPE_MAP = { @@ -42,6 +46,24 @@ class CwlConverter(Converter): WORKFLOW_BASENAME = "packed.cwl" + def populate(self, root, workflow_name=None, license=None, readme=None): + self.root = Path(root) + self.workflow_name = workflow_name + self.license = license + self.readme = Path(readme) if readme else readme + self.wf_path = self.root / "workflow" / self.WORKFLOW_BASENAME + self.workflow_definition = self.get_workflow() + self.step_maps = self.get_step_maps() + self.ro = ResearchObject(BDBag(str(root))) + self.with_prov = set(str(_) for _ in self.ro.resources_with_provenance()) + self.workflow_run = Provenance(self.ro).activity() + self.roc_engine_run = None + self.control_actions = {} + self.collection = {} + self.hashes = {} + self.file_map = {} + self.manifest = self.get_manifest(self.root, MANIFEST_FILE) + # -------------------------------------------------------------------------- # Top level methods, called by build()