From 5b723d2d0d4c0b6c64fb5a1abc312bba2498b808 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 11:19:54 +0200 Subject: [PATCH 01/71] wrote signature of load hierarchical data --- notebooks/hierarchical_data_model.ipynb | 27 +++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index 31caaf3..e3b4b59 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -226,6 +226,33 @@ } ], "execution_count": 5 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-16T09:17:15.597615Z", + "start_time": "2024-10-16T09:17:15.576646Z" + } + }, + "cell_type": "code", + "source": [ + "from io import IOBase\n", + "from pathlib import Path\n", + "from typing import Union, List, Literal\n", + "\n", + "\n", + "def load_hierarchical_data(\n", + " file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]], \n", + " data_model: DataModel, \n", + " file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None\n", + "):\n", + " data_reader = DataReader(file, file_extension=file_extension)\n", + " xml_dict = data_reader.data\n", + " print(xml_dict)" + ], + "id": "affc9ecd939c903f", + "outputs": [], + "execution_count": 1 } ], "metadata": { From e66e32fd127b1cdda8662df93e66765d66235449 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 11:41:06 +0200 Subject: [PATCH 02/71] added test case for reading list of jsons --- tests/utils/io/test_data_reader.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/utils/io/test_data_reader.py b/tests/utils/io/test_data_reader.py index e1724b0..aae50e2 100644 --- a/tests/utils/io/test_data_reader.py +++ b/tests/utils/io/test_data_reader.py @@ -153,4 +153,29 @@ def test_reader_csv(inp, expected): ) def test_reader_json(inp, expected): data_reader = DataReader(StringIO(inp), file_extension="json") - assert data_reader.data == expected \ No newline at end of file + assert data_reader.data == expected + + +@pytest.mark.parametrize( + "inp, expected, file_extension", + [ + ( + [ + '{"pat_id": "patient_426387", "name":"Joe Johnson", "condition": {"term_id": "1253", "term_label": "acute_madeupgitis"}, "hospitalized": true}', + '{"pat_id": "patient_426388", "name":"Jane Doe", "condition": {"term_id": "1254", "term_label": "chronic_madeupgitis"}, "hospitalized": false}', + '{"pat_id": "patient_426389", "name":"Mark Markington", "condition": {"term_id": "1255", "term_label": "wild_type_madeupgitis"}, "hospitalized": true}' + ], + [ + {'pat_id': 'patient_426387', 'name':'Joe Johnson', 'condition': {'term_id': '1253', 'term_label': 'acute_madeupgitis'}, 'hospitalized': True}, + {'pat_id': 'patient_426388', 'name':'Jane Doe', 'condition': {'term_id': '1254', 'term_label': 'chronic_madeupgitis'}, 'hospitalized': False}, + {'pat_id': 'patient_426389', 'name':'Mark Markington', 'condition': {'term_id': '1255', 'term_label': 'wild_type_madeupgitis'}, 'hospitalized': True} + ], + 'json' + ), + ] +) +def test_reader_list(inp, expected, file_extension): + buffers = [StringIO(f) for f in inp] + data = DataReader(buffers, file_extension=file_extension).data + for d, e in zip(data, expected): + assert d == e \ No newline at end of file From ccba2248c511e8442202fd701ec1f29427602fe6 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 11:41:18 +0200 Subject: [PATCH 03/71] implemented reading lists --- src/phenopacket_mapper/utils/io/data_reader.py | 7 +++++-- 
1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/phenopacket_mapper/utils/io/data_reader.py b/src/phenopacket_mapper/utils/io/data_reader.py index 6989281..ddee3a4 100644 --- a/src/phenopacket_mapper/utils/io/data_reader.py +++ b/src/phenopacket_mapper/utils/io/data_reader.py @@ -22,7 +22,6 @@ def __init__( :param file_extension: The file extension of the file to read. If `None`, the file extension is inferred from the file path. Default is `None`. """ - # TODO: fix read xml # TODO: add option to pass a list of files to read self.is_dir = False self.file_extension = None @@ -59,10 +58,14 @@ def __init__( raise ValueError("File extension must be provided when passing a file buffer.") else: self.handle_file_extension(file_extension) + elif isinstance(file, list): + self.data = [DataReader(f, encoding=encoding, file_extension=file_extension).data for f in file] + self.iterable = self.data else: raise ValueError(f"Invalid input type {type(file)}.") - self.data, self.iterable = self._read() + if not isinstance(file, list): + self.data, self.iterable = self._read() def handle_file_extension(self, fe: str): if fe.lower() in ['csv', 'xlsx', 'json', 'xml']: From e41b76d124adf9bb0afb57f09898069363421010 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 11:55:59 +0200 Subject: [PATCH 04/71] wrote test case for reading list of xmls --- tests/utils/io/test_data_reader.py | 87 ++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/tests/utils/io/test_data_reader.py b/tests/utils/io/test_data_reader.py index aae50e2..21d4e5f 100644 --- a/tests/utils/io/test_data_reader.py +++ b/tests/utils/io/test_data_reader.py @@ -172,6 +172,93 @@ def test_reader_json(inp, expected): ], 'json' ), ( [ '<?xml version="1.0" encoding="UTF-8" ?> ' '<ODM>' '<ClinicalData StudyOID="test_study" MetaDataVersionOID="Metadata.test_study_2024-10-16_1142">' '<SubjectData SubjectKey="101" redcap:RecordIdField="record_id">' '<Item id="patient_name">Joe Johnson</Item>' '<Section id="condition">' '<Item id="term_id">12353</Item>' '<Item id="term_label">acute_madeupgitis</Item>' '</Section>' '<Item id="hospitalized">true</Item>' '</SubjectData>' '</ClinicalData>' '</ODM>', '<?xml version="1.0" encoding="UTF-8" ?> ' '<ODM>' '<ClinicalData StudyOID="test_study" MetaDataVersionOID="Metadata.test_study_2024-10-16_1142">' '<SubjectData SubjectKey="102" redcap:RecordIdField="record_id">' '<Item id="patient_name">Jane Doe</Item>' '<Section id="condition">' '<Item id="term_id">12354</Item>' '<Item id="term_label">chronic_madeupgitis</Item>' '</Section>' '<Item id="hospitalized">false</Item>' '</SubjectData>' '</ClinicalData>' '</ODM>', '<?xml version="1.0" encoding="UTF-8" ?> ' '<ODM>' '<ClinicalData StudyOID="test_study" MetaDataVersionOID="Metadata.test_study_2024-10-16_1142">' '<SubjectData SubjectKey="103" redcap:RecordIdField="record_id">' '<Item id="patient_name">Mark Markington</Item>' '<Section id="condition">' '<Item id="term_id">12355</Item>' '<Item id="term_label">wild_type_madeupgitis</Item>' '</Section>' '<Item id="hospitalized">true</Item>' '</SubjectData>' '</ClinicalData>' '</ODM>
', + ], + [ + {'ODM': {'ClinicalData': {'MetaDataVersionOID': 'Metadata.test_study_2024-10-16_1142', + 'StudyOID': 'test_study', + 'SubjectData': {'Item': [{'#text': 'Joe Johnson', + 'id': 'patient_name'}, + {'#text': True, + 'id': 'hospitalized'}], + 'Section': {'Item': [{'#text': 12353, + 'id': 'term_id'}, + {'#text': 'acute_madeupgitis', + 'id': 'term_label'}], + 'id': 'condition'}, + 'SubjectKey': 101, + 'redcap:RecordIdField': 'record_id'}}}}, + {'ODM': {'ClinicalData': {'MetaDataVersionOID': 'Metadata.test_study_2024-10-16_1142', + 'StudyOID': 'test_study', + 'SubjectData': {'Item': [{'#text': 'Jane Doe', + 'id': 'patient_name'}, + {'#text': False, + 'id': 'hospitalized'}], + 'Section': {'Item': [{'#text': 12354, + 'id': 'term_id'}, + {'#text': 'chronic_madeupgitis', + 'id': 'term_label'}], + 'id': 'condition'}, + 'SubjectKey': 102, + 'redcap:RecordIdField': 'record_id'}}}}, + {'ODM': {'ClinicalData': {'MetaDataVersionOID': 'Metadata.test_study_2024-10-16_1142', + 'StudyOID': 'test_study', + 'SubjectData': {'Item': [{'#text': 'Mark Markington', + 'id': 'patient_name'}, + {'#text': True, + 'id': 'hospitalized'}], + 'Section': {'Item': [{'#text': 12355, + 'id': 'term_id'}, + {'#text': 'wild_type_madeupgitis', + 'id': 'term_label'}], + 'id': 'condition'}, + 'SubjectKey': 103, + 'redcap:RecordIdField': 'record_id'}}}} + ], + 'xml' + ), ] ) def test_reader_list(inp, expected, file_extension): From 26ef833e17311ec0c03b8cfc0c36943cd21c0dc7 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 12:03:56 +0200 Subject: [PATCH 05/71] updated tests to also test upper and lowercase file extensions --- tests/utils/io/test_data_reader.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/utils/io/test_data_reader.py b/tests/utils/io/test_data_reader.py index 21d4e5f..cfcd602 100644 --- a/tests/utils/io/test_data_reader.py +++ b/tests/utils/io/test_data_reader.py @@ -262,7 +262,8 @@ def test_reader_json(inp, expected): ] ) def test_reader_list(inp, expected, file_extension): - buffers = [StringIO(f) for f in inp] - data = DataReader(buffers, file_extension=file_extension).data - for d, e in zip(data, expected): - assert d == e \ No newline at end of file + for fe in [file_extension, file_extension.lower(), file_extension.upper()]: + buffers = [StringIO(f) for f in inp] + data = DataReader(buffers, file_extension=fe).data + for d, e in zip(data, expected): + assert d == e \ No newline at end of file From 2dd5235df78174b3e5848274fd13d6d43aa381c8 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 12:04:07 +0200 Subject: [PATCH 06/71] removed prints from read xml --- src/phenopacket_mapper/utils/io/read_xml.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/phenopacket_mapper/utils/io/read_xml.py b/src/phenopacket_mapper/utils/io/read_xml.py index 6ed3472..1228d5d 100644 --- a/src/phenopacket_mapper/utils/io/read_xml.py +++ b/src/phenopacket_mapper/utils/io/read_xml.py @@ -33,7 +33,6 @@ def parse_primitive_value(value: str): return value for k, v in dict_.items(): - print(f"{k=}, {type(k)=}, {v=}, {type(v)=}") if isinstance(v, dict): if v == {'@xsi:nil': 'true'}: # resolves dict_[k] = None @@ -41,9 +40,7 @@ def parse_primitive_value(value: str): dict_[k] = _post_process_xml_dict(v) elif isinstance(v, list): list_ = [] - print(f"{v=}") for i, item in enumerate(v): - print(f"{item=}, {type(item)=}") if isinstance(item, dict): list_.append(_post_process_xml_dict(item)) else: From fce31707d99b3858283858b14d0ddc6428777a26 Mon Sep 17 00:00:00 2001 
From: frehburg Date: Wed, 16 Oct 2024 12:04:37 +0200 Subject: [PATCH 07/71] added check to datareader to only allow json and xml when reading multiple files from a dir --- src/phenopacket_mapper/utils/io/data_reader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/phenopacket_mapper/utils/io/data_reader.py b/src/phenopacket_mapper/utils/io/data_reader.py index ddee3a4..9189442 100644 --- a/src/phenopacket_mapper/utils/io/data_reader.py +++ b/src/phenopacket_mapper/utils/io/data_reader.py @@ -59,6 +59,8 @@ def __init__( else: self.handle_file_extension(file_extension) elif isinstance(file, list): + if file_extension.lower() not in ['json', 'xml']: + raise ValueError(f"File extension {file_extension} not supported for reading multiple files.") self.data = [DataReader(f, encoding=encoding, file_extension=file_extension).data for f in file] self.iterable = self.data else: From 44c6400e13b2b274d3734723b08dbd279fd94ae8 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 12:19:46 +0200 Subject: [PATCH 08/71] retrieving values from dict --- notebooks/hierarchical_data_model.ipynb | 159 +++++++++++++++--------- 1 file changed, 100 insertions(+), 59 deletions(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index e3b4b59..b03b23a 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -6,23 +6,24 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-10-15T16:49:45.994633Z", - "start_time": "2024-10-15T16:49:45.291536Z" + "end_time": "2024-10-16T10:19:21.513679Z", + "start_time": "2024-10-16T10:19:21.507885Z" } }, "source": [ "from phenopacket_mapper.data_standards import DataField\n", "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup\n", - "from phenopacket_mapper.utils.io import DataReader" + "from phenopacket_mapper.utils.io import DataReader\n", + "from referencing.jsonschema import specification_with" ], "outputs": [], - "execution_count": 1 + "execution_count": 50 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-15T16:49:46.001925Z", - "start_time": "2024-10-15T16:49:45.997639Z" + "end_time": "2024-10-16T10:19:21.577040Z", + "start_time": "2024-10-16T10:19:21.562652Z" } }, "cell_type": "code", @@ -47,6 +48,18 @@ " description=\"status of the interpretation. 
REQUIRED.\",\n", " ),\n", " \n", + " DataSection(\n", + " name=\"example\",\n", + " required=True,\n", + " fields=(\n", + " DataField(\n", + " name=\"a_number\",\n", + " required=True,\n", + " specification=int, \n", + " ),\n", + " )\n", + " ),\n", + " \n", " OrGroup(\n", " name=\"call\",\n", " fields=(\n", @@ -82,13 +95,13 @@ ], "id": "2e979683ae450d9b", "outputs": [], - "execution_count": 2 + "execution_count": 51 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-15T16:49:46.057078Z", - "start_time": "2024-10-15T16:49:46.053873Z" + "end_time": "2024-10-16T10:19:21.593915Z", + "start_time": "2024-10-16T10:19:21.586049Z" } }, "cell_type": "code", @@ -119,6 +132,19 @@ "\t\tspecification: ValueSet(elements=['UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'], name='Interpretation Status Value Set', description='')\n", "\t\tcardinality: 1..n\n", "\t)\n", + "\tDataSection(\n", + "\t\tid: example,\n", + "\t\tname: example,\n", + "\t\trequired: True\n", + "\t\tcardinality: 1..n\n", + "\tDataField(\n", + "\t\tid: a_number,\n", + "\t\tname: a_number,\n", + "\t\trequired: True\n", + "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t\tcardinality: 1..n\n", + "\t)\n", + "\t)\n", "\tOrGroup(\n", "\t\tid: call,\n", "\t\tname: call,\n", @@ -157,13 +183,13 @@ ] } ], - "execution_count": 3 + "execution_count": 52 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-15T16:49:46.123667Z", - "start_time": "2024-10-15T16:49:46.120256Z" + "end_time": "2024-10-16T10:19:21.621639Z", + "start_time": "2024-10-16T10:19:21.617780Z" } }, "cell_type": "code", @@ -174,7 +200,8 @@ " (\n", " ' '\n", " ''\n", - " '\t'\n", + " ''\n", + " '123'\n", " ''\n", " ''\n", " ''\n", @@ -184,75 +211,89 @@ ], "id": "4c78eb05ea58ff6c", "outputs": [], - "execution_count": 4 + "execution_count": 53 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-16T10:19:21.652451Z", + "start_time": "2024-10-16T10:19:21.645367Z" + } + }, + "cell_type": "code", + "source": [ + "from io import IOBase\n", + "from pathlib import Path\n", + "from typing import Union, List, Literal\n", + "\n", + "\n", + "def load_hierarchical_data(\n", + " file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]], \n", + " data_model: DataModel, \n", + " file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None,\n", + " **kwargs,\n", + "): \n", + " def recursive_dict_call(d, keys):\n", + " if len(keys) == 1:\n", + " return d[keys[0]]\n", + " else:\n", + " return recursive_dict_call(d[keys[0]], keys[1:])\n", + " data_reader = DataReader(file, file_extension=file_extension)\n", + " xml_dict = data_reader.data\n", + " for k, v in kwargs.items():\n", + " print(f\"{k=}: {v=}\")\n", + " v_keys = v.split('.')\n", + " v = recursive_dict_call(xml_dict, v_keys)\n", + " print(f\"retrieved {k=}: {v=}\")" + ], + "id": "affc9ecd939c903f", + "outputs": [], + "execution_count": 54 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-15T16:49:46.146860Z", - "start_time": "2024-10-15T16:49:46.143098Z" + "end_time": "2024-10-16T10:19:21.680706Z", + "start_time": "2024-10-16T10:19:21.676105Z" } }, "cell_type": "code", - "source": "dr = DataReader(buffer, file_extension=\"xml\")", - "id": "a9f83d6e46715301", + "source": [ + "data_model_instance = load_hierarchical_data(\n", + " buffer, \n", + " genomic_interpretation, \n", + " file_extension=\"xml\",\n", + " subject_or_biosample_id=\"ODM.ClinicalData.SubjectData.SubjectKey\",\n", + " example__a_number=\"ODM.ClinicalData.SubjectData.ANumber\",\n", + 
")" + ], + "id": "53937efded7f589f", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "dict_={'ODM': {'@xmlns': 'http://www.cdisc.org/ns/odm/v1.3', '@xmlns:ds': 'http://www.w3.org/2000/09/xmldsig#', '@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance', '@xmlns:redcap': 'https://projectredcap.org', '@xsi:schemaLocation': 'http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd', '@ODMVersion': '1.3.1', '@FileOID': '000-00-0000', '@FileType': 'Snapshot', '@Description': 'genAdipositas - ALT Demo', '@AsOfDateTime': '2024-10-14T11:57:18', '@CreationDateTime': '2024-10-14T11:57:18', '@SourceSystem': 'REDCap', '@SourceSystemVersion': '14.6.9', 'ClinicalData': {'@StudyOID': 'Project.GenAdipositasALTDemo', '@MetaDataVersionOID': 'Metadata.GenAdipositasALTDemo_2024-10-14_1157', 'SubjectData': {'@SubjectKey': '101', '@redcap:RecordIdField': 'record_id'}}}}, type(dict_)=\n", - "k='ODM', type(k)=, v={'@xmlns': 'http://www.cdisc.org/ns/odm/v1.3', '@xmlns:ds': 'http://www.w3.org/2000/09/xmldsig#', '@xmlns:xsi': 'http://www.w3.org/2001/XMLSchema-instance', '@xmlns:redcap': 'https://projectredcap.org', '@xsi:schemaLocation': 'http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd', '@ODMVersion': '1.3.1', '@FileOID': '000-00-0000', '@FileType': 'Snapshot', '@Description': 'genAdipositas - ALT Demo', '@AsOfDateTime': '2024-10-14T11:57:18', '@CreationDateTime': '2024-10-14T11:57:18', '@SourceSystem': 'REDCap', '@SourceSystemVersion': '14.6.9', 'ClinicalData': {'@StudyOID': 'Project.GenAdipositasALTDemo', '@MetaDataVersionOID': 'Metadata.GenAdipositasALTDemo_2024-10-14_1157', 'SubjectData': {'@SubjectKey': '101', '@redcap:RecordIdField': 'record_id'}}}, type(v)=\n", - "k='@xmlns', type(k)=, v='http://www.cdisc.org/ns/odm/v1.3', type(v)=\n", - "k='@xmlns:ds', type(k)=, v='http://www.w3.org/2000/09/xmldsig#', type(v)=\n", - "k='@xmlns:xsi', type(k)=, v='http://www.w3.org/2001/XMLSchema-instance', type(v)=\n", - "k='@xmlns:redcap', type(k)=, v='https://projectredcap.org', type(v)=\n", - "k='@xsi:schemaLocation', type(k)=, v='http://www.cdisc.org/ns/odm/v1.3 schema/odm/ODM1-3-1.xsd', type(v)=\n", - "k='@ODMVersion', type(k)=, v='1.3.1', type(v)=\n", - "k='@FileOID', type(k)=, v='000-00-0000', type(v)=\n", - "k='@FileType', type(k)=, v='Snapshot', type(v)=\n", - "k='@Description', type(k)=, v='genAdipositas - ALT Demo', type(v)=\n", - "k='@AsOfDateTime', type(k)=, v='2024-10-14T11:57:18', type(v)=\n", - "k='@CreationDateTime', type(k)=, v='2024-10-14T11:57:18', type(v)=\n", - "k='@SourceSystem', type(k)=, v='REDCap', type(v)=\n", - "k='@SourceSystemVersion', type(k)=, v='14.6.9', type(v)=\n", - "k='ClinicalData', type(k)=, v={'@StudyOID': 'Project.GenAdipositasALTDemo', '@MetaDataVersionOID': 'Metadata.GenAdipositasALTDemo_2024-10-14_1157', 'SubjectData': {'@SubjectKey': '101', '@redcap:RecordIdField': 'record_id'}}, type(v)=\n", - "k='@StudyOID', type(k)=, v='Project.GenAdipositasALTDemo', type(v)=\n", - "k='@MetaDataVersionOID', type(k)=, v='Metadata.GenAdipositasALTDemo_2024-10-14_1157', type(v)=\n", - "k='SubjectData', type(k)=, v={'@SubjectKey': '101', '@redcap:RecordIdField': 'record_id'}, type(v)=\n", - "k='@SubjectKey', type(k)=, v='101', type(v)=\n", - "k='@redcap:RecordIdField', type(k)=, v='record_id', type(v)=\n" + "k='subject_or_biosample_id': v='ODM.ClinicalData.SubjectData.SubjectKey'\n", + "retrieved k='subject_or_biosample_id': v=101\n", + "k='example__a_number': v='ODM.ClinicalData.SubjectData.ANumber'\n", + "retrieved k='example__a_number': v=123\n" ] } ], - 
"execution_count": 5 + "execution_count": 55 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-16T09:17:15.597615Z", - "start_time": "2024-10-16T09:17:15.576646Z" + "end_time": "2024-10-16T10:19:21.819369Z", + "start_time": "2024-10-16T10:19:21.815579Z" } }, "cell_type": "code", - "source": [ - "from io import IOBase\n", - "from pathlib import Path\n", - "from typing import Union, List, Literal\n", - "\n", - "\n", - "def load_hierarchical_data(\n", - " file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]], \n", - " data_model: DataModel, \n", - " file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None\n", - "):\n", - " data_reader = DataReader(file, file_extension=file_extension)\n", - " xml_dict = data_reader.data\n", - " print(xml_dict)" - ], - "id": "affc9ecd939c903f", + "source": "", + "id": "edbf8ad0a0a55290", "outputs": [], - "execution_count": 1 + "execution_count": null } ], "metadata": { From c9d8b2ed6a050853a972a26da97dbc232a41265c Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 12:24:14 +0200 Subject: [PATCH 09/71] implemented default value in recursive dict get --- notebooks/hierarchical_data_model.ipynb | 60 +++++++++++++------------ 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index b03b23a..29cac9f 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -6,24 +6,25 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-10-16T10:19:21.513679Z", - "start_time": "2024-10-16T10:19:21.507885Z" + "end_time": "2024-10-16T10:23:42.867547Z", + "start_time": "2024-10-16T10:23:42.862494Z" } }, "source": [ + "from numpy.f2py.auxfuncs import isintent_dict\n", "from phenopacket_mapper.data_standards import DataField\n", "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup\n", "from phenopacket_mapper.utils.io import DataReader\n", "from referencing.jsonschema import specification_with" ], "outputs": [], - "execution_count": 50 + "execution_count": 69 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-16T10:19:21.577040Z", - "start_time": "2024-10-16T10:19:21.562652Z" + "end_time": "2024-10-16T10:23:42.912602Z", + "start_time": "2024-10-16T10:23:42.902709Z" } }, "cell_type": "code", @@ -95,13 +96,13 @@ ], "id": "2e979683ae450d9b", "outputs": [], - "execution_count": 51 + "execution_count": 70 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-16T10:19:21.593915Z", - "start_time": "2024-10-16T10:19:21.586049Z" + "end_time": "2024-10-16T10:23:42.934883Z", + "start_time": "2024-10-16T10:23:42.928341Z" } }, "cell_type": "code", @@ -183,13 +184,13 @@ ] } ], - "execution_count": 52 + "execution_count": 71 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-16T10:19:21.621639Z", - "start_time": "2024-10-16T10:19:21.617780Z" + "end_time": "2024-10-16T10:23:42.969443Z", + "start_time": "2024-10-16T10:23:42.963812Z" } }, "cell_type": "code", @@ -211,20 +212,20 @@ ], "id": "4c78eb05ea58ff6c", "outputs": [], - "execution_count": 53 + "execution_count": 72 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-16T10:19:21.652451Z", - "start_time": "2024-10-16T10:19:21.645367Z" + "end_time": "2024-10-16T10:23:43.011750Z", + "start_time": "2024-10-16T10:23:43.002714Z" } }, "cell_type": "code", "source": [ "from io import IOBase\n", "from pathlib import Path\n", - "from typing import Union, List, Literal\n", + "from typing import Union, 
List, Literal, Dict\n", "\n", "\n", "def load_hierarchical_data(\n", @@ -233,13 +234,16 @@ " file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None,\n", " **kwargs,\n", "): \n", - " def recursive_dict_call(d, keys):\n", - " if len(keys) == 1:\n", - " return d[keys[0]]\n", + " def recursive_dict_call(d: Dict, keys: List, default=None):\n", + " if not isinstance(d, dict):\n", + " return d\n", + " elif len(keys) == 1:\n", + " return d.get(keys[0], default)\n", " else:\n", - " return recursive_dict_call(d[keys[0]], keys[1:])\n", + " return recursive_dict_call(d.get(keys[0], default), keys[1:])\n", " data_reader = DataReader(file, file_extension=file_extension)\n", " xml_dict = data_reader.data\n", + " \n", " for k, v in kwargs.items():\n", " print(f\"{k=}: {v=}\")\n", " v_keys = v.split('.')\n", @@ -248,13 +252,13 @@ ], "id": "affc9ecd939c903f", "outputs": [], - "execution_count": 54 + "execution_count": 73 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-16T10:19:21.680706Z", - "start_time": "2024-10-16T10:19:21.676105Z" + "end_time": "2024-10-16T10:23:43.041065Z", + "start_time": "2024-10-16T10:23:43.035664Z" } }, "cell_type": "code", @@ -264,7 +268,7 @@ " genomic_interpretation, \n", " file_extension=\"xml\",\n", " subject_or_biosample_id=\"ODM.ClinicalData.SubjectData.SubjectKey\",\n", - " example__a_number=\"ODM.ClinicalData.SubjectData.ANumber\",\n", + " example__a_number=\"ODM.ClinicalData.SubjectData.ANumber1\",\n", ")" ], "id": "53937efded7f589f", @@ -275,18 +279,18 @@ "text": [ "k='subject_or_biosample_id': v='ODM.ClinicalData.SubjectData.SubjectKey'\n", "retrieved k='subject_or_biosample_id': v=101\n", - "k='example__a_number': v='ODM.ClinicalData.SubjectData.ANumber'\n", - "retrieved k='example__a_number': v=123\n" + "k='example__a_number': v='ODM.ClinicalData.SubjectData.ANumber1'\n", + "retrieved k='example__a_number': v=None\n" ] } ], - "execution_count": 55 + "execution_count": 74 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-16T10:19:21.819369Z", - "start_time": "2024-10-16T10:19:21.815579Z" + "end_time": "2024-10-16T10:23:43.183385Z", + "start_time": "2024-10-16T10:23:43.179443Z" } }, "cell_type": "code", From 3e5384dd173ddff0c7d7707d42297cf01569fd59 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 12:25:53 +0200 Subject: [PATCH 10/71] moved datafield value in class --- .../data_standards/data_model.py | 94 +++++++++---------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 325604c..c9a7d36 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -124,53 +124,6 @@ def __str__(self): ret += "\t)" return ret -@dataclass(slots=True) -class DataFieldValue: - """This class defines the value of a `DataField` in a `DataModelInstance` - - Equivalent to a cell value in a table. - - :ivar row_no: The id of the value, i.e. the row number - :ivar field: DataField: The `DataField` to which this value belongs and which defines the value set for the field. - :ivar value: The value of the field. - """ - row_no: Union[str, int] - field: DataField - value: Union[int, float, str, bool, Date, CodeSystem] - - def validate(self) -> bool: - """Validates the data model instance based on data model definition - - This method checks if the instance is valid based on the data model definition. 
It checks if all required fields - are present, if the values are in the value set, etc. - - :return: True if the instance is valid, False otherwise - """ - if self.field.required and self.value is None: # no value - warnings.warn(f"Field {self.field.name} is required but has no value") - return False - elif self.value is not None and self.field.specification: - if Any in self.field.specification: # value set allows any - return True - elif self.value in self.field.specification: # raw value (likely a primitive) is in the value set - return True - else: # check if the value matches one of the types in the value set - for e in self.field.specification: - if isinstance(e, type): - cur_type = e - if cur_type is type(self.value): - return True - elif isinstance(e, CodeSystem): - cs = e - from phenopacket_mapper.data_standards import Coding - if isinstance(self.value, Coding) and self.value.system == cs: - return True - - warnings.warn(f"Value {self.value} of type {type(self.value)} is not in the value set of field " - f"{self.field.name} (row {self.row_no})") - return False - - @dataclass(slots=True, frozen=True) class DataModel: """This class defines a data model for medical data using `DataField` @@ -341,6 +294,53 @@ def load_data_using_data_model( ) +@dataclass(slots=True) +class DataFieldValue: + """This class defines the value of a `DataField` in a `DataModelInstance` + + Equivalent to a cell value in a table. + + :ivar row_no: The id of the value, i.e. the row number + :ivar field: DataField: The `DataField` to which this value belongs and which defines the value set for the field. + :ivar value: The value of the field. + """ + row_no: Union[str, int] + field: DataField + value: Union[int, float, str, bool, Date, CodeSystem] + + def validate(self) -> bool: + """Validates the data model instance based on data model definition + + This method checks if the instance is valid based on the data model definition. It checks if all required fields + are present, if the values are in the value set, etc. + + :return: True if the instance is valid, False otherwise + """ + if self.field.required and self.value is None: # no value + warnings.warn(f"Field {self.field.name} is required but has no value") + return False + elif self.value is not None and self.field.specification: + if Any in self.field.specification: # value set allows any + return True + elif self.value in self.field.specification: # raw value (likely a primitive) is in the value set + return True + else: # check if the value matches one of the types in the value set + for e in self.field.specification: + if isinstance(e, type): + cur_type = e + if cur_type is type(self.value): + return True + elif isinstance(e, CodeSystem): + cs = e + from phenopacket_mapper.data_standards import Coding + if isinstance(self.value, Coding) and self.value.system == cs: + return True + + warnings.warn(f"Value {self.value} of type {type(self.value)} is not in the value set of field " + f"{self.field.name} (row {self.row_no})") + return False + + @dataclass(slots=True) class DataModelInstance: """This class defines an instance of a `DataModel`, i.e. 
a record in a dataset From fd7cd14f833ddfc94cf04a8eb3b0362aedd7d153 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 12:31:27 +0200 Subject: [PATCH 11/71] added datasection instance dataclass --- src/phenopacket_mapper/data_standards/data_model.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index c9a7d36..4195c52 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -341,6 +341,16 @@ def validate(self) -> bool: return False +@dataclass(slots=True) +class DataSectionInstance: + identifier: Union[str, int] = field() + data_section: DataSection = field() + values = Tuple[Union[DataFieldValue, DataSectionInstance]] = field(default_factory=tuple(list())) + + def validate(self): + warnings.warn("The DataSectionInstance validate method has not been implemented yet.") + + @dataclass(slots=True) class DataModelInstance: """This class defines an instance of a `DataModel`, i.e. a record in a dataset From 502209b872f108173d8d33e575a6c44713101ef2 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 12:59:12 +0200 Subject: [PATCH 12/71] wrote test for is_hierarchical of datamodel --- tests/data_standards/test_data_model.py | 31 +++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tests/data_standards/test_data_model.py diff --git a/tests/data_standards/test_data_model.py b/tests/data_standards/test_data_model.py new file mode 100644 index 0000000..a06de1b --- /dev/null +++ b/tests/data_standards/test_data_model.py @@ -0,0 +1,31 @@ +import pytest +from phenopacket_mapper import DataModel + +from phenopacket_mapper.data_standards import DataField + +class TestDataModel: + + @staticmethod + @pytest.mark.parametrize( + "inp, expected", + [ + ( + DataModel( + data_model_name="test", + fields=( + DataField( + name="test_field", + specification=int + ), + DataField( + name="test_field2", + specification=str + ), + ) + ), + False + ), + ] + ) + def test_data_model(inp: DataModel, expected): + assert inp.is_hierarchical == expected \ No newline at end of file From 6dba07fbefa67f66754f8a34ee4e193a85a07d8b Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 13:03:37 +0200 Subject: [PATCH 13/71] wrote more test cases for is hierarchical --- tests/data_standards/test_data_model.py | 58 ++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/tests/data_standards/test_data_model.py b/tests/data_standards/test_data_model.py index a06de1b..b87ec7d 100644 --- a/tests/data_standards/test_data_model.py +++ b/tests/data_standards/test_data_model.py @@ -1,7 +1,8 @@ import pytest from phenopacket_mapper import DataModel -from phenopacket_mapper.data_standards import DataField +from phenopacket_mapper.data_standards import DataField, DataSection, OrGroup + class TestDataModel: @staticmethod @pytest.mark.parametrize( "inp, expected", [ ( DataModel( data_model_name="test", fields=( DataField( name="test_field", specification=int ), DataField( name="test_field2", specification=str ), DataSection( name="test_data_section", fields=( DataField( name="test_field3", specification=bool ), ) ) ) ), True ), ( DataModel( data_model_name="test", fields=( DataField( name="test_field", specification=int ), OrGroup( name="test_or_group", fields=( DataField( name="test_field2",
specification=str + ), + DataSection( + name="test_data_section", + fields=( + DataField( + name="test_field3", + specification=bool + ), + ) + ) + ) + ), + ) + ), + True + ), ] ) def test_data_model(inp: DataModel, expected): From 3c3454f51fd123e2b11eeb660e839d9a825c8ab4 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 13:03:59 +0200 Subject: [PATCH 14/71] added is hierarchical property to data model --- src/phenopacket_mapper/data_standards/data_model.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 4195c52..92cfca4 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -167,6 +167,19 @@ def __str__(self): def __iter__(self): return iter(self.fields) + @property + def is_hierarchical(self) -> bool: + def recursive_is_hierarchical(d: Union[DataField, DataSection, OrGroup]): + if isinstance(d, DataField): + return False + elif isinstance(d, DataSection): + return True + else: # OrGroup + return any([recursive_is_hierarchical(f) for f in d.fields]) + + return any([recursive_is_hierarchical(f) for f in self.fields]) + + def get_field(self, field_id: str, default: Optional = None) -> Optional[DataField]: """Returns a DataField object by its id From 15b0d256d4feded16857704353ba2f43b0372254 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 13:04:18 +0200 Subject: [PATCH 15/71] typos --- src/phenopacket_mapper/data_standards/data_model.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 92cfca4..b5a87e2 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -358,10 +358,11 @@ def validate(self) -> bool: class DataSectionInstance: identifier: Union[str, int] = field() data_section: DataSection = field() - values = Tuple[Union[DataFieldValue, DataSectionInstance]] = field(default_factory=tuple(list())) + values: Tuple[Union[DataFieldValue, 'DataSectionInstance']] = field(default_factory=tuple(list())) - def validate(self): + def validate(self) -> bool: warnings.warn("The DataSectionInstance validate method has not been implemented yet.") + return True @dataclass(slots=True) @@ -379,7 +380,7 @@ class DataModelInstance: """ row_no: Union[int, str] data_model: DataModel - values: List[DataFieldValue] + values: List[Union[DataFieldValue, DataSectionInstance]] compliance: Literal['lenient', 'strict'] = 'lenient' def __post_init__(self): From 0f479c87094d72df376dd04cf4634d3d4f5781f7 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 13:07:58 +0200 Subject: [PATCH 16/71] added getattr to datasection and updated return type for datamodel --- src/phenopacket_mapper/data_standards/data_model.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index b5a87e2..298b29d 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -124,6 +124,12 @@ def __str__(self): ret += "\t)" return ret + def __getattr__(self, var_name: str) -> Union[DataField, OrGroup, DataSection]: + for f in self.fields: + if f.id == var_name: + return f + raise AttributeError(f"'DataSection' object has no 
attribute '{var_name}'") + @dataclass(slots=True, frozen=True) class DataModel: """This class defines a data model for medical data using `DataField` @@ -147,7 +153,7 @@ def __post_init__(self): if len(self.fields) != len(set([f.id for f in self.fields])): raise ValueError("All fields in a DataModel must have unique identifiers") - def __getattr__(self, var_name: str) -> DataField: + def __getattr__(self, var_name: str) -> Union[DataField, OrGroup, DataSection]: for f in self.fields: if f.id == var_name: return f From 717105772e377024ec45e6ed3b144b776023b35a Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 13:09:18 +0200 Subject: [PATCH 17/71] added getattr for orgroup --- src/phenopacket_mapper/data_standards/data_model.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 298b29d..ce5c718 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -570,6 +570,12 @@ def __str__(self): ret += "\t)" return ret + def __getattr__(self, var_name: str) -> Union[DataField, DataSection, OrGroup]: + for f in self.fields: + if f.id == var_name: + return f + raise AttributeError(f"'OrGroup' object has no attribute '{var_name}'") + if __name__ == "__main__": df = DataField(name="Field 1", specification=int) From f6e9857a4dc5c7d6690473569a21703a4ab61cfe Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 13:13:43 +0200 Subject: [PATCH 18/71] split loading into hierarchical and tabular --- .../data_standards/data_model.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index ce5c718..dd0c583 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -229,13 +229,16 @@ def load_data( :return: A list of `DataModelInstance` objects """ # TODO: move the dynamic params to the load method in utils.io - column_names = dict() - for f in self.fields: - column_param = f"{f.id}_column" - if column_param not in kwargs: - raise TypeError(f"load_data() missing 1 required argument: '{column_param}'") - else: - column_names[f.id] = kwargs[column_param] + if self.is_hierarchical: + raise NotImplementedError + else: + column_names = dict() + for f in self.fields: + column_param = f"{f.id}_column" + if column_param not in kwargs: + raise TypeError(f"load_data() missing 1 required argument: '{column_param}'") + else: + column_names[f.id] = kwargs[column_param] from phenopacket_mapper.utils.io import load_data_using_data_model return load_data_using_data_model( From 4af3dc9bedd158e635671a4f37f6123bc9768770 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 13:14:00 +0200 Subject: [PATCH 19/71] renamed path to file --- src/phenopacket_mapper/utils/io/__init__.py | 4 ++-- src/phenopacket_mapper/utils/io/input.py | 15 ++++----------- 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/src/phenopacket_mapper/utils/io/__init__.py b/src/phenopacket_mapper/utils/io/__init__.py index 812ed0b..f85a98f 100644 --- a/src/phenopacket_mapper/utils/io/__init__.py +++ b/src/phenopacket_mapper/utils/io/__init__.py @@ -3,7 +3,7 @@ from .read_json import read_json from .read_xml import read_xml, parse_xml from .data_reader import DataReader -from .input import read_data_model, read_phenopackets, 
read_phenopacket_from_json, load_data_using_data_model +from .input import read_data_model, read_phenopackets, read_phenopacket_from_json, load_tabular_data_using_data_model from .output import write __all__ = [ @@ -13,7 +13,7 @@ 'read_data_model', 'read_phenopackets', 'read_phenopacket_from_json', - 'load_data_using_data_model', + 'load_tabular_data_using_data_model', 'write', ] diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 0a0785e..88cea4c 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -133,8 +133,8 @@ def remove_line_breaks_if_not_none(value): return DataModel(data_model_name=data_model_name, fields=data_fields, resources=resources) -def load_data_using_data_model( - path: Union[str, Path], +def load_tabular_data_using_data_model( + file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]], data_model: DataModel, column_names: Dict[str, str], compliance: Literal['lenient', 'strict'] = 'lenient', @@ -151,7 +151,7 @@ def load_data_using_data_model( load_data_using_data_model("data.csv", data_model, column_names) ``` - :param path: Path to formatted csv or excel file + :param file: :param data_model: DataModel to use for reading the file :param column_names: A dictionary mapping from the id of each field of the `DataField` to the name of a column in the file @@ -159,14 +159,7 @@ def load_data_using_data_model( that are not in the DataModel. If 'strict', the file must have all fields in the DataModel. :return: List of DataModelInstances """ - if isinstance(path, Path): - pass - elif isinstance(path, str): - path = Path(path) - else: - raise ValueError(f'Path must be a string or Path object, not {type(path)}') - - dr = DataReader(path) + dr = DataReader(file) data, data_iterable = dr.data, dr.iterable # TODO: for the moment assume that the data is a pandas DataFrame From 978b8a5c732e02945dfdc52406745db963ce2269 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 13:14:20 +0200 Subject: [PATCH 20/71] renamed path to file --- .../data_standards/data_model.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index dd0c583..8905003 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -240,13 +240,13 @@ def load_data( else: column_names[f.id] = kwargs[column_param] - from phenopacket_mapper.utils.io import load_data_using_data_model - return load_data_using_data_model( - path=path, - data_model=self, - column_names=column_names, - compliance=compliance - ) + from phenopacket_mapper.utils.io import load_tabular_data_using_data_model + return load_tabular_data_using_data_model( + file=path, + data_model=self, + column_names=column_names, + compliance=compliance + ) @staticmethod def from_file( From d4d1cfae4701d48caf51100881179931cad9bd1e Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 13:14:34 +0200 Subject: [PATCH 21/71] removed static methods --- .../data_standards/data_model.py | 67 ------------------- 1 file changed, 67 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 8905003..434eb27 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -248,73 +248,6 @@ def load_data( 
compliance=compliance ) - @staticmethod - def from_file( - data_model_name: str, - resources: List[CodeSystem], - path: Union[str, Path], - file_type: Literal['csv', 'excel', 'unknown'] = 'unknown', - column_names: Dict[str, str] = MappingProxyType({ - DataField.name.__name__: 'data_field_name', - DataField.description.__name__: 'description', - DataField.specification.__name__: 'value_set', - DataField.required.__name__: 'required', - }), - parse_value_sets: bool = False, - remove_line_breaks: bool = False, - parse_ordinals: bool = True, - ) -> 'DataModel': - """Reads a Data Model from a file - - :param data_model_name: Name to be given to the `DataModel` object - :param resources: List of `CodeSystem` objects to be used as resources in the `DataModel` - :param path: Path to Data Model file - :param file_type: Type of file to read, either 'csv' or 'excel' - :param column_names: A dictionary mapping from each field of the `DataField` (key) class to a column of the file - (value). Leaving a value empty (`''`) will leave the field in the `DataModel` definition empty. - :param parse_value_sets: If True, parses the string to a ValueSet object, can later be used to check - validity of the data. Optional, but highly recommended. - :param remove_line_breaks: Whether to remove line breaks from string values - :param parse_ordinals: Whether to extract the ordinal number from the field name. Warning: this can overwrite values - Ordinals could look like: "1.1.", "1.", "I.a.", or "ii.", etc. - """ - from phenopacket_mapper.pipeline import read_data_model - return read_data_model( - data_model_name, - resources, - path, - file_type, - column_names, - parse_value_sets, - remove_line_breaks, - parse_ordinals - ) - - @staticmethod - def load_data_using_data_model( - path: Union[str, Path], - data_model: 'DataModel', - column_names: Dict[str, str], - compliance: Literal['lenient', 'strict'] = 'lenient', - ) -> 'DataSet': - """Loads data from a file using a DataModel definition - - :param path: Path to formatted csv or excel file - :param data_model: DataModel to use for reading the file - :param column_names: A dictionary mapping from the id of each field of the `DataField` to the name of a - column in the file - :param compliance: Compliance level to enforce when reading the file. If 'lenient', the file can have extra fields - that are not in the DataModel. If 'strict', the file must have all fields in the DataModel. 
- :return: List of DataModelInstances - """ - from phenopacket_mapper.pipeline import load_data_using_data_model - return load_data_using_data_model( - path=path, - data_model=data_model, - column_names=column_names, - compliance=compliance - ) - @dataclass(slots=True) class DataFieldValue: From 502209b872f108173d8d33e575a6c44713101ef2 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 14:31:34 +0200 Subject: [PATCH 22/71] fixed --- src/phenopacket_mapper/data_standards/data_model.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 434eb27..7239d63 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -10,7 +10,6 @@ from dataclasses import dataclass, field from pathlib import Path -from types import MappingProxyType from typing import Union, List, Literal, Dict, Optional, Any, Callable, Tuple import warnings @@ -124,7 +123,7 @@ def __str__(self): ret += "\t)" return ret - def __getattr__(self, var_name: str) -> Union[DataField, OrGroup, DataSection]: + def __getattr__(self, var_name: str) -> Union[DataField, 'OrGroup', 'DataSection']: for f in self.fields: if f.id == var_name: return f @@ -153,7 +152,7 @@ def __post_init__(self): if len(self.fields) != len(set([f.id for f in self.fields])): raise ValueError("All fields in a DataModel must have unique identifiers") - def __getattr__(self, var_name: str) -> Union[DataField, OrGroup, DataSection]: + def __getattr__(self, var_name: str) -> Union[DataField, 'OrGroup', DataSection]: for f in self.fields: if f.id == var_name: return f From 36490182a78e5b66bb720df66160699087c2bb04 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 14:31:46 +0200 Subject: [PATCH 23/71] import and removed old --- src/phenopacket_mapper/utils/io/input.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 88cea4c..0e9cd9f 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -1,5 +1,6 @@ import math import os +from io import IOBase from pathlib import Path from types import MappingProxyType from typing import Literal, List, Union, Dict, Tuple @@ -122,11 +123,9 @@ def remove_line_breaks_if_not_none(value): data_fields = data_fields + ( DataField( name=data_field_name, - section=section, specification=value_set, description=description, required=required, - ordinal=ordinal ), ) From 6aa301fd28e77c2daceeed0f3149644a284fd652 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 14:32:09 +0200 Subject: [PATCH 24/71] fix --- src/phenopacket_mapper/data_standards/data_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 7239d63..4759a8c 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -505,7 +505,7 @@ def __str__(self): ret += "\t)" return ret - def __getattr__(self, var_name: str) -> Union[DataField, DataSection, OrGroup]: + def __getattr__(self, var_name: str) -> Union[DataField, DataSection, 'OrGroup']: for f in self.fields: if f.id == var_name: return f From c982917276cdbd5ad285508261af5550d2667747 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 14:32:40 +0200 
Subject: [PATCH 25/71] remove old todo comments --- src/phenopacket_mapper/utils/io/input.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 0e9cd9f..90758c6 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -161,7 +161,6 @@ def load_tabular_data_using_data_model( dr = DataReader(file) data, data_iterable = dr.data, dr.iterable - # TODO: for the moment assume that the data is a pandas DataFrame df = data # check column_names is in the correct format @@ -176,7 +175,7 @@ def load_tabular_data_using_data_model( data_model_instances = [] - for i in range(len(df)): # todo: change to iter also non tabular data + for i in range(len(df)): values = [] for f in data_model.fields: column_name = column_names[f.id] From d5ae849e7cc22889afa6ac12d7e0acde804934d2 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 14:34:35 +0200 Subject: [PATCH 26/71] removed old --- src/phenopacket_mapper/utils/io/input.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 90758c6..5877f38 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -96,15 +96,12 @@ def remove_line_breaks_if_not_none(value): data_fields: Tuple[DataField, ...] = tuple() for i in range(len(df)): data_field_name = loc_default(df, row_index=i, column_name=column_names.get(DataField.name.__name__, '')) - section = loc_default(df, row_index=i, column_name=column_names.get(DataField.section.__name__, '')) value_set = loc_default(df, row_index=i, column_name=column_names.get(DataField.specification.__name__, '')) description = loc_default(df, row_index=i, column_name=column_names.get(DataField.description.__name__, '')) required = bool(loc_default(df, row_index=i, column_name=column_names.get(DataField.required.__name__, ''))) - ordinal = loc_default(df, row_index=i, column_name=column_names.get(DataField.ordinal.__name__, '')) if remove_line_breaks: data_field_name = remove_line_breaks_if_not_none(data_field_name) - section = remove_line_breaks_if_not_none(section) description = remove_line_breaks_if_not_none(description) if parse_ordinals: From 31aae1ee89a03f4e6b4156a24ef42be29e0142fd Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 14:39:37 +0200 Subject: [PATCH 27/71] added test input file --- tests/utils/io/test_input.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/utils/io/test_input.py diff --git a/tests/utils/io/test_input.py b/tests/utils/io/test_input.py new file mode 100644 index 0000000..e69de29 From 279cf646eefc09e985900907df17b215b3aca2f8 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 14:39:49 +0200 Subject: [PATCH 28/71] renamed dr to data_reader --- src/phenopacket_mapper/utils/io/input.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 5877f38..255d7d5 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -155,8 +155,8 @@ def load_tabular_data_using_data_model( that are not in the DataModel. If 'strict', the file must have all fields in the DataModel. 
:return: List of DataModelInstances """ - dr = DataReader(file) - data, data_iterable = dr.data, dr.iterable + data_reader = DataReader(file) + data, data_iterable = data_reader.data, data_reader.iterable df = data From e8e499727885ca7af48b4ab12bc8eb7545a0a2dc Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 15:15:06 +0200 Subject: [PATCH 29/71] created structure of load hierarchical --- notebooks/hierarchical_data_model.ipynb | 163 ++++++++++++++++++------ 1 file changed, 126 insertions(+), 37 deletions(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index 29cac9f..2560189 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -6,25 +6,27 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-10-16T10:23:42.867547Z", - "start_time": "2024-10-16T10:23:42.862494Z" + "end_time": "2024-10-16T12:58:18.383581Z", + "start_time": "2024-10-16T12:58:18.379833Z" } }, "source": [ "from numpy.f2py.auxfuncs import isintent_dict\n", + "\n", + "from build.lib.phenopacket_mapper.data_standards import DataModelInstance\n", "from phenopacket_mapper.data_standards import DataField\n", "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup\n", "from phenopacket_mapper.utils.io import DataReader\n", "from referencing.jsonschema import specification_with" ], "outputs": [], - "execution_count": 69 + "execution_count": 13 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-16T10:23:42.912602Z", - "start_time": "2024-10-16T10:23:42.902709Z" + "end_time": "2024-10-16T12:58:18.413647Z", + "start_time": "2024-10-16T12:58:18.407115Z" } }, "cell_type": "code", @@ -96,13 +98,13 @@ ], "id": "2e979683ae450d9b", "outputs": [], - "execution_count": 70 + "execution_count": 14 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-16T10:23:42.934883Z", - "start_time": "2024-10-16T10:23:42.928341Z" + "end_time": "2024-10-16T12:58:18.429528Z", + "start_time": "2024-10-16T12:58:18.425940Z" } }, "cell_type": "code", @@ -184,13 +186,13 @@ ] } ], - "execution_count": 71 + "execution_count": 15 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-16T10:23:42.969443Z", - "start_time": "2024-10-16T10:23:42.963812Z" + "end_time": "2024-10-16T12:58:18.484302Z", + "start_time": "2024-10-16T12:58:18.481047Z" } }, "cell_type": "code", @@ -212,53 +214,138 @@ ], "id": "4c78eb05ea58ff6c", "outputs": [], - "execution_count": 72 + "execution_count": 16 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-16T10:23:43.011750Z", - "start_time": "2024-10-16T10:23:43.002714Z" + "end_time": "2024-10-16T13:02:26.777660Z", + "start_time": "2024-10-16T13:02:26.769768Z" } }, "cell_type": "code", "source": [ + "import warnings\n", + "from phenopacket_mapper.data_standards import DataFieldValue\n", + "import math\n", "from io import IOBase\n", "from pathlib import Path\n", "from typing import Union, List, Literal, Dict\n", "\n", + "from phenopacket_mapper.utils import parsing\n", + "\n", + "def recursive_dict_call(d: Dict, keys: List, default=None):\n", + " if not isinstance(d, dict):\n", + " return d\n", + " elif len(keys) == 1:\n", + " return d.get(keys[0], default)\n", + " else:\n", + " return recursive_dict_call(d.get(keys[0], default), keys[1:])\n", + " \n", + "def load_hierarchical_data_recursive(\n", + " loaded_data_instance: Dict,\n", + " data_model: Union[DataModel, DataSection, OrGroup, DataField],\n", + " compliance: Literal['lenient', 'strict'] = 'lenient',\n", 
+ " **kwargs,\n", + "):\n", + " \"\"\"Helper method for `load_hierarchical_data`, recurses through hierarchical :class:`DataModel`\n", + " \n", + " `loaded_data_instance` is expected to be a dictionary as returned by `DataReader.data` when reading a single xml or json file \n", + " \n", + " :param loaded_data_instance: data loaded in by :class:`DataReader`\n", + " :param data_model:\n", + " :compliance: Compliance level to enforce when reading the file. If 'lenient', the file can have extra fields\n", + " that are not in the DataModel. If 'strict', the file must have all fields in the DataModel.\n", + " \"\"\"\n", + " if isinstance(data_model, DataModel):\n", + " return (\n", + " load_hierarchical_data_recursive(\n", + " loaded_data_instance,\n", + " f,\n", + " compliance=compliance,\n", + " **kwargs,\n", + " )\n", + " for f in data_model.fields\n", + " )\n", + " elif isinstance(data_model, DataSection):\n", + " # TODO: DataSectionInstance\n", + " pass\n", + " elif isinstance(data_model, OrGroup):\n", + " # TODO: resolve or\n", + " pass\n", + " elif isinstance(data_model, DataField):\n", + " # TODO: assign value\n", + " # # preprocess the data into values\n", + " # for k, v in kwargs.items():\n", + " # print(f\"{k=}: {v=}\")\n", + " # # retrieve value from loaded data instance\n", + " # v_keys = v.split('.')\n", + " # dict_value = recursive_dict_call(data, v_keys)\n", + " # \n", + " # print(f\"{dict_value=}\")\n", + " # if not dict_value or (isinstance(dict_value, float) and math.isnan(dict_value)):\n", + " # continue\n", + " # \n", + " # value_str = str(dict_value)\n", + " # value = parsing.parse_value(value_str=value_str, resources=data_model.resources, compliance=compliance)\n", + " # data_field_value = DataFieldValue(row_no=i, field=f, value=value)\n", + " # \n", + " # kwargs[k] = value\n", + " # \n", + " # print(f\"retrieved {k=}: {value=}\")\n", + " pass\n", + " else:\n", + " err_msg = f\"DataModel {data_model} is not a valid type ({type(data_model)}).\"\n", + " if compliance == 'strict':\n", + " raise ValueError(err_msg)\n", + " elif compliance == 'lenient':\n", + " warnings.warn(err_msg)\n", + " else:\n", + " raise ValueError(f\"Invalid compliance level: {compliance}\")\n", + " \n", "\n", "def load_hierarchical_data(\n", " file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]], \n", " data_model: DataModel, \n", " file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None,\n", + " compliance: Literal['lenient', 'strict'] = 'lenient',\n", " **kwargs,\n", "): \n", - " def recursive_dict_call(d: Dict, keys: List, default=None):\n", - " if not isinstance(d, dict):\n", - " return d\n", - " elif len(keys) == 1:\n", - " return d.get(keys[0], default)\n", - " else:\n", - " return recursive_dict_call(d.get(keys[0], default), keys[1:])\n", + " if not data_model.is_hierarchical:\n", + " warnings.warn(\"This method is only for loading hierarchical data, it may behave unexpectedly for tabular data.\")\n", + " \n", " data_reader = DataReader(file, file_extension=file_extension)\n", - " xml_dict = data_reader.data\n", + " data, data_iterable = data_reader.data, data_reader.iterable\n", " \n", - " for k, v in kwargs.items():\n", - " print(f\"{k=}: {v=}\")\n", - " v_keys = v.split('.')\n", - " v = recursive_dict_call(xml_dict, v_keys)\n", - " print(f\"retrieved {k=}: {v=}\")" + " # assembling data model instances\n", + " data_model_instances = []\n", + " \n", + " for i, data_instance in enumerate(data_iterable):\n", + " # iterate through data model\n", + " 
data_model_instances.append(\n", + " DataModelInstance(\n", + " row_no=i,\n", + " data_model=data_model,\n", + " values=load_hierarchical_data_recursive(\n", + " data_instance,\n", + " data_model,\n", + " compliance=compliance,\n", + " **kwargs,\n", + " ),\n", + " compliance=compliance,\n", + " )\n", + " )\n", + " " ], "id": "affc9ecd939c903f", "outputs": [], - "execution_count": 73 + "execution_count": 19 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-16T10:23:43.041065Z", - "start_time": "2024-10-16T10:23:43.035664Z" + "end_time": "2024-10-16T12:58:18.510293Z", + "start_time": "2024-10-16T12:58:18.507134Z" } }, "cell_type": "code", @@ -268,7 +355,7 @@ " genomic_interpretation, \n", " file_extension=\"xml\",\n", " subject_or_biosample_id=\"ODM.ClinicalData.SubjectData.SubjectKey\",\n", - " example__a_number=\"ODM.ClinicalData.SubjectData.ANumber1\",\n", + " example__a_number=\"ODM.ClinicalData.SubjectData.ANumber\",\n", ")" ], "id": "53937efded7f589f", @@ -278,19 +365,21 @@ "output_type": "stream", "text": [ "k='subject_or_biosample_id': v='ODM.ClinicalData.SubjectData.SubjectKey'\n", - "retrieved k='subject_or_biosample_id': v=101\n", - "k='example__a_number': v='ODM.ClinicalData.SubjectData.ANumber1'\n", - "retrieved k='example__a_number': v=None\n" + "dict_value=101\n", + "retrieved k='subject_or_biosample_id': repr(value)='101'\n", + "k='example__a_number': v='ODM.ClinicalData.SubjectData.ANumber'\n", + "dict_value=123\n", + "retrieved k='example__a_number': repr(value)='123'\n" ] } ], - "execution_count": 74 + "execution_count": 18 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-16T10:23:43.183385Z", - "start_time": "2024-10-16T10:23:43.179443Z" + "end_time": "2024-10-16T12:58:18.563943Z", + "start_time": "2024-10-16T12:58:18.561884Z" } }, "cell_type": "code", From c10d12feb7e43890af4547e0f625cb895e5bdb0f Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 15:21:09 +0200 Subject: [PATCH 30/71] added class desc and tmp code to remove warning --- src/phenopacket_mapper/data_standards/data_model.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 4759a8c..42c899b 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -297,11 +297,17 @@ def validate(self) -> bool: @dataclass(slots=True) class DataSectionInstance: + """ + :ivar identifier: The id of the instance, i.e. 
the row number + :ivar data_section: The `DataSection` object that defines the data model for this instance + :ivar values: A list of `DataFieldValue` objects, each adhering to the `DataField` definition in the `DataModel` + """ identifier: Union[str, int] = field() data_section: DataSection = field() - values: Tuple[Union[DataFieldValue, 'DataSectionInstance']] = field(default_factory=tuple(list())) + values: Tuple[Union[DataFieldValue, 'DataSectionInstance']] = field() def validate(self) -> bool: + tmp = self.identifier warnings.warn("The DataSectionInstance validate method has not been implemented yet.") return True From e16bf0f1b475fa816c3023f4794b99d59be4c930 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 15:21:21 +0200 Subject: [PATCH 31/71] implemented loading of datasection instances --- notebooks/hierarchical_data_model.ipynb | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index 2560189..d187da8 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -16,6 +16,7 @@ "from build.lib.phenopacket_mapper.data_standards import DataModelInstance\n", "from phenopacket_mapper.data_standards import DataField\n", "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup\n", + "from phenopacket_mapper.data_standards.data_model import DataSectionInstance\n", "from phenopacket_mapper.utils.io import DataReader\n", "from referencing.jsonschema import specification_with" ], @@ -268,10 +269,25 @@ " for f in data_model.fields\n", " )\n", " elif isinstance(data_model, DataSection):\n", - " # TODO: DataSectionInstance\n", - " pass\n", + " data_section: DataSection = data_model\n", + " \n", + " values = (\n", + " load_hierarchical_data_recursive(\n", + " loaded_data_instance,\n", + " f,\n", + " compliance=compliance,\n", + " **kwargs,\n", + " )\n", + " for f in data_section.fields\n", + " )\n", + " \n", + " return DataSectionInstance(\n", + " identifier=\"123\", # TODO: assign identifier\n", + " data_section=data_section,\n", + " values=values,\n", + " )\n", " elif isinstance(data_model, OrGroup):\n", - " # TODO: resolve or\n", + " # TODO: resolve or this seems to be very difficult\n", " pass\n", " elif isinstance(data_model, DataField):\n", " # TODO: assign value\n", From 6565ed09b90ca09047d5f2764c2a87758aebb226 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 15:21:59 +0200 Subject: [PATCH 32/71] typo --- notebooks/hierarchical_data_model.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index d187da8..6f99149 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -255,7 +255,7 @@ " \n", " :param loaded_data_instance: data loaded in by :class:`DataReader`\n", " :param data_model:\n", - " :compliance: Compliance level to enforce when reading the file. If 'lenient', the file can have extra fields\n", + " :param compliance: Compliance level to enforce when reading the file. If 'lenient', the file can have extra fields\n", " that are not in the DataModel. 
If 'strict', the file must have all fields in the DataModel.\n", " \"\"\"\n", " if isinstance(data_model, DataModel):\n", From 33db1694983ec3c744b67e3b09b9235813aa1134 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 18:38:25 +0200 Subject: [PATCH 33/71] removed data models --- src/phenopacket_mapper/data_standards/__init__.py | 2 -- .../data_standards/data_models/__init__.py | 5 ----- .../data_standards/data_models/erdri_cds.py | 9 --------- 3 files changed, 16 deletions(-) delete mode 100644 src/phenopacket_mapper/data_standards/data_models/__init__.py delete mode 100644 src/phenopacket_mapper/data_standards/data_models/erdri_cds.py diff --git a/src/phenopacket_mapper/data_standards/__init__.py b/src/phenopacket_mapper/data_standards/__init__.py index d12bb64..1b30401 100644 --- a/src/phenopacket_mapper/data_standards/__init__.py +++ b/src/phenopacket_mapper/data_standards/__init__.py @@ -4,14 +4,12 @@ from .code_system import CodeSystem, SNOMED_CT, HPO, MONDO, OMIM, ORDO, LOINC from .code import Coding, CodeableConcept from .data_model import DataModel, DataField, DataModelInstance, DataFieldValue, DataSet, DataSection, OrGroup -from . import data_models from .value_set import ValueSet __all__ = [ "Cardinality", "Coding", "CodeableConcept", "DataModel", "DataField", "DataModelInstance", "DataFieldValue", "DataSet", "DataSection", "OrGroup", - "data_models", "CodeSystem", "SNOMED_CT", "HPO", "MONDO", "OMIM", "ORDO", "LOINC", "Date", diff --git a/src/phenopacket_mapper/data_standards/data_models/__init__.py b/src/phenopacket_mapper/data_standards/data_models/__init__.py deleted file mode 100644 index dab9824..0000000 --- a/src/phenopacket_mapper/data_standards/data_models/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -"""Selection of rare disease specific data models""" -from .erdri_cds import ERDRI_CDS -from phenopacket_mapper.utils.parsing.parse_data_type import parse_data_type - -__all__ = ["ERDRI_CDS", "parse_data_type"] diff --git a/src/phenopacket_mapper/data_standards/data_models/erdri_cds.py b/src/phenopacket_mapper/data_standards/data_models/erdri_cds.py deleted file mode 100644 index 92e1b59..0000000 --- a/src/phenopacket_mapper/data_standards/data_models/erdri_cds.py +++ /dev/null @@ -1,9 +0,0 @@ -from phenopacket_mapper.data_standards.data_model import DataModel, DataField -ERDRI_CDS = None -# ERDRI_CDS = DataModel( -# data_model_name="ERDRI_CDS", -# resources=[], -# fields=[ -# # TODO: Implement fields -# ] -# ) From 7a87400da9b8ac22a4eb856e30351586df364c0d Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 18:45:14 +0200 Subject: [PATCH 34/71] added mapping param instead of kwargs --- notebooks/hierarchical_data_model.ipynb | 174 +++++------------------- 1 file changed, 34 insertions(+), 140 deletions(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index 6f99149..e49b36f 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -5,31 +5,22 @@ "id": "initial_id", "metadata": { "collapsed": true, - "ExecuteTime": { - "end_time": "2024-10-16T12:58:18.383581Z", - "start_time": "2024-10-16T12:58:18.379833Z" + "jupyter": { + "is_executing": true } }, "source": [ - "from numpy.f2py.auxfuncs import isintent_dict\n", - "\n", "from build.lib.phenopacket_mapper.data_standards import DataModelInstance\n", "from phenopacket_mapper.data_standards import DataField\n", "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup\n", "from 
phenopacket_mapper.data_standards.data_model import DataSectionInstance\n", - "from phenopacket_mapper.utils.io import DataReader\n", - "from referencing.jsonschema import specification_with" + "from phenopacket_mapper.utils.io import DataReader" ], "outputs": [], - "execution_count": 13 + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-16T12:58:18.413647Z", - "start_time": "2024-10-16T12:58:18.407115Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "genomic_interpretation = DataModel(\n", @@ -99,15 +90,18 @@ ], "id": "2e979683ae450d9b", "outputs": [], - "execution_count": 14 + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-16T12:58:18.429528Z", - "start_time": "2024-10-16T12:58:18.425940Z" - } - }, + "metadata": {}, + "cell_type": "code", + "source": "genomic_interpretation.example.a_number", + "id": "a32bb965c37e98b4", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, "cell_type": "code", "source": [ "s = str(genomic_interpretation)\n", @@ -115,87 +109,11 @@ "print(s)" ], "id": "35a697d8b9b8236d", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "DataModel(\n", - "\tname: Phenopacket schema Genomic Interpretation\n", - "\tDataField(\n", - "\t\tid: subject_or_biosample_id,\n", - "\t\tname: subject_or_biosample_id,\n", - "\t\trequired: True\n", - "\t\tspecification: ValueSet(elements=[], name='', description='')\n", - "\t\tcardinality: 1..n\n", - "\t)\n", - "\tDataField(\n", - "\t\tid: interpretation_status,\n", - "\t\tname: interpretation_status,\n", - "\t\trequired: True\n", - "\t\tspecification: ValueSet(elements=['UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'], name='Interpretation Status Value Set', description='')\n", - "\t\tcardinality: 1..n\n", - "\t)\n", - "\tDataSection(\n", - "\t\tid: example,\n", - "\t\tname: example,\n", - "\t\trequired: True\n", - "\t\tcardinality: 1..n\n", - "\tDataField(\n", - "\t\tid: a_number,\n", - "\t\tname: a_number,\n", - "\t\trequired: True\n", - "\t\tspecification: ValueSet(elements=[], name='', description='')\n", - "\t\tcardinality: 1..n\n", - "\t)\n", - "\t)\n", - "\tOrGroup(\n", - "\t\tid: call,\n", - "\t\tname: call,\n", - "\t\trequired: False\n", - "\t\tcardinality: 0..n\n", - "\tDataSection(\n", - "\t\tid: genedescriptor,\n", - "\t\tname: GeneDescriptor,\n", - "\t\trequired: False\n", - "\t\tcardinality: 0..n\n", - "\tDataField(\n", - "\t\tid: value_id,\n", - "\t\tname: value_id,\n", - "\t\trequired: True\n", - "\t\tspecification: ValueSet(elements=[], name='', description='')\n", - "\t\tcardinality: 1..n\n", - "\t)\n", - "\tDataField(\n", - "\t\tid: symbol,\n", - "\t\tname: symbol,\n", - "\t\trequired: True\n", - "\t\tspecification: ValueSet(elements=[], name='', description='')\n", - "\t\tcardinality: 1..n\n", - "\t)\n", - "\tDataField(\n", - "\t\tid: description,\n", - "\t\tname: description,\n", - "\t\trequired: False\n", - "\t\tspecification: ValueSet(elements=[], name='', description='')\n", - "\t\tcardinality: 0..n\n", - "\t)\n", - "\t)\n", - "\t)\n", - "---\n", - ")\n" - ] - } - ], - "execution_count": 15 + "outputs": [], + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-16T12:58:18.484302Z", - "start_time": "2024-10-16T12:58:18.481047Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "from io import StringIO\n", @@ -215,15 +133,10 @@ ], "id": "4c78eb05ea58ff6c", "outputs": [], - "execution_count": 16 + 
"execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-16T13:02:26.777660Z", - "start_time": "2024-10-16T13:02:26.769768Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "import warnings\n", @@ -325,8 +238,10 @@ " data_model: DataModel, \n", " file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None,\n", " compliance: Literal['lenient', 'strict'] = 'lenient',\n", - " **kwargs,\n", + " mapping: Dict[DataField, str] = None,\n", "): \n", + " if not mapping:\n", + " raise AttributeError(f\"Parameter 'mapping' must not be empty or None. {mapping=}, {type(mapping)=}\")\n", " if not data_model.is_hierarchical:\n", " warnings.warn(\"This method is only for loading hierarchical data, it may behave unexpectedly for tabular data.\")\n", " \n", @@ -355,49 +270,28 @@ ], "id": "affc9ecd939c903f", "outputs": [], - "execution_count": 19 + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-16T12:58:18.510293Z", - "start_time": "2024-10-16T12:58:18.507134Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "data_model_instance = load_hierarchical_data(\n", - " buffer, \n", - " genomic_interpretation, \n", + " file=buffer, \n", + " data_model=genomic_interpretation, \n", " file_extension=\"xml\",\n", - " subject_or_biosample_id=\"ODM.ClinicalData.SubjectData.SubjectKey\",\n", - " example__a_number=\"ODM.ClinicalData.SubjectData.ANumber\",\n", + " mapping={\n", + " genomic_interpretation.subject_or_biosample_id: \"ODM.ClinicalData.SubjectData.SubjectKey\",\n", + " genomic_interpretation.example.a_number: \"ODM.ClinicalData.SubjectData.ANumber\",\n", + " }\n", ")" ], "id": "53937efded7f589f", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "k='subject_or_biosample_id': v='ODM.ClinicalData.SubjectData.SubjectKey'\n", - "dict_value=101\n", - "retrieved k='subject_or_biosample_id': repr(value)='101'\n", - "k='example__a_number': v='ODM.ClinicalData.SubjectData.ANumber'\n", - "dict_value=123\n", - "retrieved k='example__a_number': repr(value)='123'\n" - ] - } - ], - "execution_count": 18 + "outputs": [], + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-10-16T12:58:18.563943Z", - "start_time": "2024-10-16T12:58:18.561884Z" - } - }, + "metadata": {}, "cell_type": "code", "source": "", "id": "edbf8ad0a0a55290", From ce97575b91278cbd02feca8d5d95c6ed3f923eea Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 18:52:11 +0200 Subject: [PATCH 35/71] implemented filling datafieldvalue when loading hierarchical --- notebooks/hierarchical_data_model.ipynb | 46 ++++++++++++------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index e49b36f..9fa40be 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -160,7 +160,7 @@ " loaded_data_instance: Dict,\n", " data_model: Union[DataModel, DataSection, OrGroup, DataField],\n", " compliance: Literal['lenient', 'strict'] = 'lenient',\n", - " **kwargs,\n", + " mapping: Dict[DataField, str] = None,\n", "):\n", " \"\"\"Helper method for `load_hierarchical_data`, recurses through hierarchical :class:`DataModel`\n", " \n", @@ -170,6 +170,7 @@ " :param data_model:\n", " :param compliance: Compliance level to enforce when reading the file. If 'lenient', the file can have extra fields\n", " that are not in the DataModel. 
If 'strict', the file must have all fields in the DataModel.\n", + " :param mapping: specifies the mapping from data fields present in the data model to identifiers of fields in the data\n", " \"\"\"\n", " if isinstance(data_model, DataModel):\n", " return (\n", @@ -177,7 +178,7 @@ " loaded_data_instance,\n", " f,\n", " compliance=compliance,\n", - " **kwargs,\n", + " mapping=mapping\n", " )\n", " for f in data_model.fields\n", " )\n", @@ -189,7 +190,7 @@ " loaded_data_instance,\n", " f,\n", " compliance=compliance,\n", - " **kwargs,\n", + " mapping=mapping,\n", " )\n", " for f in data_section.fields\n", " )\n", @@ -203,26 +204,22 @@ " # TODO: resolve or this seems to be very difficult\n", " pass\n", " elif isinstance(data_model, DataField):\n", - " # TODO: assign value\n", - " # # preprocess the data into values\n", - " # for k, v in kwargs.items():\n", - " # print(f\"{k=}: {v=}\")\n", - " # # retrieve value from loaded data instance\n", - " # v_keys = v.split('.')\n", - " # dict_value = recursive_dict_call(data, v_keys)\n", - " # \n", - " # print(f\"{dict_value=}\")\n", - " # if not dict_value or (isinstance(dict_value, float) and math.isnan(dict_value)):\n", - " # continue\n", - " # \n", - " # value_str = str(dict_value)\n", - " # value = parsing.parse_value(value_str=value_str, resources=data_model.resources, compliance=compliance)\n", - " # data_field_value = DataFieldValue(row_no=i, field=f, value=value)\n", - " # \n", - " # kwargs[k] = value\n", - " # \n", - " # print(f\"retrieved {k=}: {value=}\")\n", - " pass\n", + " data_field = data_model\n", + " \n", + " keys_str = mapping.get(data_model, None)\n", + " \n", + " if keys_str:\n", + " keys = keys_str.split('.')\n", + " dict_value = recursive_dict_call(loaded_data_instance, keys)\n", + "\n", + " if not dict_value or (isinstance(dict_value, float) and math.isnan(dict_value)):\n", + " return None\n", + "\n", + " value_str = str(dict_value)\n", + " value = parsing.parse_value(value_str=value_str, resources=data_model.resources, compliance=compliance)\n", + " data_field_value = DataFieldValue(row_no=keys_str, field=data_field, value=value)\n", + "\n", + " return data_field_value\n", " else:\n", " err_msg = f\"DataModel {data_model} is not a valid type ({type(data_model)}).\"\n", " if compliance == 'strict':\n", @@ -242,6 +239,7 @@ "): \n", " if not mapping:\n", " raise AttributeError(f\"Parameter 'mapping' must not be empty or None. 
{mapping=}, {type(mapping)=}\")\n", + " \n", " if not data_model.is_hierarchical:\n", " warnings.warn(\"This method is only for loading hierarchical data, it may behave unexpectedly for tabular data.\")\n", " \n", @@ -261,7 +259,7 @@ " data_instance,\n", " data_model,\n", " compliance=compliance,\n", - " **kwargs,\n", + " mapping=mapping\n", " ),\n", " compliance=compliance,\n", " )\n", From 404884cc3e5661747f409648137241e1a0074811 Mon Sep 17 00:00:00 2001 From: frehburg Date: Wed, 16 Oct 2024 18:59:48 +0200 Subject: [PATCH 36/71] fixed identifiers --- notebooks/hierarchical_data_model.ipynb | 26 ++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index 9fa40be..66a4859 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -157,6 +157,7 @@ " return recursive_dict_call(d.get(keys[0], default), keys[1:])\n", " \n", "def load_hierarchical_data_recursive(\n", + " loaded_data_instance_identifier: Union[int, str],\n", " loaded_data_instance: Dict,\n", " data_model: Union[DataModel, DataSection, OrGroup, DataField],\n", " compliance: Literal['lenient', 'strict'] = 'lenient',\n", @@ -166,6 +167,7 @@ " \n", " `loaded_data_instance` is expected to be a dictionary as returned by `DataReader.data` when reading a single xml or json file \n", " \n", + " :param loaded_data_instance_identifier: identifier of the loaded data_instance\n", " :param loaded_data_instance: data loaded in by :class:`DataReader`\n", " :param data_model:\n", " :param compliance: Compliance level to enforce when reading the file. If 'lenient', the file can have extra fields\n", @@ -175,8 +177,9 @@ " if isinstance(data_model, DataModel):\n", " return (\n", " load_hierarchical_data_recursive(\n", - " loaded_data_instance,\n", - " f,\n", + " loaded_data_instance_identifier=loaded_data_instance_identifier,\n", + " loaded_data_instance=loaded_data_instance,\n", + " data_model=f,\n", " compliance=compliance,\n", " mapping=mapping\n", " )\n", @@ -187,8 +190,9 @@ " \n", " values = (\n", " load_hierarchical_data_recursive(\n", - " loaded_data_instance,\n", - " f,\n", + " loaded_data_instance_identifier=loaded_data_instance_identifier,\n", + " loaded_data_instance=loaded_data_instance,\n", + " data_model=f,\n", " compliance=compliance,\n", " mapping=mapping,\n", " )\n", @@ -196,7 +200,7 @@ " )\n", " \n", " return DataSectionInstance(\n", - " identifier=\"123\", # TODO: assign identifier\n", + " identifier=str(loaded_data_instance_identifier) + \":\" + data_section.id, # TODO: get identifiers of parents\n", " data_section=data_section,\n", " values=values,\n", " )\n", @@ -217,7 +221,11 @@ "\n", " value_str = str(dict_value)\n", " value = parsing.parse_value(value_str=value_str, resources=data_model.resources, compliance=compliance)\n", - " data_field_value = DataFieldValue(row_no=keys_str, field=data_field, value=value)\n", + " data_field_value = DataFieldValue(\n", + " row_no=str(loaded_data_instance_identifier) + \":\" + keys_str, \n", + " field=data_field, \n", + " value=value\n", + " )\n", "\n", " return data_field_value\n", " else:\n", @@ -250,14 +258,14 @@ " data_model_instances = []\n", " \n", " for i, data_instance in enumerate(data_iterable):\n", - " # iterate through data model\n", " data_model_instances.append(\n", " DataModelInstance(\n", " row_no=i,\n", " data_model=data_model,\n", " values=load_hierarchical_data_recursive(\n", - " data_instance,\n", - " 
data_model,\n", + " loaded_data_instance_identifier=str(i),\n", + " loaded_data_instance=data_instance,\n", + " data_model=data_model,\n", " compliance=compliance,\n", " mapping=mapping\n", " ),\n", From 73ed4f3c3e08f8476be2fdde67bf837b284975a3 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 13:41:13 +0200 Subject: [PATCH 37/71] moved recursive dict call from nb to utils --- src/phenopacket_mapper/utils/__init__.py | 4 +++- src/phenopacket_mapper/utils/recursive_dict_call.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 src/phenopacket_mapper/utils/recursive_dict_call.py diff --git a/src/phenopacket_mapper/utils/__init__.py b/src/phenopacket_mapper/utils/__init__.py index 02d1f05..d6b7c92 100644 --- a/src/phenopacket_mapper/utils/__init__.py +++ b/src/phenopacket_mapper/utils/__init__.py @@ -2,9 +2,11 @@ from .create_ipynb_in_code import NotebookBuilder from .pandas_utils import loc_default from .str_to_valid_id import str_to_valid_id +from .recursive_dict_call import recursive_dict_call __all__ = [ "NotebookBuilder", "loc_default", - "str_to_valid_id" + "str_to_valid_id", + "recursive_dict_call", ] diff --git a/src/phenopacket_mapper/utils/recursive_dict_call.py b/src/phenopacket_mapper/utils/recursive_dict_call.py new file mode 100644 index 0000000..d3bb401 --- /dev/null +++ b/src/phenopacket_mapper/utils/recursive_dict_call.py @@ -0,0 +1,10 @@ +from typing import Dict, List + + +def recursive_dict_call(d: Dict, keys: List, default=None): + if not isinstance(d, dict): + return d + elif len(keys) == 1: + return d.get(keys[0], default) + else: + return recursive_dict_call(d.get(keys[0], default), keys[1:]) \ No newline at end of file From ffbfe4c5e2692bd3c7985ef3d5685d1f2aeaacbb Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 13:58:00 +0200 Subject: [PATCH 38/71] wrote hierarchical data loading --- src/phenopacket_mapper/utils/io/input.py | 134 ++++++++++++++++++++++- 1 file changed, 132 insertions(+), 2 deletions(-) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 255d7d5..9f824be 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -1,5 +1,6 @@ import math import os +import warnings from io import IOBase from pathlib import Path from types import MappingProxyType @@ -10,11 +11,13 @@ from google.protobuf.json_format import Parse from phenopacket_mapper.data_standards import DataModel, DataModelInstance, DataField, CodeSystem, DataFieldValue, \ - DataSet -from phenopacket_mapper.utils import loc_default + DataSet, OrGroup, DataSection +from phenopacket_mapper.data_standards.data_model import DataSectionInstance +from phenopacket_mapper.utils import loc_default, recursive_dict_call from phenopacket_mapper.utils import parsing from phenopacket_mapper.utils.io.data_reader import DataReader from phenopacket_mapper.utils.parsing import parse_ordinal +from tests.utils.parsing.test_parse_coding import resources def read_data_model( @@ -185,6 +188,9 @@ def load_tabular_data_using_data_model( value_str = str(pandas_value) value = parsing.parse_value(value_str=value_str, resources=data_model.resources, compliance=compliance) values.append(DataFieldValue(row_no=i, field=f, value=value)) + + values = tuple(values) + data_model_instances.append( DataModelInstance( row_no=i, @@ -226,3 +232,127 @@ def read_phenopacket_from_json(path: Union[str, Path]) -> Phenopacket: phenopacket = Phenopacket() Parse(json_data, phenopacket) return 
phenopacket + + +def load_hierarchical_data_recursive( + loaded_data_instance_identifier: Union[int, str], + loaded_data_instance: Dict, + data_model: Union[DataModel, DataSection, OrGroup, DataField], + resources: List[CodeSystem], + compliance: Literal['lenient', 'strict'] = 'lenient', + mapping: Dict[DataField, str] = None, +) -> Union[Tuple, Union[DataModelInstance, DataSectionInstance, DataFieldValue, None]]: + """Helper method for `load_hierarchical_data`, recurses through hierarchical :class:`DataModel` + + `loaded_data_instance` is expected to be a dictionary as returned by `DataReader.data` when reading a single xml or json file + + :param loaded_data_instance_identifier: identifier of the loaded data_instance + :param loaded_data_instance: data loaded in by :class:`DataReader` + :param data_model: + :param resources: List of `CodeSystem` objects to be used as resources in the `DataModel` + :param compliance: Compliance level to enforce when reading the file. If 'lenient', the file can have extra fields + that are not in the DataModel. If 'strict', the file must have all fields in the DataModel. + :param mapping: specifies the mapping from data fields present in the data model to identifiers of fields in the data + """ + if isinstance(data_model, DataModel): + tmp: List[Union[DataModelInstance, DataSectionInstance, DataFieldValue, None]] = [ + load_hierarchical_data_recursive( + loaded_data_instance_identifier=loaded_data_instance_identifier, + loaded_data_instance=loaded_data_instance, + data_model=f, + resources=resources, + compliance=compliance, + mapping=mapping + ) + for f in data_model.fields + ] + return tuple(tmp) + elif isinstance(data_model, DataSection): + data_section: DataSection = data_model + + values = tuple([ + load_hierarchical_data_recursive( + loaded_data_instance_identifier=loaded_data_instance_identifier, + loaded_data_instance=loaded_data_instance, + data_model=f, + resources=resources, + compliance=compliance, + mapping=mapping, + ) + for f in data_section.fields + ]) + + return DataSectionInstance( + identifier=str(loaded_data_instance_identifier) + ":" + data_section.id, # TODO: get identifiers of parents + data_section=data_section, + values=values, + ) + elif isinstance(data_model, OrGroup): + # TODO: resolve or this seems to be very difficult + pass + elif isinstance(data_model, DataField): + data_field = data_model + + keys_str = mapping.get(data_model, None) + + if keys_str: + keys = keys_str.split('.') + dict_value = recursive_dict_call(loaded_data_instance, keys) + + if not dict_value or (isinstance(dict_value, float) and math.isnan(dict_value)): + return None + + value_str = str(dict_value) + value = parsing.parse_value(value_str=value_str, resources=data_model.resources, compliance=compliance) + data_field_value = DataFieldValue( + row_no=str(loaded_data_instance_identifier) + ":" + keys_str, + field=data_field, + value=value + ) + + return data_field_value + else: + err_msg = f"DataModel {data_model} is not a valid type ({type(data_model)})." 
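# A minimal, self-contained sketch (not part of the patch above) of how the
# dotted-path lookup in the DataField branch behaves. `recursive_dict_call` is
# the helper moved to `phenopacket_mapper.utils` in PATCH 37; the nested dict
# mirrors the ODM example from the notebook, with illustrative values.
from phenopacket_mapper.utils import recursive_dict_call

odm = {"ODM": {"ClinicalData": {"SubjectData": {"SubjectKey": 101, "ANumber": 123}}}}

assert recursive_dict_call(odm, "ODM.ClinicalData.SubjectData.SubjectKey".split(".")) == 101
# A path that leaves the mapping early returns the default (None) instead of raising:
assert recursive_dict_call(odm, "ODM.Missing.SubjectKey".split(".")) is None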
+ if compliance == 'strict': + raise ValueError(err_msg) + elif compliance == 'lenient': + warnings.warn(err_msg) + else: + raise ValueError(f"Invalid compliance level: {compliance}") + + +def load_hierarchical_data( + file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]], + data_model: DataModel, + file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None, + compliance: Literal['lenient', 'strict'] = 'lenient', + mapping: Dict[DataField, str] = None, +): + if not mapping: + raise AttributeError(f"Parameter 'mapping' must not be empty or None. {mapping=}, {type(mapping)=}") + + if not data_model.is_hierarchical: + warnings.warn("This method is only for loading hierarchical data, it may behave unexpectedly for tabular data.") + + data_reader = DataReader(file, file_extension=file_extension) + data, data_iterable = data_reader.data, data_reader.iterable + + # assembling data model instances + data_model_instances = [] + + for i, data_instance in enumerate(data_iterable): + data_model_instances.append( + DataModelInstance( + row_no=i, + data_model=data_model, + values=load_hierarchical_data_recursive( + loaded_data_instance_identifier=str(i), + loaded_data_instance=data_instance, + data_model=data_model, + resources=data_model.resources, + compliance=compliance, + mapping=mapping + ), + compliance=compliance, + ) + ) From eeaf90955903bc8adb659ea87a9ede1b9b554803 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 13:58:08 +0200 Subject: [PATCH 39/71] changed todo --- src/phenopacket_mapper/utils/io/data_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/utils/io/data_reader.py b/src/phenopacket_mapper/utils/io/data_reader.py index 9189442..a6154da 100644 --- a/src/phenopacket_mapper/utils/io/data_reader.py +++ b/src/phenopacket_mapper/utils/io/data_reader.py @@ -22,7 +22,7 @@ def __init__( :param file_extension: The file extension of the file to read. If `None`, the file extension is inferred from the file path. Default is `None`. """ - # TODO: add option to pass a list of files to read + # TODO: fix file names so we can identify data instances correctly, can do this at the start self.is_dir = False self.file_extension = None From 4cfef5a126a258dc467d30800e539d4b456e78f2 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 13:58:32 +0200 Subject: [PATCH 40/71] fixed typing in data model --- src/phenopacket_mapper/data_standards/data_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 42c899b..e9b8317 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -304,7 +304,7 @@ class DataSectionInstance: """ identifier: Union[str, int] = field() data_section: DataSection = field() - values: Tuple[Union[DataFieldValue, 'DataSectionInstance']] = field() + values: Tuple[Union[DataFieldValue, 'DataSectionInstance'], ...] = field() def validate(self) -> bool: tmp = self.identifier @@ -327,7 +327,7 @@ class DataModelInstance: """ row_no: Union[int, str] data_model: DataModel - values: List[Union[DataFieldValue, DataSectionInstance]] + values: Tuple[Union[DataFieldValue, DataSectionInstance], ...] 
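# A short sketch of why these annotations move from List to Tuple: the later
# "frozen" patch makes the data classes hashable, and a frozen dataclass is only
# hashable if all of its field values are hashable -- a list-valued field triggers
# exactly the "TypeError: unhashable type: 'list'" seen in the notebook output.
# The toy class below is illustrative only, not the library's real definition.
from dataclasses import dataclass
from typing import Tuple

@dataclass(frozen=True)
class Toy:
    values: Tuple[int, ...]  # `values: list` here would make Toy unhashable

lookup = {Toy(values=(1, 2)): "usable as a dict key"}  # hash(Toy(...)) works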
compliance: Literal['lenient', 'strict'] = 'lenient' def __post_init__(self): From 411a72e2c0a1b85de177dac07a5df35b13e0a7e4 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 13:58:43 +0200 Subject: [PATCH 41/71] removed methods for hierarchical data loading --- notebooks/hierarchical_data_model.ipynb | 173 ++++++++++++++++++++---- 1 file changed, 146 insertions(+), 27 deletions(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index 66a4859..dfd43d0 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -5,22 +5,28 @@ "id": "initial_id", "metadata": { "collapsed": true, - "jupyter": { - "is_executing": true + "ExecuteTime": { + "end_time": "2024-10-17T11:40:00.972441Z", + "start_time": "2024-10-17T11:40:00.967261Z" } }, "source": [ - "from build.lib.phenopacket_mapper.data_standards import DataModelInstance\n", + "from phenopacket_mapper.data_standards import DataModelInstance\n", "from phenopacket_mapper.data_standards import DataField\n", "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup\n", "from phenopacket_mapper.data_standards.data_model import DataSectionInstance\n", "from phenopacket_mapper.utils.io import DataReader" ], "outputs": [], - "execution_count": null + "execution_count": 8 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-17T11:40:00.995234Z", + "start_time": "2024-10-17T11:40:00.978558Z" + } + }, "cell_type": "code", "source": [ "genomic_interpretation = DataModel(\n", @@ -90,18 +96,39 @@ ], "id": "2e979683ae450d9b", "outputs": [], - "execution_count": null + "execution_count": 9 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-17T11:40:01.041106Z", + "start_time": "2024-10-17T11:40:01.028028Z" + } + }, "cell_type": "code", "source": "genomic_interpretation.example.a_number", "id": "a32bb965c37e98b4", - "outputs": [], - "execution_count": null + "outputs": [ + { + "data": { + "text/plain": [ + "DataField(name='a_number', specification=ValueSet(elements=[], name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n'))" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 10 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-17T11:40:01.091102Z", + "start_time": "2024-10-17T11:40:01.085216Z" + } + }, "cell_type": "code", "source": [ "s = str(genomic_interpretation)\n", @@ -109,11 +136,87 @@ "print(s)" ], "id": "35a697d8b9b8236d", - "outputs": [], - "execution_count": null + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DataModel(\n", + "\tname: Phenopacket schema Genomic Interpretation\n", + "\tDataField(\n", + "\t\tid: subject_or_biosample_id,\n", + "\t\tname: subject_or_biosample_id,\n", + "\t\trequired: True\n", + "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t\tcardinality: 1..n\n", + "\t)\n", + "\tDataField(\n", + "\t\tid: interpretation_status,\n", + "\t\tname: interpretation_status,\n", + "\t\trequired: True\n", + "\t\tspecification: ValueSet(elements=['UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'], name='Interpretation Status Value Set', description='')\n", + "\t\tcardinality: 1..n\n", + "\t)\n", + "\tDataSection(\n", + "\t\tid: example,\n", + "\t\tname: example,\n", + "\t\trequired: True\n", + "\t\tcardinality: 1..n\n", + 
"\tDataField(\n", + "\t\tid: a_number,\n", + "\t\tname: a_number,\n", + "\t\trequired: True\n", + "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t\tcardinality: 1..n\n", + "\t)\n", + "\t)\n", + "\tOrGroup(\n", + "\t\tid: call,\n", + "\t\tname: call,\n", + "\t\trequired: False\n", + "\t\tcardinality: 0..n\n", + "\tDataSection(\n", + "\t\tid: genedescriptor,\n", + "\t\tname: GeneDescriptor,\n", + "\t\trequired: False\n", + "\t\tcardinality: 0..n\n", + "\tDataField(\n", + "\t\tid: value_id,\n", + "\t\tname: value_id,\n", + "\t\trequired: True\n", + "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t\tcardinality: 1..n\n", + "\t)\n", + "\tDataField(\n", + "\t\tid: symbol,\n", + "\t\tname: symbol,\n", + "\t\trequired: True\n", + "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t\tcardinality: 1..n\n", + "\t)\n", + "\tDataField(\n", + "\t\tid: description,\n", + "\t\tname: description,\n", + "\t\trequired: False\n", + "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t\tcardinality: 0..n\n", + "\t)\n", + "\t)\n", + "\t)\n", + "---\n", + ")\n" + ] + } + ], + "execution_count": 11 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-17T11:40:01.158069Z", + "start_time": "2024-10-17T11:40:01.152182Z" + } + }, "cell_type": "code", "source": [ "from io import StringIO\n", @@ -133,10 +236,15 @@ ], "id": "4c78eb05ea58ff6c", "outputs": [], - "execution_count": null + "execution_count": 12 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-17T11:40:01.190285Z", + "start_time": "2024-10-17T11:40:01.170449Z" + } + }, "cell_type": "code", "source": [ "import warnings\n", @@ -147,14 +255,6 @@ "from typing import Union, List, Literal, Dict\n", "\n", "from phenopacket_mapper.utils import parsing\n", - "\n", - "def recursive_dict_call(d: Dict, keys: List, default=None):\n", - " if not isinstance(d, dict):\n", - " return d\n", - " elif len(keys) == 1:\n", - " return d.get(keys[0], default)\n", - " else:\n", - " return recursive_dict_call(d.get(keys[0], default), keys[1:])\n", " \n", "def load_hierarchical_data_recursive(\n", " loaded_data_instance_identifier: Union[int, str],\n", @@ -276,10 +376,15 @@ ], "id": "affc9ecd939c903f", "outputs": [], - "execution_count": null + "execution_count": 13 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-17T11:40:01.226816Z", + "start_time": "2024-10-17T11:40:01.203147Z" + } + }, "cell_type": "code", "source": [ "data_model_instance = load_hierarchical_data(\n", @@ -293,8 +398,22 @@ ")" ], "id": "53937efded7f589f", - "outputs": [], - "execution_count": null + "outputs": [ + { + "ename": "TypeError", + "evalue": "unhashable type: 'list'", + "output_type": "error", + "traceback": [ + "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[1;31mTypeError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[1;32mIn[14], line 5\u001B[0m\n\u001B[0;32m 1\u001B[0m data_model_instance \u001B[38;5;241m=\u001B[39m load_hierarchical_data(\n\u001B[0;32m 2\u001B[0m file\u001B[38;5;241m=\u001B[39mbuffer, \n\u001B[0;32m 3\u001B[0m data_model\u001B[38;5;241m=\u001B[39mgenomic_interpretation, \n\u001B[0;32m 4\u001B[0m file_extension\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mxml\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[1;32m----> 5\u001B[0m mapping\u001B[38;5;241m=\u001B[39m{\n\u001B[0;32m 6\u001B[0m 
genomic_interpretation\u001B[38;5;241m.\u001B[39msubject_or_biosample_id: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mODM.ClinicalData.SubjectData.SubjectKey\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m 7\u001B[0m genomic_interpretation\u001B[38;5;241m.\u001B[39mexample\u001B[38;5;241m.\u001B[39ma_number: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mODM.ClinicalData.SubjectData.ANumber\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m 8\u001B[0m }\n\u001B[0;32m 9\u001B[0m )\n", + "File \u001B[1;32m:3\u001B[0m, in \u001B[0;36m__hash__\u001B[1;34m(self)\u001B[0m\n", + "File \u001B[1;32m:3\u001B[0m, in \u001B[0;36m__hash__\u001B[1;34m(self)\u001B[0m\n", + "\u001B[1;31mTypeError\u001B[0m: unhashable type: 'list'" + ] + } + ], + "execution_count": 14 }, { "metadata": {}, From 5b9e89ef4ccfa9d7fcacbde987fd4f8d925c63ca Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 14:02:57 +0200 Subject: [PATCH 42/71] added load hierarchical data to init --- src/phenopacket_mapper/utils/io/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/utils/io/__init__.py b/src/phenopacket_mapper/utils/io/__init__.py index f85a98f..059bb43 100644 --- a/src/phenopacket_mapper/utils/io/__init__.py +++ b/src/phenopacket_mapper/utils/io/__init__.py @@ -4,6 +4,7 @@ from .read_xml import read_xml, parse_xml from .data_reader import DataReader from .input import read_data_model, read_phenopackets, read_phenopacket_from_json, load_tabular_data_using_data_model +from .input import load_hierarchical_data from .output import write __all__ = [ @@ -14,6 +15,6 @@ 'read_phenopackets', 'read_phenopacket_from_json', 'load_tabular_data_using_data_model', - + 'load_hierarchical_data', 'write', ] From 29b5ab054eb164a2b7a279c7b83efec8aa1d2df3 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 14:03:12 +0200 Subject: [PATCH 43/71] changed elements in value set to tuple instead of list --- src/phenopacket_mapper/data_standards/value_set.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/value_set.py b/src/phenopacket_mapper/data_standards/value_set.py index 066cc6e..1644664 100644 --- a/src/phenopacket_mapper/data_standards/value_set.py +++ b/src/phenopacket_mapper/data_standards/value_set.py @@ -1,6 +1,6 @@ import warnings from dataclasses import dataclass, field -from typing import List, Union, Literal +from typing import List, Union, Literal, Tuple from phenopacket_mapper.data_standards import Coding, CodeableConcept, CodeSystem, Date @@ -29,7 +29,7 @@ class ValueSet: :ivar name: Name of the value set :ivar description: Description of the value set """ - elements: List[Union[Coding, CodeableConcept, CodeSystem, str, bool, int, float, Date, type]] \ + elements: Tuple[Union[Coding, CodeableConcept, CodeSystem, str, bool, int, float, Date, type], ...] 
\ = field(default_factory=list) name: str = field(default="") description: str = field(default="") @@ -43,12 +43,12 @@ def __post_init__(self): def extend(self, new_name: str, value_set: 'ValueSet', new_description: str = '') -> 'ValueSet': return ValueSet(name=new_name, - elements=list(set(self.elements + value_set.elements)), + elements=tuple(set(self.elements + value_set.elements)), description=new_description) def remove_duplicates(self) -> 'ValueSet': return ValueSet(name=self.name, - elements=list(set(self.elements)), + elements=tuple(set(self.elements)), description=self.description) @property @@ -113,11 +113,11 @@ def __iter__(self): TRUE_FALSE_VALUE_SET = ValueSet(name="TrueFalseValueSet", - elements=[True, False], + elements=(True, False), description="A value set for True and False") UNKNOWN_VALUE_SET = ValueSet(name="UnknownValueSet", - elements=["unknown"], + elements=("unknown",), description="A value set for Unknown") TRUE_FALSE_UNKNOWN_VALUE_SET = TRUE_FALSE_VALUE_SET.extend(new_name="TrueFalseUnknownValueSet", From 360bf85091c0ab4c63fa35b25404d35587c90433 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 14:18:37 +0200 Subject: [PATCH 44/71] had to fix some tests --- .../data_models/test_data_model.py | 22 --- .../{data_models => }/test_data_field.py | 0 tests/data_standards/test_data_model.py | 135 ++++++++++-------- tests/utils/parsing/test_parse_data_type.py | 3 +- 4 files changed, 78 insertions(+), 82 deletions(-) delete mode 100644 tests/data_standards/data_models/test_data_model.py rename tests/data_standards/{data_models => }/test_data_field.py (100%) diff --git a/tests/data_standards/data_models/test_data_model.py b/tests/data_standards/data_models/test_data_model.py deleted file mode 100644 index 4d174e4..0000000 --- a/tests/data_standards/data_models/test_data_model.py +++ /dev/null @@ -1,22 +0,0 @@ -import pytest - -from phenopacket_mapper.data_standards import DataModel, DataField -from phenopacket_mapper.data_standards.value_set import ValueSet - - -@pytest.fixture -def data_model(): - return DataModel(resources=[], data_model_name='test_data_model', fields=( - DataField(name='Field 0', specification=ValueSet()), - DataField(name='Date of Birth', specification=ValueSet()), - DataField(name='%^ pseudonym!2', specification=ValueSet()), - )) - - -def test_get_data_field_by_id(data_model): - assert data_model.field_0.name == 'Field 0' - assert data_model.get_field('field_0').name == 'Field 0' - assert data_model.date_of_birth.name == 'Date of Birth' - assert data_model.get_field('date_of_birth').name == 'Date of Birth' - assert data_model._12pseudonym_2.name == '%^ pseudonym!2' - assert data_model.get_field('_12pseudonym_2').name == '%^ pseudonym!2' diff --git a/tests/data_standards/data_models/test_data_field.py b/tests/data_standards/test_data_field.py similarity index 100% rename from tests/data_standards/data_models/test_data_field.py rename to tests/data_standards/test_data_field.py diff --git a/tests/data_standards/test_data_model.py b/tests/data_standards/test_data_model.py index b87ec7d..e0ec13b 100644 --- a/tests/data_standards/test_data_model.py +++ b/tests/data_standards/test_data_model.py @@ -1,7 +1,7 @@ import pytest -from phenopacket_mapper import DataModel -from phenopacket_mapper.data_standards import DataField, DataSection, OrGroup +from phenopacket_mapper.data_standards import DataModel, DataField, DataSection, OrGroup +from phenopacket_mapper.data_standards.value_set import ValueSet class TestDataModel: @@ -11,72 +11,72 @@ class 
TestDataModel: "inp, expected", [ ( - DataModel( - data_model_name="test", - fields=( - DataField( - name="test_field", - specification=int - ), - DataField( - name="test_field2", - specification=str - ), - ) - ), - False + DataModel( + data_model_name="test", + fields=( + DataField( + name="test_field", + specification=int + ), + DataField( + name="test_field2", + specification=str + ), + ) + ), + False ), ( - DataModel( - data_model_name="test", + DataModel( + data_model_name="test", fields=( - DataField( - name="test_field", - specification=int - ), - DataField( - name="test_field2", - specification=str - ), - DataSection( - name="test_data_section", - fields=( - DataField( - name="test_field3", - specification=bool - ), + DataField( + name="test_field", + specification=int + ), + DataField( + name="test_field2", + specification=str + ), + DataSection( + name="test_data_section", + fields=( + DataField( + name="test_field3", + specification=bool + ), + ) ) - ) ) ), True ), ( - DataModel( - data_model_name="test", + DataModel( + data_model_name="test", fields=( - DataField( - name="test_field", - specification=int - ), - OrGroup( - name="test_or_group", - fields=( - DataField( - name="test_field2", - specification=str - ), - DataSection( - name="test_data_section", - fields=( - DataField( - name="test_field3", - specification=bool - ), - ) + DataField( + name="test_field", + specification=int + ), + OrGroup( + name="test_or_group", + fields=( + DataField( + name="test_field2", + specification=str + ), + DataSection( + name="test_data_section", + fields=( + DataField( + name="test_field3", + specification=bool + ), + ) + ) ) - ) - ), + ), ) ), True @@ -84,4 +84,23 @@ class TestDataModel: ] ) def test_data_model(inp: DataModel, expected): - assert inp.is_hierarchical == expected \ No newline at end of file + assert inp.is_hierarchical == expected + + @staticmethod + @pytest.fixture + def data_model(): + return DataModel(resources=tuple(), data_model_name='test_data_model', fields=( + DataField(name='Field 0', specification=ValueSet()), + DataField(name='Date of Birth', specification=ValueSet()), + DataField(name='%^ pseudonym!2', specification=ValueSet()), + )) + + + @staticmethod + def test_get_data_field_by_id(data_model): + assert data_model.field_0.name == 'Field 0' + assert data_model.get_field('field_0').name == 'Field 0' + assert data_model.date_of_birth.name == 'Date of Birth' + assert data_model.get_field('date_of_birth').name == 'Date of Birth' + assert data_model._12pseudonym_2.name == '%^ pseudonym!2' + assert data_model.get_field('_12pseudonym_2').name == '%^ pseudonym!2' diff --git a/tests/utils/parsing/test_parse_data_type.py b/tests/utils/parsing/test_parse_data_type.py index 424dff1..40803ce 100644 --- a/tests/utils/parsing/test_parse_data_type.py +++ b/tests/utils/parsing/test_parse_data_type.py @@ -4,9 +4,8 @@ from phenopacket_mapper.data_standards import Date from phenopacket_mapper.data_standards.code_system import HPO, SNOMED_CT, ICD10CM, ICD9 -from phenopacket_mapper.data_standards.data_models import parse_data_type from phenopacket_mapper.utils.parsing.parse_data_type import \ - parse_single_data_type + parse_single_data_type, parse_data_type @pytest.fixture From e285e8a94f5d2b47efe6df9699e997807cdd4d9e Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 14:20:34 +0200 Subject: [PATCH 45/71] made all data classes frozen for them to be hashable --- .../data_standards/data_model.py | 14 +++++++------- src/phenopacket_mapper/data_standards/value_set.py | 12 
+++++++++--- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index e9b8317..ebfa53e 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -65,10 +65,10 @@ def __post_init__(self): object.__setattr__(self, 'cardinality', Cardinality(min=1, max=self.cardinality.max)) if isinstance(self.specification, type): - object.__setattr__(self, 'specification', ValueSet(elements=[self.specification])) + object.__setattr__(self, 'specification', ValueSet(elements=(self.specification,))) if isinstance(self.specification, list): if all(isinstance(e, type) for e in self.specification): - object.__setattr__(self, 'specification', ValueSet(elements=self.specification)) + object.__setattr__(self, 'specification', ValueSet(elements=tuple(self.specification))) def __str__(self): ret = "DataField(\n" @@ -146,7 +146,7 @@ class DataModel: """ data_model_name: str = field() fields: Tuple[Union[DataField, DataSection, 'OrGroup'], ...] = field() - resources: List[CodeSystem] = field(default_factory=list) + resources: Tuple[CodeSystem, ...] = field(default_factory=tuple) def __post_init__(self): if len(self.fields) != len(set([f.id for f in self.fields])): @@ -248,7 +248,7 @@ def load_data( ) -@dataclass(slots=True) +@dataclass(slots=True, frozen=True) class DataFieldValue: """This class defines the value of a `DataField` in a `DataModelInstance` @@ -295,7 +295,7 @@ def validate(self) -> bool: return False -@dataclass(slots=True) +@dataclass(slots=True, frozen=True) class DataSectionInstance: """ :ivar identifier: The id of the instance, i.e. the row number @@ -312,7 +312,7 @@ def validate(self) -> bool: return True -@dataclass(slots=True) +@dataclass(slots=True, frozen=True) class DataModelInstance: """This class defines an instance of a `DataModel`, i.e. a record in a dataset @@ -520,4 +520,4 @@ def __getattr__(self, var_name: str) -> Union[DataField, DataSection, 'OrGroup'] if __name__ == "__main__": df = DataField(name="Field 1", specification=int) - print(df.specification == ValueSet([int])) \ No newline at end of file + print(df.specification == ValueSet((int,))) \ No newline at end of file diff --git a/src/phenopacket_mapper/data_standards/value_set.py b/src/phenopacket_mapper/data_standards/value_set.py index 1644664..051be89 100644 --- a/src/phenopacket_mapper/data_standards/value_set.py +++ b/src/phenopacket_mapper/data_standards/value_set.py @@ -33,9 +33,12 @@ class ValueSet: = field(default_factory=list) name: str = field(default="") description: str = field(default="") - _resources: List[CodeSystem] = field(default_factory=list, repr=False) + _resources: Tuple[CodeSystem, ...] = field(default_factory=tuple, repr=False) def __post_init__(self): + if isinstance(self.elements, list): + object.__setattr__(self, 'elements', tuple(self.elements)) + if Coding in self.elements or CodeableConcept in self.elements: warnings.warn("The ValueSet contains Coding or CodeableConcept. It is recommended to limit the dataset to" "the CodeSystems that are used in the DataField. 
This will improve the interoperability of" @@ -52,12 +55,15 @@ def remove_duplicates(self) -> 'ValueSet': description=self.description) @property - def resources(self) -> List[CodeSystem]: + def resources(self) -> Tuple[CodeSystem, ...]: """Returns the resources if they exist, otherwise provides a default empty list.""" + resources = list() if len(self._resources) == 0: for e in self.elements: if isinstance(e, CodeSystem): - self._resources.append(e) + resources.append(e) + + object.__setattr__(self, '_resources', tuple(resources)) return self._resources @staticmethod From d0c549b01e7956f3f6fc7832cebaaa1a73d8087a Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 14:22:13 +0200 Subject: [PATCH 46/71] using resources passed to method --- src/phenopacket_mapper/utils/io/input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 9f824be..75a3892 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -303,7 +303,7 @@ def load_hierarchical_data_recursive( return None value_str = str(dict_value) - value = parsing.parse_value(value_str=value_str, resources=data_model.resources, compliance=compliance) + value = parsing.parse_value(value_str=value_str, resources=resources, compliance=compliance) data_field_value = DataFieldValue( row_no=str(loaded_data_instance_identifier) + ":" + keys_str, field=data_field, From 098bf3ab80b48cd08726890adaa44944297a6def Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 14:25:34 +0200 Subject: [PATCH 47/71] resources now tuple not list --- src/phenopacket_mapper/utils/io/input.py | 4 ++-- src/phenopacket_mapper/utils/parsing/parse_value.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 75a3892..97f371c 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -22,7 +22,7 @@ def read_data_model( data_model_name: str, - resources: List[CodeSystem], + resources: Tuple[CodeSystem, ...], path: Union[str, Path], file_type: Literal['csv', 'excel', 'unknown'] = 'unknown', column_names: Dict[str, str] = MappingProxyType({ @@ -238,7 +238,7 @@ def load_hierarchical_data_recursive( loaded_data_instance_identifier: Union[int, str], loaded_data_instance: Dict, data_model: Union[DataModel, DataSection, OrGroup, DataField], - resources: List[CodeSystem], + resources: Tuple[CodeSystem, ...], compliance: Literal['lenient', 'strict'] = 'lenient', mapping: Dict[DataField, str] = None, ) -> Union[Tuple, Union[DataModelInstance, DataSectionInstance, DataFieldValue, None]]: diff --git a/src/phenopacket_mapper/utils/parsing/parse_value.py b/src/phenopacket_mapper/utils/parsing/parse_value.py index ca239c1..0bd5847 100644 --- a/src/phenopacket_mapper/utils/parsing/parse_value.py +++ b/src/phenopacket_mapper/utils/parsing/parse_value.py @@ -1,4 +1,4 @@ -from typing import List, Literal, Union +from typing import List, Literal, Union, Tuple from phenopacket_mapper.data_standards import CodeSystem, Coding, CodeableConcept, Date from phenopacket_mapper.utils.parsing import parse_primitive_data_value, parse_date, parse_coding @@ -6,7 +6,7 @@ def parse_value( value_str: str, - resources: List[CodeSystem], + resources: Tuple[CodeSystem, ...], compliance: Literal['strict', 'lenient'] = 'lenient' ) -> Union[Coding, CodeableConcept, CodeSystem, str, bool, int, float, 
Date, type]:
     """Parses a string representing a value to the appropriate type

From 270f24f12277bf497febae41b98fd65670d20983 Mon Sep 17 00:00:00 2001
From: frehburg
Date: Thu, 17 Oct 2024 14:50:46 +0200
Subject: [PATCH 48/71] renamed datamodel.datamodelname to name
---
 .../data_standards/data_model.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py
index ebfa53e..a2131d8 100644
--- a/src/phenopacket_mapper/data_standards/data_model.py
+++ b/src/phenopacket_mapper/data_standards/data_model.py
@@ -80,7 +80,6 @@ def __str__(self):
         ret += "\t)"
         return ret
 
-
     def __eq__(self, other):
         if not isinstance(other, DataField):
             return False
@@ -140,15 +139,19 @@ class DataModel:
     be accessed using the `id` as an attribute of the `DataModel` object. E.g.: `data_model.date_of_birth`. This is
     useful in the data reading and mapping processes.
 
-    :ivar data_model_name: Name of the data model
+    :ivar name: Name of the data model
     :ivar fields: List of `DataField` objects
     :ivar resources: List of `CodeSystem` objects
     """
-    data_model_name: str = field()
-    fields: Tuple[Union[DataField, DataSection, 'OrGroup'], ...] = field()
+    name: str = field()
+    fields: Tuple[Union[DataField, DataModelSection, 'OrGroup'], ...] = field()
+    id: str = field(default=None)
     resources: Tuple[CodeSystem, ...] = field(default_factory=tuple)
 
     def __post_init__(self):
+        if not self.id:
+            from phenopacket_mapper.utils import str_to_valid_id
+            object.__setattr__(self, 'id', str_to_valid_id(self.name))
         if len(self.fields) != len(set([f.id for f in self.fields])):
             raise ValueError("All fields in a DataModel must have unique identifiers")
 
@@ -160,7 +163,7 @@ def __getattr__(self, var_name: str) -> Union[DataField, 'OrGroup', DataSection]
 
     def __str__(self):
         ret = f"DataModel(\n"
-        ret += f"\tname: {self.data_model_name}\n"
+        ret += f"\tname: {self.name}\n"
         for _field in self.fields:
             ret += f"\t{str(_field)}\n"
         ret += "---\n"
@@ -499,7 +502,6 @@ def __post_init__(self):
         if self.required:
             object.__setattr__(self, 'cardinality', Cardinality(min=1, max=self.cardinality.max))
 
-
    def __str__(self):
        ret = "OrGroup(\n"
        ret += f"\t\tid: {self.id},\n"
@@ -520,4 +522,4 @@ def __getattr__(self, var_name: str) -> Union[DataField, DataSection, 'OrGroup']
 
 if __name__ == "__main__":
     df = DataField(name="Field 1", specification=int)
-    print(df.specification == ValueSet((int,))) 
\ No newline at end of file
+    print(df.specification == ValueSet((int,)))

From 6715be7dc83fb3817864bd9793ac44c0c6e9dcc7 Mon Sep 17 00:00:00 2001
From: frehburg
Date: Thu, 17 Oct 2024 14:51:30 +0200
Subject: [PATCH 49/71] renamed datamodel name
---
 src/phenopacket_mapper/utils/io/input.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py
index 97f371c..90a3d84 100644
--- a/src/phenopacket_mapper/utils/io/input.py
+++ b/src/phenopacket_mapper/utils/io/input.py
@@ -129,7 +129,7 @@ def remove_line_breaks_if_not_none(value):
         ),
     )
 
-    return DataModel(data_model_name=data_model_name, fields=data_fields, resources=resources)
+    return DataModel(name=data_model_name, fields=data_fields, resources=resources)
 
 
 def load_tabular_data_using_data_model(
@@ -254,8 +254,9 @@ def load_hierarchical_data_recursive(
     that are not in the DataModel. If 'strict', the file must have all fields in the DataModel.
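# A brief usage sketch of the rename above: `DataModel` is now constructed with
# `name`, and when no explicit `id` is given one is derived via `str_to_valid_id`
# (the tests in this series show e.g. 'Date of Birth' -> 'date_of_birth'). Runnable
# only once the `DataModelSection` typo is corrected a few patches later.
from phenopacket_mapper.data_standards import DataModel, DataField
from phenopacket_mapper.data_standards.value_set import ValueSet

dm = DataModel(
    name="Phenopacket schema Genomic Interpretation",
    fields=(DataField(name="Date of Birth", specification=ValueSet()),),
)
assert dm.date_of_birth.name == "Date of Birth"  # fields resolve via their generated ids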
:param mapping: specifies the mapping from data fields present in the data model to identifiers of fields in the data """ + print(f"{data_model=} {type(data_model)=}") if isinstance(data_model, DataModel): - tmp: List[Union[DataModelInstance, DataSectionInstance, DataFieldValue, None]] = [ + ret: List[Union[DataModelInstance, DataSectionInstance, DataFieldValue, None]] = [ load_hierarchical_data_recursive( loaded_data_instance_identifier=loaded_data_instance_identifier, loaded_data_instance=loaded_data_instance, @@ -266,7 +267,7 @@ def load_hierarchical_data_recursive( ) for f in data_model.fields ] - return tuple(tmp) + return tuple(ret) elif isinstance(data_model, DataSection): data_section: DataSection = data_model @@ -341,6 +342,7 @@ def load_hierarchical_data( data_model_instances = [] for i, data_instance in enumerate(data_iterable): + print(f"{data_instance=}") data_model_instances.append( DataModelInstance( row_no=i, From a4deababd45f7c0f82620add5613b0b867685701 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 14:51:46 +0200 Subject: [PATCH 50/71] renamed name --- tests/data_standards/test_data_model.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/data_standards/test_data_model.py b/tests/data_standards/test_data_model.py index e0ec13b..ce7652a 100644 --- a/tests/data_standards/test_data_model.py +++ b/tests/data_standards/test_data_model.py @@ -12,7 +12,7 @@ class TestDataModel: [ ( DataModel( - data_model_name="test", + name="test", fields=( DataField( name="test_field", @@ -28,7 +28,7 @@ class TestDataModel: ), ( DataModel( - data_model_name="test", + name="test", fields=( DataField( name="test_field", @@ -38,7 +38,7 @@ class TestDataModel: name="test_field2", specification=str ), - DataSection( + DataModelSection( name="test_data_section", fields=( DataField( @@ -53,7 +53,7 @@ class TestDataModel: ), ( DataModel( - data_model_name="test", + name="test", fields=( DataField( name="test_field", @@ -66,7 +66,7 @@ class TestDataModel: name="test_field2", specification=str ), - DataSection( + DataModelSection( name="test_data_section", fields=( DataField( @@ -89,7 +89,7 @@ def test_data_model(inp: DataModel, expected): @staticmethod @pytest.fixture def data_model(): - return DataModel(resources=tuple(), data_model_name='test_data_model', fields=( + return DataModel(resources=tuple(), name='test_data_model', fields=( DataField(name='Field 0', specification=ValueSet()), DataField(name='Date of Birth', specification=ValueSet()), DataField(name='%^ pseudonym!2', specification=ValueSet()), From 215fa83f9234421e1f9ed094a757c6186bc2124e Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 14:52:28 +0200 Subject: [PATCH 51/71] tuple not list --- src/phenopacket_mapper/utils/parsing/parse_value_set.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/phenopacket_mapper/utils/parsing/parse_value_set.py b/src/phenopacket_mapper/utils/parsing/parse_value_set.py index 3a92ee8..3a3458c 100644 --- a/src/phenopacket_mapper/utils/parsing/parse_value_set.py +++ b/src/phenopacket_mapper/utils/parsing/parse_value_set.py @@ -1,4 +1,4 @@ -from typing import List, Literal, Any +from typing import Literal, Any, Tuple from phenopacket_mapper.data_standards import CodeSystem from phenopacket_mapper.utils.parsing import parse_single_data_type, parse_value @@ -9,7 +9,7 @@ def parse_value_set( value_set_str: str, value_set_name: str = "", value_set_description: str = "", - resources: List[CodeSystem] = None, + resources: 
Tuple[CodeSystem, ...] = None, compliance: Literal['strict', 'lenient'] = 'lenient', ) -> ValueSet: """Parses a value set from a string representation @@ -28,7 +28,7 @@ def parse_value_set( if compliance == 'strict': raise ValueError(f"value_set_str must be a string, not {type(value_set_str)} ({value_set_str})") else: - return ValueSet(elements=[Any], description=value_set_description) + return ValueSet(elements=(Any,), description=value_set_description) value_set_str = value_set_str.strip() From d10722e0e2157d5d83b790312b2fe9cf749242c0 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 14:53:56 +0200 Subject: [PATCH 52/71] typo --- src/phenopacket_mapper/data_standards/data_model.py | 2 +- tests/data_standards/test_data_model.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index a2131d8..cc0fbf1 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -144,7 +144,7 @@ class DataModel: :ivar resources: List of `CodeSystem` objects """ name: str = field() - fields: Tuple[Union[DataField, DataModelSection, 'OrGroup'], ...] = field() + fields: Tuple[Union[DataField, DataSection, 'OrGroup'], ...] = field() id: str = field(default=None) resources: Tuple[CodeSystem, ...] = field(default_factory=tuple) diff --git a/tests/data_standards/test_data_model.py b/tests/data_standards/test_data_model.py index ce7652a..2a11fdd 100644 --- a/tests/data_standards/test_data_model.py +++ b/tests/data_standards/test_data_model.py @@ -38,7 +38,7 @@ class TestDataModel: name="test_field2", specification=str ), - DataModelSection( + DataSection( name="test_data_section", fields=( DataField( @@ -66,7 +66,7 @@ class TestDataModel: name="test_field2", specification=str ), - DataModelSection( + DataSection( name="test_data_section", fields=( DataField( From 867ebca5b5d103c745d5501d47fce056e30ed984 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 15:11:26 +0200 Subject: [PATCH 53/71] updated cardinality constructor and added examples 0,1 1,1 0,n 1,n --- .../data_standards/cardinality.py | 13 ++++++++++++- src/phenopacket_mapper/data_standards/data_model.py | 4 ++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/cardinality.py b/src/phenopacket_mapper/data_standards/cardinality.py index 4ecdda1..019359f 100644 --- a/src/phenopacket_mapper/data_standards/cardinality.py +++ b/src/phenopacket_mapper/data_standards/cardinality.py @@ -7,6 +7,11 @@ class Cardinality: min: int = field(default=0) max: Union[int, Literal['n']] = field(default='n') + ZERO_TO_ONE: 'Cardinality' = None + ZERO_TO_N: 'Cardinality' = None + ONE: 'Cardinality' = None + ONE_TO_N: 'Cardinality' = None + def __post_init__(self): if not isinstance(self.min, int): raise ValueError(f"Parameter min must be of type integer. (Not: {type(self.min)})") @@ -19,4 +24,10 @@ def __post_init__(self): raise ValueError(f"Parameter max must be a positive integer. 
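The list-to-tuple migration running through these patches is not cosmetic: frozen dataclasses derive __hash__ from their fields, and DataField instances are used as keys of the mapping dict, so a list anywhere in the field/value-set chain produces exactly the unhashable type: 'list' TypeError visible in the notebook traceback further down. A self-contained illustration:

from dataclasses import dataclass
from typing import Tuple

@dataclass(frozen=True)
class WithList:
    elements: list

@dataclass(frozen=True)
class WithTuple:
    elements: Tuple[str, ...]

try:
    {WithList(["a"]): "value"}
except TypeError as e:
    print(e)                         # unhashable type: 'list'

print({WithTuple(("a",)): "value"})  # hashes fine as a dict key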
(Not: {self.min})")
 
     def __str__(self):
-        return f"{self.min}..{self.max}"
\ No newline at end of file
+        return f"{self.min}..{self.max}"
+
+
+Cardinality.ZERO_TO_ONE = Cardinality(0, 1)
+Cardinality.ZERO_TO_N = Cardinality(0, 'n')
+Cardinality.ONE = Cardinality(1, 1)
+Cardinality.ONE_TO_N = Cardinality(1, 'n')

diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py
index cc0fbf1..34300a4 100644
--- a/src/phenopacket_mapper/data_standards/data_model.py
+++ b/src/phenopacket_mapper/data_standards/data_model.py
@@ -128,6 +128,7 @@ def __getattr__(self, var_name: str) -> Union[DataField, 'OrGroup', 'DataSection]
         return f
     raise AttributeError(f"'DataSection' object has no attribute '{var_name}'")
 
+
 @dataclass(slots=True, frozen=True)
 class DataModel:
     """This class defines a data model for medical data using `DataField`
@@ -187,7 +188,6 @@ def recursive_is_hierarchical(d: Union[DataField, DataSection, OrGroup]):
 
         return any([recursive_is_hierarchical(f) for f in self.fields])
 
-
     def get_field(self, field_id: str, default: Optional = None) -> Optional[DataField]:
         """Returns a DataField object by its id
 
@@ -492,7 +492,7 @@ class OrGroup(DataNode):
     id: str = field(default=None)
     description: str = field(default='')
     required: bool = field(default=False)
-    cardinality: Cardinality = field(default_factory=Cardinality)
+    cardinality: Cardinality = field(default=Cardinality.ZERO_TO_N)
 
     def __post_init__(self):
         if not self.id:

From 291d2a3ac3d8d6acaa1850a8c1b2257aba119b7a Mon Sep 17 00:00:00 2001
From: frehburg
Date: Thu, 17 Oct 2024 15:19:47 +0200
Subject: [PATCH 54/71] had to redo it again because frozen

---
 .../data_standards/cardinality.py | 39 ++++++++++++++-----
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/src/phenopacket_mapper/data_standards/cardinality.py b/src/phenopacket_mapper/data_standards/cardinality.py
index 019359f..1c6f5ed 100644
--- a/src/phenopacket_mapper/data_standards/cardinality.py
+++ b/src/phenopacket_mapper/data_standards/cardinality.py
@@ -7,11 +7,6 @@ class Cardinality:
     min: int = field(default=0)
     max: Union[int, Literal['n']] = field(default='n')
 
-    ZERO_TO_ONE: 'Cardinality' = None
-    ZERO_TO_N: 'Cardinality' = None
-    ONE: 'Cardinality' = None
-    ONE_TO_N: 'Cardinality' = None
-
     def __post_init__(self):
         if not isinstance(self.min, int):
             raise ValueError(f"Parameter min must be of type integer.
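Patch 54's subject, "had to redo it again because frozen", points at a real constraint: annotated class attributes on a dataclass are collected as instance fields (and, with slots=True, as slots), and a class cannot reference its own name inside its body, so Cardinality constants cannot simply live in the class statement. A minimal demonstration of the first pitfall, with illustrative names:

from dataclasses import dataclass, fields

@dataclass(frozen=True)
class Pitfall:
    min: int = 0
    CONSTANT: 'Pitfall' = None  # intended as a class constant, but...

print([f.name for f in fields(Pitfall)])  # ['min', 'CONSTANT'] - it became a field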
(Not: {type(self.min)})") @@ -26,8 +21,34 @@ def __post_init__(self): def __str__(self): return f"{self.min}..{self.max}" + # Singleton instances + _instances = {} + + @classmethod + @property + def ZERO_TO_ONE(cls) -> 'Cardinality': + if 'ZERO_TO_ONE' not in cls._instances: + cls._instances['ZERO_TO_ONE'] = cls(0, 1) + return cls._instances['ZERO_TO_ONE'] + + @classmethod + @property + def ZERO_TO_N(cls) -> 'Cardinality': + if 'ZERO_TO_N' not in cls._instances: + cls._instances['ZERO_TO_N'] = cls(0, 'n') + return cls._instances['ZERO_TO_N'] + + @classmethod + @property + def ONE(cls) -> 'Cardinality': + if 'OPTIONAL' not in cls._instances: + cls._instances['ONE'] = cls(1, 1) + return cls._instances['ONE'] + + @classmethod + @property + def ONE_TO_N(cls) -> 'Cardinality': + if 'ONE_TO_N' not in cls._instances: + cls._instances['ONE_TO_N'] = cls(1, 'n') + return cls._instances['ONE_TO_N'] -Cardinality.ZERO_TO_ONE = Cardinality(0, 1) -Cardinality.ZERO_TO_ONE = Cardinality(0, 'n') -Cardinality.ONE = Cardinality(1, 1) -Cardinality.ONE_TO_N = Cardinality(1, 'n') From 166c4ed753569f9b8d374e19142041b0fced6dfc Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 15:34:16 +0200 Subject: [PATCH 55/71] renamed row_no to id --- src/phenopacket_mapper/data_standards/data_model.py | 8 ++++---- src/phenopacket_mapper/utils/io/input.py | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 34300a4..c26ef79 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -321,14 +321,14 @@ class DataModelInstance: This class is used to define an instance of a `DataModel`, i.e. a record or row in a dataset. - :ivar row_no: The id of the instance, i.e. the row number + :ivar id: The id of the instance, i.e. the row number :ivar data_model: The `DataModel` object that defines the data model for this instance :ivar values: A list of `DataFieldValue` objects, each adhering to the `DataField` definition in the `DataModel` :ivar compliance: Compliance level to enforce when validating the instance. If 'lenient', the instance can have extra fields that are not in the DataModel. If 'strict', the instance must have all fields in the DataModel. """ - row_no: Union[int, str] + id: Union[int, str] data_model: DataModel values: Tuple[Union[DataFieldValue, DataSectionInstance], ...] compliance: Literal['lenient', 'strict'] = 'lenient' @@ -344,7 +344,7 @@ def validate(self) -> bool: :return: True if the instance is valid, False otherwise """ - error_msg = f"Instance values do not comply with their respective fields' valuesets. (row {self.row_no})" + error_msg = f"Instance values do not comply with their respective fields' valuesets. (row {self.id})" for v in self.values: if not v.validate(): if self.compliance == 'strict': @@ -359,7 +359,7 @@ def validate(self) -> bool: fields_present = set(v.field.id for v in self.values) if len(missing_fields := (is_required - fields_present)) > 0: - error_msg = (f"Required fields are missing in the instance. (row {self.row_no}) " + error_msg = (f"Required fields are missing in the instance. 
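Two caveats worth recording about the singleton rewrite above: stacking @classmethod on @property only works on CPython 3.9-3.12 (descriptor chaining was deprecated in 3.11 and removed in 3.13), and the ONE getter tests the key 'OPTIONAL' while storing under 'ONE', so its membership check never hits and a fresh instance is rebuilt on every access. A version-stable sketch with one shared instance per constant, assuming plain import-time class attributes are acceptable (Card and CARDINALITIES are illustrative names):

from dataclasses import dataclass
from typing import Literal, Union

@dataclass(frozen=True)
class Card:
    min: int = 0
    max: Union[int, Literal['n']] = 'n'

class CARDINALITIES:
    """Import-time constants: no stacked descriptors, no cache-key mismatches."""
    ZERO_TO_ONE = Card(0, 1)
    ZERO_TO_N = Card(0, 'n')
    ONE = Card(1, 1)
    ONE_TO_N = Card(1, 'n')

assert CARDINALITIES.ONE is CARDINALITIES.ONE  # same instance on every access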
(row {self.id}) " f"\n(missing_fields={', '.join(missing_fields)})") if self.compliance == 'strict': raise ValueError(error_msg) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 90a3d84..2a6bdf1 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -193,7 +193,7 @@ def load_tabular_data_using_data_model( data_model_instances.append( DataModelInstance( - row_no=i, + id=i, data_model=data_model, values=values, compliance=compliance) @@ -254,9 +254,9 @@ def load_hierarchical_data_recursive( that are not in the DataModel. If 'strict', the file must have all fields in the DataModel. :param mapping: specifies the mapping from data fields present in the data model to identifiers of fields in the data """ - print(f"{data_model=} {type(data_model)=}") + print(f"{data_model.name=} {type(data_model)=}") if isinstance(data_model, DataModel): - ret: List[Union[DataModelInstance, DataSectionInstance, DataFieldValue, None]] = [ + data_model_instance_values: List[Union[DataModelInstance, DataSectionInstance, DataFieldValue, None]] = [ load_hierarchical_data_recursive( loaded_data_instance_identifier=loaded_data_instance_identifier, loaded_data_instance=loaded_data_instance, @@ -267,7 +267,7 @@ def load_hierarchical_data_recursive( ) for f in data_model.fields ] - return tuple(ret) + return tuple(data_model_instance_values) elif isinstance(data_model, DataSection): data_section: DataSection = data_model @@ -345,7 +345,7 @@ def load_hierarchical_data( print(f"{data_instance=}") data_model_instances.append( DataModelInstance( - row_no=i, + id=i, data_model=data_model, values=load_hierarchical_data_recursive( loaded_data_instance_identifier=str(i), From fe6b783b022e8f33e8eabded92f06d0d9ad67035 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 15:34:29 +0200 Subject: [PATCH 56/71] wrong import --- src/phenopacket_mapper/utils/io/input.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 2a6bdf1..48b841b 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -17,7 +17,6 @@ from phenopacket_mapper.utils import parsing from phenopacket_mapper.utils.io.data_reader import DataReader from phenopacket_mapper.utils.parsing import parse_ordinal -from tests.utils.parsing.test_parse_coding import resources def read_data_model( From e87deae7beda3d0a024a325a218b5d783cdb1eb0 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 15:39:05 +0200 Subject: [PATCH 57/71] identifier for tabular instances now includes row: before row number --- src/phenopacket_mapper/utils/io/input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 48b841b..bbd7e1f 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -192,7 +192,7 @@ def load_tabular_data_using_data_model( data_model_instances.append( DataModelInstance( - id=i, + id="row:" + str(i), data_model=data_model, values=values, compliance=compliance) From 1f50ed56520add4bb29af63b7cb4526081af780b Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 15:39:17 +0200 Subject: [PATCH 58/71] removed unnecessary main --- src/phenopacket_mapper/data_standards/data_model.py | 5 ----- 1 file changed, 5 deletions(-) diff --git 
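After patches 55-57 the identifiers follow a small composite convention; a sketch of the scheme as it now stands (values illustrative):

# Tabular rows: DataModelInstance ids carry a "row:" prefix
tabular_id = "row:" + str(3)                                      # -> "row:3"
# Hierarchical values: instance identifier + ":" + the mapped key path
value_id = "0" + ":" + "ODM.ClinicalData.SubjectData.SubjectKey"  # -> "0:ODM...SubjectKey"
print(tabular_id, value_id)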
a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index c26ef79..af7b4b4 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -518,8 +518,3 @@ def __getattr__(self, var_name: str) -> Union[DataField, DataSection, 'OrGroup'] if f.id == var_name: return f raise AttributeError(f"'OrGroup' object has no attribute '{var_name}'") - - -if __name__ == "__main__": - df = DataField(name="Field 1", specification=int) - print(df.specification == ValueSet((int,))) From a4c635480ccb5885c2f766d642382a884946cd43 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 15:39:34 +0200 Subject: [PATCH 59/71] renamed method --- src/phenopacket_mapper/utils/io/input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index bbd7e1f..ee6fe36 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -321,7 +321,7 @@ def load_hierarchical_data_recursive( raise ValueError(f"Invalid compliance level: {compliance}") -def load_hierarchical_data( +def load_hierarchical_dataset( file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]], data_model: DataModel, file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None, From deefb8cc0b5aca006dd29dc01a1812310dfd3bcc Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 15:39:49 +0200 Subject: [PATCH 60/71] created new method to load single data --- src/phenopacket_mapper/utils/io/input.py | 39 ++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index ee6fe36..31d99a7 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -357,3 +357,42 @@ def load_hierarchical_dataset( compliance=compliance, ) ) + + return DataSet(data_model=data_model, data=data_model_instances) + + +def load_hierarchical_data( + file: Union[str, Path, IOBase], + data_model: DataModel, + instance_identifier: Union[int, str] = None, + file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None, + compliance: Literal['lenient', 'strict'] = 'lenient', + mapping: Dict[DataField, str] = None, +): + if not mapping: + raise AttributeError(f"Parameter 'mapping' must not be empty or None. 
{mapping=}, {type(mapping)=}") + + if not data_model.is_hierarchical: + warnings.warn("This method is only for loading hierarchical data, it may behave unexpectedly for tabular data.") + + data_reader = DataReader(file, file_extension=file_extension) + + # TODO: give instances identifiers based on file names + if not instance_identifier: + instance_identifier = "PLACEHOLDER_IDENTIFIER" + + data_instance = data_reader.data + + return DataModelInstance( + id=instance_identifier, # TODO: give instances identifiers based on file names + data_model=data_model, + values=load_hierarchical_data_recursive( + loaded_data_instance_identifier=instance_identifier, + loaded_data_instance=data_instance, + data_model=data_model, + resources=data_model.resources, + compliance=compliance, + mapping=mapping + ), + compliance=compliance, + ) From bb4a1ec96c85f936bf458fe90a021c6f18d9c312 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 15:41:41 +0200 Subject: [PATCH 61/71] changed id of instances --- src/phenopacket_mapper/utils/io/input.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 31d99a7..c770433 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -341,13 +341,14 @@ def load_hierarchical_dataset( data_model_instances = [] for i, data_instance in enumerate(data_iterable): + instance_identifier = str(i) # TODO: give instances identifiers based on file names if available print(f"{data_instance=}") data_model_instances.append( DataModelInstance( - id=i, + id=instance_identifier, data_model=data_model, values=load_hierarchical_data_recursive( - loaded_data_instance_identifier=str(i), + loaded_data_instance_identifier=instance_identifier, loaded_data_instance=data_instance, data_model=data_model, resources=data_model.resources, @@ -377,14 +378,14 @@ def load_hierarchical_data( data_reader = DataReader(file, file_extension=file_extension) - # TODO: give instances identifiers based on file names + # TODO: give instances identifiers based on file names if available if not instance_identifier: instance_identifier = "PLACEHOLDER_IDENTIFIER" data_instance = data_reader.data return DataModelInstance( - id=instance_identifier, # TODO: give instances identifiers based on file names + id=instance_identifier, # TODO: give instances identifiers based on file names if available data_model=data_model, values=load_hierarchical_data_recursive( loaded_data_instance_identifier=instance_identifier, From 2adf8df277965268925574561546fd12d62b43d6 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 15:41:54 +0200 Subject: [PATCH 62/71] removed unnecessary file type --- src/phenopacket_mapper/utils/io/input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index c770433..08861a0 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -322,7 +322,7 @@ def load_hierarchical_data_recursive( def load_hierarchical_dataset( - file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]], + file: Union[str, Path, List[str], List[Path], List[IOBase]], data_model: DataModel, file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None, compliance: Literal['lenient', 'strict'] = 'lenient', From de6589c43ba7eea79b601be90c77f24a7c0e47c6 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 
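A hedged usage sketch of the new single-instance loader defined above. The XML payload is reconstructed from the mapping keys used in the notebook (the actual tag content there was lost to formatting, so the tag names are assumptions), and genomic_interpretation stands for the hierarchical DataModel the notebook defines:

from io import StringIO
from phenopacket_mapper.utils.io import load_hierarchical_data

# XML shape assumed from the dot-separated mapping keys; tag names illustrative
xml = ("<ODM><ClinicalData><SubjectData>"
       "<SubjectKey>101</SubjectKey>"
       "<ANumber>123</ANumber>"
       "</SubjectData></ClinicalData></ODM>")

instance = load_hierarchical_data(
    file=StringIO(xml),
    data_model=genomic_interpretation,  # hierarchical DataModel, as in the notebook
    file_extension="xml",
    mapping={
        genomic_interpretation.subject_or_biosample_id: "ODM.ClinicalData.SubjectData.SubjectKey",
        genomic_interpretation.example.a_number: "ODM.ClinicalData.SubjectData.ANumber",
    },
)
print(instance.id)  # "PLACEHOLDER_IDENTIFIER" until file-name-based ids land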
15:43:43 +0200 Subject: [PATCH 63/71] added load hier dataset to init --- src/phenopacket_mapper/utils/io/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/phenopacket_mapper/utils/io/__init__.py b/src/phenopacket_mapper/utils/io/__init__.py index 059bb43..4b1d7d8 100644 --- a/src/phenopacket_mapper/utils/io/__init__.py +++ b/src/phenopacket_mapper/utils/io/__init__.py @@ -4,7 +4,7 @@ from .read_xml import read_xml, parse_xml from .data_reader import DataReader from .input import read_data_model, read_phenopackets, read_phenopacket_from_json, load_tabular_data_using_data_model -from .input import load_hierarchical_data +from .input import load_hierarchical_data, load_hierarchical_dataset from .output import write __all__ = [ @@ -15,6 +15,6 @@ 'read_phenopackets', 'read_phenopacket_from_json', 'load_tabular_data_using_data_model', - 'load_hierarchical_data', + 'load_hierarchical_data', 'load_hierarchical_dataset', 'write', ] From ac18541e57d8aa649fb432edb8b1ace376747f4c Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 15:43:50 +0200 Subject: [PATCH 64/71] added method comment --- src/phenopacket_mapper/utils/io/input.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 08861a0..d107765 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -370,6 +370,18 @@ def load_hierarchical_data( compliance: Literal['lenient', 'strict'] = 'lenient', mapping: Dict[DataField, str] = None, ): + """ + Loads hierarchical single data from one hierarchical file using a DataModel definition + + :param file: file to load data from + :param data_model: DataModel to use for reading the file + :param instance_identifier: identifier of the data instance + :param file_extension: file extension of the file + :param compliance: Compliance level to enforce when reading the file. If 'lenient', the file can have extra fields + that are not in the DataModel. If 'strict', the file must have all fields in the DataModel. + :param mapping: specifies the mapping from data fields present in the data model to ids of fields in the data + + """ if not mapping: raise AttributeError(f"Parameter 'mapping' must not be empty or None. {mapping=}, {type(mapping)=}") From f7e5f65052403da3c62a47394e7642c7f0a4a1ef Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 15:45:23 +0200 Subject: [PATCH 65/71] renamed datafieldvalue row_no to id --- src/phenopacket_mapper/data_standards/data_model.py | 6 +++--- src/phenopacket_mapper/utils/io/input.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index af7b4b4..492d264 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -257,11 +257,11 @@ class DataFieldValue: Equivalent to a cell value in a table. - :ivar row_no: The id of the value, i.e. the row number + :ivar id: The id of the value, i.e. the row number :ivar field: DataField: The `DataField` to which this value belongs and which defines the value set for the field. :ivar value: The value of the field. 
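With both loaders exported from the io package, the dataset variant differs only in taking a list of files and returning a DataSet. A sketch mirroring the notebook cell further down, under the same assumptions as above about the XML strings and the model:

from io import StringIO
from phenopacket_mapper.utils.io import load_hierarchical_dataset

docs = [xml_for_patient_101, xml_for_patient_102]  # hypothetical XML strings
data_set = load_hierarchical_dataset(
    file=[StringIO(d) for d in docs],
    data_model=genomic_interpretation,
    file_extension="xml",
    mapping={
        genomic_interpretation.subject_or_biosample_id: "ODM.ClinicalData.SubjectData.SubjectKey",
    },
)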
""" - row_no: Union[str, int] + id: Union[str, int] field: DataField value: Union[int, float, str, bool, Date, CodeSystem] @@ -294,7 +294,7 @@ def validate(self) -> bool: return True warnings.warn(f"Value {self.value} of type {type(self.value)} is not in the value set of field " - f"{self.field.name} (row {self.row_no})") + f"{self.field.name} (row {self.id})") return False diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index d107765..9e3fc08 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -186,7 +186,7 @@ def load_tabular_data_using_data_model( value_str = str(pandas_value) value = parsing.parse_value(value_str=value_str, resources=data_model.resources, compliance=compliance) - values.append(DataFieldValue(row_no=i, field=f, value=value)) + values.append(DataFieldValue(id=i, field=f, value=value)) values = tuple(values) @@ -305,7 +305,7 @@ def load_hierarchical_data_recursive( value_str = str(dict_value) value = parsing.parse_value(value_str=value_str, resources=resources, compliance=compliance) data_field_value = DataFieldValue( - row_no=str(loaded_data_instance_identifier) + ":" + keys_str, + id=str(loaded_data_instance_identifier) + ":" + keys_str, field=data_field, value=value ) From 2d45665e649aea5e54dff6a9bdf275c85c4da935 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 15:46:26 +0200 Subject: [PATCH 66/71] removed unnecessary prints --- src/phenopacket_mapper/utils/io/input.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 9e3fc08..84adefd 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -253,7 +253,6 @@ def load_hierarchical_data_recursive( that are not in the DataModel. If 'strict', the file must have all fields in the DataModel. 
:param mapping: specifies the mapping from data fields present in the data model to identifiers of fields in the data """ - print(f"{data_model.name=} {type(data_model)=}") if isinstance(data_model, DataModel): data_model_instance_values: List[Union[DataModelInstance, DataSectionInstance, DataFieldValue, None]] = [ load_hierarchical_data_recursive( @@ -342,7 +341,6 @@ def load_hierarchical_dataset( for i, data_instance in enumerate(data_iterable): instance_identifier = str(i) # TODO: give instances identifiers based on file names if available - print(f"{data_instance=}") data_model_instances.append( DataModelInstance( id=instance_identifier, From f1edd730832d6323341e82ab07aebece50a79985 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 15:53:12 +0200 Subject: [PATCH 67/71] finally able to load data --- notebooks/hierarchical_data_model.ipynb | 427 ++++++++++-------- .../utils/io/data_reader.py | 8 +- 2 files changed, 245 insertions(+), 190 deletions(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index dfd43d0..1189725 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -5,45 +5,46 @@ "id": "initial_id", "metadata": { "collapsed": true, + "jupyter": { + "is_executing": true + }, "ExecuteTime": { - "end_time": "2024-10-17T11:40:00.972441Z", - "start_time": "2024-10-17T11:40:00.967261Z" + "end_time": "2024-10-17T13:52:15.564302Z", + "start_time": "2024-10-17T13:52:15.560679Z" } }, "source": [ - "from phenopacket_mapper.data_standards import DataModelInstance\n", "from phenopacket_mapper.data_standards import DataField\n", - "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup\n", - "from phenopacket_mapper.data_standards.data_model import DataSectionInstance\n", - "from phenopacket_mapper.utils.io import DataReader" + "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup, Cardinality" ], "outputs": [], - "execution_count": 8 + "execution_count": 13 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-17T11:40:00.995234Z", - "start_time": "2024-10-17T11:40:00.978558Z" + "end_time": "2024-10-17T13:52:15.572450Z", + "start_time": "2024-10-17T13:52:15.567308Z" } }, "cell_type": "code", "source": [ "genomic_interpretation = DataModel(\n", - " data_model_name=\"Phenopacket schema Genomic Interpretation\",\n", + " name=\"Phenopacket schema Genomic Interpretation\",\n", " fields=(\n", " DataField(\n", " name=\"subject_or_biosample_id\",\n", " specification=str,\n", " required=True,\n", - " description=\"The id of the patient or biosample that is the subject being interpreted. REQUIRED.\"\n", + " description=\"The id of the patient or biosample that is the subject being interpreted. REQUIRED.\",\n", + " cardinality=Cardinality.ONE,\n", " ),\n", " \n", " DataField(\n", " name=\"interpretation_status\",\n", " specification=ValueSet(\n", " name=\"Interpretation Status Value Set\",\n", - " elements=[\"UNKNOWN_STATUS\", \"REJECTED\", \"CANDIDATE\", \"CONTRIBUTORY\", \"CAUSATIVE\"],\n", + " elements=(\"UNKNOWN_STATUS\", \"REJECTED\", \"CANDIDATE\", \"CONTRIBUTORY\", \"CAUSATIVE\"),\n", " ),\n", " required=True,\n", " description=\"status of the interpretation. 
REQUIRED.\",\n", @@ -96,37 +97,37 @@ ], "id": "2e979683ae450d9b", "outputs": [], - "execution_count": 9 + "execution_count": 14 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-17T11:40:01.041106Z", - "start_time": "2024-10-17T11:40:01.028028Z" + "end_time": "2024-10-17T13:52:15.578062Z", + "start_time": "2024-10-17T13:52:15.573455Z" } }, "cell_type": "code", - "source": "genomic_interpretation.example.a_number", + "source": [ + "genomic_interpretation.example.a_number" + ], "id": "a32bb965c37e98b4", "outputs": [ { "data": { - "text/plain": [ - "DataField(name='a_number', specification=ValueSet(elements=[], name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n'))" - ] + "text/plain": "DataField(name='a_number', specification=ValueSet(elements=(,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n'))" }, - "execution_count": 10, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 10 + "execution_count": 15 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-17T11:40:01.091102Z", - "start_time": "2024-10-17T11:40:01.085216Z" + "end_time": "2024-10-17T13:52:15.581403Z", + "start_time": "2024-10-17T13:52:15.579068Z" } }, "cell_type": "code", @@ -147,14 +148,14 @@ "\t\tid: subject_or_biosample_id,\n", "\t\tname: subject_or_biosample_id,\n", "\t\trequired: True\n", - "\t\tspecification: ValueSet(elements=[], name='', description='')\n", - "\t\tcardinality: 1..n\n", + "\t\tspecification: ValueSet(elements=(,), name='', description='')\n", + "\t\tcardinality: 1..1\n", "\t)\n", "\tDataField(\n", "\t\tid: interpretation_status,\n", "\t\tname: interpretation_status,\n", "\t\trequired: True\n", - "\t\tspecification: ValueSet(elements=['UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'], name='Interpretation Status Value Set', description='')\n", + "\t\tspecification: ValueSet(elements=('UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'), name='Interpretation Status Value Set', description='')\n", "\t\tcardinality: 1..n\n", "\t)\n", "\tDataSection(\n", @@ -166,7 +167,7 @@ "\t\tid: a_number,\n", "\t\tname: a_number,\n", "\t\trequired: True\n", - "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t\tspecification: ValueSet(elements=(,), name='', description='')\n", "\t\tcardinality: 1..n\n", "\t)\n", "\t)\n", @@ -184,21 +185,21 @@ "\t\tid: value_id,\n", "\t\tname: value_id,\n", "\t\trequired: True\n", - "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t\tspecification: ValueSet(elements=(,), name='', description='')\n", "\t\tcardinality: 1..n\n", "\t)\n", "\tDataField(\n", "\t\tid: symbol,\n", "\t\tname: symbol,\n", "\t\trequired: True\n", - "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t\tspecification: ValueSet(elements=(,), name='', description='')\n", "\t\tcardinality: 1..n\n", "\t)\n", "\tDataField(\n", "\t\tid: description,\n", "\t\tname: description,\n", "\t\trequired: False\n", - "\t\tspecification: ValueSet(elements=[], name='', description='')\n", + "\t\tspecification: ValueSet(elements=(,), name='', description='')\n", "\t\tcardinality: 0..n\n", "\t)\n", "\t)\n", @@ -208,220 +209,270 @@ ] } ], - "execution_count": 11 + "execution_count": 16 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-17T11:40:01.158069Z", - "start_time": "2024-10-17T11:40:01.152182Z" + "end_time": 
"2024-10-17T13:52:15.585667Z", + "start_time": "2024-10-17T13:52:15.582408Z" } }, "cell_type": "code", "source": [ "from io import StringIO\n", "\n", - "xml_data = \\\n", - " (\n", + "xml_data = [\n", " ' '\n", " ''\n", " ''\n", " '123'\n", " ''\n", " ''\n", - " ''\n", - " )\n", - "\n", - "buffer = StringIO(xml_data)" + " '',\n", + " ' '\n", + " ''\n", + " ''\n", + " '124'\n", + " ''\n", + " ''\n", + " '',\n", + " ]" ], "id": "4c78eb05ea58ff6c", "outputs": [], - "execution_count": 12 + "execution_count": 17 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-17T11:40:01.190285Z", - "start_time": "2024-10-17T11:40:01.170449Z" + "end_time": "2024-10-17T13:52:15.589625Z", + "start_time": "2024-10-17T13:52:15.586671Z" } }, "cell_type": "code", "source": [ - "import warnings\n", - "from phenopacket_mapper.data_standards import DataFieldValue\n", - "import math\n", - "from io import IOBase\n", - "from pathlib import Path\n", - "from typing import Union, List, Literal, Dict\n", - "\n", - "from phenopacket_mapper.utils import parsing\n", - " \n", - "def load_hierarchical_data_recursive(\n", - " loaded_data_instance_identifier: Union[int, str],\n", - " loaded_data_instance: Dict,\n", - " data_model: Union[DataModel, DataSection, OrGroup, DataField],\n", - " compliance: Literal['lenient', 'strict'] = 'lenient',\n", - " mapping: Dict[DataField, str] = None,\n", - "):\n", - " \"\"\"Helper method for `load_hierarchical_data`, recurses through hierarchical :class:`DataModel`\n", - " \n", - " `loaded_data_instance` is expected to be a dictionary as returned by `DataReader.data` when reading a single xml or json file \n", - " \n", - " :param loaded_data_instance_identifier: identifier of the loaded data_instance\n", - " :param loaded_data_instance: data loaded in by :class:`DataReader`\n", - " :param data_model:\n", - " :param compliance: Compliance level to enforce when reading the file. If 'lenient', the file can have extra fields\n", - " that are not in the DataModel. 
If 'strict', the file must have all fields in the DataModel.\n", - " :param mapping: specifies the mapping from data fields present in the data model to identifiers of fields in the data\n", - " \"\"\"\n", - " if isinstance(data_model, DataModel):\n", - " return (\n", - " load_hierarchical_data_recursive(\n", - " loaded_data_instance_identifier=loaded_data_instance_identifier,\n", - " loaded_data_instance=loaded_data_instance,\n", - " data_model=f,\n", - " compliance=compliance,\n", - " mapping=mapping\n", - " )\n", - " for f in data_model.fields\n", - " )\n", - " elif isinstance(data_model, DataSection):\n", - " data_section: DataSection = data_model\n", - " \n", - " values = (\n", - " load_hierarchical_data_recursive(\n", - " loaded_data_instance_identifier=loaded_data_instance_identifier,\n", - " loaded_data_instance=loaded_data_instance,\n", - " data_model=f,\n", - " compliance=compliance,\n", - " mapping=mapping,\n", - " )\n", - " for f in data_section.fields\n", - " )\n", - " \n", - " return DataSectionInstance(\n", - " identifier=str(loaded_data_instance_identifier) + \":\" + data_section.id, # TODO: get identifiers of parents\n", - " data_section=data_section,\n", - " values=values,\n", - " )\n", - " elif isinstance(data_model, OrGroup):\n", - " # TODO: resolve or this seems to be very difficult\n", - " pass\n", - " elif isinstance(data_model, DataField):\n", - " data_field = data_model\n", - " \n", - " keys_str = mapping.get(data_model, None)\n", - " \n", - " if keys_str:\n", - " keys = keys_str.split('.')\n", - " dict_value = recursive_dict_call(loaded_data_instance, keys)\n", - "\n", - " if not dict_value or (isinstance(dict_value, float) and math.isnan(dict_value)):\n", - " return None\n", - "\n", - " value_str = str(dict_value)\n", - " value = parsing.parse_value(value_str=value_str, resources=data_model.resources, compliance=compliance)\n", - " data_field_value = DataFieldValue(\n", - " row_no=str(loaded_data_instance_identifier) + \":\" + keys_str, \n", - " field=data_field, \n", - " value=value\n", - " )\n", - "\n", - " return data_field_value\n", - " else:\n", - " err_msg = f\"DataModel {data_model} is not a valid type ({type(data_model)}).\"\n", - " if compliance == 'strict':\n", - " raise ValueError(err_msg)\n", - " elif compliance == 'lenient':\n", - " warnings.warn(err_msg)\n", - " else:\n", - " raise ValueError(f\"Invalid compliance level: {compliance}\")\n", - " \n", - "\n", - "def load_hierarchical_data(\n", - " file: Union[str, Path, IOBase, List[str], List[Path], List[IOBase]], \n", - " data_model: DataModel, \n", - " file_extension: Literal['csv', 'xlsx', 'json', 'xml'] = None,\n", - " compliance: Literal['lenient', 'strict'] = 'lenient',\n", - " mapping: Dict[DataField, str] = None,\n", - "): \n", - " if not mapping:\n", - " raise AttributeError(f\"Parameter 'mapping' must not be empty or None. 
{mapping=}, {type(mapping)=}\")\n", - " \n", - " if not data_model.is_hierarchical:\n", - " warnings.warn(\"This method is only for loading hierarchical data, it may behave unexpectedly for tabular data.\")\n", - " \n", - " data_reader = DataReader(file, file_extension=file_extension)\n", - " data, data_iterable = data_reader.data, data_reader.iterable\n", - " \n", - " # assembling data model instances\n", - " data_model_instances = []\n", - " \n", - " for i, data_instance in enumerate(data_iterable):\n", - " data_model_instances.append(\n", - " DataModelInstance(\n", - " row_no=i,\n", - " data_model=data_model,\n", - " values=load_hierarchical_data_recursive(\n", - " loaded_data_instance_identifier=str(i),\n", - " loaded_data_instance=data_instance,\n", - " data_model=data_model,\n", - " compliance=compliance,\n", - " mapping=mapping\n", - " ),\n", - " compliance=compliance,\n", - " )\n", - " )\n", - " " + "genomic_interpretation.subject_or_biosample_id" ], - "id": "affc9ecd939c903f", + "id": "bb5e27f9e1425b78", + "outputs": [ + { + "data": { + "text/plain": "DataField(name='subject_or_biosample_id', specification=ValueSet(elements=(,), name='', description=''), id='subject_or_biosample_id', required=True, description='The id of the patient or biosample that is the subject being interpreted. REQUIRED.', cardinality=Cardinality(min=1, max=1))" + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 18 + }, + { + "cell_type": "code", "outputs": [], - "execution_count": 13 + "source": [ + "from phenopacket_mapper.utils.io import load_hierarchical_data, DataReader\n", + "from phenopacket_mapper.utils.io.input import load_hierarchical_data_recursive" + ], + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-17T13:52:15.593155Z", + "start_time": "2024-10-17T13:52:15.590632Z" + } + }, + "id": "f7fdfee60f008e07", + "execution_count": 19 }, { + "cell_type": "markdown", + "source": [ + "loading value" + ], + "metadata": { + "collapsed": false + }, + "id": "f1b55844653c9d79" + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------------------------------------\n", + "DataFieldValue(id='value:ODM.ClinicalData.SubjectData.SubjectKey', field=DataField(name='subject_or_biosample_id', specification=ValueSet(elements=(,), name='', description=''), id='subject_or_biosample_id', required=True, description='The id of the patient or biosample that is the subject being interpreted. 
REQUIRED.', cardinality=Cardinality(min=1, max=1)), value=101)\n" + ] + } + ], + "source": [ + "data_reader = DataReader(\n", + " file=StringIO(xml_data[0]),\n", + " file_extension=\"xml\",\n", + ")\n", + "tmp = load_hierarchical_data_recursive(\n", + " loaded_data_instance_identifier=\"value\",\n", + " loaded_data_instance=data_reader.data, \n", + " data_model=genomic_interpretation.subject_or_biosample_id, \n", + " mapping={\n", + " genomic_interpretation.subject_or_biosample_id: \"ODM.ClinicalData.SubjectData.SubjectKey\",\n", + " genomic_interpretation.example.a_number: \"ODM.ClinicalData.SubjectData.ANumber\",\n", + " },\n", + " resources=tuple(),\n", + ")\n", + "print(\"-\"*80)\n", + "print(tmp)" + ], "metadata": { + "collapsed": false, "ExecuteTime": { - "end_time": "2024-10-17T11:40:01.226816Z", - "start_time": "2024-10-17T11:40:01.203147Z" + "end_time": "2024-10-17T13:52:15.597800Z", + "start_time": "2024-10-17T13:52:15.594160Z" + } + }, + "id": "9f358144800fb41c", + "execution_count": 20 + }, + { + "cell_type": "markdown", + "source": [ + "loading section" + ], + "metadata": { + "collapsed": false + }, + "id": "fe27e2fdc8207bef" + }, + { + "cell_type": "code", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------------------------------------\n", + "DataSectionInstance(identifier='section:example', data_section=DataSection(name='example', id='example', fields=(DataField(name='a_number', specification=ValueSet(elements=(,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')),), required=True, cardinality=Cardinality(min=1, max='n')), values=(DataFieldValue(id='section:ODM.ClinicalData.SubjectData.ANumber', field=DataField(name='a_number', specification=ValueSet(elements=(,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')), value=123),))\n" + ] + } + ], + "source": [ + "data_reader = DataReader(\n", + " file=StringIO(xml_data[0]),\n", + " file_extension=\"xml\",\n", + ")\n", + "tmp = load_hierarchical_data_recursive(\n", + " loaded_data_instance_identifier=\"section\",\n", + " loaded_data_instance=data_reader.data,\n", + " data_model=genomic_interpretation.example,\n", + " mapping={\n", + " genomic_interpretation.subject_or_biosample_id: \"ODM.ClinicalData.SubjectData.SubjectKey\",\n", + " genomic_interpretation.example.a_number: \"ODM.ClinicalData.SubjectData.ANumber\",\n", + " },\n", + " resources=tuple(),\n", + ")\n", + "print(\"-\"*80)\n", + "print(tmp)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-10-17T13:52:15.602315Z", + "start_time": "2024-10-17T13:52:15.598804Z" + } + }, + "id": "dae94d0e649a0120", + "execution_count": 21 + }, + { + "cell_type": "markdown", + "source": [ + "loading instance" + ], + "metadata": { + "collapsed": false + }, + "id": "7bb09fcfb7061e7d" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-17T13:52:15.606902Z", + "start_time": "2024-10-17T13:52:15.603320Z" } }, "cell_type": "code", "source": [ "data_model_instance = load_hierarchical_data(\n", - " file=buffer, \n", + " file=StringIO(xml_data[0]), \n", " data_model=genomic_interpretation, \n", " file_extension=\"xml\",\n", " mapping={\n", " genomic_interpretation.subject_or_biosample_id: \"ODM.ClinicalData.SubjectData.SubjectKey\",\n", " genomic_interpretation.example.a_number: \"ODM.ClinicalData.SubjectData.ANumber\",\n", " }\n", - ")" + ")\n", 
+ "print(data_model_instance)" ], "id": "53937efded7f589f", "outputs": [ { - "ename": "TypeError", - "evalue": "unhashable type: 'list'", - "output_type": "error", - "traceback": [ - "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[1;31mTypeError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[1;32mIn[14], line 5\u001B[0m\n\u001B[0;32m 1\u001B[0m data_model_instance \u001B[38;5;241m=\u001B[39m load_hierarchical_data(\n\u001B[0;32m 2\u001B[0m file\u001B[38;5;241m=\u001B[39mbuffer, \n\u001B[0;32m 3\u001B[0m data_model\u001B[38;5;241m=\u001B[39mgenomic_interpretation, \n\u001B[0;32m 4\u001B[0m file_extension\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mxml\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[1;32m----> 5\u001B[0m mapping\u001B[38;5;241m=\u001B[39m{\n\u001B[0;32m 6\u001B[0m genomic_interpretation\u001B[38;5;241m.\u001B[39msubject_or_biosample_id: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mODM.ClinicalData.SubjectData.SubjectKey\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m 7\u001B[0m genomic_interpretation\u001B[38;5;241m.\u001B[39mexample\u001B[38;5;241m.\u001B[39ma_number: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mODM.ClinicalData.SubjectData.ANumber\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[0;32m 8\u001B[0m }\n\u001B[0;32m 9\u001B[0m )\n", - "File \u001B[1;32m:3\u001B[0m, in \u001B[0;36m__hash__\u001B[1;34m(self)\u001B[0m\n", - "File \u001B[1;32m:3\u001B[0m, in \u001B[0;36m__hash__\u001B[1;34m(self)\u001B[0m\n", - "\u001B[1;31mTypeError\u001B[0m: unhashable type: 'list'" + "name": "stdout", + "output_type": "stream", + "text": [ + "DataModelInstance(id='PLACEHOLDER_IDENTIFIER', data_model=DataModel(name='Phenopacket schema Genomic Interpretation', fields=(DataField(name='subject_or_biosample_id', specification=ValueSet(elements=(,), name='', description=''), id='subject_or_biosample_id', required=True, description='The id of the patient or biosample that is the subject being interpreted. REQUIRED.', cardinality=Cardinality(min=1, max=1)), DataField(name='interpretation_status', specification=ValueSet(elements=('UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'), name='Interpretation Status Value Set', description=''), id='interpretation_status', required=True, description='status of the interpretation. REQUIRED.', cardinality=Cardinality(min=1, max='n')), DataSection(name='example', id='example', fields=(DataField(name='a_number', specification=ValueSet(elements=(,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')),), required=True, cardinality=Cardinality(min=1, max='n')), OrGroup(fields=(DataSection(name='GeneDescriptor', id='genedescriptor', fields=(DataField(name='value_id', specification=ValueSet(elements=(,), name='', description=''), id='value_id', required=True, description='Official identifier of the gene. REQUIRED.', cardinality=Cardinality(min=1, max='n')), DataField(name='symbol', specification=ValueSet(elements=(,), name='', description=''), id='symbol', required=True, description='Official gene symbol. 
REQUIRED.', cardinality=Cardinality(min=1, max='n')), DataField(name='description', specification=ValueSet(elements=(,), name='', description=''), id='description', required=False, description='A free-text description of the gene', cardinality=Cardinality(min=0, max='n'))), required=False, cardinality=Cardinality(min=0, max='n')),), name='call', id='call', description='', required=False, cardinality=Cardinality(min=0, max='n'))), id='phenopacket_schema_genomic_interpretation', resources=()), values=(DataFieldValue(id='PLACEHOLDER_IDENTIFIER:ODM.ClinicalData.SubjectData.SubjectKey', field=DataField(name='subject_or_biosample_id', specification=ValueSet(elements=(,), name='', description=''), id='subject_or_biosample_id', required=True, description='The id of the patient or biosample that is the subject being interpreted. REQUIRED.', cardinality=Cardinality(min=1, max=1)), value=101), None, DataSectionInstance(identifier='PLACEHOLDER_IDENTIFIER:example', data_section=DataSection(name='example', id='example', fields=(DataField(name='a_number', specification=ValueSet(elements=(,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')),), required=True, cardinality=Cardinality(min=1, max='n')), values=(DataFieldValue(id='PLACEHOLDER_IDENTIFIER:ODM.ClinicalData.SubjectData.ANumber', field=DataField(name='a_number', specification=ValueSet(elements=(,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')), value=123),)), None), compliance='lenient')\n" ] } ], - "execution_count": 14 + "execution_count": 22 + }, + { + "cell_type": "markdown", + "source": [ + "laoding dataset" + ], + "metadata": { + "collapsed": false + }, + "id": "66064a72ed75ef3e" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "from phenopacket_mapper.utils.io.input import load_hierarchical_dataset" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-10-17T13:52:15.610160Z", + "start_time": "2024-10-17T13:52:15.606902Z" + } + }, + "id": "440c1a2d269dd7ce", + "execution_count": 23 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-10-17T13:52:15.614763Z", + "start_time": "2024-10-17T13:52:15.610160Z" + } + }, "cell_type": "code", - "source": "", + "source": [ + "data_set = load_hierarchical_dataset(\n", + " file=[StringIO(xml_data[i]) for i in range(len(xml_data))],\n", + " data_model=genomic_interpretation,\n", + " file_extension=\"xml\",\n", + " mapping={\n", + " genomic_interpretation.subject_or_biosample_id: \"ODM.ClinicalData.SubjectData.SubjectKey\",\n", + " genomic_interpretation.example.a_number: \"ODM.ClinicalData.SubjectData.ANumber\",\n", + " }\n", + ")" + ], "id": "edbf8ad0a0a55290", "outputs": [], - "execution_count": null + "execution_count": 24 } ], "metadata": { diff --git a/src/phenopacket_mapper/utils/io/data_reader.py b/src/phenopacket_mapper/utils/io/data_reader.py index a6154da..66aa275 100644 --- a/src/phenopacket_mapper/utils/io/data_reader.py +++ b/src/phenopacket_mapper/utils/io/data_reader.py @@ -25,6 +25,7 @@ def __init__( # TODO: fix file names so we can identify data instances correctly, can do this at the start self.is_dir = False self.file_extension = None + self.file_names = None if isinstance(file, str): self.path = Path(file) @@ -61,8 +62,10 @@ def __init__( elif isinstance(file, list): if file_extension.lower() not in ['json', 'xml']: raise ValueError(f"File extension {file_extension} not supported for 
reading multiple files.") - self.data = [DataReader(f, encoding=encoding, file_extension=file_extension).data for f in file] + data_readers = [DataReader(f, encoding=encoding, file_extension=file_extension) for f in file] + self.data = [dr.data for dr in data_readers] self.iterable = self.data + self.file_names = [dr.file_names for dr in data_readers] else: raise ValueError(f"Invalid input type {type(file)}.") @@ -82,7 +85,7 @@ def _read(self) -> Tuple[Union[pd.DataFrame, List, Dict], Iterable]: """ # we know that file is always a buffer with the contents of the file # change this to work with self.file - if not self.is_dir: + if not self.is_dir: # is a file if self.file_extension == 'csv': df = pd.read_csv(self.file) return df, [row for row in df.iterrows()] @@ -96,6 +99,7 @@ def _read(self) -> Tuple[Union[pd.DataFrame, List, Dict], Iterable]: else: raise ValueError(f'Unknown file type with extension {self.file_extension}') elif self.is_dir: + self.file_names = [str(file) for file in self.path.iterdir() if file.is_file()] # collect list of all files in the folder files: List[Path] = [file for file in self.path.iterdir() if file.is_file()] file_extension = list(set([file.suffix[1:] for file in files])) From 463baec390bcd39a1d5f4613236ea77f5594da2e Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 16:32:35 +0200 Subject: [PATCH 68/71] set todos and tests for loading hierarchical data --- .../data_standards/data_model.py | 38 ++-- src/phenopacket_mapper/utils/io/input.py | 12 +- tests/utils/io/test_input.py | 180 ++++++++++++++++++ 3 files changed, 207 insertions(+), 23 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/data_model.py b/src/phenopacket_mapper/data_standards/data_model.py index 492d264..1821b72 100644 --- a/src/phenopacket_mapper/data_standards/data_model.py +++ b/src/phenopacket_mapper/data_standards/data_model.py @@ -301,16 +301,17 @@ def validate(self) -> bool: @dataclass(slots=True, frozen=True) class DataSectionInstance: """ - :ivar identifier: The id of the instance, i.e. the row number - :ivar data_section: The `DataSection` object that defines the data model for this instance + :ivar id: The id of the instance, i.e. the row number + :ivar section: The `DataSection` object that defines the data model for this instance :ivar values: A list of `DataFieldValue` objects, each adhering to the `DataField` definition in the `DataModel` """ - identifier: Union[str, int] = field() - data_section: DataSection = field() + id: Union[str, int] = field() + section: DataSection = field() values: Tuple[Union[DataFieldValue, 'DataSectionInstance'], ...] = field() def validate(self) -> bool: - tmp = self.identifier + # TODO: implement this method + tmp = self.id warnings.warn("The DataSectionInstance validate method has not been implemented yet.") return True @@ -355,19 +356,22 @@ def validate(self) -> bool: else: raise ValueError(f"Compliance level {self.compliance} is not valid") - is_required = set(f.id for f in self.data_model.fields if f.required) - fields_present = set(v.field.id for v in self.values) + if not self.data_model.is_hierarchical: + is_required = set(f.id for f in self.data_model.fields if f.required) + fields_present = set(v.field.id for v in self.values) - if len(missing_fields := (is_required - fields_present)) > 0: - error_msg = (f"Required fields are missing in the instance. 
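For the DataReader change above, a short usage sketch: a list input fans out into one reader per element (json and xml only), and file_names stays None for in-memory buffers, since names are only collected when reading from a directory:

from io import StringIO
from phenopacket_mapper.utils.io import DataReader

docs = ['{"pat_id": "p1"}', '{"pat_id": "p2"}']
reader = DataReader([StringIO(d) for d in docs], file_extension="json")
print(reader.data)        # [{'pat_id': 'p1'}, {'pat_id': 'p2'}]
print(reader.file_names)  # [None, None] for buffers; populated for directories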
(row {self.id}) " - f"\n(missing_fields={', '.join(missing_fields)})") - if self.compliance == 'strict': - raise ValueError(error_msg) - elif self.compliance == 'lenient': - warnings.warn(error_msg) - return False - else: - raise ValueError(f"Compliance level {self.compliance} is not valid") + if len(missing_fields := (is_required - fields_present)) > 0: + error_msg = (f"Required fields are missing in the instance. (row {self.id}) " + f"\n(missing_fields={', '.join(missing_fields)})") + if self.compliance == 'strict': + raise ValueError(error_msg) + elif self.compliance == 'lenient': + warnings.warn(error_msg) + return False + else: + raise ValueError(f"Compliance level {self.compliance} is not valid") + else: + pass # TODO: implement validation of hierarchical data return True def __iter__(self): diff --git a/src/phenopacket_mapper/utils/io/input.py b/src/phenopacket_mapper/utils/io/input.py index 84adefd..13a5d2e 100644 --- a/src/phenopacket_mapper/utils/io/input.py +++ b/src/phenopacket_mapper/utils/io/input.py @@ -282,8 +282,8 @@ def load_hierarchical_data_recursive( ]) return DataSectionInstance( - identifier=str(loaded_data_instance_identifier) + ":" + data_section.id, # TODO: get identifiers of parents - data_section=data_section, + id=str(loaded_data_instance_identifier) + ":" + data_section.id, # TODO: get identifiers of parents + section=data_section, values=values, ) elif isinstance(data_model, OrGroup): @@ -345,14 +345,14 @@ def load_hierarchical_dataset( DataModelInstance( id=instance_identifier, data_model=data_model, - values=load_hierarchical_data_recursive( + values=tuple(filter(lambda x: x is not None, list(load_hierarchical_data_recursive( loaded_data_instance_identifier=instance_identifier, loaded_data_instance=data_instance, data_model=data_model, resources=data_model.resources, compliance=compliance, mapping=mapping - ), + )))), compliance=compliance, ) ) @@ -397,13 +397,13 @@ def load_hierarchical_data( return DataModelInstance( id=instance_identifier, # TODO: give instances identifiers based on file names if available data_model=data_model, - values=load_hierarchical_data_recursive( + values=tuple(filter(lambda x: x is not None, list(load_hierarchical_data_recursive( loaded_data_instance_identifier=instance_identifier, loaded_data_instance=data_instance, data_model=data_model, resources=data_model.resources, compliance=compliance, mapping=mapping - ), + )))), compliance=compliance, ) diff --git a/tests/utils/io/test_input.py b/tests/utils/io/test_input.py index e69de29..c79e54b 100644 --- a/tests/utils/io/test_input.py +++ b/tests/utils/io/test_input.py @@ -0,0 +1,180 @@ +from io import StringIO + +import pytest + +from phenopacket_mapper import DataModel +from phenopacket_mapper.data_standards import DataField, Cardinality, ValueSet, DataSection, OrGroup, DataFieldValue +from phenopacket_mapper.data_standards.data_model import DataSectionInstance, DataModelInstance +from phenopacket_mapper.utils.io import DataReader +from phenopacket_mapper.utils.io.input import load_hierarchical_data_recursive, load_hierarchical_data + + +@pytest.fixture +def buffer(): + xml_data = ( + ' ' + '' + '' + '123' + '' + '' + '' + ) + return StringIO(xml_data) + + +@pytest.fixture +def genomic_interpretation(): + return DataModel( + name="Phenopacket schema Genomic Interpretation", + fields=( + DataField( + name="subject_or_biosample_id", + specification=int, + required=True, + description="The id of the patient or biosample that is the subject being interpreted. 
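The hierarchical branch of DataModelInstance.validate above is still a TODO. A rough sketch of what it might do, reusing the leaf validators and recursing through section instances; this is an assumption about the eventual design, not shipped behavior:

def _validate_recursive(values) -> bool:
    """Walk DataFieldValue / DataSectionInstance trees, validating the leaves."""
    ok = True
    for v in values:
        if hasattr(v, "values"):               # a DataSectionInstance-like node
            ok = _validate_recursive(v.values) and ok
        else:                                   # a DataFieldValue leaf
            ok = v.validate() and ok
    return ok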
REQUIRED.", + cardinality=Cardinality.ONE, + ), + + DataField( + name="interpretation_status", + specification=ValueSet( + name="Interpretation Status Value Set", + elements=("UNKNOWN_STATUS", "REJECTED", "CANDIDATE", "CONTRIBUTORY", "CAUSATIVE"), + ), + required=True, + description="status of the interpretation. REQUIRED.", + ), + + DataSection( + name="example", + required=True, + fields=( + DataField( + name="a_number", + required=True, + specification=int, + ), + ) + ), + + OrGroup( + name="call", + fields=( + DataSection( + name="GeneDescriptor", + fields=( + DataField( + name="value_id", + specification=str, + required=True, + description="Official identifier of the gene. REQUIRED." + ), + + DataField( + name="symbol", + specification=str, + required=True, + description="Official gene symbol. REQUIRED." + ), + + DataField( + name="description", + specification=str, + required=False, + description="A free-text description of the gene" + ), + ), + ), + ), + ), + ) + ) + + +def test_load_hierarchical_data_recursive_xml_genomic_interpretation_example_datafieldvalue(buffer, genomic_interpretation): + data_reader = DataReader( + file=buffer, + file_extension="xml", + ) + assert load_hierarchical_data_recursive( + loaded_data_instance_identifier="TEST_IDENTIFIER", + loaded_data_instance=data_reader.data, + data_model=genomic_interpretation.subject_or_biosample_id, + compliance='strict', + mapping={ + genomic_interpretation.subject_or_biosample_id: "ODM.ClinicalData.SubjectData.SubjectKey", + genomic_interpretation.example.a_number: "ODM.ClinicalData.SubjectData.ANumber", + }, + resources=tuple(), + ) == DataFieldValue( + id="TEST_IDENTIFIER:ODM.ClinicalData.SubjectData.SubjectKey", + value=101, + field=genomic_interpretation.subject_or_biosample_id, + ) + + +def test_load_hierarchical_data_recursive_xml_genomic_interpretation_example_datasection( + buffer, + genomic_interpretation +): + data_reader = DataReader( + file=buffer, + file_extension="xml", + ) + assert load_hierarchical_data_recursive( + loaded_data_instance_identifier="TEST_IDENTIFIER", + loaded_data_instance=data_reader.data, + data_model=genomic_interpretation.example, + compliance='strict', + mapping={ + genomic_interpretation.subject_or_biosample_id: "ODM.ClinicalData.SubjectData.SubjectKey", + genomic_interpretation.example.a_number: "ODM.ClinicalData.SubjectData.ANumber", + }, + resources=tuple(), + ) == DataSectionInstance( + id="TEST_IDENTIFIER:example", + section=genomic_interpretation.example, + values=( + DataFieldValue( + id='TEST_IDENTIFIER:ODM.ClinicalData.SubjectData.ANumber', + field=genomic_interpretation.example.a_number, + value=123 + ), + ) + ) + + +def test_load_hierarchical_data_xml_genomic_interpretation_example_instance(buffer, genomic_interpretation): + assert load_hierarchical_data( + file=buffer, + file_extension="xml", + data_model=genomic_interpretation, + compliance='strict', + mapping={ + genomic_interpretation.subject_or_biosample_id: "ODM.ClinicalData.SubjectData.SubjectKey", + genomic_interpretation.example.a_number: "ODM.ClinicalData.SubjectData.ANumber", + }, + ) == DataModelInstance( + id="PLACEHOLDER_IDENTIFIER", # TODO: change once correct identifier is available + data_model=genomic_interpretation, + compliance='strict', + values=( + DataFieldValue( + id="PLACEHOLDER_IDENTIFIER:ODM.ClinicalData.SubjectData.SubjectKey", # TODO: change once correct identifier is available + value=101, + field=genomic_interpretation.subject_or_biosample_id, + ), + DataSectionInstance( + 
id="PLACEHOLDER_IDENTIFIER:example", # TODO: change once correct identifier is available + section=genomic_interpretation.example, + values=( + DataFieldValue( + id='PLACEHOLDER_IDENTIFIER:ODM.ClinicalData.SubjectData.ANumber', # TODO: change once correct identifier is available + field=genomic_interpretation.example.a_number, + value=123 + ), + ) + ), + ) + ) From d1d7718e7c71d6eb4c084e66ba47043481a9317d Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 16:33:09 +0200 Subject: [PATCH 69/71] rerun notebook --- notebooks/hierarchical_data_model.ipynb | 107 +++++++++++++++--------- 1 file changed, 66 insertions(+), 41 deletions(-) diff --git a/notebooks/hierarchical_data_model.ipynb b/notebooks/hierarchical_data_model.ipynb index 1189725..4e4b84b 100644 --- a/notebooks/hierarchical_data_model.ipynb +++ b/notebooks/hierarchical_data_model.ipynb @@ -9,8 +9,8 @@ "is_executing": true }, "ExecuteTime": { - "end_time": "2024-10-17T13:52:15.564302Z", - "start_time": "2024-10-17T13:52:15.560679Z" + "end_time": "2024-10-17T14:32:54.331162Z", + "start_time": "2024-10-17T14:32:53.655680Z" } }, "source": [ @@ -18,13 +18,13 @@ "from phenopacket_mapper.data_standards import DataModel, ValueSet, DataSection, OrGroup, Cardinality" ], "outputs": [], - "execution_count": 13 + "execution_count": 1 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-17T13:52:15.572450Z", - "start_time": "2024-10-17T13:52:15.567308Z" + "end_time": "2024-10-17T14:32:54.337765Z", + "start_time": "2024-10-17T14:32:54.332166Z" } }, "cell_type": "code", @@ -97,13 +97,13 @@ ], "id": "2e979683ae450d9b", "outputs": [], - "execution_count": 14 + "execution_count": 2 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-17T13:52:15.578062Z", - "start_time": "2024-10-17T13:52:15.573455Z" + "end_time": "2024-10-17T14:32:54.347454Z", + "start_time": "2024-10-17T14:32:54.338770Z" } }, "cell_type": "code", @@ -116,18 +116,18 @@ "data": { "text/plain": "DataField(name='a_number', specification=ValueSet(elements=(,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n'))" }, - "execution_count": 15, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 15 + "execution_count": 3 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-17T13:52:15.581403Z", - "start_time": "2024-10-17T13:52:15.579068Z" + "end_time": "2024-10-17T14:32:54.353263Z", + "start_time": "2024-10-17T14:32:54.348460Z" } }, "cell_type": "code", @@ -209,13 +209,13 @@ ] } ], - "execution_count": 16 + "execution_count": 4 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-17T13:52:15.585667Z", - "start_time": "2024-10-17T13:52:15.582408Z" + "end_time": "2024-10-17T14:32:54.358199Z", + "start_time": "2024-10-17T14:32:54.354267Z" } }, "cell_type": "code", @@ -241,13 +241,13 @@ ], "id": "4c78eb05ea58ff6c", "outputs": [], - "execution_count": 17 + "execution_count": 5 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-17T13:52:15.589625Z", - "start_time": "2024-10-17T13:52:15.586671Z" + "end_time": "2024-10-17T14:32:54.363102Z", + "start_time": "2024-10-17T14:32:54.359203Z" } }, "cell_type": "code", @@ -260,12 +260,12 @@ "data": { "text/plain": "DataField(name='subject_or_biosample_id', specification=ValueSet(elements=(,), name='', description=''), id='subject_or_biosample_id', required=True, description='The id of the patient or biosample that is the subject being interpreted. 
REQUIRED.', cardinality=Cardinality(min=1, max=1))" }, - "execution_count": 18, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 18 + "execution_count": 6 }, { "cell_type": "code", @@ -276,12 +276,12 @@ ], "metadata": { "ExecuteTime": { - "end_time": "2024-10-17T13:52:15.593155Z", - "start_time": "2024-10-17T13:52:15.590632Z" + "end_time": "2024-10-17T14:32:54.366626Z", + "start_time": "2024-10-17T14:32:54.364109Z" } }, "id": "f7fdfee60f008e07", - "execution_count": 19 + "execution_count": 7 }, { "cell_type": "markdown", @@ -326,12 +326,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-10-17T13:52:15.597800Z", - "start_time": "2024-10-17T13:52:15.594160Z" + "end_time": "2024-10-17T14:32:54.371692Z", + "start_time": "2024-10-17T14:32:54.367631Z" } }, "id": "9f358144800fb41c", - "execution_count": 20 + "execution_count": 8 }, { "cell_type": "markdown", @@ -351,7 +351,7 @@ "output_type": "stream", "text": [ "--------------------------------------------------------------------------------\n", - "DataSectionInstance(identifier='section:example', data_section=DataSection(name='example', id='example', fields=(DataField(name='a_number', specification=ValueSet(elements=(,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')),), required=True, cardinality=Cardinality(min=1, max='n')), values=(DataFieldValue(id='section:ODM.ClinicalData.SubjectData.ANumber', field=DataField(name='a_number', specification=ValueSet(elements=(,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')), value=123),))\n" + "DataSectionInstance(id='section:example', section=DataSection(name='example', id='example', fields=(DataField(name='a_number', specification=ValueSet(elements=(,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')),), required=True, cardinality=Cardinality(min=1, max='n')), values=(DataFieldValue(id='section:ODM.ClinicalData.SubjectData.ANumber', field=DataField(name='a_number', specification=ValueSet(elements=(,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')), value=123),))\n" ] } ], @@ -376,12 +376,12 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-10-17T13:52:15.602315Z", - "start_time": "2024-10-17T13:52:15.598804Z" + "end_time": "2024-10-17T14:32:54.375786Z", + "start_time": "2024-10-17T14:32:54.372696Z" } }, "id": "dae94d0e649a0120", - "execution_count": 21 + "execution_count": 9 }, { "cell_type": "markdown", @@ -396,8 +396,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-10-17T13:52:15.606902Z", - "start_time": "2024-10-17T13:52:15.603320Z" + "end_time": "2024-10-17T14:32:54.380021Z", + "start_time": "2024-10-17T14:32:54.375786Z" } }, "cell_type": "code", @@ -419,11 +419,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "DataModelInstance(id='PLACEHOLDER_IDENTIFIER', data_model=DataModel(name='Phenopacket schema Genomic Interpretation', fields=(DataField(name='subject_or_biosample_id', specification=ValueSet(elements=(,), name='', description=''), id='subject_or_biosample_id', required=True, description='The id of the patient or biosample that is the subject being interpreted. 
REQUIRED.', cardinality=Cardinality(min=1, max=1)), DataField(name='interpretation_status', specification=ValueSet(elements=('UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'), name='Interpretation Status Value Set', description=''), id='interpretation_status', required=True, description='status of the interpretation. REQUIRED.', cardinality=Cardinality(min=1, max='n')), DataSection(name='example', id='example', fields=(DataField(name='a_number', specification=ValueSet(elements=(<class 'int'>,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')),), required=True, cardinality=Cardinality(min=1, max='n')), OrGroup(fields=(DataSection(name='GeneDescriptor', id='genedescriptor', fields=(DataField(name='value_id', specification=ValueSet(elements=(<class 'str'>,), name='', description=''), id='value_id', required=True, description='Official identifier of the gene. REQUIRED.', cardinality=Cardinality(min=1, max='n')), DataField(name='symbol', specification=ValueSet(elements=(<class 'str'>,), name='', description=''), id='symbol', required=True, description='Official gene symbol. REQUIRED.', cardinality=Cardinality(min=1, max='n')), DataField(name='description', specification=ValueSet(elements=(<class 'str'>,), name='', description=''), id='description', required=False, description='A free-text description of the gene', cardinality=Cardinality(min=0, max='n'))), required=False, cardinality=Cardinality(min=0, max='n')),), name='call', id='call', description='', required=False, cardinality=Cardinality(min=0, max='n'))), id='phenopacket_schema_genomic_interpretation', resources=()), values=(DataFieldValue(id='PLACEHOLDER_IDENTIFIER:ODM.ClinicalData.SubjectData.SubjectKey', field=DataField(name='subject_or_biosample_id', specification=ValueSet(elements=(<class 'int'>,), name='', description=''), id='subject_or_biosample_id', required=True, description='The id of the patient or biosample that is the subject being interpreted. REQUIRED.', cardinality=Cardinality(min=1, max=1)), value=101), None, DataSectionInstance(identifier='PLACEHOLDER_IDENTIFIER:example', data_section=DataSection(name='example', id='example', fields=(DataField(name='a_number', specification=ValueSet(elements=(<class 'int'>,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')),), required=True, cardinality=Cardinality(min=1, max='n')), values=(DataFieldValue(id='PLACEHOLDER_IDENTIFIER:ODM.ClinicalData.SubjectData.ANumber', field=DataField(name='a_number', specification=ValueSet(elements=(<class 'int'>,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')), value=123),)), None), compliance='lenient')\n"
+      "DataModelInstance(id='PLACEHOLDER_IDENTIFIER', data_model=DataModel(name='Phenopacket schema Genomic Interpretation', fields=(DataField(name='subject_or_biosample_id', specification=ValueSet(elements=(<class 'int'>,), name='', description=''), id='subject_or_biosample_id', required=True, description='The id of the patient or biosample that is the subject being interpreted. REQUIRED.', cardinality=Cardinality(min=1, max=1)), DataField(name='interpretation_status', specification=ValueSet(elements=('UNKNOWN_STATUS', 'REJECTED', 'CANDIDATE', 'CONTRIBUTORY', 'CAUSATIVE'), name='Interpretation Status Value Set', description=''), id='interpretation_status', required=True, description='status of the interpretation. REQUIRED.', cardinality=Cardinality(min=1, max='n')), DataSection(name='example', id='example', fields=(DataField(name='a_number', specification=ValueSet(elements=(<class 'int'>,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')),), required=True, cardinality=Cardinality(min=1, max='n')), OrGroup(fields=(DataSection(name='GeneDescriptor', id='genedescriptor', fields=(DataField(name='value_id', specification=ValueSet(elements=(<class 'str'>,), name='', description=''), id='value_id', required=True, description='Official identifier of the gene. REQUIRED.', cardinality=Cardinality(min=1, max='n')), DataField(name='symbol', specification=ValueSet(elements=(<class 'str'>,), name='', description=''), id='symbol', required=True, description='Official gene symbol. REQUIRED.', cardinality=Cardinality(min=1, max='n')), DataField(name='description', specification=ValueSet(elements=(<class 'str'>,), name='', description=''), id='description', required=False, description='A free-text description of the gene', cardinality=Cardinality(min=0, max='n'))), required=False, cardinality=Cardinality(min=0, max='n')),), name='call', id='call', description='', required=False, cardinality=Cardinality(min=0, max='n'))), id='phenopacket_schema_genomic_interpretation', resources=()), values=(DataFieldValue(id='PLACEHOLDER_IDENTIFIER:ODM.ClinicalData.SubjectData.SubjectKey', field=DataField(name='subject_or_biosample_id', specification=ValueSet(elements=(<class 'int'>,), name='', description=''), id='subject_or_biosample_id', required=True, description='The id of the patient or biosample that is the subject being interpreted. REQUIRED.', cardinality=Cardinality(min=1, max=1)), value=101), DataSectionInstance(id='PLACEHOLDER_IDENTIFIER:example', section=DataSection(name='example', id='example', fields=(DataField(name='a_number', specification=ValueSet(elements=(<class 'int'>,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')),), required=True, cardinality=Cardinality(min=1, max='n')), values=(DataFieldValue(id='PLACEHOLDER_IDENTIFIER:ODM.ClinicalData.SubjectData.ANumber', field=DataField(name='a_number', specification=ValueSet(elements=(<class 'int'>,), name='', description=''), id='a_number', required=True, description='', cardinality=Cardinality(min=1, max='n')), value=123),))), compliance='lenient')\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\filip\\OneDrive\\Documents\\dataspell\\phenopacket_mapper\\src\\phenopacket_mapper\\data_standards\\data_model.py:296: UserWarning: Value 101 of type <class 'int'> is not in the value set of field subject_or_biosample_id (row PLACEHOLDER_IDENTIFIER:ODM.ClinicalData.SubjectData.SubjectKey)\n",
+      "  warnings.warn(f\"Value {self.value} of type {type(self.value)} is not in the value set of field \"\n",
+      "C:\\Users\\filip\\OneDrive\\Documents\\dataspell\\phenopacket_mapper\\src\\phenopacket_mapper\\data_standards\\data_model.py:354: UserWarning: Instance values do not comply with their respective fields' valuesets. 
(row PLACEHOLDER_IDENTIFIER)\n", + " warnings.warn(error_msg)\n" ] } ], - "execution_count": 22 + "execution_count": 10 }, { "cell_type": "markdown", @@ -444,18 +454,18 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-10-17T13:52:15.610160Z", - "start_time": "2024-10-17T13:52:15.606902Z" + "end_time": "2024-10-17T14:32:54.383585Z", + "start_time": "2024-10-17T14:32:54.380021Z" } }, "id": "440c1a2d269dd7ce", - "execution_count": 23 + "execution_count": 11 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-10-17T13:52:15.614763Z", - "start_time": "2024-10-17T13:52:15.610160Z" + "end_time": "2024-10-17T14:32:54.387942Z", + "start_time": "2024-10-17T14:32:54.384589Z" } }, "cell_type": "code", @@ -471,8 +481,23 @@ ")" ], "id": "edbf8ad0a0a55290", - "outputs": [], - "execution_count": 24 + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\filip\\OneDrive\\Documents\\dataspell\\phenopacket_mapper\\src\\phenopacket_mapper\\data_standards\\data_model.py:296: UserWarning: Value 101 of type is not in the value set of field subject_or_biosample_id (row 0:ODM.ClinicalData.SubjectData.SubjectKey)\n", + " warnings.warn(f\"Value {self.value} of type {type(self.value)} is not in the value set of field \"\n", + "C:\\Users\\filip\\OneDrive\\Documents\\dataspell\\phenopacket_mapper\\src\\phenopacket_mapper\\data_standards\\data_model.py:354: UserWarning: Instance values do not comply with their respective fields' valuesets. (row 0)\n", + " warnings.warn(error_msg)\n", + "C:\\Users\\filip\\OneDrive\\Documents\\dataspell\\phenopacket_mapper\\src\\phenopacket_mapper\\data_standards\\data_model.py:296: UserWarning: Value 102 of type is not in the value set of field subject_or_biosample_id (row 1:ODM.ClinicalData.SubjectData.SubjectKey)\n", + " warnings.warn(f\"Value {self.value} of type {type(self.value)} is not in the value set of field \"\n", + "C:\\Users\\filip\\OneDrive\\Documents\\dataspell\\phenopacket_mapper\\src\\phenopacket_mapper\\data_standards\\data_model.py:354: UserWarning: Instance values do not comply with their respective fields' valuesets. 
(row 1)\n", + " warnings.warn(error_msg)\n" + ] + } + ], + "execution_count": 12 } ], "metadata": { From 1478c03cd593825592a401db44ed17a727acea50 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 16:39:08 +0200 Subject: [PATCH 70/71] removed doctest --- src/phenopacket_mapper/utils/parsing/parse_value_set.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/phenopacket_mapper/utils/parsing/parse_value_set.py b/src/phenopacket_mapper/utils/parsing/parse_value_set.py index 3a3458c..e5d262c 100644 --- a/src/phenopacket_mapper/utils/parsing/parse_value_set.py +++ b/src/phenopacket_mapper/utils/parsing/parse_value_set.py @@ -14,9 +14,6 @@ def parse_value_set( ) -> ValueSet: """Parses a value set from a string representation - >>> ValueSet.parse_value_set("True, False", resources=[]) - ValueSet(elements=[True, False], name='', description='') - :param value_set_str: String representation of the value set :param value_set_name: Name of the value set :param value_set_description: Description of the value set From 716159ac88dd21c9c6d6a1c1c245ded8475ec9f8 Mon Sep 17 00:00:00 2001 From: frehburg Date: Thu, 17 Oct 2024 16:46:03 +0200 Subject: [PATCH 71/71] removed doctest --- src/phenopacket_mapper/data_standards/value_set.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/phenopacket_mapper/data_standards/value_set.py b/src/phenopacket_mapper/data_standards/value_set.py index 051be89..47fabe3 100644 --- a/src/phenopacket_mapper/data_standards/value_set.py +++ b/src/phenopacket_mapper/data_standards/value_set.py @@ -76,12 +76,6 @@ def parse_value_set( ) -> 'ValueSet': """Parses a value set from a string representation - >>> ValueSet.parse_value_set("True, False", "TrueFalseValueSet", "A value set for True and False", []) - ValueSet(elements=[True, False], name='TrueFalseValueSet', description='A value set for True and False') - - >>> ValueSet.parse_value_set("-1, 0, 1", resources=[]) - ValueSet(elements=[-1, 0, 1], name='', description='') - :param value_set_str: String representation of the value set :param value_set_name: Name of the value set :param value_set_description: Description of the value set
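
Usage sketch (not part of the patch series above): the loading API that patches 68-71 converge on can be exercised end to end roughly as below. The model shape, field name, dotted mapping path, and compliance behaviour are taken from tests/utils/io/test_input.py and the notebook output in patch 69; the XML literal mirrors the reconstructed test fixture, and the printed identifier reflects the PLACEHOLDER_IDENTIFIER TODO still present in load_hierarchical_data. Treat this as a sketch of the intended call pattern, not a guaranteed-stable API.

    from io import StringIO

    from phenopacket_mapper import DataModel
    from phenopacket_mapper.data_standards import DataField
    from phenopacket_mapper.utils.io.input import load_hierarchical_data

    # One required integer field, mirroring subject_or_biosample_id in the test fixture.
    data_model = DataModel(
        name="Sketch model",
        fields=(
            DataField(name="subject_or_biosample_id", specification=int, required=True),
        ),
    )

    # Same shape as the test XML: the value 101 lives in the SubjectKey attribute.
    buffer = StringIO(
        '<ODM><ClinicalData><SubjectData SubjectKey="101"></SubjectData></ClinicalData></ODM>'
    )

    instance = load_hierarchical_data(
        file=buffer,
        file_extension="xml",
        data_model=data_model,
        # 'strict' raises on validation failures; 'lenient' only warns,
        # as the stderr output in the rerun notebook shows.
        compliance="lenient",
        # DataFields are mapped to dot-separated paths into the parsed XML.
        mapping={data_model.subject_or_biosample_id: "ODM.ClinicalData.SubjectData.SubjectKey"},
    )
    print(instance.id)                         # currently 'PLACEHOLDER_IDENTIFIER' (see TODO)
    print([v.value for v in instance.values])  # [101]

The tuple(filter(lambda x: x is not None, ...)) wrapper introduced in input.py is what keeps unmapped OrGroup branches out of instance.values: the recursive loader returns None for those, and in the pre-rerun notebook output they still appeared as literal None entries in the DataModelInstance.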