refactor: replace goodtables with pandera

opencobra · Jan 6, 2024 · e649f79 · e649f79
1 parent dfe79f4
commit e649f79
Show file tree

Hide file tree

Showing 11 changed files with 142 additions and 369 deletions.
diff --git a/setup.cfg b/setup.cfg
@@ -56,7 +56,8 @@ install_requires =
 	requests
 	numpydoc
 	pylru
-	goodtables ~=2.0
+	pandera ~=0.18
+	jsonschema ~=4.20
 	depinfo ~=2.2
 	requests
 python_requires = >=3.6

diff --git a/src/memote/experimental/checks.py b/src/memote/experimental/checks.py
diff --git a/src/memote/experimental/essentiality.py b/src/memote/experimental/essentiality.py
@@ -20,8 +20,11 @@
 from __future__ import absolute_import
 
 import logging
+from typing import Optional
 
+import pandera as pa
 from cobra.flux_analysis import single_gene_deletion
+from pandera.typing import Series
 
 from memote.experimental.experiment import Experiment
 
@@ -31,11 +34,31 @@
 LOGGER = logging.getLogger(__name__)
 
 
+class EssentialityExperimentModel(pa.DataFrameModel):
+    """Describe the expected columns of a gene essentiality data table."""
+
+    gene: Series[str] = pa.Field(
+        unique=True,
+        title="Gene Identifier",
+        description=(
+            "The gene identifier must correspond to the metabolic "
+            "model identifiers."
+        ),
+    )
+    essential: Series[bool] = pa.Field(
+        description="Whether a gene is (conditionally) essential.",
+        title="Gene Essentiality",
+    )
+    comment: Optional[Series[str]] = pa.Field(
+        title="Comment",
+        description="Optional comment which is not processed further.",
+        nullable=True,
+    )
+
+    class Config:
+        # Cast columns to the annotated types; drop columns not declared above.
+        strict = "filter"
+        coerce = True
+
+
 class EssentialityExperiment(Experiment):
     """Represent an essentiality experiment."""
 
-    SCHEMA = "essentiality.json"
-
     def __init__(self, **kwargs):
         """
         Initialize an essentiality experiment.
@@ -47,39 +70,10 @@ def __init__(self, **kwargs):
         """
         super(EssentialityExperiment, self).__init__(**kwargs)
 
-    def load(self, dtype_conversion=None):
-        """
-        Load the data table and corresponding validation schema.
-
-        Parameters
-        ----------
-        dtype_conversion : dict
-            Column names as keys and corresponding type for loading the data.
-            Please take a look at the `pandas documentation
-            <https://pandas.pydata.org/pandas-docs/stable/io.html#specifying-column-data-types>`__
-            for detailed explanations.
-
-        """
-        if dtype_conversion is None:
-            dtype_conversion = {"essential": str}
-        super(EssentialityExperiment, self).load(dtype_conversion=dtype_conversion)
-        self.data["essential"] = self.data["essential"].isin(self.TRUTHY)
-
     def validate(self, model, checks=None):
-        """Use a defined schema to validate the medium table format."""
-        if checks is None:
-            checks = []
-        custom = [
-            {
-                "unknown-identifier": {
-                    "column": "gene",
-                    "identifiers": {g.id for g in model.genes},
-                }
-            }
-        ]
-        super(EssentialityExperiment, self).validate(
-            model=model, checks=checks + custom
-        )
+        """
+        Use a defined schema to validate the essentiality table format.
+
+        Parameters
+        ----------
+        model : cobra.Model
+            The metabolic model whose gene identifiers the table must match.
+        checks : list, optional
+            Not used by this method; retained so existing callers keep working.
+
+        Raises
+        ------
+        pandera.errors.SchemaErrors
+            If the table does not conform to ``EssentialityExperimentModel``.
+        AssertionError
+            If the table contains gene identifiers missing from the model.
+
+        """
+        # Keep pandera's return value: type coercion and the filtering of
+        # undeclared columns are applied to it, not to the input frame.
+        self.data = EssentialityExperimentModel.validate(self.data, lazy=True)
+        known = {g.id for g in model.genes}
+        unknown = self.data.loc[~self.data["gene"].isin(known), "gene"]
+        if not unknown.empty:
+            # Raised explicitly because ``assert`` statements vanish under -O.
+            raise AssertionError(
+                "Unknown gene identifiers: {}".format(", ".join(map(str, unknown)))
+            )
 
     def evaluate(self, model):
         """Use the defined parameters to predict single gene essentiality."""

diff --git a/src/memote/experimental/experimental_base.py b/src/memote/experimental/experimental_base.py
@@ -17,20 +17,8 @@
 """Provide a class for medium definitions."""
 
 
-import json
 import logging
 
-
-try:
-    from importlib.resources import files
-except ImportError:
-    from importlib_resources import files
-
-from goodtables import validate
-
-# Importing the checks is necessary in order to register them.
-import memote.experimental.schemata
-from memote.experimental.checks import UnknownIdentifier  # noqa: F401
 from memote.experimental.tabular import read_tabular
 
 
@@ -43,9 +31,6 @@
 class ExperimentalBase(object):
     """Represent a specific medium condition."""
 
-    SCHEMA = None
-    TRUTHY = {"true", "True", "TRUE", "1", "yes", "Yes", "YES"}
-
     def __init__(self, identifier, obj, filename, **kwargs):
         """
         Initialize a medium.
@@ -66,11 +51,10 @@ def __init__(self, identifier, obj, filename, **kwargs):
             self.label = ""
         self.filename = filename
         self.data = None
-        self.schema = None
 
     def load(self, dtype_conversion=None):
         """
-        Load the data table and corresponding validation schema.
+        Load the data table.
 
         Parameters
         ----------
@@ -82,26 +66,10 @@ def load(self, dtype_conversion=None):
 
         """
         self.data = read_tabular(self.filename, dtype_conversion)
-        with files(memote.experimental.schemata).joinpath(self.SCHEMA).open(
-            mode="r", encoding="utf-8"
-        ) as file_handle:
-            self.schema = json.load(file_handle)
 
-    def validate(self, model, checks=None):
+    def validate(self, model):
         """Use a defined schema to validate the given table."""
-        if checks is None:
-            checks = []
-        records = self.data.to_dict("records")
-        self.evaluate_report(
-            validate(
-                records,
-                headers=list(records[0]),
-                preset="table",
-                schema=self.schema,
-                order_fields=True,
-                checks=checks,
-            )
-        )
+        # The exception must be raised, not just instantiated, so that a
+        # subclass lacking its own ``validate`` fails loudly.
+        raise NotImplementedError("Base class does not implement this method.")
 
     @staticmethod
     def evaluate_report(report):

diff --git a/src/memote/experimental/growth.py b/src/memote/experimental/growth.py
@@ -20,8 +20,11 @@
 from __future__ import absolute_import
 
 import logging
+from typing import Optional
 
+import pandera as pa
 from pandas import DataFrame
+from pandera.typing import Series
 
 from memote.experimental.experiment import Experiment
 
@@ -31,11 +34,39 @@
 LOGGER = logging.getLogger(__name__)
 
 
+class GrowthExperimentModel(pa.DataFrameModel):
+    """Describe the expected columns of a growth experiment data table."""
+
+    exchange: Series[str] = pa.Field(
+        title="Exchange Reaction Identifier",
+        description=(
+            "The exchange reaction identifier of the variable medium component. "
+            "Typically, this is a carbon source which will be added to a "
+            "configured base medium."
+        ),
+    )
+    uptake: Series[float] = pa.Field(
+        title="Uptake Rate",
+        description=(
+            "The uptake rate for the exchange reaction. "
+            "For models following common practices this modifies the lower bound."
+        ),
+        le=1000.0,
+        ge=0.0,
+    )
+    growth: Series[bool] = pa.Field(
+        description=(
+            "A binary indicator whether growth was observed "
+            "according to the processed biolog data."
+        ),
+        title="Growth",
+    )
+    comment: Optional[Series[str]] = pa.Field(
+        title="Comment",
+        description="Optional comment which is not processed further.",
+        nullable=True,
+    )
+
+    class Config:
+        # Cast columns to the annotated types; drop columns not declared above.
+        strict = "filter"
+        coerce = True
+
+
 class GrowthExperiment(Experiment):
     """Represent a growth experiment."""
 
-    SCHEMA = "growth.json"
-
     def __init__(self, **kwargs):
         """
         Initialize a growth experiment.
@@ -47,23 +78,9 @@ def __init__(self, **kwargs):
         """
         super(GrowthExperiment, self).__init__(**kwargs)
 
-    def load(self, dtype_conversion=None):
-        """
-        Load the data table and corresponding validation schema.
-
-        Parameters
-        ----------
-        dtype_conversion : dict
-            Column names as keys and corresponding type for loading the data.
-            Please take a look at the `pandas documentation
-            <https://pandas.pydata.org/pandas-docs/stable/io.html#specifying-column-data-types>`__
-            for detailed explanations.
-
-        """
-        if dtype_conversion is None:
-            dtype_conversion = {"growth": str}
-        super(GrowthExperiment, self).load(dtype_conversion=dtype_conversion)
-        self.data["growth"] = self.data["growth"].isin(self.TRUTHY)
+    def validate(self, model):
+        """
+        Use a defined schema to validate the growth table format.
+
+        Parameters
+        ----------
+        model
+            Not used by this method; part of the shared ``validate`` signature.
+
+        Raises
+        ------
+        pandera.errors.SchemaErrors
+            If the table does not conform to ``GrowthExperimentModel``.
+
+        """
+        # Keep pandera's return value: type coercion and the filtering of
+        # undeclared columns are applied to it, not to the input frame.
+        self.data = GrowthExperimentModel.validate(self.data, lazy=True)
 
     def evaluate(self, model):
         """Evaluate in silico growth rates."""