Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: replace goodtables with pandera #761

Merged
merged 3 commits into from
Jan 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions HISTORY.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ History

Next Release
------------
* Replace ``goodtables`` with ``pandera`` for data validation. This change is not
100% backwards compatible, although most data tables should be unaffected.

0.16.1 (2023-11-21)
-------------------
Expand Down
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,8 @@ install_requires =
requests
numpydoc
pylru
goodtables ~=2.0
pandera ~=0.18
jsonschema ~=4.20
depinfo ~=2.2
requests
python_requires = >=3.6
Expand Down
80 changes: 0 additions & 80 deletions src/memote/experimental/checks.py

This file was deleted.

62 changes: 28 additions & 34 deletions src/memote/experimental/essentiality.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@
from __future__ import absolute_import

import logging
from typing import Optional

import pandera as pa
from cobra.flux_analysis import single_gene_deletion
from pandera.typing import Series

from memote.experimental.experiment import Experiment

Expand All @@ -31,11 +34,31 @@
LOGGER = logging.getLogger(__name__)


class EssentialityExperimentModel(pa.DataFrameModel):
    # Declarative pandera schema describing a gene essentiality data table.
    #
    # Columns
    #   gene      : string identifiers; declared unique.
    #   essential : boolean essentiality flag.
    #   comment   : optional free-text column that may contain missing values.

    gene: Series[str] = pa.Field(
        unique=True,
        title="Gene Identifier",
        description=(
            "The gene identifier must correspond to the metabolic "
            "model identifiers."
        ),
    )
    essential: Series[bool] = pa.Field(
        description="Whether a gene is (conditionally) essential.",
        title="Gene Essentiality",
    )
    comment: Optional[Series[str]] = pa.Field(
        title="Comment",
        description="Optional comment which is not processed further.",
        nullable=True,
    )

    class Config:
        # Schema-wide pandera options: ``strict = "filter"`` selects pandera's
        # column-filtering mode and ``coerce`` requests casting of the columns
        # to the dtypes annotated above.
        strict = "filter"
        coerce = True


class EssentialityExperiment(Experiment):
"""Represent an essentiality experiment."""

SCHEMA = "essentiality.json"

def __init__(self, **kwargs):
"""
Initialize an essentiality experiment.
Expand All @@ -47,39 +70,10 @@ def __init__(self, **kwargs):
"""
super(EssentialityExperiment, self).__init__(**kwargs)

def load(self, dtype_conversion=None):
"""
Load the data table and corresponding validation schema.

Parameters
----------
dtype_conversion : dict
Column names as keys and corresponding type for loading the data.
Please take a look at the `pandas documentation
<https://pandas.pydata.org/pandas-docs/stable/io.html#specifying-column-data-types>`__
for detailed explanations.

"""
if dtype_conversion is None:
dtype_conversion = {"essential": str}
super(EssentialityExperiment, self).load(dtype_conversion=dtype_conversion)
self.data["essential"] = self.data["essential"].isin(self.TRUTHY)

def validate(self, model, checks=None):
    """
    Use a defined schema to validate the essentiality table format.

    The table is first validated against ``EssentialityExperimentModel``
    and then every entry of the ``gene`` column is required to be the
    identifier of a gene in the given model.

    Parameters
    ----------
    model : cobra.Model
        The metabolic model; only ``model.genes`` and the ``id`` of each
        gene are used here.
    checks : list, optional
        Not used by this method. It is still accepted so that existing
        callers passing it keep working.

    Raises
    ------
    pandera.errors.SchemaErrors
        Raised by pandera when the table violates the schema (all
        violations are collected because validation is lazy).
    AssertionError
        If the table contains gene identifiers unknown to the model.

    """
    # Pandera validates a copy by default. Keep the returned frame so that
    # the schema's coercion and column filtering apply to ``self.data`` and
    # the identifier check below compares coerced (string) values.
    self.data = EssentialityExperimentModel.validate(self.data, lazy=True)
    known_ids = {g.id for g in model.genes}
    is_known = self.data["gene"].isin(known_ids)
    if not is_known.all():
        unknown = self.data.loc[~is_known, "gene"].unique().tolist()
        # Raised explicitly (instead of using ``assert``) so that the check
        # is not removed when Python runs with ``-O``; the exception type is
        # unchanged for callers that already catch ``AssertionError``.
        raise AssertionError(
            "The essentiality table contains gene identifiers that are not "
            "part of the model: {}".format(unknown)
        )

def evaluate(self, model):
"""Use the defined parameters to predict single gene essentiality."""
Expand Down
38 changes: 3 additions & 35 deletions src/memote/experimental/experimental_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,8 @@
"""Provide a class for medium definitions."""


import json
import logging


try:
from importlib.resources import files
except ImportError:
from importlib_resources import files

from goodtables import validate

# Importing the checks is necessary in order to register them.
import memote.experimental.schemata
from memote.experimental.checks import UnknownIdentifier # noqa: F401
from memote.experimental.tabular import read_tabular


Expand All @@ -43,9 +31,6 @@
class ExperimentalBase(object):
"""Represent a specific medium condition."""

SCHEMA = None
TRUTHY = {"true", "True", "TRUE", "1", "yes", "Yes", "YES"}

def __init__(self, identifier, obj, filename, **kwargs):
"""
Initialize a medium.
Expand All @@ -66,11 +51,10 @@
self.label = ""
self.filename = filename
self.data = None
self.schema = None

def load(self, dtype_conversion=None):
"""
Load the data table and corresponding validation schema.
Load the data table.

Parameters
----------
Expand All @@ -82,26 +66,10 @@

"""
self.data = read_tabular(self.filename, dtype_conversion)
with files(memote.experimental.schemata).joinpath(self.SCHEMA).open(
mode="r", encoding="utf-8"
) as file_handle:
self.schema = json.load(file_handle)

def validate(self, model):
    """
    Use a defined schema to validate the given table.

    The base class has no schema of its own; subclasses are expected to
    override this method.

    Parameters
    ----------
    model : object
        The model to validate the table against. Not used by the base
        implementation.

    Raises
    ------
    NotImplementedError
        Always, because the base class does not implement validation.

    """
    # The exception must be raised, not merely instantiated; otherwise a
    # subclass that forgets to override this method silently "passes".
    raise NotImplementedError("Base class does not implement this method.")

Check warning on line 72 in src/memote/experimental/experimental_base.py

View check run for this annotation

Codecov / codecov/patch

src/memote/experimental/experimental_base.py#L72

Added line #L72 was not covered by tests

@staticmethod
def evaluate_report(report):
Expand Down
55 changes: 36 additions & 19 deletions src/memote/experimental/growth.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@
from __future__ import absolute_import

import logging
from typing import Optional

import pandera as pa
from pandas import DataFrame
from pandera.typing import Series

from memote.experimental.experiment import Experiment

Expand All @@ -31,11 +34,39 @@
LOGGER = logging.getLogger(__name__)


class GrowthExperimentModel(pa.DataFrameModel):
    # Declarative pandera schema describing a growth experiment data table.
    #
    # Columns
    #   exchange : string identifier of an exchange reaction.
    #   uptake   : float uptake rate, bounded to the closed range [0, 1000].
    #   growth   : boolean flag stating whether growth was observed.
    #   comment  : optional free-text column that may contain missing values.

    exchange: Series[str] = pa.Field(
        title="Exchange Reaction Identifier",
        description=(
            "The exchange reaction identifier of the variable medium component. "
            "Typically, this is a carbon source which will be added to a "
            "configured base medium."
        ),
    )
    uptake: Series[float] = pa.Field(
        title="Uptake Rate",
        description=(
            "The uptake rate for the exchange reaction. "
            "For models following common practices this modifies the lower bound."
        ),
        ge=0.0,
        le=1000.0,
    )
    growth: Series[bool] = pa.Field(
        description=(
            "A binary indicator whether growth was observed according to "
            "the processed biolog data."
        ),
        title="Growth",
    )
    comment: Optional[Series[str]] = pa.Field(
        title="Comment",
        description="Optional comment which is not processed further.",
        nullable=True,
    )

    class Config:
        # Schema-wide pandera options: ``strict = "filter"`` selects pandera's
        # column-filtering mode and ``coerce`` requests casting of the columns
        # to the dtypes annotated above.
        strict = "filter"
        coerce = True


class GrowthExperiment(Experiment):
"""Represent a growth experiment."""

SCHEMA = "growth.json"

def __init__(self, **kwargs):
"""
Initialize a growth experiment.
Expand All @@ -47,23 +78,9 @@ def __init__(self, **kwargs):
"""
super(GrowthExperiment, self).__init__(**kwargs)

def load(self, dtype_conversion=None):
"""
Load the data table and corresponding validation schema.

Parameters
----------
dtype_conversion : dict
Column names as keys and corresponding type for loading the data.
Please take a look at the `pandas documentation
<https://pandas.pydata.org/pandas-docs/stable/io.html#specifying-column-data-types>`__
for detailed explanations.

"""
if dtype_conversion is None:
dtype_conversion = {"growth": str}
super(GrowthExperiment, self).load(dtype_conversion=dtype_conversion)
self.data["growth"] = self.data["growth"].isin(self.TRUTHY)
def validate(self, model):
    """
    Use a defined schema to validate the growth table format.

    Parameters
    ----------
    model : object
        Not used by this method; accepted so that the signature matches
        the ``validate`` methods of the other experiment types.

    Raises
    ------
    pandera.errors.SchemaErrors
        Raised by pandera when the table violates the schema (all
        violations are collected because validation is lazy).

    """
    # Pandera validates a copy by default. Keep the returned frame so that
    # the schema's coercion and column filtering actually apply to
    # ``self.data`` instead of being thrown away.
    self.data = GrowthExperimentModel.validate(self.data, lazy=True)

def evaluate(self, model):
"""Evaluate in silico growth rates."""
Expand Down
Loading