Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
181 commits
Select commit Hold shift + click to select a range
e37135c
Added the pseaac encoding algorithm
satvshr Jul 7, 2025
6ea5ff7
Added Aptanet implementation
satvshr Jul 7, 2025
a5f01e0
Made pseaac to a class and made the functions private, still working …
satvshr Jul 7, 2025
3773a90
Made a few readability changes
satvshr Jul 7, 2025
9b9a3da
Edited tests
satvshr Jul 8, 2025
2dfe0c7
Added pytest to tests
satvshr Jul 8, 2025
1e182d3
Added numpy style docstrings and ruff formatting
satvshr Jul 8, 2025
20d7e37
Added docstrings, made functions pvt and made code more clean
satvshr Jul 8, 2025
fc2f051
Removed AptaNet from root
satvshr Jul 9, 2025
62f6c42
Added example
satvshr Jul 9, 2025
848fc9b
Removed AptaNet from root
satvshr Jul 9, 2025
1515efe
Made requested changes
satvshr Jul 9, 2025
75d4efb
Merge branch 'main' into issue28
satvshr Jul 10, 2025
733f908
Made requested changes and updated tests
satvshr Jul 10, 2025
04ab599
Made suggested changes
satvshr Jul 11, 2025
dc78e44
Removed lint. from pyproject, will push it as a separate PR
satvshr Jul 11, 2025
c347988
Refactored code
satvshr Jul 11, 2025
d9537f4
Added pandas as a dependancy
satvshr Jul 11, 2025
1c46c55
Renamed parent folder name to put it in the same level as AptaNet
satvshr Jul 11, 2025
a716872
Merge remote-tracking branch 'origin/main' into issue13
satvshr Jul 13, 2025
7781441
Refactored code and made architecture flexible
satvshr Jul 14, 2025
e762cc8
Edited docstrings and directory structure
satvshr Jul 14, 2025
e844d4f
Merge branch 'main' into issue28
satvshr Jul 14, 2025
f9392ef
weird rename experiment
satvshr Jul 14, 2025
beb45ec
weird rename experiment pt. 2
satvshr Jul 14, 2025
d603d07
Made requested changes
satvshr Jul 14, 2025
6ecf576
Made requested changes
satvshr Jul 15, 2025
b91c511
Made requested changes
satvshr Jul 15, 2025
b2428b0
chore: dummy commit to retrigger CI
satvshr Jul 15, 2025
2982954
Added missing init file to utils
satvshr Jul 15, 2025
0b5b388
Made requested changes
satvshr Jul 16, 2025
d24c4d7
Merge branch 'main' into issue28
satvshr Jul 16, 2025
0cd72b7
Added requested changes
satvshr Jul 16, 2025
fabc7b4
Added requested changes
satvshr Jul 16, 2025
32633d3
Added info about prop groups in class docstring
satvshr Jul 16, 2025
2b08363
Merge branch 'main' into issue13
satvshr Jul 17, 2025
f502fed
Merge branch 'issue28' into issue13
satvshr Jul 17, 2025
056c08e
Merged issue28 to issue13
satvshr Jul 17, 2025
6136c39
Removed init method description
satvshr Jul 17, 2025
ae7d1fe
Removed init method description
satvshr Jul 17, 2025
f339a7b
Added tests and bug fixes
satvshr Jul 17, 2025
651d066
Added torch as a dependency
satvshr Jul 17, 2025
7991cc8
Added torch as a dependency
satvshr Jul 17, 2025
8f0f0ae
Added sklearn as a dependency
satvshr Jul 17, 2025
60ec8da
fixed test so that protein_seq length is above 30
satvshr Jul 17, 2025
88c0122
editing changes
satvshr Jul 17, 2025
b7a7349
Made requested changes
satvshr Jul 18, 2025
c14c0bb
Made requested changes
satvshr Jul 18, 2025
d1075a7
Added .vscode to .gitignore
satvshr Jul 22, 2025
839c3e5
Merge branch 'issue28' into issue13
satvshr Jul 22, 2025
19a9e98
Added metadata
satvshr Jul 22, 2025
945addc
Merge branch 'issue28' into issue13
satvshr Jul 22, 2025
3c1fa3a
Added architectural changes
satvshr Jul 23, 2025
6e6836e
Added pdb to string helper function
satvshr Jul 23, 2025
73773af
Made requested changes
satvshr Jul 24, 2025
e2449e0
removed deleted files
satvshr Jul 24, 2025
212d54b
Added architecture to support loaders along with a loader for PFOA
satvshr Jul 28, 2025
8a29c09
Merge branch 'main' into issue55
satvshr Jul 28, 2025
3b09533
Added tests
satvshr Jul 28, 2025
29e8ed9
Merge branch 'main' into issue9
satvshr Jul 28, 2025
9d9738f
Added tests
satvshr Jul 28, 2025
210f09c
Renamed test file
satvshr Jul 28, 2025
6fb1db5
Made requested changes
satvshr Jul 28, 2025
c57e66a
Merge branch 'issue9' into issue55
satvshr Jul 28, 2025
8c00454
Merge branch 'main' into issue13
satvshr Jul 29, 2025
2dbf5d5
Stop tracking .vscode folder
satvshr Jul 29, 2025
8767f11
Added docstrings
satvshr Jul 29, 2025
32d7673
Fixed bugs to ensure pytorch compatibility by type-casting to float32
satvshr Jul 29, 2025
428bf68
Saving work
satvshr Jul 29, 2025
f94cd6a
Added an improved test to test the pipeline instead
satvshr Jul 29, 2025
1ac72fc
Renamed function
satvshr Jul 29, 2025
8278c74
Merge branch 'issue9' into issue54
satvshr Jul 29, 2025
0d16e13
Stop tracking .vscode folder
satvshr Jul 29, 2025
a38f70b
Stop tracking .vscode folder
satvshr Jul 29, 2025
40ba9d3
Added 3eiy and its loader, standardized loader tests
satvshr Jul 30, 2025
fad2f97
Added 3eiy and its loader, standardized loader tests
satvshr Jul 30, 2025
4a17dac
Adds 1ghn instead of 3eiy
satvshr Jul 30, 2025
cace4bd
Adds 1ghn instead of 3eiy
satvshr Jul 30, 2025
3301a4c
Renamed ghn to gnh
satvshr Jul 30, 2025
f61fdbc
Merge branch 'issue65' into issue55
satvshr Jul 30, 2025
28de2a3
Used 1gnh in tests instead of pfoa since pfoa is not a protein
satvshr Jul 30, 2025
17855b3
Merge branch 'issue55' into issue54
satvshr Jul 30, 2025
c6ccb23
Moved utility function from pipeline to utils
satvshr Jul 30, 2025
8304148
Merge branch 'issue13' into issue54
satvshr Jul 30, 2025
e9711c0
Added pytho ndpendency because of skorch
satvshr Jul 31, 2025
bf20e3c
Merge branch 'issue13' into issue54
satvshr Jul 31, 2025
0955226
Added notebook to examples directory
satvshr Jul 31, 2025
e6e3c9b
Added first draft of notebook
satvshr Jul 31, 2025
7273082
Logging error
satvshr Jul 31, 2025
ad8db59
Fixed bug during fit
satvshr Jul 31, 2025
90ffc7e
Merge branch 'main' into issue54
satvshr Jul 31, 2025
fa869d5
Added requested changes
satvshr Aug 3, 2025
38ec2ca
Merge branch 'main' into issue13
fkiraly Aug 3, 2025
e0d0af3
Update pyproject.toml
fkiraly Aug 3, 2025
5a10d65
Update pyproject.toml
fkiraly Aug 3, 2025
0b04ce8
Merge branch 'main' into issue13
satvshr Aug 3, 2025
64fd01b
Changed workflow file to stop testing for python 3.13
satvshr Aug 3, 2025
eeef5d6
Added skorch as a dependency
satvshr Aug 3, 2025
144b405
bug fix
satvshr Aug 3, 2025
ae9a814
Removed 3.13 as a non dependency
satvshr Aug 3, 2025
c6fe37e
Merged main
satvshr Aug 3, 2025
e9846bc
Added init file
satvshr Aug 3, 2025
b80f145
Added skip test
satvshr Aug 4, 2025
eef91ff
Added test
satvshr Aug 4, 2025
bbdaf27
Merged with issue13
satvshr Aug 4, 2025
01090c5
Merge with main
satvshr Aug 4, 2025
6f04f11
Added init file
satvshr Aug 4, 2025
63226a5
Removed loader folder
satvshr Aug 4, 2025
9a74b71
fixed bug
satvshr Aug 4, 2025
4266183
Merged with issue55
satvshr Aug 4, 2025
ac6851c
Removed unecessary classes
satvshr Aug 4, 2025
3adb75a
Fixed some bugs and renames
satvshr Aug 4, 2025
2aeea11
Changed pipeline to a class.
satvshr Aug 5, 2025
912ec72
Updated to add optimizer
satvshr Aug 5, 2025
e29380a
Fixed pipeline bug
satvshr Aug 6, 2025
36f750d
docstring save commit
satvshr Aug 9, 2025
badfafe
Adding failures
satvshr Aug 9, 2025
66a1620
Changed file names and added 2 classes
satvshr Aug 9, 2025
98327d8
Trying to make tests pass
satvshr Aug 9, 2025
f264228
Tests test
satvshr Aug 9, 2025
4a3a622
Update _feature_classifier.py
satvshr Aug 9, 2025
11eef33
Update _feature_classifier.py
satvshr Aug 10, 2025
1b3e178
Update _feature_classifier.py
satvshr Aug 10, 2025
28434c9
Update _feature_classifier.py
satvshr Aug 10, 2025
67ce5bb
Added docstrings back
satvshr Aug 10, 2025
1988408
Spacing for lists
satvshr Aug 10, 2025
1cd198b
Merge branch 'issue13' into issue54
satvshr Aug 11, 2025
cc64583
Update _feature_classifier.py
satvshr Aug 11, 2025
355718e
Merge branch 'issue13' into issue54
satvshr Aug 11, 2025
9f94386
Update aptanet_tutorial.ipynb
satvshr Aug 11, 2025
08297c0
Update aptanet_tutorial.ipynb
satvshr Aug 11, 2025
bc8355f
Update aptanet_tutorial.ipynb
satvshr Aug 11, 2025
5882b24
Update aptanet_tutorial.ipynb
satvshr Aug 11, 2025
06a44a4
Merge branch 'main' into issue13
satvshr Aug 12, 2025
183c48b
Made requested and architectural changes
satvshr Aug 12, 2025
4a8f9dd
Merge branch 'issue13' into issue54
satvshr Aug 12, 2025
492e054
Merge branch 'main' into issue54
fkiraly Aug 14, 2025
e5b1ddd
Update pyproject.toml
satvshr Aug 14, 2025
84eefa1
Delete one_gnh.py
satvshr Aug 15, 2025
1c78a4b
Update pyproject.toml
satvshr Aug 17, 2025
ea608f4
Merge branch 'main' into issue54
satvshr Aug 19, 2025
d9a5df9
Update aptanet_tutorial.ipynb
satvshr Aug 19, 2025
55dcb86
Added workflows
satvshr Aug 20, 2025
96e6f54
Merge branch 'main' into issue109
satvshr Aug 22, 2025
d7da265
Merge branch 'main' into issue109
satvshr Aug 22, 2025
bce4cd1
Getting aptanet ready
satvshr Aug 23, 2025
98a3a1f
Initial setup
satvshr Sep 1, 2025
57a113f
Added metaclass to ensre no new public methods, made some other progr…
satvshr Sep 3, 2025
3d1f930
Merge branch 'main' into issue109
satvshr Sep 3, 2025
1f400b0
Progress on Preprocessors, AptaNetPreprocessor seems completed
satvshr Sep 3, 2025
8b46667
Continued improving on benchmarking and preprocessing
satvshr Sep 4, 2025
b91eaad
bug fixing and improvements
satvshr Sep 4, 2025
5e3f4d6
seems to be working, very slow though
satvshr Sep 4, 2025
0e79b7f
Update _aptanet_utils.py
satvshr Sep 6, 2025
f11853d
merge with main
satvshr Sep 9, 2025
8d875e2
reset main
satvshr Sep 9, 2025
effb111
reset pt2
satvshr Sep 9, 2025
2dcc61f
AptaNet bug fix
satvshr Sep 9, 2025
15029ad
Update test_aptanet.py
satvshr Sep 9, 2025
2fe0e9d
Update _base.py
satvshr Sep 10, 2025
e62a32b
docstring changes
satvshr Sep 11, 2025
45094f6
Update _base.py
satvshr Sep 12, 2025
e38770e
Update _base.py
satvshr Sep 15, 2025
aa9677b
Update _base.py
satvshr Sep 15, 2025
b5724dc
Merge branch 'main' into issue109
satvshr Sep 15, 2025
7e25cd4
Added sklearn-like csv loader
satvshr Sep 16, 2025
0466310
Making requested changes
satvshr Sep 16, 2025
efb6fcf
cleaned up code
satvshr Sep 17, 2025
022d748
Update _base.py
satvshr Sep 17, 2025
63ddaf6
Update test_csv_loader.py
satvshr Sep 17, 2025
2c06084
Update _base.py
satvshr Sep 21, 2025
7027d47
Update _base.py
satvshr Sep 21, 2025
ec96bba
cleaning code remove tag checks
satvshr Sep 21, 2025
d6111a9
Update _base.py
satvshr Sep 21, 2025
2e2d71f
Test suite added and bugs fixed
satvshr Sep 21, 2025
90af1ee
arg name fixing
satvshr Sep 21, 2025
971ae29
Update _csv_loader.py
satvshr Sep 22, 2025
fe19150
Update test_csv_loader.py
satvshr Sep 22, 2025
bb80a81
Merge branch 'main' into issue109
satvshr Sep 29, 2025
18833f0
fixed docstring and example
satvshr Sep 29, 2025
84a2754
Update _aptanet_utils.py
satvshr Sep 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/aptanet_tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -554,7 +554,7 @@
" ]\n",
")\n",
"\n",
"pipeline = AptaNetPipeline(classifier=model)"
"pipeline = AptaNetPipeline(estimator=model)"
]
},
{
Expand Down
17 changes: 9 additions & 8 deletions pyaptamer/aptanet/_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
__all__ = ["AptaNetPipeline"]
__required__ = ["python>=3.9,<3.13"]

from skbase.base import BaseObject
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
Expand All @@ -11,7 +12,7 @@
from pyaptamer.utils._aptanet_utils import pairs_to_features


class AptaNetPipeline:
class AptaNetPipeline(BaseObject):
"""
AptaNet algorithm for aptamer–protein interaction prediction [1]_

Expand All @@ -22,14 +23,14 @@ class AptaNetPipeline:

The pipeline starts from string pairs, converts them into numeric features
(aptamer k-mer frequencies + protein PSeAAC), applies tree-based feature
selection, and feeds the result into the classifier.
selection, and feeds the result into the estimator.

Parameters
----------
k : int, optional, default=4
The k-mer size used to generate aptamer k-mer vectors.

classifier : sklearn-compatible estimator or None, default=None
estimator : sklearn-compatible estimator or None, default=None
Estimator applied after feature selection. If None, uses `AptaNetClassifier`.

Attributes
Expand Down Expand Up @@ -61,18 +62,18 @@ class AptaNetPipeline:
>>> preds = pipe.predict(X_test_pairs)
"""

def __init__(self, k=None, classifier=None):
def __init__(self, k=4, estimator=None):
self.k = k
self.classifier = classifier
self.estimator = estimator

def _build_pipeline(self):
transformer = FunctionTransformer(
func=pairs_to_features,
kw_args=self.k,
kw_args={"k": self.k},
validate=False,
)
self._classifier = self.classifier or AptaNetClassifier()
return Pipeline([("features", transformer), ("clf", clone(self._classifier))])
self._estimator = self.estimator or AptaNetClassifier()
return Pipeline([("features", transformer), ("clf", clone(self._estimator))])

def fit(self, X, y):
self.pipeline_ = self._build_pipeline()
Expand Down
4 changes: 2 additions & 2 deletions pyaptamer/aptanet/tests/test_aptanet.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_pipeline_fit_and_predict_classification(aptamer_seq, protein_seq):
Test if Pipeline predictions are valid class labels and shape matches input
for classification.
"""
pipe = AptaNetPipeline()
pipe = AptaNetPipeline(k=4)

X_raw = [(aptamer_seq, protein_seq) for _ in range(40)]
y = np.array([0] * 20 + [1] * 20, dtype=np.float32)
Expand All @@ -46,7 +46,7 @@ def test_pipeline_fit_and_predict_regression(aptamer_seq, protein_seq):
Test if Pipeline predictions are valid floats and shape matches input
for regression.
"""
pipe = AptaNetPipeline(classifier=AptaNetRegressor())
pipe = AptaNetPipeline(estimator=AptaNetRegressor())

X_raw = [(aptamer_seq, protein_seq) for _ in range(40)]
y = np.linspace(0, 1, 40).astype(np.float32)
Expand Down
5 changes: 5 additions & 0 deletions pyaptamer/benchmarking/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Benchmarking module."""

from pyaptamer.benchmarking._base import Benchmarking

__all__ = ["Benchmarking"]
154 changes: 154 additions & 0 deletions pyaptamer/benchmarking/_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
__author__ = "satvshr"
__all__ = ["Benchmarking"]

import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate


class Benchmarking:
    """
    Benchmark estimators using cross-validation.

    You can:

    - pass `X, y` (feature matrix and labels/targets) along with `cv`
      to use any cross-validation strategy;
    - if you want a fixed train/test split, pass a `PredefinedSplit`
      object as `cv`.

    Parameters
    ----------
    estimators : list[estimator] | tuple[estimator] | estimator
        One or more sklearn-like estimators implementing `fit` and `predict`.
        A single estimator is wrapped into a one-element list.
    metrics : list[callable] | tuple[callable] | callable
        One or more callables with signature `(y_true, y_pred) -> float`.
        A single callable is wrapped into a one-element list.
    X : array-like
        Feature matrix.
    y : array-like
        Target vector.
    cv : int, CV splitter, or None, default=None
        Cross-validation strategy. If `None`, defaults to 5-fold CV.
        If you want to use an explicit train/test split, pass a
        `PredefinedSplit` object.

    Attributes
    ----------
    results : pd.DataFrame or None
        `None` until :meth:`run` has been called, afterwards the DataFrame
        produced by :meth:`run`.

        - Index: pandas.MultiIndex with two levels (names shown in parentheses)
            - level 0 "estimator": estimator name
            - level 1 "metric": evaluator name
        - Columns: ["train", "test"] (both floats)
        - Cell values: mean scores (float) computed across CV folds:
            - "train" = mean of cross_validate(...)[f"train_{metric}"]
            - "test"  = mean of cross_validate(...)[f"test_{metric}"]
    scorers_ : dict
        Mapping from metric name to scorer, set by :meth:`run`.

    Notes
    -----
    Estimator names are the estimator class names and metric names are the
    callables' ``__name__`` (falling back to their class name). When several
    entries share a name, the second and later ones get a numeric suffix
    (``"Name"``, ``"Name_2"``, ``"Name_3"``, ...) so that no result row is
    silently overwritten.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.metrics import accuracy_score
    >>> from sklearn.model_selection import PredefinedSplit
    >>> from pyaptamer.benchmarking._base import Benchmarking
    >>> from pyaptamer.aptanet import AptaNetPipeline
    >>> aptamer_seq = "AGCTTAGCGTACAGCTTAAAAGGGTTTCCCCTGCCCGCGTAC"
    >>> protein_seq = "ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY"
    >>> # dataset: 20 aptamer–protein pairs
    >>> X = [(aptamer_seq, protein_seq) for _ in range(20)]
    >>> y = np.array([0] * 10 + [1] * 10, dtype=np.float32)
    >>> clf = AptaNetPipeline(k=4)
    >>> # define a fixed train/test split
    >>> test_fold = np.ones(len(y)) * -1
    >>> test_fold[-2:] = 0
    >>> cv = PredefinedSplit(test_fold)
    >>> bench = Benchmarking(
    ...     estimators=[clf],
    ...     metrics=[accuracy_score],
    ...     X=X,
    ...     y=y,
    ...     cv=cv,
    ... )
    >>> summary = bench.run()  # doctest: +SKIP
    """

    def __init__(self, estimators, metrics, X, y, cv=None):
        self.estimators = self._as_list(estimators)
        self.metrics = self._as_list(metrics)
        self.X = X
        self.y = y
        self.cv = cv
        self.results = None

    @staticmethod
    def _as_list(obj):
        """Return `obj` as a list: lists pass through, tuples are converted,
        anything else is treated as a single item."""
        if isinstance(obj, list):
            return obj
        if isinstance(obj, tuple):
            return list(obj)
        return [obj]

    @staticmethod
    def _dedupe_names(names):
        """Make `names` unique, preserving order.

        The first occurrence keeps its name; later duplicates receive the
        smallest free ``_<k>`` suffix starting at ``_2``.
        """
        used = set()
        unique = []
        for base in names:
            candidate = base
            suffix = 1
            while candidate in used:
                suffix += 1
                candidate = f"{base}_{suffix}"
            used.add(candidate)
            unique.append(candidate)
        return unique

    def _to_scorers(self, metrics):
        """Convert metric callables to a dict of scorers.

        Parameters
        ----------
        metrics : list[callable]
            Metric functions with signature `(y_true, y_pred) -> float`.

        Returns
        -------
        dict
            Maps a unique metric name to `make_scorer(metric)`.

        Raises
        ------
        ValueError
            If any entry of `metrics` is not callable.
        """
        raw_names = []
        for metric in metrics:
            if not callable(metric):
                raise ValueError("Each metric should be a callable.")
            # objects such as functools.partial have no __name__
            raw_names.append(getattr(metric, "__name__", metric.__class__.__name__))

        # without de-duplication two metrics with the same name would collapse
        # into a single dict entry
        names = self._dedupe_names(raw_names)
        return {name: make_scorer(metric) for name, metric in zip(names, metrics)}

    def _to_df(self, results):
        """Convert nested results to a unified DataFrame.

        Parameters
        ----------
        results : dict
            ``{estimator_name: {metric_name: {"train": float, "test": float}}}``.

        Returns
        -------
        pd.DataFrame
            One row per (estimator, metric) pair, columns ["train", "test"].
            An empty `results` yields an empty frame with the same layout.
        """
        columns = ["train", "test"]
        level_names = ["estimator", "metric"]

        rows = [
            ((est_name, metric_name), scores)
            for est_name, est_scores in results.items()
            for metric_name, scores in est_scores.items()
        ]

        if not rows:
            # MultiIndex.from_tuples cannot infer the number of levels from
            # an empty list, so build the empty index explicitly
            index = pd.MultiIndex.from_arrays([[], []], names=level_names)
            return pd.DataFrame(index=index, columns=columns, dtype=float)

        index = pd.MultiIndex.from_tuples(
            [key for key, _ in rows], names=level_names
        )
        return pd.DataFrame(
            [scores for _, scores in rows], index=index, columns=columns
        )

    def run(self):
        """
        Train each estimator and evaluate with cross-validation.

        Returns
        -------
        results : pd.DataFrame

            - Index: pandas.MultiIndex with two levels (names shown in parentheses)
                - level 0 "estimator": estimator name
                - level 1 "metric": evaluator name
            - Columns: ["train", "test"] (both floats)
            - Cell values: mean scores (float) computed across CV folds:
                - "train" = mean of cross_validate(...)[f"train_{metric}"]
                - "test"  = mean of cross_validate(...)[f"test_{metric}"]

        """
        self.scorers_ = self._to_scorers(self.metrics)

        # unique names so that several estimators of the same class each keep
        # their own row instead of overwriting one another
        est_names = self._dedupe_names(
            [estimator.__class__.__name__ for estimator in self.estimators]
        )

        results = {}
        for est_name, estimator in zip(est_names, self.estimators):
            cv_results = cross_validate(
                estimator,
                self.X,
                self.y,
                cv=self.cv,
                scoring=self.scorers_,
                return_train_score=True,
            )

            # average across folds
            results[est_name] = {
                metric: {
                    "train": float(np.mean(cv_results[f"train_{metric}"])),
                    "test": float(np.mean(cv_results[f"test_{metric}"])),
                }
                for metric in self.scorers_
            }

        self.results = self._to_df(results)
        return self.results
1 change: 1 addition & 0 deletions pyaptamer/benchmarking/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Test suite for the benchmarking module"""
78 changes: 78 additions & 0 deletions pyaptamer/benchmarking/tests/test_benchmarking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import sys

import numpy as np
import pytest
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import PredefinedSplit

from pyaptamer.aptanet import AptaNetPipeline, AptaNetRegressor
from pyaptamer.benchmarking._base import Benchmarking

# (aptamer_seq, protein_seq) cases fed to the parametrized tests below;
# currently a single aptamer/protein sequence pair.
params = [
(
"AGCTTAGCGTACAGCTTAAAAGGGTTTCCCCTGCCCGCGTAC",
"ACDEFGHIKLMNPQRSTVWYACDEFGHIKLMNPQRSTVWY",
)
]


@pytest.mark.skipif(
    sys.version_info >= (3, 13), reason="skorch does not support Python 3.13"
)
@pytest.mark.parametrize("aptamer_seq, protein_seq", params)
def test_benchmarking_with_predefined_split_classification(aptamer_seq, protein_seq):
    """
    Run Benchmarking on a binary classification problem with a fixed split.

    The same (aptamer, protein) pair is repeated 40 times with labels
    20 x 0 followed by 20 x 1; the last two samples form the test fold.
    """
    n_samples = 40
    pairs = [(aptamer_seq, protein_seq) for _ in range(n_samples)]
    labels = np.array([0] * 20 + [1] * 20, dtype=np.float32)

    pipeline = AptaNetPipeline()

    # -1 marks "always in training"; fold 0 is the single held-out test fold
    fold_ids = np.full(len(labels), -1, dtype=int)
    fold_ids[-2:] = 0
    splitter = PredefinedSplit(fold_ids)

    summary = Benchmarking(
        estimators=[pipeline],
        metrics=[accuracy_score],
        X=pairs,
        y=labels,
        cv=splitter,
    ).run()

    for column in ("train", "test"):
        assert column in summary.columns
    expected_row = (pipeline.__class__.__name__, "accuracy_score")
    assert expected_row in summary.index


@pytest.mark.skipif(
    sys.version_info >= (3, 13), reason="skorch does not support Python 3.13"
)
@pytest.mark.parametrize("aptamer_seq, protein_seq", params)
def test_benchmarking_with_predefined_split_regression(aptamer_seq, protein_seq):
    """
    Run Benchmarking on a regression problem with a fixed split.

    The same (aptamer, protein) pair is repeated 40 times with targets
    evenly spaced in [0, 1]; the last three samples form the test fold.
    """
    n_samples = 40
    pairs = [(aptamer_seq, protein_seq) for _ in range(n_samples)]
    targets = np.linspace(0, 1, n_samples).astype(np.float32)

    regressor = AptaNetPipeline(estimator=AptaNetRegressor())

    # -1 marks "always in training"; fold 0 is the single held-out test fold
    fold_ids = np.full(len(targets), -1, dtype=int)
    fold_ids[-3:] = 0
    splitter = PredefinedSplit(fold_ids)

    summary = Benchmarking(
        estimators=[regressor],
        metrics=[mean_squared_error],
        X=pairs,
        y=targets,
        cv=splitter,
    ).run()

    for column in ("train", "test"):
        assert column in summary.columns
    expected_row = (regressor.__class__.__name__, "mean_squared_error")
    assert expected_row in summary.index
2 changes: 2 additions & 0 deletions pyaptamer/datasets/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Contains datasets along with their loaders."""

from pyaptamer.datasets._loaders._csv_loader import load_csv_dataset
from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure
from pyaptamer.datasets._loaders._online_databank import load_from_rcsb
from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure
Expand All @@ -8,4 +9,5 @@
"load_pfoa_structure",
"load_1gnh_structure",
"load_from_rcsb",
"load_csv_dataset",
]
3 changes: 2 additions & 1 deletion pyaptamer/datasets/_loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Loaders for different data structures."""

from pyaptamer.datasets._loaders._csv_loader import load_csv_dataset
from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure
from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure

__all__ = ["load_pfoa_structure", "load_1gnh_structure"]
__all__ = ["load_pfoa_structure", "load_1gnh_structure", "load_csv_dataset"]
43 changes: 43 additions & 0 deletions pyaptamer/datasets/_loaders/_csv_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
__author__ = ["satvshr"]
__all__ = ["load_csv_dataset"]

import os

import pandas as pd


def load_csv_dataset(name, target_col, return_X_y=False):
    """
    Load a dataset from a CSV file in DataFrame format.

    Parameters
    ----------
    name : str
        Name of the dataset (file basename without `.csv`) located in the
        package `datasets/data/` directory.
    target_col : str
        Column name in the CSV to use as the target variable. Only used when
        `return_X_y` is True.
    return_X_y : bool, optional, default=False
        If True, return (X_df, y_df) as pandas DataFrames.
        If False, return the full DataFrame (features + target).

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        If `return_X_y` is False, returns the full DataFrame with all columns.
        If `return_X_y` is True, returns:
        - X_df : pd.DataFrame of shape (n_samples, n_features)
        - y_df : pd.DataFrame of shape (n_samples, 1)

    Raises
    ------
    FileNotFoundError
        If no `<name>.csv` exists in the data directory.
    KeyError
        If `return_X_y` is True and `target_col` is not a column of the CSV.
    """
    # Resolve the data directory relative to this module as an absolute path.
    # os.path.relpath would make the result depend on the current working
    # directory and raises ValueError on Windows when the package and the
    # working directory live on different drives.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.normpath(
        os.path.join(module_dir, os.pardir, "data", f"{name}.csv")
    )

    df = pd.read_csv(path)

    if not return_X_y:
        return df

    if target_col not in df.columns:
        raise KeyError(
            f"Target column {target_col!r} not found in dataset {name!r}; "
            f"available columns: {list(df.columns)}"
        )

    X_df = df.drop(columns=[target_col])
    # double brackets keep the target as a one-column DataFrame, not a Series
    y_df = df[[target_col]]
    return X_df, y_df
Loading