From 389dd07d8612dda30936bff68fc4ffc150ed6657 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Fri, 26 Sep 2025 01:19:24 +0530 Subject: [PATCH 1/9] Add AptaDB loader --- pyaptamer/datasets/__init__.py | 5 +- pyaptamer/datasets/_loaders/__init__.py | 3 +- .../_loaders/load_aptamer_interactions.py | 118 ++++++++++++++++++ .../tests/test_aptamer_interactions_loader.py | 115 +++++++++++++++++ 4 files changed, 238 insertions(+), 3 deletions(-) create mode 100644 pyaptamer/datasets/_loaders/load_aptamer_interactions.py create mode 100644 pyaptamer/datasets/tests/test_aptamer_interactions_loader.py diff --git a/pyaptamer/datasets/__init__.py b/pyaptamer/datasets/__init__.py index 41b5fa67..c99715eb 100644 --- a/pyaptamer/datasets/__init__.py +++ b/pyaptamer/datasets/__init__.py @@ -1,11 +1,12 @@ """Contains datasets along with their loaders.""" from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure -from pyaptamer.datasets._loaders._online_databank import load_from_rcsb from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure +from pyaptamer.datasets._loaders.load_aptamer_interactions import load_aptadb, load_interactions __all__ = [ "load_pfoa_structure", "load_1gnh_structure", - "load_from_rcsb", + "load_aptadb", + "load_interactions", ] diff --git a/pyaptamer/datasets/_loaders/__init__.py b/pyaptamer/datasets/_loaders/__init__.py index 5fdc4143..90cc5cf8 100644 --- a/pyaptamer/datasets/_loaders/__init__.py +++ b/pyaptamer/datasets/_loaders/__init__.py @@ -2,5 +2,6 @@ from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure +from pyaptamer.datasets._loaders.load_aptamer_interactions import load_aptadb, load_interactions -__all__ = ["load_pfoa_structure", "load_1gnh_structure"] +__all__ = ["load_pfoa_structure", "load_1gnh_structure", "load_aptadb", "load_interactions"] \ No newline at end of file diff --git a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py new file mode 100644 index 00000000..05caa3ba --- /dev/null +++ b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py @@ -0,0 +1,118 @@ +__author__ = "Satarupa22-SD" +__all__ = ["load_aptadb", "load_aptamer_interactions", "load_interactions"] + +from pathlib import Path +from typing import Optional, Union + +import pandas as pd + + +def download_dataset(dataset_name: str, target_dir: Path, force_download: bool = False): + """Download dataset_name into target_dir using Kaggle API and unzip there.""" + import kaggle # avoid import-time auth + target_dir.mkdir(parents=True, exist_ok=True) + if force_download or not any(target_dir.glob("*.csv")): + kaggle.api.dataset_download_files(dataset_name, path=str(target_dir), unzip=True) + + +def find_csv(directory: Path): + csv_files = list(directory.glob("*.csv")) + if not csv_files: + return None + if len(csv_files) == 1: + return csv_files[0] + candidates = [ + f for f in csv_files + if any(t in f.name.lower() for t in ["aptamer", "interaction", "main", "data"]) + ] + return candidates[0] if candidates else csv_files[0] + + + + + +def normalize_interaction_present(df: pd.DataFrame) -> None: + return + + +def load_aptamer_interactions( + path: Union[str, Path], + *, + encoding: Optional[str] = None, + **read_csv_kwargs, +): + """ + Load AptaDB-style CSV into a pandas.DataFrame. + + Parameters + ---------- + path : str | Path + Path to the CSV file. + encoding : str | None + Specific file encoding. 
If None, several encodings are tried. + **read_csv_kwargs : Any + Additional arguments forwarded to pandas.read_csv. + """ + candidate_encodings = [ + "utf-8", + "utf-8-sig", + "latin-1", + "cp1252", + "windows-1252", + ] if encoding is None else [encoding] + last_error: Optional[Exception] = None + for enc in candidate_encodings: + try: + df = pd.read_csv(path, encoding=enc, **read_csv_kwargs) + return df + except Exception as e: + last_error = e + continue + # If all encodings failed, raise the last error + raise RuntimeError(f"Failed to read CSV at {path} with candidate encodings {candidate_encodings}: {last_error}") + + +def load_interactions( + path: Union[str, Path], + *, + encoding: Optional[str] = None, + **read_csv_kwargs, +): + """Simple alias for load_aptamer_interactions.""" + return load_aptamer_interactions( + path=path, + encoding=encoding, + **read_csv_kwargs, + ) + + +def load_aptadb( + dataset_name: str = "satarupadeb/aptamer-interactions", + cache_dir: Optional[Union[str, Path]] = None, + force_download: bool = False, + *, + encoding: Optional[str] = None, + **kwargs, +): + """ + Download (optional) and load the aptamer-interactions Kaggle dataset as pandas.DataFrame. + """ + cache_dir = ( + Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_") + if cache_dir is None else Path(cache_dir) + ) + + csv_file = find_csv(cache_dir) if cache_dir.exists() else None + if csv_file is None: + try: + download_dataset(dataset_name, cache_dir, force_download=force_download) + except ImportError: + # ImportError for tests and clear messaging when kaggle is missing + raise + except Exception as e: + raise RuntimeError(f"Failed to download dataset '{dataset_name}' from Kaggle: {e}") from e + csv_file = find_csv(cache_dir) + if csv_file is None: + raise FileNotFoundError(f"No CSV found in downloaded Kaggle dataset at {cache_dir}") + + return load_aptamer_interactions(path=str(csv_file), encoding=encoding, **kwargs) diff --git a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py new file mode 100644 index 00000000..6987deb1 --- /dev/null +++ b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py @@ -0,0 +1,115 @@ +__author__ = "Satarupa22-SD" + +import pytest +import pandas as pd +from pathlib import Path +from unittest.mock import patch + +from pyaptamer.datasets import load_aptadb +from pyaptamer.datasets._loaders.load_aptamer_interactions import ( + load_interactions, +) + + +def test_local_csv(tmp_path): + csv_path = tmp_path / "aptadb_sample.csv" + pd.DataFrame({ + "aptamer_id": ["APT001"], + "aptamer_sequence": ["AUGCUU"], + "target_name": ["Thrombin"], + "interaction_present": ["1"], + }).to_csv(csv_path, index=False) + + df = load_interactions(csv_path) + assert isinstance(df, pd.DataFrame) + assert not df.empty + assert df.loc[0, "aptamer_sequence"] == "AUGCUU" # unchanged + + +def test_uses_cache(tmp_path): + csv_path = tmp_path / "aptadb.csv" + pd.DataFrame({"aptamer_sequence": ["AUGU"], "target_name": ["X"]}).to_csv(csv_path, index=False) + + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions.find_csv", + return_value=csv_path, + ): + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions.download_dataset" + ) as mock_dl: + df = load_aptadb(cache_dir=tmp_path) + assert not df.empty + assert df.loc[0, "aptamer_sequence"] == "AUGU" + mock_dl.assert_not_called() + + +def test_requires_kaggle(tmp_path): + # Ensure no CSV present so a download would be 
attempted + with patch.dict("sys.modules", {"kaggle": None}): + with pytest.raises(ImportError): + load_aptadb(cache_dir=tmp_path) + + +def test_invalid_dataset(tmp_path): + # Force the download path and make it fail + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions.find_csv", + return_value=None, + ): + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions.download_dataset", + side_effect=Exception("boom"), + ): + with pytest.raises(RuntimeError, match=r"Failed to download dataset .* from Kaggle"): + load_aptadb("nonexistent/invalid-dataset", cache_dir=tmp_path) + + +@pytest.fixture +def sample_aptadb_data(): + return pd.DataFrame({ + 'aptamer_id': ['APT001', 'APT002', 'APT003'], + 'target_id': ['TGT001', 'TGT002', 'TGT003'], + 'aptamer_sequence': ['ATCGATCGATCGATCG', 'GCTAGCTAGCTAGCTA', 'TTAACCGGTTAACCGG'], + 'target_name': ['Thrombin', 'VEGF', 'Lysozyme'], + 'target_uniprot': ['P00734', 'P15692', 'P61626'], + 'organism': ['Homo sapiens', 'Homo sapiens', 'Gallus gallus'], + 'ligand_type': ['Protein', 'Protein', 'Protein'], + 'binding_conditions': ['pH 7.4, 25°C', 'pH 7.0, 37°C', 'pH 8.0, 25°C'], + 'reference_pubmed_id': ['12345678', '87654321', '11223344'], + 'interaction_present': [True, True, False] + }) + + +def test_sample_columns(sample_aptadb_data): + df = sample_aptadb_data + assert isinstance(df, pd.DataFrame) + assert len(df) == 3 + + expected_columns = [ + 'aptamer_id', 'target_id', 'aptamer_sequence', 'target_name', + 'target_uniprot', 'organism', 'ligand_type', 'binding_conditions', + 'reference_pubmed_id', 'interaction_present' + ] + + for col in expected_columns: + assert col in df.columns, f"Expected column '{col}' not found in dataset" + + assert df['aptamer_sequence'].dtype == 'object' + assert df['target_name'].dtype == 'object' + + +@pytest.mark.slow +def test_cache_consistency(tmp_path): + # This test verifies that two consecutive calls yield same DataFrame when using cache. + # It still avoids network by seeding the cache with a local CSV. 
+ csv_path = tmp_path / "aptadb.csv" + seeded = pd.DataFrame({"aptamer_sequence": ["AU"], "target_name": ["X"], "interaction_present": [0]}) + seeded.to_csv(csv_path, index=False) + + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions.find_csv", + return_value=csv_path, + ): + df1 = load_aptadb(cache_dir=tmp_path) + df2 = load_aptadb(cache_dir=tmp_path) + pd.testing.assert_frame_equal(df1, df2) \ No newline at end of file From 9eceaaa94df965d9814f35615914a862236e29e1 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Fri, 26 Sep 2025 01:44:06 +0530 Subject: [PATCH 2/9] Update datasets __init__.py --- pyaptamer/datasets/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyaptamer/datasets/__init__.py b/pyaptamer/datasets/__init__.py index c99715eb..05f8a75d 100644 --- a/pyaptamer/datasets/__init__.py +++ b/pyaptamer/datasets/__init__.py @@ -2,11 +2,13 @@ from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure +from pyaptamer.datasets._loaders._online_databank import load_from_rcsb from pyaptamer.datasets._loaders.load_aptamer_interactions import load_aptadb, load_interactions __all__ = [ "load_pfoa_structure", "load_1gnh_structure", "load_aptadb", + "load_from_rcsb", "load_interactions", ] From 478087db78af65edf67491be52e2df39452185d5 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Fri, 26 Sep 2025 01:53:21 +0530 Subject: [PATCH 3/9] Update test --- pyaptamer/datasets/tests/test_aptamer_interactions_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py index 6987deb1..4cbf5ada 100644 --- a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py +++ b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py @@ -76,7 +76,7 @@ def sample_aptadb_data(): 'ligand_type': ['Protein', 'Protein', 'Protein'], 'binding_conditions': ['pH 7.4, 25°C', 'pH 7.0, 37°C', 'pH 8.0, 25°C'], 'reference_pubmed_id': ['12345678', '87654321', '11223344'], - 'interaction_present': [True, True, False] + 'interaction_present': [1, 1, 0] }) From 18730037475abda6a52f8c51537ca1cd3922d827 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Sun, 5 Oct 2025 01:15:16 +0530 Subject: [PATCH 4/9] Add docstrings and fix linting issues for aptadb loader --- pyaptamer/datasets/__init__.py | 7 +- pyaptamer/datasets/_loaders/__init__.py | 12 +- .../_loaders/load_aptamer_interactions.py | 221 +++++++--- .../tests/test_aptamer_interactions_loader.py | 399 +++++++++++++----- 4 files changed, 477 insertions(+), 162 deletions(-) diff --git a/pyaptamer/datasets/__init__.py b/pyaptamer/datasets/__init__.py index 05f8a75d..de156dad 100644 --- a/pyaptamer/datasets/__init__.py +++ b/pyaptamer/datasets/__init__.py @@ -1,9 +1,12 @@ """Contains datasets along with their loaders.""" from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure -from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure from pyaptamer.datasets._loaders._online_databank import load_from_rcsb -from pyaptamer.datasets._loaders.load_aptamer_interactions import load_aptadb, load_interactions +from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure +from pyaptamer.datasets._loaders.load_aptamer_interactions import ( + load_aptadb, + load_interactions, +) __all__ = [ "load_pfoa_structure", diff --git a/pyaptamer/datasets/_loaders/__init__.py 
b/pyaptamer/datasets/_loaders/__init__.py index 90cc5cf8..9728696b 100644 --- a/pyaptamer/datasets/_loaders/__init__.py +++ b/pyaptamer/datasets/_loaders/__init__.py @@ -2,6 +2,14 @@ from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure -from pyaptamer.datasets._loaders.load_aptamer_interactions import load_aptadb, load_interactions +from pyaptamer.datasets._loaders.load_aptamer_interactions import ( + load_aptadb, + load_interactions, +) -__all__ = ["load_pfoa_structure", "load_1gnh_structure", "load_aptadb", "load_interactions"] \ No newline at end of file +__all__ = [ + "load_pfoa_structure", + "load_1gnh_structure", + "load_aptadb", + "load_interactions", +] diff --git a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py index 05caa3ba..25f519c1 100644 --- a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py +++ b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py @@ -2,65 +2,166 @@ __all__ = ["load_aptadb", "load_aptamer_interactions", "load_interactions"] from pathlib import Path -from typing import Optional, Union import pandas as pd -def download_dataset(dataset_name: str, target_dir: Path, force_download: bool = False): - """Download dataset_name into target_dir using Kaggle API and unzip there.""" - import kaggle # avoid import-time auth +def _download_dataset( + dataset_name: str, target_dir: Path, force_download: bool = False +) -> None: + """Download a Kaggle dataset to the specified directory and unzip it. + + This is a private helper function used internally by the module. + + Parameters + ---------- + dataset_name : str + The Kaggle dataset identifier in format "username/dataset-name" + target_dir : Path + Directory where the dataset should be downloaded and extracted + force_download : bool, default False + If True, download even if CSV files already exist in target_dir + + Raises + ------ + ImportError + If the kaggle package is not installed + Exception + If the download fails for any reason + + Notes + ----- + This function requires the kaggle package to be installed and properly + configured with API credentials. + """ + import kaggle # avoid import-time auth + target_dir.mkdir(parents=True, exist_ok=True) + + # Only download if forced or no CSV files exist if force_download or not any(target_dir.glob("*.csv")): - kaggle.api.dataset_download_files(dataset_name, path=str(target_dir), unzip=True) + kaggle.api.dataset_download_files( + dataset_name, path=str(target_dir), unzip=True + ) + + +def _find_csv(directory: Path) -> Path | None: + """Find the most appropriate CSV file in a directory. + This is a private helper function that implements smart CSV file detection. -def find_csv(directory: Path): + Parameters + ---------- + directory : Path + Directory to search for CSV files + + Returns + ------- + Path or None + Path to the most appropriate CSV file, or None if no CSV files found + + Notes + ----- + Selection priority: + 1. If only one CSV file exists, return it + 2. If multiple CSV files exist, prefer files with names containing: + "aptamer", "interaction", "main", or "data" + 3. 
If no preferred names found, return the first CSV file + """ csv_files = list(directory.glob("*.csv")) + if not csv_files: return None + if len(csv_files) == 1: return csv_files[0] + + # Look for files with preferred keywords in their names + preferred_keywords = ["aptamer", "interaction", "main", "data"] candidates = [ - f for f in csv_files - if any(t in f.name.lower() for t in ["aptamer", "interaction", "main", "data"]) + f + for f in csv_files + if any(keyword in f.name.lower() for keyword in preferred_keywords) ] + return candidates[0] if candidates else csv_files[0] +def _normalize_interaction_present(df: pd.DataFrame) -> None: + """Normalize interaction present column in the dataset. + This is a private helper function for data preprocessing. + Currently a placeholder for future implementation. + Parameters + ---------- + df : pd.DataFrame + The dataframe to normalize -def normalize_interaction_present(df: pd.DataFrame) -> None: + Notes + ----- + This function is currently not implemented and serves as a placeholder + for future data normalization functionality. + """ + # TODO: Implement interaction present normalization return def load_aptamer_interactions( - path: Union[str, Path], + path: str | Path, *, - encoding: Optional[str] = None, + encoding: str | None = None, **read_csv_kwargs, -): - """ - Load AptaDB-style CSV into a pandas.DataFrame. +) -> pd.DataFrame: + """Load an aptamer interactions CSV file into a pandas DataFrame. + + This function provides robust CSV loading with automatic encoding detection + and error handling for various file formats commonly found in biological + datasets. Parameters ---------- - path : str | Path - Path to the CSV file. - encoding : str | None - Specific file encoding. If None, several encodings are tried. - **read_csv_kwargs : Any - Additional arguments forwarded to pandas.read_csv. + path : str or Path + Path to the CSV file containing aptamer interaction data + encoding : str, optional + Specific file encoding to use. 
If None (default), multiple common + encodings will be tried automatically + **read_csv_kwargs + Additional keyword arguments passed directly to pandas.read_csv() + + Returns + ------- + pd.DataFrame + DataFrame containing the loaded aptamer interaction data + + Raises + ------ + RuntimeError + If the CSV file cannot be read with any of the attempted encodings + + Notes + ----- + The function attempts the following encodings in order: + - utf-8 + - utf-8-sig (for files with BOM) + - latin-1 + - cp1252 + - windows-1252 """ - candidate_encodings = [ - "utf-8", - "utf-8-sig", - "latin-1", - "cp1252", - "windows-1252", - ] if encoding is None else [encoding] - last_error: Optional[Exception] = None + candidate_encodings = ( + [ + "utf-8", + "utf-8-sig", # For files with byte order mark + "latin-1", + "cp1252", # Common Windows encoding + "windows-1252", # Alternative Windows encoding + ] + if encoding is None + else [encoding] + ) + + last_error: Exception | None = None + for enc in candidate_encodings: try: df = pd.read_csv(path, encoding=enc, **read_csv_kwargs) @@ -68,17 +169,24 @@ def load_aptamer_interactions( except Exception as e: last_error = e continue - # If all encodings failed, raise the last error - raise RuntimeError(f"Failed to read CSV at {path} with candidate encodings {candidate_encodings}: {last_error}") + + raise RuntimeError( + f"Failed to read CSV at {path} with candidate encodings " + f"{candidate_encodings}: {last_error}" + ) def load_interactions( - path: Union[str, Path], + path: str | Path, *, - encoding: Optional[str] = None, + encoding: str | None = None, **read_csv_kwargs, -): - """Simple alias for load_aptamer_interactions.""" +) -> pd.DataFrame: + """Load interaction data from a CSV file. + + This is a convenience alias for load_aptamer_interactions() with identical + functionality and parameters. + """ return load_aptamer_interactions( path=path, encoding=encoding, @@ -88,31 +196,40 @@ def load_interactions( def load_aptadb( dataset_name: str = "satarupadeb/aptamer-interactions", - cache_dir: Optional[Union[str, Path]] = None, + cache_dir: str | Path | None = None, force_download: bool = False, *, - encoding: Optional[str] = None, + encoding: str | None = None, **kwargs, -): - """ - Download (optional) and load the aptamer-interactions Kaggle dataset as pandas.DataFrame. - """ - cache_dir = ( - Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_") - if cache_dir is None else Path(cache_dir) - ) +) -> pd.DataFrame: + """Download and load aptamer-interactions Kaggle dataset as DataFrame.""" + if cache_dir is None: + cache_dir = ( + Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_") + ) + else: + cache_dir = Path(cache_dir) + + csv_file = _find_csv(cache_dir) if cache_dir.exists() else None - csv_file = find_csv(cache_dir) if cache_dir.exists() else None if csv_file is None: try: - download_dataset(dataset_name, cache_dir, force_download=force_download) - except ImportError: - # ImportError for tests and clear messaging when kaggle is missing - raise + _download_dataset(dataset_name, cache_dir, force_download=force_download) + except ImportError as err: + # Re-raise ImportError for clear messaging when kaggle is missing + raise ImportError( + "The 'kaggle' package is required to download datasets. 
" + "Install it with: pip install kaggle" + ) from err except Exception as e: - raise RuntimeError(f"Failed to download dataset '{dataset_name}' from Kaggle: {e}") from e - csv_file = find_csv(cache_dir) + raise RuntimeError( + f"Failed to download dataset '{dataset_name}' from Kaggle: {e}" + ) from e + + csv_file = _find_csv(cache_dir) if csv_file is None: - raise FileNotFoundError(f"No CSV found in downloaded Kaggle dataset at {cache_dir}") + raise FileNotFoundError( + f"No CSV files found in downloaded Kaggle dataset at {cache_dir}" + ) return load_aptamer_interactions(path=str(csv_file), encoding=encoding, **kwargs) diff --git a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py index 4cbf5ada..ee5a902a 100644 --- a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py +++ b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py @@ -1,115 +1,302 @@ __author__ = "Satarupa22-SD" +__all__ = ["load_aptadb", "load_aptamer_interactions", "load_interactions"] -import pytest -import pandas as pd from pathlib import Path -from unittest.mock import patch - -from pyaptamer.datasets import load_aptadb -from pyaptamer.datasets._loaders.load_aptamer_interactions import ( - load_interactions, -) - - -def test_local_csv(tmp_path): - csv_path = tmp_path / "aptadb_sample.csv" - pd.DataFrame({ - "aptamer_id": ["APT001"], - "aptamer_sequence": ["AUGCUU"], - "target_name": ["Thrombin"], - "interaction_present": ["1"], - }).to_csv(csv_path, index=False) - - df = load_interactions(csv_path) - assert isinstance(df, pd.DataFrame) - assert not df.empty - assert df.loc[0, "aptamer_sequence"] == "AUGCUU" # unchanged - - -def test_uses_cache(tmp_path): - csv_path = tmp_path / "aptadb.csv" - pd.DataFrame({"aptamer_sequence": ["AUGU"], "target_name": ["X"]}).to_csv(csv_path, index=False) - - with patch( - "pyaptamer.datasets._loaders.load_aptamer_interactions.find_csv", - return_value=csv_path, - ): - with patch( - "pyaptamer.datasets._loaders.load_aptamer_interactions.download_dataset" - ) as mock_dl: - df = load_aptadb(cache_dir=tmp_path) - assert not df.empty - assert df.loc[0, "aptamer_sequence"] == "AUGU" - mock_dl.assert_not_called() - - -def test_requires_kaggle(tmp_path): - # Ensure no CSV present so a download would be attempted - with patch.dict("sys.modules", {"kaggle": None}): - with pytest.raises(ImportError): - load_aptadb(cache_dir=tmp_path) - - -def test_invalid_dataset(tmp_path): - # Force the download path and make it fail - with patch( - "pyaptamer.datasets._loaders.load_aptamer_interactions.find_csv", - return_value=None, - ): - with patch( - "pyaptamer.datasets._loaders.load_aptamer_interactions.download_dataset", - side_effect=Exception("boom"), - ): - with pytest.raises(RuntimeError, match=r"Failed to download dataset .* from Kaggle"): - load_aptadb("nonexistent/invalid-dataset", cache_dir=tmp_path) - - -@pytest.fixture -def sample_aptadb_data(): - return pd.DataFrame({ - 'aptamer_id': ['APT001', 'APT002', 'APT003'], - 'target_id': ['TGT001', 'TGT002', 'TGT003'], - 'aptamer_sequence': ['ATCGATCGATCGATCG', 'GCTAGCTAGCTAGCTA', 'TTAACCGGTTAACCGG'], - 'target_name': ['Thrombin', 'VEGF', 'Lysozyme'], - 'target_uniprot': ['P00734', 'P15692', 'P61626'], - 'organism': ['Homo sapiens', 'Homo sapiens', 'Gallus gallus'], - 'ligand_type': ['Protein', 'Protein', 'Protein'], - 'binding_conditions': ['pH 7.4, 25°C', 'pH 7.0, 37°C', 'pH 8.0, 25°C'], - 'reference_pubmed_id': ['12345678', '87654321', '11223344'], - 
'interaction_present': [1, 1, 0] - }) - - -def test_sample_columns(sample_aptadb_data): - df = sample_aptadb_data - assert isinstance(df, pd.DataFrame) - assert len(df) == 3 - - expected_columns = [ - 'aptamer_id', 'target_id', 'aptamer_sequence', 'target_name', - 'target_uniprot', 'organism', 'ligand_type', 'binding_conditions', - 'reference_pubmed_id', 'interaction_present' + +import pandas as pd + + +def _download_dataset( + dataset_name: str, target_dir: Path, force_download: bool = False +) -> None: + """Download a Kaggle dataset to the specified directory and unzip it. + + This is a private helper function used internally by the module. + + Parameters + ---------- + dataset_name : str + The Kaggle dataset identifier in format "username/dataset-name" + target_dir : Path + Directory where the dataset should be downloaded and extracted + force_download : bool, default False + If True, download even if CSV files already exist in target_dir + + Raises + ------ + ImportError + If the kaggle package is not installed + Exception + If the download fails for any reason + + Notes + ----- + This function requires the kaggle package to be installed and properly + configured with API credentials. + """ + import kaggle # avoid import-time auth + + target_dir.mkdir(parents=True, exist_ok=True) + + # Only download if forced or no CSV files exist + if force_download or not any(target_dir.glob("*.csv")): + kaggle.api.dataset_download_files( + dataset_name, path=str(target_dir), unzip=True + ) + + +def _find_csv(directory: Path) -> Path | None: + """Find the most appropriate CSV file in a directory. + + This is a private helper function that implements smart CSV file detection. + + Parameters + ---------- + directory : Path + Directory to search for CSV files + + Returns + ------- + Path or None + Path to the most appropriate CSV file, or None if no CSV files found + + Notes + ----- + Selection priority: + 1. If only one CSV file exists, return it + 2. If multiple CSV files exist, prefer files with names containing: + "aptamer", "interaction", "main", or "data" + 3. If no preferred names found, return the first CSV file + """ + csv_files = list(directory.glob("*.csv")) + + if not csv_files: + return None + + if len(csv_files) == 1: + return csv_files[0] + + # Look for files with preferred keywords in their names + preferred_keywords = ["aptamer", "interaction", "main", "data"] + candidates = [ + f + for f in csv_files + if any(keyword in f.name.lower() for keyword in preferred_keywords) ] - for col in expected_columns: - assert col in df.columns, f"Expected column '{col}' not found in dataset" + return candidates[0] if candidates else csv_files[0] + + +def _normalize_interaction_present(df: pd.DataFrame) -> None: + """Normalize interaction present column in the dataset. + + This is a private helper function for data preprocessing. + Currently a placeholder for future implementation. + + Parameters + ---------- + df : pd.DataFrame + The dataframe to normalize + + Notes + ----- + This function is currently not implemented and serves as a placeholder + for future data normalization functionality. + """ + # TODO: Implement interaction present normalization + return + + +def load_aptamer_interactions( + path: str | Path, + *, + encoding: str | None = None, + **read_csv_kwargs, +) -> pd.DataFrame: + """Load an aptamer interactions CSV file into a pandas DataFrame. 
+ + This function provides robust CSV loading with automatic encoding detection + and error handling for various file formats commonly found in biological + datasets. + + Parameters + ---------- + path : str or Path + Path to the CSV file containing aptamer interaction data + encoding : str, optional + Specific file encoding to use. If None (default), multiple common + encodings will be tried automatically + **read_csv_kwargs + Additional keyword arguments passed directly to pandas.read_csv() + + Returns + ------- + pd.DataFrame + DataFrame containing the loaded aptamer interaction data + + Raises + ------ + RuntimeError + If the CSV file cannot be read with any of the attempted encodings + + Notes + ----- + The function attempts the following encodings in order: + - utf-8 + - utf-8-sig (for files with BOM) + - latin-1 + - cp1252 + - windows-1252 + """ + # Define candidate encodings to try + candidate_encodings = ( + [ + "utf-8", + "utf-8-sig", # For files with byte order mark + "latin-1", + "cp1252", # Common Windows encoding + "windows-1252", # Alternative Windows encoding + ] + if encoding is None + else [encoding] + ) + + last_error: Exception | None = None + + # Try each encoding until one works + for enc in candidate_encodings: + try: + df = pd.read_csv(path, encoding=enc, **read_csv_kwargs) + return df + except Exception as e: + last_error = e + continue + + # If all encodings failed, raise informative error + raise RuntimeError( + f"Failed to read CSV at {path} with candidate encodings " + f"{candidate_encodings}: {last_error}" + ) + + +def load_interactions( + path: str | Path, + *, + encoding: str | None = None, + **read_csv_kwargs, +) -> pd.DataFrame: + """Load interaction data from a CSV file. + + This is a convenience alias for load_aptamer_interactions() with identical + functionality and parameters. + + Parameters + ---------- + path : str or Path + Path to the CSV file containing interaction data + encoding : str, optional + Specific file encoding to use. If None, automatic detection is used + **read_csv_kwargs + Additional keyword arguments passed to pandas.read_csv() + + Returns + ------- + pd.DataFrame + DataFrame containing the loaded interaction data + + See Also + -------- + load_aptamer_interactions : The main function this aliases + """ + return load_aptamer_interactions( + path=path, + encoding=encoding, + **read_csv_kwargs, + ) + + +def load_aptadb( + dataset_name: str = "satarupadeb/aptamer-interactions", + cache_dir: str | Path | None = None, + force_download: bool = False, + *, + encoding: str | None = None, + **kwargs, +) -> pd.DataFrame: + """Download and load aptamer-interactions Kaggle dataset as DataFrame. + + This is the main function for accessing aptamer interaction data. It + handles dataset downloading, caching, and loading with a single function + call. + + Parameters + ---------- + dataset_name : str, default "satarupadeb/aptamer-interactions" + Kaggle dataset identifier in format "username/dataset-name" + cache_dir : str, Path, or None, default None + Directory for caching downloaded datasets. If None, uses + ~/.pyaptamer/cache/dataset_name + force_download : bool, default False + If True, re-download the dataset even if it exists in cache + encoding : str, optional + Specific file encoding for CSV reading. 
If None, automatic detection + is used + **kwargs + Additional keyword arguments passed to pandas.read_csv() + + Returns + ------- + pd.DataFrame + DataFrame containing the aptamer interaction dataset + + Raises + ------ + ImportError + If the kaggle package is not installed + RuntimeError + If dataset download fails + FileNotFoundError + If no CSV files are found in the downloaded dataset + + Notes + ----- + - Requires kaggle package installation and API configuration + - First run will download data depending on dataset size + - Subsequent runs use cached data unless force_download=True + - Cache directory structure: cache_dir/username_dataset-name/ + """ + # Set up cache directory + if cache_dir is None: + cache_dir = ( + Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_") + ) + else: + cache_dir = Path(cache_dir) - assert df['aptamer_sequence'].dtype == 'object' - assert df['target_name'].dtype == 'object' + # Check if we already have a cached CSV file + csv_file = _find_csv(cache_dir) if cache_dir.exists() else None + # Download dataset if no cached file exists + if csv_file is None: + try: + _download_dataset(dataset_name, cache_dir, force_download=force_download) + except ImportError: + # Re-raise ImportError for clear messaging when kaggle is missing + raise ImportError( + "The 'kaggle' package is required to download datasets. " + "Install it with: pip install kaggle" + ) from None + except Exception as e: + raise RuntimeError( + f"Failed to download dataset '{dataset_name}' from Kaggle: {e}" + ) from e -@pytest.mark.slow -def test_cache_consistency(tmp_path): - # This test verifies that two consecutive calls yield same DataFrame when using cache. - # It still avoids network by seeding the cache with a local CSV. - csv_path = tmp_path / "aptadb.csv" - seeded = pd.DataFrame({"aptamer_sequence": ["AU"], "target_name": ["X"], "interaction_present": [0]}) - seeded.to_csv(csv_path, index=False) + # Find the CSV file in downloaded data + csv_file = _find_csv(cache_dir) + if csv_file is None: + raise FileNotFoundError( + f"No CSV files found in downloaded Kaggle dataset at {cache_dir}" + ) - with patch( - "pyaptamer.datasets._loaders.load_aptamer_interactions.find_csv", - return_value=csv_path, - ): - df1 = load_aptadb(cache_dir=tmp_path) - df2 = load_aptadb(cache_dir=tmp_path) - pd.testing.assert_frame_equal(df1, df2) \ No newline at end of file + # Load and return the CSV data + return load_aptamer_interactions(path=str(csv_file), encoding=encoding, **kwargs) From cafe7e5a979741359fe81bb14eefe510303b9cd3 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Sun, 5 Oct 2025 19:05:55 +0530 Subject: [PATCH 5/9] Add tests --- .../tests/test_aptamer_interactions_loader.py | 389 ++++++------------ 1 file changed, 137 insertions(+), 252 deletions(-) diff --git a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py index ee5a902a..43a0baba 100644 --- a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py +++ b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py @@ -1,302 +1,187 @@ __author__ = "Satarupa22-SD" -__all__ = ["load_aptadb", "load_aptamer_interactions", "load_interactions"] -from pathlib import Path +from unittest.mock import patch import pandas as pd +import pytest +from pyaptamer.datasets import load_aptadb +from pyaptamer.datasets._loaders.load_aptamer_interactions import ( + load_interactions, +) -def _download_dataset( - dataset_name: str, target_dir: Path, force_download: 
bool = False -) -> None: - """Download a Kaggle dataset to the specified directory and unzip it. - This is a private helper function used internally by the module. +def test_local_csv(tmp_path): + """Test loading aptamer data from a local CSV file. Parameters ---------- - dataset_name : str - The Kaggle dataset identifier in format "username/dataset-name" - target_dir : Path - Directory where the dataset should be downloaded and extracted - force_download : bool, default False - If True, download even if CSV files already exist in target_dir - - Raises - ------ - ImportError - If the kaggle package is not installed - Exception - If the download fails for any reason - - Notes - ----- - This function requires the kaggle package to be installed and properly - configured with API credentials. + tmp_path : Path + Pytest fixture providing a temporary directory """ - import kaggle # avoid import-time auth - - target_dir.mkdir(parents=True, exist_ok=True) + csv_path = tmp_path / "aptadb_sample.csv" + pd.DataFrame( + { + "aptamer_id": ["APT001"], + "aptamer_sequence": ["AUGCUU"], + "target_name": ["Thrombin"], + "interaction_present": ["1"], + } + ).to_csv(csv_path, index=False) - # Only download if forced or no CSV files exist - if force_download or not any(target_dir.glob("*.csv")): - kaggle.api.dataset_download_files( - dataset_name, path=str(target_dir), unzip=True - ) + df = load_interactions(csv_path) + assert isinstance(df, pd.DataFrame) + assert not df.empty + assert df.loc[0, "aptamer_sequence"] == "AUGCUU" -def _find_csv(directory: Path) -> Path | None: - """Find the most appropriate CSV file in a directory. - - This is a private helper function that implements smart CSV file detection. +def test_uses_cache(tmp_path): + """Test that cached data is used instead of downloading. Parameters ---------- - directory : Path - Directory to search for CSV files - - Returns - ------- - Path or None - Path to the most appropriate CSV file, or None if no CSV files found - - Notes - ----- - Selection priority: - 1. If only one CSV file exists, return it - 2. If multiple CSV files exist, prefer files with names containing: - "aptamer", "interaction", "main", or "data" - 3. If no preferred names found, return the first CSV file + tmp_path : Path + Pytest fixture providing a temporary directory """ - csv_files = list(directory.glob("*.csv")) - - if not csv_files: - return None - - if len(csv_files) == 1: - return csv_files[0] - - # Look for files with preferred keywords in their names - preferred_keywords = ["aptamer", "interaction", "main", "data"] - candidates = [ - f - for f in csv_files - if any(keyword in f.name.lower() for keyword in preferred_keywords) - ] - - return candidates[0] if candidates else csv_files[0] + csv_path = tmp_path / "aptadb.csv" + pd.DataFrame({"aptamer_sequence": ["AUGU"], "target_name": ["X"]}).to_csv( + csv_path, index=False + ) + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions._find_csv", + return_value=csv_path, + ): + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions._download_dataset" + ) as mock_dl: + df = load_aptadb(cache_dir=tmp_path) + assert not df.empty + assert df.loc[0, "aptamer_sequence"] == "AUGU" + mock_dl.assert_not_called() -def _normalize_interaction_present(df: pd.DataFrame) -> None: - """Normalize interaction present column in the dataset. - This is a private helper function for data preprocessing. - Currently a placeholder for future implementation. 
+def test_requires_kaggle(tmp_path): + """Test that ImportError is raised when kaggle package is missing. Parameters ---------- - df : pd.DataFrame - The dataframe to normalize - - Notes - ----- - This function is currently not implemented and serves as a placeholder - for future data normalization functionality. + tmp_path : Path + Pytest fixture providing a temporary directory """ - # TODO: Implement interaction present normalization - return - + # Ensure no CSV present so a download would be attempted + with patch.dict("sys.modules", {"kaggle": None}): + with pytest.raises(ImportError): + load_aptadb(cache_dir=tmp_path) -def load_aptamer_interactions( - path: str | Path, - *, - encoding: str | None = None, - **read_csv_kwargs, -) -> pd.DataFrame: - """Load an aptamer interactions CSV file into a pandas DataFrame. - This function provides robust CSV loading with automatic encoding detection - and error handling for various file formats commonly found in biological - datasets. +def test_invalid_dataset(tmp_path): + """Test error handling for invalid dataset download. Parameters ---------- - path : str or Path - Path to the CSV file containing aptamer interaction data - encoding : str, optional - Specific file encoding to use. If None (default), multiple common - encodings will be tried automatically - **read_csv_kwargs - Additional keyword arguments passed directly to pandas.read_csv() + tmp_path : Path + Pytest fixture providing a temporary directory + """ + # Force the download path and make it fail + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions._find_csv", + return_value=None, + ): + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions._download_dataset", + side_effect=Exception("boom"), + ): + with pytest.raises( + RuntimeError, match=r"Failed to download dataset .* from Kaggle" + ): + load_aptadb("nonexistent/invalid-dataset", cache_dir=tmp_path) + + +@pytest.fixture +def sample_aptadb_data(): + """Create sample aptamer interaction data for testing. 
Returns ------- pd.DataFrame - DataFrame containing the loaded aptamer interaction data - - Raises - ------ - RuntimeError - If the CSV file cannot be read with any of the attempted encodings - - Notes - ----- - The function attempts the following encodings in order: - - utf-8 - - utf-8-sig (for files with BOM) - - latin-1 - - cp1252 - - windows-1252 + Sample DataFrame with aptamer interaction data """ - # Define candidate encodings to try - candidate_encodings = ( - [ - "utf-8", - "utf-8-sig", # For files with byte order mark - "latin-1", - "cp1252", # Common Windows encoding - "windows-1252", # Alternative Windows encoding - ] - if encoding is None - else [encoding] - ) - - last_error: Exception | None = None - - # Try each encoding until one works - for enc in candidate_encodings: - try: - df = pd.read_csv(path, encoding=enc, **read_csv_kwargs) - return df - except Exception as e: - last_error = e - continue - - # If all encodings failed, raise informative error - raise RuntimeError( - f"Failed to read CSV at {path} with candidate encodings " - f"{candidate_encodings}: {last_error}" + return pd.DataFrame( + { + "aptamer_id": ["APT001", "APT002", "APT003"], + "target_id": ["TGT001", "TGT002", "TGT003"], + "aptamer_sequence": [ + "ATCGATCGATCGATCG", + "GCTAGCTAGCTAGCTA", + "TTAACCGGTTAACCGG", + ], + "target_name": ["Thrombin", "VEGF", "Lysozyme"], + "target_uniprot": ["P00734", "P15692", "P61626"], + "organism": ["Homo sapiens", "Homo sapiens", "Gallus gallus"], + "ligand_type": ["Protein", "Protein", "Protein"], + "binding_conditions": ["pH 7.4, 25°C", "pH 7.0, 37°C", "pH 8.0, 25°C"], + "reference_pubmed_id": ["12345678", "87654321", "11223344"], + "interaction_present": [1, 1, 0], + } ) -def load_interactions( - path: str | Path, - *, - encoding: str | None = None, - **read_csv_kwargs, -) -> pd.DataFrame: - """Load interaction data from a CSV file. - - This is a convenience alias for load_aptamer_interactions() with identical - functionality and parameters. +def test_sample_columns(sample_aptadb_data): + """Test that sample data contains expected columns and data types. Parameters ---------- - path : str or Path - Path to the CSV file containing interaction data - encoding : str, optional - Specific file encoding to use. If None, automatic detection is used - **read_csv_kwargs - Additional keyword arguments passed to pandas.read_csv() + sample_aptadb_data : pd.DataFrame + Fixture providing sample aptamer data + """ + df = sample_aptadb_data + assert isinstance(df, pd.DataFrame) + assert len(df) == 3 + + expected_columns = [ + "aptamer_id", + "target_id", + "aptamer_sequence", + "target_name", + "target_uniprot", + "organism", + "ligand_type", + "binding_conditions", + "reference_pubmed_id", + "interaction_present", + ] - Returns - ------- - pd.DataFrame - DataFrame containing the loaded interaction data + for col in expected_columns: + assert col in df.columns, f"Expected column '{col}' not found in dataset" - See Also - -------- - load_aptamer_interactions : The main function this aliases - """ - return load_aptamer_interactions( - path=path, - encoding=encoding, - **read_csv_kwargs, - ) + assert df["aptamer_sequence"].dtype == "object" + assert df["target_name"].dtype == "object" -def load_aptadb( - dataset_name: str = "satarupadeb/aptamer-interactions", - cache_dir: str | Path | None = None, - force_download: bool = False, - *, - encoding: str | None = None, - **kwargs, -) -> pd.DataFrame: - """Download and load aptamer-interactions Kaggle dataset as DataFrame. 
+@pytest.mark.slow +def test_cache_consistency(tmp_path): + """Test that consecutive calls with cache yield identical DataFrames. - This is the main function for accessing aptamer interaction data. It - handles dataset downloading, caching, and loading with a single function - call. + This test verifies that two consecutive calls yield same DataFrame + when using cache. It avoids network by seeding the cache with a + local CSV. Parameters ---------- - dataset_name : str, default "satarupadeb/aptamer-interactions" - Kaggle dataset identifier in format "username/dataset-name" - cache_dir : str, Path, or None, default None - Directory for caching downloaded datasets. If None, uses - ~/.pyaptamer/cache/dataset_name - force_download : bool, default False - If True, re-download the dataset even if it exists in cache - encoding : str, optional - Specific file encoding for CSV reading. If None, automatic detection - is used - **kwargs - Additional keyword arguments passed to pandas.read_csv() - - Returns - ------- - pd.DataFrame - DataFrame containing the aptamer interaction dataset - - Raises - ------ - ImportError - If the kaggle package is not installed - RuntimeError - If dataset download fails - FileNotFoundError - If no CSV files are found in the downloaded dataset - - Notes - ----- - - Requires kaggle package installation and API configuration - - First run will download data depending on dataset size - - Subsequent runs use cached data unless force_download=True - - Cache directory structure: cache_dir/username_dataset-name/ + tmp_path : Path + Pytest fixture providing a temporary directory """ - # Set up cache directory - if cache_dir is None: - cache_dir = ( - Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_") - ) - else: - cache_dir = Path(cache_dir) - - # Check if we already have a cached CSV file - csv_file = _find_csv(cache_dir) if cache_dir.exists() else None - - # Download dataset if no cached file exists - if csv_file is None: - try: - _download_dataset(dataset_name, cache_dir, force_download=force_download) - except ImportError: - # Re-raise ImportError for clear messaging when kaggle is missing - raise ImportError( - "The 'kaggle' package is required to download datasets. 
" - "Install it with: pip install kaggle" - ) from None - except Exception as e: - raise RuntimeError( - f"Failed to download dataset '{dataset_name}' from Kaggle: {e}" - ) from e - - # Find the CSV file in downloaded data - csv_file = _find_csv(cache_dir) - if csv_file is None: - raise FileNotFoundError( - f"No CSV files found in downloaded Kaggle dataset at {cache_dir}" - ) - - # Load and return the CSV data - return load_aptamer_interactions(path=str(csv_file), encoding=encoding, **kwargs) + csv_path = tmp_path / "aptadb.csv" + seeded = pd.DataFrame( + {"aptamer_sequence": ["AU"], "target_name": ["X"], "interaction_present": [0]} + ) + seeded.to_csv(csv_path, index=False) + + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions._find_csv", + return_value=csv_path, + ): + df1 = load_aptadb(cache_dir=tmp_path) + df2 = load_aptadb(cache_dir=tmp_path) + pd.testing.assert_frame_equal(df1, df2) From 8b2982af5b03fdbc8aa04161668b8a2bc3150daf Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Sun, 5 Oct 2025 20:10:47 +0530 Subject: [PATCH 6/9] Update docsting --- .../_loaders/load_aptamer_interactions.py | 115 +++++++++--------- 1 file changed, 59 insertions(+), 56 deletions(-) diff --git a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py index 25f519c1..106f31a7 100644 --- a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py +++ b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py @@ -11,28 +11,25 @@ def _download_dataset( ) -> None: """Download a Kaggle dataset to the specified directory and unzip it. - This is a private helper function used internally by the module. - Parameters ---------- dataset_name : str - The Kaggle dataset identifier in format "username/dataset-name" + Kaggle dataset identifier like "username/dataset-name". target_dir : Path - Directory where the dataset should be downloaded and extracted + Directory to download and extract the dataset. force_download : bool, default False - If True, download even if CSV files already exist in target_dir + If True, download even if CSV files already exist in target_dir. Raises ------ ImportError - If the kaggle package is not installed + If the kaggle package is not installed. Exception - If the download fails for any reason + If the download fails for any reason. Notes ----- - This function requires the kaggle package to be installed and properly - configured with API credentials. + Requires kaggle package installed and configured with API credentials. """ import kaggle # avoid import-time auth @@ -46,27 +43,25 @@ def _download_dataset( def _find_csv(directory: Path) -> Path | None: - """Find the most appropriate CSV file in a directory. - - This is a private helper function that implements smart CSV file detection. + """Return the most appropriate CSV file path from a directory. Parameters ---------- directory : Path - Directory to search for CSV files + Directory to look for CSV files. Returns ------- Path or None - Path to the most appropriate CSV file, or None if no CSV files found + Path to CSV file or None if none found. Notes ----- - Selection priority: - 1. If only one CSV file exists, return it - 2. If multiple CSV files exist, prefer files with names containing: - "aptamer", "interaction", "main", or "data" - 3. If no preferred names found, return the first CSV file + Preference order: + 1. If only one CSV, return it. + 2. If multiple, prefer files with "aptamer", "interaction", "main", or "data" + in name. + 3. 
Otherwise, return first CSV found. """ csv_files = list(directory.glob("*.csv")) @@ -76,7 +71,6 @@ def _find_csv(directory: Path) -> Path | None: if len(csv_files) == 1: return csv_files[0] - # Look for files with preferred keywords in their names preferred_keywords = ["aptamer", "interaction", "main", "data"] candidates = [ f @@ -88,22 +82,17 @@ def _find_csv(directory: Path) -> Path | None: def _normalize_interaction_present(df: pd.DataFrame) -> None: - """Normalize interaction present column in the dataset. - - This is a private helper function for data preprocessing. - Currently a placeholder for future implementation. + """Placeholder to normalize 'interaction_present' column in the DataFrame. Parameters ---------- df : pd.DataFrame - The dataframe to normalize + DataFrame to normalize. Notes ----- - This function is currently not implemented and serves as a placeholder - for future data normalization functionality. + Currently not implemented, kept for future data normalization. """ - # TODO: Implement interaction present normalization return @@ -113,48 +102,40 @@ def load_aptamer_interactions( encoding: str | None = None, **read_csv_kwargs, ) -> pd.DataFrame: - """Load an aptamer interactions CSV file into a pandas DataFrame. + """Load aptamer interactions CSV into a pandas DataFrame. - This function provides robust CSV loading with automatic encoding detection - and error handling for various file formats commonly found in biological - datasets. + Tries common encodings automatically for robust loading. Parameters ---------- path : str or Path - Path to the CSV file containing aptamer interaction data + Path to CSV file with aptamer interactions. encoding : str, optional - Specific file encoding to use. If None (default), multiple common - encodings will be tried automatically + Specific file encoding to use. If None, tries common encodings. **read_csv_kwargs - Additional keyword arguments passed directly to pandas.read_csv() + Additional arguments passed to pandas.read_csv(). Returns ------- pd.DataFrame - DataFrame containing the loaded aptamer interaction data + DataFrame with aptamer interaction data. Raises ------ RuntimeError - If the CSV file cannot be read with any of the attempted encodings + If CSV cannot be read with any attempted encodings. Notes ----- - The function attempts the following encodings in order: - - utf-8 - - utf-8-sig (for files with BOM) - - latin-1 - - cp1252 - - windows-1252 + Encodings tried (in order): utf-8, utf-8-sig, latin-1, cp1252, windows-1252. """ candidate_encodings = ( [ "utf-8", - "utf-8-sig", # For files with byte order mark + "utf-8-sig", "latin-1", - "cp1252", # Common Windows encoding - "windows-1252", # Alternative Windows encoding + "cp1252", + "windows-1252", ] if encoding is None else [encoding] @@ -171,8 +152,7 @@ def load_aptamer_interactions( continue raise RuntimeError( - f"Failed to read CSV at {path} with candidate encodings " - f"{candidate_encodings}: {last_error}" + f"Failed to read CSV {path} with encodings {candidate_encodings}: {last_error}" ) @@ -182,11 +162,7 @@ def load_interactions( encoding: str | None = None, **read_csv_kwargs, ) -> pd.DataFrame: - """Load interaction data from a CSV file. - - This is a convenience alias for load_aptamer_interactions() with identical - functionality and parameters. 
- """ + """Alias for load_aptamer_interactions with same parameters and return.""" return load_aptamer_interactions( path=path, encoding=encoding, @@ -202,7 +178,35 @@ def load_aptadb( encoding: str | None = None, **kwargs, ) -> pd.DataFrame: - """Download and load aptamer-interactions Kaggle dataset as DataFrame.""" + """Download and load aptamer-interactions dataset from Kaggle as DataFrame. + + Parameters + ---------- + dataset_name : str, optional + Kaggle dataset name. + cache_dir : str or Path, optional + Local directory for caching dataset files. + force_download : bool, default False + If True, download dataset even if cached files exist. + encoding : str, optional + Encoding for CSV file loading. + **kwargs + Additional arguments passed to CSV loader. + + Returns + ------- + pd.DataFrame + Loaded dataset as a pandas DataFrame. + + Raises + ------ + ImportError + If the 'kaggle' package is missing. + RuntimeError + If dataset download fails. + FileNotFoundError + If no CSV file found after download. + """ if cache_dir is None: cache_dir = ( Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_") @@ -216,7 +220,6 @@ def load_aptadb( try: _download_dataset(dataset_name, cache_dir, force_download=force_download) except ImportError as err: - # Re-raise ImportError for clear messaging when kaggle is missing raise ImportError( "The 'kaggle' package is required to download datasets. " "Install it with: pip install kaggle" From 08c10513ba62aa8187c4883a45e54dffa9a46ad0 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Tue, 7 Oct 2025 02:51:56 +0530 Subject: [PATCH 7/9] Remove normalise function --- .../_loaders/load_aptamer_interactions.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py index 106f31a7..c0be31cd 100644 --- a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py +++ b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py @@ -81,21 +81,6 @@ def _find_csv(directory: Path) -> Path | None: return candidates[0] if candidates else csv_files[0] -def _normalize_interaction_present(df: pd.DataFrame) -> None: - """Placeholder to normalize 'interaction_present' column in the DataFrame. - - Parameters - ---------- - df : pd.DataFrame - DataFrame to normalize. - - Notes - ----- - Currently not implemented, kept for future data normalization. - """ - return - - def load_aptamer_interactions( path: str | Path, *, From 9a5fb63123c5c6d28537b0ffbb363e6f2be26b22 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Thu, 9 Oct 2025 23:11:42 +0530 Subject: [PATCH 8/9] add tests --- .../datasets/tests/test_aptamer_interactions_loader.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py index 43a0baba..953b16c8 100644 --- a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py +++ b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py @@ -13,7 +13,6 @@ def test_local_csv(tmp_path): """Test loading aptamer data from a local CSV file. - Parameters ---------- tmp_path : Path @@ -37,7 +36,6 @@ def test_local_csv(tmp_path): def test_uses_cache(tmp_path): """Test that cached data is used instead of downloading. - Parameters ---------- tmp_path : Path @@ -63,7 +61,6 @@ def test_uses_cache(tmp_path): def test_requires_kaggle(tmp_path): """Test that ImportError is raised when kaggle package is missing. 
- Parameters ---------- tmp_path : Path @@ -77,7 +74,6 @@ def test_requires_kaggle(tmp_path): def test_invalid_dataset(tmp_path): """Test error handling for invalid dataset download. - Parameters ---------- tmp_path : Path @@ -101,7 +97,6 @@ def test_invalid_dataset(tmp_path): @pytest.fixture def sample_aptadb_data(): """Create sample aptamer interaction data for testing. - Returns ------- pd.DataFrame @@ -129,7 +124,6 @@ def sample_aptadb_data(): def test_sample_columns(sample_aptadb_data): """Test that sample data contains expected columns and data types. - Parameters ---------- sample_aptadb_data : pd.DataFrame @@ -162,11 +156,9 @@ def test_sample_columns(sample_aptadb_data): @pytest.mark.slow def test_cache_consistency(tmp_path): """Test that consecutive calls with cache yield identical DataFrames. - This test verifies that two consecutive calls yield same DataFrame when using cache. It avoids network by seeding the cache with a local CSV. - Parameters ---------- tmp_path : Path From ba79a32c606b12b9aa943cd858a40eca4e2778b0 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Sun, 12 Oct 2025 00:03:55 +0530 Subject: [PATCH 9/9] update dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index ce9a019e..58a794e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "scikit-learn>=1.3.0", "skorch", "imblearn", + "kaggle", ] [project.optional-dependencies]
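
---

Usage note (a minimal sketch, not part of the patches above): the example below shows the loader API this series adds, assuming the patches are applied and Kaggle API credentials are configured in ~/.kaggle/kaggle.json. The file name "aptadb_sample.csv" is a hypothetical local path used only for illustration.

    from pyaptamer.datasets import load_aptadb, load_interactions

    # First call downloads the Kaggle dataset into the default cache
    # (~/.pyaptamer/cache/satarupadeb_aptamer-interactions/); subsequent
    # calls reuse the cached CSV instead of hitting the network.
    df = load_aptadb()
    print(len(df), list(df.columns))

    # Load an AptaDB-style CSV directly from disk; when encoding is None the
    # loader tries utf-8, utf-8-sig, latin-1, cp1252 and windows-1252 in turn.
    local_df = load_interactions("aptadb_sample.csv")  # hypothetical path

    # A custom cache directory; note that force_download is only consulted
    # when no CSV is already present in cache_dir (per the patched logic).
    df2 = load_aptadb(cache_dir="/tmp/aptadb_cache", force_download=True)

If the kaggle package is missing, load_aptadb raises ImportError with an install hint; a failed download surfaces as RuntimeError, matching the tests in patch 5.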