diff --git a/pyaptamer/datasets/__init__.py b/pyaptamer/datasets/__init__.py index 41b5fa67..de156dad 100644 --- a/pyaptamer/datasets/__init__.py +++ b/pyaptamer/datasets/__init__.py @@ -3,9 +3,15 @@ from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure from pyaptamer.datasets._loaders._online_databank import load_from_rcsb from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure +from pyaptamer.datasets._loaders.load_aptamer_interactions import ( + load_aptadb, + load_interactions, +) __all__ = [ "load_pfoa_structure", "load_1gnh_structure", + "load_aptadb", "load_from_rcsb", + "load_interactions", ] diff --git a/pyaptamer/datasets/_loaders/__init__.py b/pyaptamer/datasets/_loaders/__init__.py index 5fdc4143..9728696b 100644 --- a/pyaptamer/datasets/_loaders/__init__.py +++ b/pyaptamer/datasets/_loaders/__init__.py @@ -2,5 +2,14 @@ from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure +from pyaptamer.datasets._loaders.load_aptamer_interactions import ( + load_aptadb, + load_interactions, +) -__all__ = ["load_pfoa_structure", "load_1gnh_structure"] +__all__ = [ + "load_pfoa_structure", + "load_1gnh_structure", + "load_aptadb", + "load_interactions", +] diff --git a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py new file mode 100644 index 00000000..c0be31cd --- /dev/null +++ b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py @@ -0,0 +1,223 @@ +__author__ = "Satarupa22-SD" +__all__ = ["load_aptadb", "load_aptamer_interactions", "load_interactions"] + +from pathlib import Path + +import pandas as pd + + +def _download_dataset( + dataset_name: str, target_dir: Path, force_download: bool = False +) -> None: + """Download a Kaggle dataset to the specified directory and unzip it. 
+ + Parameters + ---------- + dataset_name : str + Kaggle dataset identifier like "username/dataset-name". + target_dir : Path + Directory to download and extract the dataset. + force_download : bool, default False + If True, download even if CSV files already exist in target_dir. + + Raises + ------ + ImportError + If the kaggle package is not installed. + Exception + If the download fails for any reason. + + Notes + ----- + Requires kaggle package installed and configured with API credentials. + """ + import kaggle # avoid import-time auth + + target_dir.mkdir(parents=True, exist_ok=True) + + # Only download if forced or no CSV files exist + if force_download or not any(target_dir.glob("*.csv")): + kaggle.api.dataset_download_files( + dataset_name, path=str(target_dir), unzip=True + ) + + +def _find_csv(directory: Path) -> Path | None: + """Return the most appropriate CSV file path from a directory. + + Parameters + ---------- + directory : Path + Directory to look for CSV files. + + Returns + ------- + Path or None + Path to CSV file or None if none found. + + Notes + ----- + Preference order: + 1. If only one CSV, return it. + 2. If multiple, prefer files with "aptamer", "interaction", "main", or "data" + in name. + 3. Otherwise, return first CSV found. + """ + csv_files = list(directory.glob("*.csv")) + + if not csv_files: + return None + + if len(csv_files) == 1: + return csv_files[0] + + preferred_keywords = ["aptamer", "interaction", "main", "data"] + candidates = [ + f + for f in csv_files + if any(keyword in f.name.lower() for keyword in preferred_keywords) + ] + + return candidates[0] if candidates else csv_files[0] + + +def load_aptamer_interactions( + path: str | Path, + *, + encoding: str | None = None, + **read_csv_kwargs, +) -> pd.DataFrame: + """Load aptamer interactions CSV into a pandas DataFrame. + + Tries common encodings automatically for robust loading. 
+ + Parameters + ---------- + path : str or Path + Path to CSV file with aptamer interactions. + encoding : str, optional + Specific file encoding to use. If None, tries common encodings. + **read_csv_kwargs + Additional arguments passed to pandas.read_csv(). + + Returns + ------- + pd.DataFrame + DataFrame with aptamer interaction data. + + Raises + ------ + RuntimeError + If CSV cannot be read with any attempted encodings. + + Notes + ----- + Encodings tried (in order): utf-8, utf-8-sig, latin-1, cp1252, windows-1252. + """ + candidate_encodings = ( + [ + "utf-8", + "utf-8-sig", + "latin-1", + "cp1252", + "windows-1252", + ] + if encoding is None + else [encoding] + ) + + last_error: Exception | None = None + + for enc in candidate_encodings: + try: + df = pd.read_csv(path, encoding=enc, **read_csv_kwargs) + return df + except Exception as e: + last_error = e + continue + + raise RuntimeError( + f"Failed to read CSV {path} with encodings {candidate_encodings}: {last_error}" + ) + + +def load_interactions( + path: str | Path, + *, + encoding: str | None = None, + **read_csv_kwargs, +) -> pd.DataFrame: + """Alias for load_aptamer_interactions with same parameters and return.""" + return load_aptamer_interactions( + path=path, + encoding=encoding, + **read_csv_kwargs, + ) + + +def load_aptadb( + dataset_name: str = "satarupadeb/aptamer-interactions", + cache_dir: str | Path | None = None, + force_download: bool = False, + *, + encoding: str | None = None, + **kwargs, +) -> pd.DataFrame: + """Download and load aptamer-interactions dataset from Kaggle as DataFrame. + + Parameters + ---------- + dataset_name : str, optional + Kaggle dataset name. + cache_dir : str or Path, optional + Local directory for caching dataset files. + force_download : bool, default False + If True, download dataset even if cached files exist. + encoding : str, optional + Encoding for CSV file loading. + **kwargs + Additional arguments passed to CSV loader. 
+ + Returns + ------- + pd.DataFrame + Loaded dataset as a pandas DataFrame. + + Raises + ------ + ImportError + If the 'kaggle' package is missing. + RuntimeError + If dataset download fails. + FileNotFoundError + If no CSV file found after download. + """ + if cache_dir is None: + cache_dir = ( + Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_") + ) + else: + cache_dir = Path(cache_dir) + + csv_file = _find_csv(cache_dir) if cache_dir.exists() else None + + if csv_file is None: + try: + _download_dataset(dataset_name, cache_dir, force_download=force_download) + except ImportError as err: + raise ImportError( + "The 'kaggle' package is required to download datasets. " + "Install it with: pip install kaggle" + ) from err + except Exception as e: + raise RuntimeError( + f"Failed to download dataset '{dataset_name}' from Kaggle: {e}" + ) from e + + csv_file = _find_csv(cache_dir) + if csv_file is None: + raise FileNotFoundError( + f"No CSV files found in downloaded Kaggle dataset at {cache_dir}" + ) + + return load_aptamer_interactions(path=str(csv_file), encoding=encoding, **kwargs) diff --git a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py new file mode 100644 index 00000000..953b16c8 --- /dev/null +++ b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py @@ -0,0 +1,179 @@ +__author__ = "Satarupa22-SD" + +from unittest.mock import patch + +import pandas as pd +import pytest + +from pyaptamer.datasets import load_aptadb +from pyaptamer.datasets._loaders.load_aptamer_interactions import ( + load_interactions, +) + + +def test_local_csv(tmp_path): + """Test loading aptamer data from a local CSV file. 
# Dotted path of the module under test, used as the target for patching.
_LOADER = "pyaptamer.datasets._loaders.load_aptamer_interactions"


def test_local_csv(tmp_path):
    """Test loading aptamer data from a local CSV file.

    Parameters
    ----------
    tmp_path : Path
        Pytest fixture providing a temporary directory
    """
    csv_path = tmp_path / "aptadb_sample.csv"
    pd.DataFrame(
        {
            "aptamer_id": ["APT001"],
            "aptamer_sequence": ["AUGCUU"],
            "target_name": ["Thrombin"],
            "interaction_present": ["1"],
        }
    ).to_csv(csv_path, index=False)

    df = load_interactions(csv_path)
    assert isinstance(df, pd.DataFrame)
    assert not df.empty
    assert df.loc[0, "aptamer_sequence"] == "AUGCUU"


def test_uses_cache(tmp_path):
    """Test that cached data is used instead of downloading.

    The CSV is seeded into the cache directory and discovered by the real
    lookup; only the download step is replaced, so the test fails if the
    loader stops finding cached files.

    Parameters
    ----------
    tmp_path : Path
        Pytest fixture providing a temporary directory
    """
    csv_path = tmp_path / "aptadb.csv"
    pd.DataFrame({"aptamer_sequence": ["AUGU"], "target_name": ["X"]}).to_csv(
        csv_path, index=False
    )

    with patch(f"{_LOADER}._download_dataset") as mock_dl:
        df = load_aptadb(cache_dir=tmp_path)

    assert not df.empty
    assert df.loc[0, "aptamer_sequence"] == "AUGU"
    mock_dl.assert_not_called()


def test_requires_kaggle(tmp_path):
    """Test that ImportError is raised when kaggle package is missing.

    Parameters
    ----------
    tmp_path : Path
        Pytest fixture providing a temporary directory
    """
    # The cache directory is empty, so a download is attempted; a None entry
    # in sys.modules makes ``import kaggle`` raise ImportError.
    with patch.dict("sys.modules", {"kaggle": None}):
        with pytest.raises(ImportError):
            load_aptadb(cache_dir=tmp_path)


def test_invalid_dataset(tmp_path):
    """Test error handling for invalid dataset download.

    Parameters
    ----------
    tmp_path : Path
        Pytest fixture providing a temporary directory
    """
    # The cache directory is empty, which forces the download path; the
    # download itself is made to fail.
    with patch(f"{_LOADER}._download_dataset", side_effect=Exception("boom")):
        with pytest.raises(
            RuntimeError, match=r"Failed to download dataset .* from Kaggle"
        ):
            load_aptadb("nonexistent/invalid-dataset", cache_dir=tmp_path)


@pytest.fixture
def sample_aptadb_data():
    """Create sample aptamer interaction data for testing.

    Returns
    -------
    pd.DataFrame
        Sample DataFrame with aptamer interaction data
    """
    return pd.DataFrame(
        {
            "aptamer_id": ["APT001", "APT002", "APT003"],
            "target_id": ["TGT001", "TGT002", "TGT003"],
            "aptamer_sequence": [
                "ATCGATCGATCGATCG",
                "GCTAGCTAGCTAGCTA",
                "TTAACCGGTTAACCGG",
            ],
            "target_name": ["Thrombin", "VEGF", "Lysozyme"],
            "target_uniprot": ["P00734", "P15692", "P61626"],
            "organism": ["Homo sapiens", "Homo sapiens", "Gallus gallus"],
            "ligand_type": ["Protein", "Protein", "Protein"],
            "binding_conditions": ["pH 7.4, 25°C", "pH 7.0, 37°C", "pH 8.0, 25°C"],
            "reference_pubmed_id": ["12345678", "87654321", "11223344"],
            "interaction_present": [1, 1, 0],
        }
    )


def test_sample_columns(sample_aptadb_data):
    """Test that sample data contains expected columns and data types.

    Parameters
    ----------
    sample_aptadb_data : pd.DataFrame
        Fixture providing sample aptamer data
    """
    df = sample_aptadb_data
    assert isinstance(df, pd.DataFrame)
    assert len(df) == 3

    expected_columns = [
        "aptamer_id",
        "target_id",
        "aptamer_sequence",
        "target_name",
        "target_uniprot",
        "organism",
        "ligand_type",
        "binding_conditions",
        "reference_pubmed_id",
        "interaction_present",
    ]

    for col in expected_columns:
        assert col in df.columns, f"Expected column '{col}' not found in dataset"

    # Checked with is_string_dtype rather than ``== "object"`` so the test does
    # not depend on whether pandas stores text as object or as a string dtype.
    assert pd.api.types.is_string_dtype(df["aptamer_sequence"])
    assert pd.api.types.is_string_dtype(df["target_name"])


@pytest.mark.slow
def test_cache_consistency(tmp_path):
    """Test that consecutive calls with cache yield identical DataFrames.

    The cache is seeded with a local CSV so no network access is needed, and
    the download step is replaced so that any unexpected download attempt is
    detected rather than performed.

    Parameters
    ----------
    tmp_path : Path
        Pytest fixture providing a temporary directory
    """
    csv_path = tmp_path / "aptadb.csv"
    seeded = pd.DataFrame(
        {"aptamer_sequence": ["AU"], "target_name": ["X"], "interaction_present": [0]}
    )
    seeded.to_csv(csv_path, index=False)

    with patch(f"{_LOADER}._download_dataset") as mock_dl:
        df1 = load_aptadb(cache_dir=tmp_path)
        df2 = load_aptadb(cache_dir=tmp_path)

    mock_dl.assert_not_called()
    pd.testing.assert_frame_equal(df1, df2)