From 389dd07d8612dda30936bff68fc4ffc150ed6657 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Fri, 26 Sep 2025 01:19:24 +0530 Subject: [PATCH 1/9] Add AptaDB loader --- pyaptamer/datasets/__init__.py | 5 +- pyaptamer/datasets/_loaders/__init__.py | 3 +- .../_loaders/load_aptamer_interactions.py | 118 ++++++++++++++++++ .../tests/test_aptamer_interactions_loader.py | 115 +++++++++++++++++ 4 files changed, 238 insertions(+), 3 deletions(-) create mode 100644 pyaptamer/datasets/_loaders/load_aptamer_interactions.py create mode 100644 pyaptamer/datasets/tests/test_aptamer_interactions_loader.py diff --git a/pyaptamer/datasets/__init__.py b/pyaptamer/datasets/__init__.py index 41b5fa67..c99715eb 100644 --- a/pyaptamer/datasets/__init__.py +++ b/pyaptamer/datasets/__init__.py @@ -1,11 +1,12 @@ """Contains datasets along with their loaders.""" from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure -from pyaptamer.datasets._loaders._online_databank import load_from_rcsb from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure +from pyaptamer.datasets._loaders.load_aptamer_interactions import load_aptadb, load_interactions __all__ = [ "load_pfoa_structure", "load_1gnh_structure", - "load_from_rcsb", + "load_aptadb", + "load_interactions", ] diff --git a/pyaptamer/datasets/_loaders/__init__.py b/pyaptamer/datasets/_loaders/__init__.py index 5fdc4143..90cc5cf8 100644 --- a/pyaptamer/datasets/_loaders/__init__.py +++ b/pyaptamer/datasets/_loaders/__init__.py @@ -2,5 +2,6 @@ from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure +from pyaptamer.datasets._loaders.load_aptamer_interactions import load_aptadb, load_interactions -__all__ = ["load_pfoa_structure", "load_1gnh_structure"] +__all__ = ["load_pfoa_structure", "load_1gnh_structure", "load_aptadb", "load_interactions"] \ No newline at end of file diff --git a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py new file mode 100644 index 00000000..05caa3ba --- /dev/null +++ b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py @@ -0,0 +1,118 @@ +__author__ = "Satarupa22-SD" +__all__ = ["load_aptadb", "load_aptamer_interactions", "load_interactions"] + +from pathlib import Path +from typing import Optional, Union + +import pandas as pd + + +def download_dataset(dataset_name: str, target_dir: Path, force_download: bool = False): + """Download dataset_name into target_dir using Kaggle API and unzip there.""" + import kaggle # avoid import-time auth + target_dir.mkdir(parents=True, exist_ok=True) + if force_download or not any(target_dir.glob("*.csv")): + kaggle.api.dataset_download_files(dataset_name, path=str(target_dir), unzip=True) + + +def find_csv(directory: Path): + csv_files = list(directory.glob("*.csv")) + if not csv_files: + return None + if len(csv_files) == 1: + return csv_files[0] + candidates = [ + f for f in csv_files + if any(t in f.name.lower() for t in ["aptamer", "interaction", "main", "data"]) + ] + return candidates[0] if candidates else csv_files[0] + + + + + +def normalize_interaction_present(df: pd.DataFrame) -> None: + return + + +def load_aptamer_interactions( + path: Union[str, Path], + *, + encoding: Optional[str] = None, + **read_csv_kwargs, +): + """ + Load AptaDB-style CSV into a pandas.DataFrame. + + Parameters + ---------- + path : str | Path + Path to the CSV file. + encoding : str | None + Specific file encoding. 
If None, several encodings are tried. + **read_csv_kwargs : Any + Additional arguments forwarded to pandas.read_csv. + """ + candidate_encodings = [ + "utf-8", + "utf-8-sig", + "latin-1", + "cp1252", + "windows-1252", + ] if encoding is None else [encoding] + last_error: Optional[Exception] = None + for enc in candidate_encodings: + try: + df = pd.read_csv(path, encoding=enc, **read_csv_kwargs) + return df + except Exception as e: + last_error = e + continue + # If all encodings failed, raise the last error + raise RuntimeError(f"Failed to read CSV at {path} with candidate encodings {candidate_encodings}: {last_error}") + + +def load_interactions( + path: Union[str, Path], + *, + encoding: Optional[str] = None, + **read_csv_kwargs, +): + """Simple alias for load_aptamer_interactions.""" + return load_aptamer_interactions( + path=path, + encoding=encoding, + **read_csv_kwargs, + ) + + +def load_aptadb( + dataset_name: str = "satarupadeb/aptamer-interactions", + cache_dir: Optional[Union[str, Path]] = None, + force_download: bool = False, + *, + encoding: Optional[str] = None, + **kwargs, +): + """ + Download (optional) and load the aptamer-interactions Kaggle dataset as pandas.DataFrame. + """ + cache_dir = ( + Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_") + if cache_dir is None else Path(cache_dir) + ) + + csv_file = find_csv(cache_dir) if cache_dir.exists() else None + if csv_file is None: + try: + download_dataset(dataset_name, cache_dir, force_download=force_download) + except ImportError: + # ImportError for tests and clear messaging when kaggle is missing + raise + except Exception as e: + raise RuntimeError(f"Failed to download dataset '{dataset_name}' from Kaggle: {e}") from e + csv_file = find_csv(cache_dir) + if csv_file is None: + raise FileNotFoundError(f"No CSV found in downloaded Kaggle dataset at {cache_dir}") + + return load_aptamer_interactions(path=str(csv_file), encoding=encoding, **kwargs) diff --git a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py new file mode 100644 index 00000000..6987deb1 --- /dev/null +++ b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py @@ -0,0 +1,115 @@ +__author__ = "Satarupa22-SD" + +import pytest +import pandas as pd +from pathlib import Path +from unittest.mock import patch + +from pyaptamer.datasets import load_aptadb +from pyaptamer.datasets._loaders.load_aptamer_interactions import ( + load_interactions, +) + + +def test_local_csv(tmp_path): + csv_path = tmp_path / "aptadb_sample.csv" + pd.DataFrame({ + "aptamer_id": ["APT001"], + "aptamer_sequence": ["AUGCUU"], + "target_name": ["Thrombin"], + "interaction_present": ["1"], + }).to_csv(csv_path, index=False) + + df = load_interactions(csv_path) + assert isinstance(df, pd.DataFrame) + assert not df.empty + assert df.loc[0, "aptamer_sequence"] == "AUGCUU" # unchanged + + +def test_uses_cache(tmp_path): + csv_path = tmp_path / "aptadb.csv" + pd.DataFrame({"aptamer_sequence": ["AUGU"], "target_name": ["X"]}).to_csv(csv_path, index=False) + + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions.find_csv", + return_value=csv_path, + ): + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions.download_dataset" + ) as mock_dl: + df = load_aptadb(cache_dir=tmp_path) + assert not df.empty + assert df.loc[0, "aptamer_sequence"] == "AUGU" + mock_dl.assert_not_called() + + +def test_requires_kaggle(tmp_path): + # Ensure no CSV present so a download would be 
attempted + with patch.dict("sys.modules", {"kaggle": None}): + with pytest.raises(ImportError): + load_aptadb(cache_dir=tmp_path) + + +def test_invalid_dataset(tmp_path): + # Force the download path and make it fail + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions.find_csv", + return_value=None, + ): + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions.download_dataset", + side_effect=Exception("boom"), + ): + with pytest.raises(RuntimeError, match=r"Failed to download dataset .* from Kaggle"): + load_aptadb("nonexistent/invalid-dataset", cache_dir=tmp_path) + + +@pytest.fixture +def sample_aptadb_data(): + return pd.DataFrame({ + 'aptamer_id': ['APT001', 'APT002', 'APT003'], + 'target_id': ['TGT001', 'TGT002', 'TGT003'], + 'aptamer_sequence': ['ATCGATCGATCGATCG', 'GCTAGCTAGCTAGCTA', 'TTAACCGGTTAACCGG'], + 'target_name': ['Thrombin', 'VEGF', 'Lysozyme'], + 'target_uniprot': ['P00734', 'P15692', 'P61626'], + 'organism': ['Homo sapiens', 'Homo sapiens', 'Gallus gallus'], + 'ligand_type': ['Protein', 'Protein', 'Protein'], + 'binding_conditions': ['pH 7.4, 25°C', 'pH 7.0, 37°C', 'pH 8.0, 25°C'], + 'reference_pubmed_id': ['12345678', '87654321', '11223344'], + 'interaction_present': [True, True, False] + }) + + +def test_sample_columns(sample_aptadb_data): + df = sample_aptadb_data + assert isinstance(df, pd.DataFrame) + assert len(df) == 3 + + expected_columns = [ + 'aptamer_id', 'target_id', 'aptamer_sequence', 'target_name', + 'target_uniprot', 'organism', 'ligand_type', 'binding_conditions', + 'reference_pubmed_id', 'interaction_present' + ] + + for col in expected_columns: + assert col in df.columns, f"Expected column '{col}' not found in dataset" + + assert df['aptamer_sequence'].dtype == 'object' + assert df['target_name'].dtype == 'object' + + +@pytest.mark.slow +def test_cache_consistency(tmp_path): + # This test verifies that two consecutive calls yield same DataFrame when using cache. + # It still avoids network by seeding the cache with a local CSV. 
+ csv_path = tmp_path / "aptadb.csv" + seeded = pd.DataFrame({"aptamer_sequence": ["AU"], "target_name": ["X"], "interaction_present": [0]}) + seeded.to_csv(csv_path, index=False) + + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions.find_csv", + return_value=csv_path, + ): + df1 = load_aptadb(cache_dir=tmp_path) + df2 = load_aptadb(cache_dir=tmp_path) + pd.testing.assert_frame_equal(df1, df2) \ No newline at end of file From 9eceaaa94df965d9814f35615914a862236e29e1 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Fri, 26 Sep 2025 01:44:06 +0530 Subject: [PATCH 2/9] Update datasets __init__.py --- pyaptamer/datasets/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyaptamer/datasets/__init__.py b/pyaptamer/datasets/__init__.py index c99715eb..05f8a75d 100644 --- a/pyaptamer/datasets/__init__.py +++ b/pyaptamer/datasets/__init__.py @@ -2,11 +2,13 @@ from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure +from pyaptamer.datasets._loaders._online_databank import load_from_rcsb from pyaptamer.datasets._loaders.load_aptamer_interactions import load_aptadb, load_interactions __all__ = [ "load_pfoa_structure", "load_1gnh_structure", "load_aptadb", + "load_from_rcsb", "load_interactions", ] From 478087db78af65edf67491be52e2df39452185d5 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Fri, 26 Sep 2025 01:53:21 +0530 Subject: [PATCH 3/9] Update test --- pyaptamer/datasets/tests/test_aptamer_interactions_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py index 6987deb1..4cbf5ada 100644 --- a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py +++ b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py @@ -76,7 +76,7 @@ def sample_aptadb_data(): 'ligand_type': ['Protein', 'Protein', 'Protein'], 'binding_conditions': ['pH 7.4, 25°C', 'pH 7.0, 37°C', 'pH 8.0, 25°C'], 'reference_pubmed_id': ['12345678', '87654321', '11223344'], - 'interaction_present': [True, True, False] + 'interaction_present': [1, 1, 0] }) From 18730037475abda6a52f8c51537ca1cd3922d827 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Sun, 5 Oct 2025 01:15:16 +0530 Subject: [PATCH 4/9] Add docstrings and fix linting issues for aptadb loader --- pyaptamer/datasets/__init__.py | 7 +- pyaptamer/datasets/_loaders/__init__.py | 12 +- .../_loaders/load_aptamer_interactions.py | 221 +++++++--- .../tests/test_aptamer_interactions_loader.py | 399 +++++++++++++----- 4 files changed, 477 insertions(+), 162 deletions(-) diff --git a/pyaptamer/datasets/__init__.py b/pyaptamer/datasets/__init__.py index 05f8a75d..de156dad 100644 --- a/pyaptamer/datasets/__init__.py +++ b/pyaptamer/datasets/__init__.py @@ -1,9 +1,12 @@ """Contains datasets along with their loaders.""" from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure -from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure from pyaptamer.datasets._loaders._online_databank import load_from_rcsb -from pyaptamer.datasets._loaders.load_aptamer_interactions import load_aptadb, load_interactions +from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure +from pyaptamer.datasets._loaders.load_aptamer_interactions import ( + load_aptadb, + load_interactions, +) __all__ = [ "load_pfoa_structure", diff --git a/pyaptamer/datasets/_loaders/__init__.py 
b/pyaptamer/datasets/_loaders/__init__.py index 90cc5cf8..9728696b 100644 --- a/pyaptamer/datasets/_loaders/__init__.py +++ b/pyaptamer/datasets/_loaders/__init__.py @@ -2,6 +2,14 @@ from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure -from pyaptamer.datasets._loaders.load_aptamer_interactions import load_aptadb, load_interactions +from pyaptamer.datasets._loaders.load_aptamer_interactions import ( + load_aptadb, + load_interactions, +) -__all__ = ["load_pfoa_structure", "load_1gnh_structure", "load_aptadb", "load_interactions"] \ No newline at end of file +__all__ = [ + "load_pfoa_structure", + "load_1gnh_structure", + "load_aptadb", + "load_interactions", +] diff --git a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py index 05caa3ba..25f519c1 100644 --- a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py +++ b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py @@ -2,65 +2,166 @@ __all__ = ["load_aptadb", "load_aptamer_interactions", "load_interactions"] from pathlib import Path -from typing import Optional, Union import pandas as pd -def download_dataset(dataset_name: str, target_dir: Path, force_download: bool = False): - """Download dataset_name into target_dir using Kaggle API and unzip there.""" - import kaggle # avoid import-time auth +def _download_dataset( + dataset_name: str, target_dir: Path, force_download: bool = False +) -> None: + """Download a Kaggle dataset to the specified directory and unzip it. + + This is a private helper function used internally by the module. + + Parameters + ---------- + dataset_name : str + The Kaggle dataset identifier in format "username/dataset-name" + target_dir : Path + Directory where the dataset should be downloaded and extracted + force_download : bool, default False + If True, download even if CSV files already exist in target_dir + + Raises + ------ + ImportError + If the kaggle package is not installed + Exception + If the download fails for any reason + + Notes + ----- + This function requires the kaggle package to be installed and properly + configured with API credentials. + """ + import kaggle # avoid import-time auth + target_dir.mkdir(parents=True, exist_ok=True) + + # Only download if forced or no CSV files exist if force_download or not any(target_dir.glob("*.csv")): - kaggle.api.dataset_download_files(dataset_name, path=str(target_dir), unzip=True) + kaggle.api.dataset_download_files( + dataset_name, path=str(target_dir), unzip=True + ) + + +def _find_csv(directory: Path) -> Path | None: + """Find the most appropriate CSV file in a directory. + This is a private helper function that implements smart CSV file detection. -def find_csv(directory: Path): + Parameters + ---------- + directory : Path + Directory to search for CSV files + + Returns + ------- + Path or None + Path to the most appropriate CSV file, or None if no CSV files found + + Notes + ----- + Selection priority: + 1. If only one CSV file exists, return it + 2. If multiple CSV files exist, prefer files with names containing: + "aptamer", "interaction", "main", or "data" + 3. 
If no preferred names found, return the first CSV file + """ csv_files = list(directory.glob("*.csv")) + if not csv_files: return None + if len(csv_files) == 1: return csv_files[0] + + # Look for files with preferred keywords in their names + preferred_keywords = ["aptamer", "interaction", "main", "data"] candidates = [ - f for f in csv_files - if any(t in f.name.lower() for t in ["aptamer", "interaction", "main", "data"]) + f + for f in csv_files + if any(keyword in f.name.lower() for keyword in preferred_keywords) ] + return candidates[0] if candidates else csv_files[0] +def _normalize_interaction_present(df: pd.DataFrame) -> None: + """Normalize interaction present column in the dataset. + This is a private helper function for data preprocessing. + Currently a placeholder for future implementation. + Parameters + ---------- + df : pd.DataFrame + The dataframe to normalize -def normalize_interaction_present(df: pd.DataFrame) -> None: + Notes + ----- + This function is currently not implemented and serves as a placeholder + for future data normalization functionality. + """ + # TODO: Implement interaction present normalization return def load_aptamer_interactions( - path: Union[str, Path], + path: str | Path, *, - encoding: Optional[str] = None, + encoding: str | None = None, **read_csv_kwargs, -): - """ - Load AptaDB-style CSV into a pandas.DataFrame. +) -> pd.DataFrame: + """Load an aptamer interactions CSV file into a pandas DataFrame. + + This function provides robust CSV loading with automatic encoding detection + and error handling for various file formats commonly found in biological + datasets. Parameters ---------- - path : str | Path - Path to the CSV file. - encoding : str | None - Specific file encoding. If None, several encodings are tried. - **read_csv_kwargs : Any - Additional arguments forwarded to pandas.read_csv. + path : str or Path + Path to the CSV file containing aptamer interaction data + encoding : str, optional + Specific file encoding to use. 
If None (default), multiple common + encodings will be tried automatically + **read_csv_kwargs + Additional keyword arguments passed directly to pandas.read_csv() + + Returns + ------- + pd.DataFrame + DataFrame containing the loaded aptamer interaction data + + Raises + ------ + RuntimeError + If the CSV file cannot be read with any of the attempted encodings + + Notes + ----- + The function attempts the following encodings in order: + - utf-8 + - utf-8-sig (for files with BOM) + - latin-1 + - cp1252 + - windows-1252 """ - candidate_encodings = [ - "utf-8", - "utf-8-sig", - "latin-1", - "cp1252", - "windows-1252", - ] if encoding is None else [encoding] - last_error: Optional[Exception] = None + candidate_encodings = ( + [ + "utf-8", + "utf-8-sig", # For files with byte order mark + "latin-1", + "cp1252", # Common Windows encoding + "windows-1252", # Alternative Windows encoding + ] + if encoding is None + else [encoding] + ) + + last_error: Exception | None = None + for enc in candidate_encodings: try: df = pd.read_csv(path, encoding=enc, **read_csv_kwargs) @@ -68,17 +169,24 @@ def load_aptamer_interactions( except Exception as e: last_error = e continue - # If all encodings failed, raise the last error - raise RuntimeError(f"Failed to read CSV at {path} with candidate encodings {candidate_encodings}: {last_error}") + + raise RuntimeError( + f"Failed to read CSV at {path} with candidate encodings " + f"{candidate_encodings}: {last_error}" + ) def load_interactions( - path: Union[str, Path], + path: str | Path, *, - encoding: Optional[str] = None, + encoding: str | None = None, **read_csv_kwargs, -): - """Simple alias for load_aptamer_interactions.""" +) -> pd.DataFrame: + """Load interaction data from a CSV file. + + This is a convenience alias for load_aptamer_interactions() with identical + functionality and parameters. + """ return load_aptamer_interactions( path=path, encoding=encoding, @@ -88,31 +196,40 @@ def load_interactions( def load_aptadb( dataset_name: str = "satarupadeb/aptamer-interactions", - cache_dir: Optional[Union[str, Path]] = None, + cache_dir: str | Path | None = None, force_download: bool = False, *, - encoding: Optional[str] = None, + encoding: str | None = None, **kwargs, -): - """ - Download (optional) and load the aptamer-interactions Kaggle dataset as pandas.DataFrame. - """ - cache_dir = ( - Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_") - if cache_dir is None else Path(cache_dir) - ) +) -> pd.DataFrame: + """Download and load aptamer-interactions Kaggle dataset as DataFrame.""" + if cache_dir is None: + cache_dir = ( + Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_") + ) + else: + cache_dir = Path(cache_dir) + + csv_file = _find_csv(cache_dir) if cache_dir.exists() else None - csv_file = find_csv(cache_dir) if cache_dir.exists() else None if csv_file is None: try: - download_dataset(dataset_name, cache_dir, force_download=force_download) - except ImportError: - # ImportError for tests and clear messaging when kaggle is missing - raise + _download_dataset(dataset_name, cache_dir, force_download=force_download) + except ImportError as err: + # Re-raise ImportError for clear messaging when kaggle is missing + raise ImportError( + "The 'kaggle' package is required to download datasets. 
" + "Install it with: pip install kaggle" + ) from err except Exception as e: - raise RuntimeError(f"Failed to download dataset '{dataset_name}' from Kaggle: {e}") from e - csv_file = find_csv(cache_dir) + raise RuntimeError( + f"Failed to download dataset '{dataset_name}' from Kaggle: {e}" + ) from e + + csv_file = _find_csv(cache_dir) if csv_file is None: - raise FileNotFoundError(f"No CSV found in downloaded Kaggle dataset at {cache_dir}") + raise FileNotFoundError( + f"No CSV files found in downloaded Kaggle dataset at {cache_dir}" + ) return load_aptamer_interactions(path=str(csv_file), encoding=encoding, **kwargs) diff --git a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py index 4cbf5ada..ee5a902a 100644 --- a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py +++ b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py @@ -1,115 +1,302 @@ __author__ = "Satarupa22-SD" +__all__ = ["load_aptadb", "load_aptamer_interactions", "load_interactions"] -import pytest -import pandas as pd from pathlib import Path -from unittest.mock import patch - -from pyaptamer.datasets import load_aptadb -from pyaptamer.datasets._loaders.load_aptamer_interactions import ( - load_interactions, -) - - -def test_local_csv(tmp_path): - csv_path = tmp_path / "aptadb_sample.csv" - pd.DataFrame({ - "aptamer_id": ["APT001"], - "aptamer_sequence": ["AUGCUU"], - "target_name": ["Thrombin"], - "interaction_present": ["1"], - }).to_csv(csv_path, index=False) - - df = load_interactions(csv_path) - assert isinstance(df, pd.DataFrame) - assert not df.empty - assert df.loc[0, "aptamer_sequence"] == "AUGCUU" # unchanged - - -def test_uses_cache(tmp_path): - csv_path = tmp_path / "aptadb.csv" - pd.DataFrame({"aptamer_sequence": ["AUGU"], "target_name": ["X"]}).to_csv(csv_path, index=False) - - with patch( - "pyaptamer.datasets._loaders.load_aptamer_interactions.find_csv", - return_value=csv_path, - ): - with patch( - "pyaptamer.datasets._loaders.load_aptamer_interactions.download_dataset" - ) as mock_dl: - df = load_aptadb(cache_dir=tmp_path) - assert not df.empty - assert df.loc[0, "aptamer_sequence"] == "AUGU" - mock_dl.assert_not_called() - - -def test_requires_kaggle(tmp_path): - # Ensure no CSV present so a download would be attempted - with patch.dict("sys.modules", {"kaggle": None}): - with pytest.raises(ImportError): - load_aptadb(cache_dir=tmp_path) - - -def test_invalid_dataset(tmp_path): - # Force the download path and make it fail - with patch( - "pyaptamer.datasets._loaders.load_aptamer_interactions.find_csv", - return_value=None, - ): - with patch( - "pyaptamer.datasets._loaders.load_aptamer_interactions.download_dataset", - side_effect=Exception("boom"), - ): - with pytest.raises(RuntimeError, match=r"Failed to download dataset .* from Kaggle"): - load_aptadb("nonexistent/invalid-dataset", cache_dir=tmp_path) - - -@pytest.fixture -def sample_aptadb_data(): - return pd.DataFrame({ - 'aptamer_id': ['APT001', 'APT002', 'APT003'], - 'target_id': ['TGT001', 'TGT002', 'TGT003'], - 'aptamer_sequence': ['ATCGATCGATCGATCG', 'GCTAGCTAGCTAGCTA', 'TTAACCGGTTAACCGG'], - 'target_name': ['Thrombin', 'VEGF', 'Lysozyme'], - 'target_uniprot': ['P00734', 'P15692', 'P61626'], - 'organism': ['Homo sapiens', 'Homo sapiens', 'Gallus gallus'], - 'ligand_type': ['Protein', 'Protein', 'Protein'], - 'binding_conditions': ['pH 7.4, 25°C', 'pH 7.0, 37°C', 'pH 8.0, 25°C'], - 'reference_pubmed_id': ['12345678', '87654321', '11223344'], - 
'interaction_present': [1, 1, 0] - }) - - -def test_sample_columns(sample_aptadb_data): - df = sample_aptadb_data - assert isinstance(df, pd.DataFrame) - assert len(df) == 3 - - expected_columns = [ - 'aptamer_id', 'target_id', 'aptamer_sequence', 'target_name', - 'target_uniprot', 'organism', 'ligand_type', 'binding_conditions', - 'reference_pubmed_id', 'interaction_present' + +import pandas as pd + + +def _download_dataset( + dataset_name: str, target_dir: Path, force_download: bool = False +) -> None: + """Download a Kaggle dataset to the specified directory and unzip it. + + This is a private helper function used internally by the module. + + Parameters + ---------- + dataset_name : str + The Kaggle dataset identifier in format "username/dataset-name" + target_dir : Path + Directory where the dataset should be downloaded and extracted + force_download : bool, default False + If True, download even if CSV files already exist in target_dir + + Raises + ------ + ImportError + If the kaggle package is not installed + Exception + If the download fails for any reason + + Notes + ----- + This function requires the kaggle package to be installed and properly + configured with API credentials. + """ + import kaggle # avoid import-time auth + + target_dir.mkdir(parents=True, exist_ok=True) + + # Only download if forced or no CSV files exist + if force_download or not any(target_dir.glob("*.csv")): + kaggle.api.dataset_download_files( + dataset_name, path=str(target_dir), unzip=True + ) + + +def _find_csv(directory: Path) -> Path | None: + """Find the most appropriate CSV file in a directory. + + This is a private helper function that implements smart CSV file detection. + + Parameters + ---------- + directory : Path + Directory to search for CSV files + + Returns + ------- + Path or None + Path to the most appropriate CSV file, or None if no CSV files found + + Notes + ----- + Selection priority: + 1. If only one CSV file exists, return it + 2. If multiple CSV files exist, prefer files with names containing: + "aptamer", "interaction", "main", or "data" + 3. If no preferred names found, return the first CSV file + """ + csv_files = list(directory.glob("*.csv")) + + if not csv_files: + return None + + if len(csv_files) == 1: + return csv_files[0] + + # Look for files with preferred keywords in their names + preferred_keywords = ["aptamer", "interaction", "main", "data"] + candidates = [ + f + for f in csv_files + if any(keyword in f.name.lower() for keyword in preferred_keywords) ] - for col in expected_columns: - assert col in df.columns, f"Expected column '{col}' not found in dataset" + return candidates[0] if candidates else csv_files[0] + + +def _normalize_interaction_present(df: pd.DataFrame) -> None: + """Normalize interaction present column in the dataset. + + This is a private helper function for data preprocessing. + Currently a placeholder for future implementation. + + Parameters + ---------- + df : pd.DataFrame + The dataframe to normalize + + Notes + ----- + This function is currently not implemented and serves as a placeholder + for future data normalization functionality. + """ + # TODO: Implement interaction present normalization + return + + +def load_aptamer_interactions( + path: str | Path, + *, + encoding: str | None = None, + **read_csv_kwargs, +) -> pd.DataFrame: + """Load an aptamer interactions CSV file into a pandas DataFrame. 
+ + This function provides robust CSV loading with automatic encoding detection + and error handling for various file formats commonly found in biological + datasets. + + Parameters + ---------- + path : str or Path + Path to the CSV file containing aptamer interaction data + encoding : str, optional + Specific file encoding to use. If None (default), multiple common + encodings will be tried automatically + **read_csv_kwargs + Additional keyword arguments passed directly to pandas.read_csv() + + Returns + ------- + pd.DataFrame + DataFrame containing the loaded aptamer interaction data + + Raises + ------ + RuntimeError + If the CSV file cannot be read with any of the attempted encodings + + Notes + ----- + The function attempts the following encodings in order: + - utf-8 + - utf-8-sig (for files with BOM) + - latin-1 + - cp1252 + - windows-1252 + """ + # Define candidate encodings to try + candidate_encodings = ( + [ + "utf-8", + "utf-8-sig", # For files with byte order mark + "latin-1", + "cp1252", # Common Windows encoding + "windows-1252", # Alternative Windows encoding + ] + if encoding is None + else [encoding] + ) + + last_error: Exception | None = None + + # Try each encoding until one works + for enc in candidate_encodings: + try: + df = pd.read_csv(path, encoding=enc, **read_csv_kwargs) + return df + except Exception as e: + last_error = e + continue + + # If all encodings failed, raise informative error + raise RuntimeError( + f"Failed to read CSV at {path} with candidate encodings " + f"{candidate_encodings}: {last_error}" + ) + + +def load_interactions( + path: str | Path, + *, + encoding: str | None = None, + **read_csv_kwargs, +) -> pd.DataFrame: + """Load interaction data from a CSV file. + + This is a convenience alias for load_aptamer_interactions() with identical + functionality and parameters. + + Parameters + ---------- + path : str or Path + Path to the CSV file containing interaction data + encoding : str, optional + Specific file encoding to use. If None, automatic detection is used + **read_csv_kwargs + Additional keyword arguments passed to pandas.read_csv() + + Returns + ------- + pd.DataFrame + DataFrame containing the loaded interaction data + + See Also + -------- + load_aptamer_interactions : The main function this aliases + """ + return load_aptamer_interactions( + path=path, + encoding=encoding, + **read_csv_kwargs, + ) + + +def load_aptadb( + dataset_name: str = "satarupadeb/aptamer-interactions", + cache_dir: str | Path | None = None, + force_download: bool = False, + *, + encoding: str | None = None, + **kwargs, +) -> pd.DataFrame: + """Download and load aptamer-interactions Kaggle dataset as DataFrame. + + This is the main function for accessing aptamer interaction data. It + handles dataset downloading, caching, and loading with a single function + call. + + Parameters + ---------- + dataset_name : str, default "satarupadeb/aptamer-interactions" + Kaggle dataset identifier in format "username/dataset-name" + cache_dir : str, Path, or None, default None + Directory for caching downloaded datasets. If None, uses + ~/.pyaptamer/cache/dataset_name + force_download : bool, default False + If True, re-download the dataset even if it exists in cache + encoding : str, optional + Specific file encoding for CSV reading. 
If None, automatic detection + is used + **kwargs + Additional keyword arguments passed to pandas.read_csv() + + Returns + ------- + pd.DataFrame + DataFrame containing the aptamer interaction dataset + + Raises + ------ + ImportError + If the kaggle package is not installed + RuntimeError + If dataset download fails + FileNotFoundError + If no CSV files are found in the downloaded dataset + + Notes + ----- + - Requires kaggle package installation and API configuration + - First run will download data depending on dataset size + - Subsequent runs use cached data unless force_download=True + - Cache directory structure: cache_dir/username_dataset-name/ + """ + # Set up cache directory + if cache_dir is None: + cache_dir = ( + Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_") + ) + else: + cache_dir = Path(cache_dir) - assert df['aptamer_sequence'].dtype == 'object' - assert df['target_name'].dtype == 'object' + # Check if we already have a cached CSV file + csv_file = _find_csv(cache_dir) if cache_dir.exists() else None + # Download dataset if no cached file exists + if csv_file is None: + try: + _download_dataset(dataset_name, cache_dir, force_download=force_download) + except ImportError: + # Re-raise ImportError for clear messaging when kaggle is missing + raise ImportError( + "The 'kaggle' package is required to download datasets. " + "Install it with: pip install kaggle" + ) from None + except Exception as e: + raise RuntimeError( + f"Failed to download dataset '{dataset_name}' from Kaggle: {e}" + ) from e -@pytest.mark.slow -def test_cache_consistency(tmp_path): - # This test verifies that two consecutive calls yield same DataFrame when using cache. - # It still avoids network by seeding the cache with a local CSV. - csv_path = tmp_path / "aptadb.csv" - seeded = pd.DataFrame({"aptamer_sequence": ["AU"], "target_name": ["X"], "interaction_present": [0]}) - seeded.to_csv(csv_path, index=False) + # Find the CSV file in downloaded data + csv_file = _find_csv(cache_dir) + if csv_file is None: + raise FileNotFoundError( + f"No CSV files found in downloaded Kaggle dataset at {cache_dir}" + ) - with patch( - "pyaptamer.datasets._loaders.load_aptamer_interactions.find_csv", - return_value=csv_path, - ): - df1 = load_aptadb(cache_dir=tmp_path) - df2 = load_aptadb(cache_dir=tmp_path) - pd.testing.assert_frame_equal(df1, df2) \ No newline at end of file + # Load and return the CSV data + return load_aptamer_interactions(path=str(csv_file), encoding=encoding, **kwargs) From cafe7e5a979741359fe81bb14eefe510303b9cd3 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Sun, 5 Oct 2025 19:05:55 +0530 Subject: [PATCH 5/9] Add tests --- .../tests/test_aptamer_interactions_loader.py | 389 ++++++------------ 1 file changed, 137 insertions(+), 252 deletions(-) diff --git a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py index ee5a902a..43a0baba 100644 --- a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py +++ b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py @@ -1,302 +1,187 @@ __author__ = "Satarupa22-SD" -__all__ = ["load_aptadb", "load_aptamer_interactions", "load_interactions"] -from pathlib import Path +from unittest.mock import patch import pandas as pd +import pytest +from pyaptamer.datasets import load_aptadb +from pyaptamer.datasets._loaders.load_aptamer_interactions import ( + load_interactions, +) -def _download_dataset( - dataset_name: str, target_dir: Path, force_download: 
bool = False -) -> None: - """Download a Kaggle dataset to the specified directory and unzip it. - This is a private helper function used internally by the module. +def test_local_csv(tmp_path): + """Test loading aptamer data from a local CSV file. Parameters ---------- - dataset_name : str - The Kaggle dataset identifier in format "username/dataset-name" - target_dir : Path - Directory where the dataset should be downloaded and extracted - force_download : bool, default False - If True, download even if CSV files already exist in target_dir - - Raises - ------ - ImportError - If the kaggle package is not installed - Exception - If the download fails for any reason - - Notes - ----- - This function requires the kaggle package to be installed and properly - configured with API credentials. + tmp_path : Path + Pytest fixture providing a temporary directory """ - import kaggle # avoid import-time auth - - target_dir.mkdir(parents=True, exist_ok=True) + csv_path = tmp_path / "aptadb_sample.csv" + pd.DataFrame( + { + "aptamer_id": ["APT001"], + "aptamer_sequence": ["AUGCUU"], + "target_name": ["Thrombin"], + "interaction_present": ["1"], + } + ).to_csv(csv_path, index=False) - # Only download if forced or no CSV files exist - if force_download or not any(target_dir.glob("*.csv")): - kaggle.api.dataset_download_files( - dataset_name, path=str(target_dir), unzip=True - ) + df = load_interactions(csv_path) + assert isinstance(df, pd.DataFrame) + assert not df.empty + assert df.loc[0, "aptamer_sequence"] == "AUGCUU" -def _find_csv(directory: Path) -> Path | None: - """Find the most appropriate CSV file in a directory. - - This is a private helper function that implements smart CSV file detection. +def test_uses_cache(tmp_path): + """Test that cached data is used instead of downloading. Parameters ---------- - directory : Path - Directory to search for CSV files - - Returns - ------- - Path or None - Path to the most appropriate CSV file, or None if no CSV files found - - Notes - ----- - Selection priority: - 1. If only one CSV file exists, return it - 2. If multiple CSV files exist, prefer files with names containing: - "aptamer", "interaction", "main", or "data" - 3. If no preferred names found, return the first CSV file + tmp_path : Path + Pytest fixture providing a temporary directory """ - csv_files = list(directory.glob("*.csv")) - - if not csv_files: - return None - - if len(csv_files) == 1: - return csv_files[0] - - # Look for files with preferred keywords in their names - preferred_keywords = ["aptamer", "interaction", "main", "data"] - candidates = [ - f - for f in csv_files - if any(keyword in f.name.lower() for keyword in preferred_keywords) - ] - - return candidates[0] if candidates else csv_files[0] + csv_path = tmp_path / "aptadb.csv" + pd.DataFrame({"aptamer_sequence": ["AUGU"], "target_name": ["X"]}).to_csv( + csv_path, index=False + ) + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions._find_csv", + return_value=csv_path, + ): + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions._download_dataset" + ) as mock_dl: + df = load_aptadb(cache_dir=tmp_path) + assert not df.empty + assert df.loc[0, "aptamer_sequence"] == "AUGU" + mock_dl.assert_not_called() -def _normalize_interaction_present(df: pd.DataFrame) -> None: - """Normalize interaction present column in the dataset. - This is a private helper function for data preprocessing. - Currently a placeholder for future implementation. 
+def test_requires_kaggle(tmp_path): + """Test that ImportError is raised when kaggle package is missing. Parameters ---------- - df : pd.DataFrame - The dataframe to normalize - - Notes - ----- - This function is currently not implemented and serves as a placeholder - for future data normalization functionality. + tmp_path : Path + Pytest fixture providing a temporary directory """ - # TODO: Implement interaction present normalization - return - + # Ensure no CSV present so a download would be attempted + with patch.dict("sys.modules", {"kaggle": None}): + with pytest.raises(ImportError): + load_aptadb(cache_dir=tmp_path) -def load_aptamer_interactions( - path: str | Path, - *, - encoding: str | None = None, - **read_csv_kwargs, -) -> pd.DataFrame: - """Load an aptamer interactions CSV file into a pandas DataFrame. - This function provides robust CSV loading with automatic encoding detection - and error handling for various file formats commonly found in biological - datasets. +def test_invalid_dataset(tmp_path): + """Test error handling for invalid dataset download. Parameters ---------- - path : str or Path - Path to the CSV file containing aptamer interaction data - encoding : str, optional - Specific file encoding to use. If None (default), multiple common - encodings will be tried automatically - **read_csv_kwargs - Additional keyword arguments passed directly to pandas.read_csv() + tmp_path : Path + Pytest fixture providing a temporary directory + """ + # Force the download path and make it fail + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions._find_csv", + return_value=None, + ): + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions._download_dataset", + side_effect=Exception("boom"), + ): + with pytest.raises( + RuntimeError, match=r"Failed to download dataset .* from Kaggle" + ): + load_aptadb("nonexistent/invalid-dataset", cache_dir=tmp_path) + + +@pytest.fixture +def sample_aptadb_data(): + """Create sample aptamer interaction data for testing. 
Returns ------- pd.DataFrame - DataFrame containing the loaded aptamer interaction data - - Raises - ------ - RuntimeError - If the CSV file cannot be read with any of the attempted encodings - - Notes - ----- - The function attempts the following encodings in order: - - utf-8 - - utf-8-sig (for files with BOM) - - latin-1 - - cp1252 - - windows-1252 + Sample DataFrame with aptamer interaction data """ - # Define candidate encodings to try - candidate_encodings = ( - [ - "utf-8", - "utf-8-sig", # For files with byte order mark - "latin-1", - "cp1252", # Common Windows encoding - "windows-1252", # Alternative Windows encoding - ] - if encoding is None - else [encoding] - ) - - last_error: Exception | None = None - - # Try each encoding until one works - for enc in candidate_encodings: - try: - df = pd.read_csv(path, encoding=enc, **read_csv_kwargs) - return df - except Exception as e: - last_error = e - continue - - # If all encodings failed, raise informative error - raise RuntimeError( - f"Failed to read CSV at {path} with candidate encodings " - f"{candidate_encodings}: {last_error}" + return pd.DataFrame( + { + "aptamer_id": ["APT001", "APT002", "APT003"], + "target_id": ["TGT001", "TGT002", "TGT003"], + "aptamer_sequence": [ + "ATCGATCGATCGATCG", + "GCTAGCTAGCTAGCTA", + "TTAACCGGTTAACCGG", + ], + "target_name": ["Thrombin", "VEGF", "Lysozyme"], + "target_uniprot": ["P00734", "P15692", "P61626"], + "organism": ["Homo sapiens", "Homo sapiens", "Gallus gallus"], + "ligand_type": ["Protein", "Protein", "Protein"], + "binding_conditions": ["pH 7.4, 25°C", "pH 7.0, 37°C", "pH 8.0, 25°C"], + "reference_pubmed_id": ["12345678", "87654321", "11223344"], + "interaction_present": [1, 1, 0], + } ) -def load_interactions( - path: str | Path, - *, - encoding: str | None = None, - **read_csv_kwargs, -) -> pd.DataFrame: - """Load interaction data from a CSV file. - - This is a convenience alias for load_aptamer_interactions() with identical - functionality and parameters. +def test_sample_columns(sample_aptadb_data): + """Test that sample data contains expected columns and data types. Parameters ---------- - path : str or Path - Path to the CSV file containing interaction data - encoding : str, optional - Specific file encoding to use. If None, automatic detection is used - **read_csv_kwargs - Additional keyword arguments passed to pandas.read_csv() + sample_aptadb_data : pd.DataFrame + Fixture providing sample aptamer data + """ + df = sample_aptadb_data + assert isinstance(df, pd.DataFrame) + assert len(df) == 3 + + expected_columns = [ + "aptamer_id", + "target_id", + "aptamer_sequence", + "target_name", + "target_uniprot", + "organism", + "ligand_type", + "binding_conditions", + "reference_pubmed_id", + "interaction_present", + ] - Returns - ------- - pd.DataFrame - DataFrame containing the loaded interaction data + for col in expected_columns: + assert col in df.columns, f"Expected column '{col}' not found in dataset" - See Also - -------- - load_aptamer_interactions : The main function this aliases - """ - return load_aptamer_interactions( - path=path, - encoding=encoding, - **read_csv_kwargs, - ) + assert df["aptamer_sequence"].dtype == "object" + assert df["target_name"].dtype == "object" -def load_aptadb( - dataset_name: str = "satarupadeb/aptamer-interactions", - cache_dir: str | Path | None = None, - force_download: bool = False, - *, - encoding: str | None = None, - **kwargs, -) -> pd.DataFrame: - """Download and load aptamer-interactions Kaggle dataset as DataFrame. 
+@pytest.mark.slow +def test_cache_consistency(tmp_path): + """Test that consecutive calls with cache yield identical DataFrames. - This is the main function for accessing aptamer interaction data. It - handles dataset downloading, caching, and loading with a single function - call. + This test verifies that two consecutive calls yield same DataFrame + when using cache. It avoids network by seeding the cache with a + local CSV. Parameters ---------- - dataset_name : str, default "satarupadeb/aptamer-interactions" - Kaggle dataset identifier in format "username/dataset-name" - cache_dir : str, Path, or None, default None - Directory for caching downloaded datasets. If None, uses - ~/.pyaptamer/cache/dataset_name - force_download : bool, default False - If True, re-download the dataset even if it exists in cache - encoding : str, optional - Specific file encoding for CSV reading. If None, automatic detection - is used - **kwargs - Additional keyword arguments passed to pandas.read_csv() - - Returns - ------- - pd.DataFrame - DataFrame containing the aptamer interaction dataset - - Raises - ------ - ImportError - If the kaggle package is not installed - RuntimeError - If dataset download fails - FileNotFoundError - If no CSV files are found in the downloaded dataset - - Notes - ----- - - Requires kaggle package installation and API configuration - - First run will download data depending on dataset size - - Subsequent runs use cached data unless force_download=True - - Cache directory structure: cache_dir/username_dataset-name/ + tmp_path : Path + Pytest fixture providing a temporary directory """ - # Set up cache directory - if cache_dir is None: - cache_dir = ( - Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_") - ) - else: - cache_dir = Path(cache_dir) - - # Check if we already have a cached CSV file - csv_file = _find_csv(cache_dir) if cache_dir.exists() else None - - # Download dataset if no cached file exists - if csv_file is None: - try: - _download_dataset(dataset_name, cache_dir, force_download=force_download) - except ImportError: - # Re-raise ImportError for clear messaging when kaggle is missing - raise ImportError( - "The 'kaggle' package is required to download datasets. 
" - "Install it with: pip install kaggle" - ) from None - except Exception as e: - raise RuntimeError( - f"Failed to download dataset '{dataset_name}' from Kaggle: {e}" - ) from e - - # Find the CSV file in downloaded data - csv_file = _find_csv(cache_dir) - if csv_file is None: - raise FileNotFoundError( - f"No CSV files found in downloaded Kaggle dataset at {cache_dir}" - ) - - # Load and return the CSV data - return load_aptamer_interactions(path=str(csv_file), encoding=encoding, **kwargs) + csv_path = tmp_path / "aptadb.csv" + seeded = pd.DataFrame( + {"aptamer_sequence": ["AU"], "target_name": ["X"], "interaction_present": [0]} + ) + seeded.to_csv(csv_path, index=False) + + with patch( + "pyaptamer.datasets._loaders.load_aptamer_interactions._find_csv", + return_value=csv_path, + ): + df1 = load_aptadb(cache_dir=tmp_path) + df2 = load_aptadb(cache_dir=tmp_path) + pd.testing.assert_frame_equal(df1, df2) From 8b2982af5b03fdbc8aa04161668b8a2bc3150daf Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Sun, 5 Oct 2025 20:10:47 +0530 Subject: [PATCH 6/9] Update docsting --- .../_loaders/load_aptamer_interactions.py | 115 +++++++++--------- 1 file changed, 59 insertions(+), 56 deletions(-) diff --git a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py index 25f519c1..106f31a7 100644 --- a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py +++ b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py @@ -11,28 +11,25 @@ def _download_dataset( ) -> None: """Download a Kaggle dataset to the specified directory and unzip it. - This is a private helper function used internally by the module. - Parameters ---------- dataset_name : str - The Kaggle dataset identifier in format "username/dataset-name" + Kaggle dataset identifier like "username/dataset-name". target_dir : Path - Directory where the dataset should be downloaded and extracted + Directory to download and extract the dataset. force_download : bool, default False - If True, download even if CSV files already exist in target_dir + If True, download even if CSV files already exist in target_dir. Raises ------ ImportError - If the kaggle package is not installed + If the kaggle package is not installed. Exception - If the download fails for any reason + If the download fails for any reason. Notes ----- - This function requires the kaggle package to be installed and properly - configured with API credentials. + Requires kaggle package installed and configured with API credentials. """ import kaggle # avoid import-time auth @@ -46,27 +43,25 @@ def _download_dataset( def _find_csv(directory: Path) -> Path | None: - """Find the most appropriate CSV file in a directory. - - This is a private helper function that implements smart CSV file detection. + """Return the most appropriate CSV file path from a directory. Parameters ---------- directory : Path - Directory to search for CSV files + Directory to look for CSV files. Returns ------- Path or None - Path to the most appropriate CSV file, or None if no CSV files found + Path to CSV file or None if none found. Notes ----- - Selection priority: - 1. If only one CSV file exists, return it - 2. If multiple CSV files exist, prefer files with names containing: - "aptamer", "interaction", "main", or "data" - 3. If no preferred names found, return the first CSV file + Preference order: + 1. If only one CSV, return it. + 2. If multiple, prefer files with "aptamer", "interaction", "main", or "data" + in name. + 3. 
Otherwise, return first CSV found. """ csv_files = list(directory.glob("*.csv")) @@ -76,7 +71,6 @@ def _find_csv(directory: Path) -> Path | None: if len(csv_files) == 1: return csv_files[0] - # Look for files with preferred keywords in their names preferred_keywords = ["aptamer", "interaction", "main", "data"] candidates = [ f @@ -88,22 +82,17 @@ def _find_csv(directory: Path) -> Path | None: def _normalize_interaction_present(df: pd.DataFrame) -> None: - """Normalize interaction present column in the dataset. - - This is a private helper function for data preprocessing. - Currently a placeholder for future implementation. + """Placeholder to normalize 'interaction_present' column in the DataFrame. Parameters ---------- df : pd.DataFrame - The dataframe to normalize + DataFrame to normalize. Notes ----- - This function is currently not implemented and serves as a placeholder - for future data normalization functionality. + Currently not implemented, kept for future data normalization. """ - # TODO: Implement interaction present normalization return @@ -113,48 +102,40 @@ def load_aptamer_interactions( encoding: str | None = None, **read_csv_kwargs, ) -> pd.DataFrame: - """Load an aptamer interactions CSV file into a pandas DataFrame. + """Load aptamer interactions CSV into a pandas DataFrame. - This function provides robust CSV loading with automatic encoding detection - and error handling for various file formats commonly found in biological - datasets. + Tries common encodings automatically for robust loading. Parameters ---------- path : str or Path - Path to the CSV file containing aptamer interaction data + Path to CSV file with aptamer interactions. encoding : str, optional - Specific file encoding to use. If None (default), multiple common - encodings will be tried automatically + Specific file encoding to use. If None, tries common encodings. **read_csv_kwargs - Additional keyword arguments passed directly to pandas.read_csv() + Additional arguments passed to pandas.read_csv(). Returns ------- pd.DataFrame - DataFrame containing the loaded aptamer interaction data + DataFrame with aptamer interaction data. Raises ------ RuntimeError - If the CSV file cannot be read with any of the attempted encodings + If CSV cannot be read with any attempted encodings. Notes ----- - The function attempts the following encodings in order: - - utf-8 - - utf-8-sig (for files with BOM) - - latin-1 - - cp1252 - - windows-1252 + Encodings tried (in order): utf-8, utf-8-sig, latin-1, cp1252, windows-1252. """ candidate_encodings = ( [ "utf-8", - "utf-8-sig", # For files with byte order mark + "utf-8-sig", "latin-1", - "cp1252", # Common Windows encoding - "windows-1252", # Alternative Windows encoding + "cp1252", + "windows-1252", ] if encoding is None else [encoding] @@ -171,8 +152,7 @@ def load_aptamer_interactions( continue raise RuntimeError( - f"Failed to read CSV at {path} with candidate encodings " - f"{candidate_encodings}: {last_error}" + f"Failed to read CSV {path} with encodings {candidate_encodings}: {last_error}" ) @@ -182,11 +162,7 @@ def load_interactions( encoding: str | None = None, **read_csv_kwargs, ) -> pd.DataFrame: - """Load interaction data from a CSV file. - - This is a convenience alias for load_aptamer_interactions() with identical - functionality and parameters. 
- """ + """Alias for load_aptamer_interactions with same parameters and return.""" return load_aptamer_interactions( path=path, encoding=encoding, @@ -202,7 +178,35 @@ def load_aptadb( encoding: str | None = None, **kwargs, ) -> pd.DataFrame: - """Download and load aptamer-interactions Kaggle dataset as DataFrame.""" + """Download and load aptamer-interactions dataset from Kaggle as DataFrame. + + Parameters + ---------- + dataset_name : str, optional + Kaggle dataset name. + cache_dir : str or Path, optional + Local directory for caching dataset files. + force_download : bool, default False + If True, download dataset even if cached files exist. + encoding : str, optional + Encoding for CSV file loading. + **kwargs + Additional arguments passed to CSV loader. + + Returns + ------- + pd.DataFrame + Loaded dataset as a pandas DataFrame. + + Raises + ------ + ImportError + If the 'kaggle' package is missing. + RuntimeError + If dataset download fails. + FileNotFoundError + If no CSV file found after download. + """ if cache_dir is None: cache_dir = ( Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_") @@ -216,7 +220,6 @@ def load_aptadb( try: _download_dataset(dataset_name, cache_dir, force_download=force_download) except ImportError as err: - # Re-raise ImportError for clear messaging when kaggle is missing raise ImportError( "The 'kaggle' package is required to download datasets. " "Install it with: pip install kaggle" From 08c10513ba62aa8187c4883a45e54dffa9a46ad0 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Tue, 7 Oct 2025 02:51:56 +0530 Subject: [PATCH 7/9] Remove normalise function --- .../_loaders/load_aptamer_interactions.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py index 106f31a7..c0be31cd 100644 --- a/pyaptamer/datasets/_loaders/load_aptamer_interactions.py +++ b/pyaptamer/datasets/_loaders/load_aptamer_interactions.py @@ -81,21 +81,6 @@ def _find_csv(directory: Path) -> Path | None: return candidates[0] if candidates else csv_files[0] -def _normalize_interaction_present(df: pd.DataFrame) -> None: - """Placeholder to normalize 'interaction_present' column in the DataFrame. - - Parameters - ---------- - df : pd.DataFrame - DataFrame to normalize. - - Notes - ----- - Currently not implemented, kept for future data normalization. - """ - return - - def load_aptamer_interactions( path: str | Path, *, From 9a5fb63123c5c6d28537b0ffbb363e6f2be26b22 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Thu, 9 Oct 2025 23:11:42 +0530 Subject: [PATCH 8/9] add tests --- .../datasets/tests/test_aptamer_interactions_loader.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py index 43a0baba..953b16c8 100644 --- a/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py +++ b/pyaptamer/datasets/tests/test_aptamer_interactions_loader.py @@ -13,7 +13,6 @@ def test_local_csv(tmp_path): """Test loading aptamer data from a local CSV file. - Parameters ---------- tmp_path : Path @@ -37,7 +36,6 @@ def test_local_csv(tmp_path): def test_uses_cache(tmp_path): """Test that cached data is used instead of downloading. - Parameters ---------- tmp_path : Path @@ -63,7 +61,6 @@ def test_uses_cache(tmp_path): def test_requires_kaggle(tmp_path): """Test that ImportError is raised when kaggle package is missing. 
- Parameters ---------- tmp_path : Path @@ -77,7 +74,6 @@ def test_requires_kaggle(tmp_path): def test_invalid_dataset(tmp_path): """Test error handling for invalid dataset download. - Parameters ---------- tmp_path : Path @@ -101,7 +97,6 @@ def test_invalid_dataset(tmp_path): @pytest.fixture def sample_aptadb_data(): """Create sample aptamer interaction data for testing. - Returns ------- pd.DataFrame @@ -129,7 +124,6 @@ def sample_aptadb_data(): def test_sample_columns(sample_aptadb_data): """Test that sample data contains expected columns and data types. - Parameters ---------- sample_aptadb_data : pd.DataFrame @@ -162,11 +156,9 @@ def test_sample_columns(sample_aptadb_data): @pytest.mark.slow def test_cache_consistency(tmp_path): """Test that consecutive calls with cache yield identical DataFrames. - This test verifies that two consecutive calls yield same DataFrame when using cache. It avoids network by seeding the cache with a local CSV. - Parameters ---------- tmp_path : Path From ba79a32c606b12b9aa943cd858a40eca4e2778b0 Mon Sep 17 00:00:00 2001 From: Satarupa22-SD Date: Sun, 12 Oct 2025 00:03:55 +0530 Subject: [PATCH 9/9] update dependencies --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index ce9a019e..58a794e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "scikit-learn>=1.3.0", "skorch", "imblearn", + "kaggle", ] [project.optional-dependencies]
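
---

Usage note (a minimal sketch, not part of the patches above): the example below shows the loader API this series adds, assuming the patches are applied and Kaggle API credentials are configured in ~/.kaggle/kaggle.json. The file name "aptadb_sample.csv" is a hypothetical local path used only for illustration.

    from pyaptamer.datasets import load_aptadb, load_interactions

    # First call downloads the Kaggle dataset into the default cache
    # (~/.pyaptamer/cache/satarupadeb_aptamer-interactions/); subsequent
    # calls reuse the cached CSV instead of hitting the network.
    df = load_aptadb()
    print(len(df), list(df.columns))

    # Load an AptaDB-style CSV directly from disk; when encoding is None the
    # loader tries utf-8, utf-8-sig, latin-1, cp1252 and windows-1252 in turn.
    local_df = load_interactions("aptadb_sample.csv")  # hypothetical path

    # A custom cache directory; note that force_download is only consulted
    # when no CSV is already present in cache_dir (per the patched logic).
    df2 = load_aptadb(cache_dir="/tmp/aptadb_cache", force_download=True)

If the kaggle package is missing, load_aptadb raises ImportError with an install hint; a failed download surfaces as RuntimeError, matching the tests in patch 5.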