6 changes: 6 additions & 0 deletions pyaptamer/datasets/__init__.py
@@ -3,9 +3,15 @@
from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure
from pyaptamer.datasets._loaders._online_databank import load_from_rcsb
from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure
from pyaptamer.datasets._loaders.load_aptamer_interactions import (
load_aptadb,
load_interactions,
)

__all__ = [
"load_pfoa_structure",
"load_1gnh_structure",
"load_aptadb",
"load_from_rcsb",
"load_interactions",
]
11 changes: 10 additions & 1 deletion pyaptamer/datasets/_loaders/__init__.py
@@ -2,5 +2,14 @@

from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure
from pyaptamer.datasets._loaders._pfoa_loader import load_pfoa_structure
from pyaptamer.datasets._loaders.load_aptamer_interactions import (
load_aptadb,
load_interactions,
)

__all__ = ["load_pfoa_structure", "load_1gnh_structure"]
__all__ = [
"load_pfoa_structure",
"load_1gnh_structure",
"load_aptadb",
"load_interactions",
]
235 changes: 235 additions & 0 deletions pyaptamer/datasets/_loaders/load_aptamer_interactions.py
@@ -0,0 +1,235 @@
__author__ = "Satarupa22-SD"
__all__ = ["load_aptadb", "load_aptamer_interactions", "load_interactions"]

from pathlib import Path

import pandas as pd


def _download_dataset(
dataset_name: str, target_dir: Path, force_download: bool = False
) -> None:
"""Download a Kaggle dataset to the specified directory and unzip it.

This is a private helper function used internally by the module.

Parameters
----------
dataset_name : str
The Kaggle dataset identifier in format "username/dataset-name"
target_dir : Path
Directory where the dataset should be downloaded and extracted
force_download : bool, default False
If True, download even if CSV files already exist in target_dir

Raises
------
ImportError
If the kaggle package is not installed
Exception
If the download fails for any reason

Notes

Collaborator: GPT tends to generate Notes sections; I don't think we should add additional text to read in a new subsection, but it could be developer preference. What do you think, @fkiraly?

Contributor: I would move it up to the top.

-----
This function requires the kaggle package to be installed and properly
configured with API credentials.
"""
    import kaggle  # imported lazily to avoid Kaggle auth at module import time

target_dir.mkdir(parents=True, exist_ok=True)

# Only download if forced or no CSV files exist
if force_download or not any(target_dir.glob("*.csv")):

Collaborator: I do not understand why the parent directory cannot have other CSV files in it. What if the user wants multiple datasets in one directory? Moreover, why are we assuming that CSV is the only file format one can download from Kaggle?

Author: The dataset that we are working with is in CSV format.

Collaborator: Oh I see; can you rename the function to _download_aptadb then? Moreover, your docstring is generalized to suggest it is meant for all Kaggle datasets; kindly rewrite it to indicate it is for AptaDB only.

Contributor: But the function can download any CSV, not just AptaDB?

Author: Yes, it's a generic function to download datasets from Kaggle.

Author: The current dataset is a combination of three files from the original AptaDB. It was created this way to ensure the correct dataset is selected in case any updates are made to the original AptaDB, or if we narrow down or expand the scope of the current dataset. Currently it targets only aptamers, but if we include complexes in the future and publish a new dataset for them, we can update this function as needed. The default cache structure uses dataset-specific subdirectories (~/.pyaptamer/cache/{dataset_name}/), which automatically isolates different Kaggle datasets in separate folders. When load_aptadb() is called multiple times, it reuses the cached CSV without creating duplicates (unless force_download=True is explicitly used).

Collaborator: Regarding "the default cache structure": where is this cache structure created? What if the user wants multiple datasets in one directory? How does the cache come into play when the user is using our AptaDB loader?

Author: If the dataset is not present in the cache, the loader downloads it with the Kaggle API (users have to create their own credentials), saves it to the cache directory, and returns the DataFrame. If the dataset already exists in the cache, the download is skipped and the CSV is loaded from the cache itself.

Collaborator: I meant, where is the cache directory coming from? Is this something that is created in the user's workspace once your loader is used?

Author: Yes, it's currently created in the user's home directory. (See the usage sketch after this function.)


kaggle.api.dataset_download_files(
dataset_name, path=str(target_dir), unzip=True
)
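
To make the caching behavior discussed above concrete, here is a minimal usage sketch; it assumes the kaggle package is installed and API credentials are configured in ~/.kaggle/kaggle.json:

from pyaptamer.datasets import load_aptadb

# First call: downloads the dataset into
# ~/.pyaptamer/cache/satarupadeb_aptamer-interactions/ and loads the CSV
df = load_aptadb()

# Later calls find the cached CSV and skip the Kaggle download entirely
df = load_aptadb()

# force_download=True refreshes the cached copy from Kaggle
df = load_aptadb(force_download=True)
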


def _find_csv(directory: Path) -> Path | None:

Collaborator: Could you explain why this function is useful, with an example?

Author: This is with reference to the last function; sorry, I missed this. In case there are different files in the future with varied filenames, this function helps find the dataset we want, which in this case is aptamer_interactions. Suppose the dataset is extended to complexes as well, with aptamer_interactions.csv and complexes.csv in the same root Kaggle folder; this function would return aptamer_interactions.csv. I added these extended functions so that if the dataset expands, or we get a new dataset with similar data, we can store it under the same Kaggle dataset without having to repeat the functions; or if we want to narrow the dataset scope, we can store that here too. (A sketch follows the function body below.)


"""Find the most appropriate CSV file in a directory.

This is a private helper function that implements smart CSV file detection.

Parameters
----------
directory : Path
Directory to search for CSV files

Returns
-------
Path or None
Path to the most appropriate CSV file, or None if no CSV files found

Notes
-----
Selection priority:
1. If only one CSV file exists, return it
2. If multiple CSV files exist, prefer files with names containing:
"aptamer", "interaction", "main", or "data"
3. If no preferred names found, return the first CSV file
"""
csv_files = list(directory.glob("*.csv"))

if not csv_files:
return None

if len(csv_files) == 1:
return csv_files[0]

# Look for files with preferred keywords in their names
preferred_keywords = ["aptamer", "interaction", "main", "data"]
candidates = [
f
for f in csv_files
if any(keyword in f.name.lower() for keyword in preferred_keywords)
]

return candidates[0] if candidates else csv_files[0]
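
For illustration, a minimal sketch of the selection priority, run in this module's namespace with hypothetical file names in a scratch directory:

import tempfile
from pathlib import Path

tmp = Path(tempfile.mkdtemp())
(tmp / "complexes.csv").write_text("id\n1\n")
(tmp / "aptamer_interactions.csv").write_text("id\n2\n")

# "aptamer_interactions.csv" matches the preferred keywords, so it wins
print(_find_csv(tmp))  # .../aptamer_interactions.csv
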


def _normalize_interaction_present(df: pd.DataFrame) -> None:
"""Normalize interaction present column in the dataset.

This is a private helper function for data preprocessing.
Currently a placeholder for future implementation.

Parameters
----------
df : pd.DataFrame
The dataframe to normalize

Notes
-----
This function is currently not implemented and serves as a placeholder
for future data normalization functionality.
"""
# TODO: Implement interaction present normalization
return


def load_aptamer_interactions(
path: str | Path,
*,
encoding: str | None = None,
**read_csv_kwargs,
) -> pd.DataFrame:
"""Load an aptamer interactions CSV file into a pandas DataFrame.

This function provides robust CSV loading with automatic encoding detection
and error handling for various file formats commonly found in biological
datasets.

Parameters
----------
path : str or Path
Path to the CSV file containing aptamer interaction data
encoding : str, optional
Specific file encoding to use. If None (default), multiple common
encodings will be tried automatically
**read_csv_kwargs
Additional keyword arguments passed directly to pandas.read_csv()

Returns
-------
pd.DataFrame
DataFrame containing the loaded aptamer interaction data

Raises
------
RuntimeError
If the CSV file cannot be read with any of the attempted encodings

Notes
-----
The function attempts the following encodings in order:
- utf-8
- utf-8-sig (for files with BOM)
- latin-1
- cp1252
- windows-1252
"""
candidate_encodings = (
[
"utf-8",
"utf-8-sig", # For files with byte order mark
"latin-1",
"cp1252", # Common Windows encoding
"windows-1252", # Alternative Windows encoding
]
if encoding is None
else [encoding]
)

last_error: Exception | None = None

for enc in candidate_encodings:
try:
df = pd.read_csv(path, encoding=enc, **read_csv_kwargs)
return df
except Exception as e:
last_error = e
continue

raise RuntimeError(
f"Failed to read CSV at {path} with candidate encodings "
f"{candidate_encodings}: {last_error}"
)
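
A short usage sketch of the encoding fallback; the file path is hypothetical:

# Automatic detection: tries utf-8, utf-8-sig, latin-1, cp1252, windows-1252
df = load_aptamer_interactions("data/aptamer_interactions.csv")

# Or pin a known encoding and forward extra options to pandas.read_csv
df = load_aptamer_interactions(
    "data/aptamer_interactions.csv",
    encoding="latin-1",
    sep=",",
)
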


def load_interactions(
path: str | Path,
*,
encoding: str | None = None,
**read_csv_kwargs,
) -> pd.DataFrame:
"""Load interaction data from a CSV file.

This is a convenience alias for load_aptamer_interactions() with identical
functionality and parameters.
"""
return load_aptamer_interactions(
path=path,
encoding=encoding,
**read_csv_kwargs,
)


def load_aptadb(
dataset_name: str = "satarupadeb/aptamer-interactions",

Collaborator: Reminder: when your PR is ready to be merged, we should move this dataset to an account under GCOS.

Author: Yes, sure. I think the process is similar to how we do it on Hugging Face. @fkiraly, does GCOS have an organization on Kaggle?

Collaborator: I could do it once your PR is ready; just remind me please and I will get it sorted out 😄


cache_dir: str | Path | None = None,
force_download: bool = False,
*,
encoding: str | None = None,
**kwargs,
) -> pd.DataFrame:
"""Download and load aptamer-interactions Kaggle dataset as DataFrame."""
if cache_dir is None:
cache_dir = (
Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_")
)
else:
cache_dir = Path(cache_dir)

    # Skip the cache lookup when a fresh download is explicitly requested,
    # so that force_download actually reaches _download_dataset
    csv_file = (
        _find_csv(cache_dir) if cache_dir.exists() and not force_download else None
    )

if csv_file is None:
try:
_download_dataset(dataset_name, cache_dir, force_download=force_download)
except ImportError as err:
# Re-raise ImportError for clear messaging when kaggle is missing
raise ImportError(
"The 'kaggle' package is required to download datasets. "
"Install it with: pip install kaggle"
) from err
except Exception as e:
raise RuntimeError(
f"Failed to download dataset '{dataset_name}' from Kaggle: {e}"
) from e

csv_file = _find_csv(cache_dir)
if csv_file is None:
raise FileNotFoundError(
f"No CSV files found in downloaded Kaggle dataset at {cache_dir}"
)

return load_aptamer_interactions(path=str(csv_file), encoding=encoding, **kwargs)
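
End to end, a minimal sketch of the public entry point; the custom cache directory and nrows value are illustrative:

from pyaptamer.datasets import load_aptadb

# Cache in a project-local directory and forward pandas.read_csv options
df = load_aptadb(cache_dir="./data/aptadb", nrows=100)
print(df.shape)
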