Skip to content

Commit 00f7de6

Browse files
Fix to paginated loading in SCDL. (#1285)
### Description This addresses the issue: #1222. Paginated loading in SCDL enables loading in large anndata files block by block. Previously, this would not access the raw .X data by default for the paginated loading, while it is the default for the regular loading pathway. This fix will load raw.X data by default, and .X data if use_X_not_raw is set to true. I have verified that paginated loading is equivalent to regular loading with sample cell x gene files. These have different raw .X values and processed .X values. #### Usage Usage remains the same ### Type of changes <!-- Mark the relevant option with an [x] --> - [x] Bug fix (non-breaking change which fixes an issue) - [ ] New feature (non-breaking change which adds functionality) - [ ] Refactor - [ ] Documentation update - [ ] Other (please describe): <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **Tests** * Added testing for lazy loading of single-cell datasets with raw matrices. * **Bug Fixes** * Improved handling of raw matrices in lazy-loading scenarios for single-cell data processing. <!-- end of auto-generated comment: release notes by coderabbit.ai --> --------- Signed-off-by: polinabinder1 <[email protected]> Co-authored-by: Steven Kothen-Hill <[email protected]>
1 parent 112ad15 commit 00f7de6

15 files changed

+339
-114
lines changed

sub-packages/bionemo-geneformer/examples/geneformer-celltype-classification.ipynb

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,19 @@
2525
},
2626
{
2727
"cell_type": "code",
28-
"execution_count": 1,
28+
"execution_count": 2,
2929
"metadata": {},
3030
"outputs": [
31+
{
32+
"name": "stderr",
33+
"output_type": "stream",
34+
"text": [
35+
"/home/pbinder/miniforge3/envs/newenv12/lib/python3.12/functools.py:907: ImplicitModificationWarning: Transforming to str index.\n",
36+
" return dispatch(args[0].__class__)(*args, **kw)\n",
37+
"/home/pbinder/miniforge3/envs/newenv12/lib/python3.12/functools.py:907: ImplicitModificationWarning: Transforming to str index.\n",
38+
" return dispatch(args[0].__class__)(*args, **kw)\n"
39+
]
40+
},
3141
{
3242
"data": {
3343
"text/plain": [
@@ -64,7 +74,7 @@
6474
" 'vein endothelial cell']"
6575
]
6676
},
67-
"execution_count": 1,
77+
"execution_count": 2,
6878
"metadata": {},
6979
"output_type": "execute_result"
7080
}
@@ -87,7 +97,7 @@
8797
},
8898
{
8999
"cell_type": "code",
90-
"execution_count": 2,
100+
"execution_count": 3,
91101
"metadata": {},
92102
"outputs": [
93103
{
@@ -96,7 +106,7 @@
96106
"(8000, 60664)"
97107
]
98108
},
99-
"execution_count": 2,
109+
"execution_count": 3,
100110
"metadata": {},
101111
"output_type": "execute_result"
102112
}
@@ -132,7 +142,7 @@
132142
},
133143
{
134144
"cell_type": "code",
135-
"execution_count": 3,
145+
"execution_count": 4,
136146
"metadata": {},
137147
"outputs": [],
138148
"source": [
@@ -162,11 +172,11 @@
162172
},
163173
{
164174
"cell_type": "code",
165-
"execution_count": 4,
175+
"execution_count": 5,
166176
"metadata": {},
167177
"outputs": [],
168178
"source": [
169-
"!convert_h5ad_to_scdl --data-path {input_dir} --save-path {data_dir}"
179+
"!convert_h5ad_to_scdl --data-path {input_dir} --save-path {data_dir} --use-X-not-raw"
170180
]
171181
},
172182
{
@@ -178,7 +188,7 @@
178188
},
179189
{
180190
"cell_type": "code",
181-
"execution_count": 5,
191+
"execution_count": 6,
182192
"metadata": {},
183193
"outputs": [
184194
{
@@ -194,7 +204,7 @@
194204
" 'version.json']"
195205
]
196206
},
197-
"execution_count": 5,
207+
"execution_count": 6,
198208
"metadata": {},
199209
"output_type": "execute_result"
200210
}

sub-packages/bionemo-geneformer/examples/geneformer-gene-embedding-GRN.ipynb

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,16 @@
1717
"execution_count": 1,
1818
"metadata": {},
1919
"outputs": [
20+
{
21+
"name": "stderr",
22+
"output_type": "stream",
23+
"text": [
24+
"/home/pbinder/miniforge3/envs/newenv12/lib/python3.12/functools.py:907: ImplicitModificationWarning: Transforming to str index.\n",
25+
" return dispatch(args[0].__class__)(*args, **kw)\n",
26+
"/home/pbinder/miniforge3/envs/newenv12/lib/python3.12/functools.py:907: ImplicitModificationWarning: Transforming to str index.\n",
27+
" return dispatch(args[0].__class__)(*args, **kw)\n"
28+
]
29+
},
2030
{
2131
"data": {
2232
"text/plain": [
@@ -161,7 +171,7 @@
161171
"name": "stdout",
162172
"output_type": "stream",
163173
"text": [
164-
"/home/ubuntu/.cache/bionemo/notebook_tutorials/geneformer_geneembeddings-GRN\n"
174+
"/home/pbinder/.cache/bionemo/notebook_tutorials/geneformer_geneembeddings-GRN\n"
165175
]
166176
}
167177
],
@@ -178,11 +188,11 @@
178188
},
179189
{
180190
"cell_type": "code",
181-
"execution_count": 5,
191+
"execution_count": 7,
182192
"metadata": {},
183193
"outputs": [],
184194
"source": [
185-
"!convert_h5ad_to_scdl --data-path {input_dir} --save-path {data_dir}"
195+
"!convert_h5ad_to_scdl --data-path {input_dir} --save-path {data_dir} --use-X-not-raw"
186196
]
187197
},
188198
{
@@ -194,7 +204,7 @@
194204
},
195205
{
196206
"cell_type": "code",
197-
"execution_count": 6,
207+
"execution_count": 8,
198208
"metadata": {
199209
"scrolled": true
200210
},
@@ -212,7 +222,7 @@
212222
" 'version.json']"
213223
]
214224
},
215-
"execution_count": 6,
225+
"execution_count": 8,
216226
"metadata": {},
217227
"output_type": "execute_result"
218228
}
@@ -1253,7 +1263,7 @@
12531263
],
12541264
"metadata": {
12551265
"kernelspec": {
1256-
"display_name": "Python 3",
1266+
"display_name": "newenv12",
12571267
"language": "python",
12581268
"name": "python3"
12591269
},
@@ -1267,7 +1277,7 @@
12671277
"name": "python",
12681278
"nbconvert_exporter": "python",
12691279
"pygments_lexer": "ipython3",
1270-
"version": "3.12.3"
1280+
"version": "3.12.5"
12711281
}
12721282
},
12731283
"nbformat": 4,

sub-packages/bionemo-geneformer/examples/geneformer_cellxgene_tutorial.ipynb

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -152,17 +152,20 @@
152152
"# Create training data processed directory\n",
153153
"!convert_h5ad_to_scdl \\\n",
154154
" --data-path {train_tutorial_data_dir} \\\n",
155-
" --save-path {train_tutorial_processed_dir}\n",
155+
" --save-path {train_tutorial_processed_dir} \\\n",
156+
" --use-X-not-raw\n",
156157
"\n",
157158
"# Create validation data processed directory\n",
158159
"!convert_h5ad_to_scdl \\\n",
159160
" --data-path {val_tutorial_data_dir} \\\n",
160-
" --save-path {val_tutorial_processed_dir}\n",
161+
" --save-path {val_tutorial_processed_dir} \\\n",
162+
" --use-X-not-raw\n",
161163
"\n",
162164
"# Create test data processed directory\n",
163165
"!convert_h5ad_to_scdl \\\n",
164166
" --data-path {test_tutorial_data_dir} \\\n",
165-
" --save-path {test_tutorial_processed_dir}"
167+
" --save-path {test_tutorial_processed_dir} \\\n",
168+
" --use-X-not-raw"
166169
]
167170
},
168171
{
@@ -1454,7 +1457,7 @@
14541457
},
14551458
{
14561459
"cell_type": "code",
1457-
"execution_count": null,
1460+
"execution_count": 19,
14581461
"id": "c8e8d923-faea-487a-90c6-e59a6e99c41a",
14591462
"metadata": {},
14601463
"outputs": [],
@@ -1463,7 +1466,7 @@
14631466
],
14641467
"metadata": {
14651468
"kernelspec": {
1466-
"display_name": "Python 3",
1469+
"display_name": "newenv12",
14671470
"language": "python",
14681471
"name": "python3"
14691472
},
@@ -1477,7 +1480,7 @@
14771480
"name": "python",
14781481
"nbconvert_exporter": "python",
14791482
"pygments_lexer": "ipython3",
1480-
"version": "3.12.3"
1483+
"version": "3.12.5"
14811484
}
14821485
},
14831486
"nbformat": 4,

sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_dataset.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,19 +44,19 @@ def test_load_sc_datasets(tmp_path, test_directory_feat_ids):
4444
tokenizer = MagicMock()
4545
sc_memmap_dataset_path0 = tmp_path / "test_data_0"
4646
ds_0 = SingleCellMemMapDataset(
47-
str(sc_memmap_dataset_path0), h5ad_path=str(test_directory_feat_ids / "adata_sample0.h5ad")
47+
str(sc_memmap_dataset_path0), h5ad_path=str(test_directory_feat_ids / "adata_sample0.h5ad"), use_X_not_raw=True
4848
) # create the memmap dataset format from h5ad for testing purposes
4949
dataset0 = SingleCellDataset(str(sc_memmap_dataset_path0), tokenizer)
5050
assert len(dataset0) == len(ds_0) == 8
5151
sc_memmap_dataset_path1 = tmp_path / "test_data_1"
5252
ds_1 = SingleCellMemMapDataset(
53-
str(sc_memmap_dataset_path1), h5ad_path=str(test_directory_feat_ids / "adata_sample1.h5ad")
53+
str(sc_memmap_dataset_path1), h5ad_path=str(test_directory_feat_ids / "adata_sample1.h5ad"), use_X_not_raw=True
5454
) # create the memmap dataset format from h5ad for testing purposes
5555
dataset1 = SingleCellDataset(str(sc_memmap_dataset_path1), tokenizer)
5656
assert len(dataset1) == len(ds_1) == 6
5757
sc_memmap_dataset_path2 = tmp_path / "test_data_2"
5858
ds_2 = SingleCellMemMapDataset(
59-
str(sc_memmap_dataset_path2), h5ad_path=str(test_directory_feat_ids / "adata_sample2.h5ad")
59+
str(sc_memmap_dataset_path2), h5ad_path=str(test_directory_feat_ids / "adata_sample2.h5ad"), use_X_not_raw=True
6060
) # create the memmap dataset format from h5ad for testing purposes
6161
dataset2 = SingleCellDataset(str(sc_memmap_dataset_path2), tokenizer)
6262
assert len(dataset2) == len(ds_2) == 100
@@ -82,7 +82,7 @@ def test_gene_not_in_tok_vocab(tmp_path, test_directory_feat_ids):
8282
adata.var["feature_id"] = synthetic_ids
8383
adata.write(sc_h5ad_dataset_path0)
8484
SingleCellMemMapDataset(
85-
str(sc_memmap_dataset_path0), h5ad_path=str(sc_h5ad_dataset_path0)
85+
str(sc_memmap_dataset_path0), h5ad_path=str(sc_h5ad_dataset_path0), use_X_not_raw=True
8686
) # create the memmap dataset format from h5ad for testing purposes
8787
preprocessor = GeneformerPreprocess(
8888
download_directory=str(sc_memmap_dataset_path0),
@@ -115,7 +115,7 @@ def test_gene_not_in_tok_vocab(tmp_path, test_directory_feat_ids):
115115
def test_empty_gene_data_input(tmp_path, test_directory_feat_ids):
116116
sc_memmap_dataset_path0 = tmp_path / "test_data_0"
117117
SingleCellMemMapDataset(
118-
str(sc_memmap_dataset_path0), h5ad_path=str(test_directory_feat_ids / "adata_sample0.h5ad")
118+
str(sc_memmap_dataset_path0), h5ad_path=str(test_directory_feat_ids / "adata_sample0.h5ad"), use_X_not_raw=True
119119
) # create the memmap dataset format from h5ad for testing purposes
120120
preprocessor = GeneformerPreprocess(
121121
download_directory=str(sc_memmap_dataset_path0),
@@ -158,7 +158,7 @@ def test_lookup_row(tmp_path, cellx_small_directory):
158158
def test_get_item_synthetic(tmp_path, test_directory_feat_ids):
159159
sc_memmap_dataset_path0 = tmp_path / "test_data_0"
160160
SingleCellMemMapDataset(
161-
sc_memmap_dataset_path0, h5ad_path=test_directory_feat_ids / "adata_sample0.h5ad"
161+
sc_memmap_dataset_path0, h5ad_path=test_directory_feat_ids / "adata_sample0.h5ad", use_X_not_raw=True
162162
) # create the memmap dataset format from h5ad for testing purposes
163163
preprocessor = GeneformerPreprocess(
164164
download_directory=sc_memmap_dataset_path0,

sub-packages/bionemo-scdl/README.md

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,19 @@ If the dataset is large, the AnnData file can be lazy-loaded and then read in ba
4848
- `paginated_load_cutoff`, which sets the minimum file size in megabytes at which an AnnData file will be read in a paginated manner.
4949
- `load_block_row_size`, which is the number of rows that are read into memory at a given time.
5050

51+
### Loading `raw.X` vs `.X` from the anndata file
52+
53+
By default, SCDL loads the data from `raw.X` in the AnnData file. To load `.X` instead, set `use_X_not_raw=True`
54+
during the dataset creation:
55+
56+
```python
57+
from bionemo.scdl.io.single_cell_memmap_dataset import SingleCellMemMapDataset
58+
59+
data = SingleCellMemMapDataset(
60+
"97e_scmm", "hdf5s/97e96fb1-8caf-4f08-9174-27308eabd4ea.h5ad", use_X_not_raw=True
61+
)
62+
```
63+
5164
### Interrogating single cell datasets and exploring the API
5265

5366
```python
@@ -206,7 +219,7 @@ During dataset concatenation, it is assumed that all of the data types are eithe
206219
To convert multiple files with a given data format, the user can run:
207220

208221
```bash
209-
convert_h5ad_to_scdl --data-path hdf5s --save-path example_dataset [--data-dtype float64 --paginated_load_cutoff 10_000 --load-block-row-size 1_000_000]
222+
convert_h5ad_to_scdl --data-path hdf5s --save-path example_dataset [--data-dtype float64 --paginated_load_cutoff 10_000 --load-block-row-size 1_000_000 --use-X-not-raw]
210223
```
211224

212225
## Runtimes with SCDL

sub-packages/bionemo-scdl/simple-benchmark/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ python scdl_speedtest.py --generate-baseline -i my_data.h5ad --scdl-path /path/t
113113
| `--scdl-path` | Path to SCDL dataset (optional, only used with --generate-baseline) | None |
114114
| `--num-epochs` | The number of epochs (passes through the training dataset). | 1 |
115115
| `--num-runs` | Number of benchmark runs to average (for more stable and reliable measurements) | 1 |
116+
| `--use-X-not-raw` | Use `.X` instead of `raw.X` from the AnnData file when converting to SCDL | False |
116117

117118
## Sample Output
118119

@@ -259,7 +260,7 @@ full conversion; however, running a single plate of the data should give you a g
259260
on your system. The following command will run the speedtest on the first plate, as downloaded above:
260261

261262
```bash
262-
python scdl_speedtest.py --generate-baseline -i tahoe-100m/h5ad/plate1_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad --warmup-time 30 --max-time 120
263+
python scdl_speedtest.py --generate-baseline -i tahoe-100m/h5ad/plate1_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad --warmup-time 30 --max-time 120 --use-X-not-raw
263264
```
264265

265266
Alternatively, on the fully converted data:

sub-packages/bionemo-scdl/simple-benchmark/scdl_speedtest.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -868,7 +868,9 @@ def get_sampler(sampling_scheme: str, dataset: torch.utils.data.Dataset):
868868
return shuffle, sampler
869869

870870

871-
def create_dataloader_factory(input_path: str, sampling_scheme: str, batch_size: int = 32, use_anndata: bool = False):
871+
def create_dataloader_factory(
872+
input_path: str, sampling_scheme: str, batch_size: int = 32, use_anndata: bool = False, use_X_not_raw: bool = False
873+
):
872874
"""Create a factory function for the dataloader."""
873875
# Track conversion metrics globally to be accessible later
874876
conversion_metrics = {"time": 0.0, "performed": False}
@@ -924,7 +926,9 @@ def factory():
924926
if input_path.endswith(".h5ad"):
925927
print(f"Converting h5ad to SCDL format: {Path(input_path).name}")
926928
conversion_start = time.perf_counter()
927-
dataset = SingleCellMemMapDataset(data_path=data_dir, h5ad_path=input_path)
929+
dataset = SingleCellMemMapDataset(
930+
data_path=data_dir, h5ad_path=input_path, use_X_not_raw=use_X_not_raw
931+
)
928932
conversion_end = time.perf_counter()
929933
conversion_time = conversion_end - conversion_start
930934
conversion_metrics["time"] = conversion_time
@@ -934,7 +938,9 @@ def factory():
934938
# Directory: convert all h5ad files in the directory
935939
with tempfile.TemporaryDirectory() as temp_dir:
936940
coll = SingleCellCollection(temp_dir)
937-
coll.load_h5ad_multi(input_path, max_workers=4, use_processes=False)
941+
coll.load_h5ad_multi(
942+
input_path, max_workers=4, use_processes=False, use_X_not_raw=use_X_not_raw
943+
)
938944
coll.flatten(data_dir, destroy_on_copy=True)
939945

940946
conversion_start = time.perf_counter()
@@ -1269,7 +1275,11 @@ def main():
12691275

12701276
parser.add_argument("--num-epochs", type=int, default=1, help="Number of epochs (default: 1)")
12711277
parser.add_argument("--num-runs", type=int, default=1, help="Number of benchmark runs to average (default: 1)")
1272-
1278+
parser.add_argument(
1279+
"--use-X-not-raw",
1280+
action="store_true",
1281+
help="Use .X instead of raw.X from the anndata file (only applicable when generating a SCDL dataset)",
1282+
)
12731283
args = parser.parse_args()
12741284

12751285
# Validate num_runs parameter
@@ -1347,7 +1357,11 @@ def run_single_benchmark(name, factory, data_path, run_num=None):
13471357
else:
13481358
scdl_path = str(input_path)
13491359
scdl_factory = create_dataloader_factory(
1350-
str(scdl_path), args.sampling_scheme, args.batch_size, use_anndata=False
1360+
str(scdl_path),
1361+
args.sampling_scheme,
1362+
args.batch_size,
1363+
use_anndata=False,
1364+
use_X_not_raw=args.use_X_not_raw,
13511365
)
13521366

13531367
scdl_results = []

0 commit comments

Comments
 (0)