NVIDIA
diff --git a/‎sub-packages/bionemo-geneformer/examples/geneformer-celltype-classification.ipynb‎
Lines changed: 19 additions & 9 deletions b/‎sub-packages/bionemo-geneformer/examples/geneformer-celltype-classification.ipynb‎
Lines changed: 19 additions & 9 deletions
diff --git a/‎sub-packages/bionemo-geneformer/examples/geneformer-gene-embedding-GRN.ipynb‎
Lines changed: 17 additions & 7 deletions b/‎sub-packages/bionemo-geneformer/examples/geneformer-gene-embedding-GRN.ipynb‎
Lines changed: 17 additions & 7 deletions
diff --git a/‎sub-packages/bionemo-geneformer/examples/geneformer_cellxgene_tutorial.ipynb‎
Lines changed: 9 additions & 6 deletions b/‎sub-packages/bionemo-geneformer/examples/geneformer_cellxgene_tutorial.ipynb‎
Lines changed: 9 additions & 6 deletions
diff --git a/‎sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_dataset.py‎
Lines changed: 6 additions & 6 deletions b/‎sub-packages/bionemo-geneformer/tests/bionemo/geneformer/test_dataset.py‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎sub-packages/bionemo-scdl/README.md‎
Lines changed: 14 additions & 1 deletion b/‎sub-packages/bionemo-scdl/README.md‎
Lines changed: 14 additions & 1 deletion
diff --git a/‎sub-packages/bionemo-scdl/simple-benchmark/README.md‎
Lines changed: 2 additions & 1 deletion b/‎sub-packages/bionemo-scdl/simple-benchmark/README.md‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎sub-packages/bionemo-scdl/simple-benchmark/scdl_speedtest.py‎
Lines changed: 19 additions & 5 deletions b/‎sub-packages/bionemo-scdl/simple-benchmark/scdl_speedtest.py‎
Lines changed: 19 additions & 5 deletions
@@ -25,9 +25,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/pbinder/miniforge3/envs/newenv12/lib/python3.12/functools.py:907: ImplicitModificationWarning: Transforming to str index.\n",
+      "  return dispatch(args[0].__class__)(*args, **kw)\n",
+      "/home/pbinder/miniforge3/envs/newenv12/lib/python3.12/functools.py:907: ImplicitModificationWarning: Transforming to str index.\n",
+      "  return dispatch(args[0].__class__)(*args, **kw)\n"
+     ]
+    },
     {
      "data": {
       "text/plain": [
@@ -64,7 +74,7 @@
        " 'vein endothelial cell']"
       ]
      },
-     "execution_count": 1,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -87,7 +97,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -96,7 +106,7 @@
        "(8000, 60664)"
       ]
      },
-     "execution_count": 2,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -132,7 +142,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -162,11 +172,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
-    "!convert_h5ad_to_scdl --data-path {input_dir} --save-path {data_dir}"
+    "!convert_h5ad_to_scdl --data-path {input_dir} --save-path {data_dir} --use-X-not-raw"
    ]
   },
   {
@@ -178,7 +188,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -194,7 +204,7 @@
        " 'version.json']"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
 
@@ -17,6 +17,16 @@
    "execution_count": 1,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/pbinder/miniforge3/envs/newenv12/lib/python3.12/functools.py:907: ImplicitModificationWarning: Transforming to str index.\n",
+      "  return dispatch(args[0].__class__)(*args, **kw)\n",
+      "/home/pbinder/miniforge3/envs/newenv12/lib/python3.12/functools.py:907: ImplicitModificationWarning: Transforming to str index.\n",
+      "  return dispatch(args[0].__class__)(*args, **kw)\n"
+     ]
+    },
     {
      "data": {
       "text/plain": [
@@ -161,7 +171,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/home/ubuntu/.cache/bionemo/notebook_tutorials/geneformer_geneembeddings-GRN\n"
+      "/home/pbinder/.cache/bionemo/notebook_tutorials/geneformer_geneembeddings-GRN\n"
      ]
     }
    ],
@@ -178,11 +188,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
-    "!convert_h5ad_to_scdl --data-path {input_dir} --save-path {data_dir}"
+    "!convert_h5ad_to_scdl --data-path {input_dir} --save-path {data_dir} --use-X-not-raw"
    ]
   },
   {
@@ -194,7 +204,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "metadata": {
     "scrolled": true
    },
@@ -212,7 +222,7 @@
        " 'version.json']"
       ]
      },
-     "execution_count": 6,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1253,7 +1263,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "newenv12",
    "language": "python",
    "name": "python3"
   },
@@ -1267,7 +1277,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.12.5"
   }
  },
  "nbformat": 4,
 
@@ -152,17 +152,20 @@
     "# Create training data processed directory\n",
     "!convert_h5ad_to_scdl \\\n",
     "  --data-path {train_tutorial_data_dir} \\\n",
-    "  --save-path {train_tutorial_processed_dir}\n",
+    "  --save-path {train_tutorial_processed_dir} \\\n",
+    "  --use-X-not-raw\n",
     "\n",
     "# Create validation data processed directory\n",
     "!convert_h5ad_to_scdl \\\n",
     "  --data-path {val_tutorial_data_dir} \\\n",
-    "  --save-path {val_tutorial_processed_dir}\n",
+    "  --save-path {val_tutorial_processed_dir} \\\n",
+    "  --use-X-not-raw\n",
     "\n",
     "# Create test data processed directory\n",
     "!convert_h5ad_to_scdl \\\n",
     "  --data-path {test_tutorial_data_dir} \\\n",
-    "  --save-path {test_tutorial_processed_dir}"
+    "  --save-path {test_tutorial_processed_dir} \\\n",
+    "  --use-X-not-raw"
    ]
   },
   {
@@ -1454,7 +1457,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
    "id": "c8e8d923-faea-487a-90c6-e59a6e99c41a",
    "metadata": {},
    "outputs": [],
@@ -1463,7 +1466,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "newenv12",
    "language": "python",
    "name": "python3"
   },
@@ -1477,7 +1480,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.12.5"
   }
  },
  "nbformat": 4,
 
@@ -44,19 +44,19 @@ def test_load_sc_datasets(tmp_path, test_directory_feat_ids):
     tokenizer = MagicMock()
     sc_memmap_dataset_path0 = tmp_path / "test_data_0"
     ds_0 = SingleCellMemMapDataset(
-        str(sc_memmap_dataset_path0), h5ad_path=str(test_directory_feat_ids / "adata_sample0.h5ad")
+        str(sc_memmap_dataset_path0), h5ad_path=str(test_directory_feat_ids / "adata_sample0.h5ad"), use_X_not_raw=True
     )  # create the memmap dataset format from h5ad for testing purposes
     dataset0 = SingleCellDataset(str(sc_memmap_dataset_path0), tokenizer)
     assert len(dataset0) == len(ds_0) == 8
     sc_memmap_dataset_path1 = tmp_path / "test_data_1"
     ds_1 = SingleCellMemMapDataset(
-        str(sc_memmap_dataset_path1), h5ad_path=str(test_directory_feat_ids / "adata_sample1.h5ad")
+        str(sc_memmap_dataset_path1), h5ad_path=str(test_directory_feat_ids / "adata_sample1.h5ad"), use_X_not_raw=True
     )  # create the memmap dataset format from h5ad for testing purposes
     dataset1 = SingleCellDataset(str(sc_memmap_dataset_path1), tokenizer)
     assert len(dataset1) == len(ds_1) == 6
     sc_memmap_dataset_path2 = tmp_path / "test_data_2"
     ds_2 = SingleCellMemMapDataset(
-        str(sc_memmap_dataset_path2), h5ad_path=str(test_directory_feat_ids / "adata_sample2.h5ad")
+        str(sc_memmap_dataset_path2), h5ad_path=str(test_directory_feat_ids / "adata_sample2.h5ad"), use_X_not_raw=True
     )  # create the memmap dataset format from h5ad for testing purposes
     dataset2 = SingleCellDataset(str(sc_memmap_dataset_path2), tokenizer)
     assert len(dataset2) == len(ds_2) == 100
@@ -82,7 +82,7 @@ def test_gene_not_in_tok_vocab(tmp_path, test_directory_feat_ids):
     adata.var["feature_id"] = synthetic_ids
     adata.write(sc_h5ad_dataset_path0)
     SingleCellMemMapDataset(
-        str(sc_memmap_dataset_path0), h5ad_path=str(sc_h5ad_dataset_path0)
+        str(sc_memmap_dataset_path0), h5ad_path=str(sc_h5ad_dataset_path0), use_X_not_raw=True
     )  # create the memmap dataset format from h5ad for testing purposes
     preprocessor = GeneformerPreprocess(
         download_directory=str(sc_memmap_dataset_path0),
@@ -115,7 +115,7 @@ def test_gene_not_in_tok_vocab(tmp_path, test_directory_feat_ids):
 def test_empty_gene_data_input(tmp_path, test_directory_feat_ids):
     sc_memmap_dataset_path0 = tmp_path / "test_data_0"
     SingleCellMemMapDataset(
-        str(sc_memmap_dataset_path0), h5ad_path=str(test_directory_feat_ids / "adata_sample0.h5ad")
+        str(sc_memmap_dataset_path0), h5ad_path=str(test_directory_feat_ids / "adata_sample0.h5ad"), use_X_not_raw=True
     )  # create the memmap dataset format from h5ad for testing purposes
     preprocessor = GeneformerPreprocess(
         download_directory=str(sc_memmap_dataset_path0),
@@ -158,7 +158,7 @@ def test_lookup_row(tmp_path, cellx_small_directory):
 def test_get_item_synthetic(tmp_path, test_directory_feat_ids):
     sc_memmap_dataset_path0 = tmp_path / "test_data_0"
     SingleCellMemMapDataset(
-        sc_memmap_dataset_path0, h5ad_path=test_directory_feat_ids / "adata_sample0.h5ad"
+        sc_memmap_dataset_path0, h5ad_path=test_directory_feat_ids / "adata_sample0.h5ad", use_X_not_raw=True
     )  # create the memmap dataset format from h5ad for testing purposes
     preprocessor = GeneformerPreprocess(
         download_directory=sc_memmap_dataset_path0,
 
@@ -48,6 +48,19 @@ If the dataset is large, the AnnData file can be lazy-loaded and then read in ba
 - `paginated_load_cutoff`, which sets the minimal file size in megabytes at which an AnnData file will be loaded in a paginated manner.
 - `load_block_row_size`, which is the number of rows that are read into memory at a given time.
 
+### Loading `raw.X` vs `.X` from the anndata file
+
+By default, SCDL loads the data from `raw.X` in the AnnData file. To load `.X` instead, pass `use_X_not_raw=True`
+during dataset creation:
+
+```python
+from bionemo.scdl.io.single_cell_memmap_dataset import SingleCellMemMapDataset
+
+data = SingleCellMemMapDataset(
+    "97e_scmm", "hdf5s/97e96fb1-8caf-4f08-9174-27308eabd4ea.h5ad", use_X_not_raw=True
+)
+```
+
 ### Interrogating single cell datasets and exploring the API
 
 ```python
@@ -206,7 +219,7 @@ During dataset concatenation, it is assumed that all of the data types are eithe
 To convert multiple files with a given data format, the user can run:
 
 ```bash
-convert_h5ad_to_scdl --data-path hdf5s --save-path example_dataset [--data-dtype float64 --paginated_load_cutoff 10_000 --load-block-row-size 1_000_000]
+convert_h5ad_to_scdl --data-path hdf5s --save-path example_dataset [--data-dtype float64 --paginated_load_cutoff 10_000 --load-block-row-size 1_000_000 --use-X-not-raw]
 ```
 
 ## Runtimes with SCDL
 
@@ -113,6 +113,7 @@ python scdl_speedtest.py --generate-baseline -i my_data.h5ad --scdl-path /path/t
 | `--scdl-path`           | Path to SCDL dataset (optional, only used with --generate-baseline)                                                  | None                     |
 | `--num-epochs`          | The number of epochs (passes through the training dataset).                                                          | 1                        |
 | `--num-runs`            | Number of benchmark runs to average (for more stable and reliable measurements)                                      | 1                        |
+| `--use-X-not-raw`       | Set to use the .X, not the raw.X from an anndata file at conversion time                                             | False                    |
 
 ## Sample Output
 
@@ -259,7 +260,7 @@ full conversion; however, running a single plate of the data should give you a g
 on your system. The following command will run the speedtest on the first plate, as downloaded above:
 
 ```bash
-python scdl_speedtest.py --generate-baseline -i tahoe-100m/h5ad/plate1_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad --warmup-time 30 --max-time 120
+python scdl_speedtest.py --generate-baseline -i tahoe-100m/h5ad/plate1_filt_Vevo_Tahoe100M_WServicesFrom_ParseGigalab.h5ad --warmup-time 30 --max-time 120 --use-X-not-raw
 ```
 
 Alternatively, on the fully converted data:
 
@@ -868,7 +868,9 @@ def get_sampler(sampling_scheme: str, dataset: torch.utils.data.Dataset):
     return shuffle, sampler
 
 
-def create_dataloader_factory(input_path: str, sampling_scheme: str, batch_size: int = 32, use_anndata: bool = False):
+def create_dataloader_factory(
+    input_path: str, sampling_scheme: str, batch_size: int = 32, use_anndata: bool = False, use_X_not_raw: bool = False
+):
     """Create a factory function for the dataloader."""
     # Track conversion metrics globally to be accessible later
     conversion_metrics = {"time": 0.0, "performed": False}
@@ -924,7 +926,9 @@ def factory():
                     if input_path.endswith(".h5ad"):
                         print(f"Converting h5ad to SCDL format: {Path(input_path).name}")
                         conversion_start = time.perf_counter()
-                        dataset = SingleCellMemMapDataset(data_path=data_dir, h5ad_path=input_path)
+                        dataset = SingleCellMemMapDataset(
+                            data_path=data_dir, h5ad_path=input_path, use_X_not_raw=use_X_not_raw
+                        )
                         conversion_end = time.perf_counter()
                         conversion_time = conversion_end - conversion_start
                         conversion_metrics["time"] = conversion_time
@@ -934,7 +938,9 @@ def factory():
                         # Directory: convert all h5ad files in the directory
                         with tempfile.TemporaryDirectory() as temp_dir:
                             coll = SingleCellCollection(temp_dir)
-                            coll.load_h5ad_multi(input_path, max_workers=4, use_processes=False)
+                            coll.load_h5ad_multi(
+                                input_path, max_workers=4, use_processes=False, use_X_not_raw=use_X_not_raw
+                            )
                             coll.flatten(data_dir, destroy_on_copy=True)
 
                         conversion_start = time.perf_counter()
@@ -1269,7 +1275,11 @@ def main():
 
     parser.add_argument("--num-epochs", type=int, default=1, help="Number of epochs (default: 1)")
     parser.add_argument("--num-runs", type=int, default=1, help="Number of benchmark runs to average (default: 1)")
-
+    parser.add_argument(
+        "--use-X-not-raw",
+        action="store_true",
+        help="Use .X instead of raw.X from the anndata file (only applicable when generating a SCDL dataset)",
+    )
     args = parser.parse_args()
 
     # Validate num_runs parameter
@@ -1347,7 +1357,11 @@ def run_single_benchmark(name, factory, data_path, run_num=None):
             else:
                 scdl_path = str(input_path)
             scdl_factory = create_dataloader_factory(
-                str(scdl_path), args.sampling_scheme, args.batch_size, use_anndata=False
+                str(scdl_path),
+                args.sampling_scheme,
+                args.batch_size,
+                use_anndata=False,
+                use_X_not_raw=args.use_X_not_raw,
             )
 
             scdl_results = []
Original file line number	Diff line number	Diff line change
`@@ -25,9 +25,19 @@`
`25`	`25`	`},`
`26`	`26`	`{`
`27`	`27`	`"cell_type": "code",`
`28`		`- "execution_count": 1,`
	`28`	`+ "execution_count": 2,`
`29`	`29`	`"metadata": {},`
`30`	`30`	`"outputs": [`
	`31`	`+ {`
	`32`	`+ "name": "stderr",`
	`33`	`+ "output_type": "stream",`
	`34`	`+ "text": [`
	`35`	`+ "/home/pbinder/miniforge3/envs/newenv12/lib/python3.12/functools.py:907: ImplicitModificationWarning: Transforming to str index.\n",`
	`36`	`+ " return dispatch(args[0].__class__)(*args, **kw)\n",`
	`37`	`+ "/home/pbinder/miniforge3/envs/newenv12/lib/python3.12/functools.py:907: ImplicitModificationWarning: Transforming to str index.\n",`
	`38`	`+ " return dispatch(args[0].__class__)(*args, **kw)\n"`
	`39`	`+ ]`
	`40`	`+ },`
`31`	`41`	`{`
`32`	`42`	`"data": {`
`33`	`43`	`"text/plain": [`
`@@ -64,7 +74,7 @@`
`64`	`74`	`" 'vein endothelial cell']"`
`65`	`75`	`]`
`66`	`76`	`},`
`67`		`- "execution_count": 1,`
	`77`	`+ "execution_count": 2,`
`68`	`78`	`"metadata": {},`
`69`	`79`	`"output_type": "execute_result"`
`70`	`80`	`}`
`@@ -87,7 +97,7 @@`
`87`	`97`	`},`
`88`	`98`	`{`
`89`	`99`	`"cell_type": "code",`
`90`		`- "execution_count": 2,`
	`100`	`+ "execution_count": 3,`
`91`	`101`	`"metadata": {},`
`92`	`102`	`"outputs": [`
`93`	`103`	`{`
`@@ -96,7 +106,7 @@`
`96`	`106`	`"(8000, 60664)"`
`97`	`107`	`]`
`98`	`108`	`},`
`99`		`- "execution_count": 2,`
	`109`	`+ "execution_count": 3,`
`100`	`110`	`"metadata": {},`
`101`	`111`	`"output_type": "execute_result"`
`102`	`112`	`}`
`@@ -132,7 +142,7 @@`
`132`	`142`	`},`
`133`	`143`	`{`
`134`	`144`	`"cell_type": "code",`
`135`		`- "execution_count": 3,`
	`145`	`+ "execution_count": 4,`
`136`	`146`	`"metadata": {},`
`137`	`147`	`"outputs": [],`
`138`	`148`	`"source": [`
`@@ -162,11 +172,11 @@`
`162`	`172`	`},`
`163`	`173`	`{`
`164`	`174`	`"cell_type": "code",`
`165`		`- "execution_count": 4,`
	`175`	`+ "execution_count": 5,`
`166`	`176`	`"metadata": {},`
`167`	`177`	`"outputs": [],`
`168`	`178`	`"source": [`
`169`		`- "!convert_h5ad_to_scdl --data-path {input_dir} --save-path {data_dir}"`
	`179`	`+ "!convert_h5ad_to_scdl --data-path {input_dir} --save-path {data_dir} --use-X-not-raw"`
`170`	`180`	`]`
`171`	`181`	`},`
`172`	`182`	`{`
`@@ -178,7 +188,7 @@`
`178`	`188`	`},`
`179`	`189`	`{`
`180`	`190`	`"cell_type": "code",`
`181`		`- "execution_count": 5,`
	`191`	`+ "execution_count": 6,`
`182`	`192`	`"metadata": {},`
`183`	`193`	`"outputs": [`
`184`	`194`	`{`
`@@ -194,7 +204,7 @@`
`194`	`204`	`" 'version.json']"`
`195`	`205`	`]`
`196`	`206`	`},`
`197`		`- "execution_count": 5,`
	`207`	`+ "execution_count": 6,`
`198`	`208`	`"metadata": {},`
`199`	`209`	`"output_type": "execute_result"`
`200`	`210`	`}`