# :lollipop: Epigenetics Dataloader for BigWig files

Fast batched dataloading of BigWig files containing epigenetic track data and corresponding sequences, powered by the GPU,
for deep learning applications.

## Quickstart
### Installation with conda/mamba

Bigwig-loader mainly depends on the rapidsai kvikio library and cupy, both of which are best installed using
conda/mamba. Bigwig-loader itself can now also be installed with conda/mamba. To create a new environment with
bigwig-loader installed:

```shell
mamba create -n my-env -c rapidsai -c conda-forge -c bioconda -c dataloading bigwig-loader
```
Or add this to your environment.yml file:

```yaml
name: my-env
channels:
  - rapidsai
  - conda-forge
  - bioconda
  - dataloading
dependencies:
  - bigwig-loader
```
and update:

```shell
mamba env update -f environment.yml
```
### Installation with pip

Bigwig-loader can also be installed using pip in an environment which already has the rapidsai kvikio library
and cupy installed:

```shell
pip install bigwig-loader
```

### PyTorch Example

We wrapped the BigWigDataset in a PyTorch iterable dataset that you can use directly:
```python
# examples/pytorch_example.py
import pandas as pd
import torch
from torch.utils.data import DataLoader
from bigwig_loader import config
from bigwig_loader.pytorch import PytorchBigWigDataset
from bigwig_loader.download_example_data import download_example_data

# Download example data to play with
download_example_data()
example_bigwigs_directory = config.bigwig_dir
reference_genome_file = config.reference_genome

train_regions = pd.DataFrame({"chrom": ["chr1", "chr2"], "start": [0, 0], "end": [1000000, 1000000]})

dataset = PytorchBigWigDataset(
    regions_of_interest=train_regions,
    collection=example_bigwigs_directory,
    reference_genome_path=reference_genome_file,
    sequence_length=1000,
    center_bin_to_predict=500,
    window_size=1,
    batch_size=32,
    super_batch_size=1024,
    batches_per_epoch=20,
    maximum_unknown_bases_fraction=0.1,
    sequence_encoder="onehot",
    n_threads=4,
    return_batch_objects=True,
)

# Don't use num_workers > 0 in the DataLoader. The heavy
# lifting/parallelism is done on CUDA streams on the GPU.
dataloader = DataLoader(dataset, num_workers=0, batch_size=None)


class MyTerribleModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(4, 2)

    def forward(self, batch):
        return self.linear(batch).transpose(1, 2)


model = MyTerribleModel()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

def poisson_loss(pred, target):
    return (pred - target * torch.log(pred.clamp(min=1e-8))).mean()

for batch in dataloader:
    # batch.sequences.shape = n_batch (32), sequence_length (1000), onehot encoding (4)
    pred = model(batch.sequences)
    # batch.values.shape = n_batch (32), n_tracks (2), center_bin_to_predict (500)
    loss = poisson_loss(pred[:, :, 250:750], batch.values)
    print(loss)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```
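In a real training setup you would usually not hard-code `train_regions` as above, but load the regions of interest
from a file, for instance the train/val/test intervals produced by `examples/create_train_val_test_intervals.py`
(the file name below is just an illustrative output of that script):

```python
import pandas as pd

# Intervals file with at least the columns "chrom", "start" and "end",
# e.g. created by examples/create_train_val_test_intervals.py
train_regions = pd.read_csv("train_regions.tsv", sep="\t")
```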
### Other frameworks

A framework-agnostic Dataset object can be imported from `bigwig_loader.dataset`. This dataset object
returns CuPy arrays. CuPy arrays adhere to the CUDA Array Interface and can be zero-copy transformed
to JAX or TensorFlow tensors (a sketch of this follows the example below).

```python
from bigwig_loader.dataset import BigWigDataset

dataset = BigWigDataset(
    regions_of_interest=train_regions,
    collection=example_bigwigs_directory,
    reference_genome_path=reference_genome_file,
    sequence_length=1000,
    center_bin_to_predict=500,
    window_size=1,
    batch_size=32,
    super_batch_size=1024,
    batches_per_epoch=20,
    maximum_unknown_bases_fraction=0.1,
    sequence_encoder="onehot",
)
```
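As a minimal sketch of that zero-copy hand-off, assuming JAX is installed (on older JAX versions you may need to pass
DLPack capsules created with `array.toDlpack()` instead of the arrays themselves), the CuPy batches can be consumed
like this:

```python
import jax.dlpack

for encoded_sequences, epigenetics_profiles in dataset:
    # Both are CuPy arrays that already live on the GPU; DLPack hands the
    # same device memory to JAX without copying through the host.
    sequences = jax.dlpack.from_dlpack(encoded_sequences)
    profiles = jax.dlpack.from_dlpack(epigenetics_profiles)
    print(sequences.shape, profiles.shape)
```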
See the examples directory for more examples.

## Background

This library is meant for loading batches of data with the same dimensionality, which allows for some assumptions
that can speed up the loading process. As can be seen from the benchmark plot below, when loading a small amount of
data, pyBigWig is very fast, but it does not exploit the batched nature of data loading for machine learning.

In the benchmark below we also created PyTorch dataloaders (with set_start_method('spawn')) using pyBigWig to compare
to the realistic scenario in which multiple CPUs would be used per GPU. We see that the throughput of the CPU
dataloader does not scale linearly with the number of CPUs, so it becomes hard to reach the throughput needed to keep
the GPU, which is training the neural network, saturated during the learning steps.

*(Benchmark plot: throughput of batched GPU loading with bigwig-loader versus pyBigWig-based CPU dataloaders.)*

This is the problem bigwig-loader solves. See the Quickstart above for an example of how to use it.
### Installation

1. `git clone git@github.com:pfizer-opensource/bigwig-loader`