Skip to content

Commit 122d5e5

Browse files
authored
Merge pull request #56 from Mye-InfoBank/implement-modes
Implement multiple pipeline modes
2 parents 5d95205 + 7616379 commit 122d5e5

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

43 files changed

+519
-333
lines changed

bin/merge_datasets.py

Lines changed: 40 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -4,53 +4,53 @@
44
import anndata as ad
55
import scanpy as sc
66
from scipy.sparse import csr_matrix
7-
import numpy as np
8-
9-
107

118
parser = argparse.ArgumentParser(description="Merge datasets")
129
parser.add_argument("--input", help="Input file", type=str, nargs="+")
13-
parser.add_argument("--output_integration", help="Output file containing only cells which do not require transfer learning", type=str)
14-
parser.add_argument("--output_intersection", help="Output file containing all cells but gene intersection", type=str)
15-
parser.add_argument("--output_transfer", help="Output file containing all cells which require transfer learning", type=str)
16-
parser.add_argument("--output_counts", help="Output file, outer join of cells and genes", type=str)
10+
parser.add_argument("--base", help="Base dataset to use as reference", type=str, required=False)
11+
parser.add_argument("--output_intersection", help="Output file containing all cells but gene intersection", type=str, required=True)
12+
parser.add_argument("--output_union", help="Output file, outer join of cells and genes", type=str, required=True)
13+
parser.add_argument("--output_transfer", help="Output file, cells to project onto base", type=str, required=False)
1714
parser.add_argument("--min_cells", help='Minimum number of cells to keep a gene', type=int, required=False, default=50)
1815
parser.add_argument("--custom_genes", help="Additional genes to include", type=str, nargs="*")
1916

2017
args = parser.parse_args()
2118

2219
datasets = [ad.read_h5ad(f) for f in args.input]
2320

24-
adata = ad.concat(datasets)
25-
adata_outer = ad.concat(datasets, join='outer')
21+
if args.base:
22+
if not args.output_transfer:
23+
raise ValueError("Transfer file required when using base dataset")
24+
25+
adata_base = ad.read_h5ad(args.base)
26+
datasets = [adata_base] + datasets
27+
28+
adata_intersection = ad.concat(datasets)
29+
adata_union = ad.concat(datasets, join='outer')
2630

27-
additional_genes = [gene for gene in args.custom_genes if gene not in adata.var_names and gene in adata_outer.var_names]
31+
additional_genes = [gene for gene in args.custom_genes if gene not in adata_intersection.var_names and gene in adata_union.var_names]
2832

2933
# Add custom genes from outer join to the intersection
3034
if additional_genes:
31-
adata_additional = adata_outer[adata.obs_names, additional_genes]
32-
adata_concatenated = ad.concat([adata, adata_additional], join="outer", axis=1)
33-
adata_concatenated.obs, adata_concatenated.obsm = adata.obs, adata.obsm
34-
adata = adata_concatenated
35+
adata_additional = adata_union[adata_intersection.obs_names, additional_genes]
36+
adata_concatenated = ad.concat([adata_intersection, adata_additional], join="outer", axis=1)
37+
adata_concatenated.obs, adata_concatenated.obsm = adata_intersection.obs, adata_intersection.obsm
38+
adata_intersection = adata_concatenated
3539

3640
# Convert to CSR matrix
37-
adata.X = csr_matrix(adata.X)
38-
adata_outer.X = csr_matrix(adata_outer.X)
39-
40-
# Filter genes with no counts in core atlas
41-
gene_mask, _ = sc.pp.filter_genes(adata[~adata.obs["transfer"]], min_cells=1, inplace=False)
42-
adata = adata[:, gene_mask]
41+
adata_intersection.X = csr_matrix(adata_intersection.X)
42+
adata_union.X = csr_matrix(adata_union.X)
4343

4444
# Filter cells with no counts
45-
cell_mask, _ = sc.pp.filter_cells(adata, min_genes=1, inplace=False)
46-
adata = adata[cell_mask, :]
47-
adata_outer = adata_outer[cell_mask, :]
45+
cell_mask, _ = sc.pp.filter_cells(adata_intersection, min_genes=1, inplace=False)
46+
adata_intersection = adata_intersection[cell_mask, :]
47+
adata_union = adata_union[cell_mask, :]
4848

4949
# Filter genes with too few occurrences in outer join
50-
sc.pp.filter_genes(adata_outer, min_cells=args.min_cells)
50+
sc.pp.filter_genes(adata_union, min_cells=args.min_cells)
5151

52-
adata.obs["batch"] = adata.obs["dataset"].astype(str) + "_" + adata.obs["batch"].astype(str)
53-
adata.obs["patient"] = adata.obs["dataset"].astype(str) + "_" + adata.obs["patient"].astype(str)
52+
adata_intersection.obs["batch"] = adata_intersection.obs["dataset"].astype(str) + "_" + adata_intersection.obs["batch"].astype(str)
53+
adata_intersection.obs["patient"] = adata_intersection.obs["dataset"].astype(str) + "_" + adata_intersection.obs["patient"].astype(str)
5454

5555
def to_Florent_case(s: str):
5656
corrected = s.lower().strip()
@@ -77,25 +77,25 @@ def to_Florent_case(s: str):
7777

7878
return corrected[0].upper() + corrected[1:]
7979

80-
for column in adata.obs.columns:
81-
if column == "transfer":
82-
continue
83-
if not adata.obs[column].dtype.name == "category" and not adata.obs[column].dtype.name == "object":
80+
for column in adata_intersection.obs.columns:
81+
if not adata_intersection.obs[column].dtype.name == "category" and not adata_intersection.obs[column].dtype.name == "object":
8482
continue
8583
# Convert first to string and then to category
86-
adata.obs[column] = adata.obs[column].astype(str).fillna("Unknown").apply(to_Florent_case).astype("category")
84+
adata_intersection.obs[column] = adata_intersection.obs[column].astype(str).fillna("Unknown").apply(to_Florent_case).astype("category")
85+
86+
adata_union.obs = adata_intersection.obs
87+
88+
adata_intersection.layers["counts"] = adata_intersection.X
89+
adata_union.layers["counts"] = adata_union.X
8790

88-
adata_outer.obs = adata.obs
91+
if args.base:
92+
adata_transfer = adata_intersection[~adata_intersection.obs.index.isin(adata_base.obs.index)]
8993

90-
adata.layers["counts"] = adata.X
91-
adata_outer.layers["counts"] = adata_outer.X
94+
known_celltypes = adata_base.obs["cell_type"].unique()
95+
adata_transfer.obs["cell_type"] = adata_transfer.obs["cell_type"].map(lambda x: x if x in known_celltypes else "Unknown")
9296

93-
if any(adata.obs["transfer"]):
94-
adata_transfer = adata[adata.obs["transfer"]]
9597
adata_transfer.write_h5ad(args.output_transfer)
9698

97-
adata_notransfer = adata[~adata.obs["transfer"]]
98-
adata_notransfer.write_h5ad(args.output_integration)
99+
adata_intersection.write_h5ad(args.output_intersection)
100+
adata_union.write_h5ad(args.output_union)
99101

100-
adata.write_h5ad(args.output_intersection)
101-
adata_outer.write_h5ad(args.output_counts)

bin/preprocess.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
"patient": True,
1414
"tissue": True,
1515
"dataset": True,
16-
"transfer": True
1716
}
1817

1918
parser = argparse.ArgumentParser(description="Filter dataset")
@@ -22,7 +21,6 @@
2221
parser.add_argument("--output", help="Output file", type=str)
2322
parser.add_argument("--problems", help="Problems file", type=str)
2423
parser.add_argument("--no-symbols", help="Convert varnames to gene symbols", action="store_true")
25-
parser.add_argument("--transfer", help="Apply transfer leanring on dataset", action="store_true")
2624
parser.add_argument("--sure_raw", help="Skip check for raw counts", action="store_true")
2725
parser.add_argument("--custom_metadata", help="Additional metadata columns to include", type=str, nargs="*")
2826

@@ -57,7 +55,6 @@ def aggregate_duplicate_var(adata, aggr_fun=np.mean):
5755
print("Reading input")
5856
adata = sc.read_h5ad(args.input)
5957
adata.obs["dataset"] = args.id
60-
adata.obs["transfer"] = args.transfer
6158

6259
if adata.__dict__["_raw"] and "_index" in adata.__dict__["_raw"].__dict__["_var"]:
6360
adata.__dict__["_raw"].__dict__["_var"] = (

conf/modes.config

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
params {
2+
leiden_resolutions = [0.25, 0.5, 0.75, 1, 1.5, 2]
3+
entropy = false
4+
entropy_initial_smoothness = 0.5
5+
scshc = false
6+
}
7+
8+
process {
9+
withName: SCSHC_CLUSTERING {
10+
ext.when = { params.scshc }
11+
}
12+
13+
withName: SCSHC_CLUSTERING_QC {
14+
ext.when = { params.scshc }
15+
}
16+
17+
withName: ENTROPY {
18+
ext.when = { params.entropy }
19+
}
20+
}
21+
22+
profiles {
23+
build {
24+
params.mode = "build"
25+
includeConfig "modes/build.config"
26+
includeConfig "modes/build-extend.config"
27+
}
28+
29+
extend {
30+
params.mode = "extend"
31+
includeConfig "modes/extend.config"
32+
includeConfig "modes/build-extend.config"
33+
}
34+
35+
sub {
36+
params.mode = "sub"
37+
includeConfig "modes/sub.config"
38+
}
39+
}

conf/modules.config renamed to conf/modes/build-extend.config

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
params {
2+
samplesheet = null
3+
celltypist_model = null
4+
min_cells = 50
5+
cell_cycle = true
6+
7+
normalization_method = "log_total"
8+
upset_only = false
9+
decontX = true
10+
11+
has_celltypes = true
12+
custom_metadata = []
13+
}
14+
115
process {
216
withName: CELLTYPIST {
317
ext.when = { params.celltypist_model != null }
@@ -7,18 +21,6 @@ process {
721
ext.when = { params.cell_cycle }
822
}
923

10-
withName: SCSHC_CLUSTERING {
11-
ext.when = { params.scshc }
12-
}
13-
14-
withName: SCSHC_CLUSTERING_QC {
15-
ext.when = { params.scshc }
16-
}
17-
18-
withName: ENTROPY {
19-
ext.when = { params.entropy }
20-
}
21-
2224
withName: BENCHMARK_INTEGRATIONS {
2325
ext.when = { params.benchmark_hvgs > 0 }
2426
}

conf/modes/build.config

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
params {
2+
integration_methods = ["scvi", "scanvi", "harmony", "scgen", "scanorama", "bbknn", "desc", "combat", "trvaep"]
3+
custom_hvgs = []
4+
integration_hvgs = 10000
5+
benchmark_hvgs = 0
6+
}

conf/modes/extend.config

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
params {
2+
base = null
3+
model = null
4+
}

conf/modes/sub.config

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
params {
2+
input = null
3+
integration = null
4+
annotation = null
5+
split_on = null
6+
}
File renamed without changes.
File renamed without changes.
Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,11 @@
1+
includeConfig "../test.config"
2+
13
params {
24
samplesheet = "samplesheet.csv"
35

46
benchmark_hvgs = 100
5-
scshc = false
6-
entropy = false
77
cell_cycle = true
8-
leiden_resolutions = [0.5, 1]
98

109
celltypist_model = "Cells_Intestinal_Tract.pkl"
1110
integration_methods = ["scvi", "scanvi", "harmony", "desc", "combat"]
12-
13-
max_cpus = 4
14-
max_memory = "12G"
15-
max_time = "6.h"
16-
}
17-
18-
process {
19-
executor = "local"
2011
}

0 commit comments

Comments
 (0)