forked from leeoesterreich/scRNA-CellLines
-
Notifications
You must be signed in to change notification settings - Fork 0
/
calcTF.py
79 lines (60 loc) · 3.08 KB
/
calcTF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
from utils import *
import dask
from dask.diagnostics import ProgressBar
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2
import pyscenic
from pyscenic.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.utils import modules_from_adjacencies, load_motifs
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell
def calcTFs(expr, tf_names, db, prefix,
motif_path='../data/pySCENIC/ref/motifs-v9-nr.hgnc-m0.001-o0.0.tbl',
out_path='../data/pySCENIC',
ppn=8):
"""Computes motifs, regulons and trancriptional factor activation using pySCENIC.
Arguments
---------
expr: `pandas DataFrame`
cell X gene raw counts; FPKM; not TPM as coexpression will be calculated
tf_names: `list` (`str`)
curated human transcriptional factor downloaded from github: pySCENIC/ref/hs_hgnc_curated_tfs.txt
db: `list` (`FeatherRankingDatabase()`)
feather files, ranking genome [FeatherRankingDatabase(name="hg38__refseq-r80__10kb_up_and_down_tss")]
prefix: `str` (default: `None`)
Specify name to save files (eg, cell line names)
Returns
-------
Do not return but write files (the calc takes too long...)
"""
# Inference of co-expression modules
adjacencies = grnboost2(expr, tf_names=tf_names, verbose=True)
modules = list(modules_from_adjacencies(adjacencies, expr))
# Calculate a list of enriched motifs and the corresponding target genes for all modules.
with ProgressBar():
df = prune2df(db, modules, motif_path, num_workers=ppn)
# Create regulons from this table of enriched motifs.
regulons = df2regulons(df)
# Save the enriched motifs and the discovered regulons to disk.
with open('{}/{}_motifs.csv'.format(out_path, prefix), "wb") as f:
pickle.dump(regulons, f)
auc_mtx = aucell(expr, regulons, num_workers=ppn)
tfs = [tf.strip('(+)') for tf in auc_mtx.columns]
auc_mtx.to_csv('{}/{}_auc_mtx.csv'.format(out_path, prefix))
print('finished calculation for %s'%(prefix))
if __name__ == "__main__":
wkdir = '/bgfs/alee/chelsea/projects/10X/AL1/codes'
os.chdir(wkdir)
db = [RankingDatabase(fname='../data/pySCENIC/ref/hg38__refseq-r80__10kb_up_and_down_tss.mc9nr.feather', name='hg38__refseq-r80__10kb_up_and_down_tss.mc9nr')]
tf_names = load_tf_names('../data/pySCENIC/ref/hs_hgnc_curated_tfs.txt')
CellTypes = ['MCF7','T47D WT','T47D KO', 'MM134','SUM44','BCK4', 'MCF10A','HEK293']
for cell in CellTypes:
tmp = adata_raw[adata_raw.obs['CellType'] == cell]
RawSplicedCts = pd.DataFrame(tmp.layers['spliced'].todense(), index=tmp.obs.index, columns=tmp.var.index) # cell X gene
print(cell, RawSplicedCts.shape)
if not isfile('{}/{}_auc_mtx.csv'.format('../data/pySCENIC', cell)):
calcTFs(RawSplicedCts, tf_names, db, cell)
print('calculating TF for %s'%(cell))
# srun -n 1 --cpus-per-task 8 --mem=10g -t 24:00:00 --pty zsh
# python calcTF.py > ../logs/calcTF.log 2>&1 &
# tail -n 5 ../logs/calcTF.log