add meta analysis example scripts

hpratt · hpratt · commit 4ffd97fa9781 · 2022-09-30T10:13:27.000-04:00
diff --git a/examples/zoonomia-meta-analysis/formatted-h2.R b/examples/zoonomia-meta-analysis/formatted-h2.R
@@ -0,0 +1,38 @@
+library(rmeta)
+
+# Random-effects meta-analysis of one annotation row across several traits.
+#
+# Arguments:
+#   myrow    - index of the row to take from each trait's .results table
+#   dir      - directory holding <trait>.<analysis>.results and <trait>.<analysis>.log
+#   analysis - analysis label embedded in the file names
+#   mytraits - vector of trait names (the file name prefixes)
+#   mysd     - standard deviation of the annotation; the default "" treats the
+#              annotation as binary and uses sqrt(p*(1-p)) with p = Prop._SNPs
+#
+# Returns a 1-row matrix with columns propsnps, proph2, proph2_se, Enr, Enr_se,
+# Enr_P, tau, tau_se and tau_P, named after the Category of the selected row.
+meta_sldsc <- function (myrow,dir,analysis,mytraits=traits,mysd=""){
+  if (length(mytraits) == 0) stop("meta_sldsc: no traits supplied")
+  # hard-coded SNP count used for scaling; presumably the number of reference SNPs - TODO confirm
+  M   = 5961159
+  res = NULL
+  for (trait in mytraits){
+    prefix = paste0(dir,"/",trait,".",analysis)
+    data   = read.table(paste0(prefix,".results"),header=TRUE)[myrow,]
+    logtab = read.table(paste0(prefix,".log"),header=FALSE,fill=TRUE)
+    # h2 estimate: 5th field of the log line whose 4th field is "h2:"
+    h2g    = as.numeric(as.character(logtab[which(logtab$V4=="h2:"),5]))
+    #
+    # step 1: (h2g/M) times the difference between Prop._h2/Prop._SNPs and its complement ratio
+    myenrstat    = (h2g/M)*((data$Prop._h2/data$Prop._SNPs)-(1-data$Prop._h2)/(1-data$Prop._SNPs))
+    # step 2: z-score implied by the two-sided enrichment p-value
+    myenrstat_z  = qnorm(data$Enrichment_p/2)
+    # step 3: standard error recovered as statistic / z
+    myenrstat_sd = myenrstat/myenrstat_z
+    data         = cbind(data, myenrstat, myenrstat_sd)
+    #
+    # particular case of binary annotation, where sd=sqrt(p(1-p)); otherwise use the supplied sd
+    annot_sd = if (mysd=="") sqrt(data$Prop._SNPs*(1-data$Prop._SNPs)) else mysd
+    data$Coefficient           = M * annot_sd * data$Coefficient           / h2g
+    data$Coefficient_std_error = M * annot_sd * data$Coefficient_std_error / h2g
+    #
+    res = rbind(res,data)
+  }
+  res = data.frame(res)
+  # random-effects summaries (rmeta) of each estimate across traits
+  test0 = meta.summaries(res$Prop._h2   , res$Prop._h2_std_error   , method="random")
+  test1 = meta.summaries(res$Enrichment , res$Enrichment_std_error , method="random")
+  test2 = meta.summaries(res$myenrstat  , res$myenrstat_sd         , method="random")
+  test3 = meta.summaries(res$Coefficient, res$Coefficient_std_error, method="random")
+  # Enr and Enr_se come from test1, but Enr_P is the two-sided p-value of test2
+  out = rbind(c(mean(res$Prop._SNPs),test0$summary, test0$se.summary, test1$summary, test1$se.summary, 2*pnorm(-abs(test2$summary/test2$se.summary)), test3$summary, test3$se.summary, 2*pnorm(-abs(test3$summary/test3$se.summary))))
+  colnames(out) = c("propsnps","proph2","proph2_se","Enr","Enr_se","Enr_P","tau","tau_se","tau_P")
+  # data still holds the row read for the last trait in the loop
+  rownames(out) = data$Category[1]
+  out
+}
+
+# Template call: meta-analysis.py fills in, in order, the row index, the results
+# directory and the comma-separated list of quoted trait names before running Rscript.
+meta_sldsc(myrow = %d, dir = "%s", analysis = "baselineLD_mammal", mytraits = c(%s))
diff --git a/examples/zoonomia-meta-analysis/meta-analysis.py b/examples/zoonomia-meta-analysis/meta-analysis.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+
+import gzip
+import glob
+import sys
+import os
+import tempfile
+import subprocess
+import math
+
+from joblib import Parallel, delayed
+
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot
+import matplotlib.font_manager
+
+def dformat(xxl):
+    x = xxl if type(xxl) is str else xxl[0]
+    xx = xxl if type(xxl) is str else xxl[0]
+    with open(x, 'rt') as f:
+        if "LD Score Regression (LDSC)" not in f.read(): return
+    with open(x, 'rt') as f:
+        lines = [ x.strip() for x in f.read().split("\n") ]
+    lines = [ x if "/tmp" not in x else os.path.basename(xx).replace(".5", "_5").split('.')[-3] + "\t" + "\t".join(x.strip().split()[1:]) for x in lines ]
+    if type(xxl) is list:
+        def v(xx):
+            print(xx, file = sys.stderr)
+            with open(xx, 'rt') as f:
+                llines = [ x.strip() for x in f.read().split("\n") ][-3:]
+                llines = [ os.path.basename(xx).replace(".5", "_5").split('.')[-3] + "\t" + "\t".join(x.strip().split()[1:]) for x in llines if "/tmp" in x ]
+                return llines
+        r = Parallel(n_jobs = 64)(delayed(v)(xx) for xx in xxl[1:])
+        for xx in r: lines += xx
+    lidx = [ i for i, x in enumerate(lines) if x.startswith("Category") ][0]
+    with open(os.path.dirname(x) + "/formatted-h2/" + os.path.basename(x).replace("460K.", "460K_").split('.')[0] + ".results", 'w') as o:
+        o.write('\n'.join(lines[lidx:]))
+    with open(os.path.dirname(x) + "/formatted-h2/" + os.path.basename(x).replace("460K.", "460K_").split('.')[0] + ".log", 'w') as o:
+        o.write('\n'.join(lines[:lidx]))
+
+def run(x, d):
+    with tempfile.NamedTemporaryFile('wt') as o:
+        with open("formatted-h2.R", 'r') as f:
+            o.write(f.read() % (x, d, ", ".join([ '"%s"' % os.path.basename(x).split('.')[0] for x in glob.glob(d + "/*.result*") ])))
+        o.flush()
+        return subprocess.check_output("Rscript %s" % o.name, shell = True).decode()
+
+def kv(vx):
+    r = {}
+    lines = vx.strip().split("\n")
+    for i in range(int(len(lines) / 2)):
+        keys = [ x for x in lines[i * 2].strip().split() ]
+        for ii, k in enumerate(keys):
+            r[k] = float(lines[i * 2 + 1].strip().split()[ii + 1])
+    return lines[1].strip().split()[0], r
+
+def trun(x, d):
+    try:
+        return kv(run(x, d))
+    except:
+        return None
+
+def percentages(inputf):
+    common = glob.glob("common_snps/*common*bed")
+    def read_c(cc):
+        with open(cc, 'r') as f:
+            return { x.strip().split()[-1] for x in f }
+    r = Parallel(n_jobs = 64)(delayed(read_c)(x) for x in common)
+    all_common = set()
+    for x in r:
+        all_common = all_common.union(x)
+    def totals(iif):
+        count = 0
+        with gzip.open(iif, 'rt') as ff:
+            hmap = { i: x.strip() for i, x in enumerate(ff.readline().split("\t")) }
+            counts = { x: 0 for _, x in hmap.items() }
+            for line in ff:
+                if line.strip().split()[2] not in all_common: continue
+                count += 1
+                for i, x in enumerate(line.strip().split()):
+                    counts[hmap[i]] += 1 if x == "1" else 0
+        return count, counts
+    results = Parallel(n_jobs = 64)(delayed(totals)(x) for x in inputf)
+    gtotal = sum([ x[0] for x in results ])
+    return { k.strip().replace('[', "").replace(']', "").replace(',', ""): sum([ x[1][k] if len(x) >= 2 and k in x[1] else 0 for x in results ]) / float(gtotal) for k, _ in results[0][1].items() }
+
+def main(argc, argv):
+
+    if argc < 3:
+        print("usage: meta-analysis.py ldsc-results-directory *.annot.gz", file = sys.stderr
+        return 1
+
+    os.system("mkdir -p %s" % (argv[1] + "/formatted-h2"))
+    for x in glob.glob(argv[1] + "/*"):
+        dformat(x)
+    values = Parallel(n_jobs = 64)(delayed(trun)(x, argv[1] + "/formatted-h2") for x in range(2, 150))
+
+    cmap = { x[0]: x[1] for x in values if x is not None }
+    p = percentages(argv[2:])
+    for k, v in cmap.items():
+        print("%s\t%f\t%f\t%f\t%e\t%f\t%f\t%e" % (k, p[k.split("L2")[0]], v["Enr"], v["Enr_se"], v["Enr_P"], v["tau"], v["tau_se"], v["tau_P"]))
+    return 0
+
+if __name__ == "__main__":
+    sys.exit(main(len(sys.argv), sys.argv))