Commiting changes to Rpackage branch

CreRecombinase · Nov 22, 2016 · 8db6061 · 8db6061
1 parent 9ffe1bc
commit 8db6061
Show file tree

Hide file tree

Showing 18 changed files with 1,086 additions and 77 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -8,6 +8,6 @@ Description: More about what it does (maybe more than one line)
 License: What license is it under?
 LazyData: TRUE
 Imports:
-    Rcpp (>= 0.12.5),topGO,dplyr,tidyr,org.Hs.eg.db
+    Rcpp (>= 0.12.5),RcppArmadillo,dplyr,tidyr,SQUAREM
 LinkingTo:
-    Rcpp
+    Rcpp,RcppArmadillo
diff --git a/R/FGEM_Func.R b/R/FGEM_Func.R
@@ -4,10 +4,19 @@
 FGEM_Logit <- function(Beta,x,B){
   pvec  <- 1/(1+exp(-(x%*%Beta)))
   uvec <- (pvec*B)/((pvec*B)+(1-pvec))
-  Beta <- coefficients(glm(uvec~x+0,family=quasibinomial(link="logit")))
+  Beta <- coefficients(glm.fit(x = x,y = uvec,family=quasibinomial(link="logit"),control=list(maxit=50)))
   return(Beta)
 }
 
+FGEM_Logit_fit <- function(Beta,x,B){
+  pvec  <- 1/(1+exp(-(x%*%Beta)))
+  uvec <- (pvec*B)/((pvec*B)+(1-pvec))
+
+  return(Beta)
+}
+
+
+
 FGEM_Logit_log_lik <- function(Beta,x,B){
   pvec  <- 1/(1+exp(-(x%*%Beta)))
   # opvec  <- 1/(1+exp((mx%*%Beta)))
@@ -57,25 +66,19 @@ cfeat_mat <- function(annodf,datadf){
   return(inner_join(tannodf,datadf))
 }
 
-cfeat_df <- function(annodf,datadf,impute=F){
+cfeat_df <- function(annotation_df,datadf){
   require(dplyr)
-  stopifnot(length(unique(annodf$feature))==1)
-   isbin <- all(annodf$value==1)
-   datadf <- select(datadf,Gene,BF)
-   if(isbin){
-     full_feat <- left_join(datadf,annodf)
-     full_feat <- mutate(full_feat,feature=feature[!is.na(feature)][1],
-                         class=class[!is.na(class)][1],
-                         value=ifelse(is.na(value),0,1))
-   }else{
-     if(impute){
-       full_feat <- left_join(datadf,annodf)
-       full_feat <- mutate(full_feat,value=ifelse(is.na(value),mean(value,na.rm=T),value))
-     }else{
-       full_feat <- inner_join(datadf,annodf)
-     }
-   }
-   return(full_feat)
+  require(tidyr)
+  # stopifnot(length(unique(annodf$feature))==1)
+  annotation_df <- group_by(annotation_df,feature) %>% mutate(isBin=(n_distinct(value)==1),ngenes=n()) %>%ungroup()
+  mingenes <-filter(annotation_df,isBin==FALSE) %>% group_by(Gene) %>% summarise(nfeat=n_distinct(feature)) %>% ungroup() %>% filter(nfeat==max(nfeat))
+  if(all(annotation_df$isBin)){
+    mingenes <-distinct(annotation_df,Gene)
+  }
+  annotation_df <- annotation_df %>% inner_join(mingenes,by="Gene") %>% select(-one_of(c("feat_ind","nfeat","isBin","ngenes"))) %>% spread(feature,value,fill = 0)
+  datadf <- select(datadf,Gene,BF)
+  full_feat <- inner_join(datadf,annotation_df,by="Gene")
+  return(full_feat)
 }
 
 gen_prior_df <- function(annodf,datadf,scale=F){
@@ -101,14 +104,13 @@ pmean <-function(Beta,feat_mat){
 EM_mat <-function(fBeta,feat_matrix,BF){
   require(SQUAREM)
   require(dplyr)
+  require(tidyr)
   opf <- squarem(par=fBeta,fixptfn=FGEM_Logit,x=feat_matrix,B=BF)
   cn <- colnames(feat_matrix)
   opf$LogLik <- -FGEM_Logit_log_lik(opf$par,feat_matrix,BF)
-  retdf <-data.frame(opf) %>% select(-value.objfn) %>% mutate(Intercept=opf$par[1],feature=cn) %>% rename(Beta=par)
-  if(nrow(retdf)>1){
-    retdf <- filter(retdf,feature!="Intercept")
-  }
-  return(retdf)
+  opf$feature_name <- names(opf$par)
+  opdf <- as_data_frame(data.frame(opf)) %>% select(-value.objfn,-iter,-fpevals,-objfevals)
+  return(nest(opdf,par,feature_name))
 }
 
 
@@ -132,16 +134,43 @@ anno2mat <- function(full_feat){
   return(feat_mat)
 }
 
-FGEM <-function(fBeta,feat_mat,BF){
+FGEM <-function(fBeta,feat_mat,BF,null_features="Intercept"){
   retdf <- EM_mat(fBeta,feat_mat,BF) %>% mutate(nac=NA)
-  retdf <- EM_mat(fBeta[1],feat_mat[,"Intercept",drop=F],BF = BF) %>%
-    select(NullLogLik=LogLik) %>% mutate(nac=NA) %>%
-    inner_join(retdf) %>% select(-nac)
+  retdf <- EM_mat(fBeta[paste0("feat_mat",null_features)],feat_mat[,null_features,drop=F],BF = BF) %>%
+    select(NullLogLik=LogLik) %>% mutate(nac=NA)  %>% inner_join(retdf,by="nac") %>% select(-nac)
   retdf <- mutate(retdf,Chisq=-2*(NullLogLik-LogLik),
                   pval=pchisq(Chisq,df=ncol(feat_mat)-1,lower.tail=F))
   return(retdf)
 }
 
+
+ FGEM_bootstrap <- function(full_feat,scale=F,prior_mean=0.02,iterations=100){
+   require(dplyr)
+   require(tidyr)
+   require(foreach)
+   feat_mat <-select(full_feat,Gene,feature,value,BF) %>% spread(feature,value)  %>% filter(complete.cases(.))
+   Genes <- select(feat_mat,Gene)
+   feat_mat <- mutate(Genes,Intercept=1) %>% inner_join(feat_mat) %>% select(-Gene)
+
+   BF <-feat_mat$BF
+   tmu <-(prior_mean*BF)/((prior_mean*BF)+(1-prior_mean))
+   feat_mat <- data.matrix(select(feat_mat,-BF))
+   feat_mat <- scale(feat_mat,scale,scale)
+   if(sum(is.na(feat_mat))==nrow(feat_mat)){
+     feat_mat[is.na(feat_mat)] <- 1
+   }
+   fBeta <- coefficients(glm(tmu~feat_mat+0,family=quasibinomial(link="logit")))
+
+   while(any(is.na(fBeta))){
+     feat_mat <- feat_mat[,!is.na(fBeta)[-1]]
+     fBeta <- coefficients(glm(tmu~feat_mat+0,family=quasibinomial(link="logit")))
+   }
+  foreach(i=1:iterations,.combine = "bind_rows") %do%{
+    subsamp <- sample(1:nrow(feat_mat),nrow(feat_mat),replace = T)
+    bretdf <- FGEM(fBeta=fBeta,feat_mat=feat_mat[subsamp,],BF=BF[subsamp]) %>% mutate(iter=i)
+  }
+}
+
 sem_df <-function(full_feat,scale=F,prior_mean=0.02){
   require(dplyr)
   require(tidyr)
@@ -165,9 +194,36 @@ sem_df <-function(full_feat,scale=F,prior_mean=0.02){
   }
   retdf <- FGEM(fBeta = fBeta,feat_mat=feat_mat,BF=BF)
 
+
   return(retdf)
 }
 
+em_df <-function(data_df,prior_mean=0.02,scale=F,null_feature){
+
+  stopifnot(nrow(datadf)>1,
+            all(null_feature %in% colnames(data_df)),
+            all(!is.na(data_df)))
+  data_df <-mutate(data_df,Intercept=1)
+  null_feature <- unique(c(null_feature,"Intercept"))
+  BF <- data_df$BF
+  tmu <-(prior_mean*BF)/((prior_mean*BF)+(1-prior_mean))
+
+  feat_mat <- data.matrix(select(data_df,-one_of("BF","Gene","class")))
+  feat_mat <- scale(feat_mat,scale,scale)
+  if(sum(is.na(feat_mat))==nrow(feat_mat)){
+    feat_mat[is.na(feat_mat)] <- 1
+  }
+  fBeta <- coefficients(glm(tmu~feat_mat+0,family=quasibinomial(link="logit")))
+
+  while(any(is.na(fBeta))){
+    feat_mat <- feat_mat[,!is.na(fBeta)[-1]]
+    fBeta <- coefficients(glm(tmu~feat_mat+0,family=quasibinomial(link="logit")))
+  }
+  fretdf <- FGEM(fBeta = fBeta,feat_mat=feat_mat,BF=BF,null_features = null_feature)
+  return(fretdf)
+}
+
+
 
 
 
@@ -189,7 +245,7 @@ gen_model <- function(feat_list,annodf,datadf,scale=F){
 
 
 posterior_results <- function(full_model,datadf,anno_df,scale=F){
-    retdf <- select(full_model,Beta,Intercept,feature) %>% inner_join(anno_df) %>% do(gen_prior_df(.,datadf,)) %>%
+    retdf <- select(full_model,Beta,Intercept,feature) %>% inner_join(anno_df) %>% do(gen_prior_df(.,datadf,scale=scale)) %>%
     mutate(old_prior=mean(prior))  %>% rename(new_prior=prior) %>% inner_join(datadf) %>%
     mutate(new_posterior=new_prior*BF/((new_prior*BF)+(1-new_prior)),
            old_posterior=old_prior*BF/((old_prior*BF)+(1-old_prior)),
@@ -198,12 +254,6 @@ posterior_results <- function(full_model,datadf,anno_df,scale=F){
 }
 
 
-stepwise_model <- function(feat_mat,BF,features,scale=F){
-  if(length(features==0)){
-
-  }
-}
-
 #     feat_res <- group_by(full_feat,feature) %>% mutate(isbin=(length(unique(value))==2),
 #                                            feat_bin=factor(ifelse(isbin,factor(value),value>quantile(value,prior))),
 #                                            BF_bin=factor(BF>quantile(BF,prior))) %>%

diff --git a/README.md b/README.md
@@ -1,2 +1,9 @@
 # FGEM
 Functional GWAS by Expectation Maximization
+install using
+
+`install.packages(devtools)`
+`library(devtools)`
+`install_github("CreRecombinase/FGEM")`
+
+Annotation scripts are in `data-raw`
diff --git a/analyses/Bayes_EM_GWAS.Rmd b/analyses/Bayes_EM_GWAS.Rmd
@@ -0,0 +1,40 @@
+---
+title: "Statistical Gene Mapping Using Gene Annotation and Expectation Maximization"
+author: "Nicholas Knoblauch"
+date: "October 7, 2015"
+output: html_document
+---
+
+
+###Introduction
+
+  GWAS, in the most simple context, seeks to identify loci significantly associated with a trait of interest.  In the majority of GWAS, variants are weighted equally, that is to say, the prior probability that a particular variant contributes to the trait of interest is uniform.  Given that the number of loci in the human genome that are known to very is in the millions, and the number of loci that contribute to a given trait is much smaller, there is an extremely high threshold for significance, due to the multiple-testing problem.  
+  The functional characterization of elements of the genome is another avenue by which we can understand the biological basis of complex traits.
+
+  Instead of blindly testing all observed variants in isolation with their association with a trait of interest, it would be desirable to prioritize variants based on their functional relevance to the trait of interest.  Furthermore, it would be desirable in the context of coding regions to summarize the contribution of variants within a gene, rather than treating each individually.
+
+###Method
+
+####Data
+  For each of the $I$ genes being tested $B_i$ is the gene-level Bayes Factor from an association study for the trait of interest. For each of the $I$ genes being tested we also have $J$ functional annotations.  These annotations can be a mix of almost any type of data.  They could be binary data (e.g presence of absence of a particular GO term), count data (e.g number of exons), or continuous data (e.g level of conservation across mammals).  These annotations make up the matrix $A$, which is $IxJ$.  
+
+####Model
+  For each gene, $Z_i=1$ indicates that gene $i$ is involved in the trait of interest, and $Z_i=0$ indicates that is not.  If we knew $Z_i$ we could use logistic regression to learn the importance of each annotation in the trait of interest:
+  $$logit(P(Z_i=1))=A_i\beta$$
+
+However, we do not know $Z_i$.  We can instead use Empirical Bayes.  If we rewrite our bayes factor as the probability of observing the genotype data given the gene is causal divided by the probability of observing the genotype data given that the gene is not causal, then we can write the following likelihood function.
+
+$$ P(x|\beta)=\prod_iP(x_i|\beta)=\prod_i [\pi_i(\beta)P(x_i|Z_i=1)+(1-\pi_i(\beta))P(x_i|Z_i=0)]$$
+
+Remembering the definition of the Bayes factor from, this is equivalent to 
+
+$$P(x|\beta) \propto \prod_i[\pi_i(\beta)B_i+(1-\pi_i(\beta))]$$
+And we can use maximum likelihood here to make estimates for $\beta$
+
+Using Bayes rule, the posterior is 
+$$P(Z_i=1|x)=\frac{P(x|Z_i=1)P(Z_i=1)}{P(x)}=\frac{\pi_i(\beta)B_i}{\pi_i(\beta)B_i+(1-\pi_i(\beta))}$$
+
+###Code
+
+We will use Expectation Maximization to estimate $\beta$ (I will denote the current estimate of $\beta$ as $\beta^{(t)}$) and the membership probabilities(i.e $P(Z_i=1|x,\beta^{(t)})$).  The first step is simply to set up our matrix of annotations.  For this example We will use the Biological Process (BP) and Molecular Function (MF) GO terms.
+
diff --git a/analyses/CandidateASD.Rmd b/analyses/CandidateASD.Rmd
@@ -0,0 +1,73 @@
+---
+title: "ASD Gene Set"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+# Candidate ASD genes
+
+```{r load,echo=FALSE,message=FALSE,warning=FALSE}
+library(dplyr)
+library(ggplot2)
+source("~/Dropbox/BayesianDA/FGEM/SFARIFunc.R")
+outdir <- "~/Dropbox/BayesianDA/GSE/"
+featdir <- "~/Desktop/SFARI/"
+consdf <- gen_annotations(filelist,featdir)
+consdf <- mutate(consdf,TADAq=ifelse(is.na(TADAq),1,TADAq))
+consdf <- mutate(consdf,TADAi=ifelse(TADAq<0.15,1,0))
+consdf <- mutate(consdf,UASD=pmax(TADAi,ASDc))
+result_df <-readRDS("~/Dropbox/BayesianDA/GSE/COMBUASD-3.RDS")
+pred_df <- readRDS("~/Dropbox/BayesianDA/GSE/COMBUASD-3-PRED.RDS")
+err_df <- group_by(result_df,iter) %>% summarise(err=err[1])
+nconsdf <- select(consdf,-ASDc,-TADAq,-TADAi) %>%  rename(y=UASD) %>% combofeat(sep="5")
+
+
+
+
+```
+
+## Including Plots
+
+You can also embed plots, for example:
+
+```{r misclass, echo=FALSE}
+
+ggplot(err_df)+geom_histogram(aes(x=err),binwidth=.005)+ggtitle(label="CV misclassification error across iterations")
+
+```
+
+
+```{r effect,echo=FALSE}
+
+ggplot(result_df)+geom_histogram(aes(x=B))+facet_wrap(~coef)+ggtitle("Distribution of effect Betas")
+filter(result_df,B!=0) %>% mutate(BetaSign=ifelse(B>0,"Positive","Negative")) %>% filter(!grepl("5",coef)) %>% 
+  ggplot(aes(coef))+geom_bar()+geom_bar(aes(fill=BetaSign))+ggtitle("Number of nonzero Betas")
+
+ filter(result_df,B!=0) %>% mutate(BetaSign=ifelse(B>0,"Positive","Negative")) %>% filter(grepl("5",coef)) %>% 
+  separate(coef,c("firstCoef","secondCoef"),sep="5",remove=T) %>% do(bind_rows(.,rename(.,firstCoef=secondCoef,secondCoef=firstCoef))) %>% 
+  ggplot(aes(secondCoef))+geom_bar()+geom_bar(aes(fill=BetaSign))+ggtitle("Number of nonzero Betas(Interaction Terms)") +facet_wrap(~firstCoef,scales="free")
+
+
+```
+
+
+
+```{r}
+mpred <- group_by(pred_df,gene) %>% summarise(varp=var(pred),meanp=mean(pred),medp=median(pred),minp=min(pred),maxp=max(pred)) %>% ungroup() %>% arrange(desc(minp))
+predf <- inner_join(consdf,mpred) %>% arrange(desc(minp))
+predf <- arrange(predf,desc(meanp))
+
+ggplot(predf)+geom_point(aes(x=log(meanp),y=log(TADAq)))
+ggplot(predf)+geom_point(aes(x=log(minp),y=log(TADAq)))
+
+write.table(predf,"~/Dropbox/BayesianDA/FGEM/ASD_Candidates.txt",col.names=T,row.names=F,sep="\t")
+finlist <- filter(predf,TADAq<0.5) %>% arrange(desc(meanp))
+arrange(finlist,desc(worstp)) %>% head
+
+
+```
+
+Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
diff --git a/analyses/ExAC_Correction.Rmd b/analyses/ExAC_Correction.Rmd
@@ -0,0 +1,80 @@
+---
+title: "ExAC_Corrections"
+author: "Nicholas Knoblauch"
+date: "October 15, 2016"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## R Markdown
+
+This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
+
+When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
+
+```{r data}
+library(FGEM)
+library(readr)
+library(dplyr)
+library(lazyeval)
+datafile <- "~/Dropbox/BayesianDA/FGEM_Data/TADA_ASC_SSC_results_Dec23.csv"
+anno_dff <- "~/Dropbox/BayesianDA/FGEM_Data/all_annotations.RDS"
+
+datadf <-read.table(datafile,header=T,sep=",",stringsAsFactors = F) 
+anno_df <- readRDS(anno_dff)
+exacf <- "~/Dropbox/BayesianDA/FGEM_Data/fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt"
+exacdf <- read.table(exacf,sep="\t",stringsAsFactors = F,header=T)
+exacdf <- dplyr::select(exacdf,-chr,-transcript)  %>% dplyr::rename(Gene=gene)
+exacdf <- inner_join(exacdf,datadf)
+exac_feat <- c("syn_z",
+               "mis_z",
+               "lof_z",
+               "pLI",
+               "pRec",
+               "pNull")
+i <- 1
+eq <- paste0("y~x")
+eresid <- exacdf %>% mutate(res_syn_z=residuals(lm(interp(eq,y=syn_z,x=bp))))
+eresid <-  mutate(eresid,res_mis_z=residuals(lm(interp(eq,y=mis_z,x=bp))))
+eresid <-  mutate(eresid,res_lof_z=residuals(lm(interp(eq,y=lof_z,x=bp))))
+eresid <-  mutate(eresid,res_pLI_z=residuals(lm(interp(eq,y=pLI,x=bp))))
+eresid <-  mutate(eresid,res_pnull_z=residuals(lm(interp(eq,y=pNull,x=bp))))
+eresid <-  mutate(eresid,res_pRec_z=residuals(lm(interp(eq,y=pRec,x=bp))))
+
+exac_feat <- c("syn_z",
+               "mis_z",
+               "pLI",
+               "pRec",
+               "pNull")
+ofeat <- anno2df(select(eresid,Gene,one_of(exac_feat)),feat.name="ExAC_z")
+eres_feat <- select(eresid,Gene,starts_with("res_"))
+eres_feat <- anno2df(eres_feat,feat.name="ExAC_resid")
+anno_df <- bind_rows(anno_df,eres_feat)
+go_sigfeat <- c("GO:0071420", "GO:0006473", "GO:0097119", "GO:0001711", "GO:0097114")
+nsig_feat <- c(go_sigfeat,unique(eres_feat$feature))
+nsig_feat <- nsig_feat[!nsig_feat%in%c("res_lof_z","res_pRec_z")]
+
+
+sanno_df <- filter(anno_df,feature=="GO:0018024")
+smdf <- cfeat_df(sanno_df,datadf)
+tgmodel <- gen_model("GO:0018024",anno_df,datadf)
+res_post <- posterior_results(nfmodel,datadf,anno_df) %>% arrange(desc(post_improvement))
+head(res_post)
+tail(res_post)
+res_results <- group_by(eres_feat,feature) %>% do(sem_df(cfeat_df(.,datadf)))
+
+
+# exac_zs <- select(exacdf,Gene,BF,qvalue,one_of(exac_feat))
+neresid  <- inner_join(eresid,res_post)
+
+cor(log(neresid$new_posterior),log(neresid$res_pRec_z),)
+
+texac_zs <- mutate_at(exac_zs,vars(ends_with("_z")),percent_rank)
+filter(texac_zs,Gene=="KATNAL2")
+exacdf <- anno2df(exacdf,feat.name="ExAC")
+
+```
+