Skip to content

Commit

Permalink
Commiting changes to Rpackage branch
Browse files Browse the repository at this point in the history
  • Loading branch information
CreRecombinase committed Nov 22, 2016
1 parent 9ffe1bc commit 8db6061
Show file tree
Hide file tree
Showing 18 changed files with 1,086 additions and 77 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ Description: More about what it does (maybe more than one line)
License: What license is it under?
LazyData: TRUE
Imports:
Rcpp (>= 0.12.5),topGO,dplyr,tidyr,org.Hs.eg.db
Rcpp (>= 0.12.5),RcppArmadillo,dplyr,tidyr,SQUAREM
LinkingTo:
Rcpp
Rcpp,RcppArmadillo
120 changes: 85 additions & 35 deletions R/FGEM_Func.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,19 @@
FGEM_Logit <- function(Beta,x,B){
pvec <- 1/(1+exp(-(x%*%Beta)))
uvec <- (pvec*B)/((pvec*B)+(1-pvec))
Beta <- coefficients(glm(uvec~x+0,family=quasibinomial(link="logit")))
Beta <- coefficients(glm.fit(x = x,y = uvec,family=quasibinomial(link="logit"),control=list(maxit=50)))
return(Beta)
}

FGEM_Logit_fit <- function(Beta,x,B){
pvec <- 1/(1+exp(-(x%*%Beta)))
uvec <- (pvec*B)/((pvec*B)+(1-pvec))

return(Beta)
}



FGEM_Logit_log_lik <- function(Beta,x,B){
pvec <- 1/(1+exp(-(x%*%Beta)))
# opvec <- 1/(1+exp((mx%*%Beta)))
Expand Down Expand Up @@ -57,25 +66,19 @@ cfeat_mat <- function(annodf,datadf){
return(inner_join(tannodf,datadf))
}

cfeat_df <- function(annodf,datadf,impute=F){
cfeat_df <- function(annotation_df,datadf){
require(dplyr)
stopifnot(length(unique(annodf$feature))==1)
isbin <- all(annodf$value==1)
datadf <- select(datadf,Gene,BF)
if(isbin){
full_feat <- left_join(datadf,annodf)
full_feat <- mutate(full_feat,feature=feature[!is.na(feature)][1],
class=class[!is.na(class)][1],
value=ifelse(is.na(value),0,1))
}else{
if(impute){
full_feat <- left_join(datadf,annodf)
full_feat <- mutate(full_feat,value=ifelse(is.na(value),mean(value,na.rm=T),value))
}else{
full_feat <- inner_join(datadf,annodf)
}
}
return(full_feat)
require(tidyr)
# stopifnot(length(unique(annodf$feature))==1)
annotation_df <- group_by(annotation_df,feature) %>% mutate(isBin=(n_distinct(value)==1),ngenes=n()) %>%ungroup()
mingenes <-filter(annotation_df,isBin==FALSE) %>% group_by(Gene) %>% summarise(nfeat=n_distinct(feature)) %>% ungroup() %>% filter(nfeat==max(nfeat))
if(all(annotation_df$isBin)){
mingenes <-distinct(annotation_df,Gene)
}
annotation_df <- annotation_df %>% inner_join(mingenes,by="Gene") %>% select(-one_of(c("feat_ind","nfeat","isBin","ngenes"))) %>% spread(feature,value,fill = 0)
datadf <- select(datadf,Gene,BF)
full_feat <- inner_join(datadf,annotation_df,by="Gene")
return(full_feat)
}

gen_prior_df <- function(annodf,datadf,scale=F){
Expand All @@ -101,14 +104,13 @@ pmean <-function(Beta,feat_mat){
EM_mat <-function(fBeta,feat_matrix,BF){
require(SQUAREM)
require(dplyr)
require(tidyr)
opf <- squarem(par=fBeta,fixptfn=FGEM_Logit,x=feat_matrix,B=BF)
cn <- colnames(feat_matrix)
opf$LogLik <- -FGEM_Logit_log_lik(opf$par,feat_matrix,BF)
retdf <-data.frame(opf) %>% select(-value.objfn) %>% mutate(Intercept=opf$par[1],feature=cn) %>% rename(Beta=par)
if(nrow(retdf)>1){
retdf <- filter(retdf,feature!="Intercept")
}
return(retdf)
opf$feature_name <- names(opf$par)
opdf <- as_data_frame(data.frame(opf)) %>% select(-value.objfn,-iter,-fpevals,-objfevals)
return(nest(opdf,par,feature_name))
}


Expand All @@ -132,16 +134,43 @@ anno2mat <- function(full_feat){
return(feat_mat)
}

FGEM <-function(fBeta,feat_mat,BF){
FGEM <-function(fBeta,feat_mat,BF,null_features="Intercept"){
retdf <- EM_mat(fBeta,feat_mat,BF) %>% mutate(nac=NA)
retdf <- EM_mat(fBeta[1],feat_mat[,"Intercept",drop=F],BF = BF) %>%
select(NullLogLik=LogLik) %>% mutate(nac=NA) %>%
inner_join(retdf) %>% select(-nac)
retdf <- EM_mat(fBeta[paste0("feat_mat",null_features)],feat_mat[,null_features,drop=F],BF = BF) %>%
select(NullLogLik=LogLik) %>% mutate(nac=NA) %>% inner_join(retdf,by="nac") %>% select(-nac)
retdf <- mutate(retdf,Chisq=-2*(NullLogLik-LogLik),
pval=pchisq(Chisq,df=ncol(feat_mat)-1,lower.tail=F))
return(retdf)
}


FGEM_bootstrap <- function(full_feat,scale=F,prior_mean=0.02,iterations=100){
require(dplyr)
require(tidyr)
require(foreach)
feat_mat <-select(full_feat,Gene,feature,value,BF) %>% spread(feature,value) %>% filter(complete.cases(.))
Genes <- select(feat_mat,Gene)
feat_mat <- mutate(Genes,Intercept=1) %>% inner_join(feat_mat) %>% select(-Gene)

BF <-feat_mat$BF
tmu <-(prior_mean*BF)/((prior_mean*BF)+(1-prior_mean))
feat_mat <- data.matrix(select(feat_mat,-BF))
feat_mat <- scale(feat_mat,scale,scale)
if(sum(is.na(feat_mat))==nrow(feat_mat)){
feat_mat[is.na(feat_mat)] <- 1
}
fBeta <- coefficients(glm(tmu~feat_mat+0,family=quasibinomial(link="logit")))

while(any(is.na(fBeta))){
feat_mat <- feat_mat[,!is.na(fBeta)[-1]]
fBeta <- coefficients(glm(tmu~feat_mat+0,family=quasibinomial(link="logit")))
}
foreach(i=1:iterations,.combine = "bind_rows") %do%{
subsamp <- sample(1:nrow(feat_mat),nrow(feat_mat),replace = T)
bretdf <- FGEM(fBeta=fBeta,feat_mat=feat_mat[subsamp,],BF=BF[subsamp]) %>% mutate(iter=i)
}
}

sem_df <-function(full_feat,scale=F,prior_mean=0.02){
require(dplyr)
require(tidyr)
Expand All @@ -165,9 +194,36 @@ sem_df <-function(full_feat,scale=F,prior_mean=0.02){
}
retdf <- FGEM(fBeta = fBeta,feat_mat=feat_mat,BF=BF)


return(retdf)
}

em_df <-function(data_df,prior_mean=0.02,scale=F,null_feature){

stopifnot(nrow(datadf)>1,
all(null_feature %in% colnames(data_df)),
all(!is.na(data_df)))
data_df <-mutate(data_df,Intercept=1)
null_feature <- unique(c(null_feature,"Intercept"))
BF <- data_df$BF
tmu <-(prior_mean*BF)/((prior_mean*BF)+(1-prior_mean))

feat_mat <- data.matrix(select(data_df,-one_of("BF","Gene","class")))
feat_mat <- scale(feat_mat,scale,scale)
if(sum(is.na(feat_mat))==nrow(feat_mat)){
feat_mat[is.na(feat_mat)] <- 1
}
fBeta <- coefficients(glm(tmu~feat_mat+0,family=quasibinomial(link="logit")))

while(any(is.na(fBeta))){
feat_mat <- feat_mat[,!is.na(fBeta)[-1]]
fBeta <- coefficients(glm(tmu~feat_mat+0,family=quasibinomial(link="logit")))
}
fretdf <- FGEM(fBeta = fBeta,feat_mat=feat_mat,BF=BF,null_features = null_feature)
return(fretdf)
}





Expand All @@ -189,7 +245,7 @@ gen_model <- function(feat_list,annodf,datadf,scale=F){


posterior_results <- function(full_model,datadf,anno_df,scale=F){
retdf <- select(full_model,Beta,Intercept,feature) %>% inner_join(anno_df) %>% do(gen_prior_df(.,datadf,)) %>%
retdf <- select(full_model,Beta,Intercept,feature) %>% inner_join(anno_df) %>% do(gen_prior_df(.,datadf,scale=scale)) %>%
mutate(old_prior=mean(prior)) %>% rename(new_prior=prior) %>% inner_join(datadf) %>%
mutate(new_posterior=new_prior*BF/((new_prior*BF)+(1-new_prior)),
old_posterior=old_prior*BF/((old_prior*BF)+(1-old_prior)),
Expand All @@ -198,12 +254,6 @@ posterior_results <- function(full_model,datadf,anno_df,scale=F){
}


stepwise_model <- function(feat_mat,BF,features,scale=F){
if(length(features==0)){

}
}

# feat_res <- group_by(full_feat,feature) %>% mutate(isbin=(length(unique(value))==2),
# feat_bin=factor(ifelse(isbin,factor(value),value>quantile(value,prior))),
# BF_bin=factor(BF>quantile(BF,prior))) %>%
Expand Down
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,9 @@
# FGEM
Functional GWAS by Expectation Maximization
install using

`install.packages(devtools)`
`library(devtools)`
`install_github("CreRecombinase/FGEM")`

Annotation scripts are in `data-raw`
40 changes: 40 additions & 0 deletions analyses/Bayes_EM_GWAS.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
---
title: "Statistical Gene Mapping Using Gene Annotation and Expectation Maximization"
author: "Nicholas Knoblauch"
date: "October 7, 2015"
output: html_document
---


###Introduction

GWAS, in the most simple context, seeks to identify loci significantly associated with a trait of interest. In the majority of GWAS, variants are weighted equally, that is to say, the prior probability that a particular variant contributes to the trait of interest is uniform. Given that the number of loci in the human genome that are known to very is in the millions, and the number of loci that contribute to a given trait is much smaller, there is an extremely high threshold for significance, due to the multiple-testing problem.
The functional characterization of elements of the genome is another avenue by which we can understand the biological basis of complex traits.

Instead of blindly testing all observed variants in isolation with their association with a trait of interest, it would be desirable to prioritize variants based on their functional relevance to the trait of interest. Furthermore, it would be desirable in the context of coding regions to summarize the contribution of variants within a gene, rather than treating each individually.

###Method

####Data
For each of the $I$ genes being tested $B_i$ is the gene-level Bayes Factor from an association study for the trait of interest. For each of the $I$ genes being tested we also have $J$ functional annotations. These annotations can be a mix of almost any type of data. They could be binary data (e.g presence of absence of a particular GO term), count data (e.g number of exons), or continuous data (e.g level of conservation across mammals). These annotations make up the matrix $A$, which is $IxJ$.

####Model
For each gene, $Z_i=1$ indicates that gene $i$ is involved in the trait of interest, and $Z_i=0$ indicates that is not. If we knew $Z_i$ we could use logistic regression to learn the importance of each annotation in the trait of interest:
$$logit(P(Z_i=1))=A_i\beta$$

However, we do not know $Z_i$. We can instead use Empirical Bayes. If we rewrite our bayes factor as the probability of observing the genotype data given the gene is causal divided by the probability of observing the genotype data given that the gene is not causal, then we can write the following likelihood function.

$$ P(x|\beta)=\prod_iP(x_i|\beta)=\prod_i [\pi_i(\beta)P(x_i|Z_i=1)+(1-\pi_i(\beta))P(x_i|Z_i=0)]$$

Remembering the definition of the Bayes factor from, this is equivalent to

$$P(x|\beta) \propto \prod_i[\pi_i(\beta)B_i+(1-\pi_i(\beta))]$$
And we can use maximum likelihood here to make estimates for $\beta$

Using Bayes rule, the posterior is
$$P(Z_i=1|x)=\frac{P(x|Z_i=1)P(Z_i=1)}{P(x)}=\frac{\pi_i(\beta)B_i}{\pi_i(\beta)B_i+(1-\pi_i(\beta))}$$

###Code

We will use Expectation Maximization to estimate $\beta$ (I will denote the current estimate of $\beta$ as $\beta^{(t)}$) and the membership probabilities(i.e $P(Z_i=1|x,\beta^{(t)})$). The first step is simply to set up our matrix of annotations. For this example We will use the Biological Process (BP) and Molecular Function (MF) GO terms.

73 changes: 73 additions & 0 deletions analyses/CandidateASD.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
---
title: "ASD Gene Set"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

# Candidate ASD genes

```{r load,echo=FALSE,message=FALSE,warning=FALSE}
library(dplyr)
library(ggplot2)
source("~/Dropbox/BayesianDA/FGEM/SFARIFunc.R")
outdir <- "~/Dropbox/BayesianDA/GSE/"
featdir <- "~/Desktop/SFARI/"
consdf <- gen_annotations(filelist,featdir)
consdf <- mutate(consdf,TADAq=ifelse(is.na(TADAq),1,TADAq))
consdf <- mutate(consdf,TADAi=ifelse(TADAq<0.15,1,0))
consdf <- mutate(consdf,UASD=pmax(TADAi,ASDc))
result_df <-readRDS("~/Dropbox/BayesianDA/GSE/COMBUASD-3.RDS")
pred_df <- readRDS("~/Dropbox/BayesianDA/GSE/COMBUASD-3-PRED.RDS")
err_df <- group_by(result_df,iter) %>% summarise(err=err[1])
nconsdf <- select(consdf,-ASDc,-TADAq,-TADAi) %>% rename(y=UASD) %>% combofeat(sep="5")
```

## Including Plots

You can also embed plots, for example:

```{r misclass, echo=FALSE}
ggplot(err_df)+geom_histogram(aes(x=err),binwidth=.005)+ggtitle(label="CV misclassification error across iterations")
```


```{r effect,echo=FALSE}
ggplot(result_df)+geom_histogram(aes(x=B))+facet_wrap(~coef)+ggtitle("Distribution of effect Betas")
filter(result_df,B!=0) %>% mutate(BetaSign=ifelse(B>0,"Positive","Negative")) %>% filter(!grepl("5",coef)) %>%
ggplot(aes(coef))+geom_bar()+geom_bar(aes(fill=BetaSign))+ggtitle("Number of nonzero Betas")
filter(result_df,B!=0) %>% mutate(BetaSign=ifelse(B>0,"Positive","Negative")) %>% filter(grepl("5",coef)) %>%
separate(coef,c("firstCoef","secondCoef"),sep="5",remove=T) %>% do(bind_rows(.,rename(.,firstCoef=secondCoef,secondCoef=firstCoef))) %>%
ggplot(aes(secondCoef))+geom_bar()+geom_bar(aes(fill=BetaSign))+ggtitle("Number of nonzero Betas(Interaction Terms)") +facet_wrap(~firstCoef,scales="free")
```



```{r}
mpred <- group_by(pred_df,gene) %>% summarise(varp=var(pred),meanp=mean(pred),medp=median(pred),minp=min(pred),maxp=max(pred)) %>% ungroup() %>% arrange(desc(minp))
predf <- inner_join(consdf,mpred) %>% arrange(desc(minp))
predf <- arrange(predf,desc(meanp))
ggplot(predf)+geom_point(aes(x=log(meanp),y=log(TADAq)))
ggplot(predf)+geom_point(aes(x=log(minp),y=log(TADAq)))
write.table(predf,"~/Dropbox/BayesianDA/FGEM/ASD_Candidates.txt",col.names=T,row.names=F,sep="\t")
finlist <- filter(predf,TADAq<0.5) %>% arrange(desc(meanp))
arrange(finlist,desc(worstp)) %>% head
```

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
80 changes: 80 additions & 0 deletions analyses/ExAC_Correction.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
---
title: "ExAC_Corrections"
author: "Nicholas Knoblauch"
date: "October 15, 2016"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

```{r data}
library(FGEM)
library(readr)
library(dplyr)
library(lazyeval)
datafile <- "~/Dropbox/BayesianDA/FGEM_Data/TADA_ASC_SSC_results_Dec23.csv"
anno_dff <- "~/Dropbox/BayesianDA/FGEM_Data/all_annotations.RDS"
datadf <-read.table(datafile,header=T,sep=",",stringsAsFactors = F)
anno_df <- readRDS(anno_dff)
exacf <- "~/Dropbox/BayesianDA/FGEM_Data/fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt"
exacdf <- read.table(exacf,sep="\t",stringsAsFactors = F,header=T)
exacdf <- dplyr::select(exacdf,-chr,-transcript) %>% dplyr::rename(Gene=gene)
exacdf <- inner_join(exacdf,datadf)
exac_feat <- c("syn_z",
"mis_z",
"lof_z",
"pLI",
"pRec",
"pNull")
i <- 1
eq <- paste0("y~x")
eresid <- exacdf %>% mutate(res_syn_z=residuals(lm(interp(eq,y=syn_z,x=bp))))
eresid <- mutate(eresid,res_mis_z=residuals(lm(interp(eq,y=mis_z,x=bp))))
eresid <- mutate(eresid,res_lof_z=residuals(lm(interp(eq,y=lof_z,x=bp))))
eresid <- mutate(eresid,res_pLI_z=residuals(lm(interp(eq,y=pLI,x=bp))))
eresid <- mutate(eresid,res_pnull_z=residuals(lm(interp(eq,y=pNull,x=bp))))
eresid <- mutate(eresid,res_pRec_z=residuals(lm(interp(eq,y=pRec,x=bp))))
exac_feat <- c("syn_z",
"mis_z",
"pLI",
"pRec",
"pNull")
ofeat <- anno2df(select(eresid,Gene,one_of(exac_feat)),feat.name="ExAC_z")
eres_feat <- select(eresid,Gene,starts_with("res_"))
eres_feat <- anno2df(eres_feat,feat.name="ExAC_resid")
anno_df <- bind_rows(anno_df,eres_feat)
go_sigfeat <- c("GO:0071420", "GO:0006473", "GO:0097119", "GO:0001711", "GO:0097114")
nsig_feat <- c(go_sigfeat,unique(eres_feat$feature))
nsig_feat <- nsig_feat[!nsig_feat%in%c("res_lof_z","res_pRec_z")]
sanno_df <- filter(anno_df,feature=="GO:0018024")
smdf <- cfeat_df(sanno_df,datadf)
tgmodel <- gen_model("GO:0018024",anno_df,datadf)
res_post <- posterior_results(nfmodel,datadf,anno_df) %>% arrange(desc(post_improvement))
head(res_post)
tail(res_post)
res_results <- group_by(eres_feat,feature) %>% do(sem_df(cfeat_df(.,datadf)))
# exac_zs <- select(exacdf,Gene,BF,qvalue,one_of(exac_feat))
neresid <- inner_join(eresid,res_post)
cor(log(neresid$new_posterior),log(neresid$res_pRec_z),)
texac_zs <- mutate_at(exac_zs,vars(ends_with("_z")),percent_rank)
filter(texac_zs,Gene=="KATNAL2")
exacdf <- anno2df(exacdf,feat.name="ExAC")
```

Loading

0 comments on commit 8db6061

Please sign in to comment.