biocore · wasade · Jan 22, 2016 · Jan 24, 2016 · Jan 24, 2016 · Feb 5, 2016
diff --git a/ipynb/agp_distance_decay.ipynb b/ipynb/agp_distance_decay.ipynb
@@ -0,0 +1,279 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This Jupyter notebook recreates the distance-decay analyses for the American Gut Project paper by McDonald et al.\n",
+    "\n",
+    "Ashkaan K Fahimipour\n",
+    "2016.01.21"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "## Let's first install and load some libraries.\n",
+    "install.packages('biom', repos = \"http://cran.rstudio.com/\")\n",
+    "install.packages('phyloseq', repos = \"http://cran.rstudio.com/\")\n",
+    "install.packages('qiimer', repos = \"http://cran.rstudio.com/\")\n",
+    "install.packages('ggplot2', repos = \"http://cran.rstudio.com/\")\n",
+    "install.packages('vegan', repos = \"http://cran.rstudio.com/\")\n",
+    "install.packages('fields', repos = \"http://cran.rstudio.com/\")\n",
+    "install.packages('dplyr', repos = \"http://cran.rstudio.com/\")\n",
+    "library(biom)\n",
+    "library(phyloseq)\n",
+    "library(qiimer)\n",
+    "library(ggplot2)\n",
+    "library(vegan)\n",
+    "library(fields)\n",
+    "library(dplyr)\n",
+    "\n",
+    "## I was given a 1-column csv file with sample IDs to include in my analyses\n",
+    "use <- read.csv('filepath/file.csv', header = TRUE)\n",
+    "use[, 1] <- as.character(use[, 1])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we'll read in the BIOM table into R for analysis. R really hates non-JSON formatted BIOM tables (e.g., tables generated with macqiime v1.9), so you may have to use BIOM-convert in to reformat to JSON before importing:\n",
+    "\n",
+    "http://biom-format.org/documentation/biom_conversion.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "## import the table\n",
+    "biom <- read_biom('filepath/file.biom')\n",
+    "\n",
+    "## subset data by 'Samples To Use' file\n",
+    "dat <- biom_data(biom)\n",
+    "dat <- as.data.frame(as(biom_data(biom), \"matrix\"), header = TRUE)\n",
+    "dat <- dat[, which(names(dat) %in% use[, 1])]\n",
+    "\n",
+    "## subset again by samples that only occur in >= 10 samples per Josh Ladau's instructions\n",
+    "dat <- dat[-c(which(rowSums(dat != 0) < 10)), ]\n",
+    "\n",
+    "## pull out metadata\n",
+    "tax <- as(observation_metadata(biom), \"matrix\")\n",
+    "meta <- as(sample_metadata(biom), \"matrix\")\n",
+    "meta <- as.data.frame(samp)\n",
+    "\n",
+    "## subset metadata by 'use' rows\n",
+    "meta <- meta[c(which(rownames(meta) %in% use[, 1])), ]\n",
+    "\n",
+    "## coordinates matrix for great circle calculation\n",
+    "## I'm assuming column headers weren't renamed, so I'll go with the same for now\n",
+    "coords <- as.matrix(data.frame(lon = as.numeric(as.character(samp$longitude)),\n",
+    "                               lat = as.numeric(as.character(samp$latitude))))\n",
+    "\n",
+    "## normalize biom table\n",
+    "dat.rel <- t(t(dat) / rowSums(t(dat))) \n",
+    "dat.rel <- dat.rel[which(rowSums(dat.rel) > 0), ]\n",
+    "\n",
+    "## cube-root transform normalized abundances for well-behaved residuals\n",
+    "dat.rel.trans <- dat.rel^(1/3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now that our data are successfully loaded, let's compute our distance matrices."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "## compute Canberra distances\n",
+    "comm.dist <- vegdist(t(dat.rel.trans), method = 'canberra')\n",
+    "comm.dist <- as.matrix(comm.dist)\n",
+    "\n",
+    "## compute great circle distance for sites\n",
+    "gc.dist <- rdist.earth(coords, miles = FALSE, R = 6371) ## manually added a more accurate estimate of Earth's radius\n",
+    "colnames(gc.dist) <- colnames(comm.dist)\n",
+    "rownames(gc.dist) <- rownames(comm.dist)\n",
+    "\n",
+    "## this is a pretty large dataset, so let's save some memory and\n",
+    "## just plot the upper triangles of both matrices to avoid duplicate points\n",
+    "## in the scatterplot\n",
+    "tri.comm <- comm.dist\n",
+    "diag(tri.comm) <- NA\n",
+    "tri.comm[lower.tri(tri.comm)] <- NA\n",
+    "\n",
+    "tri.gc <- gc.dist\n",
+    "diag(tri.gc) <- NA\n",
+    "tri.gc[lower.tri(tri.gc)] <- NA\n",
+    "\n",
+    "## bind data for plotting\n",
+    "ggplot.dat <- na.omit(data.frame('comm' = c(tri.comm), 'gc' = c(tri.gc)))\n",
+    "\n",
+    "## distance-decay plot for all data\n",
+    "ggplot(ggplot.dat, aes(x = gc, y = comm)) + \n",
+    "  geom_point(colour = 'dodgerblue4', alpha = 0.7, size = 0.35) +\n",
+    "  stat_smooth(colour = 'black', method = \"lm\", \n",
+    "              formula = y ~ poly(x, 1)) + ## fits a linear model\n",
+    "  guides(colour = FALSE, fill = FALSE) +\n",
+    "  xlab('Spatial Distance (km)') +\n",
+    "  ylab('Community Distance') +\n",
+    "  theme_bw() + \n",
+    "  theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank(), \n",
+    "        panel.border = element_blank(), panel.grid.minor = element_blank(), \n",
+    "        panel.background = element_blank()) +\n",
+    "  theme(axis.line = element_line(color = 'black'))\n",
+    "\n",
+    "## and we perform a permutational test to assess significance\n",
+    "mant.mod <- mantel(comm.dist, gc.dist, method = 'spear', permutations = 999)\n",
+    "mant.mod"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The next section performs individual analyses at the state-level, and performs Benjamini-Hochberg correction for multiple comparisons and spits them out into a summary table, so that you can evaluate which states (if any) display significant distance-decay relationships and plot those particular states, if you want.\n",
+    "\n",
+    "In the first version of the data I recieved, I had to manually clean up multiple discrepancies in state namings (e.g., state labels for Colorado were inputted as both 'CO' and 'Colorado'. Not sure if this has been corrected in the new data. If not, let me know and we can figure out a quick solution."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "## define some things beforehand\n",
+    "uni.state <- unique(samp$state)\n",
+    "state.frames <- list() ## save the distance outputs for later\n",
+    "mantel.list <- list() ## save statistics for later\n",
+    "\n",
+    "for(s in 1:length(uni.state)){\n",
+    "  curr.state <- uni.state[s]\n",
+    "  meta.state <- as.data.frame(meta)\n",
+    "  meta.state <- subset(meta.state, state == curr.state)\n",
+    "  \n",
+    "  ## subset data by state\n",
+    "  dat.state <- dat[, which(names(dat) %in% rownames(meta.state))]\n",
+    "  \n",
+    "  ## pull out lat and lon coordinates\n",
+    "  coords.state <- data.frame(lon = samp.state[, 'longitude'], lat = samp.state[, 'latitude'])\n",
+    "  rownames(coords.state) <- NULL\n",
+    "  coords.state$lon <- as.numeric(as.character(coords.state$lon)) ## make sure R doesn't do any of its weird factor stuff\n",
+    "  coords.state$lat <- as.numeric(as.character(coords.state$lat))\n",
+    "  coords.state <- as.matrix(coords.state)\n",
+    "  \n",
+    "  ## normalize new community matrix\n",
+    "  dat.rel.state <- t(t(dat.state) / rowSums(t(dat.state)))\n",
+    "  dat.rel.state <- dat.rel.state[which(rowSums(dat.rel.state) > 0), ]\n",
+    "  dat.rel.state.trans <- dat.rel.state^(1/3)  \n",
+    "  canb.state <- vegdist(t(dat.rel.state), method = 'canberra') ## compute state-level distance matrix\n",
+    "  canb.state <- as.matrix(canb.state)\n",
+    "  \n",
+    "  ## compute great circle distance for sites\n",
+    "  gc.state <- rdist.earth(coords.state, miles = FALSE, R = 6371)\n",
+    "  \n",
+    "  ## mantel test\n",
+    "  mant.state <- mantel(canb.state, gc.state, method = 'spear', permutations = 999)\n",
+    "  mantel.list[[s]] <- c(as.character(curr.state), \n",
+    "                             mant.state$statistic, ## correlation\n",
+    "                             mant.state$signif) ## p-value\n",
+    "  )\n",
+    "  \n",
+    "  ## take upper triangle of both matrices\n",
+    "  tri.canb.state <- canb.state\n",
+    "  diag(tri.canb) <- NA\n",
+    "  tri.canb.state[lower.tri(tri.canb.state)] <- NA\n",
+    "  tri.gc.state <- gc.state\n",
+    "  diag(tri.gc.state) <- NA\n",
+    "  tri.gc.state[lower.tri(tri.gc.state)] <- NA\n",
+    "  \n",
+    "  ## save data for plotting and fit model\n",
+    "  state.frames[[s]] <- na.omit(data.frame('comm' = c(tri.canb.state), 'gc' = c(tri.gc.state), 'state' = rep(curr.state)))\n",
+    "  \n",
+    "  ## progress bar to drop us a line\n",
+    "  if(s == 1){bar.vec <- c(na.omit(seq(1:length(uni.state))[1:length(uni.state) * round(length(uni.state) / 10)]))\n",
+    "  cat('|')}\n",
+    "  if(f %in% bar.vec == TRUE){cat('=====|')}\n",
+    "}\n",
+    "\n",
+    "## bind these lists together\n",
+    "state.dat <- do.call('rbind', state.frames)\n",
+    "mantel.frame <- as.data.frame(do.call('rbind', mantel.list))\n",
+    "names(mantel.frame) <- c('state', 'coef', 'p')\n",
+    "\n",
+    "## adjust p-values with FDR\n",
+    "mantel.frame$p.adjust <- p.adjust(mantel.frame$p, method = 'fdr')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You should now have a table called 'mantel.frame' summarizing state-level distance-decay relationships. To plot specific states, run the cell below after specifying which states you want to visualize."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "## subset data by states of interest. for instance, CO and CA.\n",
+    "state.dat.sub <- subset(state.dat, state == 'CO' | state == 'CA')\n",
+    "\n",
+    "## quick ggplot\n",
+    "ggplot(state.dat.sub, aes(x = gc, y = comm, show_guide = FALSE)) + \n",
+    "  geom_point(alpha = 0.8, size = .6, colour = 'dodgerblue4') +\n",
+    "  stat_smooth(width = 5, colour = 'black', method = \"lm\", formula = y ~ x, linetype = 2, level = 0, fill = 'dodgerblue4') +\n",
+    "  xlab('Great Circle Distance (km)') +\n",
+    "  ylab('Community Distance') +\n",
+    "  guides(colour = FALSE, fill = FALSE) +\n",
+    "  theme_bw() + \n",
+    "  theme(panel.grid.minor = element_blank(), panel.grid.major = element_blank(), \n",
+    "        panel.border = element_blank(), panel.grid.minor = element_blank(), \n",
+    "        panel.background = element_blank()) +\n",
+    "  theme(axis.line = element_line(color = 'black')) +\n",
+    "  facet_wrap(~ state)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "R",
+   "language": "R",
+   "name": "ir"
+  },
+  "language_info": {
+   "codemirror_mode": "r",
+   "file_extension": ".r",
+   "mimetype": "text/x-r-source",
+   "name": "R",
+   "pygments_lexer": "r",
+   "version": "3.2.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}