update the documents

smithlabcode · Jun 27, 2018 · 5a43a39 · 5a43a39
1 parent 686d74b
commit 5a43a39
Show file tree

Hide file tree

Showing 30 changed files with 134 additions and 130 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,11 +1,11 @@
 Package: preseqR
 Type: Package
-Title: Predicting the Number of Species in a Random Sample
+Title: Predicting Species Accumulation Curves
 Version: 4.0.0
-Date: 2017-12-26
+Date: 2018-06-27
 Author: Chao Deng, Timothy Daley and Andrew D. Smith
 Maintainer: Chao Deng <[email protected]>
-Description: The relation between the number of species and the number of individuals in a random sample is a classic problem back to Fisher (1943) <doi:10.2307/1411>. We generalize this problem to predict the number of species represented at least r times in a random sample. In particular when r=1, it becomes the classic problem. We use a mixture of Poisson processes to model sampling procedures and apply an empirical Bayes approach to obtain a rational function estimator. The approach can be applied to assess the quality of DNA sequencing libraries and optimize depths of sequencing experiments. For more information on 'preseqR', see Deng C, Daley T and Smith AD (2015) <doi:10.1007/s40484-015-0049-7> and Deng C and Smith AD (2016) <arXiv:1607.02804v2>.
+Description: Originally developed as an R version of Preseq <doi:10.1038/nmeth.2375>, the package has extended its functionality to predict the r-species accumulation curve (r-SAC), which is the number of species represented at least r times as a function of the sampling effort. When r = 1, the curve is known as the species accumulation curve, or the library complexity curve in high-throughput genomic sequencing. The package includes both parametric and nonparametric methods, as described by Deng C, et al. (2018) <arXiv:1607.02804v3>.
 License: GPL-3
 Imports:
   polynom, graphics, stats
diff --git a/NAMESPACE b/NAMESPACE
@@ -9,6 +9,7 @@ export(preseqR.interpolate.rSAC)
 export(preseqR.rSAC)
 export(preseqR.rSAC.bootstrap)
 export(ds.rSAC)
+export(ds.rSAC.bootstrap)
 export(ztnb.rSAC)
 export(ztp.rSAC)
 export(bbc.rSAC)

diff --git a/R/kmer.R b/R/kmer.R
@@ -26,18 +26,18 @@ kmer.frac <- function(n, r=2, mt=20) {
 
 ## the fraction of k-mers represented at least r times as a function of 
 ## sample sizes
-kmer.frac.curve <- function(n, k, read.len, seq.gb, r=2, mt=20) {
+kmer.frac.curve <- function(n, k, read.len, seq, r=2, mt=20) {
   f <- kmer.frac(n, r=r, mt=mt)
   if (is.null(f))
     return(NULL)
   n[, 2] <- as.numeric(n[, 2])
   N <- n[, 1] %*% n[, 2]
   ## average number of k-mers per read
   m <- read.len - k + 1
-  unit.gb <- N / m * read.len / 1e9
-  seq.effort <- seq.gb / unit.gb
-  result <- matrix(c(seq.gb, f(seq.effort)), ncol=2, byrow=FALSE)
-  colnames(result) <- c("bases(GB)", paste("frac(X>=", r, ")", sep=""))
+  unit <- N / m * read.len
+  seq.effort <- seq / unit
+  result <- matrix(c(seq, f(seq.effort)), ncol=2, byrow=FALSE)
+  colnames(result) <- c("bases", paste("frac(X>=", r, ")", sep=""))
   return(result)
 }
 
@@ -50,7 +50,7 @@ kmer.frac.bootstrap <- function(n, r=2, mt=20, times=30, conf=0.95) {
 
 ## the fraction of k-mers represented at least r times as a function of 
 ## sample sizes
-kmer.frac.curve.bootstrap <- function(n, k, read.len, seq.gb, r=2, mt=20,
+kmer.frac.curve.bootstrap <- function(n, k, read.len, seq, r=2, mt=20,
                                       times=30, conf=0.95)
 {
   f <- kmer.frac.bootstrap(n, r=r, mt=mt, times=times, conf=conf)
@@ -60,11 +60,11 @@ kmer.frac.curve.bootstrap <- function(n, k, read.len, seq.gb, r=2, mt=20,
   N <- n[, 1] %*% n[, 2]
   ## average number of k-mers per read
   m <- read.len - k + 1
-  unit.gb <- N / m * read.len / 1e9
-  seq.effort <- seq.gb / unit.gb
-  result <- matrix(c(seq.gb, f$f(seq.effort), f$lb(seq.effort), 
+  unit <- N / m * read.len
+  seq.effort <- seq / unit
+  result <- matrix(c(seq, f$f(seq.effort), f$lb(seq.effort), 
                      f$ub(seq.effort)), ncol=4, byrow=FALSE)
-  colnames(result) <- c("bases(GB)", paste("frac(X>=", r, ")", sep=""), 
+  colnames(result) <- c("bases", paste("frac(X>=", r, ")", sep=""), 
                         "lb", "ub")
   return(result)
 }
diff --git a/R/sequencing.R b/R/sequencing.R
@@ -20,8 +20,7 @@
 
 ## predict the optimal number of sequenced bases using cost-benefit ratio
 preseqR.optimal.sequencing <- function(
-  n, efficiency=0.05, bin=1e8, r=1, mt=20, size=SIZE.INIT,
-  mu=MU.INIT, times=30, conf=0.95)
+  n, efficiency=0.05, bin=1e8, r=1, mt=20, times=30, conf=0.95)
 {
   find.start <- function(f, N, bin, efficiency) {
     y = sapply(1:100, function(x) (f(x + bin / N) - f(x)) / bin - efficiency)
@@ -36,8 +35,8 @@ preseqR.optimal.sequencing <- function(
   N <- n[, 1] %*% n[, 2]
 
   ## r-species accumulation curve as a function of relative sample size
-  f.rSAC <- preseqR.rSAC.bootstrap(
-    n=n, r=r, mt=mt, size=size, mu=mu,times=times, conf=conf)
+  f.rSAC <- ds.rSAC.bootstrap(
+    n=n, r=r, mt=mt, times=times, conf=conf)
 
   ## hint: using r-SAC as a function of the number of sequenced bases
   f <- f.rSAC$f
@@ -73,7 +72,7 @@ preseqR.optimal.sequencing <- function(
 ## the function is designed for EXOME sequencing, where aligned reads that
 ## map to the same location are removed to avoid potential duplicates
 preseqR.rSAC.sequencing.rmdup <- function(
-  n_base, n_read, r=1, mt=20, times=100, conf=0.95)
+  n_base, n_read, r=1, mt=20, times=30, conf=0.95)
 {
   checking.hist(n_read)
   checking.hist(n_base)

diff --git a/inst/CITATION b/inst/CITATION
@@ -19,14 +19,14 @@ citEntry(entry = "article",
 
 citEntry(entry = "article",
   title        = "Estimating the number of species to attain sufficient representation in a random sample",
-  author       = personList(as.person("Chao Deng"), as.person("Andrew D. Smith")),
+  author       = personList(as.person("Chao Deng"), as.person("Timothy Daley"), as.person("Peter Calabrese"), as.person("Jie Ren"), as.person("Andrew D. Smith")),
   journal      = "arXiv",
-  year         = "2016",
-  url          = "https://arxiv.org/abs/1607.02804v2",
+  year         = "2018",
+  url          = "https://arxiv.org/abs/1607.02804v3",
 
   textVersion  =
-  paste("Deng C and Smith AD (2016).",
+  paste("Deng C, Daley T, Calabrese P, Ren J and Smith AD (2018).",
         "Estimating the number of species to attain sufficient representation in a random sample.",
         "arXiv preprint.",
-        "URL https://arxiv.org/abs/1607.02804v2.")
+        "URL https://arxiv.org/abs/1607.02804v3.")
 )
diff --git a/man/Dickens.Rd b/man/Dickens.Rd
@@ -8,7 +8,7 @@
 \details{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{N_j}, the number of unique words appeared \eqn{j}
+    is \eqn{N_j}, the number of unique words that appeared exactly \eqn{j}
     times in a collection of Charles Dickens.
 }
 

diff --git a/man/ShakespeareWordHist.Rd → man/Shakespeare.Rd b/man/ShakespeareWordHist.Rd → man/Shakespeare.Rd
diff --git a/man/Twitter.Rd b/man/Twitter.Rd
@@ -7,7 +7,7 @@
 \details{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{n_j}, the number of users with \eqn{j} followers.
+    is \eqn{n_j}, the number of users with exactly \eqn{j} followers.
 }
 
 \references{

diff --git a/man/WillButterfly.Rd b/man/WillButterfly.Rd
@@ -13,7 +13,7 @@ Animal Population, Journal of Animal Ecology, 12, 42-58, Table 3.
 \details{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{n_j}, the number of butterflies captured \eqn{j}
+    is \eqn{n_j}, the number of butterflies captured exactly \eqn{j}
     times in the sample.
 }  
 

diff --git a/man/bbc.rSAC.Rd b/man/bbc.rSAC.Rd
@@ -18,7 +18,7 @@ bbc.rSAC(n, r=1)
   \item{n}{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{N_j}, the number of species with each species represented \eqn{j}
+    is \eqn{N_j}, the number of species with each species represented exactly \eqn{j}
     times in the initial sample. The first column must be sorted in an
     ascending order.
   }
@@ -41,6 +41,10 @@ bbc.rSAC(n, r=1)
 Boneh, S., Boneh, A., & Caron, R. J. (1998). Estimating the prediction function
 and the number of unseen species in sampling with replacement.
 Journal of the American Statistical Association, 93(441), 372-379.
+
+Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating
+the number of species to attain sufficient representation in a random sample.
+arXiv preprint arXiv:1607.02804v3.
 }
 
 \examples{

diff --git a/man/cs.rSAC.Rd b/man/cs.rSAC.Rd
@@ -18,7 +18,7 @@ cs.rSAC(n, r=1, k=10)
   \item{n}{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{N_j}, the number of species with each species represented \eqn{j}
+    is \eqn{N_j}, the number of species with each species represented exactly \eqn{j}
     times in the initial sample. The first column must be sorted in an
     ascending order.
   }
@@ -43,6 +43,10 @@ cs.rSAC(n, r=1, k=10)
 \references{
 Chao, A., & Shen, T. J. (2004). Nonparametric prediction in species sampling.       
 Journal of agricultural, biological, and environmental statistics, 9(3), 253-269.
+
+Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating
+the number of species to attain sufficient representation in a random sample.
+arXiv preprint arXiv:1607.02804v3.
 }
 
 \examples{

diff --git a/man/ds.rSAC.Rd b/man/ds.rSAC.Rd
@@ -16,7 +16,7 @@ ds.rSAC(n, r=1, mt=20)
   \item{n}{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{N_j}, the number of species with each species represented \eqn{j}
+    is \eqn{N_j}, the number of species with each species represented exactly \eqn{j}
     times in the initial sample. The first column must be sorted in an
     ascending order.
   }
@@ -45,8 +45,9 @@ ds.rSAC(n, r=1, mt=20)
   the initial sample.
 }
 \references{
-Deng, C and Smith, AD (2016). Estimating the number of species to attain 
-sufficient representation in a random sample. arXiv preprint arXiv:1607.02804
+Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating
+the number of species to attain sufficient representation in a random sample.
+arXiv preprint arXiv:1607.02804v3.
 }
 \author{
   Chao Deng

diff --git a/man/ds.rSAC.bootsrap.Rd b/man/ds.rSAC.bootsrap.Rd
@@ -17,7 +17,7 @@ ds.rSAC.bootstrap(n, r=1, mt=20, times=30, conf=0.95)
   \item{n}{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{N_j}, the number of species with each species represented \eqn{j}
+    is \eqn{N_j}, the number of species with each species represented exactly \eqn{j}
     times in the initial sample. The first column must be sorted in an
     ascending order.
   }
@@ -29,7 +29,7 @@ ds.rSAC.bootstrap(n, r=1, mt=20, times=30, conf=0.95)
     approximations. Default is 20.
   }
   \item{times}{
-    The number of bootstrap samples.
+    The number of bootstrap samples. Default is 30.
   }
   \item{conf}{
     The confidence level. Default is 0.95.
@@ -69,8 +69,9 @@ ds.rSAC.bootstrap(n, r=1, mt=20, times=30, conf=0.95)
 \references{
 Efron, B., & Tibshirani, R. J. (1994). An introduction to the bootstrap. CRC press.
 
-Deng, C & Smith, AD (2016). Estimating the number of species to attain 
-sufficient representation in a random sample. arXiv preprint arXiv:1607.02804
+Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating
+the number of species to attain sufficient representation in a random sample.
+arXiv preprint arXiv:1607.02804v3.
 }
 
 \author{
@@ -102,7 +103,7 @@ sufficient representation in a random sample. arXiv preprint arXiv:1607.02804
 ## when the sample size is 50 or 100 times of the initial sample
 # ds2$f(c(50, 100))
 ## The standard error of the estimates
-# ds2$se(c(50, 100)))
+# ds2$se(c(50, 100))
 ## The confidence interval of the estimates
 # lb <- ds2$lb(c(50, 100))
 # ub <- ds2$ub(c(50, 100))

diff --git a/man/fisher.alpha.Rd b/man/fisher.alpha.Rd
@@ -16,7 +16,7 @@ fisher.alpha(n)
   \item{n}{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{N_j}, the number of species with each species represented \eqn{j}
+    is \eqn{N_j}, the number of species with each species represented exactly \eqn{j}
     times in the initial sample. The first column must be sorted in an
     ascending order.
 }

diff --git a/man/fisher.rSAC.Rd b/man/fisher.rSAC.Rd
@@ -18,7 +18,7 @@ fisher.rSAC(n, r=1)
   \item{n}{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{N_j}, the number of species with each species represented \eqn{j}
+    is \eqn{N_j}, the number of species with each species represented exactly \eqn{j}
     times in the initial sample. The first column must be sorted in an
     ascending order.
   }
@@ -41,6 +41,10 @@ fisher.rSAC(n, r=1)
 Fisher, R., Corbet, A., & Williams, C. (1943). The Relation Between the Number 
 of Species and the Number of Individuals in a Random Sample of an Animal 
 Population. Journal of Animal Ecology, 12(1), 42-58. doi:10.2307/1411  
+
+Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating
+the number of species to attain sufficient representation in a random sample.
+arXiv preprint arXiv:1607.02804v3.
 }
 
 \examples{

diff --git a/man/kmer.frac.curve.Rd b/man/kmer.frac.curve.Rd
@@ -10,7 +10,7 @@ least \eqn{r} times in a high-throughput sequencing experiment given the
 amount of sequencing
 }
 \usage{
-  kmer.frac.curve(n, k, read.len, seq.gb, r=2, mt=20)
+  kmer.frac.curve(n, k, read.len, seq, r=2, mt=20)
 }
 %- maybe also 'usage' for other objects documented here.
 \arguments{
@@ -27,8 +27,8 @@ amount of sequencing
   \item{read.len}{
     The average length of a read.
   }
-  \item{seq.gb}{
-    The amount of sequencing in billions.
+  \item{seq}{
+    The amount of nucleotides sequenced.
   }
   \item{r}{
     A positive integer. Default is 2.
@@ -49,8 +49,7 @@ amount of sequencing
 }
 \value{
   A two-column matrix. The first column is the amount of sequencing in an 
-  experiment. The value is specified by the variable \eqn{seq.gb}. 
-  The second column is the estimate of the fraction of \eqn{k}-mers observed at least
+  experiment. The second column is the estimate of the fraction of \eqn{k}-mers observed at least
   \eqn{r} times in the experiment.
 }
 \references{
@@ -68,9 +67,9 @@ library(preseqR)
 ## import data
 data(SRR061157_k31)
 
-## the fraction of 31-mers represented at least twice in an experiment when
+## the fraction of 31-mers represented at least 10 times in an experiment when
 ## sequencing 1M, 10M, 100M, 1G, 10G, 100G, 1T nucleotides
-kmer.frac.curve(n=SRR061157_k31, k=31, read.len=200, seq.gb=10^(6:12), r=2, mt=20) 
+kmer.frac.curve(n=SRR061157_k31, k=31, read.len=100, seq=10^(6:12), r=10, mt=20) 
 }
 % Add one or more standard keywords, see file 'KEYWORDS' in the
 % R documentation directory.

diff --git a/man/kmer.frac.curve.bootstrap.Rd b/man/kmer.frac.curve.bootstrap.Rd
@@ -10,7 +10,7 @@ least \eqn{r} times in a high-throughput sequencing experiment given the
 amount of sequencing
 }
 \usage{
-kmer.frac.curve.bootstrap(n, k, read.len, seq.gb, r=2, mt=20, times=30, conf=0.95)
+kmer.frac.curve.bootstrap(n, k, read.len, seq, r=2, mt=20, times=30, conf=0.95)
 }
 %- maybe also 'usage' for other objects documented here.
 \arguments{
@@ -27,8 +27,8 @@ kmer.frac.curve.bootstrap(n, k, read.len, seq.gb, r=2, mt=20, times=30, conf=0.9
   \item{read.len}{
     The average length of a read.
   }
-  \item{seq.gb}{
-    The amount of sequencing in billions.
+  \item{seq}{
+    The amount of nucleotides sequenced.
   }
   \item{r}{
     A positive integer. Default is 2.
@@ -55,7 +55,7 @@ kmer.frac.curve.bootstrap(n, k, read.len, seq.gb, r=2, mt=20, times=30, conf=0.9
 }
 \value{
   A four-column matrix. The first column is the amount of sequencing in an 
-  experiment. The value is specified by the variable \eqn{seq.gb}. 
+  experiment.
   The second column is the estimate of the fraction of \eqn{k}-mers observed at least
   \eqn{r} times in the experiment. The third and fourth columns are the lower
   bounds and the upper bounds of the confidence intervals.
@@ -64,8 +64,9 @@ kmer.frac.curve.bootstrap(n, k, read.len, seq.gb, r=2, mt=20, times=30, conf=0.9
 \references{
 Efron, B., & Tibshirani, R. J. (1994). An introduction to the bootstrap. CRC press.
 
-Deng, C and Smith, AD (2016). Estimating the number of species to attain 
-sufficient representation in a random sample. arXiv preprint arXiv:1607.02804
+Deng, C., Daley, T., Calabrese, P., Ren, J., & Smith, A.D. (2016). Estimating
+the number of species to attain sufficient representation in a random sample.
+arXiv preprint arXiv:1607.02804v3.
 }
 
 \author{
@@ -74,15 +75,15 @@ sufficient representation in a random sample. arXiv preprint arXiv:1607.02804
 
 \examples{
 ## load library
-library(preseqR)
+# library(preseqR)
 
 ## import data
-data(SRR061157_k31)
+# data(SRR061157_k31)
 
-## the fraction of 31-mers represented at least twice in an experiment when
+## the fraction of 31-mers represented at least 10 times in an experiment when
 ## sequencing 1M, 10M, 100M, 1G, 10G, 100G, 1T nucleotides
-kmer.frac.curve.bootstrap(n=SRR061157_k31, k=31, read.len=200, 
-                          seq.gb=10^(6:12), r=2, mt=20) 
+# kmer.frac.curve.bootstrap(n=SRR061157_k31, k=31, read.len=100, 
+#                          seq=10^(6:12), r=10, mt=20) 
 }
 % Add one or more standard keywords, see file 'KEYWORDS' in the
 % R documentation directory.

diff --git a/man/preseqR.interpolate.rSAC.Rd b/man/preseqR.interpolate.rSAC.Rd
@@ -16,7 +16,7 @@ preseqR.interpolate.rSAC(n, ss, r=1)
   \item{n}{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{N_j}, the number of species with each species represented \eqn{j}
+    is \eqn{N_j}, the number of species with each species represented exactly \eqn{j}
     times in the initial sample. The first column must be sorted in an
     ascending order.
   }

diff --git a/man/preseqR.nonreplace.sampling.Rd b/man/preseqR.nonreplace.sampling.Rd
@@ -15,7 +15,7 @@ Sampling
   \item{n}{
     A two-column matrix.  
     The first column is the frequency \eqn{j = 1,2,\dots}; and the second column
-    is \eqn{N_j}, the number of species represented \eqn{j} times in the initial
+    is \eqn{N_j}, the number of species represented exactly \eqn{j} times in the initial
     sample. The first column must be sorted in an ascending order.
   }
   \item{size}{