GCperContig.R
##%######################################################%##
#                                                          #
####    this script counts GC and AT bases              ####
####    per contig of the different genomes             ####
#                                                          #
##%######################################################%##
# the only argument is the number of CPUs to use
args <- commandArgs(trailingOnly = TRUE)
nCPUs <- as.integer(args[1])
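# example invocation (the core count is illustrative, adjust to your machine):
#   Rscript GCperContig.R 8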
source("HTvFunctions.R")
# we list the genome sequences downloaded in the first step
genomes <- list.files(pattern = "\\.gz$", full.names = TRUE, recursive = TRUE)
# we will process bigger genomes first
genomes <- genomes[order(-file.size(genomes))]
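# (the largest genomes go first so that, with mc.preschedule = FALSE below,
#  long-running jobs start early and the CPUs stay busy)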
# the function that counts GC and AT bases in one genome file
GCcontent <- function(file) {
    genome <- readDNAStringSet(file)

    # the workhorse command: per-sequence counts of G or C, and of A or T
    counts <- letterFrequency(genome, c("GC", "AT"))

    # we put counts in a data table, one row per sequence (contig),
    # using the first word of each fasta header as the contig name
    # (splitToColumns() comes from the sourced HTvFunctions.R)
    dt <- data.table(
        contig = splitToColumns(names(genome), " ", 1),
        counts
    )

    setnames(dt, 2:3, c("GC", "AT"))
    cat(".") # progress indicator
    dt
}
# we apply the function in parallel for the different genomes
res <- mclapply(genomes, GCcontent, mc.cores = nCPUs, mc.preschedule = FALSE)
res <- rbindlist(res)
writeT(res, "genomes/GCcontents.txt")
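
# downstream usage (illustrative sketch, not part of this script): assuming
# writeT() writes a tab-delimited table, GC content per contig could then be
# computed as
#   gc <- data.table::fread("genomes/GCcontents.txt")
#   gc[, GCpct := GC / (GC + AT)]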