-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathrecalculate_frequency.R
executable file
·96 lines (81 loc) · 4.4 KB
/
recalculate_frequency.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# Recalculates relative frequency given a pre-filtered file (i.e. allreads_filtered.csv).
# This step is done in the tprk_pipeline.py. recalculate_frequency.R will replace the file.
suppressMessages(suppressWarnings(require("optparse")))
suppressMessages(suppressWarnings(require(vegan)))
option_list <- list(make_option(c("-d", "--directory"), type="character", default=NULL, help="Specify working directory", metavar="character"),
make_option(c("-f", "--filename"), type="character", default=NULL, help="Specify file to recalculate frequency.",
metavar="character"),
make_option(c("-m", "--metadata"), type="character", default=NULL, help="Specify metadata", metavar="character"),
make_option(c("-i", "--illumina_check"), type="character", default=FALSE, help="Specify if these files are only PacBio.",
metavar="character", action="store_true"));
opt_parser <- OptionParser(option_list=option_list);
opt <- parse_args(opt_parser)
path <- "./"
filename <- (opt$filename)
#####
## To run this script manually in R, uncomment the following lines. You do not need to change the preceding line of path
## but remember to recomment the lines if you want to run the script automatically in the pipeline.
## path refers to the folder your metadata.csv and sequencing files (.fastq) are.
## filename refers to the pre-filtered file (i.e. allreads_filtered.csv) that you want to recalculate frequencies for.
#path <- "/Users/uwvirongs/Documents/Michelle/tprk_pipeline/testing2"
#filename <- "/Users/uwvirongs/Documents/Michelle/tprk_pipeline/testing2/allreads_filtered.csv"
## This script can also be run from the command line.
## Usage: rscript \path\to\recalculate_frequency.R -d [path] -f [file_name]
#####
allreads_filtered <- read.table(filename, sep=',', header=TRUE)
# Grabs the actual number of samples.
numsamples <- (length(colnames(allreads_filtered)) - 2) / 4
metadata <- read.table(opt$metadata, sep=',', header=TRUE)
sample_names <- c(as.character(metadata$SampleName))
# TODO: Implement Illumina check.
# if(opt$illumina_check) {
# for (i in c(1:length(sample_names))) {
# IlluminaCounttitle <- paste("Ill_",sample_names[i],"_Count",sep='')
# illumina_filtered <- allreads_filtered[allreads_filtered[[IlluminaCounttitle]] < 5 ,]
# illumina_filtered <- which(allreads_filtered[IlluminaCounttitle] < 5, arr.ind)
# allreads_filtered1[illumina_filter]
# for (j in allreads_filtered[IlluminaCounttitle]) {
# if(j>=5) {
# print(j)
# }
# }
# }
allreads_filtered1 <- allreads_filtered
# Allreads_filtered.csv has both PacBio and Illumina data, so must run through twice.
if (grepl("allreads_filtered.csv",filename)) {
numsamples = numsamples * 2
}
# Loops through samples and recalculates frequency for each sample (column) for each region.
for (sample in c(1:(numsamples))){
freqcol <- (sample * 2) + 1
countcol <- (sample * 2) + 2
for (region in unique(allreads_filtered1$Region)){
allreads_filtered1[which(allreads_filtered1[,1]==region),freqcol] <-
allreads_filtered1[which(allreads_filtered1[,1]==region),countcol] / sum(allreads_filtered1[which(allreads_filtered1[,1]==region),countcol],na.rm=TRUE) * 100
}
allreads_filtered1[,freqcol] <- trimws(format(allreads_filtered1[,freqcol], digits = 4, nsmall = 4))
}
#out_filename <- substr(filename,1,nchar(filename)-4)
#allreads_out <- paste(path,"/",out_filename,".csv",sep='')
# Rewrites the file with the newly calculated relative frequencies.
write.csv(allreads_filtered1,file=filename,row.names=FALSE,quote=FALSE)
# Also makes Amin's requested CSV
if (grepl("allreads_filtered.csv",filename)) {
amin_csv <- t(allreads_filtered1)
variable_region_lists <- split(allreads_filtered1, f = allreads_filtered1$Region)
for (i in 1:length(variable_region_lists)) {
amin_csv <- t(variable_region_lists[[i]])
colnames(amin_csv) <- NULL
to_remove <- "Region"
for (row in 1:nrow(amin_csv)) {
row_name <- rownames(amin_csv)[row]
if (grepl("_RelativeFreq", row_name)) {
to_remove <- c(to_remove, row_name)
}
}
removedamin_csv = amin_csv[ !(row.names(amin_csv) %in% to_remove), ]
path <- gsub("[\r\n]", "", path)
dir.create(file.path(path, "Diversity"), showWarnings=FALSE)
write.table(removedamin_csv,file=paste(path,"/Diversity/V",i,".csv",sep=""), sep=",",quote=FALSE, col.names=FALSE)
}
}