-
Notifications
You must be signed in to change notification settings - Fork 0
/
03_trimmomatic_array_job.R
112 lines (92 loc) · 4 KB
/
03_trimmomatic_array_job.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File: 03_trimmomatic_array_job.R
# Auth: [email protected]
# DESC: create a parameter file and shell script to run array job on hpc
# Date: 17/01/2017
## set variables and source libraries
source('header.R')
## connect to mysql database to get sample information
library('RMySQL')
##### connect to mysql database to get samples
db = dbConnect(MySQL(), user='rstudio', password='12345', dbname='Projects', host='127.0.0.1')
dbListTables(db)
# check how many files each sample has
g_did
q = paste0('select count(File.idSample) as files, Sample.idData, Sample.title, Sample.id as SampleID from File, Sample
where (Sample.idData = 11 and File.idSample = Sample.id) group by File.idSample')
dfQuery = dbGetQuery(db, q)
dfQuery$title = gsub(" ", "", dfQuery$title, fixed = T)
dfQuery
# for each sample id, get the corresponding files
cvQueries = paste0('select File.*, Sample.title from File, Sample
where (Sample.idData = 11 and Sample.id =', dfQuery$SampleID, ') and (File.idSample = Sample.id)')
# set header variables
cvShell = '#!/bin/bash'
cvShell.2 = '#$ -S /bin/bash'
cvProcessors = '#$ -pe smp 1'
cvWorkingDir = '#$ -cwd'
cvJobName = '#$ -N trim-array'
cvStdout = '#$ -j y'
cvMemoryReserve = '#$ -l h_vmem=19G'
# set array job loop
cvArrayJob = '#$ -t 1-9'
# using high memory queue with one slot and 19 Gigs of memory
# set the directory names for trimmomatic
cvInput = 'input/'
cvOutput = 'output/Trimmed/'
cvOutput.unpaired = paste0(cvOutput, 'Unpaired/')
cvTrimmomatic = '/opt/apps/bioinformatics/trimmomatic/0.36/trimmomatic-0.36.jar'
cvIlluminaAdap = '/users/k1625253/brc_scratch/Data/MetaData/trimmomatic_adapters.fa'
# create a parameter file and shell script
dir.create('AutoScripts')
oFile.param = file('AutoScripts/trimmomatic_param.txt', 'wt')
temp = sapply(cvQueries, function(x){
# get the file names
dfFiles = dbGetQuery(db, x)
# check for null return
if (nrow(dfFiles) == 0) return();
# remove white space from title
dfFiles$title = gsub(" ", "", dfFiles$title, fixed=T)
# split the file names into paired end 1 and 2, identified by R1 and R2 in the file name
f = dfFiles$name
d = grepl('_R1_', f)
d = as.character(d)
d[d == 'TRUE'] = 'R1'
d[d == 'FALSE'] = 'R2'
lf = split(f, d)
# write trimmomatic command
in.r1 = paste0(cvInput, lf[[1]])
in.r2 = paste0(cvInput, lf[[2]])
out.r1 = paste0(cvOutput, 'trim_', lf[[1]])
out.r2 = paste0(cvOutput, 'trim_', lf[[2]])
out.r1.up = paste0(cvOutput.unpaired, 'up_', lf[[1]])
out.r2.up = paste0(cvOutput.unpaired, 'up_', lf[[2]])
p1 = paste(in.r1, in.r2, out.r1, out.r1.up, out.r2, out.r2.up, sep=' ')
writeLines(p1, oFile.param)
})
close(oFile.param)
oFile = file('AutoScripts/trimmomatic.sh', 'wt')
writeLines(c('# Autogenerated script from write_trimmomatic_script.R', paste('# date', date())), oFile)
writeLines(c('# make sure directory paths exist before running script'), oFile)
writeLines(c(cvShell, cvShell.2, cvProcessors, cvWorkingDir, cvJobName, cvStdout, cvMemoryReserve, cvArrayJob), oFile)
writeLines('\n\n', oFile)
# module load
writeLines(c('module load general/JRE/1.8.0_65', 'module load bioinformatics/trimmomatic/0.36'), oFile)
writeLines('\n\n', oFile)
## write array job lines
writeLines("# Parse parameter file to get variables.
number=$SGE_TASK_ID
paramfile=trimmomatic_param.txt
inr1=`sed -n ${number}p $paramfile | awk '{print $1}'`
inr2=`sed -n ${number}p $paramfile | awk '{print $2}'`
outr1=`sed -n ${number}p $paramfile | awk '{print $3}'`
outr1up=`sed -n ${number}p $paramfile | awk '{print $4}'`
outr2=`sed -n ${number}p $paramfile | awk '{print $5}'`
outr2up=`sed -n ${number}p $paramfile | awk '{print $6}'`
# 9. Run the program.", oFile)
p1 = paste('java -jar', cvTrimmomatic, 'PE -phred33', '$inr1 $inr2 $outr1 $outr1up $outr2 $outr2up', sep=' ')
p2 = paste0('ILLUMINACLIP:', cvIlluminaAdap, ':2:30:10:8:true') # remove more stringent settings LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36')
com = paste(p1, p2, sep=' ')
writeLines(com, oFile)
writeLines('\n\n', oFile)
close(oFile)
dbDisconnect(db)