-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path5_preseq.sh
47 lines (34 loc) · 1.57 KB
/
5_preseq.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/bin/bash
## Script for running preseq to estimate library complexity with shotgun data
## Written by M. Nieves-Colon - 3/29/17
## Updated 12/3/2017 - change -s parameters used and extrapolation total reads
## Usage on cluster: sbatch 5_preseq.sh
# Set program variable
preseq=/home/mnievesc/software/preseq_v2.0/preseq
# Copy over .sort.bam files
cp ../10_DownsampledMapFilter/*sort.bam .
## Use Preseq to estimate complexity curve of sample library (distinct reads produced)
## -B input is bam -v verbose -o output text file name
## If amount of reads is very low, use -s to modify step size (default is 1 million reads)
## 2> prints screen output generated by verbose option to a text file, this can be erased if desired.
# To create list of file names
ls *.bam | cut -d "." -f 1,2 > list
cat list | while read line
do
echo "Processing sample $line"
samtools sort $line.trimmed.merged.mapped.bwa.bam $line.trimmed.merged.mapped.bwa.sort
preseq c_curve -B -v -o $line.ccurve.txt $line.trimmed.merged.mapped.bwa.sort.bam 2> $line.ccurve.stats.txt
done
mkdir ../ccurve_results
mv *ccurve* ../ccurve_results
# To get stats for each file printed to screen
grep "TOTAL READS" *.txt
grep "DISTINCT READS" *txt
## Run extrapolation analysis only for two samples with highest amount of reads
array=("PI-67-D.ds" "PI-67-H.ds" "aGB7-D.ds" "aGB7-H.ds")
for i in "${array[@]}"
do
echo "Processing sample $i"
$preseq lc_extrap -B -v -e 600000000 -o $i.lcextrap.txt $i.trimmed.merged.mapped.q30.bwa.sort.bam 2> $i.lcextrap.stats.txt
done
mv *lcextrap* ../lcextrap_results