5_preseq.sh

#!/bin/bash

## Script for running preseq to estimate library complexity with shotgun data
## Written by M. Nieves-Colon - 3/29/17
## Updated 12/3/2017 - change -s parameters used and extrapolation total reads

## Usage on cluster: sbatch 5_preseq.sh

# Set program variable
preseq=/home/mnievesc/software/preseq_v2.0/preseq

# Copy over .sort.bam files
cp ../10_DownsampledMapFilter/*sort.bam .

## Use Preseq to estimate complexity curve of sample library (distinct reads produced)
##  -B input is bam  -v verbose  -o output text file name
##  If amount of reads is very low, use -s to modify step size (default is 1 million reads)
##  2>  prints screen output generated by verbose option to a text file, this can be erased if desired.

# To create list of file names
ls *.bam | cut -d "." -f 1,2  >  list

cat list | while read line
 do
  echo "Processing sample $line"
  samtools sort $line.trimmed.merged.mapped.bwa.bam $line.trimmed.merged.mapped.bwa.sort
  preseq c_curve -B -v -o $line.ccurve.txt $line.trimmed.merged.mapped.bwa.sort.bam 2>  $line.ccurve.stats.txt
  done

mkdir ../ccurve_results
mv *ccurve* ../ccurve_results

# To get stats for each file printed to screen
grep "TOTAL READS" *.txt
grep "DISTINCT READS" *txt


## Run extrapolation analysis only for two samples with highest amount of reads
array=("PI-67-D.ds" "PI-67-H.ds" "aGB7-D.ds" "aGB7-H.ds")

for i in "${array[@]}"
  do
  echo "Processing sample $i"
  $preseq lc_extrap -B -v -e 600000000  -o $i.lcextrap.txt  $i.trimmed.merged.mapped.q30.bwa.sort.bam 2>  $i.lcextrap.stats.txt
 done

mv *lcextrap* ../lcextrap_results