Skip to content

Commit

Permalink
New 10x processing scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
apredeus committed Feb 19, 2022
1 parent dfe40fa commit 605ae41
Show file tree
Hide file tree
Showing 12 changed files with 428 additions and 314 deletions.
19 changes: 19 additions & 0 deletions scripts/bulk_QC.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash

echo -e "Sample\tN_reads\tN_uniq\tN_multi\tN_gene\tPct_uniq\tPct_multi\tPct_gene"

for i in *
do
if [[ -d $i && -s $i/Log.out ]]
then
N1=`grep "Number of input reads " $i/Log.final.out | awk '{print $6}'`
N2=`grep "Uniquely mapped reads number " $i/Log.final.out | awk '{print $6}'`
N3=`grep "Number of reads mapped to multiple loci " $i/Log.final.out | awk '{print $9}'`
N4=`cat $i/$i.rsem.genes.results | awk '{if (NR>1) {sum+=$5}} END {printf "%d\n",sum}'`

P1=`echo $N2 | awk -v v=$N1 '{printf "%.2f\n",100*$1/v}'`
P2=`echo $N3 | awk -v v=$N1 '{printf "%.2f\n",100*$1/v}'`
P3=`echo $N4 | awk -v v=$N1 '{printf "%.2f\n",100*$1/v}'`
echo -e "$i\t$N1\t$N2\t$N3\t$N4\t$P1\t$P2\t$P3"
fi
done
29 changes: 29 additions & 0 deletions scripts/reorg_bulk_dir.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#!/bin/bash

## change sample tag for reuse!
## reorganize directories for bulk RNA-seq STAR+rsem processing

mkdir counts rsem_gene rsem_tran

for i in PR*
do
echo "Processing sample $i - moving files.."
cp $i/*genes.results rsem_gene
cp $i/*isoforms.results rsem_tran
done

cd rsem_gene
## we assume Ensembl IDs without dots

for i in *genes.results
do
TAG=${i%%.rsem.genes.results}
echo "Processing file $TAG - generating count tables.."
echo -e "geneName\tgeneEffLength\t$TAG" > $TAG.counts.tsv
grep -v expected_count $i | grep -v _PAR_ | cut -f 1,4,5 >> $TAG.counts.tsv
done

mv *.counts.tsv ../counts

echo "ALL DONE!"

46 changes: 29 additions & 17 deletions scripts/star_rsem_bulk.sh
Original file line number Diff line number Diff line change
@@ -1,38 +1,50 @@
#!/bin/bash

TAG=$1
if [[ $TAG == "" ]]
then
>&2 echo "Usage: ./star_rsem_bulk.sh <sample_tag>"
>&2 echo "(make sure you set the correct SREF, RREF, and FQDIR variables)"
exit 1
fi

SREF=/nfs/cellgeni/alexp/GRCh38_2020A/STAR
RREF=/nfs/cellgeni/alexp/GRCh38_2020A/GRCh38_v32_rsem
FQDIR=/lustre/scratch117/cellgen/cellgeni/alexp/Bulk/Bulk_for_tic957/fastqs
SREF=/nfs/cellgeni/STAR/human/2020A/index
RREF=/nfs/cellgeni/STAR/human/2020A/2020A_rsem
FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-bulkseq/tic-1333/fastqs
PAIRED="--paired-end"
STRAND="--forward-prob 0"
CPUS=16

## for single-end reads, change both STAR and RSEM options
## for strand-specific processing, change --forward-prob in rsem-calculate-expression (1 is FR, 0 is RF, 0.5 is non-strand-specific).

mkdir $TAG && cd $TAG
R1=`find $FQDIR/* | grep $TAG | grep R1`
R2=`find $FQDIR/* | grep $TAG | grep R2`

if [[ $R1 == "" ]]
R1=""
R2=""
if [[ `find $FQDIR/* | grep $TAG | grep "_1\.fastq"` != "" ]]
then
R1=`find $FQDIR/* | grep $TAG | grep "_1\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
R2=`find $FQDIR/* | grep $TAG | grep "_2\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
elif [[ `find $FQDIR/* | grep $TAG | grep "R1\.fastq"` != "" ]]
then
>&2 echo "No R1 read files was found for sample tag $TAG!"
>&2 echo "Usage: ./star_rsem_bulk.sh <sample_tag>"
R1=`find $FQDIR/* | grep $TAG | grep "R1\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
R2=`find $FQDIR/* | grep $TAG | grep "R2\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
elif [[ `find $FQDIR/* | grep $TAG | grep "_R1_.*\.fastq"` != "" ]]
then
R1=`find $FQDIR/* | grep $TAG | grep "_R1_" | sort | tr '\n' ',' | sed "s/,$//g"`
R2=`find $FQDIR/* | grep $TAG | grep "_R2_" | sort | tr '\n' ',' | sed "s/,$//g"`
else
>&2 echo "ERROR: No appropriate fastq files were found! Please check file formatting, and check if you have set the right FQDIR."
exit 1
fi

GZIP=""
if [[ `find $FQDIR/* | grep $TAG | grep "\.gz$"` != "" ]]
then
GZIP="--readFilesCommand zcat"
fi

STAR --runThreadN $CPUS --genomeDir $SREF --readFilesIn $R1 $R2 --readFilesCommand zcat --outFilterMultimapNmax 20 \
## ENCODE options for RNA-seq alignment
STAR --runThreadN $CPUS --genomeDir $SREF --readFilesIn $R1 $R2 $GZIP --outFilterMultimapNmax 20 \
--alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \
--alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt \
--outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD \
--outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM GeneCounts --sjdbScore 1 --limitBAMsortRAM 30000000000

rsem-calculate-expression $PAIRED -p $CPUS --bam --estimate-rspd --seed 12345 --no-bam-output --forward-prob 0.5 Aligned.toTranscriptome.out.bam $RREF $TAG.rsem
samtools index -@$CPUS Aligned.sortedByCoord.out.bam

rsem-calculate-expression $PAIRED -p $CPUS --bam --estimate-rspd --seed 12345 --no-bam-output $STRAND Aligned.toTranscriptome.out.bam $RREF $TAG.rsem
210 changes: 210 additions & 0 deletions scripts/starsolo_10x_auto.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
#!/bin/bash -e

## v3.0 of STARsolo wrappers is set up to guess the chemistry automatically
## newest version of the script uses STAR v2.7.9a with EM multimapper processing in STARsolo (but it's not on by default; turn this on with --soloMultiMappers EM)
## velocyto extra processing also became unnecessary

TAG=$1
if [[ $TAG == "" ]]
then
>&2 echo "Usage: ./starsolo_auto.sh <sample_tag>"
>&2 echo "(make sure you set the correct REF, FQDIR, and SORTEDBAM/NOBAM variables)"
exit 1
fi

CPUS=16 ## typically bsub this into normal queue with 16 cores and 64 Gb RAM.
REF=/nfs/cellgeni/STAR/human/2020A/index ## choose the appropriate reference
FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-starsolo/tic-1328/fastqs ### change to the directory with fastq files/folders
## choose one of the two otions, depending on whether you need a BAM file
#BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 2 --limitBAMsortRAM 120000000000 --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB GX GN"
BAM="--outSAMtype None"

###################### DONT CHANGE OPTIONS BELOW THIS LINE ###########################

mkdir $TAG && cd $TAG

## three popular cases: <sample>_1.fastq/<sample>_2.fastq, <sample>.R1.fastq/<sample>.R2.fastq, and <sample>_L001_R1_S001.fastq/<sample>_L001_R2_S001.fastq
## the command below will generate a comma-separated list for each read
R1=""
R2=""
if [[ `find $FQDIR/* | grep $TAG | grep "_1\.fastq"` != "" ]]
then
R1=`find $FQDIR/* | grep $TAG | grep "_1\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
R2=`find $FQDIR/* | grep $TAG | grep "_2\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
elif [[ `find $FQDIR/* | grep $TAG | grep "R1\.fastq"` != "" ]]
then
R1=`find $FQDIR/* | grep $TAG | grep "R1\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
R2=`find $FQDIR/* | grep $TAG | grep "R2\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
elif [[ `find $FQDIR/* | grep $TAG | grep "_R1_.*\.fastq"` != "" ]]
then
R1=`find $FQDIR/* | grep $TAG | grep "_R1_" | sort | tr '\n' ',' | sed "s/,$//g"`
R2=`find $FQDIR/* | grep $TAG | grep "_R2_" | sort | tr '\n' ',' | sed "s/,$//g"`
else
>&2 echo "ERROR: No appropriate fastq files were found! Please check file formatting, and check if you have set the right FQDIR."
exit 1
fi

## also define one file from R1/R2; we choose the largest one
R1F=`echo $R1 | tr ',' ' ' | xargs ls -s | tail -n1 | awk '{print $2}'`
R2F=`echo $R2 | tr ',' ' ' | xargs ls -s | tail -n1 | awk '{print $2}'`

## let's see if the files are archived or not. Gzip is the only one we test for, but bgzip archives should work too since they are gzip-compatible.
GZIP=""
BC=""
NBC1=""
NBC2=""
NBC3=""
NBCA=""
R1LEN=""
R2LEN=""
R1DIS=""

## depending on whether the files are archived or not,
if [[ `find $FQDIR/* | grep $TAG | grep "\.gz$"` != "" ]]
then
GZIP="--readFilesCommand zcat"
NBC1=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-april-2014_rc.txt | wc -l`
NBC2=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-august-2016.txt | wc -l`
NBC3=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/3M-february-2018.txt | wc -l`
NBCA=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-arc-v1.txt | wc -l`
R1LEN=`zcat $R1F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
R2LEN=`zcat $R2F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
R1DIS=`zcat $R1F | awk 'NR%4==2' | head -n1000 | awk '{print length($0)}' | sort | uniq -c | wc -l`
zcat $R1F | head -n 100000 > test.R1.fastq
zcat $R2F | head -n 100000 > test.R2.fastq
else
NBC1=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-april-2014_rc.txt | wc -l`
NBC2=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-august-2016.txt | wc -l`
NBC3=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/3M-february-2018.txt | wc -l`
NBCA=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-arc-v1.txt | wc -l`
R1LEN=`cat $R1F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
R2LEN=`cat $R2F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
R1DIS=`cat $R1F | awk 'NR%4==2' | head -n1000 | awk '{print length($0)}' | sort | uniq -c | wc -l`
cat $R1F | head -n 100000 > test.R1.fastq
cat $R2F | head -n 100000 > test.R2.fastq
fi

## elucidate the right barcode whitelist to use. Grepping out N saves us some trouble. Note the special list for multiome experiments (737K-arc-v1.txt):
if (( $NBC2 > 5000 ))
then
BC=/nfs/cellgeni/STAR/whitelists/737K-august-2016.txt
elif (( $NBC3 > 5000 ))
then
BC=/nfs/cellgeni/STAR/whitelists/3M-february-2018.txt
elif (( $NBCA > 5000 ))
then
BC=/nfs/cellgeni/STAR/whitelists/737K-arc-v1.txt
elif (( $NBC1 > 5000 ))
then
BC=/nfs/cellgeni/STAR/whitelists/737K-april-2014_rc.txt
else
>&2 echo "ERROR: No whitelist has matched first 10000 barcodes!"
exit 1
fi

## check read lengths, fail if something funky is going on:
PAIRED=False
UMILEN=""
CBLEN=""
if (( $R1DIS > 1 && $R1LEN <= 30 ))
then
>&2 echo "ERROR: Read 1 (barcode) has varying length; possibly someone thought it's a good idea to quality-trim it. Please check the fastq files."
exit 1
elif (( $R1LEN < 24 ))
then
>&2 echo "ERROR: Read 1 (barcode) is less than 24 bp in length. Please check the fastq files."
exit 1
elif (( $R2LEN < 50 ))
then
>&2 echo "ERROR: Read 2 (biological read) is less than 50 bp in length. Please check the fastq files."
exit 1
fi

## assign the necessary variables for barcode/UMI length/paired-end processing:
if (( $R1LEN > 50 ))
then
PAIRED=True
UMILEN=10
CBLEN=16
elif (( $NBC1 > 5000 ))
then
CBLEN=14
UMILEN=$((R1LEN-14))
else
CBLEN=16
UMILEN=$((R1LEN-16))
fi

## finally, see if you have 5' or 3' experiment. I don't know and easier way:
STRAND=Forward

STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \
--soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) \
--soloUMIlen $UMILEN --soloStrand Forward \
--soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
--soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
--soloFeatures Gene --soloOutFileNames test_strand/ features.tsv barcodes.tsv matrix.mtx &> /dev/null
rm test.R1.fastq test.R2.fastq

GENEPCT=`grep "Reads Mapped to Gene: Unique+Multipe Gene" test_strand/Gene/Summary.csv | awk -F "," '{printf "%d\n",$2*100}'`
if (( $GENEPCT < 10 ))
then
STRAND=Reverse
fi

## finally, if paired-end experiment turned out to be 3', process it as single-end:
if [[ $STRAND == "Forward" && $PAIRED == "True" ]]
then
PAIRED=False
if [[ $BC == "/nfs/cellgeni/STAR/whitelists/3M-february-2018.txt" ]]
then
UMILEN=12
fi
fi

echo "Done setting up the STARsolo run; here are final processing options:"
echo "============================================================================="
echo "Sample: $TAG"
echo "Paired-end mode: $PAIRED"
echo "Strand (Forward = 3', Reverse = 5'): $STRAND"
echo "CB whitelist: $BC"
echo "CB length: $CBLEN"
echo "UMI length: $UMILEN"
echo "GZIP: $GZIP"
echo "-----------------------------------------------------------------------------"
echo "Read 1 files: $R1"
echo "-----------------------------------------------------------------------------"
echo "Read 2 files: $R2"
echo "-----------------------------------------------------------------------------"

if [[ $PAIRED == "True" ]]
then
STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R1 $R2 --runDirPerm All_RWX $GZIP $BAM --soloBarcodeMate 1 --clip5pNbases 39 0 \
--soloType CB_UMI_Simple --soloCBwhitelist $BC --soloCBstart 1 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand Forward \
--soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
--soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 \
--soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx
else
STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_RWX $GZIP $BAM \
--soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand $STRAND \
--soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
--soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
--soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx
fi

## index the BAM file
if [[ -s Aligned.sortedByCoord.out.bam ]]
then
samtools index -@16 Aligned.sortedByCoord.out.bam
fi

## finally, let's gzip all outputs
cd output
for i in Gene/raw Gene/filtered GeneFull/raw GeneFull/filtered Velocyto/raw Velocyto/filtered
do
cd $i; for j in *; do gzip $j & done
cd ../../
done

wait
echo "ALL DONE!"
59 changes: 0 additions & 59 deletions scripts/starsolo_3p_v1.sh

This file was deleted.

Loading

0 comments on commit 605ae41

Please sign in to comment.