New 10x processing scripts

cellgeni · Feb 19, 2022 · 605ae41 · 605ae41
1 parent dfe40fa
commit 605ae41
Show file tree

Hide file tree

Showing 12 changed files with 428 additions and 314 deletions.
diff --git a/scripts/bulk_QC.sh b/scripts/bulk_QC.sh
@@ -0,0 +1,19 @@
+#!/bin/bash 
+
+echo -e "Sample\tN_reads\tN_uniq\tN_multi\tN_gene\tPct_uniq\tPct_multi\tPct_gene"
+
+for i in *
+do
+  if [[ -d $i && -s $i/Log.out ]]
+  then 
+    N1=`grep "Number of input reads " $i/Log.final.out | awk '{print $6}'`	
+    N2=`grep "Uniquely mapped reads number " $i/Log.final.out | awk '{print $6}'`
+    N3=`grep "Number of reads mapped to multiple loci " $i/Log.final.out | awk '{print $9}'`
+    N4=`cat $i/$i.rsem.genes.results | awk '{if (NR>1) {sum+=$5}} END {printf "%d\n",sum}'`
+
+    P1=`echo $N2 | awk -v v=$N1 '{printf "%.2f\n",100*$1/v}'`
+    P2=`echo $N3 | awk -v v=$N1 '{printf "%.2f\n",100*$1/v}'`
+    P3=`echo $N4 | awk -v v=$N1 '{printf "%.2f\n",100*$1/v}'`
+    echo -e "$i\t$N1\t$N2\t$N3\t$N4\t$P1\t$P2\t$P3"
+  fi
+done 
diff --git a/scripts/reorg_bulk_dir.sh b/scripts/reorg_bulk_dir.sh
@@ -0,0 +1,29 @@
+#!/bin/bash 
+
+## change sample tag for reuse!
+## reorganize directories for bulk RNA-seq STAR+rsem processing 
+
+mkdir counts rsem_gene rsem_tran
+
+for i in PR*
+do
+  echo "Processing sample $i - moving files.."
+  cp $i/*genes.results rsem_gene
+  cp $i/*isoforms.results rsem_tran
+done 
+
+cd rsem_gene
+## we assume Ensembl IDs without dots
+
+for i in *genes.results
+do
+  TAG=${i%%.rsem.genes.results}
+  echo "Processing file $TAG - generating count tables.." 
+  echo -e "geneName\tgeneEffLength\t$TAG" > $TAG.counts.tsv
+  grep -v expected_count $i | grep -v _PAR_ | cut -f 1,4,5 >> $TAG.counts.tsv
+done 
+
+mv *.counts.tsv ../counts 
+
+echo "ALL DONE!" 
+
diff --git a/scripts/star_rsem_bulk.sh b/scripts/star_rsem_bulk.sh
@@ -1,38 +1,50 @@
 #!/bin/bash
 
 TAG=$1
-if [[ $TAG == "" ]]
-then
-  >&2 echo "Usage: ./star_rsem_bulk.sh <sample_tag>"
-  >&2 echo "(make sure you set the correct SREF, RREF, and FQDIR variables)"
-  exit 1
-fi
-
-SREF=/nfs/cellgeni/alexp/GRCh38_2020A/STAR
-RREF=/nfs/cellgeni/alexp/GRCh38_2020A/GRCh38_v32_rsem
-FQDIR=/lustre/scratch117/cellgen/cellgeni/alexp/Bulk/Bulk_for_tic957/fastqs
+SREF=/nfs/cellgeni/STAR/human/2020A/index
+RREF=/nfs/cellgeni/STAR/human/2020A/2020A_rsem
+FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-bulkseq/tic-1333/fastqs
 PAIRED="--paired-end"
+STRAND="--forward-prob 0"
 CPUS=16
 
 ## for single-end reads, change both STAR and RSEM options
 ## for strand-specific processing, change --forward-prob in rsem-calculate-expression (1 is FR, 0 is RF, 0.5 is non-strand-specific). 
 
 mkdir $TAG && cd $TAG
-R1=`find $FQDIR/* | grep $TAG | grep R1`
-R2=`find $FQDIR/* | grep $TAG | grep R2`
 
-if [[ $R1 == "" ]]
+R1=""
+R2=""
+if [[ `find $FQDIR/* | grep $TAG | grep "_1\.fastq"` != "" ]]
+then
+  R1=`find $FQDIR/* | grep $TAG | grep "_1\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
+  R2=`find $FQDIR/* | grep $TAG | grep "_2\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
+elif [[ `find $FQDIR/* | grep $TAG | grep "R1\.fastq"` != "" ]]
 then
-  >&2 echo "No R1 read files was found for sample tag $TAG!"
-  >&2 echo "Usage: ./star_rsem_bulk.sh <sample_tag>"
+  R1=`find $FQDIR/* | grep $TAG | grep "R1\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
+  R2=`find $FQDIR/* | grep $TAG | grep "R2\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
+elif [[ `find $FQDIR/* | grep $TAG | grep "_R1_.*\.fastq"` != "" ]]
+then
+  R1=`find $FQDIR/* | grep $TAG | grep "_R1_" | sort | tr '\n' ',' | sed "s/,$//g"`
+  R2=`find $FQDIR/* | grep $TAG | grep "_R2_" | sort | tr '\n' ',' | sed "s/,$//g"`
+else
+  >&2 echo "ERROR: No appropriate fastq files were found! Please check file formatting, and check if you have set the right FQDIR."
   exit 1
 fi
 
+GZIP=""
+if [[ `find $FQDIR/* | grep $TAG | grep "\.gz$"` != "" ]]
+then
+  GZIP="--readFilesCommand zcat"
+fi
 
-STAR --runThreadN $CPUS --genomeDir $SREF --readFilesIn $R1 $R2 --readFilesCommand zcat --outFilterMultimapNmax 20 \
+## ENCODE options for RNA-seq alignment
+STAR --runThreadN $CPUS --genomeDir $SREF --readFilesIn $R1 $R2 $GZIP --outFilterMultimapNmax 20 \
      --alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \
      --alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt \
      --outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD \
      --outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM GeneCounts --sjdbScore 1 --limitBAMsortRAM 30000000000
 
-rsem-calculate-expression $PAIRED -p $CPUS --bam --estimate-rspd --seed 12345 --no-bam-output --forward-prob 0.5 Aligned.toTranscriptome.out.bam $RREF $TAG.rsem
+samtools index -@$CPUS Aligned.sortedByCoord.out.bam
+
+rsem-calculate-expression $PAIRED -p $CPUS --bam --estimate-rspd --seed 12345 --no-bam-output $STRAND Aligned.toTranscriptome.out.bam $RREF $TAG.rsem
diff --git a/scripts/starsolo_10x_auto.sh b/scripts/starsolo_10x_auto.sh
@@ -0,0 +1,210 @@
+#!/bin/bash -e 
+
+## v3.0 of STARsolo wrappers is set up to guess the chemistry automatically
+## newest version of the script uses STAR v2.7.9a with EM multimapper processing in STARsolo (but it's not on by default; turn this on with --soloMultiMappers EM)
+## velocyto extra processing also became unnecessary 
+
+TAG=$1
+if [[ $TAG == "" ]]
+then
+  >&2 echo "Usage: ./starsolo_auto.sh <sample_tag>"
+  >&2 echo "(make sure you set the correct REF, FQDIR, and SORTEDBAM/NOBAM variables)"
+  exit 1
+fi
+
+CPUS=16      ## typically bsub this into normal queue with 16 cores and 64 Gb RAM.   
+REF=/nfs/cellgeni/STAR/human/2020A/index  ## choose the appropriate reference 
+FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-starsolo/tic-1328/fastqs  ### change to the directory with fastq files/folders
+## choose one of the two otions, depending on whether you need a BAM file 
+#BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 2 --limitBAMsortRAM 120000000000 --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB GX GN"
+BAM="--outSAMtype None"
+
+###################### DONT CHANGE OPTIONS BELOW THIS LINE ###########################
+
+mkdir $TAG && cd $TAG
+
+## three popular cases: <sample>_1.fastq/<sample>_2.fastq, <sample>.R1.fastq/<sample>.R2.fastq, and <sample>_L001_R1_S001.fastq/<sample>_L001_R2_S001.fastq
+## the command below will generate a comma-separated list for each read
+R1=""
+R2=""
+if [[ `find $FQDIR/* | grep $TAG | grep "_1\.fastq"` != "" ]]
+then 
+  R1=`find $FQDIR/* | grep $TAG | grep "_1\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
+  R2=`find $FQDIR/* | grep $TAG | grep "_2\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
+elif [[ `find $FQDIR/* | grep $TAG | grep "R1\.fastq"` != "" ]]
+then
+  R1=`find $FQDIR/* | grep $TAG | grep "R1\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
+  R2=`find $FQDIR/* | grep $TAG | grep "R2\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"`
+elif [[ `find $FQDIR/* | grep $TAG | grep "_R1_.*\.fastq"` != "" ]]
+then
+  R1=`find $FQDIR/* | grep $TAG | grep "_R1_" | sort | tr '\n' ',' | sed "s/,$//g"`
+  R2=`find $FQDIR/* | grep $TAG | grep "_R2_" | sort | tr '\n' ',' | sed "s/,$//g"`
+else 
+  >&2 echo "ERROR: No appropriate fastq files were found! Please check file formatting, and check if you have set the right FQDIR."
+  exit 1
+fi 
+
+## also define one file from R1/R2; we choose the largest one  
+R1F=`echo $R1 | tr ',' ' ' | xargs ls -s | tail -n1 | awk '{print $2}'`
+R2F=`echo $R2 | tr ',' ' ' | xargs ls -s | tail -n1 | awk '{print $2}'`
+
+## let's see if the files are archived or not. Gzip is the only one we test for, but bgzip archives should work too since they are gzip-compatible.
+GZIP=""
+BC=""
+NBC1=""
+NBC2=""
+NBC3=""
+NBCA=""
+R1LEN=""
+R2LEN=""
+R1DIS=""
+
+## depending on whether the files are archived or not,  
+if [[ `find $FQDIR/* | grep $TAG | grep "\.gz$"` != "" ]]
+then  
+  GZIP="--readFilesCommand zcat"
+  NBC1=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-april-2014_rc.txt | wc -l`
+  NBC2=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-august-2016.txt | wc -l`
+  NBC3=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/3M-february-2018.txt | wc -l`
+  NBCA=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-arc-v1.txt | wc -l`
+  R1LEN=`zcat $R1F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
+  R2LEN=`zcat $R2F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
+  R1DIS=`zcat $R1F | awk 'NR%4==2' | head -n1000 | awk '{print length($0)}' | sort | uniq -c | wc -l`
+  zcat $R1F | head -n 100000 > test.R1.fastq
+  zcat $R2F | head -n 100000 > test.R2.fastq 
+else 
+  NBC1=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-april-2014_rc.txt | wc -l`
+  NBC2=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-august-2016.txt | wc -l`
+  NBC3=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/3M-february-2018.txt | wc -l`
+  NBCA=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-arc-v1.txt | wc -l`
+  R1LEN=`cat $R1F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
+  R2LEN=`cat $R2F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'`
+  R1DIS=`cat $R1F | awk 'NR%4==2' | head -n1000 | awk '{print length($0)}' | sort | uniq -c | wc -l`
+  cat $R1F | head -n 100000 > test.R1.fastq
+  cat $R2F | head -n 100000 > test.R2.fastq 
+fi
+
+## elucidate the right barcode whitelist to use. Grepping out N saves us some trouble. Note the special list for multiome experiments (737K-arc-v1.txt):
+if (( $NBC2 > 5000 )) 
+then 
+  BC=/nfs/cellgeni/STAR/whitelists/737K-august-2016.txt
+elif (( $NBC3 > 5000 ))
+then
+  BC=/nfs/cellgeni/STAR/whitelists/3M-february-2018.txt
+elif (( $NBCA > 5000 ))
+then
+  BC=/nfs/cellgeni/STAR/whitelists/737K-arc-v1.txt
+elif (( $NBC1 > 5000 )) 
+then
+  BC=/nfs/cellgeni/STAR/whitelists/737K-april-2014_rc.txt
+else 
+  >&2 echo "ERROR: No whitelist has matched first 10000 barcodes!"
+  exit 1
+fi 
+
+## check read lengths, fail if something funky is going on: 
+PAIRED=False
+UMILEN=""
+CBLEN=""
+if (( $R1DIS > 1 && $R1LEN <= 30 ))
+then 
+  >&2 echo "ERROR: Read 1 (barcode) has varying length; possibly someone thought it's a good idea to quality-trim it. Please check the fastq files."
+  exit 1
+elif (( $R1LEN < 24 )) 
+then
+  >&2 echo "ERROR: Read 1 (barcode) is less than 24 bp in length. Please check the fastq files."
+  exit 1
+elif (( $R2LEN < 50 )) 
+then
+  >&2 echo "ERROR: Read 2 (biological read) is less than 50 bp in length. Please check the fastq files."
+  exit 1
+fi
+
+## assign the necessary variables for barcode/UMI length/paired-end processing:  
+if (( $R1LEN > 50 )) 
+then
+  PAIRED=True
+  UMILEN=10
+  CBLEN=16
+elif (( $NBC1 > 5000 )) 
+then
+  CBLEN=14
+  UMILEN=$((R1LEN-14))
+else
+  CBLEN=16
+  UMILEN=$((R1LEN-16)) 
+fi 
+
+## finally, see if you have 5' or 3' experiment. I don't know and easier way:  
+STRAND=Forward
+
+STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \
+     --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) \
+     --soloUMIlen $UMILEN --soloStrand Forward \
+     --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
+     --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
+     --soloFeatures Gene --soloOutFileNames test_strand/ features.tsv barcodes.tsv matrix.mtx &> /dev/null 
+rm test.R1.fastq test.R2.fastq
+
+GENEPCT=`grep "Reads Mapped to Gene: Unique+Multipe Gene" test_strand/Gene/Summary.csv | awk -F "," '{printf "%d\n",$2*100}'`
+if (( $GENEPCT < 10 )) 
+then
+  STRAND=Reverse
+fi
+
+## finally, if paired-end experiment turned out to be 3', process it as single-end: 
+if [[ $STRAND == "Forward" && $PAIRED == "True" ]]
+then
+  PAIRED=False
+  if [[ $BC == "/nfs/cellgeni/STAR/whitelists/3M-february-2018.txt" ]] 
+  then
+    UMILEN=12
+  fi
+fi
+
+echo "Done setting up the STARsolo run; here are final processing options:"
+echo "============================================================================="
+echo "Sample: $TAG"
+echo "Paired-end mode: $PAIRED"
+echo "Strand (Forward = 3', Reverse = 5'): $STRAND"
+echo "CB whitelist: $BC"
+echo "CB length: $CBLEN"
+echo "UMI length: $UMILEN"
+echo "GZIP: $GZIP"
+echo "-----------------------------------------------------------------------------"
+echo "Read 1 files: $R1"
+echo "-----------------------------------------------------------------------------"
+echo "Read 2 files: $R2" 
+echo "-----------------------------------------------------------------------------"
+
+if [[ $PAIRED == "True" ]]
+then
+  STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R1 $R2 --runDirPerm All_RWX $GZIP $BAM --soloBarcodeMate 1 --clip5pNbases 39 0 \
+     --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloCBstart 1 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand Forward \
+     --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
+     --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 \
+     --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx
+else 
+  STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_RWX $GZIP $BAM \
+     --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand $STRAND \
+     --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
+     --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
+     --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx
+fi
+
+## index the BAM file
+if [[ -s Aligned.sortedByCoord.out.bam ]]
+then
+  samtools index -@16 Aligned.sortedByCoord.out.bam
+fi
+
+## finally, let's gzip all outputs
+cd output
+for i in Gene/raw Gene/filtered GeneFull/raw GeneFull/filtered Velocyto/raw Velocyto/filtered
+do 
+  cd $i; for j in *; do gzip $j & done
+  cd ../../
+done
+
+wait
+echo "ALL DONE!"
diff --git a/scripts/starsolo_3p_v1.sh b/scripts/starsolo_3p_v1.sh