-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
12 changed files
with
428 additions
and
314 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#!/bin/bash | ||
|
||
echo -e "Sample\tN_reads\tN_uniq\tN_multi\tN_gene\tPct_uniq\tPct_multi\tPct_gene" | ||
|
||
for i in * | ||
do | ||
if [[ -d $i && -s $i/Log.out ]] | ||
then | ||
N1=`grep "Number of input reads " $i/Log.final.out | awk '{print $6}'` | ||
N2=`grep "Uniquely mapped reads number " $i/Log.final.out | awk '{print $6}'` | ||
N3=`grep "Number of reads mapped to multiple loci " $i/Log.final.out | awk '{print $9}'` | ||
N4=`cat $i/$i.rsem.genes.results | awk '{if (NR>1) {sum+=$5}} END {printf "%d\n",sum}'` | ||
|
||
P1=`echo $N2 | awk -v v=$N1 '{printf "%.2f\n",100*$1/v}'` | ||
P2=`echo $N3 | awk -v v=$N1 '{printf "%.2f\n",100*$1/v}'` | ||
P3=`echo $N4 | awk -v v=$N1 '{printf "%.2f\n",100*$1/v}'` | ||
echo -e "$i\t$N1\t$N2\t$N3\t$N4\t$P1\t$P2\t$P3" | ||
fi | ||
done |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
#!/bin/bash | ||
|
||
## change sample tag for reuse! | ||
## reorganize directories for bulk RNA-seq STAR+rsem processing | ||
|
||
mkdir counts rsem_gene rsem_tran | ||
|
||
for i in PR* | ||
do | ||
echo "Processing sample $i - moving files.." | ||
cp $i/*genes.results rsem_gene | ||
cp $i/*isoforms.results rsem_tran | ||
done | ||
|
||
cd rsem_gene | ||
## we assume Ensembl IDs without dots | ||
|
||
for i in *genes.results | ||
do | ||
TAG=${i%%.rsem.genes.results} | ||
echo "Processing file $TAG - generating count tables.." | ||
echo -e "geneName\tgeneEffLength\t$TAG" > $TAG.counts.tsv | ||
grep -v expected_count $i | grep -v _PAR_ | cut -f 1,4,5 >> $TAG.counts.tsv | ||
done | ||
|
||
mv *.counts.tsv ../counts | ||
|
||
echo "ALL DONE!" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,38 +1,50 @@ | ||
#!/bin/bash | ||
|
||
TAG=$1 | ||
if [[ $TAG == "" ]] | ||
then | ||
>&2 echo "Usage: ./star_rsem_bulk.sh <sample_tag>" | ||
>&2 echo "(make sure you set the correct SREF, RREF, and FQDIR variables)" | ||
exit 1 | ||
fi | ||
|
||
SREF=/nfs/cellgeni/alexp/GRCh38_2020A/STAR | ||
RREF=/nfs/cellgeni/alexp/GRCh38_2020A/GRCh38_v32_rsem | ||
FQDIR=/lustre/scratch117/cellgen/cellgeni/alexp/Bulk/Bulk_for_tic957/fastqs | ||
SREF=/nfs/cellgeni/STAR/human/2020A/index | ||
RREF=/nfs/cellgeni/STAR/human/2020A/2020A_rsem | ||
FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-bulkseq/tic-1333/fastqs | ||
PAIRED="--paired-end" | ||
STRAND="--forward-prob 0" | ||
CPUS=16 | ||
|
||
## for single-end reads, change both STAR and RSEM options | ||
## for strand-specific processing, change --forward-prob in rsem-calculate-expression (1 is FR, 0 is RF, 0.5 is non-strand-specific). | ||
|
||
mkdir $TAG && cd $TAG | ||
R1=`find $FQDIR/* | grep $TAG | grep R1` | ||
R2=`find $FQDIR/* | grep $TAG | grep R2` | ||
|
||
if [[ $R1 == "" ]] | ||
R1="" | ||
R2="" | ||
if [[ `find $FQDIR/* | grep $TAG | grep "_1\.fastq"` != "" ]] | ||
then | ||
R1=`find $FQDIR/* | grep $TAG | grep "_1\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"` | ||
R2=`find $FQDIR/* | grep $TAG | grep "_2\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"` | ||
elif [[ `find $FQDIR/* | grep $TAG | grep "R1\.fastq"` != "" ]] | ||
then | ||
>&2 echo "No R1 read files was found for sample tag $TAG!" | ||
>&2 echo "Usage: ./star_rsem_bulk.sh <sample_tag>" | ||
R1=`find $FQDIR/* | grep $TAG | grep "R1\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"` | ||
R2=`find $FQDIR/* | grep $TAG | grep "R2\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"` | ||
elif [[ `find $FQDIR/* | grep $TAG | grep "_R1_.*\.fastq"` != "" ]] | ||
then | ||
R1=`find $FQDIR/* | grep $TAG | grep "_R1_" | sort | tr '\n' ',' | sed "s/,$//g"` | ||
R2=`find $FQDIR/* | grep $TAG | grep "_R2_" | sort | tr '\n' ',' | sed "s/,$//g"` | ||
else | ||
>&2 echo "ERROR: No appropriate fastq files were found! Please check file formatting, and check if you have set the right FQDIR." | ||
exit 1 | ||
fi | ||
|
||
GZIP="" | ||
if [[ `find $FQDIR/* | grep $TAG | grep "\.gz$"` != "" ]] | ||
then | ||
GZIP="--readFilesCommand zcat" | ||
fi | ||
|
||
STAR --runThreadN $CPUS --genomeDir $SREF --readFilesIn $R1 $R2 --readFilesCommand zcat --outFilterMultimapNmax 20 \ | ||
## ENCODE options for RNA-seq alignment | ||
STAR --runThreadN $CPUS --genomeDir $SREF --readFilesIn $R1 $R2 $GZIP --outFilterMultimapNmax 20 \ | ||
--alignSJoverhangMin 8 --alignSJDBoverhangMin 1 --outFilterMismatchNmax 999 --outFilterMismatchNoverReadLmax 0.04 \ | ||
--alignIntronMin 20 --alignIntronMax 1000000 --alignMatesGapMax 1000000 --outSAMheaderCommentFile COfile.txt \ | ||
--outSAMheaderHD @HD VN:1.4 SO:coordinate --outSAMunmapped Within --outFilterType BySJout --outSAMattributes NH HI AS NM MD \ | ||
--outSAMtype BAM SortedByCoordinate --quantMode TranscriptomeSAM GeneCounts --sjdbScore 1 --limitBAMsortRAM 30000000000 | ||
|
||
rsem-calculate-expression $PAIRED -p $CPUS --bam --estimate-rspd --seed 12345 --no-bam-output --forward-prob 0.5 Aligned.toTranscriptome.out.bam $RREF $TAG.rsem | ||
samtools index -@$CPUS Aligned.sortedByCoord.out.bam | ||
|
||
rsem-calculate-expression $PAIRED -p $CPUS --bam --estimate-rspd --seed 12345 --no-bam-output $STRAND Aligned.toTranscriptome.out.bam $RREF $TAG.rsem |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,210 @@ | ||
#!/bin/bash -e | ||
|
||
## v3.0 of STARsolo wrappers is set up to guess the chemistry automatically | ||
## newest version of the script uses STAR v2.7.9a with EM multimapper processing in STARsolo (but it's not on by default; turn this on with --soloMultiMappers EM) | ||
## velocyto extra processing also became unnecessary | ||
|
||
TAG=$1 | ||
if [[ $TAG == "" ]] | ||
then | ||
>&2 echo "Usage: ./starsolo_auto.sh <sample_tag>" | ||
>&2 echo "(make sure you set the correct REF, FQDIR, and SORTEDBAM/NOBAM variables)" | ||
exit 1 | ||
fi | ||
|
||
CPUS=16 ## typically bsub this into normal queue with 16 cores and 64 Gb RAM. | ||
REF=/nfs/cellgeni/STAR/human/2020A/index ## choose the appropriate reference | ||
FQDIR=/lustre/scratch117/cellgen/cellgeni/TIC-starsolo/tic-1328/fastqs ### change to the directory with fastq files/folders | ||
## choose one of the two otions, depending on whether you need a BAM file | ||
#BAM="--outSAMtype BAM SortedByCoordinate --outBAMsortingThreadN 2 --limitBAMsortRAM 120000000000 --outMultimapperOrder Random --runRNGseed 1 --outSAMattributes NH HI AS nM CB UB GX GN" | ||
BAM="--outSAMtype None" | ||
|
||
###################### DONT CHANGE OPTIONS BELOW THIS LINE ########################### | ||
|
||
mkdir $TAG && cd $TAG | ||
|
||
## three popular cases: <sample>_1.fastq/<sample>_2.fastq, <sample>.R1.fastq/<sample>.R2.fastq, and <sample>_L001_R1_S001.fastq/<sample>_L001_R2_S001.fastq | ||
## the command below will generate a comma-separated list for each read | ||
R1="" | ||
R2="" | ||
if [[ `find $FQDIR/* | grep $TAG | grep "_1\.fastq"` != "" ]] | ||
then | ||
R1=`find $FQDIR/* | grep $TAG | grep "_1\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"` | ||
R2=`find $FQDIR/* | grep $TAG | grep "_2\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"` | ||
elif [[ `find $FQDIR/* | grep $TAG | grep "R1\.fastq"` != "" ]] | ||
then | ||
R1=`find $FQDIR/* | grep $TAG | grep "R1\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"` | ||
R2=`find $FQDIR/* | grep $TAG | grep "R2\.fastq" | sort | tr '\n' ',' | sed "s/,$//g"` | ||
elif [[ `find $FQDIR/* | grep $TAG | grep "_R1_.*\.fastq"` != "" ]] | ||
then | ||
R1=`find $FQDIR/* | grep $TAG | grep "_R1_" | sort | tr '\n' ',' | sed "s/,$//g"` | ||
R2=`find $FQDIR/* | grep $TAG | grep "_R2_" | sort | tr '\n' ',' | sed "s/,$//g"` | ||
else | ||
>&2 echo "ERROR: No appropriate fastq files were found! Please check file formatting, and check if you have set the right FQDIR." | ||
exit 1 | ||
fi | ||
|
||
## also define one file from R1/R2; we choose the largest one | ||
R1F=`echo $R1 | tr ',' ' ' | xargs ls -s | tail -n1 | awk '{print $2}'` | ||
R2F=`echo $R2 | tr ',' ' ' | xargs ls -s | tail -n1 | awk '{print $2}'` | ||
|
||
## let's see if the files are archived or not. Gzip is the only one we test for, but bgzip archives should work too since they are gzip-compatible. | ||
GZIP="" | ||
BC="" | ||
NBC1="" | ||
NBC2="" | ||
NBC3="" | ||
NBCA="" | ||
R1LEN="" | ||
R2LEN="" | ||
R1DIS="" | ||
|
||
## depending on whether the files are archived or not, | ||
if [[ `find $FQDIR/* | grep $TAG | grep "\.gz$"` != "" ]] | ||
then | ||
GZIP="--readFilesCommand zcat" | ||
NBC1=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-april-2014_rc.txt | wc -l` | ||
NBC2=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-august-2016.txt | wc -l` | ||
NBC3=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/3M-february-2018.txt | wc -l` | ||
NBCA=`zcat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-arc-v1.txt | wc -l` | ||
R1LEN=`zcat $R1F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'` | ||
R2LEN=`zcat $R2F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'` | ||
R1DIS=`zcat $R1F | awk 'NR%4==2' | head -n1000 | awk '{print length($0)}' | sort | uniq -c | wc -l` | ||
zcat $R1F | head -n 100000 > test.R1.fastq | ||
zcat $R2F | head -n 100000 > test.R2.fastq | ||
else | ||
NBC1=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-april-2014_rc.txt | wc -l` | ||
NBC2=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-august-2016.txt | wc -l` | ||
NBC3=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/3M-february-2018.txt | wc -l` | ||
NBCA=`cat $R1F | awk 'NR%4==2' | grep -v N | head -n10000 | grep -F -f /nfs/cellgeni/STAR/whitelists/737K-arc-v1.txt | wc -l` | ||
R1LEN=`cat $R1F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'` | ||
R2LEN=`cat $R2F | awk 'NR%4==2' | head -n1000 | awk '{sum+=length($0)} END {printf "%d\n",sum/NR+0.5}'` | ||
R1DIS=`cat $R1F | awk 'NR%4==2' | head -n1000 | awk '{print length($0)}' | sort | uniq -c | wc -l` | ||
cat $R1F | head -n 100000 > test.R1.fastq | ||
cat $R2F | head -n 100000 > test.R2.fastq | ||
fi | ||
|
||
## elucidate the right barcode whitelist to use. Grepping out N saves us some trouble. Note the special list for multiome experiments (737K-arc-v1.txt): | ||
if (( $NBC2 > 5000 )) | ||
then | ||
BC=/nfs/cellgeni/STAR/whitelists/737K-august-2016.txt | ||
elif (( $NBC3 > 5000 )) | ||
then | ||
BC=/nfs/cellgeni/STAR/whitelists/3M-february-2018.txt | ||
elif (( $NBCA > 5000 )) | ||
then | ||
BC=/nfs/cellgeni/STAR/whitelists/737K-arc-v1.txt | ||
elif (( $NBC1 > 5000 )) | ||
then | ||
BC=/nfs/cellgeni/STAR/whitelists/737K-april-2014_rc.txt | ||
else | ||
>&2 echo "ERROR: No whitelist has matched first 10000 barcodes!" | ||
exit 1 | ||
fi | ||
|
||
## check read lengths, fail if something funky is going on: | ||
PAIRED=False | ||
UMILEN="" | ||
CBLEN="" | ||
if (( $R1DIS > 1 && $R1LEN <= 30 )) | ||
then | ||
>&2 echo "ERROR: Read 1 (barcode) has varying length; possibly someone thought it's a good idea to quality-trim it. Please check the fastq files." | ||
exit 1 | ||
elif (( $R1LEN < 24 )) | ||
then | ||
>&2 echo "ERROR: Read 1 (barcode) is less than 24 bp in length. Please check the fastq files." | ||
exit 1 | ||
elif (( $R2LEN < 50 )) | ||
then | ||
>&2 echo "ERROR: Read 2 (biological read) is less than 50 bp in length. Please check the fastq files." | ||
exit 1 | ||
fi | ||
|
||
## assign the necessary variables for barcode/UMI length/paired-end processing: | ||
if (( $R1LEN > 50 )) | ||
then | ||
PAIRED=True | ||
UMILEN=10 | ||
CBLEN=16 | ||
elif (( $NBC1 > 5000 )) | ||
then | ||
CBLEN=14 | ||
UMILEN=$((R1LEN-14)) | ||
else | ||
CBLEN=16 | ||
UMILEN=$((R1LEN-16)) | ||
fi | ||
|
||
## finally, see if you have 5' or 3' experiment. I don't know and easier way: | ||
STRAND=Forward | ||
|
||
STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \ | ||
--soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) \ | ||
--soloUMIlen $UMILEN --soloStrand Forward \ | ||
--soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ | ||
--soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \ | ||
--soloFeatures Gene --soloOutFileNames test_strand/ features.tsv barcodes.tsv matrix.mtx &> /dev/null | ||
rm test.R1.fastq test.R2.fastq | ||
|
||
GENEPCT=`grep "Reads Mapped to Gene: Unique+Multipe Gene" test_strand/Gene/Summary.csv | awk -F "," '{printf "%d\n",$2*100}'` | ||
if (( $GENEPCT < 10 )) | ||
then | ||
STRAND=Reverse | ||
fi | ||
|
||
## finally, if paired-end experiment turned out to be 3', process it as single-end: | ||
if [[ $STRAND == "Forward" && $PAIRED == "True" ]] | ||
then | ||
PAIRED=False | ||
if [[ $BC == "/nfs/cellgeni/STAR/whitelists/3M-february-2018.txt" ]] | ||
then | ||
UMILEN=12 | ||
fi | ||
fi | ||
|
||
echo "Done setting up the STARsolo run; here are final processing options:" | ||
echo "=============================================================================" | ||
echo "Sample: $TAG" | ||
echo "Paired-end mode: $PAIRED" | ||
echo "Strand (Forward = 3', Reverse = 5'): $STRAND" | ||
echo "CB whitelist: $BC" | ||
echo "CB length: $CBLEN" | ||
echo "UMI length: $UMILEN" | ||
echo "GZIP: $GZIP" | ||
echo "-----------------------------------------------------------------------------" | ||
echo "Read 1 files: $R1" | ||
echo "-----------------------------------------------------------------------------" | ||
echo "Read 2 files: $R2" | ||
echo "-----------------------------------------------------------------------------" | ||
|
||
if [[ $PAIRED == "True" ]] | ||
then | ||
STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R1 $R2 --runDirPerm All_RWX $GZIP $BAM --soloBarcodeMate 1 --clip5pNbases 39 0 \ | ||
--soloType CB_UMI_Simple --soloCBwhitelist $BC --soloCBstart 1 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand Forward \ | ||
--soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ | ||
--soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 \ | ||
--soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx | ||
else | ||
STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_RWX $GZIP $BAM \ | ||
--soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand $STRAND \ | ||
--soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ | ||
--soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \ | ||
--soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx | ||
fi | ||
|
||
## index the BAM file | ||
if [[ -s Aligned.sortedByCoord.out.bam ]] | ||
then | ||
samtools index -@16 Aligned.sortedByCoord.out.bam | ||
fi | ||
|
||
## finally, let's gzip all outputs | ||
cd output | ||
for i in Gene/raw Gene/filtered GeneFull/raw GeneFull/filtered Velocyto/raw Velocyto/filtered | ||
do | ||
cd $i; for j in *; do gzip $j & done | ||
cd ../../ | ||
done | ||
|
||
wait | ||
echo "ALL DONE!" |
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.