added the analysis script

We uploaded the analysis script
xjtu-omics · Apr 28, 2021 · b7d86f8 · b7d86f8
1 parent 2194542
commit b7d86f8
Show file tree

Hide file tree

Showing 15 changed files with 308 additions and 0 deletions.
diff --git a/analysis_scripts/3d-DNA-whole_pipeline.sh b/analysis_scripts/3d-DNA-whole_pipeline.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#PBS -N juicer_contig_brk
+#PBS -l nodes=1:ppn=90
+#PBS -l walltime=999:00:00
+#PBS -V
+#PBS -q fat
+
+#### step 1
+BWA=/opt/software/bwa/bwa
+SAMTOOLS=~/software/miniconda3/bin/samtools
+BASEDIR=/home/xfyang/poppy_project/poppy_HN1_contig/3dDNA_contig_break
+REFNAME=poppy_HN1_contig.fa.break
+REF=$BASEDIR/references/$REFNAME.fasta
+
+cd $BASEDIR/references/
+
+$BWA index $REF
+
+$SAMTOOLS faidx $REF
+cut -f 1,2 $REF.fai > $REF.size
+
+## step 2
+PYTHON=/opt/anaconda2/bin/python
+GENSCRIPT=/home/xfyang/software/juicer/misc/generate_site_positions.py
+cd $BASEDIR/restriction_sites/
+$PYTHON $GENSCRIPT DpnII $REFNAME $REF
+
+## step 3
+export PATH=/opt/software/bwa/:/opt/anaconda2/bin/:$PATH
+JUICERSH=$BASEDIR/scripts/juicer.sh
+t=90
+
+cd $BASEDIR 
+$JUICERSH -S early -D $BASEDIR -d $BASEDIR -z $REF -y $BASEDIR/restriction_sites/$REFNAME"_DpnII.txt" -p $REF.size -g poppy_HN1 -t $t -s DpnII 2>juicer_log
+
+# run the 3d-dna/run-asm-pipeline.sh that takes the assembly and the $BASEDIR/aligned/merged_nodups.txt. I use --rounds 1 which actually two rounds of missjoin correction (round 0 and round 1, it seems to count from zero). For the cockle however I had to use three rounds (--rounds 1) to get the superscaffolds look as the actual chromosomes (the 'squares' in the contact map).
+cd $BASEDIR
+
+#bash 3d-dna/run-asm-pipeline.sh --sort-output --rounds 1 $BASEDIR/references/$REF $BASEDIR/aligned/merged_nodups.txt
+
+# this last script produces three files ended in FINAL.fasta (the actual scaffolds), FINAL.assembly (the assembly map) and final.hic (the contact map). The last two files are visulized in juicebox
+# The whole process took me about seven days to run, the /scripts/juicer.sh uses 24 CPU, I used 64Gb RAM (more than enough I think)
+
diff --git a/analysis_scripts/P. rohoeas-Purge_dups-cutoffs b/analysis_scripts/P. rohoeas-Purge_dups-cutoffs
@@ -0,0 +1 @@
+5	34	56	67	112	201
diff --git a/analysis_scripts/RNA-seq_analysis.sh b/analysis_scripts/RNA-seq_analysis.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+t=30
+OUTDIR=$WORKDIR
+mkdir $OUTDIR
+HISAT2DIR=/home/software/hisat2/hisat2-2.2.1
+STRINGTIEDIR=/home/software/stringTie/stringtie-2.1.4.Linux_x86_64
+
+cd $OUTDIR
+GFF=gene.gff
+GTF=gene.gtf
+GENOME=genome.fa
+
+## S1, extract exon and splice
+$HISAT2DIR/extract_splice_sites.py $GTF  > gene.ss
+$HISAT2DIR/extract_exons.py $GTF > gene.exon
+
+## S2, buid hisat2 index
+$HISAT2DIR/hisat2-build --ss gene.ss --exon gene.exon -p $t $GENOME hisat2_tran
+
+## S3, mapping RNA-seq
+$HISAT2DIR/hisat2 -p $t --dta -x hisat2_tran -1 $FQ1 -2 $FQ2 -S $SIGN.sam
+$SAMTOOLS sort -@ $t -o $SIGN.bam $SIGN.sam
+
+## S4, Assemble and quantify expressed genes and transcripts
+$STRINGTIEDIR/stringtie -p $t -G $GFF -o $SIGN.gtf -l $SIGN $SIGN.bam
+
+## S5, merge transcripts from all samples:
+$STRINGTIEDIR/stringtie --merge -p $t -G $GFF -o $SIGN"_merged.gtf" $MERGELIST
+
+
+## S6, Estimate transcript abundances and create table counts for Ballgown
+$STRINGTIEDIR/stringtie -e -B -p $t -G $MERGEDGTF -o $onetis/$SIGN.gtf $ALIGNDIR/$SIGN.bam
+
diff --git a/analysis_scripts/busco3_evaluation.sh b/analysis_scripts/busco3_evaluation.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+export AUGUSTUS_CONFIG_PATH=/home/software/augustus-3.3.3/config
+export BUSCO_CONFIG_FILE="/home/software/busco-master/config/config.ini"
+export PATH=/home/software/augustus-3.3.3/bin:/home/xfyang/software/augustus-3.3.3/scripts:$PATH
+cd /home/poppy_project/YMR_nextdenovo/purged_dups_res 
+BUSCO=/home/software/busco-master/scripts/run_BUSCO.py
+PYTHON=/home/software/miniconda2/bin/python
+REF=$referenceFa
+
+$PYTHON $BUSCO -o busco3_Res -i $REF -l /home/busco_lineage/embryophyta_odb9 -m genome -c 28
+
+
diff --git a/analysis_scripts/kaks_calculator_run.sh b/analysis_scripts/kaks_calculator_run.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+cd $WORKDIR
+
+./ParaAT.pl -h $SIGN.genepair -n $SIGN.cds.fa -a $SIGN.pep.fa -p proc -o $SIGN -f axt -k > $SIGN.log
+cat ./$SIGN/*.kaks > $SIGN.kaks
+awk 'NR%2==0' $SIGN.kaks > $SIGN.kaks.value
+
diff --git a/analysis_scripts/maker_anno.sh b/analysis_scripts/maker_anno.sh
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+cd $annotationDir
+/opt/mpi/mpich-3.2/bin/mpirun -n 94 /opt/software/maker/bin/maker 2> maker.log
diff --git a/analysis_scripts/maker_bopts.ctl b/analysis_scripts/maker_bopts.ctl
@@ -0,0 +1,28 @@
+#-----BLAST and Exonerate Statistics Thresholds
+blast_type=ncbi+ #set to 'ncbi+', 'ncbi' or 'wublast'
+
+pcov_blastn=0.8 #Blastn Percent Coverage Threhold EST-Genome Alignments
+pid_blastn=0.85 #Blastn Percent Identity Threshold EST-Genome Aligments
+eval_blastn=1e-10 #Blastn eval cutoff
+bit_blastn=40 #Blastn bit cutoff
+depth_blastn=0 #Blastn depth cutoff (0 to disable cutoff)
+
+pcov_blastx=0.5 #Blastx Percent Coverage Threhold Protein-Genome Alignments
+pid_blastx=0.4 #Blastx Percent Identity Threshold Protein-Genome Aligments
+eval_blastx=1e-06 #Blastx eval cutoff
+bit_blastx=30 #Blastx bit cutoff
+depth_blastx=0 #Blastx depth cutoff (0 to disable cutoff)
+
+pcov_tblastx=0.8 #tBlastx Percent Coverage Threhold alt-EST-Genome Alignments
+pid_tblastx=0.85 #tBlastx Percent Identity Threshold alt-EST-Genome Aligments
+eval_tblastx=1e-10 #tBlastx eval cutoff
+bit_tblastx=40 #tBlastx bit cutoff
+depth_tblastx=0 #tBlastx depth cutoff (0 to disable cutoff)
+
+pcov_rm_blastx=0.5 #Blastx Percent Coverage Threhold For Transposable Element Masking
+pid_rm_blastx=0.4 #Blastx Percent Identity Threshold For Transposbale Element Masking
+eval_rm_blastx=1e-06 #Blastx eval cutoff for transposable element masking
+bit_rm_blastx=30 #Blastx bit cutoff for transposable element masking
+
+ep_score_limit=20 #Exonerate protein percent of maximal score threshold
+en_score_limit=20 #Exonerate nucleotide percent of maximal score threshold
diff --git a/analysis_scripts/maker_exe.ctl b/analysis_scripts/maker_exe.ctl
@@ -0,0 +1,23 @@
+#-----Location of Executables Used by MAKER/EVALUATOR
+makeblastdb=/ncbi-blast-2.2.28+/bin/makeblastdb #location of NCBI+ makeblastdb executable
+blastn=/ncbi-blast-2.2.28+/bin/blastn #location of NCBI+ blastn executable
+blastx=/ncbi-blast-2.2.28+/bin/blastx #location of NCBI+ blastx executable
+tblastx=/ncbi-blast-2.2.28+/bin/tblastx #location of NCBI+ tblastx executable
+formatdb= #location of NCBI formatdb executable
+blastall= #location of NCBI blastall executable
+xdformat= #location of WUBLAST xdformat executable
+blasta= #location of WUBLAST blasta executable
+RepeatMasker=/RepeatMasker/RepeatMasker #location of RepeatMasker executable
+exonerate=/exonerate/bin/exonerate #location of exonerate executable
+
+#-----Ab-initio Gene Prediction Algorithms
+snap=/snap/snap #location of snap executable
+gmhmme3=/genemark_hmm_euk_linux_64/ehmm/gmhmme3 #location of eukaryotic genemark executable
+gmhmmp= #location of prokaryotic genemark executable
+augustus=/home/xiaofei/annotation_dzj/annotation/annotation/Annotation_soft/augustus-3.3/bin/augustus #location of augustus executable
+fgenesh= #location of fgenesh executable
+tRNAscan-SE=tRNAscan-SE #location of trnascan executable
+snoscan=snoscan #location of snoscan executable
+
+#-----Other Algorithms
+probuild=/gm_et_linux_64/gmes_petap/probuild #location of probuild executable (required for genemark)
diff --git a/analysis_scripts/maker_opts.ctl b/analysis_scripts/maker_opts.ctl
@@ -0,0 +1,74 @@
+#-----Genome (these are always required)
+genome=ref.fa #genome sequence (fasta file or fasta embeded in GFF3 file)
+organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic
+
+#-----Re-annotation Using MAKER Derived GFF3
+maker_gff= #MAKER derived GFF3 file
+est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
+altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
+protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
+rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
+model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
+pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
+other_pass=0 #passthrough anyything else in maker_gff: 1 = yes, 0 = no
+
+#-----EST Evidence (for best results provide a file for at least one)
+est=all_tissue_merge.trinity.fasta #set of ESTs or assembled mRNA-seq in fasta format
+altest= #EST/cDNA sequence file in fasta format from an alternate organism
+est_gff= #aligned ESTs or mRNA-seq from an external GFF3 file
+altest_gff= #aligned ESTs from a closly relate species in GFF3 format
+
+#-----Protein Homology Evidence (for best results provide a file for at least one)
+protein=uniprot_sprot.fasta,Araport11_genes.201606.pep.fasta,Beta_vulgaris.RefBeet-1.2.2.pep.all.fa,Vitis_vinifera.12X.pep.all.fa #protein sequence file in fasta format (i.e. from mutiple oransisms)
+protein_gff=  #aligned protein homology evidence from an external GFF3 file
+
+#-----Repeat Masking (leave values blank to skip repeat masking)
+model_org=all #select a model organism for RepBase masking in RepeatMasker
+rmlib=/RepeatModeler/result/consensi.fa.classified #provide an organism specific repeat library in fasta format for RepeatMasker
+repeat_protein=te_proteins.fasta #provide a fasta file of transposable element proteins for RepeatRunner
+rm_gff= #pre-identified repeat elements from an external GFF3 file
+prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
+softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)
+
+#-----Gene Prediction
+snaphmm=/snap/HMM/A.thaliana.hmm #SNAP HMM file
+gmhmm=/genemark_hmm_euk_linux_64/ehmm/a_thaliana.mod #GeneMark HMM file
+augustus_species=tomato #Augustus gene prediction species model
+fgenesh_par_file= #FGENESH parameter file
+pred_gff= #ab-initio predictions from an external GFF3 file
+model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
+est2genome=0 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
+protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no
+trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
+snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
+unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no
+
+#-----Other Annotation Feature Types (features MAKER doesn't recognize)
+other_gff= #extra features to pass-through to final MAKER generated GFF3 file
+
+#-----External Application Behavior Options
+alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
+cpus=28 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)
+
+#-----MAKER Behavior Options
+max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
+min_contig=1 #skip genome contigs below this length (under 10kb are often useless)
+
+pred_flank=200 #flank for extending evidence clusters sent to gene predictors
+pred_stats=0 #report AED and QI statistics for all predictions as well as models
+AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
+min_protein=0 #require at least this many amino acids in predicted proteins
+alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
+always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
+map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
+keep_preds=1 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)
+
+split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
+single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
+single_length=250 #min length required for single exon ESTs if 'single_exon is enabled'
+correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes
+
+tries=2 #number of times to try a contig if there is a failure for some reason
+clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
+clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
+TMP= #specify a directory other than the system default temporary directory for temporary files
diff --git a/analysis_scripts/mcscanx_run.sh b/analysis_scripts/mcscanx_run.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+cd $WORKDIR
+
+MCScanX $SIGN
diff --git a/analysis_scripts/nextdenovo_run.pbs b/analysis_scripts/nextdenovo_run.pbs
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+export PATH=/home/software/miniconda2/bin:$PATH
+SEQSTAT=/home/software/nextdenovo/NextDenovo/bin/seq_stat
+DATADIR=/home/rawdata
+cd $DATADIR
+
+echo $DATADIR
+$SEQSTAT -g $GENOMESIZE $FOFNFILE  ## generate the parameters, FOFNFILE is the fastq file list
+
+
+$NEXTDENOVO run.cfg
+
diff --git a/analysis_scripts/purge_dups-whole-pipeline.sh b/analysis_scripts/purge_dups-whole-pipeline.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+MINIMAP2=/opt/software/minimap2/minimap2
+PURGE_DUP_BIN=/home/software/purge_dups/bin
+
+echo "#####################################"
+echo "REF=$REF"
+echo "QUERY=$QUERY"
+echo "####################################"
+
+cd $WORKDIR
+## Step 1.1
+$MINIMAP2 -x map-ont -t $t $REF $QUERY | gzip -c - > $QUERY.mini.paf.gz
+
+## Step 1.2
+$PURGE_DUP_BIN/split_fa $REF > $REF.split
+$MINIMAP2 -xasm5 -DP -t $t $REF.split $REF.split | gzip -c - > $REF.split.self.paf.gz
+
+## Step 1.3
+$PURGE_DUP_BIN/pbcstat *.mini.paf.gz #(produces PB.base.cov and PB.stat files)
+$PURGE_DUP_BIN/calcuts PB.stat > cutoffs 2>calcults.log
+python purge_dups/scripts/hist_plot.py -c cutoffs PB.stat covs.png
+
+#Step 2. Purge haplotigs and overlaps with the following command.
+$PURGE_DUP_BIN/purge_dups -2 -T cutoffs -c PB.base.cov $REF.split.self.paf.gz > dups.bed 2> purge_dups.log
+#Step 3. Get purged primary and haplotig sequences from draft assembly.
+$PURGE_DUP_BIN/get_seqs dups.bed $REF
diff --git a/analysis_scripts/repetmolder_run.sh b/analysis_scripts/repetmolder_run.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+mkdir ./RepeatModeler
+cd ./RepeatModeler
+/RepeatModeler/BuildDatabase -name mydb genome.fasta > genome.rmodeler.log 
+//RepeatModeler/RepeatModeler -pa 28 -database mydb > genome.repeatmmodeler.out
+rm -r RM_*/round-*
+mv RM_* result
diff --git a/analysis_scripts/scaffHic_breakhic_run.sh b/analysis_scripts/scaffHic_breakhic_run.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+NO_THREAD=$t
+SCAFFHICDIR=/home/software/scaffhic/src
+
+cd $WORKDIR
+$SCAFFHICDIR/breakhic -nodes $NO_THREAD -grid 100  -fq1 $fq1 -fq2 $fq2 $contig $contig.break.fasta
diff --git a/analysis_scripts/trinity_run.pbs b/analysis_scripts/trinity_run.pbs
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+NO_THREAD=$t
+source activate Trinity
+WORKDIR=/home/trinity_out
+DATADIR=/home/rawdata/RNA-seq
+
+echo "trinity assembly transcripts"
+cd $DATADIR
+gunzip -c $TYPE"_R1.fastq.gz" > $TYPE"_R1.fastq"
+gunzip -c $TYPE"_R2.fastq.gz" > $TYPE"_R2.fastq"
+
+cd $WORKDIR
+mkdir $TYPE
+cd $TYPE
+Trinity --seqType fq --max_memory 200G --left $DATADIR/$TYPE"_R1.fastq" --right $DATADIR/$TYPE"_R2.fastq" --CPU $NO_THREAD --output $WORKDIR/$TYPE"2"/trinity --no_version_check
+
+cd $DATADIR
+rm -rf $TYPE"_R1.fastq"
+rm -rf $TYPE"_R2.fastq"