Skip to content

Commit

Permalink
added the analysis script
Browse files Browse the repository at this point in the history
We uploaded the analysis script
  • Loading branch information
yangxiaofeill authored Apr 28, 2021
1 parent 2194542 commit b7d86f8
Show file tree
Hide file tree
Showing 15 changed files with 308 additions and 0 deletions.
43 changes: 43 additions & 0 deletions analysis_scripts/3d-DNA-whole_pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash
#PBS -N juicer_contig_brk
#PBS -l nodes=1:ppn=90
#PBS -l walltime=999:00:00
#PBS -V
#PBS -q fat

#### step 1
BWA=/opt/software/bwa/bwa
SAMTOOLS=~/software/miniconda3/bin/samtools
BASEDIR=/home/xfyang/poppy_project/poppy_HN1_contig/3dDNA_contig_break
REFNAME=poppy_HN1_contig.fa.break
REF=$BASEDIR/references/$REFNAME.fasta

cd $BASEDIR/references/

$BWA index $REF

$SAMTOOLS faidx $REF
cut -f 1,2 $REF.fai > $REF.size

## step 2
PYTHON=/opt/anaconda2/bin/python
GENSCRIPT=/home/xfyang/software/juicer/misc/generate_site_positions.py
cd $BASEDIR/restriction_sites/
$PYTHON $GENSCRIPT DpnII $REFNAME $REF

## step 3
export PATH=/opt/software/bwa/:/opt/anaconda2/bin/:$PATH
JUICERSH=$BASEDIR/scripts/juicer.sh
t=90

cd $BASEDIR
$JUICERSH -S early -D $BASEDIR -d $BASEDIR -z $REF -y $BASEDIR/restriction_sites/$REFNAME"_DpnII.txt" -p $REF.size -g poppy_HN1 -t $t -s DpnII 2>juicer_log

# run the 3d-dna/run-asm-pipeline.sh that takes the assembly and the $BASEDIR/aligned/merged_nodups.txt. I use --rounds 1 which actually two rounds of missjoin correction (round 0 and round 1, it seems to count from zero). For the cockle however I had to use three rounds (--rounds 1) to get the superscaffolds look as the actual chromosomes (the 'squares' in the contact map).
cd $BASEDIR

#bash 3d-dna/run-asm-pipeline.sh --sort-output --rounds 1 $BASEDIR/references/$REF $BASEDIR/aligned/merged_nodups.txt

# this last script produces three files ended in FINAL.fasta (the actual scaffolds), FINAL.assembly (the assembly map) and final.hic (the contact map). The last two files are visulized in juicebox
# The whole process took me about seven days to run, the /scripts/juicer.sh uses 24 CPU, I used 64Gb RAM (more than enough I think)

1 change: 1 addition & 0 deletions analysis_scripts/P. rohoeas-Purge_dups-cutoffs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
5 34 56 67 112 201
34 changes: 34 additions & 0 deletions analysis_scripts/RNA-seq_analysis.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash

t=30
OUTDIR=$WORKDIR
mkdir $OUTDIR
HISAT2DIR=/home/software/hisat2/hisat2-2.2.1
STRINGTIEDIR=/home/software/stringTie/stringtie-2.1.4.Linux_x86_64

cd $OUTDIR
GFF=gene.gff
GTF=gene.gtf
GENOME=genome.fa

## S1, extract exon and splice
$HISAT2DIR/extract_splice_sites.py $GTF > gene.ss
$HISAT2DIR/extract_exons.py $GTF > gene.exon

## S2, buid hisat2 index
$HISAT2DIR/hisat2-build --ss gene.ss --exon gene.exon -p $t $GENOME hisat2_tran

## S3, mapping RNA-seq
$HISAT2DIR/hisat2 -p $t --dta -x hisat2_tran -1 $FQ1 -2 $FQ2 -S $SIGN.sam
$SAMTOOLS sort -@ $t -o $SIGN.bam $SIGN.sam

## S4, Assemble and quantify expressed genes and transcripts
$STRINGTIEDIR/stringtie -p $t -G $GFF -o $SIGN.gtf -l $SIGN $SIGN.bam

## S5, merge transcripts from all samples:
$STRINGTIEDIR/stringtie --merge -p $t -G $GFF -o $SIGN"_merged.gtf" $MERGELIST


## S6, Estimate transcript abundances and create table counts for Ballgown
$STRINGTIEDIR/stringtie -e -B -p $t -G $MERGEDGTF -o $onetis/$SIGN.gtf $ALIGNDIR/$SIGN.bam

13 changes: 13 additions & 0 deletions analysis_scripts/busco3_evaluation.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

export AUGUSTUS_CONFIG_PATH=/home/software/augustus-3.3.3/config
export BUSCO_CONFIG_FILE="/home/software/busco-master/config/config.ini"
export PATH=/home/software/augustus-3.3.3/bin:/home/xfyang/software/augustus-3.3.3/scripts:$PATH
cd /home/poppy_project/YMR_nextdenovo/purged_dups_res
BUSCO=/home/software/busco-master/scripts/run_BUSCO.py
PYTHON=/home/software/miniconda2/bin/python
REF=$referenceFa

$PYTHON $BUSCO -o busco3_Res -i $REF -l /home/busco_lineage/embryophyta_odb9 -m genome -c 28


8 changes: 8 additions & 0 deletions analysis_scripts/kaks_calculator_run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash

cd $WORKDIR

./ParaAT.pl -h $SIGN.genepair -n $SIGN.cds.fa -a $SIGN.pep.fa -p proc -o $SIGN -f axt -k > $SIGN.log
cat ./$SIGN/*.kaks > $SIGN.kaks
awk 'NR%2==0' $SIGN.kaks > $SIGN.kaks.value

4 changes: 4 additions & 0 deletions analysis_scripts/maker_anno.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh

cd $annotationDir
/opt/mpi/mpich-3.2/bin/mpirun -n 94 /opt/software/maker/bin/maker 2> maker.log
28 changes: 28 additions & 0 deletions analysis_scripts/maker_bopts.ctl
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#-----BLAST and Exonerate Statistics Thresholds
blast_type=ncbi+ #set to 'ncbi+', 'ncbi' or 'wublast'

pcov_blastn=0.8 #Blastn Percent Coverage Threhold EST-Genome Alignments
pid_blastn=0.85 #Blastn Percent Identity Threshold EST-Genome Aligments
eval_blastn=1e-10 #Blastn eval cutoff
bit_blastn=40 #Blastn bit cutoff
depth_blastn=0 #Blastn depth cutoff (0 to disable cutoff)

pcov_blastx=0.5 #Blastx Percent Coverage Threhold Protein-Genome Alignments
pid_blastx=0.4 #Blastx Percent Identity Threshold Protein-Genome Aligments
eval_blastx=1e-06 #Blastx eval cutoff
bit_blastx=30 #Blastx bit cutoff
depth_blastx=0 #Blastx depth cutoff (0 to disable cutoff)

pcov_tblastx=0.8 #tBlastx Percent Coverage Threhold alt-EST-Genome Alignments
pid_tblastx=0.85 #tBlastx Percent Identity Threshold alt-EST-Genome Aligments
eval_tblastx=1e-10 #tBlastx eval cutoff
bit_tblastx=40 #tBlastx bit cutoff
depth_tblastx=0 #tBlastx depth cutoff (0 to disable cutoff)

pcov_rm_blastx=0.5 #Blastx Percent Coverage Threhold For Transposable Element Masking
pid_rm_blastx=0.4 #Blastx Percent Identity Threshold For Transposbale Element Masking
eval_rm_blastx=1e-06 #Blastx eval cutoff for transposable element masking
bit_rm_blastx=30 #Blastx bit cutoff for transposable element masking

ep_score_limit=20 #Exonerate protein percent of maximal score threshold
en_score_limit=20 #Exonerate nucleotide percent of maximal score threshold
23 changes: 23 additions & 0 deletions analysis_scripts/maker_exe.ctl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#-----Location of Executables Used by MAKER/EVALUATOR
makeblastdb=/ncbi-blast-2.2.28+/bin/makeblastdb #location of NCBI+ makeblastdb executable
blastn=/ncbi-blast-2.2.28+/bin/blastn #location of NCBI+ blastn executable
blastx=/ncbi-blast-2.2.28+/bin/blastx #location of NCBI+ blastx executable
tblastx=/ncbi-blast-2.2.28+/bin/tblastx #location of NCBI+ tblastx executable
formatdb= #location of NCBI formatdb executable
blastall= #location of NCBI blastall executable
xdformat= #location of WUBLAST xdformat executable
blasta= #location of WUBLAST blasta executable
RepeatMasker=/RepeatMasker/RepeatMasker #location of RepeatMasker executable
exonerate=/exonerate/bin/exonerate #location of exonerate executable

#-----Ab-initio Gene Prediction Algorithms
snap=/snap/snap #location of snap executable
gmhmme3=/genemark_hmm_euk_linux_64/ehmm/gmhmme3 #location of eukaryotic genemark executable
gmhmmp= #location of prokaryotic genemark executable
augustus=/home/xiaofei/annotation_dzj/annotation/annotation/Annotation_soft/augustus-3.3/bin/augustus #location of augustus executable
fgenesh= #location of fgenesh executable
tRNAscan-SE=tRNAscan-SE #location of trnascan executable
snoscan=snoscan #location of snoscan executable

#-----Other Algorithms
probuild=/gm_et_linux_64/gmes_petap/probuild #location of probuild executable (required for genemark)
74 changes: 74 additions & 0 deletions analysis_scripts/maker_opts.ctl
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#-----Genome (these are always required)
genome=ref.fa #genome sequence (fasta file or fasta embeded in GFF3 file)
organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic

#-----Re-annotation Using MAKER Derived GFF3
maker_gff= #MAKER derived GFF3 file
est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no
altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no
protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no
rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no
model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no
pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no
other_pass=0 #passthrough anyything else in maker_gff: 1 = yes, 0 = no

#-----EST Evidence (for best results provide a file for at least one)
est=all_tissue_merge.trinity.fasta #set of ESTs or assembled mRNA-seq in fasta format
altest= #EST/cDNA sequence file in fasta format from an alternate organism
est_gff= #aligned ESTs or mRNA-seq from an external GFF3 file
altest_gff= #aligned ESTs from a closly relate species in GFF3 format

#-----Protein Homology Evidence (for best results provide a file for at least one)
protein=uniprot_sprot.fasta,Araport11_genes.201606.pep.fasta,Beta_vulgaris.RefBeet-1.2.2.pep.all.fa,Vitis_vinifera.12X.pep.all.fa #protein sequence file in fasta format (i.e. from mutiple oransisms)
protein_gff= #aligned protein homology evidence from an external GFF3 file

#-----Repeat Masking (leave values blank to skip repeat masking)
model_org=all #select a model organism for RepBase masking in RepeatMasker
rmlib=/RepeatModeler/result/consensi.fa.classified #provide an organism specific repeat library in fasta format for RepeatMasker
repeat_protein=te_proteins.fasta #provide a fasta file of transposable element proteins for RepeatRunner
rm_gff= #pre-identified repeat elements from an external GFF3 file
prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no
softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering)

#-----Gene Prediction
snaphmm=/snap/HMM/A.thaliana.hmm #SNAP HMM file
gmhmm=/genemark_hmm_euk_linux_64/ehmm/a_thaliana.mod #GeneMark HMM file
augustus_species=tomato #Augustus gene prediction species model
fgenesh_par_file= #FGENESH parameter file
pred_gff= #ab-initio predictions from an external GFF3 file
model_gff= #annotated gene models from an external GFF3 file (annotation pass-through)
est2genome=0 #infer gene predictions directly from ESTs, 1 = yes, 0 = no
protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no
trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no
snoscan_rrna= #rRNA file to have Snoscan find snoRNAs
unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no

#-----Other Annotation Feature Types (features MAKER doesn't recognize)
other_gff= #extra features to pass-through to final MAKER generated GFF3 file

#-----External Application Behavior Options
alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases
cpus=28 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI)

#-----MAKER Behavior Options
max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage)
min_contig=1 #skip genome contigs below this length (under 10kb are often useless)

pred_flank=200 #flank for extending evidence clusters sent to gene predictors
pred_stats=0 #report AED and QI statistics for all predictions as well as models
AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1)
min_protein=0 #require at least this many amino acids in predicted proteins
alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no
always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no
map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no
keep_preds=1 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1)

split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments)
single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no
single_length=250 #min length required for single exon ESTs if 'single_exon is enabled'
correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes

tries=2 #number of times to try a contig if there is a failure for some reason
clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no
clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no
TMP= #specify a directory other than the system default temporary directory for temporary files
5 changes: 5 additions & 0 deletions analysis_scripts/mcscanx_run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash

cd $WORKDIR

MCScanX $SIGN
13 changes: 13 additions & 0 deletions analysis_scripts/nextdenovo_run.pbs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/sh

export PATH=/home/software/miniconda2/bin:$PATH
SEQSTAT=/home/software/nextdenovo/NextDenovo/bin/seq_stat
DATADIR=/home/rawdata
cd $DATADIR

echo $DATADIR
$SEQSTAT -g $GENOMESIZE $FOFNFILE ## generate the parameters, FOFNFILE is the fastq file list


$NEXTDENOVO run.cfg

27 changes: 27 additions & 0 deletions analysis_scripts/purge_dups-whole-pipeline.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash

MINIMAP2=/opt/software/minimap2/minimap2
PURGE_DUP_BIN=/home/software/purge_dups/bin

echo "#####################################"
echo "REF=$REF"
echo "QUERY=$QUERY"
echo "####################################"

cd $WORKDIR
## Step 1.1
$MINIMAP2 -x map-ont -t $t $REF $QUERY | gzip -c - > $QUERY.mini.paf.gz

## Step 1.2
$PURGE_DUP_BIN/split_fa $REF > $REF.split
$MINIMAP2 -xasm5 -DP -t $t $REF.split $REF.split | gzip -c - > $REF.split.self.paf.gz

## Step 1.3
$PURGE_DUP_BIN/pbcstat *.mini.paf.gz #(produces PB.base.cov and PB.stat files)
$PURGE_DUP_BIN/calcuts PB.stat > cutoffs 2>calcults.log
python purge_dups/scripts/hist_plot.py -c cutoffs PB.stat covs.png

#Step 2. Purge haplotigs and overlaps with the following command.
$PURGE_DUP_BIN/purge_dups -2 -T cutoffs -c PB.base.cov $REF.split.self.paf.gz > dups.bed 2> purge_dups.log
#Step 3. Get purged primary and haplotig sequences from draft assembly.
$PURGE_DUP_BIN/get_seqs dups.bed $REF
8 changes: 8 additions & 0 deletions analysis_scripts/repetmolder_run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/sh

mkdir ./RepeatModeler
cd ./RepeatModeler
/RepeatModeler/BuildDatabase -name mydb genome.fasta > genome.rmodeler.log
//RepeatModeler/RepeatModeler -pa 28 -database mydb > genome.repeatmmodeler.out
rm -r RM_*/round-*
mv RM_* result
7 changes: 7 additions & 0 deletions analysis_scripts/scaffHic_breakhic_run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/sh

NO_THREAD=$t
SCAFFHICDIR=/home/software/scaffhic/src

cd $WORKDIR
$SCAFFHICDIR/breakhic -nodes $NO_THREAD -grid 100 -fq1 $fq1 -fq2 $fq2 $contig $contig.break.fasta
20 changes: 20 additions & 0 deletions analysis_scripts/trinity_run.pbs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/sh

NO_THREAD=$t
source activate Trinity
WORKDIR=/home/trinity_out
DATADIR=/home/rawdata/RNA-seq

echo "trinity assembly transcripts"
cd $DATADIR
gunzip -c $TYPE"_R1.fastq.gz" > $TYPE"_R1.fastq"
gunzip -c $TYPE"_R2.fastq.gz" > $TYPE"_R2.fastq"

cd $WORKDIR
mkdir $TYPE
cd $TYPE
Trinity --seqType fq --max_memory 200G --left $DATADIR/$TYPE"_R1.fastq" --right $DATADIR/$TYPE"_R2.fastq" --CPU $NO_THREAD --output $WORKDIR/$TYPE"2"/trinity --no_version_check

cd $DATADIR
rm -rf $TYPE"_R1.fastq"
rm -rf $TYPE"_R2.fastq"

0 comments on commit b7d86f8

Please sign in to comment.