-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
15 changed files
with
308 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#!/bin/bash | ||
#PBS -N juicer_contig_brk | ||
#PBS -l nodes=1:ppn=90 | ||
#PBS -l walltime=999:00:00 | ||
#PBS -V | ||
#PBS -q fat | ||
|
||
#### step 1 | ||
BWA=/opt/software/bwa/bwa | ||
SAMTOOLS=~/software/miniconda3/bin/samtools | ||
BASEDIR=/home/xfyang/poppy_project/poppy_HN1_contig/3dDNA_contig_break | ||
REFNAME=poppy_HN1_contig.fa.break | ||
REF=$BASEDIR/references/$REFNAME.fasta | ||
|
||
cd $BASEDIR/references/ | ||
|
||
$BWA index $REF | ||
|
||
$SAMTOOLS faidx $REF | ||
cut -f 1,2 $REF.fai > $REF.size | ||
|
||
## step 2 | ||
PYTHON=/opt/anaconda2/bin/python | ||
GENSCRIPT=/home/xfyang/software/juicer/misc/generate_site_positions.py | ||
cd $BASEDIR/restriction_sites/ | ||
$PYTHON $GENSCRIPT DpnII $REFNAME $REF | ||
|
||
## step 3 | ||
export PATH=/opt/software/bwa/:/opt/anaconda2/bin/:$PATH | ||
JUICERSH=$BASEDIR/scripts/juicer.sh | ||
t=90 | ||
|
||
cd $BASEDIR | ||
$JUICERSH -S early -D $BASEDIR -d $BASEDIR -z $REF -y $BASEDIR/restriction_sites/$REFNAME"_DpnII.txt" -p $REF.size -g poppy_HN1 -t $t -s DpnII 2>juicer_log | ||
|
||
# run the 3d-dna/run-asm-pipeline.sh that takes the assembly and the $BASEDIR/aligned/merged_nodups.txt. I use --rounds 1 which actually two rounds of missjoin correction (round 0 and round 1, it seems to count from zero). For the cockle however I had to use three rounds (--rounds 1) to get the superscaffolds look as the actual chromosomes (the 'squares' in the contact map). | ||
cd $BASEDIR | ||
|
||
#bash 3d-dna/run-asm-pipeline.sh --sort-output --rounds 1 $BASEDIR/references/$REF $BASEDIR/aligned/merged_nodups.txt | ||
|
||
# this last script produces three files ended in FINAL.fasta (the actual scaffolds), FINAL.assembly (the assembly map) and final.hic (the contact map). The last two files are visulized in juicebox | ||
# The whole process took me about seven days to run, the /scripts/juicer.sh uses 24 CPU, I used 64Gb RAM (more than enough I think) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
5 34 56 67 112 201 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
#!/bin/bash | ||
|
||
t=30 | ||
OUTDIR=$WORKDIR | ||
mkdir $OUTDIR | ||
HISAT2DIR=/home/software/hisat2/hisat2-2.2.1 | ||
STRINGTIEDIR=/home/software/stringTie/stringtie-2.1.4.Linux_x86_64 | ||
|
||
cd $OUTDIR | ||
GFF=gene.gff | ||
GTF=gene.gtf | ||
GENOME=genome.fa | ||
|
||
## S1, extract exon and splice | ||
$HISAT2DIR/extract_splice_sites.py $GTF > gene.ss | ||
$HISAT2DIR/extract_exons.py $GTF > gene.exon | ||
|
||
## S2, buid hisat2 index | ||
$HISAT2DIR/hisat2-build --ss gene.ss --exon gene.exon -p $t $GENOME hisat2_tran | ||
|
||
## S3, mapping RNA-seq | ||
$HISAT2DIR/hisat2 -p $t --dta -x hisat2_tran -1 $FQ1 -2 $FQ2 -S $SIGN.sam | ||
$SAMTOOLS sort -@ $t -o $SIGN.bam $SIGN.sam | ||
|
||
## S4, Assemble and quantify expressed genes and transcripts | ||
$STRINGTIEDIR/stringtie -p $t -G $GFF -o $SIGN.gtf -l $SIGN $SIGN.bam | ||
|
||
## S5, merge transcripts from all samples: | ||
$STRINGTIEDIR/stringtie --merge -p $t -G $GFF -o $SIGN"_merged.gtf" $MERGELIST | ||
|
||
|
||
## S6, Estimate transcript abundances and create table counts for Ballgown | ||
$STRINGTIEDIR/stringtie -e -B -p $t -G $MERGEDGTF -o $onetis/$SIGN.gtf $ALIGNDIR/$SIGN.bam | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#!/bin/bash | ||
|
||
export AUGUSTUS_CONFIG_PATH=/home/software/augustus-3.3.3/config | ||
export BUSCO_CONFIG_FILE="/home/software/busco-master/config/config.ini" | ||
export PATH=/home/software/augustus-3.3.3/bin:/home/xfyang/software/augustus-3.3.3/scripts:$PATH | ||
cd /home/poppy_project/YMR_nextdenovo/purged_dups_res | ||
BUSCO=/home/software/busco-master/scripts/run_BUSCO.py | ||
PYTHON=/home/software/miniconda2/bin/python | ||
REF=$referenceFa | ||
|
||
$PYTHON $BUSCO -o busco3_Res -i $REF -l /home/busco_lineage/embryophyta_odb9 -m genome -c 28 | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/bin/bash | ||
|
||
cd $WORKDIR | ||
|
||
./ParaAT.pl -h $SIGN.genepair -n $SIGN.cds.fa -a $SIGN.pep.fa -p proc -o $SIGN -f axt -k > $SIGN.log | ||
cat ./$SIGN/*.kaks > $SIGN.kaks | ||
awk 'NR%2==0' $SIGN.kaks > $SIGN.kaks.value | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#!/bin/sh | ||
|
||
cd $annotationDir | ||
/opt/mpi/mpich-3.2/bin/mpirun -n 94 /opt/software/maker/bin/maker 2> maker.log |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#-----BLAST and Exonerate Statistics Thresholds | ||
blast_type=ncbi+ #set to 'ncbi+', 'ncbi' or 'wublast' | ||
|
||
pcov_blastn=0.8 #Blastn Percent Coverage Threhold EST-Genome Alignments | ||
pid_blastn=0.85 #Blastn Percent Identity Threshold EST-Genome Aligments | ||
eval_blastn=1e-10 #Blastn eval cutoff | ||
bit_blastn=40 #Blastn bit cutoff | ||
depth_blastn=0 #Blastn depth cutoff (0 to disable cutoff) | ||
|
||
pcov_blastx=0.5 #Blastx Percent Coverage Threhold Protein-Genome Alignments | ||
pid_blastx=0.4 #Blastx Percent Identity Threshold Protein-Genome Aligments | ||
eval_blastx=1e-06 #Blastx eval cutoff | ||
bit_blastx=30 #Blastx bit cutoff | ||
depth_blastx=0 #Blastx depth cutoff (0 to disable cutoff) | ||
|
||
pcov_tblastx=0.8 #tBlastx Percent Coverage Threhold alt-EST-Genome Alignments | ||
pid_tblastx=0.85 #tBlastx Percent Identity Threshold alt-EST-Genome Aligments | ||
eval_tblastx=1e-10 #tBlastx eval cutoff | ||
bit_tblastx=40 #tBlastx bit cutoff | ||
depth_tblastx=0 #tBlastx depth cutoff (0 to disable cutoff) | ||
|
||
pcov_rm_blastx=0.5 #Blastx Percent Coverage Threhold For Transposable Element Masking | ||
pid_rm_blastx=0.4 #Blastx Percent Identity Threshold For Transposbale Element Masking | ||
eval_rm_blastx=1e-06 #Blastx eval cutoff for transposable element masking | ||
bit_rm_blastx=30 #Blastx bit cutoff for transposable element masking | ||
|
||
ep_score_limit=20 #Exonerate protein percent of maximal score threshold | ||
en_score_limit=20 #Exonerate nucleotide percent of maximal score threshold |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#-----Location of Executables Used by MAKER/EVALUATOR | ||
makeblastdb=/ncbi-blast-2.2.28+/bin/makeblastdb #location of NCBI+ makeblastdb executable | ||
blastn=/ncbi-blast-2.2.28+/bin/blastn #location of NCBI+ blastn executable | ||
blastx=/ncbi-blast-2.2.28+/bin/blastx #location of NCBI+ blastx executable | ||
tblastx=/ncbi-blast-2.2.28+/bin/tblastx #location of NCBI+ tblastx executable | ||
formatdb= #location of NCBI formatdb executable | ||
blastall= #location of NCBI blastall executable | ||
xdformat= #location of WUBLAST xdformat executable | ||
blasta= #location of WUBLAST blasta executable | ||
RepeatMasker=/RepeatMasker/RepeatMasker #location of RepeatMasker executable | ||
exonerate=/exonerate/bin/exonerate #location of exonerate executable | ||
|
||
#-----Ab-initio Gene Prediction Algorithms | ||
snap=/snap/snap #location of snap executable | ||
gmhmme3=/genemark_hmm_euk_linux_64/ehmm/gmhmme3 #location of eukaryotic genemark executable | ||
gmhmmp= #location of prokaryotic genemark executable | ||
augustus=/home/xiaofei/annotation_dzj/annotation/annotation/Annotation_soft/augustus-3.3/bin/augustus #location of augustus executable | ||
fgenesh= #location of fgenesh executable | ||
tRNAscan-SE=tRNAscan-SE #location of trnascan executable | ||
snoscan=snoscan #location of snoscan executable | ||
|
||
#-----Other Algorithms | ||
probuild=/gm_et_linux_64/gmes_petap/probuild #location of probuild executable (required for genemark) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#-----Genome (these are always required) | ||
genome=ref.fa #genome sequence (fasta file or fasta embeded in GFF3 file) | ||
organism_type=eukaryotic #eukaryotic or prokaryotic. Default is eukaryotic | ||
|
||
#-----Re-annotation Using MAKER Derived GFF3 | ||
maker_gff= #MAKER derived GFF3 file | ||
est_pass=0 #use ESTs in maker_gff: 1 = yes, 0 = no | ||
altest_pass=0 #use alternate organism ESTs in maker_gff: 1 = yes, 0 = no | ||
protein_pass=0 #use protein alignments in maker_gff: 1 = yes, 0 = no | ||
rm_pass=0 #use repeats in maker_gff: 1 = yes, 0 = no | ||
model_pass=0 #use gene models in maker_gff: 1 = yes, 0 = no | ||
pred_pass=0 #use ab-initio predictions in maker_gff: 1 = yes, 0 = no | ||
other_pass=0 #passthrough anyything else in maker_gff: 1 = yes, 0 = no | ||
|
||
#-----EST Evidence (for best results provide a file for at least one) | ||
est=all_tissue_merge.trinity.fasta #set of ESTs or assembled mRNA-seq in fasta format | ||
altest= #EST/cDNA sequence file in fasta format from an alternate organism | ||
est_gff= #aligned ESTs or mRNA-seq from an external GFF3 file | ||
altest_gff= #aligned ESTs from a closly relate species in GFF3 format | ||
|
||
#-----Protein Homology Evidence (for best results provide a file for at least one) | ||
protein=uniprot_sprot.fasta,Araport11_genes.201606.pep.fasta,Beta_vulgaris.RefBeet-1.2.2.pep.all.fa,Vitis_vinifera.12X.pep.all.fa #protein sequence file in fasta format (i.e. from mutiple oransisms) | ||
protein_gff= #aligned protein homology evidence from an external GFF3 file | ||
|
||
#-----Repeat Masking (leave values blank to skip repeat masking) | ||
model_org=all #select a model organism for RepBase masking in RepeatMasker | ||
rmlib=/RepeatModeler/result/consensi.fa.classified #provide an organism specific repeat library in fasta format for RepeatMasker | ||
repeat_protein=te_proteins.fasta #provide a fasta file of transposable element proteins for RepeatRunner | ||
rm_gff= #pre-identified repeat elements from an external GFF3 file | ||
prok_rm=0 #forces MAKER to repeatmask prokaryotes (no reason to change this), 1 = yes, 0 = no | ||
softmask=1 #use soft-masking rather than hard-masking in BLAST (i.e. seg and dust filtering) | ||
|
||
#-----Gene Prediction | ||
snaphmm=/snap/HMM/A.thaliana.hmm #SNAP HMM file | ||
gmhmm=/genemark_hmm_euk_linux_64/ehmm/a_thaliana.mod #GeneMark HMM file | ||
augustus_species=tomato #Augustus gene prediction species model | ||
fgenesh_par_file= #FGENESH parameter file | ||
pred_gff= #ab-initio predictions from an external GFF3 file | ||
model_gff= #annotated gene models from an external GFF3 file (annotation pass-through) | ||
est2genome=0 #infer gene predictions directly from ESTs, 1 = yes, 0 = no | ||
protein2genome=0 #infer predictions from protein homology, 1 = yes, 0 = no | ||
trna=0 #find tRNAs with tRNAscan, 1 = yes, 0 = no | ||
snoscan_rrna= #rRNA file to have Snoscan find snoRNAs | ||
unmask=0 #also run ab-initio prediction programs on unmasked sequence, 1 = yes, 0 = no | ||
|
||
#-----Other Annotation Feature Types (features MAKER doesn't recognize) | ||
other_gff= #extra features to pass-through to final MAKER generated GFF3 file | ||
|
||
#-----External Application Behavior Options | ||
alt_peptide=C #amino acid used to replace non-standard amino acids in BLAST databases | ||
cpus=28 #max number of cpus to use in BLAST and RepeatMasker (not for MPI, leave 1 when using MPI) | ||
|
||
#-----MAKER Behavior Options | ||
max_dna_len=100000 #length for dividing up contigs into chunks (increases/decreases memory usage) | ||
min_contig=1 #skip genome contigs below this length (under 10kb are often useless) | ||
|
||
pred_flank=200 #flank for extending evidence clusters sent to gene predictors | ||
pred_stats=0 #report AED and QI statistics for all predictions as well as models | ||
AED_threshold=1 #Maximum Annotation Edit Distance allowed (bound by 0 and 1) | ||
min_protein=0 #require at least this many amino acids in predicted proteins | ||
alt_splice=0 #Take extra steps to try and find alternative splicing, 1 = yes, 0 = no | ||
always_complete=0 #extra steps to force start and stop codons, 1 = yes, 0 = no | ||
map_forward=0 #map names and attributes forward from old GFF3 genes, 1 = yes, 0 = no | ||
keep_preds=1 #Concordance threshold to add unsupported gene prediction (bound by 0 and 1) | ||
|
||
split_hit=10000 #length for the splitting of hits (expected max intron size for evidence alignments) | ||
single_exon=0 #consider single exon EST evidence when generating annotations, 1 = yes, 0 = no | ||
single_length=250 #min length required for single exon ESTs if 'single_exon is enabled' | ||
correct_est_fusion=0 #limits use of ESTs in annotation to avoid fusion genes | ||
|
||
tries=2 #number of times to try a contig if there is a failure for some reason | ||
clean_try=0 #remove all data from previous run before retrying, 1 = yes, 0 = no | ||
clean_up=0 #removes theVoid directory with individual analysis files, 1 = yes, 0 = no | ||
TMP= #specify a directory other than the system default temporary directory for temporary files |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#!/bin/bash | ||
|
||
cd $WORKDIR | ||
|
||
MCScanX $SIGN |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#!/bin/sh | ||
|
||
export PATH=/home/software/miniconda2/bin:$PATH | ||
SEQSTAT=/home/software/nextdenovo/NextDenovo/bin/seq_stat | ||
DATADIR=/home/rawdata | ||
cd $DATADIR | ||
|
||
echo $DATADIR | ||
$SEQSTAT -g $GENOMESIZE $FOFNFILE ## generate the parameters, FOFNFILE is the fastq file list | ||
|
||
|
||
$NEXTDENOVO run.cfg | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
#!/bin/bash | ||
|
||
MINIMAP2=/opt/software/minimap2/minimap2 | ||
PURGE_DUP_BIN=/home/software/purge_dups/bin | ||
|
||
echo "#####################################" | ||
echo "REF=$REF" | ||
echo "QUERY=$QUERY" | ||
echo "####################################" | ||
|
||
cd $WORKDIR | ||
## Step 1.1 | ||
$MINIMAP2 -x map-ont -t $t $REF $QUERY | gzip -c - > $QUERY.mini.paf.gz | ||
|
||
## Step 1.2 | ||
$PURGE_DUP_BIN/split_fa $REF > $REF.split | ||
$MINIMAP2 -xasm5 -DP -t $t $REF.split $REF.split | gzip -c - > $REF.split.self.paf.gz | ||
|
||
## Step 1.3 | ||
$PURGE_DUP_BIN/pbcstat *.mini.paf.gz #(produces PB.base.cov and PB.stat files) | ||
$PURGE_DUP_BIN/calcuts PB.stat > cutoffs 2>calcults.log | ||
python purge_dups/scripts/hist_plot.py -c cutoffs PB.stat covs.png | ||
|
||
#Step 2. Purge haplotigs and overlaps with the following command. | ||
$PURGE_DUP_BIN/purge_dups -2 -T cutoffs -c PB.base.cov $REF.split.self.paf.gz > dups.bed 2> purge_dups.log | ||
#Step 3. Get purged primary and haplotig sequences from draft assembly. | ||
$PURGE_DUP_BIN/get_seqs dups.bed $REF |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
#!/bin/sh | ||
|
||
mkdir ./RepeatModeler | ||
cd ./RepeatModeler | ||
/RepeatModeler/BuildDatabase -name mydb genome.fasta > genome.rmodeler.log | ||
//RepeatModeler/RepeatModeler -pa 28 -database mydb > genome.repeatmmodeler.out | ||
rm -r RM_*/round-* | ||
mv RM_* result |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#!/bin/sh | ||
|
||
NO_THREAD=$t | ||
SCAFFHICDIR=/home/software/scaffhic/src | ||
|
||
cd $WORKDIR | ||
$SCAFFHICDIR/breakhic -nodes $NO_THREAD -grid 100 -fq1 $fq1 -fq2 $fq2 $contig $contig.break.fasta |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#!/bin/sh | ||
|
||
NO_THREAD=$t | ||
source activate Trinity | ||
WORKDIR=/home/trinity_out | ||
DATADIR=/home/rawdata/RNA-seq | ||
|
||
echo "trinity assembly transcripts" | ||
cd $DATADIR | ||
gunzip -c $TYPE"_R1.fastq.gz" > $TYPE"_R1.fastq" | ||
gunzip -c $TYPE"_R2.fastq.gz" > $TYPE"_R2.fastq" | ||
|
||
cd $WORKDIR | ||
mkdir $TYPE | ||
cd $TYPE | ||
Trinity --seqType fq --max_memory 200G --left $DATADIR/$TYPE"_R1.fastq" --right $DATADIR/$TYPE"_R2.fastq" --CPU $NO_THREAD --output $WORKDIR/$TYPE"2"/trinity --no_version_check | ||
|
||
cd $DATADIR | ||
rm -rf $TYPE"_R1.fastq" | ||
rm -rf $TYPE"_R2.fastq" |