-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWLE_2014_ROS_gene_blasts_readme.text
123 lines (91 loc) · 8.83 KB
/
WLE_2014_ROS_gene_blasts_readme.text
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#DJS 17-Jun-19:
#Found out that the katG from Bryobacter is actually identical to the katG in the new Paludibaculum bin, and is 77 % identical to that in the new Bryobacter genome.
#This suggests that the original incomplete bin was a composite bin of both Acidobacteria
#Going to rerun the catalase/peroxidase read blasts to include mapping to the newly assembled genes from the coassemblies.
#First get all sequences annotated as catalase HPII (katA/katE), HPI (katG), cytochrome c peroxidase (ccp), and alkyl hydroperoxidase C (ahpC) from IMG annotations of all assemblies.
#Used these KO numbers to search for genes in IMG web browser:
#K03781 monofunctional heme catalase (katE/katA)
#K07217 manganese catalases (katN and others)
#K03782 catalase-peroxidase (katG)
#K00428 cytrochrome c peroxidase (CCP)
#K03386 peroxiredoxin ahpC
#K00434 Ascorbate peroxidases
#K03043 bacterial rpoB
#K03044 archaeal rpoB
#K03010 eukaryotic rpoB
####Noticed that there are a lot more CCP genes than originally detected by BLAST.
#These sequences are saved in the respective "uncleaned" fasta files
nohup bash batch_filter_seqs.sh &> batch_filter_seqs.log &
#Next, Filtered out all gene sequences below 70% of the average gene length in NCBI database (took average of the first ten genes found in gene search):
wd: ./
#Cluster the parsed gene sequence fastas using vsearch. Cluster at 95% identity (used BLAST definition of % identity), using the longest sequences as seeds.
#Report the longest gene for each cluster as the representative sequence.
#command for each gene:
nohup vsearch --cluster_fast WLE_2014_assembled_ahpC_cleaned.fasta --centroids WLE_2014_assembled_ahpC_cleaned_clustered.fasta --id 0.95 -iddef 4 --uc WLE_2014_assembled_ahpC_cleaned_cluster_results.txt &> vsearch_ahpC.log &
nohup vsearch --cluster_fast WLE_2014_assembled_katE_cleaned.fasta --centroids WLE_2014_assembled_katE_cleaned_clustered.fasta --id 0.95 -iddef 4 --uc WLE_2014_assembled_katE_cleaned_cluster_results.txt &> vsearch_katE.log &
nohup vsearch --cluster_fast WLE_2014_assembled_APX_cleaned.fasta --centroids WLE_2014_assembled_APX_cleaned_clustered.fasta --id 0.95 -iddef 4 --uc WLE_2014_assembled_APX_cleaned_cluster_results.txt &> vsearch_APX.log &
nohup vsearch --cluster_fast WLE_2014_assembled_bacterial_rpoB_cleaned.fasta --centroids WLE_2014_assembled_bacterial_rpoB_cleaned_clustered.fasta --id 0.95 -iddef 4 --uc WLE_2014_assembled_bacterial_rpoB_cleaned_cluster_results.txt &> vsearch_bacterial_rpoB.log &
nohup vsearch --cluster_fast WLE_2014_assembled_Euk_RPB2_cleaned.fasta --centroids WLE_2014_assembled_Euk_RPB2_cleaned_clustered.fasta --id 0.95 -iddef 4 --uc WLE_2014_assembled_Euk_RPB2_cleaned_cluster_results.txt &> vsearch_Euk_RPB2.log &
nohup vsearch --cluster_fast WLE_2014_assembled_CCP_cleaned.fasta --centroids WLE_2014_assembled_CCP_cleaned_clustered.fasta --id 0.95 -iddef 4 --uc WLE_2014_assembled_CCP_cleaned_cluster_results.txt &> vsearch_CCP.log &
nohup vsearch --cluster_fast WLE_2014_assembled_katG_cleaned.fasta --centroids WLE_2014_assembled_katG_cleaned_clustered.fasta --id 0.95 -iddef 4 --uc WLE_2014_assembled_katG_cleaned_cluster_results.txt &> vsearch_katG.log &
nohup vsearch --cluster_fast WLE_2014_assembled_MnCat_cleaned.fasta --centroids WLE_2014_assembled_MnCat_cleaned_clustered.fasta --id 0.95 -iddef 4 --uc WLE_2014_assembled_MnCat_cleaned_cluster_results.txt &> vsearch_MnCat.log &
#Get a list of genes in each cleaned and clustered gene fasta file:
#wd: ./
for i in *_clustered.fasta; do grep ">" $i > ${i}.genelist.txt; done
#Ran Get_Centroid_Scaffolds script get list of scaffolds and bins for each gene in the cleaned/clustered fasta file:
#Repeat the below script for each gene by changing the user set prefix variable in the script
#wd: ./
bash Get_centroid_scaffolds.sh
#DJS 6/23/19
#Convert metagenomic reads into fasta format for blast:
#wd: ./
nohup bash batch_convert_to_fasta.sh &> batch_convert_to_fasta.log &
#DJS 6/24/19
#Blast forward and reverse metaG reads against cleaned and dereplicated catalase/peroxidase genes (also blast against rpoB for normalization):
#wd: ./
nohup bash batch_metaG_blast.sh &> batch_metaG_blast.log &
#DJS 6/27/19
#Blast metaT reads to the cleaned and dereplicated catalase/peroxidase genes (also rpoB genes):
#wd: metaT_blasts/
nohup bash batch_metaT_blast.sh &> batch_metaT_blast.log &
#DJS 7/10/2019
#Blast the katG, ahpC, CCP, and APX genes against NCBI nr:
nohup blastx -query WLE_2014_assembled_katG_cleaned_clustered.fasta -db /geomicro/data9/flux/reference-data/blast/nr -outfmt "7 std qlen slen qcovs staxids" -out WLE_2014_assembled_katG_vs_nr.blastx &> WLE_2014_assembled_katG_vs_nr.log &
nohup blastx -query WLE_2014_assembled_ahpC_cleaned_clustered.fasta -db /geomicro/data9/flux/reference-data/blast/nr -outfmt "7 std qlen slen qcovs staxids" -out WLE_2014_assembled_ahpC_vs_nr.blastx &> WLE_2014_assembled_ahpC_vs_nr.log &
#DJS 7/19/2019
#Filter out hits that have an e-value higher than 1e-5:
perl /geomicro/data1/COMMON/scripts/BlastTools/postBlast.pl -b WLE_2014_assembled_katG_vs_nr.blastx -e 1e-5 -o WLE_2014_assembled_katG_vs_nr_postblast.blastx
perl /geomicro/data1/COMMON/scripts/BlastTools/postBlast.pl -b WLE_2014_assembled_ahpC_vs_nr.blastx -e 1e-5 -o WLE_2014_assembled_ahpC_vs_nr_postblast.blastx
perl /geomicro/data1/COMMON/scripts/BlastTools/postBlast.pl -b WLE_2014_assembled_CCP_vs_nr.blastx -e 1e-5 -o WLE_2014_assembled_CCP_vs_nr_postblast.blastx
perl /geomicro/data1/COMMON/scripts/BlastTools/postBlast.pl -b WLE_2014_assembled_APX_vs_nr.blastx -e 1e-5 -o WLE_2014_assembled_APX_vs_nr_postblast.blastx
#DJS 8/17/2019
#Filter blast hits again, but this time remove hits with a % ID below 70%:
perl /geomicro/data1/COMMON/scripts/BlastTools/postBlast.pl -b WLE_2014_assembled_katG_vs_nr.blastx -p 70 -e 1e-5 -o WLE_2014_assembled_katG_vs_nr_postblast_IDcutoff.blastx
perl /geomicro/data1/COMMON/scripts/BlastTools/postBlast.pl -b WLE_2014_assembled_ahpC_vs_nr.blastx -p 70 -e 1e-5 -o WLE_2014_assembled_ahpC_vs_nr_postblast_IDcutoff.blastx
perl /geomicro/data1/COMMON/scripts/BlastTools/postBlast.pl -b WLE_2014_assembled_CCP_vs_nr.blastx -p 70 -e 1e-5 -o WLE_2014_assembled_CCP_vs_nr_postblast_IDcutoff.blastx
perl /geomicro/data1/COMMON/scripts/BlastTools/postBlast.pl -b WLE_2014_assembled_APX_vs_nr.blastx -p 70 -e 1e-5 -o WLE_2014_assembled_APX_vs_nr_postblast_IDcutoff.blastx
#Ran blastx against NR for katE, MnCat, and rpoB:
nohup blastx -query WLE_2014_assembled_katE_cleaned_clustered.fasta -db /geomicro/data9/flux/reference-data/blast/nr -outfmt "7 std qlen slen qcovs staxids" -out WLE_2014_assembled_katE_vs_nr.blastx &> WLE_2014_assembled_katE_vs_nr.log &
nohup blastx -query WLE_2014_assembled_MnCat_cleaned_clustered.fasta -db /geomicro/data9/flux/reference-data/blast/nr -outfmt "7 std qlen slen qcovs staxids" -out WLE_2014_assembled_MnCat_vs_nr.blastx &> WLE_2014_assembled_MnCat_vs_nr.log &
nohup blastx -query WLE_2014_assembled_bacterial_rpoB_cleaned_clustered.fasta -db /geomicro/data9/flux/reference-data/blast/nr -outfmt "7 std qlen slen qcovs staxids" -out WLE_2014_assembled_bacterial_rpoB_vs_nr.blastx &> WLE_2014_assembled_bacterial_rpoB_vs_nr.log &
#DJS 9/3/2019
#Ran postblast for katE and MnCat:
perl /geomicro/data1/COMMON/scripts/BlastTools/postBlast.pl -b WLE_2014_assembled_katE_vs_nr.blastx -e 1e-5 -o WLE_2014_assembled_katE_vs_nr_postblast.blastx
perl /geomicro/data1/COMMON/scripts/BlastTools/postBlast.pl -b WLE_2014_assembled_MnCat_vs_nr.blastx -e 1e-5 -o WLE_2014_assembled_MnCat_vs_nr_postblast.blastx
#Run the following bash script to get a tally of all reads that mapped to each gene for a given sample, as well as blast statistics for the best match to NCBI nr database:
#wd: metaT_blasts/ and metaG_blasts/ for the respective sample types
nohup bash get_read_counts.sh &> get_read_counts.log &
#DJS 9/6/19
#Ran postblast for bacterial rpoBs:
perl /geomicro/data1/COMMON/scripts/BlastTools/postBlast.pl -b WLE_2014_assembled_bacterial_rpoB_vs_nr.blastx -e 1e-5 -o WLE_2014_assembled_bacterial_rpoB_vs_nr_postblast.blastx
#9/3/19
#Get the descriptions of the hits to NCBI from the accession numbers with a custom python script written by Robert Hein:
#Below is an example command to run the script:
comics python3 get_NCBI_accessions.py Read_counts_Sample_53599_adtrim_clean_qtrim_fwd_vs_ahpC.blastn.postblast.tophits.txt > Read_counts_Sample_53599_adtrim_clean_qtrim_fwd_vs_ahpC.blastn.postblast.tophits.txt_NCBI_accessions.txt
#To run in batch, use this command:
for i in Read_counts_*.tophits.txt; do comics python3 get_NCBI_accessions.py $i > ${i}.NCBI_accessions.txt; done
#4/13/21 DJS
#Updated the counts script to only count alignments with 80% read coverage and re-ran the get_read_counts command and the get_NCBI_accession commands
#Added a column for file name in each Read_counts_*.NCBI_accessions.txt file:
#Ran this command in the metaG_blasts/ and metaT_blasts/ directories
for f in *.NCBI_accessions.txt; do sed -i "s/$/\t$f/g" $f; done