2
2
3
3
import csv
4
4
import re
5
- import warnings
6
5
from .Interfaces import hAMRonizedResultIterator
7
- from hAMRonization .constants import NUCLEOTIDE_VARIANT , AMINO_ACID_VARIANT , GENE_PRESENCE
6
+ from hAMRonization .constants import (
7
+ NUCLEOTIDE_VARIANT ,
8
+ AMINO_ACID_VARIANT ,
9
+ GENE_PRESENCE ,
10
+ )
8
11
9
- required_metadata = ['analysis_software_version' ,
10
- 'reference_database_version' ,
11
- 'input_file_name' ]
12
+ required_metadata = [
13
+ "analysis_software_version" ,
14
+ "reference_database_version" ,
15
+ "input_file_name" ,
16
+ ]
12
17
13
18
14
19
class AmrFinderPlusIterator (hAMRonizedResultIterator ):
15
-
16
20
def __init__ (self , source , metadata ):
17
- metadata [' analysis_software_name' ] = ' amrfinderplus'
18
- metadata [' reference_database_name' ] = ' NCBI Reference Gene Database'
21
+ metadata [" analysis_software_name" ] = " amrfinderplus"
22
+ metadata [" reference_database_name" ] = " NCBI Reference Gene Database"
19
23
self .metadata = metadata
20
24
21
25
# check source for whether AMRFinderPlus has been run in protein or nucleotide mode
22
26
23
27
nucleotide_field_mapping = {
24
- 'Protein identifier' : None ,
25
- 'Contig id' : 'input_sequence_id' ,
26
- 'Start' : 'input_gene_start' ,
27
- 'Stop' : 'input_gene_stop' ,
28
- 'Strand' : 'strand_orientation' ,
29
- 'Gene symbol' : 'gene_symbol' ,
30
- 'Sequence name' : 'gene_name' ,
31
- 'Scope' : None ,
32
- 'Element type' : None ,
33
- 'Element subtype' : None ,
34
- 'Class' : 'drug_class' ,
35
- 'Subclass' : 'antimicrobial_agent' ,
36
- 'Method' : None ,
37
- 'Target length' : 'input_protein_length' ,
38
- 'Reference sequence length' : 'reference_protein_length' ,
39
- '% Coverage of reference sequence' : 'coverage_percentage' ,
40
- '% Identity to reference sequence' : 'sequence_identity' ,
41
- 'Alignment length' : None ,
42
- 'Accession of closest sequence' : 'reference_accession' ,
43
- 'Name of closest sequence' : None ,
44
- 'HMM id' : None ,
45
- 'HMM description' : None ,
46
- 'AA Mutation' : 'amino_acid_mutation' ,
47
- 'Nucleotide Mutation' : 'nucleotide_mutation' ,
48
- 'genetic_variation_type' : 'genetic_variation_type'
49
- }
50
- protein_field_mapping = {'Protein identifier' : 'input_sequence_id' ,
51
- 'Gene symbol' : 'gene_symbol' ,
52
- 'Sequence name' : 'gene_name' ,
53
- 'Scope' : None ,
54
- 'Element' : None ,
55
- 'Element subtype' : None ,
56
- 'Class' : 'drug_class' ,
57
- 'Subclass' : 'antimicrobial_agent' ,
58
- 'Method' : None ,
59
- 'Target length' : 'input_protein_length' ,
60
- 'Reference sequence length' : 'reference_protein_length' ,
61
- '% Coverage of reference sequence' : 'coverage_percentage' ,
62
- '% Identity to reference sequence' : 'sequence_identity' ,
63
- 'Alignment length' : None ,
64
- 'Accession of closest sequence' : 'reference_accession' ,
65
- 'Name of closest sequence' : None ,
66
- 'HMM id' : None ,
67
- 'HMM description' : None ,
68
- 'AA Mutation' : 'amino_acid_mutation' ,
69
- 'genetic_variation_type' : 'genetic_variation_type'
70
- }
28
+ "Protein identifier" : None ,
29
+ "Contig id" : "input_sequence_id" ,
30
+ "Start" : "input_gene_start" ,
31
+ "Stop" : "input_gene_stop" ,
32
+ "Strand" : "strand_orientation" ,
33
+ "Gene symbol" : "gene_symbol" ,
34
+ "Sequence name" : "gene_name" ,
35
+ "Scope" : None ,
36
+ "Element type" : None ,
37
+ "Element subtype" : None ,
38
+ "Class" : "drug_class" ,
39
+ "Subclass" : "antimicrobial_agent" ,
40
+ "Method" : None ,
41
+ "Target length" : "input_protein_length" ,
42
+ "Reference sequence length" : "reference_protein_length" ,
43
+ "% Coverage of reference sequence" : "coverage_percentage" ,
44
+ "% Identity to reference sequence" : "sequence_identity" ,
45
+ "Alignment length" : None ,
46
+ "Accession of closest sequence" : "reference_accession" ,
47
+ "Name of closest sequence" : None ,
48
+ "HMM id" : None ,
49
+ "HMM description" : None ,
50
+ "AA Mutation" : "amino_acid_mutation" ,
51
+ "Nucleotide Mutation" : "nucleotide_mutation" ,
52
+ "genetic_variation_type" : "genetic_variation_type" ,
53
+ }
54
+ protein_field_mapping = {
55
+ "Protein identifier" : "input_sequence_id" ,
56
+ "Gene symbol" : "gene_symbol" ,
57
+ "Sequence name" : "gene_name" ,
58
+ "Scope" : None ,
59
+ "Element" : None ,
60
+ "Element subtype" : None ,
61
+ "Class" : "drug_class" ,
62
+ "Subclass" : "antimicrobial_agent" ,
63
+ "Method" : None ,
64
+ "Target length" : "input_protein_length" ,
65
+ "Reference sequence length" : "reference_protein_length" ,
66
+ "% Coverage of reference sequence" : "coverage_percentage" ,
67
+ "% Identity to reference sequence" : "sequence_identity" ,
68
+ "Alignment length" : None ,
69
+ "Accession of closest sequence" : "reference_accession" ,
70
+ "Name of closest sequence" : None ,
71
+ "HMM id" : None ,
72
+ "HMM description" : None ,
73
+ "AA Mutation" : "amino_acid_mutation" ,
74
+ "genetic_variation_type" : "genetic_variation_type" ,
75
+ }
71
76
72
77
with open (source ) as fh :
73
- header = next (fh )
78
+ _ = next (fh )
74
79
try :
75
80
first_result = next (fh )
76
- if first_result .strip ().split (' \t ' )[0 ] == 'NA' :
81
+ if first_result .strip ().split (" \t " )[0 ] == "NA" :
77
82
self .field_mapping = nucleotide_field_mapping
78
83
else :
79
84
self .field_mapping = protein_field_mapping
@@ -84,12 +89,11 @@ def __init__(self, source, metadata):
84
89
85
90
super ().__init__ (source , self .field_mapping , self .metadata )
86
91
87
-
88
92
def parse (self , handle ):
89
93
"""
90
94
Read each result row and yield it in hAMRonized form
91
95
"""
92
- reader = csv .DictReader (handle , delimiter = ' \t ' )
96
+ reader = csv .DictReader (handle , delimiter = " \t " )
93
97
for result in reader :
94
98
# replace NA value with None for consistency
95
99
for field , value in result .items ():
@@ -99,21 +103,21 @@ def parse(self, handle):
99
103
# "POINT" indicates mutational resistance
100
104
# amrfinderplus has no special fields but the mutation itself is
101
105
# appended to the symbol name so we want to split this
102
- result [' AA Mutation' ] = None
103
- result [' Nucleotide Mutation' ] = None
104
- result [' genetic_variation_type' ] = GENE_PRESENCE
106
+ result [" AA Mutation" ] = None
107
+ result [" Nucleotide Mutation" ] = None
108
+ result [" genetic_variation_type" ] = GENE_PRESENCE
105
109
106
- if result [' Element subtype' ] == ' POINT' :
107
- gene_symbol , mutation = result ["Gene symbol" ].rsplit ('_' , 1 )
108
- result [' Gene symbol' ] = gene_symbol
110
+ if result [" Element subtype" ] == " POINT" :
111
+ gene_symbol , mutation = result ["Gene symbol" ].rsplit ("_" , 1 )
112
+ result [" Gene symbol" ] = gene_symbol
109
113
_ , ref , pos , alt , _ = re .split (r"(\D+)(\d+)(\D+)" , mutation )
110
114
# this means it is a protein mutation
111
- if result [' Method' ] in [' POINTX' , ' POINTP' ]:
112
- result [' AA Mutation' ] = f"p.{ ref } { pos } { alt } "
113
- result [' genetic_variation_type' ] = AMINO_ACID_VARIANT
114
- elif result [' Method' ] == ' POINTN' :
115
+ if result [" Method" ] in [" POINTX" , " POINTP" ]:
116
+ result [" AA Mutation" ] = f"p.{ ref } { pos } { alt } "
117
+ result [" genetic_variation_type" ] = AMINO_ACID_VARIANT
118
+ elif result [" Method" ] == " POINTN" :
115
119
# e.g., 23S_G2032G ampC_C-11C -> c.2032G>G
116
- result [' Nucleotide Mutation' ] = f"c.{ pos } { ref } >{ alt } "
117
- result [' genetic_variation_type' ] = NUCLEOTIDE_VARIANT
120
+ result [" Nucleotide Mutation" ] = f"c.{ pos } { ref } >{ alt } "
121
+ result [" genetic_variation_type" ] = NUCLEOTIDE_VARIANT
118
122
119
123
yield self .hAMRonize (result , self .metadata )
0 commit comments