Skip to content

Commit 8d70c4d

Browse files
author
Hongxin
committed
Support reference genome
Add GRCh38 example maf file Update README.md
1 parent 77d065b commit 8d70c4d

File tree

8 files changed

+161
-37
lines changed

8 files changed

+161
-37
lines changed

.editorconfig

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# The EditorConfig project consists of a file format for defining coding styles
2+
# and a collection of text editor plugins that enable editors to read the file format
3+
# and adhere to defined styles.
4+
5+
# EditorConfig files are read top to bottom and the closest EditorConfig files are read last.
6+
# Properties from matching EditorConfig sections are applied in the order they were read,
7+
# so properties in closer files take precedence.
8+
9+
# Please only specify the formats you want to apply throughout the project in this file.
10+
# Otherwise, please create a new config file in the directory where you want to apply these styles.
11+
12+
# More details about EditorConfig: http://EditorConfig.org
13+
14+
# top-most EditorConfig file
15+
root = true
16+
17+
[*]
18+
# Do not force a final newline or strip trailing whitespace in any file
19+
insert_final_newline = false
20+
trim_trailing_whitespace = false
21+
22+

AnnotatorCore.py

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ def setsampleidsfileterfile(f):
114114
PROTEIN_POSITION_HEADERS = ['PROTEIN_POSITION']
115115
CANCER_TYPE_HEADERS = ['ONCOTREE_CODE', 'CANCER_TYPE']
116116
FUSION_HEADERS = ['FUSION']
117+
REFERENCE_GENOME_HEADERS = ['NCBI_BUILD', 'REFERENCE_GENOME']
117118

118119
# columns for genomic change annotation
119120
GC_CHROMOSOME_HEADER = 'CHROMOSOME'
@@ -132,6 +133,11 @@ class QueryType(Enum):
132133
GENOMIC_CHANGE = 'GENOMIC_CHANGE'
133134

134135

136+
class ReferenceGenome(Enum):
137+
GRCH37 = 'GRCh37'
138+
GRCH38 = 'GRCh38'
139+
140+
135141
REQUIRED_QUERY_TYPE_COLUMNS = {
136142
QueryType.HGVSP_SHORT: [HGVSP_SHORT_HEADER],
137143
QueryType.HGVSP: [HGVSP_HEADER],
@@ -336,8 +342,19 @@ def resolve_query_type(user_input_query_type, headers):
336342
return selected_query_type
337343

338344

345+
def get_reference_genome_from_row(row_reference_genome, default_reference_genome):
346+
reference_genome = default_reference_genome
347+
if row_reference_genome is not None and row_reference_genome != '':
348+
try:
349+
reference_genome = ReferenceGenome[row_reference_genome.upper()]
350+
except KeyError:
351+
log.warning('Unexpected reference genome, only GRCh37 and GRCh38 are supported.' + (
352+
' Use default.' if default_reference_genome is not None else ' Skipping.'))
353+
return reference_genome
354+
355+
339356
def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerType, cancerTypeMap,
340-
retainonlycuratedgenes, annotatehotspots, user_input_query_type):
357+
retainonlycuratedgenes, annotatehotspots, user_input_query_type, default_reference_genome):
341358
if annotatehotspots:
342359
inithotspots()
343360
if os.path.isfile(previousoutfile):
@@ -381,19 +398,19 @@ def processalterationevents(eventfile, outfile, previousoutfile, defaultCancerTy
381398
process_alteration(reader, outf, headers, [HGVSP_SHORT_HEADER, ALTERATION_HEADER], ncols, newncols,
382399
defaultCancerType,
383400
cancerTypeMap,
384-
retainonlycuratedgenes, annotatehotspots)
401+
retainonlycuratedgenes, annotatehotspots, default_reference_genome)
385402

386403
if (query_type == QueryType.HGVSP):
387404
process_alteration(reader, outf, headers, [HGVSP_HEADER, ALTERATION_HEADER], ncols, newncols, defaultCancerType,
388405
cancerTypeMap,
389-
retainonlycuratedgenes, annotatehotspots)
406+
retainonlycuratedgenes, annotatehotspots, default_reference_genome)
390407

391408
if (query_type == QueryType.HGVSG):
392409
process_hvsg(reader, outf, headers, [HGVSG_HEADER, ALTERATION_HEADER], ncols, newncols, defaultCancerType,
393-
cancerTypeMap)
410+
cancerTypeMap, default_reference_genome)
394411

395412
if (query_type == QueryType.GENOMIC_CHANGE):
396-
process_genomic_change(reader, outf, headers, ncols, newncols, defaultCancerType, cancerTypeMap)
413+
process_genomic_change(reader, outf, headers, ncols, newncols, defaultCancerType, cancerTypeMap, default_reference_genome)
397414

398415
outf.close()
399416

@@ -407,7 +424,7 @@ def get_cell_content(row, index, return_empty_string=False):
407424
return None
408425

409426
def process_alteration(maffilereader, outf, maf_headers, alteration_column_names, ncols, nannotationcols, defaultCancerType, cancerTypeMap,
410-
retainonlycuratedgenes, annotatehotspots):
427+
retainonlycuratedgenes, annotatehotspots, default_reference_genome):
411428
ihugo = geIndexOfHeader(maf_headers, HUGO_HEADERS)
412429
iconsequence = geIndexOfHeader(maf_headers, CONSEQUENCE_HEADERS)
413430
ihgvs = geIndexOfHeader(maf_headers, alteration_column_names)
@@ -416,6 +433,7 @@ def process_alteration(maffilereader, outf, maf_headers, alteration_column_names
416433
iend = geIndexOfHeader(maf_headers, PROTEIN_END_HEADERS)
417434
iproteinpos = geIndexOfHeader(maf_headers, PROTEIN_POSITION_HEADERS)
418435
icancertype = geIndexOfHeader(maf_headers, CANCER_TYPE_HEADERS)
436+
ireferencegenome= geIndexOfHeader(maf_headers, REFERENCE_GENOME_HEADERS)
419437

420438
posp = re.compile('[0-9]+')
421439

@@ -445,6 +463,7 @@ def process_alteration(maffilereader, outf, maf_headers, alteration_column_names
445463
hgvs = hgvs[2:]
446464

447465
cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)
466+
reference_genome = get_reference_genome_from_row(get_cell_content(row, ireferencegenome), default_reference_genome)
448467

449468
hgvs = conversion(hgvs)
450469

@@ -478,7 +497,7 @@ def process_alteration(maffilereader, outf, maf_headers, alteration_column_names
478497
row.append(_3dhotspot)
479498

480499
if not retainonlycuratedgenes or hugo in curatedgenes:
481-
query = ProteinChangeQuery(hugo, hgvs, cancertype, consequence, start, end)
500+
query = ProteinChangeQuery(hugo, hgvs, cancertype, reference_genome, consequence, start, end)
482501
queries.append(query)
483502
rows.append(row)
484503
else:
@@ -510,7 +529,7 @@ def get_var_allele(ref_allele, tumor_seq_allele1, tumor_seq_allele2):
510529

511530
return tumor_seq_allele
512531

513-
def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationcols, defaultCancerType, cancerTypeMap):
532+
def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationcols, defaultCancerType, cancerTypeMap, default_reference_genome):
514533
ichromosome = geIndexOfHeader(maf_headers, [GC_CHROMOSOME_HEADER])
515534
istart = geIndexOfHeader(maf_headers, [GC_START_POSITION_HEADER])
516535
iend = geIndexOfHeader(maf_headers, [GC_END_POSITION_HEADER])
@@ -520,6 +539,7 @@ def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationc
520539

521540
isample = geIndexOfHeader(maf_headers, SAMPLE_HEADERS)
522541
icancertype = geIndexOfHeader(maf_headers, CANCER_TYPE_HEADERS)
542+
ireferencegenome= geIndexOfHeader(maf_headers, REFERENCE_GENOME_HEADERS)
523543

524544
posp = re.compile('[0-9]+')
525545

@@ -539,6 +559,7 @@ def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationc
539559
continue
540560

541561
cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)
562+
reference_genome = get_reference_genome_from_row(get_cell_content(row, ireferencegenome), default_reference_genome)
542563

543564
chromosome = get_cell_content(row, ichromosome, True)
544565
start = get_cell_content(row, istart, True)
@@ -548,7 +569,7 @@ def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationc
548569
var_allele_2 = get_cell_content(row, ivarallele2, True)
549570
var_allele = get_var_allele(ref_allele, var_allele_1, var_allele_2)
550571

551-
query = GenomicChangeQuery(chromosome, start, end, ref_allele, var_allele, cancertype)
572+
query = GenomicChangeQuery(chromosome, start, end, ref_allele, var_allele, cancertype, reference_genome)
552573
queries.append(query)
553574
rows.append(row)
554575

@@ -562,10 +583,11 @@ def process_genomic_change(maffilereader, outf, maf_headers, ncols, nannotationc
562583
annotations = pull_genomic_change_info(queries)
563584
append_annotation_to_file(outf, ncols+nannotationcols, rows, annotations)
564585

565-
def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncols, nannotationcols, defaultCancerType, cancerTypeMap):
586+
def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncols, nannotationcols, defaultCancerType, cancerTypeMap, default_reference_genome):
566587
ihgvsg = geIndexOfHeader(maf_headers, alteration_column_names)
567588
isample = geIndexOfHeader(maf_headers, SAMPLE_HEADERS)
568589
icancertype = geIndexOfHeader(maf_headers, CANCER_TYPE_HEADERS)
590+
ireferencegenome= geIndexOfHeader(maf_headers, REFERENCE_GENOME_HEADERS)
569591

570592
i = 0
571593
queries = []
@@ -585,12 +607,13 @@ def process_hvsg(maffilereader, outf, maf_headers, alteration_column_names, ncol
585607
hgvsg = get_cell_content(row, ihgvsg)
586608

587609
cancertype = get_tumor_type_from_row(row, i, defaultCancerType, icancertype, cancerTypeMap, sample)
610+
reference_genome = get_reference_genome_from_row(get_cell_content(row, ireferencegenome), default_reference_genome)
588611

589612
if hgvsg is None:
590613
append_annotation_to_file(outf, ncols + nannotationcols, [row],
591614
[[GENE_IN_ONCOKB_DEFAULT, VARIANT_IN_ONCOKB_DEFAULT]])
592615
else:
593-
query = HGVSgQuery(hgvsg, cancertype)
616+
query = HGVSgQuery(hgvsg, cancertype, reference_genome)
594617
queries.append(query)
595618
rows.append(row)
596619

@@ -1201,7 +1224,7 @@ def __init__(self, hugo):
12011224

12021225

12031226
class ProteinChangeQuery:
1204-
def __init__(self, hugo, hgvs, cancertype, consequence=None, start=None, end=None):
1227+
def __init__(self, hugo, hgvs, cancertype, reference_genome=None, consequence=None, start=None, end=None):
12051228
self.gene = Gene(hugo)
12061229
self.alteration = hgvs
12071230
if consequence is not None:
@@ -1211,16 +1234,24 @@ def __init__(self, hugo, hgvs, cancertype, consequence=None, start=None, end=Non
12111234
if end is not None:
12121235
self.proteinEnd = end
12131236
self.tumorType = cancertype
1237+
if reference_genome is not None:
1238+
self.referenceGenome = reference_genome.value
1239+
12141240

12151241
class HGVSgQuery:
1216-
def __init__(self, hgvsg, cancertype):
1242+
def __init__(self, hgvsg, cancertype, reference_genome=None):
12171243
self.hgvsg = hgvsg
12181244
self.tumorType = cancertype
1245+
if reference_genome is not None:
1246+
self.referenceGenome = reference_genome.value
1247+
12191248

12201249
class GenomicChangeQuery:
1221-
def __init__(self, chromosome, start, end, ref_allele, var_allele, cancertype):
1250+
def __init__(self, chromosome, start, end, ref_allele, var_allele, cancertype, reference_genome=None):
12221251
self.genomicLocation = ','.join([chromosome, start, end, ref_allele, var_allele])
12231252
self.tumorType = cancertype
1253+
if reference_genome is not None:
1254+
self.referenceGenome = reference_genome.value
12241255

12251256
class CNAQuery:
12261257
def __init__(self, hugo, cnatype, cancertype):

MafAnnotator.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def main(argv):
1313
log.info(
1414
'\n'
1515
'MafAnnotator.py -i <input MAF file> -o <output MAF file> [-p previous results] [-c <input clinical file>] '
16-
'[-s sample list filter] [-t <default tumor type>] [-u oncokb-base-url] [-b oncokb api bear token] [-a] [-q query type]\n'
16+
'[-s sample list filter] [-t <default tumor type>] [-u oncokb-base-url] [-b oncokb api bearer token] [-a] [-q query type] [-r default reference genome]\n'
1717
'Essential MAF columns (case insensitive):\n'
1818
' HUGO_SYMBOL: Hugo gene symbol\n'
1919
' VARIANT_CLASSIFICATION: Translational effect of variant allele\n'
@@ -30,14 +30,17 @@ def main(argv):
3030
' 2) ONCOTREE_CODE exist in MAF\n'
3131
' 3) default tumor type (-t)\n'
3232
'Query type only allows the following values (case-insensitive):\n'
33-
' - HGVSp_Short \n'
33+
' - HGVSp_Short\n'
3434
' It reads from column HGVSp_Short or Alteration\n'
3535
' - HGVSp\n'
3636
' It reads from column HGVSp or Alteration\n'
3737
' - HGVSg\n'
3838
' It reads from column HGVSg or Alteration\n'
3939
' - Genomic_Change\n'
4040
' It reads from columns Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele1 and Tumor_Seq_Allele2 \n'
41+
'Reference Genome only allows the following values (case-insensitive):\n'
42+
' - GRCh37\n'
43+
' - GRCh38\n'
4144
'Default OncoKB base url is https://www.oncokb.org.\n'
4245
'Use -a to annotate mutational hotspots\n'
4346
)
@@ -66,13 +69,21 @@ def main(argv):
6669
try:
6770
user_input_query_type = QueryType[argv.query_type.upper()]
6871
except KeyError:
69-
# if not isinstance(argv.query_type.upper(), QueryType):
70-
print(
72+
log.error(
7173
'Query type is not acceptable. Only the following values are allowed (case-insensitive): HGVSp_Short, HGVSp, HGVSg, Genomic_Change')
7274
raise
7375

76+
default_reference_genome = None
77+
if argv.default_reference_genome is not None:
78+
try:
79+
default_reference_genome = ReferenceGenome[argv.default_reference_genome.upper()]
80+
except KeyError:
81+
log.error(
82+
'Reference genome is not acceptable. Only the following values are allowed (case-insensitive): GRCh37, GRCh38')
83+
raise
84+
7485
processalterationevents(argv.input_file, argv.output_file, argv.previous_result_file, argv.default_cancer_type,
75-
cancertypemap, True, argv.annotate_hotspots, user_input_query_type)
86+
cancertypemap, True, argv.annotate_hotspots, user_input_query_type, default_reference_genome)
7687

7788
log.info('done!')
7889

@@ -91,6 +102,7 @@ def main(argv):
91102
parser.add_argument('-v', dest='cancer_hotspots_base_url', default='', type=str)
92103
parser.add_argument('-b', dest='oncokb_api_bearer_token', default='', type=str)
93104
parser.add_argument('-q', dest='query_type', default=None, type=str)
105+
parser.add_argument('-r', dest='default_reference_genome', default=None, type=str)
94106
parser.set_defaults(func=main)
95107

96108
args = parser.parse_args()

README.md

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ Example input files are under [data](data). An example script is here: [example.
3131
We recommend processing VCF files by [vcf2maf](https://github.com/mskcc/vcf2maf/) with [MSK override isoforms](https://github.com/mskcc/vcf2maf/blob/master/data/isoform_overrides_at_mskcc) before using the `MafAnnotator` here.
3232

3333

34-
#### Annotate with HGVSp_Short, HGVSp, HGVSg or Genomic Change
34+
### Annotate with HGVSp_Short, HGVSp, HGVSg or Genomic Change
3535
OncoKB MafAnnotator supports annotating the alteration with HGVSp, HGVSp_Short, HGVSg or Genomic Change format. Please specify the query type with -q parameter.
3636
The acceptable values are HGVSp_Short, HGVSp, HGVSg and Genomic_Change(case-insensitive). Please see data/example.sh for examples.
3737
If you do not specify query type, the MafAnnotator will try to figure out the query type based on the headers.
@@ -42,6 +42,18 @@ For HGVSg, the annotator takes alteration from the column HGVSg or Alteration
4242
For Genomic_Change, the annotator takes genomic change from columns Chromosome, Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele1 and Tumor_Seq_Allele2
4343

4444

45+
### Annotate with different reference genomes (GRCh37, GRCh38)
46+
OncoKB MafAnnotator supports annotating alterations with reference genomes GRCh37 and GRCh38.
47+
48+
The annotator will get the reference genome from MAF file column NCBI_Build or Reference_Genome.
49+
If there is no reference genome specified in the file, we will use the default reference genome specified through the -r parameter.
50+
51+
You can specify the default reference genome using the -r parameter (this is only applicable to MafAnnotator.py).
52+
The acceptable values are GRCh37 and GRCh38 (case-insensitive).
53+
54+
If neither value is specified, the annotator will use the OncoKB default reference genome, which is GRCh37.
55+
56+
4557
## Levels of Evidence
4658
Introducing [Simplified OncoKB Levels of Evidence](https://www.oncokb.org/levels):
4759
- New Level 2, defined as “Standard care biomarker recommended by the NCCN or other expert panels predictive of response to an FDA-approved drug in this indication” (formerly Level 2A).

data/example_maf.txt

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,17 @@
1-
NCBI_Build Hugo_Symbol Variant_Classification Tumor_Sample_Barcode HGVSp_Short HGVSp HGVSg Chromosome Start_Position End_Position Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2
2-
GRCh37 CUL1 Missense_Mutation TCGA-A6-2672-01A-01W-0833-10 p.Y466S Tyr466Ser
3-
GRCh37 AKT3 Nonsense_Mutation TCGA-05-4417-01 p.E182* Glu182*
4-
GRCh37 PIK3CA Missense_Mutation TCGA-02-0033-01 p.E542K Glu542Lys 3:g.178936082G>A 3 178936082 178936082 G A A
5-
GRCh37 FGFR3 Missense_Mutation TCGA-05-4417-01 p.V271M Val271Met
6-
GRCh37 EGFR Missense_Mutation TCGA-06-0155-01 p.H304Y His304Tyr 7:g.55223543C>T 7 55223543 55223543 C T T
7-
GRCh37 PTEN Missense_Mutation TCGA-06-0155-01 p.C136R Cys136Arg 10:g.89692922T>C 10 89692922 89692922 T C C
8-
GRCh37 FGFR2 Missense_Mutation TCGA-02-0033-01 p.Q212K Gln121Lys
9-
GRCh37 ATM Missense_Mutation TCGA-05-4417-01 p.L2890R Leu2890Arg
10-
GRCh37 KRAS Missense_Mutation TCGA-05-4417-01 p.G12C Gly12Cys 12:g.25398285C>A 12 25398285 25398285 C A A
11-
GRCh37 KRAS Missense_Mutation TCGA-05-4417-01 p.G12C Gly12Cys 12:g.25398285_25398286delinsAG 12 25398285 25398286 CA AG AG
12-
GRCh37 RB1 Nonsense_Mutation TCGA-02-0033-01 p.Q702* Gln702*
13-
GRCh37 TP53 Missense_Mutation TCGA-02-0033-01 p.R248Q Arg248Gln 17:g.7577538C>T 17 7577538 7577538 C T T
14-
GRCh37 NF1 Splice_Site TCGA-02-0033-01 p.X1445_splice X1445_splice 17:g.29586049G>A 17 29586049 29586049 G A A
15-
GRCh37 STK11 Missense_Mutation TCGA-05-4417-01 p.H168R His168Arg
16-
GRCh37 TERT 5'Flank TCGA-05-4417-01 5:g.1295228G>A 5 1295228 1295228 G A A
1+
NCBI_Build Hugo_Symbol Variant_Classification Tumor_Sample_Barcode HGVSp_Short HGVSp HGVSg Chromosome Start_Position End_Position Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2
2+
GRCh37 CUL1 Missense_Mutation TCGA-A6-2672-01A-01W-0833-10 p.Y466S Tyr466Ser
3+
GRCh37 AKT3 Nonsense_Mutation TCGA-05-4417-01 p.E182* Glu182*
4+
GRCh37 PIK3CA Missense_Mutation TCGA-02-0033-01 p.E542K Glu542Lys 3:g.178936082G>A 3 178936082 178936082 G A A
5+
GRCh37 FGFR3 Missense_Mutation TCGA-05-4417-01 p.V271M Val271Met
6+
GRCh37 EGFR Missense_Mutation TCGA-06-0155-01 p.H304Y His304Tyr 7:g.55223543C>T 7 55223543 55223543 C T T
7+
GRCh37 PTEN Missense_Mutation TCGA-06-0155-01 p.C136R Cys136Arg 10:g.89692922T>C 10 89692922 89692922 T C C
8+
GRCh37 FGFR2 Missense_Mutation TCGA-02-0033-01 p.Q212K Gln121Lys
9+
GRCh37 ATM Missense_Mutation TCGA-05-4417-01 p.L2890R Leu2890Arg
10+
GRCh37 KRAS Missense_Mutation TCGA-05-4417-01 p.G12C Gly12Cys 12:g.25398285C>A 12 25398285 25398285 C A A
11+
GRCh37 KRAS Missense_Mutation TCGA-05-4417-01 p.G12C Gly12Cys 12:g.25398285_25398286delinsAG 12 25398285 25398286 CA AG AG
12+
GRCh37 RB1 Nonsense_Mutation TCGA-02-0033-01 p.Q702* Gln702*
13+
GRCh37 TP53 Missense_Mutation TCGA-02-0033-01 p.R248Q Arg248Gln 17:g.7577538C>T 17 7577538 7577538 C T T
14+
GRCh37 NF1 Splice_Site TCGA-02-0033-01 p.X1445_splice X1445_splice 17:g.29586049G>A 17 29586049 29586049 G A A
15+
GRCh37 STK11 Missense_Mutation TCGA-05-4417-01 p.H168R His168Arg
16+
GRCh37 TERT 5'Flank TCGA-05-4417-01 5:g.1295228G>A 5 1295228 1295228 G A A
17+
GRCh37 MYD88 Missense_Mutation TCGA-05-4417-01 M232T

0 commit comments

Comments
 (0)