Skip to content

Commit 3ec2706

Browse files
authored
Merge pull request #361 from bmeg/rc5-patch
Fix G2P -> Publication links from DGIdb
2 parents e57def7 + a80511b commit 3ec2706

File tree

10 files changed

+76
-53
lines changed

10 files changed

+76
-53
lines changed

outputs.bmeg_manifest.dvc

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
md5: 8799729d258acd405bdf6d92de68dfcf
1+
md5: fa537571e66fb976ab6f547028ddf408
22
cmd: echo generating file manifest...
33
deps:
44
- md5: b2d2ab73b01758f77505caecc3963f29
@@ -69,23 +69,23 @@ deps:
6969
path: outputs/ccle/maf.SomaticCallset_Aliquots_Aliquot.Edge.json.gz
7070
- md5: 4dfc12d1a89eee58e38ef9109bbd64e4
7171
path: outputs/celllines/Case_SameAs_Case.Edge.json.gz
72-
- md5: 8e8e11059b5cbd3c1ae85573b30dabb0
72+
- md5: 015143f9421a9fb5a3311f92043056c2
7373
path: outputs/compound/normalized.Case_Compounds_Compound.Edge.json.gz
74-
- md5: dd0ac5a944094d76a0cf12c4bff575da
74+
- md5: 20b8a955bff703c00690081bebfbd372
7575
path: outputs/compound/normalized.Compound.Vertex.json.gz
76-
- md5: 488eee20c9e4f0c99070eb6dfbb3fdb7
76+
- md5: acb155e76510e2661158e52272338a3c
7777
path: outputs/compound/normalized.Compound_Cases_Case.Edge.json.gz
78-
- md5: 967a3e45c1ea2270a9b9bb4763b9cff9
78+
- md5: a0af3202a4dcef822bd16127407c043e
7979
path: outputs/compound/normalized.Compound_DrugResponses_DrugResponse.Edge.json.gz
80-
- md5: 21734e88b95e98c4eabac40c91cfbd45
80+
- md5: 6a029d0ee0b8f531cce4410d0e8affa4
8181
path: outputs/compound/normalized.Compound_G2PAssociations_G2PAssociation.Edge.json.gz
82-
- md5: a9945dd00371f9ae553098b2b71f4ce8
82+
- md5: d8cfbd16c89e661cf15df5c37cea5d72
8383
path: outputs/compound/normalized.Compound_Projects_Project.Edge.json.gz
84-
- md5: fb5f300161494a94d2493cfb5b704e68
84+
- md5: 645c674a623e27bc84f9449ad8111683
8585
path: outputs/compound/normalized.DrugResponse_Compounds_Compound.Edge.json.gz
86-
- md5: c76490317a7a49562e903c52ddba2c6e
86+
- md5: 69b9e6f4ee8226dac032bf8814d72ff2
8787
path: outputs/compound/normalized.G2PAssociation_Compounds_Compound.Edge.json.gz
88-
- md5: c53f52b19e36799ee099edc0ef3d7657
88+
- md5: a56821071854cc805120007a99728f0e
8989
path: outputs/compound/normalized.Project_Compounds_Compound.Edge.json.gz
9090
- md5: cb3fcbfcee28d6434f8397355cff568a
9191
path: outputs/ctrp/ctrp.Aliquot.Vertex.json.gz
@@ -113,15 +113,15 @@ deps:
113113
path: outputs/ctrp/ctrp.Sample_Aliquots_Aliquot.Edge.json.gz
114114
- md5: ce514bf610e31c6b36b3d2945d2e9ccc
115115
path: outputs/ctrp/ctrp.Sample_Case_Case.Edge.json.gz
116-
- md5: 3fa57b7c369e2c8dd2dc0d0b6c44766e
116+
- md5: aac51ae53ff021ed9291b1aa2ae78076
117117
path: outputs/dgidb/G2PAssociation.Vertex.json.gz
118-
- md5: 166177ce31138f0ee32dc339fdfdd987
118+
- md5: 758362f57be6a69c6969a48182e0ff18
119119
path: outputs/dgidb/G2PAssociation_Genes_Gene.Edge.json.gz
120-
- md5: 4f99308481017576d6cc208b6f514eab
120+
- md5: 57d481f7135e3a8761ad8920ceca5fb0
121121
path: outputs/dgidb/G2PAssociation_Publications_Publication.Edge.json.gz
122-
- md5: 97282e6b782ab81889884ba6d9ba368e
122+
- md5: a701cee66767a702c18b7b3d4cb4b829
123123
path: outputs/dgidb/Gene_G2PAssociations_G2PAssociation.Edge.json.gz
124-
- md5: 03f13f3513d5282b8a1f309429ae81f0
124+
- md5: cecbf1a9f863ecfd0ce3db553becfad9
125125
path: outputs/dgidb/Publication_G2PAssociations_G2PAssociation.Edge.json.gz
126126
- md5: 3f89bd640983b7ea76b5dbbbcd846941
127127
path: outputs/ensembl/Exon.Vertex.json.gz
@@ -359,7 +359,7 @@ deps:
359359
path: outputs/phenotype/normalized.Phenotype_Samples_Sample.Edge.json.gz
360360
- md5: 3729d0155dbf3f3f30b926a030a1cf95
361361
path: outputs/phenotype/normalized.Sample_Phenotypes_Phenotype.Edge.json.gz
362-
- md5: 87182b1ccb82bc03a2efb61ca77703d5
362+
- md5: 7bbe3da8a9ade7327c41858d20107e58
363363
path: outputs/publication/stub.Publication.Vertex.json.gz
364364
- md5: 6bdd6ef5f03bc16023a4c59bfcec95db
365365
path: outputs/pubmed/baseline/pubmed19n0001.Publication.Vertex.json.gz

outputs/compound/normalized.compounds.dvc

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
md5: 72113976b51fe192773b38a2d2071503
1+
md5: a8590f84b821452a9ac88e92ab816e88
22
cmd: python3 transform/compound/transform.py
33
wdir: ../..
44
deps:
@@ -14,7 +14,7 @@ deps:
1414
path: outputs/gdc/gdc.Compound.Vertex.json.gz
1515
- md5: bdd1c3dedf67c2f3179f04771115a3c7
1616
path: outputs/pharmacodb/Compound.Vertex.json.gz
17-
- md5: f7a2bbfb7436d76e09b9e13bd3d0c9ff
17+
- md5: 502320cf8546d8607d62a1f0518e97cc
1818
path: outputs/dgidb/Compound.Vertex.json.gz
1919
- md5: 90070ff6332de4a429188a69ea8d9633
2020
path: outputs/pharmacodb/DrugResponse_Compounds_Compound.Edge.json.gz
@@ -36,52 +36,52 @@ deps:
3636
path: outputs/pharmacodb/Project_Compounds_Compound.Edge.json.gz
3737
- md5: 62990f11a0051b7357506699167a7611
3838
path: outputs/pharmacodb/Compound_Projects_Project.Edge.json.gz
39-
- md5: 9c4627785660215fa9adbf7c9f6cf172
39+
- md5: 708a48991dc4d6051097e6a8e8175b7b
4040
path: outputs/dgidb/G2PAssociation_Compounds_Compound.Edge.json.gz
41-
- md5: fa0ae005deb28af2d935cc818c1394b4
41+
- md5: 0e9b6635d4e8153d221d1c2a0604627a
4242
path: outputs/dgidb/Compound_G2PAssociations_G2PAssociation.Edge.json.gz
4343
outs:
44-
- md5: dd0ac5a944094d76a0cf12c4bff575da
44+
- md5: 20b8a955bff703c00690081bebfbd372
4545
path: outputs/compound/normalized.Compound.Vertex.json.gz
4646
cache: true
4747
metric: false
4848
persist: false
49-
- md5: fb5f300161494a94d2493cfb5b704e68
49+
- md5: 645c674a623e27bc84f9449ad8111683
5050
path: outputs/compound/normalized.DrugResponse_Compounds_Compound.Edge.json.gz
5151
cache: true
5252
metric: false
5353
persist: false
54-
- md5: 967a3e45c1ea2270a9b9bb4763b9cff9
54+
- md5: a0af3202a4dcef822bd16127407c043e
5555
path: outputs/compound/normalized.Compound_DrugResponses_DrugResponse.Edge.json.gz
5656
cache: true
5757
metric: false
5858
persist: false
59-
- md5: c53f52b19e36799ee099edc0ef3d7657
59+
- md5: a56821071854cc805120007a99728f0e
6060
path: outputs/compound/normalized.Project_Compounds_Compound.Edge.json.gz
6161
cache: true
6262
metric: false
6363
persist: false
64-
- md5: a9945dd00371f9ae553098b2b71f4ce8
64+
- md5: d8cfbd16c89e661cf15df5c37cea5d72
6565
path: outputs/compound/normalized.Compound_Projects_Project.Edge.json.gz
6666
cache: true
6767
metric: false
6868
persist: false
69-
- md5: 488eee20c9e4f0c99070eb6dfbb3fdb7
69+
- md5: acb155e76510e2661158e52272338a3c
7070
path: outputs/compound/normalized.Compound_Cases_Case.Edge.json.gz
7171
cache: true
7272
metric: false
7373
persist: false
74-
- md5: 8e8e11059b5cbd3c1ae85573b30dabb0
74+
- md5: 015143f9421a9fb5a3311f92043056c2
7575
path: outputs/compound/normalized.Case_Compounds_Compound.Edge.json.gz
7676
cache: true
7777
metric: false
7878
persist: false
79-
- md5: c76490317a7a49562e903c52ddba2c6e
79+
- md5: 69b9e6f4ee8226dac032bf8814d72ff2
8080
path: outputs/compound/normalized.G2PAssociation_Compounds_Compound.Edge.json.gz
8181
cache: true
8282
metric: false
8383
persist: false
84-
- md5: 21734e88b95e98c4eabac40c91cfbd45
84+
- md5: 6a029d0ee0b8f531cce4410d0e8affa4
8585
path: outputs/compound/normalized.Compound_G2PAssociations_G2PAssociation.Edge.json.gz
8686
cache: true
8787
metric: false

outputs/dgidb/dgidb.dvc

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
md5: e720598cc16fd7bb5918bd09f0397589
1+
md5: 34c1bbfdb0868d369c4283b8982f2137
22
cmd: python3 transform/dgidb/transform.py
33
wdir: ../..
44
deps:
@@ -8,45 +8,45 @@ deps:
88
path: source/drug_enricher/drug_alias.tsv
99
- md5: 64e7a82c87e7151a7c49846469157547
1010
path: src/bmeg/enrichers/drug_enricher.py
11-
- md5: 936873de1ea7a748293263ff126a2967
11+
- md5: 1f1f691fa223e6c7c05cb9324a68d96a
1212
path: transform/dgidb/transform.py
1313
outs:
14-
- md5: 3fa57b7c369e2c8dd2dc0d0b6c44766e
14+
- md5: aac51ae53ff021ed9291b1aa2ae78076
1515
path: outputs/dgidb/G2PAssociation.Vertex.json.gz
1616
cache: true
1717
metric: false
1818
persist: false
19-
- md5: f7a2bbfb7436d76e09b9e13bd3d0c9ff
19+
- md5: 502320cf8546d8607d62a1f0518e97cc
2020
path: outputs/dgidb/Compound.Vertex.json.gz
2121
cache: true
2222
metric: false
2323
persist: false
24-
- md5: 166177ce31138f0ee32dc339fdfdd987
24+
- md5: 758362f57be6a69c6969a48182e0ff18
2525
path: outputs/dgidb/G2PAssociation_Genes_Gene.Edge.json.gz
2626
cache: true
2727
metric: false
2828
persist: false
29-
- md5: 4f99308481017576d6cc208b6f514eab
29+
- md5: 57d481f7135e3a8761ad8920ceca5fb0
3030
path: outputs/dgidb/G2PAssociation_Publications_Publication.Edge.json.gz
3131
cache: true
3232
metric: false
3333
persist: false
34-
- md5: 9c4627785660215fa9adbf7c9f6cf172
34+
- md5: 708a48991dc4d6051097e6a8e8175b7b
3535
path: outputs/dgidb/G2PAssociation_Compounds_Compound.Edge.json.gz
3636
cache: true
3737
metric: false
3838
persist: false
39-
- md5: 03f13f3513d5282b8a1f309429ae81f0
39+
- md5: cecbf1a9f863ecfd0ce3db553becfad9
4040
path: outputs/dgidb/Publication_G2PAssociations_G2PAssociation.Edge.json.gz
4141
cache: true
4242
metric: false
4343
persist: false
44-
- md5: 97282e6b782ab81889884ba6d9ba368e
44+
- md5: a701cee66767a702c18b7b3d4cb4b829
4545
path: outputs/dgidb/Gene_G2PAssociations_G2PAssociation.Edge.json.gz
4646
cache: true
4747
metric: false
4848
persist: false
49-
- md5: fa0ae005deb28af2d935cc818c1394b4
49+
- md5: 0e9b6635d4e8153d221d1c2a0604627a
5050
path: outputs/dgidb/Compound_G2PAssociations_G2PAssociation.Edge.json.gz
5151
cache: true
5252
metric: false

outputs/publication/stub_publications.dvc

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
1-
md5: 9c3bbc179474e91c7ae2da409b34ca88
1+
md5: b76b1becf88a1b639c81da86933d69dc
22
cmd: python3 transform/publication/transform.py
33
wdir: ../..
44
deps:
5-
- md5: 15fc6631cd700953dc7bcc74728c44fa
5+
- md5: ba4e56706b22e7572ef4591a999a5a58
66
path: transform/publication/transform.py
77
- md5: 1bef5c26bd8aab5b43de9c6e9f8d87de
88
path: outputs/g2p/G2PAssociation_Publications_Publication.Edge.json.gz
99
- md5: f2b1f26f98bc1f94fa528cbafbb6c60a
1010
path: outputs/g2p/Publication_G2PAssociations_G2PAssociation.Edge.json.gz
11-
- md5: 4f99308481017576d6cc208b6f514eab
11+
- md5: 57d481f7135e3a8761ad8920ceca5fb0
1212
path: outputs/dgidb/G2PAssociation_Publications_Publication.Edge.json.gz
13-
- md5: 03f13f3513d5282b8a1f309429ae81f0
13+
- md5: cecbf1a9f863ecfd0ce3db553becfad9
1414
path: outputs/dgidb/Publication_G2PAssociations_G2PAssociation.Edge.json.gz
1515
- md5: 2a69a3d9fe9acd1c5465afc03be20cd2.dir
1616
path: outputs/pubmed/baseline
@@ -23,7 +23,7 @@ deps:
2323
- md5: 737bc587e7faef80a008f30b9f310643
2424
path: outputs/pathway_commons/Interaction_Publications_Publication.Edge.json.gz
2525
outs:
26-
- md5: 87182b1ccb82bc03a2efb61ca77703d5
26+
- md5: 7bbe3da8a9ade7327c41858d20107e58
2727
path: outputs/publication/stub.Publication.Vertex.json.gz
2828
cache: true
2929
metric: false

src/bmeg/bmeg-dictionary

tests/unit/dgidb/source/dgidb/interactions.tsv

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ PDGFRB PDGFR 5159 ChemblInteractions inhibitor CHEMBL576982 QUIZARTINIB QUIZARTI
88
MTOR 2109 2475 GuideToPharmacologyInteractions inhibitor 9361 WYE-354 CHEMBL561708 CHEMBL561708
99
DNTT DNTT 1791 NCI HYDROXYUREA HYDROXYUREA HYDROXYUREA CHEMBL467 291471
1010
OPRM1 319 4988 GuideToPharmacologyInteractions agonist 3534 SUFENTANIL SUFENTANIL CHEMBL658
11+
PDGFRA PDGFRA 5156 DoCM IMATINIB IMATINIB IMATINIB CHEMBL941 15928335,15685537,22718859,16638875,15146165,12949711,26130666,25157968,24132921,14645423,16954519,18794084,22745105

tests/unit/dgidb/test_dgidb_transform.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
import contextlib
33
import pytest
44
import shutil
5+
import json
6+
57
from transform.dgidb.transform import transform
8+
from bmeg.ioutils import reader
69

710

811
@pytest.fixture
@@ -50,6 +53,14 @@ def validate(helpers, interactions_file, emitter_directory):
5053
exclude_labels=['Publication', 'Gene']
5154
)
5255

56+
count = 0
57+
with reader(pub_edge_file) as f:
58+
for line in f:
59+
e = json.loads(line)
60+
assert e['to'] != 'Publication:ncbi.nlm.nih.gov/pubmed/'
61+
count += 1
62+
assert count == 16
63+
5364

5465
def test_simple(helpers, interactions_file, emitter_directory):
5566
""" simple test """

transform/dgidb/transform.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def transform(interactions_file="source/dgidb/interactions.tsv",
9494
),
9595
emit_backref=True
9696
)
97-
if line["PMIDs"] is None or line["PMIDs"] != "":
97+
if line["PMIDs"] is None or line["PMIDs"] == "":
9898
continue
9999
pubs = line["PMIDs"].split(",")
100100
for p in pubs:

transform/ensembl/uniprot.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def transform(protein_table_path='source/ensembl/Homo_sapiens.GRCh37.85.uniprot.
3838

3939
if uniprot_id not in emitted:
4040
p = Uniprot(id=Uniprot.make_gid(uniprot_id),
41+
uniprot_id=uniprot_id,
4142
genome=GENOME_BUILD,
4243
project_id=PROJECT_ID)
4344
emitter.emit_vertex(p)

transform/publication/transform.py

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -50,33 +50,43 @@ def transform(
5050
e = f = r = 0
5151
for file in files:
5252
f += 1
53-
logging.info("processing HasSupportingReference file: {}/{}".format(f, nfiles))
53+
logging.info("processing file: {}/{}".format(f, nfiles))
5454
with reader(file) as ins:
5555
for line in ins:
5656
try:
5757
edge = ujson.loads(line)
58-
if 'Publication:' not in edge['gid']:
58+
pid = None
59+
# get edge components
60+
if edge['to'].startswith('Publication'):
61+
pid = edge['to']
62+
elif edge['from'].startswith('Publication'):
63+
pid = edge['from']
64+
else:
5965
logging.info('Edge {} has no publications that need transformation. skipping.'.format(file))
6066
break
61-
# get edge components
62-
to = edge['to']
63-
if to in dedup:
67+
68+
if pid in dedup:
6469
r += 1
6570
continue
66-
dedup[to] = True
67-
url = to.replace('Publication:', 'http://')
71+
72+
url = pid.replace('Publication:', 'http://')
6873
publication = Publication(
6974
id=Publication.make_gid(url),
7075
url=url,
7176
project_id=Project.make_gid("Reference")
7277
)
7378
emitter.emit_vertex(publication)
79+
80+
dedup[pid] = True
7481
e += 1
82+
7583
except Exception as exc:
7684
logging.error(str(exc))
7785
raise exc
86+
7887
if e % batch_size == 0:
7988
logging.info('emitted stub publication vertices: {}'.format(e))
89+
8090
logging.info('emitted stub publication vertices: {}'.format(e))
8191
logging.info('existing publication refs found: {}'.format(r))
8292
emitter.close()

0 commit comments

Comments
 (0)