-
Notifications
You must be signed in to change notification settings - Fork 0
/
add_entries.py
105 lines (100 loc) · 5.78 KB
/
add_entries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import argparse
import glob
import json
import sys
import os
import pandas as pd
from jsonschema import validate, ValidationError
from functions import *
def main(entry_table, json_dir, json_schema, chemspider_api_key, comptox_api_key, mibig_dir, iarc_file, update, fungi_without_taxid, smiles_annotation):
    """Create (and optionally rebuild) JSON entries from a tab-separated table.

    Every non-empty row of ``entry_table`` after the header is matched, by the
    case-folded value of its first column, against the ``main_name`` of the
    entries already stored as ``*.json`` files in ``json_dir``:

    * unknown compound -> a new entry is built under a freshly generated id;
    * known compound, ``update`` false -> the row is skipped;
    * known compound, ``update`` true -> the entry is rebuilt under its
      existing id only if the stored file fails schema validation, otherwise
      the row is skipped.

    Each built entry is validated against the schema and written to
    ``<json_dir>/<id>.json``.

    Args:
        entry_table: Path to the input table (.tsv, first line is a header).
        json_dir: Directory holding the JSON entries.
        json_schema: Path to the JSON schema file.
        chemspider_api_key: ChemSpider API key, passed to ``build_FT_dict``.
        comptox_api_key: CompTox API key, passed to ``build_FT_dict``.
        mibig_dir: Path handed to ``load_files`` for the MIBiG data.
        iarc_file: Path handed to ``load_files`` for the IARC table.
        update: If truthy, rebuild stored entries that do not fit the schema.
        fungi_without_taxid: Optional path to a newline-delimited name list.
        smiles_annotation: Optional path to a name-to-SMILES .tsv file.

    Raises:
        jsonschema.ValidationError: If a freshly built entry does not fit the
            schema (raised by ``validate``; nothing is written for that row).
    """
    loaded_schema, mibig_db, IARC_df, fungi_without_taxid_list, smiles_df = load_files(
        json_schema, mibig_dir, iarc_file, fungi_without_taxid, smiles_annotation
    )
    json_dir = json_dir.rstrip("/")
    existing_json_files = sorted(glob.glob(json_dir+"/*.json"))

    # The id to continue from is the file name (sans extension) that sorts
    # last. An explicit emptiness check replaces the former try/except
    # IndexError, which also wrapped the whole scan loop below.
    if existing_json_files:
        last_id = os.path.basename(existing_json_files[-1]).replace(".json","")
    else:
        print("No previously existing entries found in "+json_dir+".")
        last_id = None

    existing_names = set()  # case-folded main names; set gives O(1) lookups
    invalid_entries = {}    # case-folded main name -> id of schema-invalid entry
    for json_file in existing_json_files:
        with open(json_file, 'r', encoding='utf-8') as stream:
            loaded_dict = json.load(stream)
        name = loaded_dict["data"]["compound"]["main_name"].casefold()
        existing_names.add(name)
        if update:
            try:
                validate(loaded_dict, loaded_schema)
            except ValidationError:
                invalid_entries[name] = loaded_dict["data"]["id"]

    with open(entry_table, 'r', encoding='utf-8') as entry_table_in:
        next(entry_table_in, None)  # skip the header; tolerate an empty table
        # start=2: data rows begin on the second line of the file (1-indexed).
        for line_number, line in enumerate(entry_table_in, start=2):
            if not line.strip():
                continue
            compound_name = line.split("\t")[0].casefold()
            is_new_entry = compound_name not in existing_names
            if is_new_entry:
                # NOTE(review): generate_id is defined outside this file;
                # presumably it derives the next id from last_id - confirm.
                entry_id = generate_id(last_id)
                print("Building new entry "+entry_id+"...")
            elif not update:
                print("Skipping line "+str(line_number)+" in "+entry_table+". An entry already exists in "+json_dir+" for compound \""+compound_name+"\".")
                continue
            elif compound_name in invalid_entries:
                entry_id = invalid_entries[compound_name]
                print("Updating entry "+entry_id+"...")
            else:
                print("Skipping line "+str(line_number)+" in "+entry_table+". A valid entry already exists in "+json_dir+" for compound \""+compound_name+"\".")
                continue

            FT_dict = build_FT_dict(line, entry_id, chemspider_api_key, comptox_api_key, mibig_db, IARC_df, fungi_without_taxid_list, smiles_df)
            validate(FT_dict, loaded_schema)
            output_path = json_dir+"/"+entry_id+".json"
            print("Writing entry "+entry_id+" to "+output_path+"...")
            with open(output_path, 'w', encoding='utf-8') as stream:
                json.dump(FT_dict, stream, indent=4)

            if is_new_entry:
                # Only a newly created entry may advance the id counter.
                # Advancing it after rewriting an existing (lower) id would
                # rewind the counter, so the next generated id could collide
                # with - and overwrite - an entry that is already stored.
                last_id = entry_id
                # Remember the name so a duplicate row later in the same
                # table is treated as existing rather than created twice.
                existing_names.add(compound_name)
def load_files(json_schema, mibig_dir, iarc_file, fungi_without_taxid, smiles_annotation):
    """Load the schema and the reference data used while building entries.

    Args:
        json_schema: Path to the JSON schema file.
        mibig_dir: Path passed to ``load_mibig``.
        iarc_file: Path passed to ``load_IARC_file``.
        fungi_without_taxid: Optional path to a text file with one name per
            line; skipped when falsy.
        smiles_annotation: Optional path to a tab-separated file with a header
            row; skipped when falsy.

    Returns:
        A 5-tuple ``(loaded_schema, mibig_db, IARC_df,
        fungi_without_taxid_list, smiles_df)``. The last two items are
        ``None`` when the corresponding path was not supplied.
    """
    with open(json_schema, 'r', encoding='utf-8') as stream:
        loaded_schema = json.load(stream)
    mibig_db = load_mibig(mibig_dir)
    IARC_df = load_IARC_file(iarc_file)

    fungi_without_taxid_list = None
    if fungi_without_taxid:
        # Context manager closes the handle deterministically (the previous
        # bare open(...).read() never closed it); UTF-8 matches how the other
        # text inputs of this script are read.
        with open(fungi_without_taxid, 'r', encoding='utf-8') as stream:
            fungi_without_taxid_list = stream.read().splitlines()

    smiles_df = None
    if smiles_annotation:
        smiles_df = pd.read_csv(smiles_annotation, sep='\t', header=0, encoding='utf-8')

    return loaded_schema, mibig_db, IARC_df, fungi_without_taxid_list, smiles_df
def build_FT_dict(line, id, chemspider_api_key, comptox_api_key, mibig_db, IARC_df, fungi_without_taxid_list, smiles_df):
    """Assemble the full entry dictionary for one row of the entry table.

    The row is first converted with ``line_to_FT_dict`` and enriched with
    ``add_compound_data_dbs``. If the result has no SMILES value, the data is
    filled in from ``smiles_df`` via ``add_compound_data_manual``; when no
    such table was supplied the program terminates through ``sys.exit``.
    Afterwards the gene-cluster, carcinogenicity, CompTox and CTD helpers are
    applied in that order.

    Args:
        line: One raw line of the entry table.
        id: Identifier to assign to the entry.
        chemspider_api_key: Forwarded to ``add_compound_data_dbs``.
        comptox_api_key: Forwarded to ``add_comptox_id``.
        mibig_db: Forwarded to ``add_gene_clusters``.
        IARC_df: Forwarded to ``add_carcinogenicity_data``.
        fungi_without_taxid_list: Forwarded to ``line_to_FT_dict``.
        smiles_df: Manual SMILES annotation table, or ``None``.

    Returns:
        The dictionary returned by the last enrichment helper.
    """
    entry = line_to_FT_dict(line, id, fungi_without_taxid_list)
    names = get_all_compound_names(entry)
    entry = add_compound_data_dbs(entry, names, chemspider_api_key)

    compound = entry["data"]["compound"]
    if not compound["smiles"]:
        print("".join((
            "No molecular data added for ",
            id,
            " (",
            compound["main_name"],
            "). No record found in chembl, npatlas, chemspider or pubchem.",
        )))
        # Without a manual annotation table there is no fallback: abort.
        if smiles_df is None:
            sys.exit("Error: No manual annotation file found. Please add an annotation file with --smiles_annotation.")
        print("Adding molecular data manually...")
        entry = add_compound_data_manual(entry, smiles_df)

    # Remaining enrichment helpers, applied in order. Each takes the entry and
    # the compound names plus the listed extra arguments, and returns the entry.
    enrichment_steps = (
        (add_gene_clusters, (mibig_db,)),
        (add_carcinogenicity_data, (IARC_df,)),
        (add_comptox_id, (comptox_api_key,)),
        (add_ctd_id, ()),
    )
    for enrich, extra_args in enrichment_steps:
        entry = enrich(entry, names, *extra_args)
    return entry
if __name__ == "__main__":
    # Command-line interface: seven required positional paths/keys followed
    # by three optional flags. The destination names match main()'s
    # parameters, so the parsed namespace is unpacked straight into main().
    cli = argparse.ArgumentParser()

    required_arguments = (
        ("entry_table", "Table with mycotoxin data in .tsv format"),
        ("json_dir", "Directory for storing json files"),
        ("json_schema", "The json schema used"),
        ("chemspider_api_key", "ChemSpider API key"),
        ("comptox_api_key", "CompTox API key"),
        ("mibig_dir", "Path to an instance of the MIBiG directory"),
        ("iarc_file", "Table with IARC data on carcinogenicity in .tsv format"),
    )
    for argument_name, argument_help in required_arguments:
        cli.add_argument(argument_name, type=str, help=argument_help)

    cli.add_argument("--update", action="store_true", help="If true; rebuild any entries that do not fit the json schema")

    optional_paths = (
        ("--fungi_without_taxid", "List of allowed fungi names for which there exists no NCBI taxid (delimited by newline character)."),
        ("--smiles_annotation", "Annotation of compound names to smiles in .tsv format"),
    )
    for option_name, option_help in optional_paths:
        cli.add_argument(option_name, type=str, help=option_help)

    # Run the main script with the parsed options as keyword arguments.
    parsed = cli.parse_args()
    main(**vars(parsed))