# add_entries.py

import argparse
import glob
import json
import sys
import os
import pandas as pd
from jsonschema import validate, ValidationError
from functions import *

def main(entry_table, json_dir, json_schema, chemspider_api_key, comptox_api_key, mibig_dir, iarc_file, update, fungi_without_taxid, smiles_annotation):
    """Create (or rebuild) one JSON entry file per data row of the entry table.

    Existing ``*.json`` files in ``json_dir`` are scanned first to collect the
    compound names that already have an entry. Each non-empty row of
    ``entry_table`` (tab-separated, first row is a header, first column is the
    compound name) is then either skipped, rebuilt, or turned into a new entry
    that is validated against the schema and written to ``json_dir``.

    Args:
        entry_table: Path to the tab-separated table with one compound per row.
        json_dir: Directory holding the ``<id>.json`` entry files.
        json_schema: Path to the JSON schema every entry is validated against.
        chemspider_api_key: Passed through to ``build_FT_dict``.
        comptox_api_key: Passed through to ``build_FT_dict``.
        mibig_dir: Passed through to ``load_files``.
        iarc_file: Passed through to ``load_files``.
        update: When truthy, existing entries that fail schema validation are
            rebuilt under their current ID instead of being skipped.
        fungi_without_taxid: Optional path, passed through to ``load_files``.
        smiles_annotation: Optional path, passed through to ``load_files``.

    Raises:
        jsonschema.ValidationError: If a freshly built entry does not fit the
            schema (nothing is written for that entry).
    """
    loaded_schema, mibig_db, IARC_df, fungi_without_taxid_list, smiles_df = load_files(json_schema, mibig_dir, iarc_file, fungi_without_taxid, smiles_annotation)
    json_dir = json_dir.rstrip("/")
    existing_json_files = sorted(glob.glob(json_dir+"/*.json"))
    existing_names = set()  # case-folded main names; a set keeps the per-row lookup O(1)
    invalid_entries = {}    # case-folded main name -> ID of an entry that fails validation

    if existing_json_files:
        # The file that sorts last provides the starting point for new IDs.
        last_id = os.path.basename(existing_json_files[-1]).replace(".json","")
    else:
        print("No previously existing entries found in "+json_dir+".")
        last_id = None

    for json_file in existing_json_files:
        with open(json_file, 'r', encoding='utf-8') as stream:
            loaded_dict = json.load(stream)
        name = loaded_dict["data"]["compound"]["main_name"].casefold()
        existing_id = loaded_dict["data"]["id"]
        existing_names.add(name)
        if update:
            try:
                validate(loaded_dict, loaded_schema)
            except ValidationError:
                invalid_entries[name] = existing_id

    with open(entry_table, 'r', encoding='utf-8') as entry_table_in:
        next(entry_table_in, None) #skip the header (tolerates a completely empty table)
        for i, line in enumerate(entry_table_in):
            if not line.strip():
                # Blank rows carry no compound; never build or write anything for them.
                continue
            compound_name = line.split("\t")[0].casefold()
            is_new_entry = compound_name not in existing_names
            if is_new_entry:
                entry_id = generate_id(last_id)
                print("Building new entry "+entry_id+"...")
            elif update and compound_name in invalid_entries:
                entry_id = invalid_entries[compound_name]
                print("Updating entry "+entry_id+"...")
            elif update:
                print("Skipping line "+str(i+2)+" in "+entry_table+". A valid entry already exists in "+json_dir+" for compound \""+compound_name+"\".")
                continue
            else:
                print("Skipping line "+str(i+2)+" in "+entry_table+". An entry already exists in "+json_dir+" for compound \""+compound_name+"\".")
                continue

            FT_dict = build_FT_dict(line, entry_id, chemspider_api_key, comptox_api_key, mibig_db, IARC_df, fungi_without_taxid_list, smiles_df)
            validate(FT_dict, loaded_schema)
            output_path = json_dir+"/"+entry_id+".json"
            print("Writing entry "+entry_id+" to "+output_path+"...")
            with open(output_path, 'w', encoding='utf-8') as stream:
                json.dump(FT_dict, stream, indent=4)

            if is_new_entry:
                # Only newly generated IDs advance the counter. Rebuilding an older
                # entry must not move it backwards, otherwise the next generated ID
                # could collide with an existing file (assuming generate_id derives
                # the next ID from its argument).
                last_id = entry_id
                # Remember the name so a duplicate row further down is skipped.
                existing_names.add(compound_name)
            else:
                # The rebuilt entry passed validation, so it is no longer invalid.
                invalid_entries.pop(compound_name, None)

def load_files(json_schema, mibig_dir, iarc_file, fungi_without_taxid, smiles_annotation):
    """Load every input resource the entry builder needs.

    Args:
        json_schema: Path to the JSON schema file.
        mibig_dir: Passed to ``load_mibig``.
        iarc_file: Passed to ``load_IARC_file``.
        fungi_without_taxid: Optional path to a text file with one name per
            line; skipped when falsy.
        smiles_annotation: Optional path to a tab-separated table with a header
            row; skipped when falsy.

    Returns:
        A 5-tuple ``(loaded_schema, mibig_db, IARC_df, fungi_without_taxid_list,
        smiles_df)``. The last two items are ``None`` when the corresponding
        optional path was not given.
    """
    with open(json_schema, 'r', encoding='utf-8') as stream:
        loaded_schema = json.load(stream)
    mibig_db = load_mibig(mibig_dir)
    IARC_df = load_IARC_file(iarc_file)
    fungi_without_taxid_list = None
    smiles_df = None
    if fungi_without_taxid:
        # Context manager so the handle is closed deterministically; UTF-8 to
        # match how the other text inputs of this script are read.
        with open(fungi_without_taxid, 'r', encoding='utf-8') as stream:
            fungi_without_taxid_list = stream.read().splitlines()
    if smiles_annotation:
        smiles_df = pd.read_csv(smiles_annotation, sep='\t', header=0, encoding='utf-8')
    return loaded_schema, mibig_db, IARC_df, fungi_without_taxid_list, smiles_df

def build_FT_dict(line, id, chemspider_api_key, comptox_api_key, mibig_db, IARC_df, fungi_without_taxid_list, smiles_df):
    """Turn one row of the entry table into a fully annotated entry dict.

    The row is first converted with ``line_to_FT_dict`` and then passed, in
    order, through the compound-database lookup, the optional manual SMILES
    fallback, and the gene-cluster, carcinogenicity, CompTox and CTD
    annotation steps.

    Args:
        line: Raw text of the table row.
        id: ID assigned to the entry.
        chemspider_api_key: Forwarded to ``add_compound_data_dbs``.
        comptox_api_key: Forwarded to ``add_comptox_id``.
        mibig_db: Forwarded to ``add_gene_clusters``.
        IARC_df: Forwarded to ``add_carcinogenicity_data``.
        fungi_without_taxid_list: Forwarded to ``line_to_FT_dict``.
        smiles_df: Manual annotation table, or ``None`` when none was supplied.

    Returns:
        The entry dict after all annotation steps.

    Raises:
        SystemExit: If the database lookup left the ``smiles`` field empty and
            no manual annotation table is available.
    """
    entry = line_to_FT_dict(line, id, fungi_without_taxid_list)
    names = get_all_compound_names(entry)
    entry = add_compound_data_dbs(entry, names, chemspider_api_key)

    compound = entry["data"]["compound"]
    if not compound["smiles"]:
        main_name = compound["main_name"]
        print("No molecular data added for "+id+" ("+main_name+"). No record found in chembl, npatlas, chemspider or pubchem.")
        if smiles_df is None:
            # Without a manual table there is nothing left to fall back on.
            sys.exit("Error: No manual annotation file found. Please add an annotation file with --smiles_annotation.")
        print("Adding molecular data manually...")
        entry = add_compound_data_manual(entry, smiles_df)

    # Remaining annotation steps; each one receives the dict returned by the previous step.
    entry = add_gene_clusters(entry, names, mibig_db)
    entry = add_carcinogenicity_data(entry, names, IARC_df)
    entry = add_comptox_id(entry, names, comptox_api_key)
    return add_ctd_id(entry, names)

if __name__ == "__main__":
    #Command-line interface: seven required positionals, then three optional flags
    parser = argparse.ArgumentParser()
    positional_arguments = (
        ("entry_table", "Table with mycotoxin data in .tsv format"),
        ("json_dir", "Directory for storing json files"),
        ("json_schema", "The json schema used"),
        ("chemspider_api_key", "ChemSpider API key"),
        ("comptox_api_key", "CompTox API key"),
        ("mibig_dir", "Path to an instance of the MIBiG directory"),
        ("iarc_file", "Table with IARC data on carcinogenicity in .tsv format"),
    )
    for argument_name, argument_help in positional_arguments:
        parser.add_argument(argument_name, type=str, help=argument_help)
    parser.add_argument("--update", action="store_true", help="If true; rebuild any entries that do not fit the json schema")
    optional_path_arguments = (
        ("--fungi_without_taxid", "List of allowed fungi names for which there exists no NCBI taxid (delimited by newline character)."),
        ("--smiles_annotation", "Annotation of compound names to smiles in .tsv format"),
    )
    for argument_name, argument_help in optional_path_arguments:
        parser.add_argument(argument_name, type=str, help=argument_help)
    #The parsed attribute names match main()'s parameter names, so the namespace is unpacked directly
    cli_arguments = parser.parse_args()
    main(**vars(cli_arguments))