# functions.py

import json
import re
import requests
import glob
import sys
import pandas as pd
import numpy as np
from collections import defaultdict

import pubchempy as pcp
from chembl_webresource_client.new_client import new_client
from chemspipy import ChemSpider
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import CalcMolFormula
from rdkit.Chem import Descriptors
from ete3 import NCBITaxa
# Shared NCBITaxa handle used by the NCBI taxonomy helpers in this module
# (name_to_taxid, taxid_to_name, taxid_to_category).
# NOTE(review): this is created at import time; ete3 presumably opens (and may
# need to build) its local taxonomy database here - confirm before importing
# this module in environments without that database.
NCBI = NCBITaxa()

# Generic functions

def capitalize_first_letter(string):
    """Return *string* with only its first character upper-cased.

    Unlike ``str.capitalize``, the remainder of the string keeps its original
    casing. An empty string is returned unchanged.
    """
    head, tail = string[:1], string[1:]
    return head.upper() + tail

def ref_nr_to_id(ref_nr):
    """Convert a reference number into a reference ID such as ``REF00012``.

    The number is left-padded with zeros to at least five digits and prefixed
    with ``REF``. Numbers longer than five digits are kept in full instead of
    silently collapsing to ``REF00000`` (which made distinct references
    collide).

    Args:
        ref_nr: Reference number, as an int or a string of digits. An empty
            string yields ``REF00000``.

    Returns:
        The reference ID string.
    """
    return "REF" + str(ref_nr).zfill(5)

def generate_id(last_id):
    """Return the entry ID that follows *last_id*.

    IDs have the form ``FT`` followed by a number zero-padded to five digits.
    A falsy *last_id* (``None`` or ``""``) starts the sequence at ``FT00001``.
    """
    if not last_id:
        return "FT00001"
    # Strip the "FT" prefix together with the zero padding to get the number.
    previous_nr = int(re.sub("FT0*", "", last_id))
    return "FT" + str(previous_nr + 1).zfill(5)

def get_all_compound_names(dict):
    """Collect the main name and all non-empty alternative names of an entry.

    Args:
        dict: Entry dictionary holding ``["data"]["compound"]["main_name"]``
            and ``["data"]["compound"]["alt_names"]``.

    Returns:
        A list starting with the main name, followed by every truthy
        alternative name in its original order.
    """
    compound = dict["data"]["compound"]
    main_name = compound["main_name"]
    alt_names = compound["alt_names"]
    return [main_name] + [alt for alt in alt_names if alt]

# NCBI taxonomy

def name_to_taxid(name):
    """Look up the NCBI taxid for an organism name.

    Only the first taxid of the first translated name is reported.

    Raises:
        StopIteration: If the name translator returns no entry for *name*
            (callers rely on this to detect unknown names).
    """
    translated = NCBI.get_name_translator([name])
    first_taxids = next(iter(translated.values()))
    return first_taxids[0]

def taxid_to_name(taxid):
    """Look up the organism name that NCBI Taxonomy reports for *taxid*.

    Only the first name found is reported.

    Raises:
        StopIteration: If the taxid translator returns no entry for *taxid*.
    """
    translated = NCBI.get_taxid_translator([taxid])
    return next(iter(translated.values()))

def taxid_to_category(taxid):
    """Classify an organism as bacteria, fungi, plants or other.

    The category is decided by which marker taxid appears in the organism's
    lineage; markers are checked in the order 2 (bacteria), 4751 (fungi),
    33090 (plants). Anything else is reported as ``"other"``.
    """
    lineage = NCBI.get_lineage(taxid)
    markers = ((2, "bacteria"), (4751, "fungi"), (33090, "plants"))
    for marker_taxid, label in markers:
        if marker_taxid in lineage:
            return label
    return "other"

# Manually added information

def line_to_FT_dict(line, id, fungi_without_taxid):
    """Build the nested entry dictionary for one manually curated line.

    Args:
        line: One tab-separated record with exactly six columns: main name,
            alternative names (separated by ", "), organism field, toxicity
            field, notes (comma-separated) and group.
        id: Entry ID stored under ``["data"]["id"]``.
        fungi_without_taxid: Names accepted without an NCBI taxid; passed on
            to ``write_producer_list``.

    Returns:
        A recursively auto-creating ``defaultdict`` holding the parsed data.

    Raises:
        ValueError: If the line does not split into exactly six columns.
    """
    def nested_dict():
        # Missing keys create further nested dictionaries on access.
        return defaultdict(nested_dict)

    columns = line.rstrip('\n').split('\t')
    main_name, alt_names, organism_field, toxicity_field, notes, group = columns

    d = nested_dict()
    data = d["data"]
    data["id"] = id

    compound = data["compound"]
    compound["main_name"] = capitalize_first_letter(main_name)
    compound["alt_names"] = list(map(capitalize_first_letter, alt_names.split(', ')))
    compound["family"] = group.capitalize()
    data["notes"] = notes.split(',')

    # Parse both reference fields before resolving any organism names.
    organism_refs = unpack_ref_field(organism_field)
    toxicity_refs = unpack_ref_field(toxicity_field)
    producers = write_producer_list(organism_refs, fungi_without_taxid)
    toxicities = write_toxicity_list(toxicity_refs)

    data["biosynthesis"]["producers"] = producers
    data["toxicity"]["experimental_data"] = toxicities
    return d

def unpack_ref_field(field):
    """Parse a "values followed by reference numbers" field into a mapping.

    The field is a sequence of chunks; each chunk is one or more
    comma-separated values followed by one or more comma-separated reference
    numbers, e.g.
    ``"Aspergillus flavus, Aspergillus parasiticus 1, 2 Penicillium expansum 3"``.

    Args:
        field: The raw field text. An empty field yields an empty mapping.

    Returns:
        ``defaultdict(set)`` mapping each value to the set of reference IDs
        (as produced by ``ref_nr_to_id``) it is coupled to.

    Raises:
        AssertionError: If a value or reference number is malformed. Raised
            explicitly so the validation is not stripped under ``python -O``.
        ValueError: If a chunk does not split into exactly one values part and
            one references part (e.g. a field without any reference number).
    """
    # Letters, hyphens, dots and parentheses; words separated by spaces.
    # "( +word)*" accepts exactly the same strings as "( *word)*" but avoids
    # exponential backtracking when a long value fails to match.
    value_pattern = r"^[a-zA-Z-.\(\)]+( +[a-zA-Z-.\(\)]+)*$"
    ref_pattern = r"^\d*$"

    # Each chunk consists of one or multiple values coupled to one or multiple references.
    chunks = re.findall(r"\D*[\d, ]+", field)
    ref_dict = defaultdict(set)
    for chunk in chunks:
        # Split the chunk into its values part and its references part.
        values_str, refs_str = re.findall(r"\d+[\d, ]*|\D+", chunk)
        values = [value.strip() for value in values_str.strip(" ,").split(',')]
        refs = [ref.strip() for ref in refs_str.strip(" ,").split(',')]
        for value in values:
            if not re.match(value_pattern, value):
                raise AssertionError("\""+value+"\" in field "+field+" is not a valid value")
        ref_ids = set()
        for ref in refs:
            if not re.match(ref_pattern, ref):
                raise AssertionError("\""+ref+"\" in field "+field+" is not a valid ref number")
            ref_ids.add(ref_nr_to_id(ref))
        for value in values:
            ref_dict[value] |= ref_ids
    return ref_dict

def write_producer_list(ref_dict, fungi_without_taxid_list):
    """Turn an organism -> reference IDs mapping into producer records.

    Each organism name is resolved to an NCBI taxid, then converted back to
    the name NCBI Taxonomy reports for that taxid (the supplied name may not
    be the 'official' one) and assigned a category.

    Names without a taxid are only accepted when they appear in
    *fungi_without_taxid_list*; they get taxid ``"missing"`` and category
    ``"fungi"``. Otherwise the program exits with an error message.

    Args:
        ref_dict: Mapping of organism name to an iterable of reference IDs.
        fungi_without_taxid_list: Names allowed without a taxid (may be empty
            or ``None``).

    Returns:
        A list with one record per organism, holding an ``"organism"``
        sub-dictionary and a ``"ref_id"`` list.
    """
    producers = []
    for organism, ref_ids in ref_dict.items():
        try:
            taxid = name_to_taxid(organism)
            organism_name = taxid_to_name(taxid)
            category = taxid_to_category(taxid)
        except StopIteration:
            print("No taxid found for \""+organism+"\".")
            if not fungi_without_taxid_list:
                sys.exit("Error: No list of allowed names without taxid found. Please add file under --fungi_without_taxid.")
            if organism not in fungi_without_taxid_list:
                sys.exit("Error: Name not in list of allowed names without taxid, aborting.")
            print("Adding name without taxid")
            taxid, organism_name, category = "missing", organism, "fungi"
        record = defaultdict(dict)
        record["organism"] = {
            "organism_name": organism_name,
            "taxid": str(taxid),
            "category": category,
        }
        record["ref_id"] = list(ref_ids)
        producers.append(record)
    return producers

def write_toxicity_list(ref_dict):
    """Turn a toxicity label -> reference IDs mapping into toxicity records.

    Returns:
        A list with one ``{"toxicity_type", "ref_id"}`` dictionary per label,
        in mapping order; labels are lower-cased and the reference IDs are
        converted to a list.
    """
    return [
        {"toxicity_type": label.lower(), "ref_id": list(ref_ids)}
        for label, ref_ids in ref_dict.items()
    ]

# Compound
## Retrieving chemical info from databases

def get_data_chembl(names, get_mol_info = True):
    """Search ChEMBL for the first of *names* that yields a molecule.

    Each name is tried as preferred name first and as synonym second (both
    case-insensitive). The search stops at the first name with a hit.

    Args:
        names: Compound names to try, in order of preference.
        get_mol_info: When equal to ``True``, also extract SMILES, molecular
            formula and molecular weights from the hit.

    Returns:
        ``(chembl_id, mol_info)``; ``("missing", None)`` when nothing is found,
        and ``mol_info`` stays ``None`` when it was not requested.
    """
    # NOTE(review): this relies on indexing an empty result set yielding a
    # falsy value rather than raising - confirm against the client library.
    chembl_id, mol_info = "missing", None
    for name in names:
        molecule = new_client.molecule
        hit = molecule.filter(pref_name__iexact=name)[0]
        if not hit:
            hit = molecule.filter(molecule_synonyms__molecule_synonym__iexact=name)[0]
        if not hit:
            continue
        chembl_id = hit["molecule_chembl_id"]
        if get_mol_info == True:
            mol_info = {
                "smiles": hit["molecule_structures"]["canonical_smiles"],
                "mol_formula": hit["molecule_properties"]["full_molformula"],
                "mw_avg": round(float(hit["molecule_properties"]["full_mwt"]), 2),
                "mw_mono": round(float(hit["molecule_properties"]["mw_monoisotopic"]), 4),
            }
        break
    return chembl_id, mol_info

def get_data_npatlas(names, get_mol_info = True):
    """Search the NP Atlas API for the first of *names* that yields a compound.

    For each name an exact-name query is POSTed to the advanced-search
    endpoint; the first entry of the first non-empty response is used.

    Args:
        names: Compound names to try, in order of preference.
        get_mol_info: When equal to ``True``, also extract SMILES, molecular
            formula and molecular weights from the hit.

    Returns:
        ``(npatlas_id, mol_info)``; ``("missing", None)`` when nothing is found,
        and ``mol_info`` stays ``None`` when it was not requested.
    """
    npatlas_id, mol_info = "missing", None
    url = "https://www.npatlas.org/api/v1/compounds/advancedSearch"
    headers = {"Content-Type": "application/json"}
    for name in names:
        query = {"operator": "eq", "attribute": "name", "value": name}
        response = requests.post(url, data=json.dumps(query), headers=headers)
        hits = response.json()
        if not hits:
            continue
        molecule_data = hits[0]
        npatlas_id = molecule_data["npaid"]
        if get_mol_info == True:
            mol_info = {
                "smiles": molecule_data["smiles"],
                "mol_formula": molecule_data["mol_formula"],
                "mw_avg": round(float(molecule_data["mol_weight"]), 2),
                "mw_mono": round(float(molecule_data["exact_mass"]), 4),
            }
        break
    return npatlas_id, mol_info

def get_data_chemspider(names, api_key, get_mol_info = True):
    """Search ChemSpider for the first of *names* that yields a compound.

    Names shorter than three characters are skipped. The first result of the
    first name with any results is used.

    Args:
        names: Compound names to try, in order of preference.
        api_key: ChemSpider API key.
        get_mol_info: When equal to ``True``, also fetch the compound record
            and extract SMILES, molecular formula and molecular weights.

    Returns:
        ``(chemspider_id, mol_info)``; ``("missing", None)`` when nothing is
        found, and ``mol_info`` stays ``None`` when it was not requested.
    """
    chemspider_id, mol_info = "missing", None
    cs = ChemSpider(api_key)
    for name in names:
        if len(name) < 3:
            continue
        results = cs.filter_results(cs.filter_name(name))
        if not results:
            continue
        chemspider_id = str(results[0])
        if get_mol_info == True:
            molecule = cs.get_compound(chemspider_id)
            mol_info = {
                "smiles": molecule.smiles,
                # Drop the markup characters "_", "{", "}" and "/" from the formula.
                "mol_formula": re.sub("[_/{/}]", '', molecule.molecular_formula),
                "mw_avg": round(molecule.molecular_weight, 2),
                "mw_mono": round(molecule.monoisotopic_mass, 4),
            }
        break
    return chemspider_id, mol_info

def get_data_pubchem(names, get_mol_info = True):
    """Search PubChem for the first of *names* that yields a compound.

    The first compound returned for the first name with any results is used.

    Args:
        names: Compound names to try, in order of preference.
        get_mol_info: When equal to ``True``, also extract SMILES, molecular
            formula and molecular weights from the hit.

    Returns:
        ``(pubchem_id, mol_info)``; ``("missing", None)`` when nothing is found,
        and ``mol_info`` stays ``None`` when it was not requested.
    """
    pubchem_id, mol_info = "missing", None
    for name in names:
        compounds = pcp.get_compounds(name, "name")
        if not compounds:
            continue
        record = compounds[0].to_dict()
        pubchem_id = str(record["cid"])
        if get_mol_info == True:
            mol_info = {
                "smiles": record["canonical_smiles"],
                "mol_formula": record["molecular_formula"],
                "mw_avg": round(float(record["molecular_weight"]), 2),
                "mw_mono": round(float(record["monoisotopic_mass"]), 4),
            }
        break
    return pubchem_id, mol_info

def add_compound_data_dbs(FT_dict, compound_names, chemspider_api_key):
    """Add database IDs and chemical info from the chemical databases.

    The databases are queried in the order ChEMBL, NP Atlas, ChemSpider,
    PubChem. Chemical info (SMILES, formula, weights) is taken from the first
    database that provides it; the database ID is looked up in every database.

    Args:
        FT_dict: Entry dictionary; ``["data"]["compound"]`` is updated in place.
        compound_names: Compound names to search for, in order of preference.
        chemspider_api_key: API key passed to the ChemSpider lookup.

    Returns:
        The same *FT_dict*, with the chemical info merged into the compound
        section and the IDs stored under ``["databases"]``.
    """
    lookups = (
        ("chembl", get_data_chembl, {}),
        ("npatlas", get_data_npatlas, {}),
        ("chemspider", get_data_chemspider, {"api_key": chemspider_api_key}),
        ("pubchem", get_data_pubchem, {}),
    )
    want_mol_info = True
    db_ids = {}
    for db_name, lookup, extra_kwargs in lookups:
        db_id, mol_info = lookup(compound_names, get_mol_info=want_mol_info, **extra_kwargs)
        if mol_info:
            # Once one database delivered the info, later ones only supply IDs.
            want_mol_info = False
            FT_dict["data"]["compound"].update(mol_info)
        db_ids[db_name] = db_id
    FT_dict["data"]["compound"]["databases"] = db_ids
    return FT_dict

## Adding chemical info based on manually annotated SMILES

def add_compound_data_manual(FT_dict, smiles_df):
    """Add chemical info derived from a manually annotated SMILES string.

    The SMILES is looked up by matching the entry's main name
    case-insensitively against the ``compound_name`` column of *smiles_df*;
    the value in the second column of the first matching row is used.

    Args:
        FT_dict: Entry dictionary; ``["data"]["compound"]`` is updated in place.
        smiles_df: DataFrame with a ``compound_name`` column.
            NOTE(review): the SMILES is assumed to be the second column -
            confirm against the file that is loaded into this frame.

    Returns:
        The same *FT_dict* with ``smiles``, ``mol_formula``, ``mw_avg`` and
        ``mw_mono`` merged into the compound section.

    Raises:
        SystemExit: If no usable SMILES is listed for the compound or the
            SMILES cannot be parsed. (A missing compound previously surfaced
            as an uncaught IndexError or an UnboundLocalError.)
        KeyError: If *smiles_df* has no ``compound_name`` column.
    """
    main_name = FT_dict["data"]["compound"]["main_name"]
    matches = smiles_df.loc[smiles_df["compound_name"].str.lower() == main_name.lower()]
    smiles = matches.values[0][1] if not matches.empty else None
    # A missing row, an empty cell (NaN/None) or a blank string all mean that
    # no SMILES was provided.
    if not isinstance(smiles, str) or not smiles.strip():
        sys.exit("Error: No SMILES provided for compound \""+main_name+"\".")
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # RDKit signals an unparsable SMILES by returning None.
        sys.exit("Error: Invalid SMILES \""+smiles+"\" provided for compound \""+main_name+"\".")
    mol_info = {
        "smiles": smiles,
        "mol_formula": CalcMolFormula(molecule),
        "mw_avg": round(Descriptors.MolWt(molecule), 2),
        "mw_mono": round(Descriptors.ExactMolWt(molecule), 4)
    }
    FT_dict["data"]["compound"].update(mol_info)
    return FT_dict

# Biosynthesis
## Adding BGC info from MIBiG

def load_mibig(mibig_dir):
    """Load every ``*.json`` file directly inside *mibig_dir*.

    Returns:
        A list with the parsed content of each file, in the order in which
        ``glob`` reports the files.
    """
    entries = []
    for path in glob.glob(mibig_dir+"/*.json"):
        with open(path, 'r') as handle:
            entries.append(json.load(handle))
    return entries

def add_gene_clusters(FT_dict, names, mibig_db):
    """Attach all MIBiG gene clusters that list one of the compound names.

    Names are compared case-insensitively against every compound of every
    cluster. For each match a record with the MIBiG accession, class,
    organism, compounds, status, completeness and the ``minimal`` flag is
    added.

    Args:
        FT_dict: Entry dictionary; the records are stored under
            ``["data"]["biosynthesis"]["gene_clusters"]``.
        names: Compound names to look for.
        mibig_db: Parsed MIBiG entries, each with a ``"cluster"`` section.

    Returns:
        The same *FT_dict*.
    """
    matched_clusters = []
    for name in names:
        for bgc_json in mibig_db:
            cluster = bgc_json["cluster"]
            compounds = [entry["compound"] for entry in cluster["compounds"]]
            if name.casefold() not in {compound.casefold() for compound in compounds}:
                continue
            taxid = cluster["ncbi_tax_id"]
            # Derive the organism name from the taxid, as it may have changed
            # since the MIBiG entry was created.
            organism_name = taxid_to_name(taxid)
            organism_category = taxid_to_category(taxid)
            record = defaultdict(dict)
            record["mibig_id"] = cluster["mibig_accession"]
            record["mibig_class"] = cluster["biosyn_class"]
            record["organism"]["organism_name"] = organism_name
            record["organism"]["taxid"] = taxid
            record["organism"]["category"] = organism_category
            record["compounds"] = compounds
            record["status"] = cluster["status"]
            record["completeness"] = cluster["loci"]["completeness"].lower()
            record["minimal"] = cluster["minimal"]
            matched_clusters.append(record)
    FT_dict["data"]["biosynthesis"]["gene_clusters"] = matched_clusters
    return FT_dict

# Toxicity
## Adding carcinogenicity classifications by the International Agency for Research on Cancer (IARC)

def load_IARC_file(IARC_file):
    """Read the IARC classification table into a DataFrame of strings.

    The file is read as tab-separated, UTF-16 encoded text with a header row;
    every column is converted to ``str``, so missing values become the
    string ``"nan"``.
    """
    raw_table = pd.read_csv(IARC_file, sep='\t', header=0, encoding='utf-16')
    return raw_table.astype(str)

def retrieve_IARC_classification(IARC_df, names):
    """Find the IARC table rows that classify one of the compound names.

    For each name, in order:

    1. Search for the full name anywhere in the ``Agent`` column, as a whole
       word (case-insensitive).
    2. If that finds nothing, derive the base name (everything before the
       last " " followed by a capital letter, e.g. "Fumonisin" for
       "Fumonisin B2") and look for agents that start with it, as long as the
       base name is not followed by a type suffix (e.g. " A", " B2", " III").
       Because the search is case-insensitive, any space-plus-letter
       continuation counts as a suffix.

    The search stops at the first name that yields any rows.

    Args:
        IARC_df: IARC table with a string ``Agent`` column.
        names: Compound names to try, in order of preference.

    Returns:
        The matching rows of *IARC_df*; an empty selection when nothing
        matches or when *names* is empty.
    """
    # Start from an empty selection so that an empty `names` returns a valid
    # (empty) frame instead of hitting an unbound variable.
    classification = IARC_df.iloc[0:0]
    for name in names:
        # Search for occurrence of the entire compound name (cannot be a
        # substring within a word).
        pattern = r"(?<!\w)" + re.escape(name) + r"(?!\w)"
        classification = IARC_df[IARC_df["Agent"].str.contains(pattern, case=False, na=False)]
        if not classification.empty:
            break
        # Find the base name of the compound, e.g. Fumonisin for Fumonisin B2.
        basename_match = re.match(r".*(?= [A-Z])", name)
        if basename_match is None:
            # The name has no suffix to strip, so there is no base name to try.
            continue
        basename = basename_match.group()
        # The base name cannot be preceded by a word character or followed by
        # a suffix specifying the type (e.g. A, B2, III).
        pattern = r"(?<!\w)" + re.escape(basename) + r"(?! [A-Z]+[0-9]*)"
        classification = IARC_df[IARC_df["Agent"].str.match(pattern, case=False, na=False)]
        if not classification.empty:
            break
    return classification

def extract_properties(classification):
    """Pick the most complete classification row and return its fields.

    The ``CAS No.`` and ``Additional information`` columns are dropped,
    ``"nan"`` cells are replaced by ``"missing"``, and the row with the
    fewest ``"missing"`` cells is selected (the first such row on ties).

    Returns:
        ``(agent, group, volume, vol_year, eval_year)`` taken from the five
        remaining columns, in column order.

    Raises:
        ValueError: If the remaining row does not have exactly five columns.
    """
    trimmed = classification.drop(columns=["CAS No.", "Additional information"])
    trimmed = trimmed.replace("nan", "missing")
    missing_per_row = trimmed.eq("missing").sum(axis=1).tolist()
    best_position = missing_per_row.index(min(missing_per_row))
    agent, group, volume, vol_year, eval_year = trimmed.values[best_position]
    return agent, group, volume, vol_year, eval_year

def add_carcinogenicity_data(FT_dict, names, IARC_df):
    """Store the IARC carcinogenicity classification of a compound.

    Args:
        FT_dict: Entry dictionary; the result is stored under
            ``["data"]["toxicity"]["carcinogenicity"]``.
        names: Compound names to look up in the IARC table.
        IARC_df: IARC table as loaded by ``load_IARC_file``.

    Returns:
        The same *FT_dict*. All five fields are ``"missing"`` when no
        classification is found.
    """
    fields = ("IARC_group", "agent", "volume", "vol_year", "eval_year")
    IARC_dict = dict.fromkeys(fields, "missing")
    classification = retrieve_IARC_classification(IARC_df, names)
    if not classification.empty:
        agent, group, volume, vol_year, eval_year = extract_properties(classification)
        IARC_dict.update(
            IARC_group=group,
            agent=agent,
            volume=volume,
            vol_year=vol_year,
            eval_year=str(eval_year),
        )
    FT_dict["data"]["toxicity"]["carcinogenicity"] = IARC_dict
    return FT_dict

## Adding links to toxicity databases

def add_comptox_id(FT_dict, names, api_key):
    """Store the CompTox identifier of a compound.

    Each name is looked up with the exact-match chemical search of the CCTE
    API; the first name with a result is used. The ``dtxcid`` of the first
    result is preferred, falling back to its ``dtxsid`` when that is empty.

    Args:
        FT_dict: Entry dictionary; the ID is stored under
            ``["data"]["toxicity"]["databases"]["comptox"]``.
        names: Compound names to try, in order of preference.
        api_key: Key sent in the ``x-api-key`` header.

    Returns:
        The same *FT_dict*; the ID is ``"missing"`` when no name yields a
        result.

    Raises:
        requests.HTTPError: If the API answers with an error status other
            than 400 (for example a rejected API key), instead of trying to
            parse the error body as search results.
    """
    comptox_id = "missing"
    headers = {
        'accept': 'application/json',
        'x-api-key': api_key
        }
    for name in names:
        url = "https://api-ccte.epa.gov/chemical/search/equal/"+name
        response = requests.get(url, headers=headers)
        if response.status_code == 400:
            # Status 400 is treated as "no hit for this name".
            continue
        # Any other error status is a real failure: report it explicitly.
        response.raise_for_status()
        results = response.json()
        if not results:
            # A successful but empty answer is also "no hit for this name".
            continue
        molecule_data = results[0]
        comptox_id = molecule_data["dtxcid"]
        if not comptox_id:
            comptox_id = molecule_data["dtxsid"]
        break
    FT_dict["data"]["toxicity"]["databases"]["comptox"] = comptox_id
    return FT_dict

def add_ctd_id(FT_dict, names):
    """Store the CTD identifier (MeSH descriptor ID) of a compound.

    Each name is looked up as an exact descriptor label in the MeSH lookup
    service; the first name with a non-empty answer is used, and the ID is
    the last path segment of the first result's ``resource`` URL.

    Args:
        FT_dict: Entry dictionary; the ID is stored under
            ``["data"]["toxicity"]["databases"]["ctd"]``.
        names: Compound names to try, in order of preference.

    Returns:
        The same *FT_dict*; the ID is ``"missing"`` when no name yields a
        result.
    """
    ctd_id = "missing"
    headers = {'accept': 'application/json'}
    for name in names:
        url = "https://id.nlm.nih.gov/mesh/lookup/descriptor?label="+name+"&match=exact"
        hits = requests.get(url, headers=headers).json()
        if not hits:
            continue
        ctd_id = hits[0]["resource"].rsplit('/', 1)[-1]
        break
    FT_dict["data"]["toxicity"]["databases"]["ctd"] = ctd_id
    return FT_dict