got logoplots to work for HTML generation, now need to do ints and surface
asapdiscovery · Apr 16, 2024 · ca68a13 · ca68a13
1 parent 9fdd1e8
commit ca68a13
Show file tree

Hide file tree

Showing 9 changed files with 16,127 additions and 11 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/choppa/.DS_Store b/choppa/.DS_Store
diff --git a/choppa/render/.DS_Store b/choppa/render/.DS_Store
diff --git a/choppa/render/Template.html b/choppa/render/Template.html
diff --git a/choppa/render/logoplots.py b/choppa/render/logoplots.py
@@ -72,17 +72,20 @@ def divide_fitness_types(self):
 
         return {wildtype: wildtype_fitness}, unfit_mutants, fit_mutants
 
-    def render_logoplot(self, mutants, global_min_confidence=False, global_max_confidence=False, lhs=True):
+    def render_logoplot(self, mutants, global_min_confidence=False, global_max_confidence=False, lhs=True, wildtype=False):
         """
         Creates a logoplot as a base64 string. Also annotes with confidence values if present.
 
         TODO: nicer rounded ticks agnostic to array limits
         """  
         if len(mutants) == 0:
-            # this can happen when there are no mutants in this category. Return an empty base64 instead.
-            return ""
+            # this can happen when there are no mutants in this category. Return an empty white-sqare base64 instead.
+            return "iVBORw0KGgoAAAANSUhEUgAAAJYAAACfCAIAAACUbLd9AAAACXBIWXMAAAsTAAALEwEAmpwYAAABhElEQVR4nO3RwQkAIBDAMHX/nc8hfEghmaDQPTOLsvM7gFcW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5lmYZ2GehXkW5l1WGwQ7i50I0AAAAABJRU5ErkJggg=="
         plt.switch_backend('Agg') # prevents plt from opening a figure on OS
-        _, ax = plt.subplots(figsize=(3, 10))
+        if wildtype: # we want this to be a bit smaller and square because it'll always have 1 residue.
+            _, ax = plt.subplots(figsize=(4, 4))
+        else:
+            _, ax = plt.subplots(figsize=(3, 10))
 
         # if there are confidences, we well color the logoplot AA letters by confidence and
         # show a color bar if this is the left-hand-side logoplot. 
@@ -147,9 +150,9 @@ def build_logoplot(self, global_min_confidence=False, global_max_confidence=Fals
 
         # generate the logoplot base64 for wildtype (LHS, top), fit (LHS, bottom) and unfit (RHS; with colorbar) 
         wildtype_base64 = self.render_logoplot(wildtype, global_min_confidence=global_min_confidence, 
-                             global_max_confidence=global_max_confidence, lhs=True)
+                             global_max_confidence=global_max_confidence, wildtype=True)
         fit_base64 = self.render_logoplot(fit_mutants, global_min_confidence=global_min_confidence, 
-                             global_max_confidence=global_max_confidence, lhs=True)
+                             global_max_confidence=global_max_confidence)
         unfit_base64 = self.render_logoplot(unfit_mutants, global_min_confidence=global_min_confidence, 
                              global_max_confidence=global_max_confidence, lhs=False)
 

diff --git a/choppa/render/out.html b/choppa/render/out.html
diff --git a/choppa/render/pose.html b/choppa/render/pose.html
diff --git a/choppa/render/render.py b/choppa/render/render.py
@@ -6,7 +6,7 @@
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from tqdm import tqdm
 
-from choppa.render.utils import show_contacts, get_ligand_resnames_from_pdb_str
+from choppa.render.utils import show_contacts, get_ligand_resnames_from_pdb_str, split_pdb_str
 from choppa.render.logoplots import LogoPlot
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
 logger = logging.getLogger()
@@ -316,7 +316,66 @@ def get_logoplot_dict(self, confidence_lims, multiprocess=False):
                     }}
 
         return logoplot_dict
-
+
+    # also add in interactions dict
+
+
+    def inject_stuff_in_template(self, sdf_str, pdb_str, logoplot_dict, template="Template.html", out_file="out.html"):
+        """
+        Replaces parts of a template HTML with relevant bits of data to get to a HTML view
+        of the (ligand-) protein, its fitness and its interactions (if any).
+
+        Every line of `template` is copied to `out_file` with three placeholders substituted:
+        `{{PDB_INSERT}}`, `{{SDF_INSERT}}` and `{{LOGOPLOTS_INSERTS}}` (one hidden DIV per
+        wildtype/fit/unfit logoplot of every entry in `logoplot_dict`).
+
+        :param sdf_str: text substituted for `{{SDF_INSERT}}`.
+        :param pdb_str: text substituted for `{{PDB_INSERT}}`.
+        :param logoplot_dict: mapping whose values each hold a `fitness_aligned_index` and a
+            `logoplots_base64` dict with `wildtype`, `fit` and `unfit` entries (base64 as bytes or str).
+        :param template: path of the template HTML file to read.
+        :param out_file: path of the HTML file to write.
+        :return: None, the result is written to `out_file`.
+
+        TODO: HMO to replace this crude replacement code with `jinja`.
+        """
+        # (key in `logoplots_base64`, tag used for the DIV class/id and the image alt text).
+        # DIVs are emitted per residue in the order wildtype -> fit -> unfit.
+        logoplot_kinds = (("wildtype", "wt"), ("fit", "fit"), ("unfit", "unfit"))
+
+        # create a bunch of DIVs of the logoplots; collect the pieces and join once at the end.
+        div_chunks = []
+        for logoplot_data in logoplot_dict.values():
+            residue_index = logoplot_data['fitness_aligned_index']
+            for base64_key, tag in logoplot_kinds:
+                base64_data = logoplot_data['logoplots_base64'][base64_key]
+                if isinstance(base64_data, bytes):
+                    # decode real bytes instead of stringifying them into "b'...'"
+                    base64_data = base64_data.decode("ascii")
+                # still scrub an already-stringified bytes repr; a quote never occurs in valid base64.
+                base64_text = str(base64_data).replace("b'", "").replace("'", "")
+                # NB: single quotes outside, double quotes inside, because the HTML/JS side needs double quotes.
+                div_chunks.append(
+                    f'<div class="logoplotbox_{tag}" id="{tag}DIV_{residue_index}" style="display:none">\n'
+                    f'  <img alt="{tag} residue logoplot" src="data:image/png;base64,{base64_text}" />\n'
+                    '</div>\n'
+                )
+        logoplot_divs = "".join(div_chunks)
+
+        # add the PDB (protein), the SDF (ligand) and the logoplot DIVs, line by line.
+        with open(template, "rt", encoding="utf-8") as fin, open(out_file, "wt", encoding="utf-8") as fout:
+            for line in fin:
+                line = line.replace("{{PDB_INSERT}}", str(pdb_str))
+                line = line.replace("{{SDF_INSERT}}", str(sdf_str))
+                line = line.replace("{{LOGOPLOTS_INSERTS}}", logoplot_divs)
+
+                # add in interactions
+                fout.write(line)
+
+        # then add interactions
+
+        # then add surface coloring
+
+
+
 
 
     def render(self):
@@ -330,8 +389,10 @@ def render(self):
         logoplot_dict = self.get_logoplot_dict(confidence_lims)
 
         # get the strings for the PDB (prot) and the SDF (lig, if present) 
+        lig_sdf_str, prot_pdb_str = split_pdb_str(self.complex_pdb_str)
 
         # do a dirty HTML generation using the logoplot and fitness dicts.
+        self.inject_stuff_in_template(lig_sdf_str, prot_pdb_str, logoplot_dict)
 
 
 
@@ -342,8 +403,8 @@ def render(self):
 
     from choppa.IO.input import FitnessFactory, ComplexFactory
 
-    fitness_dict = FitnessFactory(TOY_FITNESS_DATA_SECTIONED, 
-                                    # confidence_colname="confidence"
+    fitness_dict = FitnessFactory(TOY_FITNESS_DATA_COMPLETE, 
+                                    confidence_colname="confidence"
                                     ).get_fitness_basedict()
     complex = ComplexFactory(TOY_COMPLEX).load_pdb()
     complex_rdkit = ComplexFactory(TOY_COMPLEX).load_pdb_rdkit()

diff --git a/choppa/render/utils.py b/choppa/render/utils.py
@@ -1,7 +1,12 @@
 import MDAnalysis
 from MDAnalysis.lib.util import NamedStream
+import pymol2
+from rdkit import Chem
+
+import os
 from io import StringIO
 import warnings
+import tempfile
 
 def get_ligand_resnames_from_pdb_str(PDB_str, remove_solvent=True):
     """
@@ -20,6 +25,66 @@ def get_ligand_resnames_from_pdb_str(PDB_str, remove_solvent=True):
     resnames = set(ag.resnames)
     return list(resnames)
 
+def get_pdb_components(PDB_str, remove_solvent=True):
+    """
+    Split a protein-ligand pdb into protein and ligand components
+    :param PDB_str: contents of a PDB file as a string; loaded into MDAnalysis as "complex.pdb".
+    :param remove_solvent: when True a hydrogen/water-oxygen-free selection is computed, but see
+        the NOTE(review) in the body: that selection does not feed into the returned values.
+    :return: tuple `(ligand, protein)` holding the "not protein" and "protein" selections.
+    """
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore") # hides MDA RunTimeWarning that complains about string IO
+        u = MDAnalysis.Universe(NamedStream(StringIO(PDB_str), "complex.pdb"))
+
+    if remove_solvent:
+        # NOTE(review): `ag` is never read afterwards, so `remove_solvent` currently has no effect
+        # on what is returned; presumably the ligand selection was meant to use it -- confirm.
+        ag = u.select_atoms("not (name H* or type OW)")
+
+    # everything that the "protein" selection does not match is returned as the ligand
+    ligand = u.select_atoms("not protein")
+    protein = u.select_atoms("protein")
+
+    return ligand, protein
+
+
+def process_ligand(ligand):
+    """
+    Convert a ligand selection from an MDA universe into an SDF string via PyMOL.
+    1. write the ligand to a temporary PDB file
+    2. load that PDB into a PyMOL session (PyMOL does the bond guessing)
+    3. save the session as SDF and read the file back as a string
+
+    :param ligand: selection supporting `len()` and `.write(path)`, e.g. an MDAnalysis AtomGroup.
+    :return: contents of the SDF file as a string; an empty string when `ligand` has no atoms.
+    """
+    if len(ligand) == 0:
+        # no ligand in the complex: there is nothing to write to disk or hand to PyMOL.
+        return ""
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        pdb_path = os.path.join(tmpdirname, "lig_tmp_while_hmo_helps_write_to_stream.pdb")
+        sdf_path = os.path.join(tmpdirname, "lig_tmp_while_hmo_helps_write_to_stream.sdf")
+        ligand.write(pdb_path)
+
+        p = pymol2.PyMOL()
+        p.start()
+        try:
+            p.cmd.load(pdb_path)
+            p.cmd.save(sdf_path, "all", 0) # writes all states, so should be able to handle multi-ligand
+        finally:
+            # always shut the PyMOL session down, also when load/save raises
+            p.stop()
+
+        with open(sdf_path, "r") as f:
+            string = f.read()
+    return string
+
+def process_protein(protein):
+    """
+    Serialise the protein selection of an MDA universe to PDB text.
+
+    The selection is written to a PDB file inside a temporary directory and the
+    file is read straight back.
+
+    :param protein: object exposing `.write(path)`, e.g. an MDAnalysis AtomGroup.
+    :return: contents of the written PDB file as a string.
+    """
+    with tempfile.TemporaryDirectory() as scratch_dir:
+        pdb_path = f"{scratch_dir}/prot_tmp_while_hmo_helps_write_to_stream.pdb"
+        protein.write(pdb_path)
+        with open(pdb_path, "r") as handle:
+            pdb_text = handle.read()
+    return pdb_text
+
+def split_pdb_str(PDB_str):
+    """
+    Turn a complex PDB string into a ligand SDF string and a protein PDB string.
+
+    The PDB string is split with `get_pdb_components`; the ligand part goes through
+    `process_ligand` and the protein part through `process_protein`.
+
+    Inspired by https://gist.github.com/PatWalters/c046fee2760e6894ed13e19b8c99193b
+    TODO: set below functions through NamedStream instead of tmpdir
+
+    :param PDB_str: contents of the complex PDB file as a string.
+    :return: tuple `(ligand_sdf_str, protein_pdb_str)`.
+    """
+    ligand_atoms, protein_atoms = get_pdb_components(PDB_str)
+    ligand_sdf_str = process_ligand(ligand_atoms)
+    protein_pdb_str = process_protein(protein_atoms)
+    return ligand_sdf_str, protein_pdb_str
+
 def show_contacts(
     pymol_instance,
     selection_residues,
@@ -69,4 +134,5 @@ def show_contacts(
     pymol_instance.cmd.set("dash_color", "green", contacts_name)
     pymol_instance.cmd.hide("labels", contacts_name) 
 
-    return True
+    return True
+