Merge pull request #1 from asapdiscovery/tweaks_for_demo

hmacdope · web-flow · commit b6b993e1a954 · 2024-09-24T22:29:46.000+10:00
Tweaks
diff --git a/app.py b/app.py
@@ -4,8 +4,10 @@
 asap_prod_streamlit = int(os.getenv("ASAP_OE_PROD_STREAMLIT", None))
 
 if asap_prod_streamlit == 1:
+
     def sort_out_openeye_license():
         import os
+
         # need to write the license file to disk
         txt = st.secrets.openeye_credentials.license_file_txt
         if not txt:
@@ -18,11 +20,11 @@ def sort_out_openeye_license():
     sort_out_openeye_license()
 
 
-
 import pandas as pd
 import numpy as np
 from asapdiscovery.ml.inference import GATInference
 from asapdiscovery.ml.models import ASAPMLModelRegistry
+import matplotlib.pyplot as plt
 from rdkit import Chem
 from streamlit_ketcher import st_ketcher
 from io import StringIO
@@ -31,6 +33,7 @@ def sort_out_openeye_license():
 # need to update the registry periodically
 schedule.every(4).hours.do(ASAPMLModelRegistry.update_registry)
 
+
 def _is_valid_smiles(smi):
     if smi is None or smi == "":
         return False
@@ -42,9 +45,11 @@ def _is_valid_smiles(smi):
             return True
     except:
         return False
-    
+
+
 def sdf_str_to_rdkit_mol(sdf):
     from io import BytesIO
+
     bio = BytesIO(sdf.encode())
     suppl = Chem.ForwardSDMolSupplier(bio, removeHs=False)
     mols = [mol for mol in suppl if mol is not None]
@@ -57,39 +62,46 @@ def convert_df(df):
     return df.to_csv().encode("utf-8")
 
 
-
-    
-
 # Set the title of the Streamlit app
-st.title('ASAPDiscovery Machine Learning')
+st.title("ASAP Discovery Local Models (ML)")
 
-st.markdown("## Intro")
+st.markdown("## Background")
 
-st.markdown("The [ASAPDiscovery antiviral drug discovery consortium](https://asapdiscovery.org) has developed a series of machine learning models (primarily Graph Attention Networks (GATs)) to predict molecular properties based on our experimental data, much of which is [available](https://asapdiscovery.org/outputs/) as part of our [open science](https://asapdiscovery.org/open-science/) and public disclosure policy.")
-st.markdown("These models are trained on a variety of endpoints, including in-vitro activity, assayed LogD, and more \n Some models are specific to a target, while others are global models that predict properties across all targets.")
-st.markdown("This web app gives you easy API-less access to the models, I hope you find it useful!\n As scientists we should always be looking to get our models into people's hands as easily as possible.")
-st.markdown("These models are trained bi-weekly. The latest models are used for prediction where possible. Note that predictions are pre-alpha and are provided as is, we are still working very actively on improving and validating models.")
+st.markdown(
+    "**The [ASAP Discovery antiviral drug discovery consortium](https://asapdiscovery.org) has developed a series of local machine learning models (GAT architecture) to predict properties based on our local data, much of which is [available](https://asapdiscovery.org/outputs/) as part of our [open science policy](https://asapdiscovery.org/open-science/).**"
+)
+st.markdown(
+    "**These models are trained on a variety of experimental endpoints that are found in ASAP's CDD vault, including biochemical and antiviral potency, assayed LogD, and more. Some models are specific to a target, while others are global models that predict properties across all targets.**"
+)
+st.markdown(
+    "This web app gives you easy access to the trained models without having to write or execute any code. The intention is to empower anyone across ASAP to make these predictions."
+)
+st.markdown("---")
+st.markdown(
+    "These models are trained bi-weekly. The latest models are used for prediction where possible. Note that predictions are pre-alpha and are provided as is, work is on-going on improving and validating these models. As a general rule of thumb, predictions on your data will be better when your query compound(s) is/are closer chemically to the compounds in the CDD. Are you having problems using this UI or do you have a feature request? Please open an issue on [our issue tracker](https://github.com/asapdiscovery/asap-ml-streamlit/issues/new)."
+)
 
-st.markdown("## Select input")
+st.markdown("## Input :clipboard:")
 
 
-input = st.selectbox("How would you like to enter your input?", ["Upload a CSV file", "Draw a molecule", "Enter SMILES", "Upload an SDF file"])
+input = st.selectbox(
+    "How would you like to enter your input?",
+    ["Upload a CSV file", "Draw a molecule", "Enter SMILES", "Upload an SDF file"],
+)
 
 multismiles = False
 if input == "Draw a molecule":
-    st.write("Draw a molecule")
     smiles = st_ketcher(None)
     if _is_valid_smiles(smiles):
-        st.success("Valid SMILES string", icon="✅")
+        st.success("Valid molecule", icon="✅")
     else:
-        st.error("Invalid SMILES string", icon="🚨")
+        st.error("Invalid molecule", icon="🚨")
         st.stop()
     smiles = [smiles]
     df = pd.DataFrame(smiles, columns=["SMILES"])
     column = "SMILES"
     smiles_column = df["SMILES"]
 elif input == "Enter SMILES":
-    st.write("Enter SMILES")
     smiles = st.text_input("Enter a SMILES string")
     if _is_valid_smiles(smiles):
         st.success("Valid SMILES string", icon="✅")
@@ -101,10 +113,10 @@ def convert_df(df):
     column = "SMILES"
     smiles_column = df["SMILES"]
 elif input == "Upload a CSV file":
-    st.write("Upload a CSV file")
-
     # Create a file uploader for CSV files
-    uploaded_file = st.file_uploader("Choose a CSV file to upload your predictions to", type="csv")
+    uploaded_file = st.file_uploader(
+        "Choose a CSV file containing the SMILES strings to predict on", type="csv"
+    )
 
     # If a file is uploaded, parse it into a DataFrame
     if uploaded_file is not None:
@@ -119,14 +131,20 @@ def convert_df(df):
     # check if the smiles are valid
     valid_smiles = [_is_valid_smiles(smi) for smi in smiles_column]
     if not all(valid_smiles):
-        st.error("Some of the SMILES strings are invalid, please check the input", icon="🚨")
+        st.error(
+            "Some of the SMILES strings are invalid, please check the input", icon="🚨"
+        )
         st.stop()
-    st.success("All SMILES strings are valid, proceeding with prediction", icon="✅")
+    st.success(
+        f"All SMILES strings are valid (n={len(valid_smiles)}), proceeding with prediction",
+        icon="✅",
+    )
 
 elif input == "Upload an SDF file":
-    st.write("Upload an SDF file")
     # Create a file uploader for SDF files
-    uploaded_file = st.file_uploader("Choose a SDF file to upload your predictions to", type="sdf")
+    uploaded_file = st.file_uploader(
+        "Choose an SDF file containing the molecules to predict on", type="sdf"
+    )
     # read with rdkit
     if uploaded_file is not None:
         # To convert to a string based IO:
@@ -136,17 +154,20 @@ def convert_df(df):
         mols = sdf_str_to_rdkit_mol(string_data)
         smiles = [Chem.MolToSmiles(m) for m in mols]
         df = pd.DataFrame(smiles, columns=["SMILES"])
-            # st.error("Error reading the SDF file, please check the input", icon="🚨")
-            # st.stop()
+        # st.error("Error reading the SDF file, please check the input", icon="🚨")
+        # st.stop()
     else:
         st.stop()
-    
-    st.success("All SMILES strings are valid, proceeding with prediction", icon="✅")
+
+    st.success(
+        f"All molecule entries are valid (n={len(df)}), proceeding with prediction",
+        icon="✅",
+    )
     column = "SMILES"
     smiles_column = df["SMILES"]
     multismiles = True
 
-st.markdown("## Select your model parameters")
+st.markdown("## Model parameters :nut_and_bolt:")
 
 
 targets = ASAPMLModelRegistry.get_targets_with_models()
@@ -167,71 +188,129 @@ def convert_df(df):
     _target = target_value
     _target_str = target_value
 # Get the latest model for the target and endpoint
-model = ASAPMLModelRegistry.get_latest_model_for_target_type_and_endpoint(_target, "GAT", endpoint_value)
+model = ASAPMLModelRegistry.get_latest_model_for_target_type_and_endpoint(
+    _target, "GAT", endpoint_value
+)
 if model is None:
     st.write(f"No model found for {target_value} {endpoint_value}")
     st.stop()
     # retry with a different target or endpoint
 
-st.markdown("## Prediction time 🚀")
+st.markdown("## Prediction 🚀")
 
 
-st.write(f"Predicting {_target_str} {endpoint_value} using model {model.name}")
+st.write(
+    f"Predicting **{_target_str} {endpoint_value}** using model:\n\n `{model.name}`"
+)
 # Create a GATInference object from the model
 infr = GATInference.from_ml_model_spec(model)
 if infr.is_ensemble:
-    st.write(f"Ensemble model with {len(model.models)} models, will estimate uncertainty using ensemble variance")
+    st.write(
+        f"_Using ensemble model (n={len(model.models)}); estimating uncertainty as variance of predictions._"
+    )
 # Predict the property value for each SMILES string
-predictions = [infr.predict_from_smiles(smiles, return_err=True) for smiles in smiles_column]
+predictions = [
+    infr.predict_from_smiles(smiles, return_err=True) for smiles in smiles_column
+]
 predictions = np.asarray(predictions)
 # check if second column is all np.nan
 if np.all(np.isnan(predictions[:, 1])):
     preds = predictions[:, 0]
     err = None
 else:
     preds = predictions[:, 0]
-    err = predictions[:, 1] # rejoin with the original dataframe
+    err = predictions[:, 1]  # rejoin with the original dataframe
 
 
-df["predictions"] = preds
-df["prediction_error"] = err
-
-# sort the dataframe by predictions
-df = df.sort_values(by="predictions", ascending=False)
+pred_column_name = f"{_target_str}_computed-{endpoint_value}"
+unc_column_name = f"{_target_str}_computed-{endpoint_value}_uncertainty"
+df[pred_column_name] = preds
+df[unc_column_name] = err
 
+st.markdown("---")
 if multismiles:
     # plot the predictions and errors
-    st.scatter_chart(df, x=column, y="predictions", color="prediction_error", use_container_width=True, x_label="SMILES", y_label=f"Predicted {_target_str} {endpoint_value} ")
+    # Histogram of the predicted values first
+    sorted_df = df.sort_values(by=pred_column_name)
+
+    # roughly one bin per ten compounds, with a floor of five bins so the
+    # histogram stays interpretable when only a few molecules were supplied
+    n_bins = max(len(sorted_df) // 10, 5)
+
+    fig, ax = plt.subplots()
+    ax.hist(sorted_df[pred_column_name], bins=n_bins)
+
+    ax.set_title(f"Histogram of computed {endpoint_value} for target: {_target_str}")
+    ax.set_xlabel(f"Computed {endpoint_value}")
+    ax.set_ylabel("Count")
+
+    st.pyplot(fig)
+
+    # then a barplot: one bar per query compound, in sorted_df's row order
+    fig, ax = plt.subplots()
+
+    ax.bar(range(len(sorted_df)), sorted_df[pred_column_name])
+
+    ax.set_xticks([])
+    ax.set_xlabel("Query compounds")
+    ax.set_ylabel(f"Computed {endpoint_value}")
+
+    ax.set_title(f"Barplot of computed {endpoint_value} for target: {_target_str}")
+
+    st.pyplot(fig)
+
+    if endpoint_value == "pIC50":
+        from rdkit.Chem.Descriptors import MolWt
+        import seaborn as sns
+
+        # scatterplot of prediction vs MW; MW follows df's row order, not sorted_df's
+        df["MW"] = [MolWt(Chem.MolFromSmiles(smi)) for smi in df["SMILES"]]
+        fig, ax = plt.subplots()
+
+        ax = sns.scatterplot(
+            x="MW", y=pred_column_name, hue=unc_column_name, palette="coolwarm", data=df
+        )
+        # colour scale spanning the observed uncertainty range, for the colorbar
+        norm = plt.Normalize(df[unc_column_name].min(), df[unc_column_name].max())
+        sm = plt.cm.ScalarMappable(cmap="coolwarm", norm=norm)
+        sm.set_array([])
+
+        # Remove the legend and add a colorbar with a hand-placed label
+        ax.get_legend().remove()
+        ax.figure.colorbar(sm, ax=ax)
+        ax.annotate(
+            f"Computed {endpoint_value} uncertainty",
+            xy=(1.2, 0.3),
+            xycoords="axes fraction",
+            rotation=270,
+        )
+
+        ax.set_title(
+            f"Scatterplot of predicted {endpoint_value} versus MW\ntarget: {_target_str}"
+        )
+        ax.set_xlabel("Molecular weight (Da)")
+        ax.set_ylabel(f"Computed {endpoint_value}")
+        st.pyplot(fig)
 
 else:
     # just print the prediction
-    preds = df["predictions"].values[0]
+    preds = df[pred_column_name].values[0]
     smiles = df["SMILES"].values[0]
     if err:
-        err = df["prediction_error"].values[0]
+        err = df[unc_column_name].values[0]
         errstr = f"± {err:.2f}"
     else:
         errstr = ""
-    
-    st.markdown("### 🕵️")
-    st.markdown(f"Predicted {_target_str} {endpoint_value} for {smiles} is {preds:.2f} {errstr} using model {infr.model_name}")
+
+    st.markdown(
+        f"Predicted {_target_str} {endpoint_value} for {smiles} is {preds:.2f} {errstr}."
+    )
 
 # allow the user to download the predictions
 csv = convert_df(df)
 st.download_button(
-     label="Download data as CSV",
-     data=csv,
-     file_name=f"predictions_{model.name}.csv",
-     mime="text/csv",
- )
-
-
-
-    
-    
-
-
-
-
-# else:
-#     st.write("Please upload a CSV file to view its contents.")
+    label="Download data as CSV",
+    data=csv,
+    file_name=f"predictions_{model.name}.csv",
+    mime="text/csv",
+)