Add GitHub Actions workflow for testing and deployment #2

Status: Closed · 3 commits
37 changes: 37 additions & 0 deletions .github/workflows/main.yml
@@ -0,0 +1,37 @@
name: CI

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          # Quote each spec so the shell does not treat '>' as redirection;
          # tensorflow and rdkit are imported by src/main.py and src/model_training.py
          pip install "torch>=2.0.0" "numpy>=1.21.0" "scipy>=1.10.0" \
            "pandas>=2.0.0" "scikit-learn>=1.2.0" "biopython>=1.79" \
            "mdtraj>=1.9.7" "prody>=2.4.0" "biotite>=0.35.0" \
            "plotly>=5.13.0" "py3dmol>=1.8.0" "flask>=2.0.1" \
            "transformers>=4.30.0" "datasets>=2.12.0" "tokenizers>=0.13.0" \
            "tqdm>=4.65.0" "requests>=2.31.0" "openmm>=8.0.0" \
            "fair-esm>=2.0.0" tensorflow rdkit

      - name: Lint code
        run: |
          pip install flake8
          flake8 src tests

      - name: Run tests
        run: |
          pip install pytest
          pytest
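
To reproduce the workflow's checks locally before pushing (a minimal sketch, assuming a virtual environment with the project dependencies is active):

```bash
pip install flake8 pytest
flake8 src tests
pytest
```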
91 changes: 90 additions & 1 deletion README.md
@@ -1 +1,90 @@
# ProtienFlex

## Project Description

ProtienFlex is a project aimed at accelerating drug discovery by combining advanced techniques such as protein folding prediction, virtual screening, and natural language processing (NLP). The primary goal is to reduce the time required for drug discovery and to increase the effectiveness of the compounds identified. This is achieved by pairing 3D structure-prediction models such as AlphaFold with NLP methods for mining human-language data relevant to drug discovery.

## Purpose and Goals

The purpose of ProtienFlex is to streamline the drug discovery process by integrating various cutting-edge technologies. The main goals of the project are:
- To reduce the time required for drug discovery.
- To increase the effectiveness of identified drug compounds.
- To utilize 3D models like AlphaFold for accurate protein folding predictions.
- To employ NLP techniques for processing human language data relevant to drug discovery.

## Setup and Installation

To set up and run the ProtienFlex project, follow these steps:

1. Clone the repository:
```bash
git clone https://github.com/VishwamAI/ProtienFlex.git
cd ProtienFlex
```

2. Create a virtual environment and activate it:
```bash
python3 -m venv env
source env/bin/activate
```

3. Install the required dependencies:
```bash
pip install -r requirements.txt
```

4. Run the main script:
```bash
python src/main.py
```

## Contributing

We welcome contributions to the ProtienFlex project. To contribute, please follow these steps:

1. Fork the repository.
2. Create a new branch for your feature or bugfix.
3. Make your changes and commit them with descriptive messages.
4. Push your changes to your forked repository.
5. Create a pull request to the main repository.

Please ensure that your code adheres to the project's coding standards and includes appropriate tests.
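
A typical command sequence for the steps above (a sketch; `<your-username>` and the branch name are placeholders):

```bash
# Fork on GitHub first, then clone your fork
git clone https://github.com/<your-username>/ProtienFlex.git
cd ProtienFlex
git checkout -b feature/my-change
# ...make your changes, then:
git add -A
git commit -m "Describe the change"
git push origin feature/my-change
# Finally, open a pull request against VishwamAI/ProtienFlex on GitHub
```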

## Using OpenMM and PDB for Protein Development

ProtienFlex leverages OpenMM and PDB for protein development. OpenMM is a high-performance toolkit for molecular simulation, and PDB (Protein Data Bank) files contain 3D structures of proteins. Here are some steps to use OpenMM and PDB in the project:

1. Install OpenMM:
```bash
conda install -c conda-forge openmm
```
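
Alternatively, OpenMM 7.6+ publishes wheels on PyPI, so it can also be installed in a plain virtual environment (this is how the CI workflow above installs it):

```bash
pip install "openmm>=8.0.0"
```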

2. Load a PDB file in OpenMM:
```python
from openmm.app import PDBFile  # 'simtk.openmm' is the deprecated pre-7.6 namespace
pdb = PDBFile('path_to_pdb_file.pdb')
```

3. Create a system and simulation:
```python
from openmm.app import ForceField, Simulation
from openmm import LangevinIntegrator
from openmm.unit import kelvin, picoseconds, femtoseconds

forcefield = ForceField('amber99sb.xml')
system = forcefield.createSystem(pdb.topology)
integrator = LangevinIntegrator(300*kelvin, 1/picoseconds, 2*femtoseconds)
simulation = Simulation(pdb.topology, system, integrator)
simulation.context.setPositions(pdb.positions)
```

4. Run the simulation:
```python
from sys import stdout
from openmm.app import StateDataReporter

simulation.minimizeEnergy()
simulation.reporters.append(StateDataReporter(stdout, 1000, step=True, potentialEnergy=True, temperature=True))
simulation.step(10000)
```
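
To also record coordinates for later analysis, a trajectory reporter can be attached before stepping (a minimal sketch; the filename and reporting interval are arbitrary):

```python
from openmm.app import DCDReporter

simulation.reporters.append(DCDReporter('trajectory.dcd', 1000))
```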

## License

This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
99 changes: 99 additions & 0 deletions src/data_preprocessing.py
@@ -0,0 +1,99 @@
import numpy as np
from sklearn.preprocessing import StandardScaler
from openmm.app import PDBFile, ForceField, Simulation  # 'simtk.*' is the deprecated pre-7.6 namespace
from openmm import LangevinIntegrator
from openmm.unit import kelvin, picoseconds, femtoseconds


def clean_data(df):
    """
    Clean the protein data by handling missing values and duplicates.

    Parameters:
    df (pd.DataFrame): The input protein data.

    Returns:
    pd.DataFrame: The cleaned protein data.
    """
    df = df.drop_duplicates()
    df = df.dropna()
    return df


def normalize_data(df):
    """
    Normalize the protein data using standard scaling.

    Parameters:
    df (pd.DataFrame): The input protein data.

    Returns:
    pd.DataFrame: The normalized protein data.
    """
    scaler = StandardScaler()
    df[df.columns] = scaler.fit_transform(df[df.columns])
    return df


def transform_data(df):
    """
    Transform the protein data for model input.

    Parameters:
    df (pd.DataFrame): The input protein data.

    Returns:
    pd.DataFrame: The transformed protein data.
    """
    # Example transformation: log1p; assumes non-negative input values
    df = np.log1p(df)
    return df


def preprocess_protein_data(df):
    """
    Preprocess the protein data by cleaning, transforming, and normalizing it.

    Parameters:
    df (pd.DataFrame): The input protein data.

    Returns:
    pd.DataFrame: The preprocessed protein data.
    """
    df = clean_data(df)
    # Log-transform before scaling: standard-scaled values can drop below -1,
    # which would make a subsequent log1p transform undefined
    df = transform_data(df)
    df = normalize_data(df)
    return df


def parse_pdb_file(file_path):
    """
    Parse a PDB file and return the PDB object.

    Parameters:
    file_path (str): The path to the PDB file.

    Returns:
    PDBFile: The parsed PDB object.
    """
    pdb = PDBFile(file_path)
    return pdb


def create_simulation(pdb):
    """
    Create a simulation using the PDB object.

    Parameters:
    pdb (PDBFile): The PDB object.

    Returns:
    Simulation: The created simulation object.
    """
    forcefield = ForceField('amber99sb.xml')
    system = forcefield.createSystem(pdb.topology)
    integrator = LangevinIntegrator(300*kelvin, 1/picoseconds, 2*femtoseconds)
    simulation = Simulation(pdb.topology, system, integrator)
    simulation.context.setPositions(pdb.positions)
    return simulation
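
A usage sketch for the preprocessing pipeline above, with a made-up numeric feature table (the column names are illustrative, not part of the project):

```python
import pandas as pd
from src.data_preprocessing import preprocess_protein_data

# Hypothetical non-negative feature columns; real input would come from a CSV
df = pd.DataFrame({
    'hydrophobicity': [0.2, 0.8, 0.5, 0.8],
    'molecular_weight': [110.0, 156.0, 131.0, 156.0],
})
print(preprocess_protein_data(df))  # cleaned, log-transformed, standard-scaled
```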
49 changes: 49 additions & 0 deletions src/main.py
@@ -0,0 +1,49 @@
import pandas as pd
import numpy as np
from src.data_preprocessing import preprocess_protein_data
from src.model_training import train_and_evaluate_model
from src.nlp_processing import process_text
from src.virtual_screening import virtual_screening
from rdkit import Chem
from rdkit.Chem import Descriptors


def main():
    # Load protein data
    protein_data = pd.read_csv('data/protein_data.csv')

    # Preprocess protein data
    preprocessed_data = preprocess_protein_data(protein_data)

    # Load labels for model training
    labels = np.load('data/labels.npy')

    # Train and evaluate the 3D model
    evaluation_metrics = train_and_evaluate_model(preprocessed_data, labels)
    print("Model Evaluation Metrics:", evaluation_metrics)

    # Load and process text data
    with open('data/text_data.txt', 'r') as file:
        text_data = file.read()
    text_embeddings = process_text(text_data)
    print("Text Embeddings:", text_embeddings)

    # Load protein structure for virtual screening
    protein_structure = Chem.MolFromPDBFile('data/protein_structure.pdb')

    # Load potential drug compounds
    with open('data/compounds.smi') as f:
        compounds = [Chem.MolFromSmiles(smiles) for smiles in f.read().splitlines()]

    # Define selection criteria for compounds
    criteria = {
        'MW': lambda x: Descriptors.MolWt(x) < 500,
        'LogP': lambda x: Descriptors.MolLogP(x) < 5
    }

    # Perform virtual screening
    scored_compounds = virtual_screening(protein_structure, compounds, criteria)
    print("Scored Compounds:", scored_compounds)


if __name__ == "__main__":
    main()
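
`virtual_screening` is imported from `src/virtual_screening.py`, which is not part of this diff. A minimal sketch of a compatible helper, assuming it filters compounds by the criteria dict and returns (SMILES, score) pairs — here scored with RDKit's QED drug-likeness as a stand-in for real docking against `protein_structure`:

```python
from rdkit import Chem
from rdkit.Chem import QED


def virtual_screening(protein_structure, compounds, criteria):
    """Filter compounds by criteria, then score survivors (placeholder scoring)."""
    scored = []
    for mol in compounds:
        if mol is None:  # skip SMILES that RDKit could not parse
            continue
        if all(passes(mol) for passes in criteria.values()):
            # Placeholder: QED score; a real pipeline would dock the
            # compound against protein_structure instead
            scored.append((Chem.MolToSmiles(mol), QED.qed(mol)))
    return sorted(scored, key=lambda pair: pair[1], reverse=True)
```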
89 changes: 89 additions & 0 deletions src/model_training.py
@@ -0,0 +1,89 @@
import tensorflow as tf
from sklearn.model_selection import train_test_split


def build_model(input_shape):
    """
    Build a 3D model for protein folding using a neural network.

    Parameters:
    input_shape (tuple): The shape of the input data.

    Returns:
    tf.keras.Model: The constructed neural network model.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=input_shape),
        tf.keras.layers.Conv3D(32, (3, 3, 3), activation='relu'),
        tf.keras.layers.MaxPooling3D((2, 2, 2)),
        tf.keras.layers.Conv3D(64, (3, 3, 3), activation='relu'),
        tf.keras.layers.MaxPooling3D((2, 2, 2)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


def train_model(model, X_train, y_train, X_val, y_val, epochs=10, batch_size=32):
    """
    Train the 3D model using the training data.

    Parameters:
    model (tf.keras.Model): The neural network model to be trained.
    X_train (np.ndarray): The training data.
    y_train (np.ndarray): The training labels.
    X_val (np.ndarray): The validation data.
    y_val (np.ndarray): The validation labels.
    epochs (int): The number of epochs to train the model.
    batch_size (int): The batch size for training.

    Returns:
    tf.keras.callbacks.History: The training history of the model.
    """
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=epochs, batch_size=batch_size)
    return history


def evaluate_model(model, X_test, y_test):
    """
    Evaluate the trained model using the test data.

    Parameters:
    model (tf.keras.Model): The trained neural network model.
    X_test (np.ndarray): The test data.
    y_test (np.ndarray): The test labels.

    Returns:
    dict: The evaluation metrics of the model.
    """
    evaluation = model.evaluate(X_test, y_test)
    return dict(zip(model.metrics_names, evaluation))


def train_and_evaluate_model(data, labels, test_size=0.2, val_size=0.2, epochs=10, batch_size=32):
    """
    Train and evaluate the 3D model using the provided data and labels.

    Parameters:
    data (np.ndarray): The input data.
    labels (np.ndarray): The labels for the input data.
    test_size (float): The proportion of the data to be used as test data.
    val_size (float): The proportion of the training data to be used as validation data.
    epochs (int): The number of epochs to train the model.
    batch_size (int): The batch size for training.

    Returns:
    dict: The evaluation metrics of the model.
    """
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=test_size, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size, random_state=42)

    input_shape = X_train.shape[1:]
    model = build_model(input_shape)

    train_model(model, X_train, y_train, X_val, y_val, epochs=epochs, batch_size=batch_size)
    evaluation_metrics = evaluate_model(model, X_test, y_test)

    return evaluation_metrics
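
A quick smoke-test sketch for the training pipeline above, using random synthetic volumes (the shapes and sample count are arbitrary):

```python
import numpy as np
from src.model_training import train_and_evaluate_model

# 20 synthetic 3D volumes of shape (16, 16, 16, 1) with binary labels
data = np.random.rand(20, 16, 16, 16, 1).astype('float32')
labels = np.random.randint(0, 2, size=20)

metrics = train_and_evaluate_model(data, labels, epochs=1)
print(metrics)  # e.g. {'loss': ..., 'accuracy': ...}
```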