Added MOFid-v2 analysis

snurr-group · Dec 5, 2024 · 5c1b7d3 · 5c1b7d3
1 parent 419186b
commit 5c1b7d3
Show file tree

Hide file tree

Showing 29 changed files with 141,941 additions and 0 deletions.
diff --git a/mofid-v2/README.md b/mofid-v2/README.md
@@ -0,0 +1,15 @@
+# MOFid-v2 Conversion Scripts
+
+## Overview
+This repository contains scripts used to extract and convert the results of the MOFid software to MOFid-v2 format. These scripts are designed to streamline the process of transitioning from the original MOFid to the updated MOFid-v2 representation.
+
+For detailed information about MOFid-v2, please refer to the CoRE MOF 2024 publication.
+
+## Requirements/Dependencies
+The following Python libraries are required to run the scripts:
+- [ASE](https://wiki.fysik.dtu.dk/ase/) `>= 3.22.1`
+- [pymatgen] (https://pymatgen.org/) `>=2024.2.8`
+- [Open Babel](http://openbabel.org/wiki/Main_Page) `>= 3.1.1`
+- [NetworkX](https://networkx.org/) `>= 3.0`
+
+Ensure that these dependencies are installed before using the scripts.
diff --git a/mofid-v2/analysis_of_nodes_and_linkers/.group_linkers_by_SMILES.py.swp b/mofid-v2/analysis_of_nodes_and_linkers/.group_linkers_by_SMILES.py.swp
diff --git a/mofid-v2/analysis_of_nodes_and_linkers/README.md b/mofid-v2/analysis_of_nodes_and_linkers/README.md
@@ -0,0 +1,56 @@
+# MOFid-v2 Conversion and Analysis Scripts
+
+## Overview
+This repository contains scripts for processing and analyzing CoRE MOF 2024 structures using the MOFid software. The scripts enable the decomposition of CIF files into building blocks, such as nodes and linkers, followed by grouping and further characterization based on their chemical compositions.
+
+## Instructions for Using the Scripts
+
+### Step 1: Extracting Building Blocks
+Using the MOFid software, we processed all CIF files from the CoRE MOF 2024 dataset. This process generated:
+- **MOFid-v1 identifiers** for the MOFs.
+- Decomposition of CIF files into their constituent building blocks (nodes and linkers).
+
+### Step 2: Processing Nodes
+1. **Splitting Nodes into Individual `.xyz` Files**:
+   - The **AllNode method** from MOFid provides the coordinates of all nodes in a MOF.
+   - To simplify analysis, these node files were split into individual `.xyz` files based on their chemical compositions using the script:
+     ```bash
+     python split_nodes_to_xyz.py
+     ```
+
+2. **Grouping Nodes by Composition**:
+   - After generating `.xyz` files for all nodes, the nodes were grouped by their chemical compositions using:
+     ```bash
+     python group_node_by_composition.py
+     ```
+
+3. **Characterizing Node Types**:
+   - Further analysis was performed to classify nodes into different types based on their structures using:
+      ```bash
+     python group_node_by_structure.py
+     ```  
+
+### Step 3: Processing Linkers
+1. **Splitting Linkers into Individual `.xyz` Files**:
+   - The **MetalOxo method** from MOFid provides the coordinates of linkers.
+   - These linkers were split into individual `.xyz` files based on their chemical compositions using:
+     ```bash
+     python split_linkers_to_xyz.py
+     ```
+
+2. **Grouping Linkers by SMILES String**:
+   - Using the output of MOFid, the linkers were grouped by their canonical SMILES representation as implemented in OpenBabel:
+     ```bash
+     python group_linkers_by_SMILES.py
+     ```
+
+3. **Converting Linkers to XYZ Files**:
+   - Using OpenBabel, the SMILES strings were reconverted to XYZ format using:
+     ```bash
+     python convert_smiles_to_xyz.py
+     ```
+
+## Notes
+- Ensure that all necessary dependencies and input files are correctly configured before running the scripts.
+- Refer to the CoRE MOF 2024 publication for detailed descriptions of the dataset and methods.
+
diff --git a/mofid-v2/analysis_of_nodes_and_linkers/all_mofid-v1.txt b/mofid-v2/analysis_of_nodes_and_linkers/all_mofid-v1.txt
diff --git a/mofid-v2/analysis_of_nodes_and_linkers/convert_smiles_to_xyz.py b/mofid-v2/analysis_of_nodes_and_linkers/convert_smiles_to_xyz.py
@@ -0,0 +1,33 @@
+import subprocess
+import json
+import os
+
+new_json = {}
+
+with open("smiles_by_ref.json", "r") as f:
+    json_data = json.load(f)
+
+for count, item in enumerate(list(json_data)):
+    print(item)
+    if count % 100 == 0:
+        print("Iterate through item {}".format(count))
+    result = subprocess.run("obabel -:\"{}\" -oxyz --gen3d -O output.xyz".format(item), shell=True)
+    if result.returncode != 0:
+        print(f"Open Babel failed for item: {item}")
+        continue
+
+    xyz_file = "output.xyz"
+    if os.path.isfile(xyz_file):
+        with open(xyz_file, "r") as xyz_f:
+            xyz_content = xyz_f.read()
+    else:
+        print(item)
+    # Add XYZ content to original dictionary
+    new_json[item] = {}
+    new_json[item]["xyz_coordinate"] = xyz_content
+    new_json[item]["ref_code"] = json_data[item]
+    subprocess.run("rm {}".format(xyz_file), shell=True)
+# Save updated JSON data
+with open("test.json", "w") as f:
+    json.dump(new_json, f, indent=4)
+
diff --git a/mofid-v2/analysis_of_nodes_and_linkers/group_linkers_by_SMILES.py b/mofid-v2/analysis_of_nodes_and_linkers/group_linkers_by_SMILES.py
@@ -0,0 +1,76 @@
+import subprocess
+from openbabel import openbabel as ob
+import json
+
+def dict2str(dct):
+    """Convert symbol-to-number dict to str.
+    """
+    return ''.join(symb + (str(n)) for symb, n in dct.items())
+
+def are_identical_smiles(smiles1, smiles2):
+    obConversion = ob.OBConversion()
+    obConversion.SetInFormat("smi")
+    obConversion.SetOutFormat("can")  # Set output to canonical SMILES
+
+    mol1 = ob.OBMol()
+    mol2 = ob.OBMol()
+
+    # Read the SMILES strings into molecules
+    obConversion.ReadString(mol1, smiles1)
+    obConversion.ReadString(mol2, smiles2)
+
+    # Convert molecules to canonical SMILES
+    can_smiles1 = obConversion.WriteString(mol1).strip()
+    can_smiles2 = obConversion.WriteString(mol2).strip()
+
+    # Compare canonical SMILES
+    return can_smiles1 == can_smiles2
+
+metals  = ['Li', 'Na', 'K', 'Rb', 'Cs', 'Fr', 'Be', 'Mg', 'Ca', 'Sr', 'Ba', 'Ra',
+          'Al', 'Ga', 'Ge', 'In', 'Sn', 'Sb', 'Tl', 'Pb', 'Bi', 'Po',
+          'Sc', 'Ti', 'V' , 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
+          'Y' , 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd',
+          'Hf', 'Ta', 'W' , 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg',
+          'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'U', 'Tm', 'Yb', 'Lu',
+          'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr'
+         ]
+
+linkers = []
+refcodes = []
+dict_data = {}
+
+with open('all_mofid-v1.txt', 'r') as f:
+    for idx, line in enumerate(f):
+        if idx % 100 == 0:
+            print(idx, len(dict_data), dict_data)
+        if line.startswith('*'):
+            continue
+        mofid = line.strip()
+        main = mofid.split()[0]
+        supp = mofid.split()[1]
+        refcode = supp.split(';')[-1]
+
+        main = main.split('.')
+        for new_smile in main:
+            check = False
+            for j in metals:
+                if j in new_smile:
+                    check = True
+                    break
+            if check == True:
+                continue
+            else:
+                check_match = False
+                if idx == 0:
+                    dict_data[new_smile] = [refcode]
+                    continue
+                for smile in list(dict_data):
+                    if are_identical_smiles(new_smile, smile):
+                        dict_data[smile].append(refcode)
+                        check_match = True
+                        break
+                if check_match ==  False:
+                    dict_data[new_smile] = [refcode]
+
+with open('smiles_by_ref.json', 'w') as f:
+    json.dump(dict_data, f, indent=4)
diff --git a/mofid-v2/analysis_of_nodes_and_linkers/group_node_by_composition.py b/mofid-v2/analysis_of_nodes_and_linkers/group_node_by_composition.py
@@ -0,0 +1,34 @@
+from ase.io import read as ase_read
+from ase.formula import Formula
+import networkx as nx
+import glob
+from ase.build import sort
+import json
+
+def dict2str(dct):
+    """Convert symbol-to-number dict to str.
+    """
+    return ''.join(symb + (str(n)) for symb, n in dct.items())
+
+dict_formulas = {}
+
+xyzfiles = sorted(glob.glob('*.xyz'))
+
+for count, xyzfile in enumerate(xyzfiles):
+    ref = xyzfile[:-4]
+    atoms = ase_read(xyzfile)
+    atoms = sort(atoms)
+    form_dict = atoms.symbols.formula.count()
+    form = dict2str(form_dict)
+
+    if form in list(dict_formulas):
+        dict_formulas[form].append(ref)
+    else:
+        dict_formulas[form] = [ref]
+    """ 
+    if count% 1000 == 0:
+        print(dict_formulas)
+        print(len(dict_formulas))
+    """
+with open('nodes_details.json', 'w') as f:
+    json.dump(dict_formulas, f, indent=4)
diff --git a/mofid-v2/analysis_of_nodes_and_linkers/group_node_by_structure.py b/mofid-v2/analysis_of_nodes_and_linkers/group_node_by_structure.py
@@ -0,0 +1,143 @@
+import copy
+import ase
+import json
+from io import StringIO
+from ase.io import read, write
+import numpy as np
+import collections
+from pymatgen.core.structure import Structure
+from pymatgen.core.lattice import Lattice
+from pymatgen.analysis.structure_matcher import ElementComparator
+from pymatgen.analysis.structure_matcher import StructureMatcher
+
+def convert_ase_pymat(ase_objects):
+    structure_lattice = Lattice(ase_objects.cell)
+    structure_species = ase_objects.get_chemical_symbols()
+    structure_positions = ase_objects.get_positions()
+    return Structure(structure_lattice,structure_species,structure_positions)
+
+def remove_pbc_cuts(atoms):
+    """Remove building block cuts due to periodic boundary conditions. After the
+    removal, the atoms object is centered at the center of the unit cell.
+
+    Parameters
+    ----------
+    atoms : ase.Atoms
+        The atoms object to be processed.
+
+    Returns
+    -------
+    ase.Atoms
+        The processed atoms object.
+    """
+    try:
+        # setting cuttoff parameter
+        scale = 1.4
+        cutoffs = ase.neighborlist.natural_cutoffs(atoms)
+        cutoffs = [scale * c for c in cutoffs]
+
+        #making neighbor_list
+        #graphs of single metal is not constructed because there is no neighbor.
+        I, J, D = ase.neighborlist.neighbor_list("ijD",atoms,cutoff=cutoffs)
+
+        nl = [[] for _ in atoms]
+        for i, j, d in zip(I, J, D):
+            nl[i].append((j, d))
+        visited = [False for _ in atoms]
+        q = collections.deque()
+
+        # Center of the unit cell.
+        abc_half = np.sum(atoms.get_cell(), axis=0) * 0.5
+
+        positions = {}
+        q.append((0, np.array([0.0, 0.0, 0.0])))
+        while q:
+            i, pos = q.pop()
+            visited[i] = True
+            positions[i] = pos
+            for j, d in nl[i]:
+                if not visited[j]:
+                    q.append((j, pos + d))
+                    visited[j] = True
+
+        centroid = np.array([0.0, 0.0, 0.0])
+        for v in positions.values():
+            centroid += v
+        centroid /= len(positions)
+
+        syms = [None for _ in atoms]
+        poss = [None for _ in atoms]
+
+        for i in range(len(atoms)):
+            syms[i] = atoms.symbols[i]
+            poss[i] = positions[i] - centroid + abc_half
+
+
+        atoms = ase.Atoms(
+            symbols=syms, positions=poss, pbc=True, cell=atoms.get_cell()
+        )
+
+        # resize of cell
+        cell_x = np.max(atoms.positions[:,0]) - np.min(atoms.positions[:,0])
+        cell_y = np.max(atoms.positions[:,1]) - np.min(atoms.positions[:,1])
+        cell_z = np.max(atoms.positions[:,2]) - np.min(atoms.positions[:,2])
+        cell = max([cell_x,cell_y,cell_z])
+
+        atoms.set_cell([cell+2,cell+2,cell+2, 90,90,90])
+        center_mass = atoms.get_center_of_mass()
+        cell_half  = atoms.cell.cellpar()[0:3]/2
+        atoms.positions = atoms.positions - center_mass + cell_half    
+
+        return atoms
+    except:
+        return atoms
+
+def return_xyz_list(atoms):
+    xyz_stringio = StringIO()
+    write(xyz_stringio, atoms, format='xyz')
+    xyz_stringio.seek(0)
+    return xyz_stringio.readlines()
+
+
+f = open("nodes_details.json")
+node_jsons = json.load(f)
+
+
+type_list_in_formula = {}
+for formula in node_jsons.keys():
+
+    node_list = sorted(node_jsons[formula])
+    unique_list_in_formula = []
+    if len(node_list) >1:
+
+        i = 1
+        type_list = {}
+        while len(node_list) > 0:
+
+            duplicate_list_in_formula = [node_list[0]]
+            node_list_for_loop = copy.copy(node_list)
+            mof = remove_pbc_cuts(read(node_list[0] + ".xyz"))
+            write(formula+"_Type-" + str(i)+".xyz",mof)
+            a = convert_ase_pymat(mof)
+            node_list_for_loop.pop(0)
+
+            for node in reversed(node_list_for_loop):
+                b = convert_ase_pymat(remove_pbc_cuts(read(node + ".xyz")))
+                matcher = StructureMatcher(ltol = 0.3, stol = 2, angle_tol = 5, primitive_cell = False, scale = False, comparator=ElementComparator()) 
+                are_structures_matching = matcher.fit(a, b)
+                if are_structures_matching:
+                    node_list_for_loop.pop(node_list_for_loop.index(node))
+                    duplicate_list_in_formula.append(node)
+
+            type_list["Type-" + str(i)] = {"node-file":sorted(duplicate_list_in_formula), "xyz": return_xyz_list(mof)}
+            node_list = node_list_for_loop
+            i += 1
+        type_list_in_formula[formula] = type_list
+
+    elif len(node_list) ==1:
+        mof = remove_pbc_cuts(read(node_list[0] + ".xyz"))
+        write(formula+"_Type-1.xyz",mof)
+        type_list_in_formula[formula] = {"Type-1":{"node-file":node_list[0], "xyz": return_xyz_list(mof)}} #
+
+with open("node_grouped_by_structure.json", 'w') as file:
+    json.dump(type_list_in_formula, file,indent=2)