Source code for marsi.chemistry.rdkit

# Copyright 2016 Chr. Hansen A/S and The Novo Nordisk Foundation Center for Biosustainability, DTU.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import

import math
import time

import numpy as np
import rdkit
from bitarray import bitarray
from cachetools import cached, LRUCache
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys, MCS
from rdkit.Chem.SaltRemover import SaltRemover

from marsi.chemistry.common import monte_carlo_volume as mc_vol, inchi_key_lru_cache


lru_cache = LRUCache(maxsize=256)

periodic_table = Chem.GetPeriodicTable()
salt_remove = SaltRemover(defnData="[Li,K,Rb,Cs,Fr,Be,Mg,Ca,Sr,Ba,Ra,F,Cl,Br]")

fps = ["maccs", "morgan2", "morgan3", "morgan4", "morgan5"]


[docs]@cached(lru_cache) def inchi_to_molecule(inchi): """ Returns a molecule from a InChI string. Parameters ---------- inchi : str A valid string. Returns ------- rdkit.Chem.rdchem.Mol A molecule. """ mol = Chem.MolFromInchi(inchi) mol = salt_remove.StripMol(mol, dontRemoveEverything=True) Chem.Kekulize(mol) mol = Chem.AddHs(mol) return mol
[docs]def mol_to_molecule(file_or_molecule_desc, from_file=True): """ Returns a molecule from a MOL file. Parameters ---------- file_or_molecule_desc : str A valid MOL file path or a valid MOL string. from_file : bool If True tries to read the molecule from a file. Returns ------- rdkit.Chem.rdchem.Mol A molecule. """ if from_file: mol = Chem.MolFromMolFile(file_or_molecule_desc) else: mol = Chem.MolFromMolBlock(str(file_or_molecule_desc)) mol = salt_remove.StripMol(mol, dontRemoveEverything=True) Chem.Kekulize(mol) mol = Chem.AddHs(mol) return mol
[docs]def sdf_to_molecule(file_or_molecule_desc, from_file=True): """ Returns a molecule from a SDF file. Parameters ---------- file_or_molecule_desc : str A valid sdf file path or a valid SDF string. from_file : bool If True tries to read the molecule from a file. Returns ------- rdkit.Chem.rdchem.Mol A molecule. """ if from_file: supplier = Chem.SDMolSupplier(file_or_molecule_desc) else: supplier = Chem.SDMolSupplier() supplier.SetData(str(file_or_molecule_desc), strictParsing=False) mol = next(supplier) mol = salt_remove.StripMol(mol, dontRemoveEverything=True) Chem.Kekulize(mol) mol = Chem.AddHs(mol) return mol
[docs]@cached(inchi_key_lru_cache) def inchi_to_inchi_key(inchi): """ Makes an InChI Key from a InChI string. Parameters ---------- inchi : str A valid InChI string. Returns ------- str A InChI key. """ return Chem.InchiToInchiKey(inchi)
[docs]def mol_to_inchi_key(mol): """ Makes an InChI Key from a Molecule. Parameters ---------- mol : rdkit.Chem.rdchem.Mol A molecule. Returns ------- str A InChI key. """ return inchi_to_inchi_key(mol_to_inchi(mol))
[docs]def mol_to_inchi(mol): """ Makes an InChI from a Molecule. Parameters ---------- mol : rdkit.Chem.rdchem.Mol A molecule. Returns ------- str A InChI. """ return Chem.MolToInchi(mol)
[docs]def fingerprint(molecule, fpformat='maccs'): """ Returns the Fingerprint of the molecule. Parameters ---------- molecule : rdkit.Chem.rdchem.Mol A molecule. fpformat : str A valid fingerprint format. Returns ------- Fingerprint rdkit.DataStructs.cDataStructs.ExplicitBitVect """ assert isinstance(molecule, Chem.rdchem.Mol) assert isinstance(fpformat, str) fp = None if fpformat not in fps: raise AssertionError("'%s' is not a valid fingerprint format" % fpformat) if fpformat == "maccs": fp = MACCSkeys.FingerprintMol(molecule) elif fpformat == "morgan2": fp = AllChem.GetMorganFingerprintAsBitVect(molecule, 2) elif fpformat == "morgan3": fp = AllChem.GetMorganFingerprintAsBitVect(molecule, 3) elif fpformat == "morgan4": fp = AllChem.GetMorganFingerprintAsBitVect(molecule, 4) elif fpformat == "morgan5": fp = AllChem.GetMorganFingerprintAsBitVect(molecule, 5) return fp
[docs]def fingerprint_to_bits(fp, bits=1024): """ Converts a pybel.Fingerprint into a binary array Parameters ---------- fp : rdkit.DataStructs.cDataStructs.ExplicitBitVect A fingerprint molecule. bits : int Number of bits (default is 1024) Returns ------- bitarray """ bits_list = bitarray(bits) bits_list.setall(0) for i in range(fp.GenNumBits()): if fp.GetBit(i): bits_list[i - 1] = 1 return bits_list
[docs]def maximum_common_substructure(reference, molecule, match_rings=True, match_fraction=0.6, timeout=None): """ Returns the Maximum Common Substructure (MCS) between two molecules. Parameters ---------- reference : rdkit.Chem.Mol A molecule. molecule : rdkit.Chem.Mol Another molecule. match_rings : bool Force ring structure to match match_fraction : float Match is fraction of the reference atoms (default: 0.6) timeout: int Time out in seconds. Returns ------- rdkit.Chem.MCS.MCSResult Maximum Common Substructure result. """ assert isinstance(reference, rdkit.Chem.rdchem.Mol) min_num_atoms = math.ceil(reference.GetNumAtoms()) * match_fraction return MCS.FindMCS([reference, molecule], ringMatchesRingOnly=match_rings, minNumAtoms=min_num_atoms, timeout=timeout, atomCompare="any", bondCompare="any")
[docs]def mcs_similarity(mcs_result, molecule, atoms_weight=0.5, bonds_weight=0.5): """ Returns the Maximum Common Substructure (MCS) between two molecules. $$ atoms\_weight * (mcs_res.similar\_atoms/mol.num_atoms) + bonds\_weight * (mcs_res.similar\_bonds/mol.num\_bonds) $$ Parameters ---------- mcs_result : rdkit.Chem.MCS.MCSResult The result of a Maximum Common Substructure run. molecule : rdkit.Chem.Mol A molecule. atoms_weight : float How much similar atoms matter. bonds_weight : float How much similar bonds matter. Returns ------- float Similarity value """ assert isinstance(mcs_result, MCS.MCSResult) assert isinstance(molecule, Chem.rdchem.Mol) atoms_score = atoms_weight * (float(mcs_result.numAtoms) / float(molecule.GetNumAtoms())) try: bonds_score = bonds_weight * (float(mcs_result.numBonds) / float(molecule.GetNumBonds())) except ZeroDivisionError: bonds_score = .0 return atoms_score + bonds_score
[docs]def structural_similarity(reference, molecule, atoms_weight=0.5, bonds_weight=0.5, match_rings=True, match_fraction=0.6, timeout=None): """ Returns a structural similarity based on the Maximum Common Substructure (MCS) between two molecules. $$ mcs\_s(ref) * mcs\_s(mol) $$ Parameters ---------- reference : rdkit.Chem.Mol The result of a Maximum Common Substructure run. molecule : rdkit.Chem.Mol A molecule. atoms_weight : float How much similar atoms matter. bonds_weight : float How much similar bonds matter. match_rings : bool Force ring structure to match. match_fraction : float Match is fraction of the reference atoms (default: 0.6). timeout : int Time out in seconds. Returns ------- float Similarity between reference and molecule. """ reference = Chem.RemoveHs(reference, implicitOnly=True, updateExplicitCount=True) molecule = Chem.RemoveHs(molecule, implicitOnly=True, updateExplicitCount=True) mcs_res = maximum_common_substructure(reference, molecule, match_rings=match_rings, match_fraction=match_fraction, timeout=timeout) ref_similarity = mcs_similarity(mcs_res, reference, atoms_weight=atoms_weight, bonds_weight=bonds_weight) mol_similarity = mcs_similarity(mcs_res, molecule, atoms_weight=atoms_weight, bonds_weight=bonds_weight) return ref_similarity * mol_similarity
[docs]def monte_carlo_volume(molecule, coordinates=None, tolerance=1, max_iterations=10000, step_size=1000, seed=time.time(), verbose=False, forcefield='mmff94', steps=100): """ Adapted from: Simple Monte Carlo estimation of VdW molecular volume (in A^3) by Geoffrey Hutchison <geoffh@pitt.edu> https://github.com/ghutchis/hutchison-cluster Parameters ---------- molecule : rdkit.Chem.rdchem.Mol A molecule from rdkit. coordinates : list A list of pre selected x,y,z coords. It must match the atoms order. tolerance : float The tolerance for convergence of the monte carlo abs(new_volume - volume) < tolerance max_iterations : int Number of iterations before the algorithm starts. step_size : int Number of points to add each step. seed : object A valid seed for random. verbose : bool Print debug information if True. forcefield : str The force field to get a 3D molecule. (only if it is not 3D already) steps : int The number of steps used for the force field to get a 3D molecule. (only if it is not 3D already) Returns ------- float Molecule volume """ assert isinstance(molecule, rdkit.Chem.rdchem.Mol) if coordinates is None: if len(molecule.GetConformers()) == 0: AllChem.EmbedMolecule(molecule) AllChem.UFFOptimizeMolecule(molecule) conformer = molecule.GetConformer(0) coordinates = [] for index, atom in enumerate(molecule.GetAtoms()): pos = tuple(conformer.GetAtomPosition(index)) coordinates.append(pos) vdw_radii = [] for index, atom in enumerate(molecule.GetAtoms()): radius = periodic_table.GetRvdw(atom.GetAtomicNum()) vdw_radii.append(radius) coordinates = np.array(coordinates, dtype=np.float32) vdw_radii = np.array(vdw_radii, dtype=np.float32) return mc_vol(coordinates, vdw_radii, tolerance, max_iterations, step_size, seed, int(verbose))