Module ccsfp.informatics.carbon_capture
Expand source code
#!/usr/bin/env python
# Copyright IBM Corporation 2022.
# SPDX-License-Identifier: MIT
# https://www.rdkit.org/docs/GettingStartedInPython.html
# creative commons sa 4.0 tutorial used to learn rdkit methods
# https://creativecommons.org/licenses/by-sa/4.0/
# (C) 2007-2021 by Greg Landrum
# Python packages and utilities
from __future__ import annotations
import logging
import numpy as np
import pandas as pd
import rdkit
from IPython.display import display
from rdkit import Chem
from ccsfp.informatics import molecules_and_images as mai
# RDKit
# disp
# Logging
# own modules
citation = "{ADD BIBTEX ENTRY}"
random_seed = 15791
def number_of_x_atoms(mol: rdkit.Chem.rdchem.Mol, x="N"):
"""
Function to calculate the number of nitrogen atoms in a molecule
:param mol: RDKit mol - rdkit.Chem.Mol instance
:param x: str - element symbol to count in a RDKit molecule should be the same symbol RDKit uses
:return: int
>>> number_of_x_atoms(Chem.rdmolfiles.MolFromSmiles("O"), "O")
1
"""
log = logging.getLogger(__name__)
n = 0
for at in mol.GetAtoms():
if at.GetSymbol() == x:
n = n + 1
return n
def number_of_n_atoms(mol: rdkit.Chem.rdchem.Mol) -> int:
"""
Function to calculate the number of nitrogen atoms in a molecule
:param mol: RDKit mol - rdkit.Chem.Mol instance
:return: int
>>> number_of_x_atoms(Chem.rdmolfiles.MolFromSmiles("CCN"))
1
"""
log = logging.getLogger(__name__)
nN = 0
for at in mol.GetAtoms():
if at.GetSymbol() == "N":
nN = nN + 1
return nN
def count_amine_types(
smi: str,
primary: str = "[NX3;H2][CX4;!$(C=[#7,#8])]",
secondary: str = "[NX3;H1][CX4;!$(C=[#7,#8])][CX4;!$(C=[#7,#8])]",
tertiary: str = "[NX3]([CX4;!$(C=[#7,#8])])([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])]",
aromatic_sp2_n: str = "[$([nX3,X2](:[c,n,o,b,s]):[c,n,o,b,s])]",
show: bool = False,
show_primary: bool = False,
show_secondary: bool = False,
show_tertiary: bool = False,
show_aromaticsp2: bool = False,
) -> (int, int, int, int):
"""
Function to count the sub-structuer matches to the 4 amine types
:param smi: str - smiles string
:param primary: str - Smarts string for primary aliphatic (chain) amine identifying sub-structures
:param secondary: str - Smarts string for identifying identifying secondary aliphatic (chain) amine sub-structures
:param tertiary: str - Smarts string for identifying tertiary aliphatic (chain) amine sub-structures
:param aromatic_sp2_n: str - Smarts string for identifying aromatic sp2 hybridized nitrogen atoms sub-structures
:param show: True/False - boolean to plot the molecule graphs and overlaps
:param show_primary: True/False - boolean to plot the molecule graphs and overlaps for primary amine matches
:param show_secondary: True/False - boolean to plot the molecule graphs and overlaps for secondary amine matches
:param show_tertiary: True/False - boolean to plot the molecule graphs and overlaps for tertiary amine matches
:param show_aromaticsp2=False : True/False - boolean to plot the molecule graphs and overlaps for aromatic sp2 hybridized nitrogen atom matches
:return: tuple(int, int, int, int)
>>> count_amine_types("NCCN(CCc1ccncc1)CCNC")
(1, 1, 1, 1)
"""
log = logging.getLogger(__name__)
primary_substructure = Chem.MolFromSmarts(primary)
secondary_substructure = Chem.MolFromSmarts(secondary)
tertiary_substructure = Chem.MolFromSmarts(tertiary)
aromsp2_substructure = Chem.MolFromSmarts(aromatic_sp2_n)
mol = mai.smiles_to_molecule(smi, threed=False)
matches = mol.GetSubstructMatches(primary_substructure)
n_primary = len(matches)
if show is True or show_primary is True:
log.info("\n----- Primary -----")
if len(matches) > 0:
log.info("{}".format(display(mol)))
else:
log.info("No matches")
log.info(
"\nNumber of matches: {} Match atom indexes: {}".format(
len(matches), matches,
),
)
mol = mai.smiles_to_molecule(smi, threed=False)
matches = mol.GetSubstructMatches(secondary_substructure)
n_secondary = len(matches)
if show is True or show_secondary is True:
log.info("\n----- Secondary -----")
if len(matches) > 0:
log.info("{}".format(display(mol)))
else:
log.info("No matches")
log.info(
"\nNumber of matches: {} Match atom indexes: {}".format(
len(matches), matches,
),
)
mol = mai.smiles_to_molecule(smi, threed=False)
matches = mol.GetSubstructMatches(tertiary_substructure)
n_tertiary = len(matches)
if show is True or show_tertiary is True:
log.info("\n----- Tertiary -----")
if len(matches) > 0:
log.info("{}".format(display(mol)))
else:
log.info("No matches")
log.info(
"\nNumber of matches: {} Match atom indexes: {}".format(
len(matches), matches,
),
)
mol = mai.smiles_to_molecule(smi, threed=False)
matches = mol.GetSubstructMatches(aromsp2_substructure)
n_aromaticsp2 = len(matches)
if show is True or show_aromaticsp2 is True:
log.info("\n----- Atomatic sp2 hybridized nitrogen atoms -----")
if len(matches) > 0:
log.info("{}".format(display(mol)))
else:
log.info("No matches")
log.info(
"\nNumber of matches: {} Match atom indexes: {}".format(
len(matches), matches,
),
)
return n_primary, n_secondary, n_tertiary, n_aromaticsp2
def capacity_classes(
n_primary: list,
n_secodnary: list,
n_tertiary: list,
n_aromatic_sp2: list,
capacity: list,
units: str = "molco2_moln",
number_N_atoms: list = None,
amines_mr: list = None,
co2_mass: float = 44.009,
) -> list:
"""
Function to output a suggested threshold for 'good' or 'bad' classification of amine molecules based on carbon capture capacity
in the given units
:param n_primary: list - number of primary amine groups in the molecule
:param n_secondary: list - number of secondary amine groups in the molecule
:param n_tertiary: list - number of tertiary amine groups in the molecule
:param n_aromatic_sp2: list - number of aromatic sp2 hybridized nitrogen atoms in the molecule
:param capacity: list - amine capacity values in the appropiate units
:param units: str - Three accepted unit "molco2_moln", "molco2_molamine", "gco2_gamine" classes are consistent across tehse units
:param number_N_atoms: list - The number of N atoms in the amine for each smiles
:param amine_mr: list - The molar mass to each amine for each smiles
:param co2_mass: float - mass of a co2 molecule
:return: list
>>> capacity_classes([1], [0], [0], [1], [0.55], amines_mr=[102.01])
[1]
"""
log = logging.getLogger(__name__)
classes = []
molar_ratios = [ent / co2_mass for ent in amines_mr]
df = pd.DataFrame(
data=np.array([n_primary, n_secodnary, n_tertiary, n_aromatic_sp2]).T,
columns=[
"primary_amine_counts",
"secondary_amine_counts",
"tertiary_amine_counts",
"aromatic_sp2_n",
],
)
for indx, row in df.iterrows():
ret = classify(*row.values)
# N molar capacity
if units == "molco2_moln":
if capacity[indx] < ret:
classes.append(0)
else:
classes.append(1)
log.info(
"{} N molar capacity (mol co2 / mol N) threshold {:.2f} capacity {:.2f} class {}".format(
indx, ret, capacity[indx], classes[-1],
),
)
elif units == "molco2_molamine":
# amine molar capacity
if capacity[indx] < ret * number_N_atoms[indx]:
classes.append(0)
else:
classes.append(1)
log.info(
"{} Amine molar capacity (mol co2 / mol amine) threshold {:.2f} capacity {:.2f} class {}".format(
indx, ret *
number_N_atoms[indx], capacity[indx], classes[-1],
),
)
elif units == "gco2_gamine":
# mass capacity
if capacity[indx] < (ret * number_N_atoms[indx]) / molar_ratios[indx]:
classes.append(0)
else:
classes.append(1)
log.info(
"{} Mass capacity (co2 (g) / amine (g)) threshold {:.2f} capacity {:.2f} class {}\n----\n".format(
indx,
(ret * number_N_atoms[indx]) / molar_ratios[indx],
capacity[indx],
classes[-1],
),
)
return classes
def classify(
n_primary: int,
n_secodnary: int,
n_tertiary: int,
n_aromatic_sp2: int,
primary_secondary_weight: float = 0.5,
tertiary_weight: float = 1.0,
):
"""
Function to output a suggested threshold for 'good' or 'bad' classification of amine molecules based on carbon capture capacity
:param n_primary: int - number of primary amine groups in the molecule
:param n_secondary: int - number of secondary amine groups in the molecule
:param n_tertiary: int - number of tertiary amine groups in the molecule
:param n_aromatic_sp2: int - number of aromatic sp2 hybridized nitrogen atoms in the molecule
:param primary_secondary_weight: float - weighting per primary or secondary amine group
:param tertiary_weight: float - weighting per tertiary amine group
:return: float
"""
if n_primary + n_secodnary > 0 and n_tertiary > 0:
# d provides a tempering to the other wise ever increasing expectation of a polyamine of all amine types in which the tertiary is likely
# to play a small role comapred to kinetically favourable primary and secondary amine reactions.
n = (primary_secondary_weight * (n_primary + n_secodnary)) + (
tertiary_weight * n_tertiary
)
d = 2.0 * n_tertiary
return n / d
elif n_primary >= 1 and n_secodnary >= 1:
return primary_secondary_weight * (n_primary + n_secodnary)
elif n_primary > 1:
return primary_secondary_weight * (n_primary)
elif n_secodnary > 1:
return primary_secondary_weight * (n_secodnary)
elif n_primary == 1:
return primary_secondary_weight
elif n_secodnary == 1:
return primary_secondary_weight
elif n_tertiary > 0:
# reacts as a catalytic molecule rather than a reactant
return tertiary_weight
elif (
n_primary == 0 and n_secodnary == 0 and n_tertiary == 0 and n_aromatic_sp2 != 0
):
# estimate as it is not clear on exactly what route these may take
return primary_secondary_weight
else:
return primary_secondary_weight
if __name__ == "__main__":
import doctest
doctest.testmod()
Functions
def capacity_classes(n_primary: list, n_secodnary: list, n_tertiary: list, n_aromatic_sp2: list, capacity: list, units: str = 'molco2_moln', number_N_atoms: list = None, amines_mr: list = None, co2_mass: float = 44.009) ‑> list
-
Function to output a suggested threshold for 'good' or 'bad' classification of amine molecules based on carbon capture capacity in the given units :param n_primary: list - number of primary amine groups in the molecule :param n_secondary: list - number of secondary amine groups in the molecule :param n_tertiary: list - number of tertiary amine groups in the molecule :param n_aromatic_sp2: list - number of aromatic sp2 hybridized nitrogen atoms in the molecule :param capacity: list - amine capacity values in the appropiate units :param units: str - Three accepted unit "molco2_moln", "molco2_molamine", "gco2_gamine" classes are consistent across tehse units :param number_N_atoms: list - The number of N atoms in the amine for each smiles :param amine_mr: list - The molar mass to each amine for each smiles :param co2_mass: float - mass of a co2 molecule :return: list
>>> capacity_classes([1], [0], [0], [1], [0.55], amines_mr=[102.01]) [1]
Expand source code
def capacity_classes( n_primary: list, n_secodnary: list, n_tertiary: list, n_aromatic_sp2: list, capacity: list, units: str = "molco2_moln", number_N_atoms: list = None, amines_mr: list = None, co2_mass: float = 44.009, ) -> list: """ Function to output a suggested threshold for 'good' or 'bad' classification of amine molecules based on carbon capture capacity in the given units :param n_primary: list - number of primary amine groups in the molecule :param n_secondary: list - number of secondary amine groups in the molecule :param n_tertiary: list - number of tertiary amine groups in the molecule :param n_aromatic_sp2: list - number of aromatic sp2 hybridized nitrogen atoms in the molecule :param capacity: list - amine capacity values in the appropiate units :param units: str - Three accepted unit "molco2_moln", "molco2_molamine", "gco2_gamine" classes are consistent across tehse units :param number_N_atoms: list - The number of N atoms in the amine for each smiles :param amine_mr: list - The molar mass to each amine for each smiles :param co2_mass: float - mass of a co2 molecule :return: list >>> capacity_classes([1], [0], [0], [1], [0.55], amines_mr=[102.01]) [1] """ log = logging.getLogger(__name__) classes = [] molar_ratios = [ent / co2_mass for ent in amines_mr] df = pd.DataFrame( data=np.array([n_primary, n_secodnary, n_tertiary, n_aromatic_sp2]).T, columns=[ "primary_amine_counts", "secondary_amine_counts", "tertiary_amine_counts", "aromatic_sp2_n", ], ) for indx, row in df.iterrows(): ret = classify(*row.values) # N molar capacity if units == "molco2_moln": if capacity[indx] < ret: classes.append(0) else: classes.append(1) log.info( "{} N molar capacity (mol co2 / mol N) threshold {:.2f} capacity {:.2f} class {}".format( indx, ret, capacity[indx], classes[-1], ), ) elif units == "molco2_molamine": # amine molar capacity if capacity[indx] < ret * number_N_atoms[indx]: classes.append(0) else: classes.append(1) log.info( "{} Amine molar capacity (mol co2 / mol amine) threshold {:.2f} capacity {:.2f} class {}".format( indx, ret * number_N_atoms[indx], capacity[indx], classes[-1], ), ) elif units == "gco2_gamine": # mass capacity if capacity[indx] < (ret * number_N_atoms[indx]) / molar_ratios[indx]: classes.append(0) else: classes.append(1) log.info( "{} Mass capacity (co2 (g) / amine (g)) threshold {:.2f} capacity {:.2f} class {}\n----\n".format( indx, (ret * number_N_atoms[indx]) / molar_ratios[indx], capacity[indx], classes[-1], ), ) return classes
def classify(n_primary: int, n_secodnary: int, n_tertiary: int, n_aromatic_sp2: int, primary_secondary_weight: float = 0.5, tertiary_weight: float = 1.0)
-
Function to output a suggested threshold for 'good' or 'bad' classification of amine molecules based on carbon capture capacity :param n_primary: int - number of primary amine groups in the molecule :param n_secondary: int - number of secondary amine groups in the molecule :param n_tertiary: int - number of tertiary amine groups in the molecule :param n_aromatic_sp2: int - number of aromatic sp2 hybridized nitrogen atoms in the molecule :param primary_secondary_weight: float - weighting per primary or secondary amine group :param tertiary_weight: float - weighting per tertiary amine group :return: float
Expand source code
def classify( n_primary: int, n_secodnary: int, n_tertiary: int, n_aromatic_sp2: int, primary_secondary_weight: float = 0.5, tertiary_weight: float = 1.0, ): """ Function to output a suggested threshold for 'good' or 'bad' classification of amine molecules based on carbon capture capacity :param n_primary: int - number of primary amine groups in the molecule :param n_secondary: int - number of secondary amine groups in the molecule :param n_tertiary: int - number of tertiary amine groups in the molecule :param n_aromatic_sp2: int - number of aromatic sp2 hybridized nitrogen atoms in the molecule :param primary_secondary_weight: float - weighting per primary or secondary amine group :param tertiary_weight: float - weighting per tertiary amine group :return: float """ if n_primary + n_secodnary > 0 and n_tertiary > 0: # d provides a tempering to the other wise ever increasing expectation of a polyamine of all amine types in which the tertiary is likely # to play a small role comapred to kinetically favourable primary and secondary amine reactions. n = (primary_secondary_weight * (n_primary + n_secodnary)) + ( tertiary_weight * n_tertiary ) d = 2.0 * n_tertiary return n / d elif n_primary >= 1 and n_secodnary >= 1: return primary_secondary_weight * (n_primary + n_secodnary) elif n_primary > 1: return primary_secondary_weight * (n_primary) elif n_secodnary > 1: return primary_secondary_weight * (n_secodnary) elif n_primary == 1: return primary_secondary_weight elif n_secodnary == 1: return primary_secondary_weight elif n_tertiary > 0: # reacts as a catalytic molecule rather than a reactant return tertiary_weight elif ( n_primary == 0 and n_secodnary == 0 and n_tertiary == 0 and n_aromatic_sp2 != 0 ): # estimate as it is not clear on exactly what route these may take return primary_secondary_weight else: return primary_secondary_weight
def count_amine_types(smi: str, primary: str = '[NX3;H2][CX4;!$(C=[#7,#8])]', secondary: str = '[NX3;H1][CX4;!$(C=[#7,#8])][CX4;!$(C=[#7,#8])]', tertiary: str = '[NX3]([CX4;!$(C=[#7,#8])])([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])]', aromatic_sp2_n: str = '[$([nX3,X2](:[c,n,o,b,s]):[c,n,o,b,s])]', show: bool = False, show_primary: bool = False, show_secondary: bool = False, show_tertiary: bool = False, show_aromaticsp2: bool = False) ‑> (int, int, int, int)
-
Function to count the sub-structuer matches to the 4 amine types :param smi: str - smiles string :param primary: str - Smarts string for primary aliphatic (chain) amine identifying sub-structures :param secondary: str - Smarts string for identifying identifying secondary aliphatic (chain) amine sub-structures :param tertiary: str - Smarts string for identifying tertiary aliphatic (chain) amine sub-structures :param aromatic_sp2_n: str - Smarts string for identifying aromatic sp2 hybridized nitrogen atoms sub-structures :param show: True/False - boolean to plot the molecule graphs and overlaps :param show_primary: True/False - boolean to plot the molecule graphs and overlaps for primary amine matches :param show_secondary: True/False - boolean to plot the molecule graphs and overlaps for secondary amine matches :param show_tertiary: True/False - boolean to plot the molecule graphs and overlaps for tertiary amine matches :param show_aromaticsp2=False : True/False - boolean to plot the molecule graphs and overlaps for aromatic sp2 hybridized nitrogen atom matches :return: tuple(int, int, int, int)
>>> count_amine_types("NCCN(CCc1ccncc1)CCNC") (1, 1, 1, 1)
Expand source code
def count_amine_types( smi: str, primary: str = "[NX3;H2][CX4;!$(C=[#7,#8])]", secondary: str = "[NX3;H1][CX4;!$(C=[#7,#8])][CX4;!$(C=[#7,#8])]", tertiary: str = "[NX3]([CX4;!$(C=[#7,#8])])([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])]", aromatic_sp2_n: str = "[$([nX3,X2](:[c,n,o,b,s]):[c,n,o,b,s])]", show: bool = False, show_primary: bool = False, show_secondary: bool = False, show_tertiary: bool = False, show_aromaticsp2: bool = False, ) -> (int, int, int, int): """ Function to count the sub-structuer matches to the 4 amine types :param smi: str - smiles string :param primary: str - Smarts string for primary aliphatic (chain) amine identifying sub-structures :param secondary: str - Smarts string for identifying identifying secondary aliphatic (chain) amine sub-structures :param tertiary: str - Smarts string for identifying tertiary aliphatic (chain) amine sub-structures :param aromatic_sp2_n: str - Smarts string for identifying aromatic sp2 hybridized nitrogen atoms sub-structures :param show: True/False - boolean to plot the molecule graphs and overlaps :param show_primary: True/False - boolean to plot the molecule graphs and overlaps for primary amine matches :param show_secondary: True/False - boolean to plot the molecule graphs and overlaps for secondary amine matches :param show_tertiary: True/False - boolean to plot the molecule graphs and overlaps for tertiary amine matches :param show_aromaticsp2=False : True/False - boolean to plot the molecule graphs and overlaps for aromatic sp2 hybridized nitrogen atom matches :return: tuple(int, int, int, int) >>> count_amine_types("NCCN(CCc1ccncc1)CCNC") (1, 1, 1, 1) """ log = logging.getLogger(__name__) primary_substructure = Chem.MolFromSmarts(primary) secondary_substructure = Chem.MolFromSmarts(secondary) tertiary_substructure = Chem.MolFromSmarts(tertiary) aromsp2_substructure = Chem.MolFromSmarts(aromatic_sp2_n) mol = mai.smiles_to_molecule(smi, threed=False) matches = mol.GetSubstructMatches(primary_substructure) n_primary = len(matches) if show is True or show_primary is True: log.info("\n----- Primary -----") if len(matches) > 0: log.info("{}".format(display(mol))) else: log.info("No matches") log.info( "\nNumber of matches: {} Match atom indexes: {}".format( len(matches), matches, ), ) mol = mai.smiles_to_molecule(smi, threed=False) matches = mol.GetSubstructMatches(secondary_substructure) n_secondary = len(matches) if show is True or show_secondary is True: log.info("\n----- Secondary -----") if len(matches) > 0: log.info("{}".format(display(mol))) else: log.info("No matches") log.info( "\nNumber of matches: {} Match atom indexes: {}".format( len(matches), matches, ), ) mol = mai.smiles_to_molecule(smi, threed=False) matches = mol.GetSubstructMatches(tertiary_substructure) n_tertiary = len(matches) if show is True or show_tertiary is True: log.info("\n----- Tertiary -----") if len(matches) > 0: log.info("{}".format(display(mol))) else: log.info("No matches") log.info( "\nNumber of matches: {} Match atom indexes: {}".format( len(matches), matches, ), ) mol = mai.smiles_to_molecule(smi, threed=False) matches = mol.GetSubstructMatches(aromsp2_substructure) n_aromaticsp2 = len(matches) if show is True or show_aromaticsp2 is True: log.info("\n----- Atomatic sp2 hybridized nitrogen atoms -----") if len(matches) > 0: log.info("{}".format(display(mol))) else: log.info("No matches") log.info( "\nNumber of matches: {} Match atom indexes: {}".format( len(matches), matches, ), ) return n_primary, n_secondary, n_tertiary, n_aromaticsp2
def number_of_n_atoms(mol: rdkit.Chem.rdchem.Mol) ‑> int
-
Function to calculate the number of nitrogen atoms in a molecule :param mol: RDKit mol - rdkit.Chem.Mol instance :return: int
>>> number_of_x_atoms(Chem.rdmolfiles.MolFromSmiles("CCN")) 1
Expand source code
def number_of_n_atoms(mol: rdkit.Chem.rdchem.Mol) -> int: """ Function to calculate the number of nitrogen atoms in a molecule :param mol: RDKit mol - rdkit.Chem.Mol instance :return: int >>> number_of_x_atoms(Chem.rdmolfiles.MolFromSmiles("CCN")) 1 """ log = logging.getLogger(__name__) nN = 0 for at in mol.GetAtoms(): if at.GetSymbol() == "N": nN = nN + 1 return nN
def number_of_x_atoms(mol: rdkit.Chem.rdchem.Mol, x='N')
-
Function to calculate the number of nitrogen atoms in a molecule :param mol: RDKit mol - rdkit.Chem.Mol instance :param x: str - element symbol to count in a RDKit molecule should be the same symbol RDKit uses :return: int
>>> number_of_x_atoms(Chem.rdmolfiles.MolFromSmiles("O"), "O") 1
Expand source code
def number_of_x_atoms(mol: rdkit.Chem.rdchem.Mol, x="N"): """ Function to calculate the number of nitrogen atoms in a molecule :param mol: RDKit mol - rdkit.Chem.Mol instance :param x: str - element symbol to count in a RDKit molecule should be the same symbol RDKit uses :return: int >>> number_of_x_atoms(Chem.rdmolfiles.MolFromSmiles("O"), "O") 1 """ log = logging.getLogger(__name__) n = 0 for at in mol.GetAtoms(): if at.GetSymbol() == x: n = n + 1 return n