Module ccsfp.informatics.finger_prints

Expand source code
#!/usr/bin/env python
# Copyright IBM Corporation 2022.
# SPDX-License-Identifier: MIT
# https://www.rdkit.org/docs/GettingStartedInPython.html
# creative commons sa 4.0 tutorial used to learn rdkit methods
# https://creativecommons.org/licenses/by-sa/4.0/
# (C) 2007-2021 by Greg Landrum
# Python packages and utilities
from __future__ import annotations

import logging

import dask
import numpy as np
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit import SimDivFilters
from rdkit.Chem import DataStructs
from rdkit.Chem import MACCSkeys

from . import molecules_and_images as mai

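# Import groups above: standard library (logging), third party (dask, numpy,
# pandas, RDKit) and this package's own modules (molecules_and_images).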

# BibTeX entry for the publication describing these fingerprints.
# ccus_fps.__init__ logs this text at INFO level each time the class is
# instantiated, so the string is runtime output and is kept verbatim.
citation = """@article{mcdonagh2022chemical,
  title={Chemical space analysis and property prediction for carbon capture amine molecules},
  author={McDonagh, James and Zavitsanou, Stamatia and Harrison, Alexander and Zubarev, Dimitry and Wunsch, Benjamin and van Kessel, Theordore and Cipcigan, Flaviu},
  year={2022},
  url={https://chemrxiv.org/engage/chemrxiv/article-details/62e110cbadb01e653cae19f4}
}"""

# Fixed integer seed. It is not referenced in the visible part of this module -
# presumably used for reproducible random selection further down (TODO confirm).
random_seed = 15791


class ccus_fps(object):
    """
    Definition of the CCS substructure fingerprints.

    For every predefined fingerprint version the class stores, in matching order, a list of
    human readable bit names, the SMARTS pattern behind each bit and a short explanation of
    the version. Custom names and/or substructures can be given instead of the defaults.

    Attributes set by ``__init__``:
    version_explanations - dict mapping version number to its explanation text
    version_names - dict mapping version number to the list of bit names
    version_substructures - dict mapping version number to the list of SMARTS strings
    fingerprint_version - the version requested by the caller
    names - the bit names in use (defaults or custom)
    substructures - the SMARTS strings in use (defaults or custom)
    fingerprint_explanation - explanation of the default version, or None when custom
    names and/or substructures were supplied
    """

    def __init__(
        self,
        fingerprint_version: int = 1,
        names: list = None,
        substructures: list = None,
        log: logging.Logger = None,
        verbose: bool = True,
    ):
        """
        Initialise the class
        :param fingerprint_version: int - version number, must be a key of the version tables (1 or 2)
        :param names: iterable - list of names of the substructure to use for the fingerprint
        :param substructures: iterable - list of substructure strings in smarts notation
        :param log: logging.Logger - logger object, the module logger is used if None
        :param verbose: bool - log the explanation of the default fingerprint if True
        :raises KeyError: if defaults are needed and fingerprint_version is not a known version
        """
        # The two halves of each text are joined by implicit string concatenation, hence the
        # trailing space after "like".
        self.version_explanations = {
            1: "This is a filtered set which seems to perform well for modelling this finger print includes rarer groups like "
            "sulphur.",
            2: "This is a filtered set which seems to perform well for modelling but does not include rarer groups like "
            "sulphur containing hetrocycles.",
        }

        # Bit names. Position i here describes the SMARTS at position i in version_substructures.
        self.version_names = {
            1: [
                "ammonia",
                "primary_amine",
                "secondary_amine",
                "tertiary_amine",
                "quaternary_N",
                "imine",
                "nitrogen_bonded_to_carbon",
                "aromatic_N_sp2",
                "carboxylic_acid",
                "primary_alcohol",
                "secondary_alcohol",
                "tertiary_alcohol",
                "t_butyl",
                "carbonyl",
                "halocarbon",
                "benezene_ring",
                "6_member_aromatic_c_and_n_ring",
                "6_member_c_and_o_ring",
                "5_c_ring",
                "5_member_aromatic_c_and_n_ring",
                "5_member_c_and_o_ring",
                "Cyclohexane",
                "Cyclohexylamine",
                "Aniline",
                "benzylamine",
                "piperidine",
                "pyridine",
                "pyrrole",
                "primary_amino_alcohol_two_carbon_separation",
                "secondary_amino_alcohol_two_carbon_separation",
                "tertiary_amino_alcohol_two_carbon_separation",
                "primary_amino_alcohol_three_carbon_separation",
                "secondary_amino_alcohol_three_carbon_separation",
                "tertiary_amino_alcohol_three_carbon_separation",
                "aliphatic_primary_amino_alcohol_two_carbon_separation",
                "aliphatic_secondary_amino_alcohol_two_carbon_separation",
                "aliphatic_tertiary_amino_alcohol_two_carbon_separation",
                "aliphatic_primary_amino_alcohol_three_carbon_separation",
                "aliphatic_secondary_amino_alcohol_three_carbon_separation",
                "aliphatic_tertiary_amino_alcohol_three_carbon_separation",
                "primary_amine_one_carbon_aromatic_group",
                "primary_amine_two_carbon_aromatic_group",
                "primary_amine_three_carbon_aromatic_group",
                "secondary_amine_one_carbon_aromatic_group",
                "secondary_amine_two_carbon_aromatic_group",
                "secondary_amine_three_carbon_aromatic_group",
                "tertiary_amine_one_carbon_aromatic_group",
                "tertiary_amine_two_carbon_aromatic_group",
                "tertiary_amine_three_carbon_aromatic_group",
                "methyl_branch_one_carbon_from_a_N_atom",
                "methyl_branch_two_carbon_from_a_N_atom",
                "methyl_branch_three_carbon_from_a_N_atom",
                "methyl_branch_four_carbon_from_a_N_atom",
                "methyl_branch_five_carbon_from_a_N_atom",
                "methyl_branch_six_carbon_from_a_N_atom",
                "ethyl_chain",
                "propyl_chain",
                "butyl_chain",
                "pentyl_chain",
                "hexyl_chain",
                "poly_primary_and_or_secondary_amine",
                "poly_primary_and_or_secondary_and_or_tertiary_amine",
                "poly_alcohol",
                "pyrazine_aliphatic_C_2_and_5_substitution",
                "pyridine_aliphatic_C_2_and_5_substitution",
                "pyridine_aliphatic_C_2_substitution",
                "Presence_of_Boron",
                "Presence_of_Silicon",
                "Presence_of_Phosphurus",
                "Presence_of_Sulphur",
                "positive_charge_group",
                "negative_charge_group",
            ],
            2: [
                "ammonia",
                "primary_amine",
                "secondary_amine",
                "tertiary_amine",
                "quaternary_N",
                "aromatic_N_sp2",
                "carboxylic_acid",
                "primary_alcohol",
                "secondary_alcohol",
                "tertiary_alcohol",
                "t_butyl",
                "carbonyl",
                "halocarbon",
                "benezene_ring",
                "6_member_aromatic_c_and_n_ring",
                "6_member_c_and_o_ring",
                "5_c_ring",
                "5_member_aromatic_c_and_n_ring",
                "5_member_c_and_o_ring",
                "Cyclohexane",
                "Cyclohexylamine",
                "Aniline",
                "benzylamine",
                "piperidine",
                "pyridine",
                "pyrrole",
                "primary_amino_alcohol_two_carbon_separation",
                "secondary_amino_alcohol_two_carbon_separation",
                "tertiary_amino_alcohol_two_carbon_separation",
                "primary_amino_alcohol_three_carbon_separation",
                "secondary_amino_alcohol_three_carbon_separation",
                "tertiary_amino_alcohol_three_carbon_separation",
                "aliphatic_primary_amino_alcohol_two_carbon_separation",
                "aliphatic_secondary_amino_alcohol_two_carbon_separation",
                "aliphatic_tertiary_amino_alcohol_two_carbon_separation",
                "aliphatic_primary_amino_alcohol_three_carbon_separation",
                "aliphatic_secondary_amino_alcohol_three_carbon_separation",
                "aliphatic_tertiary_amino_alcohol_three_carbon_separation",
                "primary_amine_one_carbon_aromatic_group",
                "primary_amine_two_carbon_aromatic_group",
                "primary_amine_three_carbon_aromatic_group",
                "secondary_amine_one_carbon_aromatic_group",
                "secondary_amine_two_carbon_aromatic_group",
                "secondary_amine_three_carbon_aromatic_group",
                "tertiary_amine_one_carbon_aromatic_group",
                "tertiary_amine_two_carbon_aromatic_group",
                "tertiary_amine_three_carbon_aromatic_group",
                "methyl_branch_one_carbon_from_a_N_atom",
                "methyl_branch_two_carbon_from_a_N_atom",
                "methyl_branch_three_carbon_from_a_N_atom",
                "methyl_branch_four_carbon_from_a_N_atom",
                "methyl_branch_five_carbon_from_a_N_atom",
                "methyl_branch_six_carbon_from_a_N_atom",
                "ethyl_chain",
                "propyl_chain",
                "butyl_chain",
                "pentyl_chain",
                "hexyl_chain",
                "poly_primary_and_or_secondary_amine",
                "poly_primary_and_or_secondary_and_or_tertiary_amine",
                "poly_alcohol",
                "pyrazine_aliphatic_C_2_and_5_substitution",
                "pyridine_aliphatic_C_2_and_5_substitution",
                "pyridine_aliphatic_C_2_substitution",
            ],
        }

        # SMARTS patterns. "see description" means: see the name at the same position in
        # version_names. Version 2 uses [CX4;...] where version 1 uses [C;...] in the amine
        # patterns and drops the imine, N-bonded-to-C, B/Si/P/S and charge bits.
        self.version_substructures = {
            1: [
                "[NH3]",  # ammonia
                "[NX3;H2][C;!$(C=[#7,#8])]",  # 1' amine
                "[NX3;H1][C;!$(C=[#7,#8])][C;!$(C=[#7,#8])]",  # 2' amine
                # 3' amine
                "[NX3]([C;!$(C=[#7,#8])])([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])]",
                "[NX4+]",  # ammonium
                "[N]=[C]",  # imine,
                # N bonded to C "[$([#6]~[#7]);!$([#6]-[#7])]", # nitrogen bonded to carbon with any bond other than a single bond
                "[#6]~[#7]",
                "[a]:[nX3,X2]:[a]",  # SP2 aromatic N
                "[CX3;$([#6]),$([O;H1])](=[OX1])[$([O])]",  # carboxylic acid
                "[#6][#6;!$(C(=O)[OH])][OH]",  # 1' alcohol
                "[#6][#6]([#6])[OH]",  # 2' alcohol'
                "[#6][#6]([#6])([#6])[OH]",  # 3' alcohol
                "[#6]C([CH3])([CH3])([CH3])",  # t-butyl
                "[CX3]=[O;!$(O*)]",  # Carbonyl
                "[#6]~[F,Cl,Br,I]",  # halo carbon
                "c1ccccc1",  # benzene
                # aromatic n or c 6 member hetrocycle
                "[c,n]1[c,n][c,n][c,n][c,n][c,n]1",
                # Named "6_member_c_and_o_ring".
                # NOTE(review): this pattern has five ring atoms and is identical to the
                # "5_member_c_and_o_ring" pattern below - left unchanged because altering it
                # would change existing fingerprints; confirm the intended ring size.
                "[#6,#8]1~[#6,#8]~[#6,#8]~[#6,#8]~[#6,#8]~1",  # Any O and C 6 ring
                "[#6]1~[#6]~[#6]~[#6]~[#6]~1",  # any C 5 ring
                # aromatic n or c 5 member hetrocycle
                "[c,n]1[c,n][c,n][c,n][c,n]1",
                # any O or C 5 member ring system
                "[#6,#8]1~[#6,#8]~[#6,#8]~[#6,#8]~[#6,#8]~1",
                "C1CCCCC1",  # cyclohexane
                # amine bound to ring
                "[NX3;H2,H1][#6]1~[#6]~[#6]~[#6]~[#6]~[#6]~1",
                "[NH2]c1ccccc1",  # 1' amine bound to benzene
                "c1ccccc1[CH2][NH2]",  # benzyl NH2
                "C1N([#1])CCCC1",  # H connected to N in an aliphatic 6 member ring (piperidine)
                "c1ncccc1",  # Pyridine
                "c1n([H])ccc1",  # pyrrole
                # see description
                "[$([#6]([OH])[#6][#7H2]);!$([#6]([OH])(=O)[#6][#7H2])]",
                # see description
                "[$([#6]([OH])[#6][#7H]([#6]));!$([#6]([OH])(=O)[#6][#7H]([#6]))]",
                # see description
                "[$([#6]([OH])[#6][#7]([#6])([#6]));!$([#6]([OH])(=O)[#6][#7]([#6])([#6]))]",
                # see description
                "[$([#6]([OH])[#6][#6][#7H2]);!$([#6]([OH])(=O)[#6][#6][#7H2])]",
                # see description
                "[$([#6]([OH])[#6][#6][#7H]([#6]));!$([#6]([OH])(=O)[#6][#6][#7H]([#6]))]",
                "[$([#6]([OH])[#6][#6][#7]([#6])([#6]));!$([#6]([OH])(=O)[#6][#6][#7]([#6])([#6]))]",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])[#7H2]",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])[#7H]([CX4])",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])[#7]([CX4])([CX4])",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])C([#6,#1])([#6,#1])[NH2]",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])C([#6,#1])([#6,#1])[NH]([CX4])",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])C([#6,#1])([#6,#1])[N]([CX4])([CX4])",
                "[a][C][#7H2]",  # see description
                "[a][C][C][#7H2]",  # see description
                "[a][C][C][C][#7H2]",  # see description
                "[a][C][#7H]([#6])",  # see description
                "[a][C][C][#7H]([#6])",  # see description
                "[a][C][C][C][#7H]([#6])",  # see description
                "[a][C][#7]([#6])([#6])",  # see description
                "[a][C][C][#7]([#6])([#6])",  # see description
                "[a][C][C][C][#7]([#6])([#6])",  # see description
                "[NH2][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4][CX4][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4][CX4][CX4][CX4]([CH3])",  # see description
                # see description
                "[NH2][CX4][CX4][CX4][CX4][CX4][CX4]([CH3])",
                "[CX4;H2][CX4;H2]",  # see description
                "[CX4;H2][CX4;H2][CX4;H2]",  # see description
                "[CX4;H2][CX4;H2][CX4;H2][CX4;H2]",  # see description
                "[CX4;H2][CX4;H2][CX4;H2][CX4;H2][CX4;H2]",  # see description
                # see description
                "[CX4;H2][CX4;H2][CX4;H2][CX4;H2][CX4;H2][CX4;H2]",
                # poly 1' 2' amine (one SMARTS split over two string literals)
                "[$([#7X3;H2][C;!$(C=[#7,#8])]),$([#7X3;H1]([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])])].[$([#7X3;H2][C;!$(C=[#7,#8])]),"
                "$([#7X3;H1]([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])])]",
                # poly 1' 2' or 3' amine (one SMARTS split over three string literals)
                "[$([#7X3;H2][C;!$(C=[#7,#8])]),$([#7X3;H1]([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])]),$([#7X3]([C;!$(C=[#7,#8])])"
                "([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])])].[$([#7X3;H2][C;!$(C=[#7,#8])]),$([#7X3;H1]([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])]),"
                "$([#7X3]([C;!$(C=[#7,#8])])([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])])]",
                "[#6][O;H1].[#6][O;H1]",  # poly alcohol
                # pyrazine aliphatic C2 and C5 substitution
                "n1c([CX4])cnc([CX4])c1",
                # pyridine_aliphatic_C_2_and_5_substitution
                "n1c([CX4])ccc([CX4])c1",
                "n1cccc([CX4])c1",  # pyridine_aliphatic_C_2_substitution
                "[#5]",  # B
                "[#14]",  # Si
                "[#15]",  # P
                "[#16]",  # S
                "[+]",  # positive charged group
                "[-]",  # negative charge group
            ],
            2: [
                "[NH3]",  # ammonia
                "[NX3;H2][CX4;!$(C=[#7,#8])]",  # 1' amine
                "[NX3;H1][CX4;!$(C=[#7,#8])][CX4;!$(C=[#7,#8])]",  # 2' amine
                # 3' amine
                "[NX3]([CX4;!$(C=[#7,#8])])([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])]",
                "[NX4+]",  # ammonium
                "[a]:[nX3,X2]:[a]",  # SP2 aromatic N
                "[CX3;$([#6]),$([O;H1])](=[OX1])[$([O])]",  # carboxylic acid
                "[#6][#6;!$(C(=O)[OH])][OH]",  # 1' alcohol
                "[#6][#6]([#6])[OH]",  # 2' alcohol'
                "[#6][#6]([#6])([#6])[OH]",  # 3' alcohol
                "[#6]C([CH3])([CH3])([CH3])",  # t-butyl
                "[CX3]=[O;!$(O*)]",  # Carbonyl
                "[#6]~[F,Cl,Br,I]",  # halo carbon
                "c1ccccc1",  # benzene
                # aromatic n or c 6 member hetrocycle
                "[c,n]1[c,n][c,n][c,n][c,n][c,n]1",
                # Named "6_member_c_and_o_ring".
                # NOTE(review): this pattern has five ring atoms and is identical to the
                # "5_member_c_and_o_ring" pattern below - left unchanged because altering it
                # would change existing fingerprints; confirm the intended ring size.
                "[#6,#8]1~[#6,#8]~[#6,#8]~[#6,#8]~[#6,#8]~1",  # Any O and C 6 ring
                "[#6]1~[#6]~[#6]~[#6]~[#6]~1",  # any C 5 ring
                # aromatic n or c 5 member hetrocycle
                "[c,n]1[c,n][c,n][c,n][c,n]1",
                # any O or C 5 member ring system
                "[#6,#8]1~[#6,#8]~[#6,#8]~[#6,#8]~[#6,#8]~1",
                "C1CCCCC1",  # cyclohexane
                # amine bound to ring
                "[NX3;H2,H1][#6]1~[#6]~[#6]~[#6]~[#6]~[#6]~1",
                "[NH2]c1ccccc1",  # 1' amine bound to benzene
                "c1ccccc1[CH2][NH2]",  # benzyl NH2
                "C1N([#1])CCCC1",  # H connected to N in an aliphatic 6 member ring (piperidine)
                "c1ncccc1",  # Pyridine
                "c1n([H])ccc1",  # pyrrole
                # see description
                "[$([#6]([OH])[#6][#7H2]);!$([#6]([OH])(=O)[#6][#7H2])]",
                # see description
                "[$([#6]([OH])[#6][#7H]([#6]));!$([#6]([OH])(=O)[#6][#7H]([#6]))]",
                # see description
                "[$([#6]([OH])[#6][#7]([#6])([#6]));!$([#6]([OH])(=O)[#6][#7]([#6])([#6]))]",
                # see description
                "[$([#6]([OH])[#6][#6][#7H2]);!$([#6]([OH])(=O)[#6][#6][#7H2])]",
                # see description
                "[$([#6]([OH])[#6][#6][#7H]([#6]));!$([#6]([OH])(=O)[#6][#6][#7H]([#6]))]",
                "[$([#6]([OH])[#6][#6][#7]([#6])([#6]));!$([#6]([OH])(=O)[#6][#6][#7]([#6])([#6]))]",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])[#7H2]",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])[#7H]([CX4])",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])[#7]([CX4])([CX4])",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])C([#6,#1])([#6,#1])[NH2]",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])C([#6,#1])([#6,#1])[NH]([CX4])",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])C([#6,#1])([#6,#1])[N]([CX4])([CX4])",
                "[a][C][#7H2]",  # see description
                "[a][C][C][#7H2]",  # see description
                "[a][C][C][C][#7H2]",  # see description
                "[a][C][#7H]([#6])",  # see description
                "[a][C][C][#7H]([#6])",  # see description
                "[a][C][C][C][#7H]([#6])",  # see description
                "[a][C][#7]([#6])([#6])",  # see description
                "[a][C][C][#7]([#6])([#6])",  # see description
                "[a][C][C][C][#7]([#6])([#6])",  # see description
                "[NH2][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4][CX4][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4][CX4][CX4][CX4]([CH3])",  # see description
                # see description
                "[NH2][CX4][CX4][CX4][CX4][CX4][CX4]([CH3])",
                "[CX4;H2][CX4;H2]",  # see description
                "[CX4;H2][CX4;H2][CX4;H2]",  # see description
                "[CX4;H2][CX4;H2][CX4;H2][CX4;H2]",  # see description
                "[CX4;H2][CX4;H2][CX4;H2][CX4;H2][CX4;H2]",  # see description
                # see description
                "[CX4;H2][CX4;H2][CX4;H2][CX4;H2][CX4;H2][CX4;H2]",
                # poly 1' 2' amine (one SMARTS split over two string literals)
                "[$([#7X3;H2][CX4;!$(C=[#7,#8])]),$([#7X3;H1]([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])])].[$([#7X3;H2][CX4;!$(C=[#7,#8])]),"
                "$([#7X3;H1]([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])])]",
                # poly 1' 2' or 3' amine (one SMARTS split over three string literals)
                "[$([#7X3;H2][CX4;!$(C=[#7,#8])]),$([#7X3;H1]([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])]),$([#7X3]([CX4;!$(C=[#7,#8])])"
                "([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])])].[$([#7X3;H2][CX4;!$(C=[#7,#8])]),$([#7X3;H1]([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])]),"
                "$([#7X3]([CX4;!$(C=[#7,#8])])([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])])]",
                "[#6][O;H1].[#6][O;H1]",  # poly alcohol
                # pyrazine aliphatic C2 and C5 substitution
                "n1c([CX4])cnc([CX4])c1",
                # pyridine_aliphatic_C_2_and_5_substitution
                "n1c([CX4])ccc([CX4])c1",
                "n1cccc([CX4])c1",  # pyridine_aliphatic_C_2_substitution
            ],
        }

        # Fall back to the module logger when the caller did not supply one.
        if log is None:
            log = logging.getLogger(__name__)
        log.info("\n")

        self.fingerprint_version = fingerprint_version

        if names is None:
            self.names = self.get_default_names(
                version=self.fingerprint_version)
        else:
            self.names = names

        if substructures is None:
            self.substructures = self.get_default_substructures(
                version=self.fingerprint_version,
            )
        else:
            self.substructures = substructures

        if substructures is None and names is None:
            self.fingerprint_explanation = self.get_default_explanation(
                version=self.fingerprint_version,
            )
            if verbose is True:
                log.info(
                    "Finger print is version {}\n{}".format(
                        self.fingerprint_version, self.fingerprint_explanation,
                    ),
                )
        else:
            # Always define the attribute so later reads do not raise AttributeError.
            self.fingerprint_explanation = None
            log.warning(
                "No fingerprint explanation avaliable as custom substructures have been given, hence you know better than I do what they mean.",
            )

        if len(self.names) != len(self.substructures):
            log.warning(
                "WARNING - the number of names ({}) and the number of substructures ({}) is different, "
                "This will cause issues for defualt functions in this module. Names will be reset to indexes.".format(
                    len(self.names), len(self.substructures),
                ),
            )
            # enumerate() yields (index, item) pairs; only the index is the new name.
            self.names = [
                str(ith) for ith, _ in enumerate(self.substructures)
            ]
            log.warning(
                "New names and substructures:\n{}".format(
                    "\n".join(
                        "{} : {}".format(n, s)
                        for n, s in zip(self.names, self.substructures)
                    ),
                ),
            )

        log.info(
            "Please use the citation below for use of this code:\n{}".format(
                citation),
        )

    def get_default_names(self, version: int = None) -> list:
        """
        Function to get the descriptive names of the substructures we are looking for.
        Essentially these substructures look for the amine environment and groups which can interact with it. The first
        elements identify specific groups. The later elements look for the motifs of closeness of certain functional
        groups to the amine groups.
        :param version: int - which version of the fingerprints to get names for, the instance version if None
        :return: list - the names stored for that version (the stored list itself, not a copy)
        :raises KeyError: if version is not a known fingerprint version
        """

        if version is None:
            version = self.fingerprint_version

        return self.version_names[version]

    def get_default_substructures(self, version: int = None) -> list:
        """
        Function to get the smarts to search for substructures. Essentially these substructures look for the amine
        environment and groups which can interact with it. The first elements identify specific groups. The
        later elements look for the motifs of closeness of certain functional groups to the amine groups.
        :param version: int - which version of the fingerprints to get substructures for, the instance version if None
        :return: list - the SMARTS strings stored for that version (the stored list itself, not a copy)
        :raises KeyError: if version is not a known fingerprint version
        """

        if version is None:
            version = self.fingerprint_version

        return self.version_substructures[version]

    def get_default_explanation(self, version: int = None) -> str:
        """
        Function to get the description of the fingerprint version you have picked.
        :param version: int - which version of the fingerprints to get explanations for, the instance version if None
        :return: str - the explanation text stored for that version
        :raises KeyError: if version is not a known fingerprint version
        """

        if version is None:
            version = self.fingerprint_version

        return self.version_explanations[version]

    def get_fp_information(self, return_df: bool = False):
        """
        Log the information related to the current fingerprint instantiation as a table of
        bit index, description and smarts using the module logger at INFO level.
        :param return_df: bool - if True also return the table as a pandas dataframe
        :return: pandas.DataFrame with columns "description" and "smarts" if return_df is True, otherwise None
        """

        log = logging.getLogger(__name__)

        log.info(
            "{:4} | {:59} | {:50}\n---------------------------------------------------------"
            "-------------------------------------".format(
                "bit", "description", "smarts",
            ),
        )

        # Human readable descriptions: underscores and hyphens become spaces.
        names = [
            nam.replace("_", " ").replace("-", " ") for nam in self.names
        ]

        for ith, (description, smarts) in enumerate(zip(names, self.substructures)):
            log.info("{:4} | {:59} | {:50}".format(ith, description, smarts))

        if return_df is True:
            return pd.DataFrame(
                np.array([names, self.substructures]).T,
                columns=["description", "smarts"],
            )

        return None


############## END of class ############


def maccskeys_fingerprints(smiles: list) -> list:
    """
    Function to get MACCS fingerprints
    :param smiles: list - smiles representation of the molecules to make fingerprints of
    :return: list - one entry per input smiles, in input order, each being the value returned by
    MACCSkeys.GenMACCSKeys for the molecule built by mai.smiles_to_molecule
    """

    # Convert and fingerprint each smiles in a single pass; an empty input gives an empty list.
    return [
        MACCSkeys.GenMACCSKeys(mai.smiles_to_molecule(smile))
        for smile in smiles
    ]


def dask_substructure_checker(
    representation: str, substructures: list = None, smiles=True,
) -> list:
    """
    Function to find substructures using SMARTS - Does not use dask but is used in functions that dask is used in
    :param representation: str - molecule representation, read with mai.smiles_to_molecule when smiles is True
    and with mai.inchi_to_molecule otherwise
    :param substructures: iterable - SMARTS defining the substructures to search for (required despite the None default)
    :param smiles: bool - True (the default) if representation is a smiles string; any other value selects the inchi reader
    :return: list - one int per substructure, in order: 1 if the molecule has a match for it, 0 if not
    :raises TypeError: if substructures is None
    >>> dask_substructure_checker("CC", ["*CC*"])
    [1]
    """

    # Fail early with a clear message instead of the opaque "'NoneType' object is not
    # iterable" that iterating the None default would give after the molecule was parsed.
    if substructures is None:
        raise TypeError(
            "substructures must be an iterable of SMARTS strings, got None",
        )

    if smiles is True:
        mol = mai.smiles_to_molecule(representation)
    else:
        mol = mai.inchi_to_molecule(representation)

    # Compile every SMARTS first, then test each query pattern against the molecule.
    substructs = [Chem.MolFromSmarts(substructure)
                  for substructure in substructures]

    return [int(mol.HasSubstructMatch(substruct))
            for substruct in substructs]


def _ccs_fp_bit_rows(representation, substructures, smiles, thresh, label, log):
    """
    Private helper of ccs_fp: compute one list of 0/1 ints per molecule representation.
    :param representation: tuple/list - smiles or InChI strings
    :param substructures: tuple or list - SMARTS to look for
    :param smiles: bool - True if representation holds smiles, False if it holds InChI
    :param thresh: int - number of representations at or above which dask is used
    :param label: str - text used for the representation type in the log messages
    :param log: logging.Logger - logger to report progress to
    :return: list - one list of ints per representation, in input order
    """
    n_items = len(representation)

    if n_items >= thresh:
        # Guard the chunk size so a zero or negative thresh cannot break the chunk arithmetic
        step = max(int(thresh), 1)
        delayed_rows = []
        for base in range(0, n_items, step):
            # The final chunk is logged with an open upper bound
            upper = base + step if base + step < n_items else None
            log.info("fingerprints computed for {} {}".format(base, label))
            log.info("{}[{}:{}]".format(label, base, upper))
            delayed_rows.extend(
                dask.delayed(dask_substructure_checker)(
                    rep, substructures=substructures, smiles=smiles,
                )
                for rep in representation[base:upper]
            )

        # compute fingerprints
        log.info("Running DASK computation .....")
        rows = list(dask.compute(*delayed_rows))
        log.info("DASK complete fingerprints generated.")
        return rows

    log.info(
        "Length of the {} list is less than the threshold ({} change through function call) running "
        "without DASK".format(label, thresh),
    )
    # Nothing lazy is built in the serial path so there is nothing for dask to compute
    return [
        dask_substructure_checker(
            rep, substructures=substructures, smiles=smiles,
        )
        for rep in representation
    ]


def _ccs_fp_bit_vectors(bit_rows):
    """
    Private helper of ccs_fp: turn rows of 0/1 ints into bit vectors via CreateFromBitString.
    :param bit_rows: iterable - rows of 0/1 ints, one row per molecule
    :return: list - one bit vector per row
    """
    return [
        DataStructs.cDataStructs.CreateFromBitString(
            "".join(str(bit) for bit in row),
        )
        for row in bit_rows
    ]


def ccs_fp(
    representation: list,
    substructures: list = None,
    substructure_names: list = None,
    return_smarts_only: bool = False,
    version: int = 1,
    thresh: int = 1000,
    return_only_fingerprint: bool = False,
    return_fingerprints_as_str: bool = False,
    inchi_regex="InChI=",
):
    """
    Function to make a fingerprint out of the presence or not of a sub-structure using SMARTS. Note this uses
    Lazy dask parallel execution to make the process run in parallel if number of representations is >= thresh.
    An empty representation gives empty results instead of raising.
    :param representation: tuple/list - smiles or InChI representations of molecules to check for substructure
                           presence or absence. The type is decided from the first entry only
    :param substructures: tuple or list - SMARTS tuple/list to look for to form the fingerprint
    :param substructure_names: tuple or list - names of the substructure if given a dataframe is returned as well
    :param return_smarts_only: bool - return only the keys for the smarts in the substructure search
    :param version: int - version of the predefined ccs fingerprint to use
    :param thresh: int - number of smiles under which run in serial over or equal run in parallel with dask
    :param return_only_fingerprint: bool - return only the fingerprints
    :param return_fingerprints_as_str: bool - return only the fingerprints, as strings of 0s and 1s. Takes
                                       precedence over return_only_fingerprint
    :param inchi_regex: str - string to make sure a representation is inchi
    :Returns: list of fingerprints, pandas dataframe of the bits (only when names are available), list of smarts
    >>> ccs_fp(["CCN"], return_fingerprints_as_str=True)
    ['010000100000000000000000000000000000000000000000010000000000000000000000']
    >>> ccs_fp(["InChI=1S/C2H7N/c1-2-3/h2-3H2,1H3"], return_fingerprints_as_str=True)
    ['010000100000000000000000000000000000000000000000010000000000000000000000']
    >>> ccs_fp(["CCN"], return_fingerprints_as_str=True, version=2)
    ['0100000000000000000000000000000000000000000000010000000000000000']
    >>> ccs_fp(["InChI=1S/C2H7N/c1-2-3/h2-3H2,1H3"], return_fingerprints_as_str=True, version=2)
    ['0100000000000000000000000000000000000000000000010000000000000000']
    """
    log = logging.getLogger(__name__)

    # Essentially these substructures look for the amine environment and groups which can interact with it. The
    # first elements identify specific groups. The later elements look for the motifs of closeness of certain
    # functional groups to the amine groups.
    ccus_substructs = ccus_fps(
        names=substructure_names,
        substructures=substructures,
        fingerprint_version=version,
        log=log,
    )
    substructure_names = ccus_substructs.names
    substructures = ccus_substructs.substructures

    if return_smarts_only is True:
        return substructures

    log.info(
        "Number of substructures: {} Number of substructure names: {}".format(
            len(substructures), len(substructure_names),
        ),
    )
    if len(substructures) != len(substructure_names):
        log.error(
            "Number of substructures and names differ cannot produce dataframe: {} "
            "{}".format(len(substructures), len(substructure_names)),
        )
        for s, n in zip(substructures, substructure_names):
            log.info("{} {}".format(n, s))

    # ASSUMPTION: no one puts a mix of smiles and inchi in
    # The length check keeps an empty input from raising IndexError; it is then handled as (zero) smiles
    is_inchi = len(representation) > 0 and inchi_regex in representation[0]
    if is_inchi:
        log.info(
            "'inchi=' found in the first molecule, assume all molecules will be inchi not smiles!",
        )
        log.info("Making fingerprint from {} InChI".format(len(representation)))
        label = "InChI"
    else:
        log.info(
            "'inchi=' not found in the first molecule, assume all molecules will be SMILES not InChI!",
        )
        log.info("Making fingerprint from {} SMILES".format(len(representation)))
        label = "smiles"

    fps = _ccs_fp_bit_rows(
        representation, substructures, not is_inchi, thresh, label, log,
    )

    log.info("Preparing fingerprints")
    df = None
    if substructure_names is not None:
        log.info("Building dataframe .....")
        df = pd.DataFrame(data=fps, columns=substructure_names)
        log.info("Building RDKit bits .....")

    ffps = _ccs_fp_bit_vectors(fps)

    if substructure_names is not None:
        log.info("Fingerprint generation finished.")

    if return_fingerprints_as_str is True:
        return [bits_to_text(f) for f in ffps]
    if return_only_fingerprint is True:
        return ffps
    if df is not None:
        return ffps, df, substructures
    return ffps, substructures


def substructure_checker(smiles: str, substructure: str = None) -> int:
    """
    Check a single molecule for a single SMARTS substructure
    :param smiles: str - smiles string of the molecule to search in
    :param substructure: str - SMARTS defining the substructure to search for
    :return: int - 1 if the molecule has the substructure otherwise 0
    >>> substructure_checker("CC", "*CC*")
    1
    """

    molecule = mai.smiles_to_molecule(smiles)
    pattern = Chem.MolFromSmarts(substructure)

    return 1 if molecule.HasSubstructMatch(pattern) else 0


def fingerprint_similarity(
    fps1, fps2, dice: bool = False, return_distance: bool = False,
) -> float:
    """
    Similarity (or distance) between two fingerprints
    :param fps1: RDKit fingerprint - fingerprint of molecule 1
    :param fps2: RDKit fingerprint - fingerprint of molecule 2
    :param dice: bool - True to use dice similarity, otherwise Tanimoto similarity is used
    :param return_distance: bool - True to return the distance (1 - similarity) instead of the similarity
    :return: float - the similarity, or the distance if return_distance is True
    """

    score = (
        dice_similarity(fps1, fps2)
        if dice is True
        else DataStructs.TanimotoSimilarity(fps1, fps2)
    )

    return 1.0 - score if return_distance is True else score


def bits_to_text(fp) -> str:
    """
    Render a bit vector fingerprint as a string of 0s and 1s
    :param fp: RDKit bit fingerprint - RDKit bit fingerprint to be set to 1s and 0s
    :return: str - text form of the bit vector as given by BitVectToText
    >>> bits_to_text(maccskeys_fingerprints(["CC"])[0])
    '00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000100000000001000000'
    """

    return DataStructs.cDataStructs.BitVectToText(fp)


def bulk_similarity(
    fp, fp_targets: list, test: bool = False, thresh: float = 0.5,
) -> pd.DataFrame:
    """
    Function to compare one fp with a list of others and get all the scores
    :param fp: RDKit fingerprint - fingerprint to compare to a list of fingerprint targets
    :param fp_targets: list - fingerprint targets to compare fp to
    :param test: bool - return only molecules with similarity greater than or equal to the thresh
    :param thresh: float - threshold for similarity to be returned
    :return: pd.DataFrame - one row per target with the columns number, fp_reference, fp_target and
             tanimoto_similarity
    """

    tani_similarity = DataStructs.BulkTanimotoSimilarity(fp, fp_targets)
    targets = list(fp_targets)

    # Build the frame column by column rather than through one mixed np.array: a single array forces every
    # column to object dtype and lets numpy try to unpack the fingerprints.
    # NOTE(review): presumably RDKit bit vectors look like sequences to numpy - confirm against the RDKit version used
    df = pd.DataFrame(
        {
            "number": list(range(len(targets))),
            "fp_reference": [fp] * len(targets),
            "fp_target": targets,
            "tanimoto_similarity": list(tani_similarity),
        },
        columns=["number", "fp_reference", "fp_target", "tanimoto_similarity"],
    )

    if test is True:
        df = df[df["tanimoto_similarity"] >= thresh]
    return df


def dice_similarity(v1, v2):
    """
    Dice similarity between two bit vectors, delegated to DataStructs.DiceSimilarity
    :param v1: RDKit bit vector - chemical representation as bit vector eg a bit vector fingerprint
    :param v2: RDKit bit vector - chemical representation as bit vector eg a bit vector fingerprint
    :return: the value returned by DataStructs.DiceSimilarity for v1 and v2
    """

    score = DataStructs.DiceSimilarity(v1, v2)
    return score


def diverse_set_picking(fps: list, n_diverse_batch: int = 10):
    """
    Pick a diverse subset of fingerprints with the commonly applied maxmin picking method
    https://onlinelibrary.wiley.com/doi/epdf/10.1002/qsar.200290002
    The picking itself is delegated to RDKit's MaxMinPicker using the module level random_seed.
    :param fps: list of RDKit fingerprint - molecule fingerprints to use to pick a diverse set from
    :param n_diverse_batch : int - the number of set members to include in the diverse set
    :return: the indices into fps returned by the picker
    """

    picker = SimDivFilters.rdSimDivPickers.MaxMinPicker()

    # The whole list is offered as the pool
    picked = picker.LazyBitVectorPick(
        fps,
        poolSize=len(fps),
        pickSize=n_diverse_batch,
        seed=random_seed,
    )

    logging.getLogger(__name__).debug(
        "Diverse indices: {}".format(list(picked)),
    )

    return picked


def contains_substructures(
    smiles: list,
    substructures: tuple = (
        "[NH3]",
        "[NX3;H2][C;!$(C=[#7,#8])]",
        "[NX3;H1]([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])]",
        "[NX3]([C;!$(C=[#7,#8])])([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])]",
        "[$([nX3,X2](:[c,n,o,b,s]):[c,n,o,b,s])]",
    ),
    substructure_names: tuple = (
        "ammonia",
        "primiary_amine",
        "secondary_amine",
        "tertiary_amine",
        "aromatic_sp2_n",
    ),
    version_name: int = 1,
    thresh: int = 1000,
    remove_no_match_rows: bool = False,
    test: bool = False,
):
    """
    Flag which smiles contain at least one of the given substructures (by default amines or an aromatic sp2 N)
    :param smiles: list or str - smiles to look for substructures in, a single str is wrapped in a list
    :param substructures: iterable of str - smarts patterns to look for
    :param substructure_names: iterable of str - description of the SMARTS patterns
    :param version_name: int - ccs fp version number
    :param thresh: int - batch threshold for fingerprint code
    :param remove_no_match_rows: bool - remove rows with no matches
    :param test: bool - for testing the function, not read by this function
    :return: dataframe - one column per substructure name plus an any_true column

    """

    log = logging.getLogger(__name__)

    log.info("Passing smiles and substructures to dask fp")
    described = [
        "{} ; {}".format(name, pattern)
        for name, pattern in zip(substructure_names, substructures)
    ]
    log.info("Substructures:\n{}\n-----\n".format("\n".join(described)))

    if isinstance(smiles, str):
        log.info(
            "Smiles is expected to be a list assume it is one smiles and put in a list",
        )
        smiles = [smiles]

    # Only the dataframe of bits is needed here, the fingerprints and smarts are discarded
    _, match_table, _ = ccs_fp(
        smiles,
        substructures=substructures,
        substructure_names=substructure_names,
        return_smarts_only=False,
        version=version_name,
        thresh=thresh,
    )

    row_has_match = match_table.any(axis=1)
    log.info(
        "The following rows have at least one of the substructures found: {}".format(
            row_has_match,
        ),
    )
    match_table["any_true"] = row_has_match

    # dataframe index values of the rows in which none of the substructures matched
    unmatched = match_table.index[match_table["any_true"] == 0]

    if remove_no_match_rows is True:
        log.info("{}".format(match_table))
        log.info("Dropping rows: {}".format(unmatched))
        match_table.drop(unmatched, axis=0, inplace=True)

    return match_table


def ccus_fp_bitstr(
    mol: rdkit.Chem.rdchem.Mol,
    substructures: list = None,
    substructure_names: list = None,
    version: int = 1,
):
    """
    Function to build the ccus fingerprint of one RDKit molecule using SMARTS - Does not use dask.
    :param mol: rdkit.Chem.rdchem.Mol - RDKit molecule to fingerprint
    :param substructures: iterable - SMARTS defining the substructures to search for, taken from ccus_fps for
                          the requested version if None
    :param substructure_names: iterable - names to describe the SMARTS substructure strings meaning. Only
                               passed on to ccus_fps, the names do not influence the bits
    :param version: int - version of the fingerprints to use
    :return: bit vector built with CreateFromBitString, one bit per substructure in order
    """

    # A single lookup covers every case in which something was left as None. The lookup is kept even when
    # only the names are missing so ccus_fps still sees the caller's substructures.
    if substructures is None or substructure_names is None:
        ccus_substructs = ccus_fps(
            names=substructure_names,
            substructures=substructures,
            fingerprint_version=version,
            verbose=False,
        )
        if substructures is None:
            substructures = ccus_substructs.substructures

    substructs = [Chem.MolFromSmarts(substructure)
                  for substructure in substructures]
    bits = [int(mol.HasSubstructMatch(substruct)) for substruct in substructs]

    return DataStructs.cDataStructs.CreateFromBitString(
        "".join(str(bit) for bit in bits),
    )


if __name__ == "__main__":
    # Run the doctests embedded in the docstrings of this module when it is executed as a script
    from doctest import testmod

    testmod()

Functions

def bits_to_text(fp) ‑> str

Function to convert bit vec to text 0s and 1s :param fp: RDKit bit fingerprint - RDKit bit fingerprint to be set to 1s and 0s

>>> bits_to_text(maccskeys_fingerprints(["CC"])[0])
'00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000100000000001000000'
Expand source code
def bits_to_text(fp) -> str:
    """
    Render a bit vector fingerprint as a string of 0s and 1s
    :param fp: RDKit bit fingerprint - RDKit bit fingerprint to be set to 1s and 0s
    :return: str - text form of the bit vector as given by BitVectToText
    >>> bits_to_text(maccskeys_fingerprints(["CC"])[0])
    '00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001000000000000000000000000000000000000100000000001000000'
    """

    return DataStructs.cDataStructs.BitVectToText(fp)
def bulk_similarity(fp, fp_targets: list, test: bool = False, thresh: float = 0.5) ‑> pandas.core.frame.DataFrame

Function to compare one fp with a list of others and get all the scores :param fp: RDKit fingerprint - fingerprint to compare to a list of fingerprint targets :param fp_targets: list - fingerprint targets to compare fp to :param test: bool - return only molecules with similarity greater than or equal to the thresh :param thresh: float - threshold for similarity to be returned :return:

Expand source code
def bulk_similarity(
    fp, fp_targets: list, test: bool = False, thresh: float = 0.5,
) -> pd.DataFrame:
    """
    Function to compare one fp with a list of others and get all the scores
    :param fp: RDKit fingerprint - fingerprint to compare to a list of fingerprint targets
    :param fp_targets: list - fingerprint targets to compare fp to
    :param test: bool - return only molecules with similarity greater than or equal to the thresh
    :param thresh: float - threshold for similarity to be returned
    :return: pd.DataFrame - one row per target with the columns number, fp_reference, fp_target and
             tanimoto_similarity
    """

    tani_similarity = DataStructs.BulkTanimotoSimilarity(fp, fp_targets)
    targets = list(fp_targets)

    # Build the frame column by column rather than through one mixed np.array: a single array forces every
    # column to object dtype and lets numpy try to unpack the fingerprints.
    # NOTE(review): presumably RDKit bit vectors look like sequences to numpy - confirm against the RDKit version used
    df = pd.DataFrame(
        {
            "number": list(range(len(targets))),
            "fp_reference": [fp] * len(targets),
            "fp_target": targets,
            "tanimoto_similarity": list(tani_similarity),
        },
        columns=["number", "fp_reference", "fp_target", "tanimoto_similarity"],
    )

    if test is True:
        df = df[df["tanimoto_similarity"] >= thresh]
    return df
def ccs_fp(representation: list, substructures: list = None, substructure_names: list = None, return_smarts_only: bool = False, version: int = 1, thresh: int = 1000, return_only_fingerprint: bool = False, return_fingerprints_as_str: bool = False, inchi_regex='InChI=')

Function to make a fingerprint out of the presence or not of a sub-structure using SMARTS. Note this uses Lazy dask parallel execution to make the process run in parallel if number of representations is >= thresh. :param representation: tuple/list - smiles representations of molecules to check for substructure presence or absence :param substructures: tuple or list - SMARTS tuple/list to look for to form the fingerprint :param substructure_names: tuple or list - names of the substructure if given a dataframe is returned as well :param return_smarts_only: bool - return only the keys for the smarts in the substructure search :param version: int - version of the predefined ccs fingerprint to use :param thresh: int - number of smiles under which run in serial over or equal run in parallel with dask :param return_only_fingerprint: bool - return only the fingerprints :param inchi_regex: str - string to make sure a representation is inchi :Returns: list, pandas dataframe, list

>>> ccs_fp(["CCN"], return_fingerprints_as_str=True)
['010000100000000000000000000000000000000000000000010000000000000000000000']
>>> ccs_fp(["InChI=1S/C2H7N/c1-2-3/h2-3H2,1H3"], return_fingerprints_as_str=True)
['010000100000000000000000000000000000000000000000010000000000000000000000']
>>> ccs_fp(["CCN"], return_fingerprints_as_str=True, version=2)
['0100000000000000000000000000000000000000000000010000000000000000']
>>> ccs_fp(["InChI=1S/C2H7N/c1-2-3/h2-3H2,1H3"], return_fingerprints_as_str=True, version=2)
['0100000000000000000000000000000000000000000000010000000000000000']
Expand source code
def _ccs_fp_bit_rows(representation, substructures, smiles, thresh, label, log):
    """
    Private helper of ccs_fp: compute one list of 0/1 ints per molecule representation.
    :param representation: tuple/list - smiles or InChI strings
    :param substructures: tuple or list - SMARTS to look for
    :param smiles: bool - True if representation holds smiles, False if it holds InChI
    :param thresh: int - number of representations at or above which dask is used
    :param label: str - text used for the representation type in the log messages
    :param log: logging.Logger - logger to report progress to
    :return: list - one list of ints per representation, in input order
    """
    n_items = len(representation)

    if n_items >= thresh:
        # Guard the chunk size so a zero or negative thresh cannot break the chunk arithmetic
        step = max(int(thresh), 1)
        delayed_rows = []
        for base in range(0, n_items, step):
            # The final chunk is logged with an open upper bound
            upper = base + step if base + step < n_items else None
            log.info("fingerprints computed for {} {}".format(base, label))
            log.info("{}[{}:{}]".format(label, base, upper))
            delayed_rows.extend(
                dask.delayed(dask_substructure_checker)(
                    rep, substructures=substructures, smiles=smiles,
                )
                for rep in representation[base:upper]
            )

        # compute fingerprints
        log.info("Running DASK computation .....")
        rows = list(dask.compute(*delayed_rows))
        log.info("DASK complete fingerprints generated.")
        return rows

    log.info(
        "Length of the {} list is less than the threshold ({} change through function call) running "
        "without DASK".format(label, thresh),
    )
    # Nothing lazy is built in the serial path so there is nothing for dask to compute
    return [
        dask_substructure_checker(
            rep, substructures=substructures, smiles=smiles,
        )
        for rep in representation
    ]


def _ccs_fp_bit_vectors(bit_rows):
    """
    Private helper of ccs_fp: turn rows of 0/1 ints into bit vectors via CreateFromBitString.
    :param bit_rows: iterable - rows of 0/1 ints, one row per molecule
    :return: list - one bit vector per row
    """
    return [
        DataStructs.cDataStructs.CreateFromBitString(
            "".join(str(bit) for bit in row),
        )
        for row in bit_rows
    ]


def ccs_fp(
    representation: list,
    substructures: list = None,
    substructure_names: list = None,
    return_smarts_only: bool = False,
    version: int = 1,
    thresh: int = 1000,
    return_only_fingerprint: bool = False,
    return_fingerprints_as_str: bool = False,
    inchi_regex="InChI=",
):
    """
    Function to make a fingerprint out of the presence or not of a sub-structure using SMARTS. Note this uses
    Lazy dask parallel execution to make the process run in parallel if number of representations is >= thresh.
    An empty representation gives empty results instead of raising.
    :param representation: tuple/list - smiles or InChI representations of molecules to check for substructure
                           presence or absence. The type is decided from the first entry only
    :param substructures: tuple or list - SMARTS tuple/list to look for to form the fingerprint
    :param substructure_names: tuple or list - names of the substructure if given a dataframe is returned as well
    :param return_smarts_only: bool - return only the keys for the smarts in the substructure search
    :param version: int - version of the predefined ccs fingerprint to use
    :param thresh: int - number of smiles under which run in serial over or equal run in parallel with dask
    :param return_only_fingerprint: bool - return only the fingerprints
    :param return_fingerprints_as_str: bool - return only the fingerprints, as strings of 0s and 1s. Takes
                                       precedence over return_only_fingerprint
    :param inchi_regex: str - string to make sure a representation is inchi
    :Returns: list of fingerprints, pandas dataframe of the bits (only when names are available), list of smarts
    >>> ccs_fp(["CCN"], return_fingerprints_as_str=True)
    ['010000100000000000000000000000000000000000000000010000000000000000000000']
    >>> ccs_fp(["InChI=1S/C2H7N/c1-2-3/h2-3H2,1H3"], return_fingerprints_as_str=True)
    ['010000100000000000000000000000000000000000000000010000000000000000000000']
    >>> ccs_fp(["CCN"], return_fingerprints_as_str=True, version=2)
    ['0100000000000000000000000000000000000000000000010000000000000000']
    >>> ccs_fp(["InChI=1S/C2H7N/c1-2-3/h2-3H2,1H3"], return_fingerprints_as_str=True, version=2)
    ['0100000000000000000000000000000000000000000000010000000000000000']
    """
    log = logging.getLogger(__name__)

    # Essentially these substructures look for the amine environment and groups which can interact with it. The
    # first elements identify specific groups. The later elements look for the motifs of closeness of certain
    # functional groups to the amine groups.
    ccus_substructs = ccus_fps(
        names=substructure_names,
        substructures=substructures,
        fingerprint_version=version,
        log=log,
    )
    substructure_names = ccus_substructs.names
    substructures = ccus_substructs.substructures

    if return_smarts_only is True:
        return substructures

    log.info(
        "Number of substructures: {} Number of substructure names: {}".format(
            len(substructures), len(substructure_names),
        ),
    )
    if len(substructures) != len(substructure_names):
        log.error(
            "Number of substructures and names differ cannot produce dataframe: {} "
            "{}".format(len(substructures), len(substructure_names)),
        )
        for s, n in zip(substructures, substructure_names):
            log.info("{} {}".format(n, s))

    # ASSUMPTION: no one puts a mix of smiles and inchi in
    # The length check keeps an empty input from raising IndexError; it is then handled as (zero) smiles
    is_inchi = len(representation) > 0 and inchi_regex in representation[0]
    if is_inchi:
        log.info(
            "'inchi=' found in the first molecule, assume all molecules will be inchi not smiles!",
        )
        log.info("Making fingerprint from {} InChI".format(len(representation)))
        label = "InChI"
    else:
        log.info(
            "'inchi=' not found in the first molecule, assume all molecules will be SMILES not InChI!",
        )
        log.info("Making fingerprint from {} SMILES".format(len(representation)))
        label = "smiles"

    fps = _ccs_fp_bit_rows(
        representation, substructures, not is_inchi, thresh, label, log,
    )

    log.info("Preparing fingerprints")
    df = None
    if substructure_names is not None:
        log.info("Building dataframe .....")
        df = pd.DataFrame(data=fps, columns=substructure_names)
        log.info("Building RDKit bits .....")

    ffps = _ccs_fp_bit_vectors(fps)

    if substructure_names is not None:
        log.info("Fingerprint generation finished.")

    if return_fingerprints_as_str is True:
        return [bits_to_text(f) for f in ffps]
    if return_only_fingerprint is True:
        return ffps
    if df is not None:
        return ffps, df, substructures
    return ffps, substructures
def ccus_fp_bitstr(mol: rdkit.Chem.rdchem.Mol, substructures: list = None, substructure_names: list = None, version: int = 1)

Function to find a substructure using SMARTS - Does not use dask but is used in functions that dask is used in return the ccus fingerprint as a cDatastructs array. :param mol: rdkit.Chem.rdchem.Mol - RDKit molecule :param substructures: iterable - SMARTS defining the substructure to search for :param substructure_names: iterable - names to describe the SMARTS substructure strings meaning :param version: int - version of the fingerprints to use :return: bitstr

Expand source code
def ccus_fp_bitstr(
    mol: rdkit.Chem.rdchem.Mol,
    substructures: list = None,
    substructure_names: list = None,
    version: int = 1,
):
    """
    Function to build the ccus fingerprint of one molecule using SMARTS substructure matching - Does not use dask
    but is used in functions that dask is used in. One bit is produced per SMARTS pattern, in the order the patterns
    are given; a bit is 1 when the molecule contains the pattern and 0 otherwise.
    :param mol: rdkit.Chem.rdchem.Mol - RDKit molecule
    :param substructures: iterable - SMARTS defining the substructures to search for, None uses the defaults of
                          the requested fingerprint version
    :param substructure_names: iterable - names to describe the SMARTS substructure strings meaning, these do not
                               influence the bits that are returned
    :param version: int - version of the fingerprints to use when defaults are needed
    :return: RDKit bit vector built with DataStructs.cDataStructs.CreateFromBitString
    """

    # A ccus_fps object is only needed when at least one of the two inputs is missing. It supplies the default
    # SMARTS for the requested version and checks/logs the consistency of names and substructures. The names are
    # not used to build the bit vector so they are not stored here.
    if substructures is None or substructure_names is None:
        ccus_substructs = ccus_fps(
            names=substructure_names,
            substructures=substructures,
            fingerprint_version=version,
            verbose=False,
        )
        if substructures is None:
            substructures = ccus_substructs.substructures

    # Compile every SMARTS once and record a 0/1 flag for each pattern
    patterns = [Chem.MolFromSmarts(smarts) for smarts in substructures]
    bits = [int(mol.HasSubstructMatch(pattern)) for pattern in patterns]

    return DataStructs.cDataStructs.CreateFromBitString(
        "".join(str(bit) for bit in bits),
    )
def contains_substructures(smiles: list, substructures: tuple = ('[NH3]', '[NX3;H2][C;!$(C=[#7,#8])]', '[NX3;H1]([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])]', '[NX3]([C;!$(C=[#7,#8])])([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])]', '[$([nX3,X2](:[c,n,o,b,s]):[c,n,o,b,s])]'), substructure_names: tuple = ('ammonia', 'primiary_amine', 'secondary_amine', 'tertiary_amine', 'aromatic_sp2_n'), version_name: int = 1, thresh: int = 1000, remove_no_match_rows: bool = False, test: bool = False)

Function to check if the smiles are amines or contain an aromatic sp2 N. :param smiles: list of str - smiles strings to search for the substructures (a single string is wrapped in a list) :param substructures: iterable of str - SMARTS patterns to look for :param substructure_names: iterable of str - descriptions of the SMARTS patterns :param version_name: int - ccs fp version number :param thresh: int - batch threshold for fingerprint code :param remove_no_match_rows: bool - remove rows with no matches :param test: bool - for testing the function :return: dataframe

Expand source code
def contains_substructures(
    smiles: list,
    substructures: tuple = (
        "[NH3]",
        "[NX3;H2][C;!$(C=[#7,#8])]",
        "[NX3;H1]([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])]",
        "[NX3]([C;!$(C=[#7,#8])])([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])]",
        "[$([nX3,X2](:[c,n,o,b,s]):[c,n,o,b,s])]",
    ),
    substructure_names: tuple = (
        "ammonia",
        "primiary_amine",
        "secondary_amine",
        "tertiary_amine",
        "aromatic_sp2_n",
    ),
    version_name: int = 1,
    thresh: int = 1000,
    remove_no_match_rows: bool = False,
    test: bool = False,
):
    """
    Function to flag which smiles contain at least one of the given substructures, by default the amine and
    aromatic sp2 N patterns
    :param smiles: list - smiles strings to search, a single str is accepted and wrapped in a list
    :param substructures: iterable of str - smarts patterns to look for
    :param substructure_names: iterable of str - description of the SMARTS patterns
    :param version_name: int - ccs fp version number passed on to ccs_fp
    :param thresh: int - batch threshold for fingerprint code passed on to ccs_fp
    :param remove_no_match_rows: bool - remove rows with no matches
    :param test: bool - for testing the function (not read by this function)
    :return: dataframe - the dataframe returned by ccs_fp with an extra "any_true" column

    """

    log = logging.getLogger(__name__)

    log.info("Passing smiles and substructures to dask fp")
    name_smarts_lines = [
        "{} ; {}".format(name, smarts)
        for name, smarts in zip(substructure_names, substructures)
    ]
    log.info(
        "Substructures:\n{}\n-----\n".format("\n".join(name_smarts_lines)),
    )

    if isinstance(smiles, str):
        log.info(
            "Smiles is expected to be a list assume it is one smiles and put in a list",
        )
        smiles = [smiles]

    # Only the dataframe is needed here, the bit vectors and the smarts are discarded
    _, fingps_df, _ = ccs_fp(
        smiles,
        substructures=substructures,
        substructure_names=substructure_names,
        return_smarts_only=False,
        version=version_name,
        thresh=thresh,
    )

    row_has_match = fingps_df.any(axis=1)
    log.info(
        "The following rows have at least one of the substructures found: {}".format(
            row_has_match,
        ),
    )
    fingps_df["any_true"] = row_has_match

    if remove_no_match_rows is True:
        # dataframe index values of the rows where none of the substructures matched
        no_match_idx = fingps_df.index[fingps_df["any_true"] == 0]
        log.info("{}".format(fingps_df))
        log.info("Dropping rows: {}".format(no_match_idx))
        fingps_df.drop(no_match_idx, axis=0, inplace=True)

    return fingps_df
def dask_substructure_checker(representation: str, substructures: list = None, smiles=True) ‑> list

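Function to find substructures using SMARTS - Does not use dask itself but is used in functions that dask is used in. :param representation: str - smiles (or InChI when smiles is not True) of the molecule to search :param substructures: iterable - SMARTS defining the substructures to search for :param smiles: bool - True if the representation is a smiles string, otherwise it is read as an InChI :return: list - one integer per substructure, 1 if the substructure is found and 0 if not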

>>> dask_substructure_checker("CC", ["*CC*"])
[1]
Expand source code
def dask_substructure_checker(
    representation: str, substructures: list = None, smiles=True,
) -> list:
    """
    Function to find substructures using SMARTS - Does not use dask but is used in functions that dask is used in
    :param representation: str - smiles (or InChI when smiles is not True) of the molecule to search
    :param substructures: iterable - SMARTS defining the substructures to search for
    :param smiles: bool - True reads the representation as smiles, anything else reads it as an InChI
    :return: list - one integer per substructure, 1 if the substructure is found and 0 if not
    >>> dask_substructure_checker("CC", ["*CC*"])
    [1]
    """

    # Pick the parser that matches the kind of string we were given
    parse = mai.smiles_to_molecule if smiles is True else mai.inchi_to_molecule
    mol = parse(representation)

    patterns = [Chem.MolFromSmarts(smarts) for smarts in substructures]

    return [int(mol.HasSubstructMatch(pattern)) for pattern in patterns]
def dice_similarity(v1, v2)

Function to return the Dice similarity between two bit vectors. :param v1: RDKit bit vector - chemical representation as a bit vector, e.g. a bit vector fingerprint :param v2: RDKit bit vector - chemical representation as a bit vector, e.g. a bit vector fingerprint

Expand source code
def dice_similarity(v1, v2):
    """
    Function to return the Dice similarity between two bit vectors, a thin wrapper around
    DataStructs.DiceSimilarity
    :param v1: RDKit bit vector - chemical representation as a bit vector e.g. a bit vector fingerprint
    :param v2: RDKit bit vector - chemical representation as a bit vector e.g. a bit vector fingerprint
    :return: the value returned by DataStructs.DiceSimilarity for the two vectors
    """

    dice_score = DataStructs.DiceSimilarity(v1, v2)
    return dice_score
def diverse_set_picking(fps: list, n_diverse_batch: int = 10)

A function using the commonly applied MaxMin picking method https://onlinelibrary.wiley.com/doi/epdf/10.1002/qsar.200290002. Essentially, the algorithm selects a seed molecule, calculates dissimilarity from a fingerprint distance metric and adds the most dissimilar molecule to the set. This is stopped when either n molecules are picked or m threshold in the distance metric is surpassed by all molecules. :param fps: list of RDKit fingerprint - molecule fingerprints to use to pick a diverse set from :param n_diverse_batch: int - the number of set members to include in the diverse set

Expand source code
def diverse_set_picking(fps: list, n_diverse_batch: int = 10):
    """
    A function using the commonly applied maxmin picking methods https://onlinelibrary.wiley.com/doi/epdf/10.1002/qsar.200290002
    The RDKit MaxMinPicker is asked for n_diverse_batch picks out of all of the fingerprints given, using the module
    level random_seed so that the picks are repeatable. No distance threshold is passed to the picker here, only
    the number of picks.
    :param fps: list of RDKit fingerprint - molecule fingerprints to use to pick a diverse set from
    :param n_diverse_batch : int - the number of set members to include in the diverse set
    :return: the indices into fps of the picked set, exactly as returned by LazyBitVectorPick
    """

    log = logging.getLogger(__name__)

    picker = SimDivFilters.rdSimDivPickers.MaxMinPicker()
    picked = picker.LazyBitVectorPick(
        fps,
        poolSize=len(fps),
        pickSize=n_diverse_batch,
        seed=random_seed,
    )

    log.debug("Diverse indices: {}".format(list(picked)))

    return picked
def fingerprint_similarity(fps1, fps2, dice: bool = False, return_distance: bool = False) ‑> float

Function to calculate fingerprint similarity :param fps1: RDKit fingerprint - fingerprint of molecule 1 :param fps2: RDKit fingerprint - fingerprint of molecule 2 :param dice: true/false - Use dice similarity :param return_distance: bool - return distance (1 - similarity)

Expand source code
def fingerprint_similarity(
    fps1, fps2, dice: bool = False, return_distance: bool = False,
) -> float:
    """
    Function to calculate the similarity (or distance) between two fingerprints
    :param fps1: RDKit fingerprint - fingerprint of molecule 1
    :param fps2: RDKit fingerprint - fingerprint of molecule 2
    :param dice: true/false - Use dice similarity, otherwise Tanimoto similarity is used
    :param return_distance: bool - return distance (1 - similarity)
    :return: float - the similarity, or 1 - similarity if return_distance is True
    """

    if dice is True:
        score = dice_similarity(fps1, fps2)
    else:
        score = DataStructs.TanimotoSimilarity(fps1, fps2)

    # Distance is defined here as the complement of the similarity
    return 1.0 - score if return_distance is True else score
def maccskeys_fingerprints(smiles: list) ‑> list

Function to get MACCS fingerprints :param smiles: list - smiles representation of the molecules to make fingerprints of

Expand source code
def maccskeys_fingerprints(smiles: list) -> list:
    """
    Function to get MACCS keys fingerprints for a list of smiles
    :param smiles: list - smiles representation of the molecules to make fingerprints of
    :return: list - one MACCS keys fingerprint per smiles, in the same order as the input
    """

    # All smiles are converted first, then the keys are generated for every molecule
    molecules = list(map(mai.smiles_to_molecule, smiles))

    return [MACCSkeys.GenMACCSKeys(molecule) for molecule in molecules]
def substructure_checker(smiles: str, substructure: str = None) ‑> int

Function to find a substructure using SMARTS. :param smiles: str - smiles of the molecule to search :param substructure: str - SMARTS defining the substructure to search for :return: int - 1 if the substructure is found in the molecule, 0 if it is not

>>> substructure_checker("CC", "*CC*")
1
Expand source code
def substructure_checker(smiles: str, substructure: str = None) -> int:
    """
    Function to find a substructure using SMARTS
    :param smiles: str - smiles of the molecule to search
    :param substructure: str - SMARTS defining the substructure to search for
    :return: int - 1 if the substructure is found in the molecule, 0 if it is not
    >>> substructure_checker("CC", "*CC*")
    1
    """

    molecule = mai.smiles_to_molecule(smiles)
    pattern = Chem.MolFromSmarts(substructure)

    return 1 if molecule.HasSubstructMatch(pattern) else 0

Classes

class ccus_fps (fingerprint_version: int = 1, names: list = None, substructures: list = None, log: logging.Logger = None, verbose: bool = True)

Initialise the class. :param fingerprint_version: int - version number :param names: iterable - list of names of the substructures to use for the fingerprint :param substructures: iterable - list of substructure strings in SMARTS notation :param log: logging.Logger - logger object :param verbose: bool - print extra information if verbose

Expand source code
class ccus_fps(object):
    def __init__(
        self,
        fingerprint_version: int = 1,
        names: list = None,
        substructures: list = None,
        log: logging.Logger = None,
        verbose: bool = True,
    ):
        """
        Initialise the class
        :param fingerprint_version: int - version number, used to look up the default names, substructures and
                                    explanation
        :param names: iterable - list of names of the substructure to use for the fingerprint, None uses the
                      defaults for the version
        :param substructures: iterable - list of substructure strings in smarts notation, None uses the defaults
                              for the version
        :param log: logging.Logger - logger object, None uses the module logger
        :param verbose: bool - print extra information if verbose
        """
        # The explanation strings are built by implicit concatenation so the separating space must be explicit
        self.version_explanations = {
            1: "This is a filtered set which seems to perform well for modelling this finger print includes rarer groups like "
            "sulphur.",
            2: "This is a filtered set which seems to perform well for modelling but does not include rarer groups like "
            "sulphur containing hetrocycles.",
        }

        # Names are positional: entry i describes the SMARTS at position i of version_substructures[version]
        self.version_names = {
            1: [
                "ammonia",
                "primary_amine",
                "secondary_amine",
                "tertiary_amine",
                "quaternary_N",
                "imine",
                "nitrogen_bonded_to_carbon",
                "aromatic_N_sp2",
                "carboxylic_acid",
                "primary_alcohol",
                "secondary_alcohol",
                "tertiary_alcohol",
                "t_butyl",
                "carbonyl",
                "halocarbon",
                "benezene_ring",
                "6_member_aromatic_c_and_n_ring",
                "6_member_c_and_o_ring",
                "5_c_ring",
                "5_member_aromatic_c_and_n_ring",
                "5_member_c_and_o_ring",
                "Cyclohexane",
                "Cyclohexylamine",
                "Aniline",
                "benzylamine",
                "piperidine",
                "pyridine",
                "pyrrole",
                "primary_amino_alcohol_two_carbon_separation",
                "secondary_amino_alcohol_two_carbon_separation",
                "tertiary_amino_alcohol_two_carbon_separation",
                "primary_amino_alcohol_three_carbon_separation",
                "secondary_amino_alcohol_three_carbon_separation",
                "tertiary_amino_alcohol_three_carbon_separation",
                "aliphatic_primary_amino_alcohol_two_carbon_separation",
                "aliphatic_secondary_amino_alcohol_two_carbon_separation",
                "aliphatic_tertiary_amino_alcohol_two_carbon_separation",
                "aliphatic_primary_amino_alcohol_three_carbon_separation",
                "aliphatic_secondary_amino_alcohol_three_carbon_separation",
                "aliphatic_tertiary_amino_alcohol_three_carbon_separation",
                "primary_amine_one_carbon_aromatic_group",
                "primary_amine_two_carbon_aromatic_group",
                "primary_amine_three_carbon_aromatic_group",
                "secondary_amine_one_carbon_aromatic_group",
                "secondary_amine_two_carbon_aromatic_group",
                "secondary_amine_three_carbon_aromatic_group",
                "tertiary_amine_one_carbon_aromatic_group",
                "tertiary_amine_two_carbon_aromatic_group",
                "tertiary_amine_three_carbon_aromatic_group",
                "methyl_branch_one_carbon_from_a_N_atom",
                "methyl_branch_two_carbon_from_a_N_atom",
                "methyl_branch_three_carbon_from_a_N_atom",
                "methyl_branch_four_carbon_from_a_N_atom",
                "methyl_branch_five_carbon_from_a_N_atom",
                "methyl_branch_six_carbon_from_a_N_atom",
                "ethyl_chain",
                "propyl_chain",
                "butyl_chain",
                "pentyl_chain",
                "hexyl_chain",
                "poly_primary_and_or_secondary_amine",
                "poly_primary_and_or_secondary_and_or_tertiary_amine",
                "poly_alcohol",
                "pyrazine_aliphatic_C_2_and_5_substitution",
                "pyridine_aliphatic_C_2_and_5_substitution",
                "pyridine_aliphatic_C_2_substitution",
                "Presence_of_Boron",
                "Presence_of_Silicon",
                "Presence_of_Phosphurus",
                "Presence_of_Sulphur",
                "positive_charge_group",
                "negative_charge_group",
            ],
            2: [
                "ammonia",
                "primary_amine",
                "secondary_amine",
                "tertiary_amine",
                "quaternary_N",
                "aromatic_N_sp2",
                "carboxylic_acid",
                "primary_alcohol",
                "secondary_alcohol",
                "tertiary_alcohol",
                "t_butyl",
                "carbonyl",
                "halocarbon",
                "benezene_ring",
                "6_member_aromatic_c_and_n_ring",
                "6_member_c_and_o_ring",
                "5_c_ring",
                "5_member_aromatic_c_and_n_ring",
                "5_member_c_and_o_ring",
                "Cyclohexane",
                "Cyclohexylamine",
                "Aniline",
                "benzylamine",
                "piperidine",
                "pyridine",
                "pyrrole",
                "primary_amino_alcohol_two_carbon_separation",
                "secondary_amino_alcohol_two_carbon_separation",
                "tertiary_amino_alcohol_two_carbon_separation",
                "primary_amino_alcohol_three_carbon_separation",
                "secondary_amino_alcohol_three_carbon_separation",
                "tertiary_amino_alcohol_three_carbon_separation",
                "aliphatic_primary_amino_alcohol_two_carbon_separation",
                "aliphatic_secondary_amino_alcohol_two_carbon_separation",
                "aliphatic_tertiary_amino_alcohol_two_carbon_separation",
                "aliphatic_primary_amino_alcohol_three_carbon_separation",
                "aliphatic_secondary_amino_alcohol_three_carbon_separation",
                "aliphatic_tertiary_amino_alcohol_three_carbon_separation",
                "primary_amine_one_carbon_aromatic_group",
                "primary_amine_two_carbon_aromatic_group",
                "primary_amine_three_carbon_aromatic_group",
                "secondary_amine_one_carbon_aromatic_group",
                "secondary_amine_two_carbon_aromatic_group",
                "secondary_amine_three_carbon_aromatic_group",
                "tertiary_amine_one_carbon_aromatic_group",
                "tertiary_amine_two_carbon_aromatic_group",
                "tertiary_amine_three_carbon_aromatic_group",
                "methyl_branch_one_carbon_from_a_N_atom",
                "methyl_branch_two_carbon_from_a_N_atom",
                "methyl_branch_three_carbon_from_a_N_atom",
                "methyl_branch_four_carbon_from_a_N_atom",
                "methyl_branch_five_carbon_from_a_N_atom",
                "methyl_branch_six_carbon_from_a_N_atom",
                "ethyl_chain",
                "propyl_chain",
                "butyl_chain",
                "pentyl_chain",
                "hexyl_chain",
                "poly_primary_and_or_secondary_amine",
                "poly_primary_and_or_secondary_and_or_tertiary_amine",
                "poly_alcohol",
                "pyrazine_aliphatic_C_2_and_5_substitution",
                "pyridine_aliphatic_C_2_and_5_substitution",
                "pyridine_aliphatic_C_2_substitution",
            ],
        }

        # The SMARTS below are left exactly as they are because changing any of them changes the fingerprint bits
        self.version_substructures = {
            1: [
                "[NH3]",  # ammonia
                "[NX3;H2][C;!$(C=[#7,#8])]",  # 1' amine
                "[NX3;H1][C;!$(C=[#7,#8])][C;!$(C=[#7,#8])]",  # 2' amine
                # 3' amine
                "[NX3]([C;!$(C=[#7,#8])])([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])]",
                "[NX4+]",  # ammonium
                "[N]=[C]",  # imine,
                # N bonded to C "[$([#6]~[#7]);!$([#6]-[#7])]", # nitrogen bonded to carbon with any bond other than a single bond
                "[#6]~[#7]",
                "[a]:[nX3,X2]:[a]",  # SP2 aromatic N
                "[CX3;$([#6]),$([O;H1])](=[OX1])[$([O])]",  # carboxylic acid
                "[#6][#6;!$(C(=O)[OH])][OH]",  # 1' alcohol
                "[#6][#6]([#6])[OH]",  # 2' alcohol'
                "[#6][#6]([#6])([#6])[OH]",  # 3' alcohol
                "[#6]C([CH3])([CH3])([CH3])",  # t-butyl
                "[CX3]=[O;!$(O*)]",  # Carbonyl
                "[#6]~[F,Cl,Br,I]",  # halo carbon
                "c1ccccc1",  # benzene
                # aromatic n or c 6 member hetrocycle
                "[c,n]1[c,n][c,n][c,n][c,n][c,n]1",
                "[#6,#8]1~[#6,#8]~[#6,#8]~[#6,#8]~[#6,#8]~1",  # Any O and C 6 ring
                "[#6]1~[#6]~[#6]~[#6]~[#6]~1",  # any C 5 ring
                # aromatic n or c 5 member hetrocycle
                "[c,n]1[c,n][c,n][c,n][c,n]1",
                # any O or C 5 member ring system
                "[#6,#8]1~[#6,#8]~[#6,#8]~[#6,#8]~[#6,#8]~1",
                "C1CCCCC1",  # cyclohexane
                # amine bound to ring
                "[NX3;H2,H1][#6]1~[#6]~[#6]~[#6]~[#6]~[#6]~1",
                "[NH2]c1ccccc1",  # 1' amine bound to benzene
                "c1ccccc1[CH2][NH2]",  # benzyl NH2
                "C1N([#1])CCCC1",  # H connected to N in an unsaturated ring
                "c1ncccc1",  # Pyridine
                "c1n([H])ccc1",  # pyrrole
                # see description
                "[$([#6]([OH])[#6][#7H2]);!$([#6]([OH])(=O)[#6][#7H2])]",
                # see description
                "[$([#6]([OH])[#6][#7H]([#6]));!$([#6]([OH])(=O)[#6][#7H]([#6]))]",
                # see description
                "[$([#6]([OH])[#6][#7]([#6])([#6]));!$([#6]([OH])(=O)[#6][#7]([#6])([#6]))]",
                # see description
                "[$([#6]([OH])[#6][#6][#7H2]);!$([#6]([OH])(=O)[#6][#6][#7H2])]",
                # see description
                "[$([#6]([OH])[#6][#6][#7H]([#6]));!$([#6]([OH])(=O)[#6][#6][#7H]([#6]))]",
                "[$([#6]([OH])[#6][#6][#7]([#6])([#6]));!$([#6]([OH])(=O)[#6][#6][#7]([#6])([#6]))]",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])[#7H2]",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])[#7H]([CX4])",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])[#7]([CX4])([CX4])",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])C([#6,#1])([#6,#1])[NH2]",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])C([#6,#1])([#6,#1])[NH]([CX4])",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])C([#6,#1])([#6,#1])[N]([CX4])([CX4])",
                "[a][C][#7H2]",  # see description
                "[a][C][C][#7H2]",  # see description
                "[a][C][C][C][#7H2]",  # see description
                "[a][C][#7H]([#6])",  # see description
                "[a][C][C][#7H]([#6])",  # see description
                "[a][C][C][C][#7H]([#6])",  # see description
                "[a][C][#7]([#6])([#6])",  # see description
                "[a][C][C][#7]([#6])([#6])",  # see description
                "[a][C][C][C][#7]([#6])([#6])",  # see description
                "[NH2][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4][CX4][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4][CX4][CX4][CX4]([CH3])",  # see description
                # see description
                "[NH2][CX4][CX4][CX4][CX4][CX4][CX4]([CH3])",
                "[CX4;H2][CX4;H2]",  # see description
                "[CX4;H2][CX4;H2][CX4;H2]",  # see description
                "[CX4;H2][CX4;H2][CX4;H2][CX4;H2]",  # see description
                "[CX4;H2][CX4;H2][CX4;H2][CX4;H2][CX4;H2]",  # see description
                # see description
                "[CX4;H2][CX4;H2][CX4;H2][CX4;H2][CX4;H2][CX4;H2]",
                "[$([#7X3;H2][C;!$(C=[#7,#8])]),$([#7X3;H1]([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])])].[$([#7X3;H2][C;!$(C=[#7,#8])]),"
                # poly 1' 2' amine
                "$([#7X3;H1]([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])])]",
                "[$([#7X3;H2][C;!$(C=[#7,#8])]),$([#7X3;H1]([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])]),$([#7X3]([C;!$(C=[#7,#8])])"
                "([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])])].[$([#7X3;H2][C;!$(C=[#7,#8])]),$([#7X3;H1]([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])]),"
                # poly 1' 2' or 3' amine
                "$([#7X3]([C;!$(C=[#7,#8])])([C;!$(C=[#7,#8])])[C;!$(C=[#7,#8])])]",
                "[#6][O;H1].[#6][O;H1]",  # poly alcohol
                # pyrazine aliphatic C2 and C5 substitution
                "n1c([CX4])cnc([CX4])c1",
                # pyridine_aliphatic_C_2_and_5_substitution
                "n1c([CX4])ccc([CX4])c1",
                "n1cccc([CX4])c1",  # pyridine_aliphatic_C_2_substitution
                "[#5]",  # B
                "[#14]",  # Si
                "[#15]",  # P
                "[#16]",  # S
                "[+]",  # positive cahrged group
                "[-]",  # negative charge group
            ],
            2: [
                "[NH3]",  # ammonia
                "[NX3;H2][CX4;!$(C=[#7,#8])]",  # 1' amine
                "[NX3;H1][CX4;!$(C=[#7,#8])][CX4;!$(C=[#7,#8])]",  # 2' amine
                # 3' amine
                "[NX3]([CX4;!$(C=[#7,#8])])([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])]",
                "[NX4+]",  # ammonium
                "[a]:[nX3,X2]:[a]",  # SP2 aromatic N
                "[CX3;$([#6]),$([O;H1])](=[OX1])[$([O])]",  # carboxylic acid
                "[#6][#6;!$(C(=O)[OH])][OH]",  # 1' alcohol
                "[#6][#6]([#6])[OH]",  # 2' alcohol'
                "[#6][#6]([#6])([#6])[OH]",  # 3' alcohol
                "[#6]C([CH3])([CH3])([CH3])",  # t-butyl
                "[CX3]=[O;!$(O*)]",  # Carbonyl
                "[#6]~[F,Cl,Br,I]",  # halo carbon
                "c1ccccc1",  # benzene
                # aromatic n or c 6 member hetrocycle
                "[c,n]1[c,n][c,n][c,n][c,n][c,n]1",
                "[#6,#8]1~[#6,#8]~[#6,#8]~[#6,#8]~[#6,#8]~1",  # Any O and C 6 ring
                "[#6]1~[#6]~[#6]~[#6]~[#6]~1",  # any C 5 ring
                # aromatic n or c 5 member hetrocycle
                "[c,n]1[c,n][c,n][c,n][c,n]1",
                # any O or C 5 member ring system
                "[#6,#8]1~[#6,#8]~[#6,#8]~[#6,#8]~[#6,#8]~1",
                "C1CCCCC1",  # cyclohexane
                # amine bound to ring
                "[NX3;H2,H1][#6]1~[#6]~[#6]~[#6]~[#6]~[#6]~1",
                "[NH2]c1ccccc1",  # 1' amine bound to benzene
                "c1ccccc1[CH2][NH2]",  # benzyl NH2
                "C1N([#1])CCCC1",  # H connected to N in an unsaturated ring
                "c1ncccc1",  # Pyridine
                "c1n([H])ccc1",  # pyrrole
                # see description
                "[$([#6]([OH])[#6][#7H2]);!$([#6]([OH])(=O)[#6][#7H2])]",
                # see description
                "[$([#6]([OH])[#6][#7H]([#6]));!$([#6]([OH])(=O)[#6][#7H]([#6]))]",
                # see description
                "[$([#6]([OH])[#6][#7]([#6])([#6]));!$([#6]([OH])(=O)[#6][#7]([#6])([#6]))]",
                # see description
                "[$([#6]([OH])[#6][#6][#7H2]);!$([#6]([OH])(=O)[#6][#6][#7H2])]",
                # see description
                "[$([#6]([OH])[#6][#6][#7H]([#6]));!$([#6]([OH])(=O)[#6][#6][#7H]([#6]))]",
                "[$([#6]([OH])[#6][#6][#7]([#6])([#6]));!$([#6]([OH])(=O)[#6][#6][#7]([#6])([#6]))]",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])[#7H2]",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])[#7H]([CX4])",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])[#7]([CX4])([CX4])",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])C([#6,#1])([#6,#1])[NH2]",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])C([#6,#1])([#6,#1])[NH]([CX4])",
                # see description
                "C([#6,#1])([#6,#1])([OH])C([#6,#1])([#6,#1])C([#6,#1])([#6,#1])[N]([CX4])([CX4])",
                "[a][C][#7H2]",  # see description
                "[a][C][C][#7H2]",  # see description
                "[a][C][C][C][#7H2]",  # see description
                "[a][C][#7H]([#6])",  # see description
                "[a][C][C][#7H]([#6])",  # see description
                "[a][C][C][C][#7H]([#6])",  # see description
                "[a][C][#7]([#6])([#6])",  # see description
                "[a][C][C][#7]([#6])([#6])",  # see description
                "[a][C][C][C][#7]([#6])([#6])",  # see description
                "[NH2][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4][CX4][CX4]([CH3])",  # see description
                "[NH2][CX4][CX4][CX4][CX4][CX4]([CH3])",  # see description
                # see description
                "[NH2][CX4][CX4][CX4][CX4][CX4][CX4]([CH3])",
                "[CX4;H2][CX4;H2]",  # see description
                "[CX4;H2][CX4;H2][CX4;H2]",  # see description
                "[CX4;H2][CX4;H2][CX4;H2][CX4;H2]",  # see description
                "[CX4;H2][CX4;H2][CX4;H2][CX4;H2][CX4;H2]",  # see description
                # see description
                "[CX4;H2][CX4;H2][CX4;H2][CX4;H2][CX4;H2][CX4;H2]",
                "[$([#7X3;H2][CX4;!$(C=[#7,#8])]),$([#7X3;H1]([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])])].[$([#7X3;H2][CX4;!$(C=[#7,#8])]),"
                # poly 1' 2' amine
                "$([#7X3;H1]([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])])]",
                "[$([#7X3;H2][CX4;!$(C=[#7,#8])]),$([#7X3;H1]([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])]),$([#7X3]([CX4;!$(C=[#7,#8])])"
                "([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])])].[$([#7X3;H2][CX4;!$(C=[#7,#8])]),$([#7X3;H1]([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])]),"
                # poly 1' 2' or 3' amine
                "$([#7X3]([CX4;!$(C=[#7,#8])])([CX4;!$(C=[#7,#8])])[CX4;!$(C=[#7,#8])])]",
                "[#6][O;H1].[#6][O;H1]",  # poly alcohol
                # pyrazine aliphatic C2 and C5 substitution
                "n1c([CX4])cnc([CX4])c1",
                # pyridine_aliphatic_C_2_and_5_substitution
                "n1c([CX4])ccc([CX4])c1",
                "n1cccc([CX4])c1",  # pyridine_aliphatic_C_2_substitution
            ],
        }

        # Use the module logger when the caller does not supply one
        if log is None:
            log = logging.getLogger(__name__)
        log.info("\n")

        self.fingerprint_version = fingerprint_version

        if names is None:
            self.names = self.get_default_names(
                version=self.fingerprint_version)
        else:
            self.names = names

        if substructures is None:
            self.substructures = self.get_default_substructures(
                version=self.fingerprint_version,
            )
        else:
            self.substructures = substructures

        if substructures is None and names is None:
            self.fingerprint_explanation = self.get_default_explanation(
                version=self.fingerprint_version,
            )
            if verbose is True:
                log.info(
                    "Finger print is version {}\n{}".format(
                        self.fingerprint_version, self.fingerprint_explanation,
                    ),
                )
        else:
            # Always define the attribute so reading it never raises AttributeError for custom inputs
            self.fingerprint_explanation = None
            log.warning(
                "No fingerprint explanation avaliable as custom substructures have been given, hence you know better than I do what they mean.",
            )

        if len(self.names) != len(self.substructures):
            log.warning(
                "WARNING - the number of names ({}) and the number of substructures ({}) is different, "
                "This will cause issues for defualt functions in this module. Names will be reset to indexes.".format(
                    len(self.names), len(self.substructures),
                ),
            )
            # One name per substructure: its position as a string ("0", "1", ...)
            self.names = [
                str(index) for index in range(len(self.substructures))
            ]
            log.warning(
                "New names and substructures:\n{}".format(
                    "\n".join(
                        [
                            "{} : {}".format(n, s)
                            for n, s in zip(self.names, self.substructures)
                        ],
                    ),
                ),
            )

        log.info(
            "Please use the citation below for use of this code:\n{}".format(
                citation),
        )

    def get_default_names(self, version: int = None) -> list:
        """
        Function to get the names descriptive names of the substructures we are looking for
        Essentially these substructures look for the amine environment and groups which can interact with it. The first
        elements identify specific groups. The later elements look for the motifs of closeness of certain functional
        groups to the amine groups.
        :param version: int - which version of the fingerprints to get name for
        """

        if version is None:
            version = self.fingerprint_version

        return self.version_names[version]

    def get_default_substructures(self, version: int = None) -> list:
        """
        Function get the smarts to search for substructures. Essentially these substructures look for the amine
        environment and groups which can interact with it. The first elements identify specific groups. The
        later elements look for the motifs of closeness of certain functional groups to the amine groups.
        :param version: int - which version of the fingerprints to get substructures for
        """

        if version is None:
            version = self.fingerprint_version

        return self.version_substructures[version]

    def get_default_explanation(self, version: int = None) -> str:
        """
        Function to get the description of the version you have picked  names of the substrictires we are looking for
        Essentially these substructures look for the amine environment and groups which can interact with it. The first
        elements identify specific groups. The later elements look for the motifs of closeness of certain
        functional groups to the amine groups.
        :param version: int - which version of the fingerprints to get explanations for
        """

        if version is None:
            version = self.fingerprint_version

        return self.version_explanations[version]

    def get_fp_information(self, return_df: bool = False):
        """
        Print the infomration related to the current fingerprint instantiation
        """

        log = logging.getLogger(__name__)

        log.info(
            "{:4} | {:59} | {:50}\n---------------------------------------------------------"
            "-------------------------------------".format(
                "bit", "description", "smarts",
            ),
        )

        names = []
        for nam in self.names:
            nam = " ".join(nam.split("_"))
            nam = " ".join(nam.split("-"))
            names.append(nam)

        for ith, ds in enumerate(zip(names, self.substructures)):
            log.info("{:4} | {:59} | {:50}".format(ith, ds[0], ds[1]))

        if return_df is True:
            df_information = pd.DataFrame(
                np.array([names, self.substructures]).T,
                columns=["description", "smarts"],
            )
            return df_information

Methods

def get_default_explanation(self, version: int = None) ‑> str

Function to get the description of the fingerprint version you have picked, i.e. an explanation of the substructures we are looking for. Essentially these substructures look for the amine environment and groups which can interact with it. The first elements identify specific groups. The later elements look for the motifs of closeness of certain functional groups to the amine groups. :param version: int - which version of the fingerprints to get explanations for

Expand source code
def get_default_explanation(self, version: int = None) -> str:
    """
    Look up the textual description of a fingerprint version.
    The substructures of a version describe the amine environment and the groups which can interact
    with it: the first elements identify specific groups, the later elements capture how close certain
    functional groups are to the amine groups.
    :param version: int - fingerprint version to fetch the explanation for, None means use
                          self.fingerprint_version
    :return: str - the entry stored in self.version_explanations under the chosen version
    """

    # Only None triggers the fallback, any other value (including 0) is used as the key unchanged
    chosen_version = self.fingerprint_version if version is None else version
    return self.version_explanations[chosen_version]
def get_default_names(self, version: int = None) ‑> list

Function to get the descriptive names of the substructures we are looking for. Essentially these substructures look for the amine environment and groups which can interact with it. The first elements identify specific groups. The later elements look for the motifs of closeness of certain functional groups to the amine groups. :param version: int - which version of the fingerprints to get names for

Expand source code
def get_default_names(self, version: int = None) -> list:
    """
    Look up the descriptive names of the substructures belonging to a fingerprint version.
    The substructures describe the amine environment and the groups which can interact with it: the
    first elements identify specific groups, the later elements capture how close certain functional
    groups are to the amine groups.
    :param version: int - fingerprint version to fetch names for, None means use self.fingerprint_version
    :return: list - the entry stored in self.version_names under the chosen version
    """

    # Only None triggers the fallback, any other value (including 0) is used as the key unchanged
    chosen_version = self.fingerprint_version if version is None else version
    return self.version_names[chosen_version]
def get_default_substructures(self, version: int = None) ‑> list

Function to get the SMARTS strings used to search for substructures. Essentially these substructures look for the amine environment and groups which can interact with it. The first elements identify specific groups. The later elements look for the motifs of closeness of certain functional groups to the amine groups. :param version: int - which version of the fingerprints to get substructures for

Expand source code
def get_default_substructures(self, version: int = None) -> list:
    """
    Look up the SMARTS strings used to search for the substructures of a fingerprint version.
    The substructures describe the amine environment and the groups which can interact with it: the
    first elements identify specific groups, the later elements capture how close certain functional
    groups are to the amine groups.
    :param version: int - fingerprint version to fetch substructures for, None means use
                          self.fingerprint_version
    :return: list - the entry stored in self.version_substructures under the chosen version
    """

    # Only None triggers the fallback, any other value (including 0) is used as the key unchanged
    chosen_version = self.fingerprint_version if version is None else version
    return self.version_substructures[chosen_version]
def get_fp_information(self, return_df: bool = False)

Print the information related to the current fingerprint instantiation

Expand source code
def get_fp_information(self, return_df: bool = False):
    """
    Log a table of the current fingerprint: bit index, description and smarts of every entry.
    Descriptions are self.names with underscores and hyphens replaced by spaces. Names and
    substructures are paired with zip, so rows stop at the shorter of the two.
    :param return_df: bool - when exactly True the table is also returned as a data frame
    :return: pandas.DataFrame with columns "description" and "smarts" if return_df is True, else None
    """

    logger = logging.getLogger(__name__)

    header_template = (
        "{:4} | {:59} | {:50}\n---------------------------------------------------------"
        "-------------------------------------"
    )
    row_template = "{:4} | {:59} | {:50}"

    logger.info(header_template.format("bit", "description", "smarts"))

    # Human readable descriptions: underscores first, then hyphens, become single spaces
    names = [
        raw_name.replace("_", " ").replace("-", " ")
        for raw_name in self.names
    ]

    for bit, (description, smarts) in enumerate(zip(names, self.substructures)):
        logger.info(row_template.format(bit, description, smarts))

    # Identity check kept on purpose: only the literal True produces a data frame
    if return_df is not True:
        return None

    table = np.array([names, self.substructures]).T
    return pd.DataFrame(table, columns=["description", "smarts"])