Source code for synkit.Chem.Reaction.tautomerize

from typing import List, Dict, Optional
from rdkit import Chem
from joblib import Parallel, delayed

from synkit.Graph.FG import smiles_to_graph_and_functional_groups



[docs]
class Tautomerize:
    """Standardize molecules by converting enol and hemiketal tautomers into
    their more stable carbonyl forms, and apply these corrections to individual
    SMILES or collections of reaction data.

    The class is a stateless namespace: every method is a ``staticmethod``.
    The two ``standardize_*`` helpers report failures by *returning* a
    human-readable message instead of a SMILES string; :meth:`fix_smiles`
    filters those messages out so they never reach the caller as a result.
    """

    @staticmethod
    def standardize_enol(smiles: str, atom_indices: Optional[List[int]] = None) -> str:
        """Convert an enol tautomer into its corresponding carbonyl form.

        The C=C bond is rewritten as a single bond and the C-O bond as a
        double bond. The carbon that is actually bonded to the oxygen is
        detected from the molecule, so the two carbons may be listed in
        either order.

        :param smiles: SMILES string of the enol-containing molecule.
        :type smiles: str
        :param atom_indices: List of three atom indices [C1, C2, O]
            defining the enol. If None, defaults to [0, 1, 2].
        :type atom_indices: List[int] or None
        :returns: SMILES of the molecule after enol→carbonyl conversion,
            or an error message if the input is invalid or indices fail.
        :rtype: str
        """
        if atom_indices is None:
            atom_indices = [0, 1, 2]

        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "Invalid SMILES format."

        try:
            carbons = [
                i for i in atom_indices if mol.GetAtomWithIdx(i).GetSymbol() == "C"
            ]
            # Unpacking fails (and is reported below) with fewer than two carbons.
            c1_idx, c2_idx = carbons[:2]
            o_idx = next(
                i for i in atom_indices if mol.GetAtomWithIdx(i).GetSymbol() == "O"
            )
        except Exception as e:
            return f"Error processing indices: {e}"

        try:
            # The bond edits below treat c2 as the carbon carrying the oxygen.
            # If the caller listed the carbons the other way round, swap them
            # rather than creating a bond between atoms that were not bonded.
            if (
                mol.GetBondBetweenAtoms(c2_idx, o_idx) is None
                and mol.GetBondBetweenAtoms(c1_idx, o_idx) is not None
            ):
                c1_idx, c2_idx = c2_idx, c1_idx

            emol = Chem.EditableMol(mol)
            emol.RemoveBond(c1_idx, c2_idx)
            emol.RemoveBond(c2_idx, o_idx)
            emol.AddBond(c1_idx, c2_idx, Chem.rdchem.BondType.SINGLE)
            emol.AddBond(c2_idx, o_idx, Chem.rdchem.BondType.DOUBLE)
            new_mol = emol.GetMol()
            Chem.SanitizeMol(new_mol)
            return Chem.MolToSmiles(new_mol)
        except Exception as e:
            return f"Error in modifying molecule: {e}"

    @staticmethod
    def standardize_hemiketal(smiles: str, atom_indices: List[int]) -> str:
        """Convert a hemiketal tautomer into its corresponding carbonyl form.

        One oxygen becomes the carbonyl oxygen (C=O); the bond from the
        carbon to the second oxygen, if one is listed, is removed so that
        oxygen leaves with its own substituents as a separate fragment.
        The oxygen that carries a hydrogen is preferred as the carbonyl
        oxygen; when both (or neither) carry one, the first listed is used.

        :param smiles: SMILES string of the hemiketal-containing
            molecule.
        :type smiles: str
        :param atom_indices: List of atom indices [C, O1, O2] defining
            the hemiketal.
        :type atom_indices: List[int]
        :returns: SMILES of the molecule after hemiketal→carbonyl
            conversion, or an error message if the input is invalid.
        :rtype: str
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "Invalid SMILES format."

        try:
            c_idx = next(
                i for i in atom_indices if mol.GetAtomWithIdx(i).GetSymbol() == "C"
            )
            o_idxs = [
                i for i in atom_indices if mol.GetAtomWithIdx(i).GetSymbol() == "O"
            ]
            # Stable sort: hydroxyl oxygens (at least one H) come first, the
            # caller's order is otherwise preserved. An ether oxygen cannot
            # accept the double bond without exceeding its valence.
            o_idxs.sort(key=lambda i: mol.GetAtomWithIdx(i).GetTotalNumHs() == 0)
            o1_idx = o_idxs[0]
        except Exception as e:
            return f"Error processing indices: {e}"

        try:
            emol = Chem.EditableMol(mol)
            emol.RemoveBond(c_idx, o1_idx)
            if len(o_idxs) > 1:
                # Detach the leaving oxygen from the central carbon.
                emol.RemoveBond(c_idx, o_idxs[1])
            emol.AddBond(c_idx, o1_idx, Chem.rdchem.BondType.DOUBLE)
            new_mol = emol.GetMol()
            Chem.SanitizeMol(new_mol)
            return Chem.MolToSmiles(new_mol)
        except Exception as e:
            return f"Error in modifying molecule: {e}"

    @staticmethod
    def fix_smiles(smiles: str) -> str:
        """Iteratively apply enol and hemiketal standardizations until no
        further changes, then return the canonical SMILES.

        On each pass the detected targets are tried in order and the first
        conversion that yields a new, parsable SMILES is kept. The loop
        stops when there are no targets left or when no target can be
        converted, so a failing or non-progressing conversion ends the
        process with the last valid structure instead of looping forever
        or canonicalizing an error message.

        Input that RDKit cannot parse yields no targets and is handed to
        ``Chem.CanonSmiles`` as-is.

        :param smiles: SMILES string to standardize.
        :type smiles: str
        :returns: Canonical SMILES of the standardized molecule.
        :rtype: str
        """
        converters = {
            "hemiketal": Tautomerize.standardize_hemiketal,
            "enol": Tautomerize.standardize_enol,
        }
        # Every SMILES already visited; revisiting one would mean a cycle.
        visited = {smiles}
        while True:
            for label, indices in Tautomerize._tautomer_targets(smiles):
                candidate = converters[label](smiles, indices)
                # The helpers signal failure with a message string, which is
                # not parsable as SMILES; skip those and any repeated state.
                if (
                    candidate not in visited
                    and Chem.MolFromSmiles(candidate) is not None
                ):
                    break
            else:
                # No targets, or none of them could be converted.
                return Chem.CanonSmiles(smiles)
            visited.add(candidate)
            smiles = candidate

    @staticmethod
    def _tautomer_targets(smiles: str) -> list[tuple[str, List[int]]]:
        """Return RDKit-index targets used by the tautomer repair helpers.

        :param smiles: SMILES string to inspect.
        :type smiles: str
        :returns: ``(label, atom_indices)`` pairs where ``label`` is
            ``"hemiketal"`` or ``"enol"``; geminal diols found by
            :meth:`_geminal_diol_nodes` are reported as ``"hemiketal"``.
            Empty if the SMILES cannot be parsed.
        :rtype: list[tuple[str, List[int]]]
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return []
        graph, groups = smiles_to_graph_and_functional_groups(smiles)
        # Translate graph node ids to RDKit atom indices. A node id is taken
        # to be the atom-map number when the atom has one, else the 1-based
        # atom index.
        # NOTE(review): assumes smiles_to_graph_and_functional_groups numbers
        # its nodes this way — confirm against that helper.
        node_to_idx = {
            (
                atom.GetAtomMapNum() if atom.GetAtomMapNum() else atom.GetIdx() + 1
            ): atom.GetIdx()
            for atom in mol.GetAtoms()
        }

        targets = [
            (label, [node_to_idx[node] for node in nodes])
            for label, nodes in groups
            if label in {"hemiketal", "enol"}
        ]
        targets.extend(
            ("hemiketal", [node_to_idx[node] for node in nodes])
            for nodes in Tautomerize._geminal_diol_nodes(graph)
        )
        return targets

    @staticmethod
    def _geminal_diol_nodes(graph) -> list[tuple[int, ...]]:
        """Legacy tautomerization compatibility for hydrated carbonyls.

        Scan ``graph`` for carbon nodes that have at least two oxygen
        neighbours which each carry a hydrogen (``hcount >= 1``) and are
        attached through an edge of ``order`` 1.0.

        :param graph: Graph exposing ``nodes(data=True)``, ``nodes[n]``,
            ``neighbors(n)`` and ``edges[u, v]``, with ``element`` /
            ``hcount`` node attributes and an ``order`` edge attribute.
        :returns: One ``(carbon, oxygen, oxygen)`` node tuple per matching
            carbon, using the first two qualifying oxygens in neighbour
            order.
        :rtype: list[tuple[int, ...]]
        """
        node_attrs = graph.nodes
        found: list[tuple[int, ...]] = []
        for carbon, attrs in graph.nodes(data=True):
            if attrs.get("element") != "C":
                continue
            hydroxyls = [
                nbr
                for nbr in graph.neighbors(carbon)
                if node_attrs[nbr].get("element") == "O"
                and node_attrs[nbr].get("hcount", 0) >= 1
                and graph.edges[carbon, nbr].get("order") == 1.0
            ]
            if len(hydroxyls) >= 2:
                found.append((carbon, hydroxyls[0], hydroxyls[1]))
        return found

    @staticmethod
    def fix_dict(data: Dict[str, str], reaction_column: str) -> Dict[str, str]:
        """Standardize the reactant and product SMILES in a reaction
        dictionary.

        A value of the form ``reactants>>products`` has each side
        standardized separately; any other value (no ``>>`` or more than
        one) is passed to :meth:`fix_smiles` as a whole. The entry is
        overwritten in place.

        :param data: Dictionary containing a reaction SMILES under `reaction_column`.
        :type data: Dict[str, str]
        :param reaction_column: Key in `data` where the reaction SMILES is stored.
        :type reaction_column: str
        :returns: The same dictionary with standardized reaction SMILES.
        :rtype: Dict[str, str]
        :raises KeyError: If `reaction_column` is not a key of `data`.
        """
        rsmi = data[reaction_column]
        # Decide on the shape of the string up front, so that an error raised
        # while standardizing a side is never mistaken for "not a reaction".
        sides = rsmi.split(">>")
        if len(sides) == 2:
            react, prod = sides
            data[reaction_column] = (
                f"{Tautomerize.fix_smiles(react)}>>{Tautomerize.fix_smiles(prod)}"
            )
        else:
            data[reaction_column] = Tautomerize.fix_smiles(rsmi)
        return data

    @staticmethod
    def fix_dicts(
        data: List[Dict[str, str]],
        reaction_column: str,
        n_jobs: int = 4,
        verbose: int = 0,
    ) -> List[Dict[str, str]]:
        """Standardize multiple reaction dictionaries in parallel.

        Each dictionary is handed to :meth:`fix_dict` through joblib.
        NOTE(review): whether the caller's original dictionaries are also
        updated presumably depends on the joblib backend in use — rely on
        the returned list.

        :param data: List of dictionaries containing reaction SMILES under `reaction_column`.
        :type data: List[Dict[str, str]]
        :param reaction_column: Key in each dictionary for the reaction SMILES.
        :type reaction_column: str
        :param n_jobs: Number of parallel jobs to run. Defaults to 4.
        :type n_jobs: int
        :param verbose: Verbosity level for the joblib Parallel call. Defaults to 0.
        :type verbose: int
        :returns: List of dictionaries with standardized SMILES.
        :rtype: List[Dict[str, str]]
        """
        results = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(Tautomerize.fix_dict)(d, reaction_column) for d in data
        )
        return results