import warnings
from typing import List, Dict, Optional

from joblib import Parallel, delayed
from rdkit import Chem

from synkit.Graph.FG import smiles_to_graph_and_functional_groups
class Tautomerize:
    """Standardize molecules by converting enol and hemiketal tautomers into
    their more stable carbonyl forms, and apply these corrections to individual
    SMILES or collections of reaction data.

    Every method is a ``staticmethod``; the class is used purely as a
    namespace and holds no state.
    """

    @staticmethod
    def standardize_enol(smiles: str, atom_indices: Optional[List[int]] = None) -> str:
        """Convert an enol tautomer into its corresponding carbonyl form.

        The C-C bond between the two selected carbons is rewritten as a single
        bond and the C-O bond as a double bond. No check is made that the
        selected atoms actually form an enol; that is the caller's
        responsibility.

        :param smiles: SMILES string of the enol-containing molecule.
        :type smiles: str
        :param atom_indices: List of three atom indices [C1, C2, O]
            defining the enol, where C2 is the carbon bearing the oxygen.
            If the two carbons are listed the other way round, they are
            swapped based on the bonds present in the molecule.
            If None, defaults to [0, 1, 2].
        :type atom_indices: List[int] or None
        :returns: SMILES of the molecule after enol→carbonyl conversion,
            or an error message if the input is invalid or indices fail.
        :rtype: str
        """
        if atom_indices is None:
            atom_indices = [0, 1, 2]
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "Invalid SMILES format."
        try:
            c_idxs = [
                i for i in atom_indices if mol.GetAtomWithIdx(i).GetSymbol() == "C"
            ]
            c1_idx, c2_idx = c_idxs[:2]
            o_idx = next(
                i for i in atom_indices if mol.GetAtomWithIdx(i).GetSymbol() == "O"
            )
            # The carbonyl carbon must be the one bonded to the oxygen. Swap
            # only when the listed order is demonstrably reversed, so inputs
            # that worked before are processed exactly as before.
            if (
                mol.GetBondBetweenAtoms(c2_idx, o_idx) is None
                and mol.GetBondBetweenAtoms(c1_idx, o_idx) is not None
            ):
                c1_idx, c2_idx = c2_idx, c1_idx
        except Exception as e:
            return f"Error processing indices: {e}"
        emol = Chem.EditableMol(mol)
        try:
            # Replace C=C / C-O by C-C / C=O; sanitization then recomputes
            # the implicit hydrogens for the new bond orders.
            emol.RemoveBond(c1_idx, c2_idx)
            emol.RemoveBond(c2_idx, o_idx)
            emol.AddBond(c1_idx, c2_idx, Chem.rdchem.BondType.SINGLE)
            emol.AddBond(c2_idx, o_idx, Chem.rdchem.BondType.DOUBLE)
            new_mol = emol.GetMol()
            Chem.SanitizeMol(new_mol)
            return Chem.MolToSmiles(new_mol)
        except Exception as e:
            return f"Error in modifying molecule: {e}"

    @staticmethod
    def standardize_hemiketal(smiles: str, atom_indices: List[int]) -> str:
        """Convert a hemiketal tautomer into its corresponding carbonyl form.

        One C-O bond becomes a C=O double bond; the bond from the carbon to
        the second listed oxygen (if any) is removed, which detaches that
        oxygen (and whatever is attached to it) from the carbon.

        :param smiles: SMILES string of the hemiketal-containing
            molecule.
        :type smiles: str
        :param atom_indices: List of atom indices [C, O1, O2] defining
            the hemiketal. O1 becomes the carbonyl oxygen, unless O1 carries
            no hydrogen while O2 does, in which case the two are swapped.
        :type atom_indices: List[int]
        :returns: SMILES of the molecule after hemiketal→carbonyl
            conversion, or an error message if the input is invalid.
        :rtype: str
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "Invalid SMILES format."
        try:
            c_idx = next(
                i for i in atom_indices if mol.GetAtomWithIdx(i).GetSymbol() == "C"
            )
            o_idxs = [
                i for i in atom_indices if mol.GetAtomWithIdx(i).GetSymbol() == "O"
            ]
            o1_idx = o_idxs[0]
            leaving = o_idxs[1:2]
            # Prefer the hydroxyl oxygen as the carbonyl oxygen: an oxygen
            # without hydrogens (e.g. an ether oxygen) cannot take the double
            # bond. The swap only triggers when the first oxygen has no H and
            # the second one does, so geminal diols keep the listed order.
            if (
                leaving
                and mol.GetAtomWithIdx(o1_idx).GetTotalNumHs() == 0
                and mol.GetAtomWithIdx(leaving[0]).GetTotalNumHs() > 0
            ):
                o1_idx, leaving = leaving[0], [o1_idx]
        except Exception as e:
            return f"Error processing indices: {e}"
        emol = Chem.EditableMol(mol)
        try:
            emol.RemoveBond(c_idx, o1_idx)
            for leaving_idx in leaving:
                emol.RemoveBond(c_idx, leaving_idx)
            emol.AddBond(c_idx, o1_idx, Chem.rdchem.BondType.DOUBLE)
            new_mol = emol.GetMol()
            Chem.SanitizeMol(new_mol)
            return Chem.MolToSmiles(new_mol)
        except Exception as e:
            return f"Error in modifying molecule: {e}"

    @staticmethod
    def fix_smiles(smiles: str) -> str:
        """Iteratively apply enol and hemiketal standardizations until no
        further changes, then return the canonical SMILES.

        Only the first detected target is converted per iteration; targets
        are then re-detected on the updated SMILES. If a conversion fails
        (the helper returns an error message instead of a SMILES) or yields a
        SMILES that was already seen, a ``RuntimeWarning`` is issued and the
        last valid SMILES is canonicalized, so the loop always terminates and
        an error message is never treated as a molecule.

        If ``smiles`` itself cannot be parsed, no targets are found and the
        input is handed unchanged to ``Chem.CanonSmiles``.

        :param smiles: SMILES string to standardize.
        :type smiles: str
        :returns: Canonical SMILES of the standardized molecule.
        :rtype: str
        """
        seen = {smiles}
        while True:
            targets = Tautomerize._tautomer_targets(smiles)
            if not targets:
                break
            label, indices = targets[0]
            if label == "hemiketal":
                candidate = Tautomerize.standardize_hemiketal(smiles, indices)
            elif label == "enol":
                candidate = Tautomerize.standardize_enol(smiles, indices)
            else:
                # Unknown label: nothing can change, so stop instead of spinning.
                break
            if Chem.MolFromSmiles(candidate) is None:
                # The helpers report failure as a plain message string.
                warnings.warn(
                    f"{label} standardization failed for {smiles!r}: {candidate}",
                    RuntimeWarning,
                    stacklevel=2,
                )
                break
            if candidate in seen:
                warnings.warn(
                    f"{label} standardization of {smiles!r} made no progress; "
                    "stopping.",
                    RuntimeWarning,
                    stacklevel=2,
                )
                break
            seen.add(candidate)
            smiles = candidate
        return Chem.CanonSmiles(smiles)

    @staticmethod
    def _tautomer_targets(smiles: str) -> list[tuple[str, List[int]]]:
        """Return RDKit-index targets used by the tautomer repair helpers.

        Targets are the "hemiketal" and "enol" groups reported by
        ``smiles_to_graph_and_functional_groups``, followed by geminal diols
        found by :meth:`_geminal_diol_nodes`, which are labelled "hemiketal".

        :param smiles: SMILES string to inspect.
        :type smiles: str
        :returns: ``(label, atom_indices)`` pairs with RDKit atom indices;
            an empty list if the SMILES cannot be parsed.
        :rtype: list[tuple[str, List[int]]]
        """
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return []
        graph, groups = smiles_to_graph_and_functional_groups(smiles)
        # Graph node id -> RDKit atom index. Assumes nodes are keyed by the
        # atom-map number when one is set and by (atom index + 1) otherwise
        # -- TODO confirm against the synkit graph builder.
        node_to_idx = {
            (
                atom.GetAtomMapNum() if atom.GetAtomMapNum() else atom.GetIdx() + 1
            ): atom.GetIdx()
            for atom in mol.GetAtoms()
        }
        targets = [
            (label, [node_to_idx[node] for node in nodes])
            for label, nodes in groups
            if label in {"hemiketal", "enol"}
        ]
        targets.extend(
            ("hemiketal", [node_to_idx[node] for node in nodes])
            for nodes in Tautomerize._geminal_diol_nodes(graph)
        )
        return targets

    @staticmethod
    def _geminal_diol_nodes(graph) -> list[tuple[int, ...]]:
        """Legacy tautomerization compatibility for hydrated carbonyls.

        Scan ``graph`` for carbon nodes with at least two neighbours that are
        oxygen, have ``hcount >= 1`` and are attached through an edge whose
        ``order`` is 1.0.

        :param graph: Graph exposing ``nodes(data=True)``, ``nodes[n]``,
            ``neighbors(n)`` and ``edges[u, v]`` (networkx-style access).
        :returns: One ``(carbon, hydroxyl_1, hydroxyl_2)`` node tuple per
            matching carbon; only the first two hydroxyls are reported.
        :rtype: list[tuple[int, ...]]
        """
        targets: list[tuple[int, ...]] = []
        for carbon, data in graph.nodes(data=True):
            if data.get("element") != "C":
                continue
            hydroxyls = [
                neighbor
                for neighbor in graph.neighbors(carbon)
                if graph.nodes[neighbor].get("element") == "O"
                and graph.nodes[neighbor].get("hcount", 0) >= 1
                and graph.edges[carbon, neighbor].get("order") == 1.0
            ]
            if len(hydroxyls) >= 2:
                targets.append((carbon, hydroxyls[0], hydroxyls[1]))
        return targets

    @staticmethod
    def fix_dict(data: Dict[str, str], reaction_column: str) -> Dict[str, str]:
        """Standardize the reactant and product SMILES in a reaction
        dictionary.

        A value of the form ``reactants>>products`` has both sides
        standardized separately. Any other value (no ``>>`` or more than one)
        is passed to :meth:`fix_smiles` as a whole. The dictionary is modified
        in place.

        :param data: Dictionary containing a reaction SMILES under `reaction_column`.
        :type data: Dict[str, str]
        :param reaction_column: Key in `data` where the reaction SMILES is stored.
        :type reaction_column: str
        :returns: The same dictionary with standardized reaction SMILES.
        :rtype: Dict[str, str]
        """
        reaction = data[reaction_column]
        sides = reaction.split(">>")
        # Decide on the shape of the split itself rather than catching
        # ValueError around the whole operation: an error raised while
        # standardizing one side must not be mistaken for "no '>>' present".
        if len(sides) == 2:
            react, prod = sides
            data[reaction_column] = (
                f"{Tautomerize.fix_smiles(react)}>>{Tautomerize.fix_smiles(prod)}"
            )
        else:
            data[reaction_column] = Tautomerize.fix_smiles(reaction)
        return data

    @staticmethod
    def fix_dicts(
        data: List[Dict[str, str]],
        reaction_column: str,
        n_jobs: int = 4,
        verbose: int = 0,
    ) -> List[Dict[str, str]]:
        """Standardize multiple reaction dictionaries in parallel.

        Each dictionary is processed by :meth:`fix_dict` through a joblib
        ``Parallel`` call. Use the returned list as the result.
        NOTE(review): whether the input dictionaries are also updated in
        place presumably depends on the joblib backend -- confirm.

        :param data: List of dictionaries containing reaction SMILES under `reaction_column`.
        :type data: List[Dict[str, str]]
        :param reaction_column: Key in each dictionary for the reaction SMILES.
        :type reaction_column: str
        :param n_jobs: Number of parallel jobs to run. Defaults to 4.
        :type n_jobs: int
        :param verbose: Verbosity level for the joblib Parallel call. Defaults to 0.
        :type verbose: int
        :returns: List of dictionaries with standardized SMILES.
        :rtype: List[Dict[str, str]]
        """
        results = Parallel(n_jobs=n_jobs, verbose=verbose)(
            delayed(Tautomerize.fix_dict)(d, reaction_column) for d in data
        )
        return results