# Source code for synkit.Chem.Reaction.standardize

from typing import List, Optional, Tuple
from rdkit import Chem



# [docs]
class Standardize:
    """Utilities to normalize and filter reaction and molecule SMILES.

    The class offers helpers to strip atom-map numbers, keep only the
    fragments RDKit can sanitize, rewrite a reaction SMILES with sorted
    fragments, and run all of these steps together via :meth:`fit`.

    :ivar None: Stateless helper class.
    """

    def __init__(self) -> None:
        """Initialize the Standardize helper.

        No instance attributes are set.
        """

    @staticmethod
    def remove_atom_mapping(reaction_smiles: str, symbol: str = ">>") -> str:
        """Remove atom-map numbers from a reaction SMILES string.

        Each side of the reaction is parsed as a single RDKit molecule, every
        atom-map number is reset to 0, and the side is written back as
        canonical SMILES.

        :param reaction_smiles: Reaction SMILES with atom maps, e.g.
            'C[CH3:1]>>C'.
        :type reaction_smiles: str
        :param symbol: Separator between reactants and products.
            Defaults to '>>'.
        :type symbol: str
        :returns: Reaction SMILES without atom-mapping annotations, joined
            with the same ``symbol``.
        :rtype: str
        :raises ValueError: If splitting on ``symbol`` does not yield exactly
            two parts, or if either part is not parseable SMILES.
        """
        parts = reaction_smiles.split(symbol)
        if len(parts) != 2:
            # Report the separator actually in use; with the default symbol
            # the message is unchanged.
            raise ValueError(
                "Invalid reaction SMILES format. "
                f"Expected 'reactants{symbol}products'."
            )

        def clean_smiles(smi: str) -> str:
            # Parse one side, zero every atom-map number, re-emit canonically.
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                raise ValueError(f"Invalid SMILES string: {smi}")
            for atom in mol.GetAtoms():
                atom.SetAtomMapNum(0)
            return Chem.MolToSmiles(mol, canonical=True)

        react, prod = (clean_smiles(part) for part in parts)
        return f"{react}{symbol}{prod}"

    @staticmethod
    def filter_valid_molecules(smiles_list: List[str]) -> List[Chem.Mol]:
        """Filter and sanitize a list of SMILES, returning only valid Mol
        objects.

        Fragments that cannot be parsed or sanitized are silently dropped.

        :param smiles_list: List of SMILES strings to validate.
        :type smiles_list: List[str]
        :returns: List of sanitized RDKit Mol objects, in input order.
        :rtype: List[rdkit.Chem.Mol]
        """
        # Share the parse/sanitize loop with the helper; only the molecules
        # are of interest here, not the "had invalid" flag.
        valid, _ = Standardize._parse_molecule_fragments(smiles_list)
        return valid

    @staticmethod
    def _parse_molecule_fragments(
        smiles_list: List[str],
    ) -> Tuple[List[Chem.Mol], bool]:
        """Parse and sanitize SMILES fragments.

        :param smiles_list: List of SMILES strings to validate.
        :type smiles_list: List[str]
        :returns: Tuple of the valid molecules (in input order) and a flag
            telling whether any fragment failed to parse or sanitize.
        :rtype: Tuple[List[rdkit.Chem.Mol], bool]
        """
        valid: List[Chem.Mol] = []
        had_invalid = False
        for smi in smiles_list:
            # Parse without sanitizing so sanitization failures can be
            # caught separately below.
            mol = Chem.MolFromSmiles(smi, sanitize=False)
            if mol is None:
                had_invalid = True
                continue
            try:
                Chem.SanitizeMol(mol)
            except Exception:
                # Deliberately broad: any sanitization failure just marks
                # the fragment as invalid instead of aborting the batch.
                had_invalid = True
            else:
                valid.append(mol)
        return valid, had_invalid

    @staticmethod
    def _join_sorted_smiles(mols: List[Chem.Mol], stereo: bool) -> str:
        """Write each molecule as SMILES, sort the strings and join with '.'."""
        return ".".join(
            sorted(Chem.MolToSmiles(m, isomericSmiles=stereo) for m in mols)
        )

    @staticmethod
    def standardize_rsmi(
        rsmi: str, stereo: bool = False, remove_invalid: bool = True
    ) -> Optional[str]:
        """
        Normalize a reaction SMILES: validate molecules, sort fragments, optionally keep stereo.

        Both sides are split on '.', each fragment is parsed and sanitized,
        and the surviving fragments are written back in sorted order.

        :param rsmi: Reaction SMILES in 'reactants>>products' format.
        :type rsmi: str
        :param stereo: If True, include stereochemistry in the output. Defaults to False.
        :type stereo: bool
        :param remove_invalid: If True, drop invalid fragments and standardize
            remaining molecules. If False, return None when any invalid fragment
            exists. Defaults to True.
        :type remove_invalid: bool
        :returns: Standardized reaction SMILES, or None if either side has no
            valid molecule left (or an invalid fragment exists while
            ``remove_invalid`` is False).
        :rtype: Optional[str]
        :raises ValueError: If the input does not split into exactly two
            parts on '>>'.
        """
        # Explicit length check (same style as remove_atom_mapping) so the
        # ValueError carries no unrelated tuple-unpacking error as context.
        parts = rsmi.split(">>")
        if len(parts) != 2:
            raise ValueError(
                "Invalid reaction SMILES format. Expected 'reactants>>products'."
            )
        react_str, prod_str = parts

        react_mols, react_invalid = Standardize._parse_molecule_fragments(
            react_str.split(".")
        )
        prod_mols, prod_invalid = Standardize._parse_molecule_fragments(
            prod_str.split(".")
        )

        if not remove_invalid and (react_invalid or prod_invalid):
            return None

        if not react_mols or not prod_mols:
            return None

        sorted_react = Standardize._join_sorted_smiles(react_mols, stereo)
        sorted_prod = Standardize._join_sorted_smiles(prod_mols, stereo)
        return f"{sorted_react}>>{sorted_prod}"

    def fit(
        self,
        rsmi: str,
        remove_aam: bool = True,
        ignore_stereo: bool = True,
        remove_invalid: bool = True,
    ) -> Optional[str]:
        """
        Full standardization pipeline: normalize SMILES, strip atom-mapping, fix hydrogen notation.

        :param rsmi: Reaction SMILES to process.
        :type rsmi: str
        :param remove_aam: If True, remove atom-mapping annotations. Defaults to True.
        :type remove_aam: bool
        :param ignore_stereo: If True, drop stereochemistry. Defaults to True.
        :type ignore_stereo: bool
        :param remove_invalid: If True, drop invalid fragments and standardize
            remaining molecules. If False, return None when any invalid fragment
            exists. Defaults to True.
        :type remove_invalid: bool
        :returns: The standardized reaction SMILES, or None when
            :meth:`standardize_rsmi` returns None.
        :rtype: Optional[str]
        :raises ValueError: If ``rsmi`` is not in 'reactants>>products'
            format (propagated from :meth:`standardize_rsmi`), or if
            :meth:`remove_atom_mapping` rejects the standardized string.
        """
        std = self.standardize_rsmi(
            rsmi, stereo=not ignore_stereo, remove_invalid=remove_invalid
        )
        if std is None:
            return None

        if remove_aam:
            std = self.remove_atom_mapping(std)

        # Format any double-hydrogen notation
        return std.replace("[HH]", "[H][H]")

    @staticmethod
    def categorize_reactions(
        reactions: List[str], target_reaction: str
    ) -> Tuple[List[str], List[str]]:
        """Partition reactions into those matching a target and those not.

        Only the target is standardized (without stereochemistry); every
        entry of ``reactions`` is compared to it as-is by exact string
        equality. If the target standardizes to None, no reaction matches.

        :param reactions: List of reaction SMILES to categorize.
        :type reactions: List[str]
        :param target_reaction: Benchmark reaction SMILES for comparison.
        :type target_reaction: str
        :returns: Tuple of (matches, non_matches):
                  - matches: reactions equal to standardized target
                  - non_matches: all others
        :rtype: Tuple[List[str], List[str]]
        :raises ValueError: If ``target_reaction`` is not in
            'reactants>>products' format.
        """
        tgt = Standardize.standardize_rsmi(target_reaction, stereo=False)
        matches: List[str] = []
        non_matches: List[str] = []
        for rxn in reactions:
            # Single pass keeps the input order within each bucket.
            bucket = matches if rxn == tgt else non_matches
            bucket.append(rxn)
        return matches, non_matches