Source code for synkit.Chem.utils

from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit.Chem import rdChemReactions
import re
from typing import List, Optional, Tuple, Union



[docs]
def clean_radical_rsmi(rsmi: str) -> str:
    """
    Drop radical-bearing fragments from both sides of a reaction SMILES.

    Each side of ``rsmi`` is parsed with RDKit and split into its
    disconnected fragments. Any fragment holding at least one atom whose
    radical-electron count is above zero is discarded; the surviving
    fragments are written back with ``Chem.MolToSmiles`` and joined by '.'.

    :param rsmi: Reaction SMILES string, e.g. 'A>>B.C'. A string that does
                 not contain '>>' is returned unchanged.
    :type rsmi: str
    :returns: Reaction SMILES rebuilt from the radical-free fragments. A
              side that RDKit cannot parse is emitted as an empty string.
    :rtype: str

    Example: for a product side of the form ``X.Y`` where ``Y`` contains a
    radical atom such as ``[c]``, only ``X`` is kept on that side.
    """

    def _has_radical(fragment) -> bool:
        # True as soon as one atom of the fragment carries radical electrons.
        return any(
            member.GetNumRadicalElectrons() > 0 for member in fragment.GetAtoms()
        )

    def _strip_radicals(side_smiles: str) -> str:
        parsed = Chem.MolFromSmiles(side_smiles)
        if parsed is None:
            # Unparseable side: give back an empty side instead of raising.
            return ""
        return ".".join(
            Chem.MolToSmiles(fragment, isomericSmiles=True)
            for fragment in Chem.GetMolFrags(parsed, asMols=True)
            if not _has_radical(fragment)
        )

    # Split on the first '>>' only; no separator means nothing to clean.
    lhs, arrow, rhs = rsmi.partition(">>")
    if not arrow:
        return rsmi
    return ">>".join(_strip_radicals(side) for side in (lhs, rhs))




[docs]
def enumerate_tautomers(reaction_smiles: str) -> Optional[List[str]]:
    """Build reaction SMILES variants from the tautomers of the reactant side.

    The reactant side is run through RDKit's ``TautomerEnumerator``; the
    product side is written once with ``Chem.MolToSmiles(..., canonical=True)``
    and reused for every variant.

    :param reaction_smiles: Reaction SMILES in 'reactants>>products'
        format.
    :type reaction_smiles: str
    :returns: A list whose first element is the input string as given,
        followed by one reaction SMILES per enumerated reactant tautomer.
        ``None`` is returned when any exception other than ``ValueError``
        occurs.
    :rtype: Optional[List[str]]
    :raises ValueError: If either side fails to parse, or if the input does
        not split into exactly two parts on '>>'.
    """
    try:
        lhs, rhs = reaction_smiles.split(">>")
        reactant_mol = Chem.MolFromSmiles(lhs)
        product_mol = Chem.MolFromSmiles(rhs)
        if reactant_mol is None or product_mol is None:
            raise ValueError("Invalid reactant or product SMILES.")

        tautomers = rdMolStandardize.TautomerEnumerator().Enumerate(reactant_mol)
        if not tautomers:
            # Fall back to the parsed reactants when nothing was enumerated.
            tautomers = [reactant_mol]

        canonical_products = Chem.MolToSmiles(product_mol, canonical=True)

        # The untouched input always leads the result list.
        variants = [reaction_smiles]
        variants.extend(
            Chem.MolToSmiles(tautomer) + ">>" + canonical_products
            for tautomer in tautomers
        )
        return variants
    except ValueError:
        # Invalid input is reported to the caller rather than swallowed.
        raise
    except Exception:
        return None




[docs]
def mapping_success_rate(list_mapping_data: List[str]) -> float:
    """Return the percentage of entries that carry an atom-map annotation.

    An entry counts as mapped when it contains a colon followed by at least
    one digit anywhere in the string.

    :param list_mapping_data: Strings to inspect for mappings.
    :type list_mapping_data: List[str]
    :returns: ``100 * mapped / total`` rounded to two decimals.
    :rtype: float
    :raises ValueError: If the input list is empty.
    """
    if not list_mapping_data:
        raise ValueError("The input list is empty, cannot calculate success rate.")

    # Compile once and reuse the bound search method for every entry.
    has_mapping = re.compile(r":\d+").search
    mapped = len([entry for entry in list_mapping_data if has_mapping(entry)])

    percentage = 100 * mapped / len(list_mapping_data)
    return round(percentage, 2)




[docs]
def count_carbons(smiles: str) -> int:
    """Return how many atoms of a molecule have the element symbol 'C'.

    :param smiles: SMILES string of the molecule.
    :type smiles: str
    :returns: Number of atoms whose ``GetSymbol()`` equals ``"C"``.
    :rtype: int
    :raises ValueError: If RDKit cannot parse the SMILES string.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        raise ValueError(f"Invalid SMILES string: {smiles}")

    carbon_atoms = [atom for atom in parsed.GetAtoms() if atom.GetSymbol() == "C"]
    return len(carbon_atoms)




[docs]
def get_max_fragment(smiles: Union[str, List[str]]) -> str:
    """Pick the fragment with the highest atom count and return its SMILES.

    Every input string is split on '.'; empty pieces and pieces RDKit cannot
    parse are ignored. Size is measured with ``GetNumAtoms()``; on a tie the
    first fragment encountered wins.

    :param smiles: SMILES string(s), possibly with '.' separators.
    :type smiles: str or List[str]
    :returns: SMILES (via ``Chem.MolToSmiles``) of the largest fragment, or
        an empty string when no fragment is usable.
    :rtype: str
    """
    # Treat a lone string as a one-element collection so both input kinds
    # share the same splitting logic.
    sources = [smiles] if isinstance(smiles, str) else smiles
    pieces = [piece for entry in sources for piece in entry.split(".")]

    # filter(None, ...) discards the None results of failed parses.
    candidates = list(
        filter(None, (Chem.MolFromSmiles(piece) for piece in pieces if piece))
    )
    if not candidates:
        return ""

    largest = max(candidates, key=lambda candidate: candidate.GetNumAtoms())
    return Chem.MolToSmiles(largest)




[docs]
def filter_smiles(smiles_list: List[str], target_smiles: str) -> List[str]:
    """Keep carbon-containing SMILES that differ from a target molecule.

    Comparison is done on the ``Chem.MolToSmiles`` output of both the
    candidate and the target, not on the raw input strings. The returned
    entries are the original input strings.

    :param smiles_list: List of SMILES strings to filter.
    :type smiles_list: List[str]
    :param target_smiles: SMILES string to exclude. If it cannot be parsed,
                          no candidate is excluded on its account.
    :type target_smiles: str
    :returns: Entries of ``smiles_list`` that parse, contain at least one
              atom with symbol 'C', and do not match ``target_smiles``.
    :rtype: List[str]
    """
    parsed_target = Chem.MolFromSmiles(target_smiles)
    if parsed_target:
        excluded = Chem.MolToSmiles(parsed_target)
    else:
        # An unparseable target compares as the empty string.
        excluded = ""

    def _keep(candidate: str) -> bool:
        parsed = Chem.MolFromSmiles(candidate)
        if not parsed:
            return False
        if not any(atom.GetSymbol() == "C" for atom in parsed.GetAtoms()):
            return False
        return Chem.MolToSmiles(parsed) != excluded

    return [candidate for candidate in smiles_list if _keep(candidate)]




[docs]
def remove_atom_mappings(mol: Chem.Mol) -> Chem.Mol:
    """Reset the atom-map number of every atom in a molecule to zero.

    The molecule is modified in place; the returned object is the very same
    instance that was passed in.

    :param mol: RDKit Mol object.
    :type mol: Chem.Mol
    :returns: ``mol`` itself, with all atom-map numbers set to zero.
    :rtype: Chem.Mol
    """
    unmapped = 0
    for mapped_atom in mol.GetAtoms():
        mapped_atom.SetAtomMapNum(unmapped)
    return mol




[docs]
def get_sanitized_smiles(smiles_list: List[str]) -> List[str]:
    """Sanitize a list of SMILES and reduce it to its single largest fragment.

    Entries containing '->' and entries RDKit cannot parse are skipped.
    Each remaining molecule has its atom-map numbers removed and is
    sanitized; entries that fail sanitization or SMILES output are dropped.
    The surviving SMILES are then passed together to ``get_max_fragment``.

    :param smiles_list: List of SMILES strings to sanitize.
    :type smiles_list: List[str]
    :returns: A one-element list holding the largest fragment found across
        all sanitized entries, or an empty list if nothing survived.
    :rtype: List[str]
    """
    cleaned: List[str] = []
    for entry in smiles_list:
        # '->' entries are never handed to the parser.
        parsed = None if "->" in entry else Chem.MolFromSmiles(entry)
        if not parsed:
            continue
        parsed = remove_atom_mappings(parsed)
        try:
            Chem.SanitizeMol(parsed)
            isomeric = Chem.MolToSmiles(parsed, isomericSmiles=True)
        except Exception:
            # Best effort: an entry that cannot be sanitized is left out.
            continue
        cleaned.append(isomeric)

    # Collapse everything to the one largest fragment across all entries.
    return [get_max_fragment(cleaned)] if cleaned else cleaned




[docs]
def remove_duplicates(smiles_list: List[str]) -> List[str]:
    """Return the distinct strings of a list in order of first appearance.

    :param smiles_list: List of strings (e.g., SMILES) possibly with
        duplicates.
    :type smiles_list: List[str]
    :returns: New list with later repeats dropped and original order kept.
    :rtype: List[str]
    """
    # dict keys are unique and remember insertion order.
    return list(dict.fromkeys(smiles_list))




[docs]
def process_smiles_list(smiles_list: List[str]) -> List[str]:
    """Flatten a list of SMILES by splitting every entry on '.'.

    :param smiles_list: List of SMILES strings, some containing '.'
        separators.
    :type smiles_list: List[str]
    :returns: Component SMILES strings in input order; an entry without '.'
        is carried over as is.
    :rtype: List[str]
    """
    # split(".") on a string without a dot yields that string alone, so a
    # single expression covers both cases.
    return [component for entry in smiles_list for component in entry.split(".")]




[docs]
def remove_explicit_H_from_rsmi(rsmi: str) -> str:
    """Rewrite a reaction SMILES after passing each molecule through RemoveHs.

    The reaction is loaded with ``ReactionFromSmarts(..., useSmiles=True)``;
    every reactant and product template is passed through ``Chem.RemoveHs``
    and written back as isomeric SMILES.

    :param rsmi: Atom-mapped reaction SMILES with explicit hydrogens.
    :type rsmi: str
    :returns: Reaction SMILES 'reactants>>products' built from the
        hydrogen-stripped molecules, each side joined by '.'.
    :rtype: str
    """
    reaction = rdChemReactions.ReactionFromSmarts(rsmi, useSmiles=True)

    sides = []
    # Reactants first, then products, so the output keeps that orientation.
    for fetch_templates in (reaction.GetReactants, reaction.GetProducts):
        stripped = [
            Chem.MolToSmiles(Chem.RemoveHs(template), isomericSmiles=True)
            for template in fetch_templates()
        ]
        sides.append(".".join(stripped))

    return ">>".join(sides)




[docs]
def remove_common_reagents(reaction_smiles: str) -> str:
    """Remove species that appear unchanged on both sides of a reaction SMILES.

    Both sides are split on '.', and every component string that occurs on
    the reactant side as well as on the product side is dropped from both.
    Components are compared as literal strings; no canonicalization is done,
    so two different spellings of the same molecule are not matched.

    :param reaction_smiles: Reaction SMILES 'reactants>>products'.
    :type reaction_smiles: str
    :returns: The cleaned reaction SMILES. Only this string is returned; the
        removed reagents themselves are not reported.
    :rtype: str
    :raises ValueError: If the input does not split into exactly two parts
        on '>>'.
    """
    reactants, products = reaction_smiles.split(">>")
    reactant_list = reactants.split(".")
    product_list = products.split(".")

    # Anything present verbatim on both sides is treated as a spectator.
    common_reagents = set(reactant_list) & set(product_list)

    filtered_reactants = [r for r in reactant_list if r not in common_reagents]
    filtered_products = [p for p in product_list if p not in common_reagents]
    return ".".join(filtered_reactants) + ">>" + ".".join(filtered_products)




[docs]
def reverse_reaction(rsmi: str) -> str:
    """Swap the two sides of a reaction SMILES.

    :param rsmi: Reaction SMILES 'reactants>>products'.
    :type rsmi: str
    :returns: Reaction SMILES 'products>>reactants'. The input is returned
        unchanged when it does not split into exactly two parts on '>>'.
    :rtype: str
    """
    sides = rsmi.split(">>")
    if len(sides) != 2:
        return rsmi
    reactants, products = sides
    return ">>".join((products, reactants))




[docs]
def merge_reaction(rsmi_1: str, rsmi_2: str) -> Optional[str]:
    """Combine two reaction SMILES side by side into one reaction.

    Reactant sides are joined with '.', and so are product sides.

    :param rsmi_1: First reaction SMILES.
    :type rsmi_1: str
    :param rsmi_2: Second reaction SMILES.
    :type rsmi_2: str
    :returns: Merged reaction SMILES, or None when either input does not
        split into exactly two parts on '>>' or has an empty side.
    :rtype: Optional[str]
    """
    halves = []
    # Check the inputs in order; stop at the first malformed one.
    for rsmi in (rsmi_1, rsmi_2):
        pieces = rsmi.split(">>")
        if len(pieces) != 2:
            return None
        halves.append(pieces)

    (left_1, right_1), (left_2, right_2) = halves
    if not (left_1 and right_1 and left_2 and right_2):
        return None

    merged_reactants = ".".join((left_1, left_2))
    merged_products = ".".join((right_1, right_2))
    return f"{merged_reactants}>>{merged_products}"




[docs]
def find_longest_fragment(input_list: List[str]) -> Optional[str]:
    """Return the string with the greatest length from a list.

    When several strings share the greatest length, the first of them is
    returned.

    :param input_list: List of strings to search.
    :type input_list: List[str]
    :returns: Longest string, or None if the list is empty.
    :rtype: Optional[str]
    """
    return max(input_list, key=len) if input_list else None