Source code for synkit.CRN.Query.kegg_parse

from __future__ import annotations

import re
from dataclasses import dataclass
from typing import Any, Mapping, Optional, Sequence

from rdkit import Chem

# Bare KEGG compound id: "C" followed by five digits. The pattern has no
# anchors or boundary assertions, so searching with it also hits the same
# shape inside a longer token.
_CID_PATTERN = re.compile(r"C\d{5}")
# Bare KEGG module id: "M" followed by five digits (unanchored).
_MODULE_PATTERN = re.compile(r"M\d{5}")
# One complete equation term, anchored with ^ and $: an optional integer
# coefficient (group 1) separated by whitespace from a compound id (group 2).
_TERM_PATTERN = re.compile(r"^(?:(\d+)\s+)?(C\d{5})$")
# Bare KEGG reaction id: "R" followed by five digits (unanchored).
_RID_PATTERN = re.compile(r"R\d{5}")
# One reaction line of a MODULE entry, split into named groups:
#   rids  - one or more reaction ids joined by commas
#   lhs   - everything up to the first arrow (matched lazily)
#   arrow - one of <=>, ->, =>, <-, <=
#   rhs   - everything after the arrow
# NOTE(review): only comma-separated reaction id lists are accepted and "<->"
# is not among the arrow alternatives; lines using another id separator will
# not match at all - confirm this covers the MODULE entries being parsed.
_MODULE_REACTION_LINE = re.compile(
    r"^(?P<rids>R\d{5}(?:,R\d{5})*)\s+(?P<lhs>.+?)\s*(?P<arrow><=>|->|=>|<-|<=)\s*(?P<rhs>.+)$"
)

# (compound_id, stoichiometric coefficient) pair.
CompoundStoich = tuple[str, int]
# One compound table record; reaction_smiles_from_equation reads its "smiles" key.
CompoundRecord = Mapping[str, Any]
# Unresolved compound ids, stored under the keys "reactants" and "products".
ReactionSmilesMissing = dict[str, list[str]]



[docs]
@dataclass(frozen=True, slots=True)
class KEGGEquation:
    """
    Structured representation of a parsed KEGG reaction equation.

    :param reactants:
        Left-hand side of the equation as ``(compound_id, stoichiometry)``
        pairs.
    :type reactants: list[tuple[str, int]]
    :param products:
        Right-hand side of the equation as ``(compound_id, stoichiometry)``
        pairs.
    :type products: list[tuple[str, int]]
    :param reversible:
        Whether the original equation used the reversible arrow ``<=>``.
    :type reversible: bool

    Example
    -------
    .. code-block:: python

        equation = KEGGEquation(
            reactants=[("C00001", 1), ("C00002", 2)],
            products=[("C00008", 1)],
            reversible=False,
        )
    """

    reactants: list[CompoundStoich]
    products: list[CompoundStoich]
    reversible: bool




[docs]
def parse_kegg_field_blocks(text: str, field: str) -> list[str]:
    """
    Extract payloads from a KEGG flatfile field, including continuation lines.

    A line opens a field occurrence when it starts with ``field`` and the
    keyword is followed by whitespace or the end of the line. The boundary
    check keeps a keyword from capturing a longer keyword that merely shares
    its prefix (``"PATHWAY"`` no longer matches ``PATHWAY_MAP`` lines).

    Continuation lines are recognized as lines beginning with spaces or tabs
    and are concatenated, separated by single spaces, to the payload of the
    preceding field occurrence.

    :param text:
        Raw KEGG flatfile text.
    :type text: str
    :param field:
        Flatfile field name such as ``"MODULE"``, ``"REACTION"``,
        ``"EQUATION"``, or ``"NAME"``. A name that is empty or already ends
        in whitespace is matched as a plain prefix.
    :type field: str

    :returns:
        One payload string per matching field occurrence, in file order.
    :rtype: list[str]

    Example
    -------
    .. code-block:: python

        text = (
            "MODULE      M00001 Glycolysis\n"
            "            continuation line\n"
        )
        payloads = parse_kegg_field_blocks(text, "MODULE")
        # ["M00001 Glycolysis continuation line"]
    """
    # An empty keyword, or one that already carries its own trailing
    # whitespace, cannot be given a boundary; keep plain prefix matching there.
    needs_boundary = bool(field) and not field[-1].isspace()

    def opens_field(line: str) -> bool:
        """Return True when ``line`` starts a new occurrence of ``field``."""
        if not line.startswith(field):
            return False
        if not needs_boundary:
            return True
        rest = line[len(field):]
        return not rest or rest[0].isspace()

    payloads: list[str] = []
    current: list[str] | None = None  # pieces of the occurrence being collected

    for line in text.splitlines():
        if current is not None:
            if line.startswith((" ", "\t")):
                current.append(line.strip())
                continue
            # First non-indented line closes the running occurrence; the same
            # line may itself open the next one, so fall through.
            payloads.append(" ".join(current).strip())
            current = None

        if opens_field(line):
            current = [line[len(field):].strip()]

    if current is not None:
        payloads.append(" ".join(current).strip())

    return payloads




[docs]
def normalize_module_id(module_id: str) -> Optional[str]:
    """
    Pull the canonical KEGG module identifier out of a token.

    The first ``M`` + five digits sequence found anywhere in the input is
    returned, so both ``"hsa_M00001"`` and ``"M00001"`` yield ``"M00001"``.

    :param module_id:
        Raw module token or text that contains one.
    :type module_id: str

    :returns:
        The canonical module identifier, or ``None`` when the input contains
        no such sequence.
    :rtype: Optional[str]

    Example
    -------
    .. code-block:: python

        canonical = normalize_module_id("hsa_M00001")  # "M00001"
    """
    found = _MODULE_PATTERN.search(module_id)
    if found is None:
        return None
    return found.group(0)




[docs]
def parse_side(side: str) -> list[CompoundStoich]:
    """
    Turn one side of a KEGG equation into ``(compound_id, coefficient)`` pairs.

    The side is split on ``+``; every resulting term must be a compound
    identifier, optionally preceded by an integer coefficient and whitespace.
    A missing coefficient counts as ``1``, so ``"2 C00139 + C00001"`` gives
    ``[("C00139", 2), ("C00001", 1)]``. A blank side gives an empty list.

    :param side:
        One side of a KEGG equation.
    :type side: str

    :returns:
        Parsed pairs in the order the terms appear.
    :rtype: list[tuple[str, int]]

    :raises ValueError:
        Raised for the first term that is not valid compound-stoichiometry
        syntax.

    Example
    -------
    .. code-block:: python

        items = parse_side("2 C00139 + C00001")
    """
    stripped = side.strip()
    if stripped == "":
        return []

    pairs: list[CompoundStoich] = []
    for raw_term in stripped.split("+"):
        term = raw_term.strip()
        hit = _TERM_PATTERN.match(term)
        if hit is None:
            raise ValueError(f"Unparsed KEGG equation term: {term!r}")

        # Group 1 is the optional coefficient, group 2 the compound id.
        count_text, compound_id = hit.groups()
        pairs.append((compound_id, int(count_text) if count_text else 1))

    return pairs




[docs]
def parse_equation(equation: str) -> KEGGEquation:
    """
    Split a KEGG equation string at its arrow and parse both sides.

    Recognized arrows are ``<=>``, ``<->``, ``=>``, ``->``, ``<=``, and
    ``<-``. The first two mark the equation as reversible. For the two
    right-to-left arrows the sides are swapped, so the text after the arrow
    becomes the reactants and the text before it becomes the products.

    :param equation:
        KEGG equation string.
    :type equation: str

    :returns:
        Parsed equation object.
    :rtype: KEGGEquation

    :raises ValueError:
        Raised when no supported arrow is present, when the detected arrow
        occurs more than once, or when a side contains an unparsable term.

    Example
    -------
    .. code-block:: python

        parsed = parse_equation("C00001 + C00002 <=> C00003")
    """
    # (arrow, reversible, sides_swapped). Order matters: the three-character
    # arrows must be tried before the two-character arrows they contain.
    arrow_table = (
        ("<=>", True, False),
        ("<->", True, False),
        ("=>", False, False),
        ("->", False, False),
        ("<=", False, True),
        ("<-", False, True),
    )

    for token, reversible, sides_swapped in arrow_table:
        if token not in equation:
            continue

        before, after = equation.split(token)
        lhs, rhs = (after, before) if sides_swapped else (before, after)
        return KEGGEquation(
            reactants=parse_side(lhs),
            products=parse_side(rhs),
            reversible=reversible,
        )

    raise ValueError(f"Unknown KEGG equation arrow in: {equation!r}")




[docs]
def get_compound_ids_from_equations(
    equations_by_rid: Mapping[str, Optional[str]],
) -> tuple[list[str], dict[str, KEGGEquation]]:
    """
    Parse every equation and gather the compound identifiers they mention.

    Entries whose equation is ``None`` or an empty string are ignored and do
    not appear in the returned mapping.

    :param equations_by_rid:
        Mapping from reaction identifier to KEGG equation string.
    :type equations_by_rid: Mapping[str, Optional[str]]

    :returns:
        ``(compound_ids, parsed_by_rid)``: the sorted, de-duplicated compound
        identifiers from both sides of all parsed equations, and the parsed
        equations keyed by reaction identifier.
    :rtype: tuple[list[str], dict[str, KEGGEquation]]

    Example
    -------
    .. code-block:: python

        compound_ids, parsed = get_compound_ids_from_equations(
            {"R00001": "C00001 + C00002 => C00003"}
        )
    """
    parsed_by_rid: dict[str, KEGGEquation] = {}
    seen: set[str] = set()

    for rid, equation in equations_by_rid.items():
        if equation:
            record = parse_equation(equation)
            parsed_by_rid[rid] = record
            seen.update(cid for cid, _ in record.reactants)
            seen.update(cid for cid, _ in record.products)

    return sorted(seen), parsed_by_rid




[docs]
def get_compound_ids_from_text(text: str) -> list[str]:
    """
    Extract sorted unique KEGG compound identifiers from free text.

    A hit only counts when it stands on its own: it must not be directly
    preceded by a letter or digit, nor directly followed by another digit.
    This keeps longer tokens from being reported as compounds, for example
    ``RC00001`` (whose tail looks like ``C00001``) or ``C000012``.
    Separators such as spaces, ``_``, ``:``, ``+`` and brackets are fine on
    either side.

    :param text:
        Source text that may contain KEGG compound identifiers. ``None`` or
        an empty string yields an empty list.
    :type text: str

    :returns:
        Sorted unique KEGG compound identifiers.
    :rtype: list[str]

    Example
    -------
    .. code-block:: python

        ids = get_compound_ids_from_text("C00001 and C00002 appear here")
        # ["C00001", "C00002"]
    """
    if not text:
        return []

    found: set[str] = set()
    for match in _CID_PATTERN.finditer(text):
        start, end = match.span()
        # Tail of a longer alphanumeric token (e.g. "RC00001"): not a compound.
        if start > 0 and text[start - 1].isalnum():
            continue
        # More than five digits (e.g. "C000012"): not a compound either.
        if end < len(text) and text[end].isdigit():
            continue
        found.add(match.group(0))

    return sorted(found)




[docs]
def molblock_to_smiles(molblock: Optional[str]) -> Optional[str]:
    """
    Render a MOL block as an RDKit SMILES string.

    The block is parsed with sanitization enabled and the resulting molecule
    is written out with ``Chem.MolToSmiles``.

    :param molblock:
        MOL block text, typically retrieved from a KEGG compound record.
    :type molblock: Optional[str]

    :returns:
        The SMILES string, or ``None`` when the input is empty/``None`` or
        RDKit returns no molecule for it.
    :rtype: Optional[str]

    Example
    -------
    .. code-block:: python

        smiles = molblock_to_smiles(molblock_text)
    """
    # Skip RDKit entirely for missing input.
    parsed = Chem.MolFromMolBlock(molblock, sanitize=True) if molblock else None
    return None if parsed is None else Chem.MolToSmiles(parsed)




[docs]
def expand_stoichiometry(items: Sequence[CompoundStoich]) -> list[str]:
    """
    Repeat each compound identifier as many times as its coefficient.

    Input order is preserved, so ``[("C00001", 2), ("C00002", 1)]`` yields
    ``["C00001", "C00001", "C00002"]``. A coefficient of zero (or below)
    contributes nothing.

    :param items:
        Compound/coefficient pairs.
    :type items: Sequence[tuple[str, int]]

    :returns:
        Flat list of compound identifiers.
    :rtype: list[str]

    Example
    -------
    .. code-block:: python

        expanded = expand_stoichiometry([("C00001", 2), ("C00002", 1)])
    """
    return [cid for cid, count in items for _ in range(count)]




[docs]
def reaction_smiles_from_equation(
    parsed_equation: KEGGEquation,
    compounds_by_cid: Mapping[str, CompoundRecord],
) -> tuple[str, ReactionSmilesMissing]:
    """
    Assemble a ``reactants>>products`` reaction SMILES for a parsed equation.

    Each side is first expanded by stoichiometry, so a compound with
    coefficient ``2`` contributes its SMILES twice. Fragments on one side
    are joined with ``.``. A compound that is absent from the table, or
    whose record has no truthy ``"smiles"`` value, is left out of the string
    and reported in ``missing`` instead (once per expanded copy).

    :param parsed_equation:
        Parsed KEGG equation object.
    :type parsed_equation: KEGGEquation
    :param compounds_by_cid:
        Compound table keyed by KEGG compound identifier. Each record should
        provide a ``"smiles"`` entry.
    :type compounds_by_cid: Mapping[str, Mapping[str, Any]]

    :returns:
        Tuple ``(reaction_smiles, missing)`` where ``missing`` has the keys
        ``"reactants"`` and ``"products"``, each listing the unresolved KEGG
        compound identifiers for that side.
    :rtype: tuple[str, dict[str, list[str]]]

    Example
    -------
    .. code-block:: python

        parsed = parse_equation("C00001 + C00002 => C00003")
        reaction_smiles, missing = reaction_smiles_from_equation(
            parsed,
            {
                "C00001": {"smiles": "O"},
                "C00002": {"smiles": "CCO"},
                "C00003": {"smiles": "CC(=O)O"},
            },
        )
    """

    def split_resolved(compound_ids: list[str]) -> tuple[list[str], list[str]]:
        """Partition ids into (SMILES found, ids without SMILES)."""
        resolved: list[str] = []
        unresolved: list[str] = []
        for cid in compound_ids:
            smiles = compounds_by_cid.get(cid, {}).get("smiles")
            if smiles:
                resolved.append(smiles)
            else:
                unresolved.append(cid)
        return resolved, unresolved

    lhs_ids = expand_stoichiometry(parsed_equation.reactants)
    rhs_ids = expand_stoichiometry(parsed_equation.products)

    lhs_smiles, lhs_missing = split_resolved(lhs_ids)
    rhs_smiles, rhs_missing = split_resolved(rhs_ids)

    missing: ReactionSmilesMissing = {
        "reactants": lhs_missing,
        "products": rhs_missing,
    }
    reaction_smiles = ">>".join((".".join(lhs_smiles), ".".join(rhs_smiles)))
    return reaction_smiles, missing




[docs]
def parse_module_reaction_directions(
    text: str,
) -> dict[str, tuple[list[str], list[str], str]]:
    """
    Read per-reaction direction hints from the REACTION field of a MODULE entry.

    Scanning starts at the first line beginning with ``REACTION`` and covers
    the indented lines (leading space or tab) that follow it; the first
    other line ends the scan. Each scanned line that has the shape
    ``<reaction ids> <left side> <arrow> <right side>`` contributes one entry
    per reaction identifier. Lines of any other shape are skipped.

    :param text:
        Raw KEGG MODULE flatfile text.
    :type text: str

    :returns:
        Mapping ``reaction_id -> (left_compound_ids, right_compound_ids,
        arrow)``. Reaction identifiers listed on the same line share the same
        two list objects. A later line overrides an earlier one for the same
        identifier.
    :rtype: dict[str, tuple[list[str], list[str], str]]
    """
    directions: dict[str, tuple[list[str], list[str], str]] = {}
    field_seen = False

    for raw in text.splitlines():
        if raw.startswith("REACTION"):
            field_seen = True
            payload = raw[len("REACTION"):].strip()
        elif not field_seen:
            # Still ahead of the REACTION field.
            continue
        elif raw.startswith((" ", "\t")):
            payload = raw.strip()
        else:
            # First non-indented line after the field: the block is over.
            break

        hit = _MODULE_REACTION_LINE.match(payload)
        if hit is None:
            continue

        rids = _RID_PATTERN.findall(hit.group("rids"))
        lhs_ids = get_compound_ids_from_text(hit.group("lhs"))
        rhs_ids = get_compound_ids_from_text(hit.group("rhs"))
        arrow = hit.group("arrow")

        directions.update((rid, (lhs_ids, rhs_ids, arrow)) for rid in rids)

    return directions




[docs]
def orient_equation_to_module(
    parsed: KEGGEquation,
    left_ids: list[str],
    right_ids: list[str],
) -> KEGGEquation:
    """
    Flip a parsed equation when the module hint reads it the other way round.

    Two overlap scores are compared: how many hinted compounds sit on the
    same side as in ``parsed``, and how many sit on the opposite side. Only
    a strictly larger opposite-side score triggers a flip; ties keep the
    equation as it is.

    :param parsed:
        Parsed KEGG equation.
    :type parsed: KEGGEquation
    :param left_ids:
        Compound identifiers on the left of the module reaction line.
    :type left_ids: list[str]
    :param right_ids:
        Compound identifiers on the right of the module reaction line.
    :type right_ids: list[str]

    :returns:
        ``parsed`` itself when no flip is needed, otherwise a new equation
        with reactants and products exchanged and ``reversible`` unchanged.
    :rtype: KEGGEquation
    """
    hinted_left, hinted_right = set(left_ids), set(right_ids)
    on_reactant_side = {cid for cid, _ in parsed.reactants}
    on_product_side = {cid for cid, _ in parsed.products}

    same_way = len(hinted_left & on_reactant_side) + len(
        hinted_right & on_product_side
    )
    other_way = len(hinted_left & on_product_side) + len(
        hinted_right & on_reactant_side
    )

    if other_way <= same_way:
        return parsed

    return KEGGEquation(
        reactants=list(parsed.products),
        products=list(parsed.reactants),
        reversible=parsed.reversible,
    )




[docs]
def equation_to_text(parsed: KEGGEquation, arrow: str | None = None) -> str:
    """
    Convert a ``KEGGEquation`` back to text, optionally forcing the arrow.

    Terms are written as ``"<coefficient> <compound_id>"``, with the
    coefficient omitted when it equals ``1``, and joined with ``" + "``.

    Single-stroke arrows are rewritten to their double-stroke counterparts
    before use: ``->`` becomes ``=>``, ``<-`` becomes ``<=`` and ``<->``
    becomes ``<=>``. Any other explicit arrow is emitted unchanged. The
    sides are never swapped here: reactants are always written on the left,
    even for a right-to-left arrow.

    :param parsed:
        Parsed KEGG equation.
    :type parsed: KEGGEquation
    :param arrow:
        Arrow to place between the two sides, typically the module hint.
        When ``None``, ``<=>`` is used for reversible equations and ``=>``
        otherwise.
    :type arrow: Optional[str]

    :returns:
        Equation text of the form ``"<reactants> <arrow> <products>"``.
    :rtype: str

    Example
    -------
    .. code-block:: python

        text = equation_to_text(parse_equation("C00001 + 2 C00002 => C00003"))
        # "C00001 + 2 C00002 => C00003"
    """
    # parse_equation accepts "<->" as the reversible arrow, so it is
    # canonicalized here together with "->" and "<-".
    canonical_arrows = {"->": "=>", "<-": "<=", "<->": "<=>"}

    def side_to_text(items: list[tuple[str, int]]) -> str:
        """Render one side, omitting coefficients equal to 1."""
        return " + ".join(
            cid if coeff == 1 else f"{coeff} {cid}" for cid, coeff in items
        )

    if arrow is None:
        arrow = "<=>" if parsed.reversible else "=>"
    else:
        arrow = canonical_arrows.get(arrow, arrow)

    return f"{side_to_text(parsed.reactants)} {arrow} {side_to_text(parsed.products)}"