# Source code for synkit.CRN.Query.kegg_extract

from __future__ import annotations

import json
import re
from dataclasses import dataclass
from typing import Any, Optional, Mapping

try:
    from rxnmapper import RXNMapper as _RXNMapper
except Exception:
    _RXNMapper = None

from .kegg_api import KEGGClient
from .kegg_parse import (
    equation_to_text,
    get_compound_ids_from_equations,
    get_compound_ids_from_text,
    molblock_to_smiles,
    normalize_module_id,
    orient_equation_to_module,
    parse_equation,
    parse_kegg_field_blocks,
    parse_module_reaction_directions,
    reaction_smiles_from_equation,
)

# Unanchored pattern for the letter "R" followed by exactly five digits
# (e.g. "R00200"). Used to scan raw REACTION field payloads for reaction IDs.
_RID_PATTERN = re.compile(r"R\d{5}")


# Reaction ID -> equation text; the value is None when no equation was found.
ReactionEquationMap = dict[str, Optional[str]]
# Compound ID -> record dict (build_compound_table fills the keys
# "id", "name", "smiles" and "molblock").
CompoundTable = dict[str, dict[str, Any]]
# Reaction ID -> reaction SMILES string.
ReactionSmilesMap = dict[str, str]
# Reaction ID -> the "missing" mapping returned by
# reaction_smiles_from_equation. NOTE(review): the meaning of the inner keys
# is defined by that helper and is not visible here -- confirm before relying
# on it.
MissingByReaction = dict[str, dict[str, list[str]]]
# Generic JSON-like dictionary used for the assembled output blocks.
JSONDict = dict[str, Any]



[docs]
@dataclass(slots=True)
class KEGGExtractor:
    """
    High-level extractor for KEGG pathway and module reaction data.

    This class orchestrates KEGG entry retrieval, module membership parsing,
    reaction equation collection, compound-table construction, reaction SMILES
    assembly, and optional atom mapping.

    :param client:
        Optional KEGG REST client. When omitted, a default :class:`KEGGClient`
        instance is created during :meth:`__post_init__`.
    :type client: Optional[KEGGClient]
    :param mapper_cls:
        Optional atom-mapper class used by :meth:`atom_map_reactions`. The
        class must be instantiable without arguments and must provide
        ``get_attention_guided_atom_maps``.
    :type mapper_cls: Optional[type[Any]]

    Example
    -------
    .. code-block:: python

        extractor = KEGGExtractor()
        data = extractor.build_module_json(
            "M00001",
            with_compounds=True,
            with_atom_maps=False,
        )
    """

    client: Optional[KEGGClient] = None
    mapper_cls: Optional[type[Any]] = _RXNMapper

    def __post_init__(self) -> None:
        if self.client is None:
            self.client = KEGGClient()


[docs]
    @staticmethod
    def save_json(data: Mapping[str, Any], save_as: Optional[str]) -> None:
        """
        Save JSON data to disk when an output path is provided.

        :param data:
            JSON-serializable data to write.
        :type data: Mapping[str, Any]
        :param save_as:
            Optional output path.
        :type save_as: Optional[str]

        :returns:
            ``None``.
        :rtype: None

        Example
        -------
        .. code-block:: python

            KEGGExtractor._save_json({"x": 1}, "out.json")
        """
        if save_as:
            with open(save_as, "w", encoding="utf-8") as handle:
                json.dump(data, handle, ensure_ascii=False, indent=2)



[docs]
    def get_modules_from_pathway(self, pathway_id: str) -> list[str]:
        """
        List the module IDs referenced by a KEGG pathway entry.

        The pathway entry is fetched through the client, its ``MODULE`` field
        payloads are split on whitespace, and every token that
        ``normalize_module_id`` accepts (returns non-``None`` for) is kept in
        the order encountered.

        :param pathway_id:
            KEGG pathway identifier such as ``"hsa00010"``.
        :type pathway_id: str

        :returns:
            Normalized module identifiers, e.g. ``["M00001", "M00002"]``.
        :rtype: list[str]

        Example
        -------
        .. code-block:: python

            modules = extractor.get_modules_from_pathway("hsa00010")
        """
        entry_text = self.client.get_text(f"get/{pathway_id}")

        # Lazily normalize every whitespace-separated token of every MODULE
        # payload; tokens that are not module IDs come back as None.
        candidates = (
            normalize_module_id(token)
            for block in parse_kegg_field_blocks(entry_text, "MODULE")
            for token in block.split()
        )
        return [module_id for module_id in candidates if module_id is not None]



[docs]
    def get_reaction_ids_from_module(self, module_id: str) -> list[str]:
        """
        Collect the KEGG reaction IDs that belong to a module entry.

        When ``parse_module_reaction_directions`` yields a non-empty mapping,
        its keys are returned in the mapping's own order. Otherwise the
        ``REACTION`` field payloads are scanned for ``R`` + five digit tokens
        and the unique matches are returned sorted.

        :param module_id:
            KEGG module identifier such as ``"M00001"``.
        :type module_id: str

        :returns:
            Reaction identifiers: in parsed-direction order when directions
            are available, otherwise sorted and de-duplicated.
        :rtype: list[str]

        Example
        -------
        .. code-block:: python

            reaction_ids = extractor.get_reaction_ids_from_module("M00001")
        """
        entry_text = self.client.get_text(f"get/{module_id}")

        ordered = parse_module_reaction_directions(entry_text)
        if ordered:
            return list(ordered)

        # Fallback: plain regex scan of the REACTION payloads.
        found = {
            rid
            for block in parse_kegg_field_blocks(entry_text, "REACTION")
            for rid in _RID_PATTERN.findall(block)
        }
        return sorted(found)



[docs]
    def get_equation_for_reaction(self, reaction_id: str) -> Optional[str]:
        """
        Return the ``EQUATION`` field of a KEGG reaction entry.

        :param reaction_id:
            KEGG reaction identifier such as ``"R00200"``.
        :type reaction_id: str

        :returns:
            The first ``EQUATION`` payload with surrounding whitespace
            removed, or ``None`` when the entry has no such field.
        :rtype: Optional[str]

        Example
        -------
        .. code-block:: python

            equation = extractor.get_equation_for_reaction("R00200")
        """
        entry_text = self.client.get_text(f"get/rn:{reaction_id}")
        equation_blocks = parse_kegg_field_blocks(entry_text, "EQUATION")
        if not equation_blocks:
            return None
        return equation_blocks[0].strip()



[docs]
    def get_module_equations(self, module_id: str) -> ReactionEquationMap:
        """
        Build a reaction-to-equation mapping for a KEGG module.

        The module entry is fetched once. Reaction IDs come from
        ``parse_module_reaction_directions`` when it yields anything;
        otherwise they are taken from a regex scan of the ``REACTION``
        payloads of the same text (sorted, unique), mirroring the fallback of
        :meth:`get_reaction_ids_from_module` without a second request.

        For each reaction the equation is fetched via
        :meth:`get_equation_for_reaction`. If the module supplies a direction
        for that reaction, the equation is parsed, passed through
        ``orient_equation_to_module`` with the module's left/right IDs, and
        rendered again with the module's arrow; otherwise the raw equation is
        stored unchanged.

        :param module_id:
            KEGG module identifier.
        :type module_id: str

        :returns:
            Mapping from reaction identifier to equation string, with ``None``
            for reactions whose entry has no equation.
        :rtype: dict[str, Optional[str]]

        Example
        -------
        .. code-block:: python

            equations = extractor.get_module_equations("M00001")
        """
        module_text = self.client.get_text(f"get/{module_id}")
        # Normalise a falsy result so the membership test below is always safe.
        directions = parse_module_reaction_directions(module_text) or {}

        if directions:
            reaction_ids = list(directions)
        else:
            # Reuse the text already downloaded instead of delegating to
            # get_reaction_ids_from_module, which would fetch and parse the
            # very same module entry a second time.
            reaction_ids = sorted(
                {
                    rid
                    for payload in parse_kegg_field_blocks(module_text, "REACTION")
                    for rid in _RID_PATTERN.findall(payload)
                }
            )

        equations_by_rid: ReactionEquationMap = {}

        for reaction_id in reaction_ids:
            equation = self.get_equation_for_reaction(reaction_id)

            # Nothing to orient: either no equation exists or the module gives
            # no direction for this reaction.
            if equation is None or reaction_id not in directions:
                equations_by_rid[reaction_id] = equation
                continue

            left_ids, right_ids, module_arrow = directions[reaction_id]
            oriented = orient_equation_to_module(
                parse_equation(equation), left_ids, right_ids
            )
            equations_by_rid[reaction_id] = equation_to_text(
                oriented, arrow=module_arrow
            )

        return equations_by_rid



[docs]
    def get_pathway_equations(
        self,
        pathway_id: str,
    ) -> dict[str, ReactionEquationMap]:
        """
        Collect the reaction equations of every module in a pathway.

        Modules are discovered with :meth:`get_modules_from_pathway` and each
        one is resolved with :meth:`get_module_equations`.

        :param pathway_id:
            KEGG pathway identifier.
        :type pathway_id: str

        :returns:
            Nested mapping ``{module_id: {reaction_id: equation}}``.
        :rtype: dict[str, dict[str, Optional[str]]]

        Example
        -------
        .. code-block:: python

            nested = extractor.get_pathway_equations("hsa00010")
        """
        equations_by_module: dict[str, ReactionEquationMap] = {}
        for module_id in self.get_modules_from_pathway(pathway_id):
            equations_by_module[module_id] = self.get_module_equations(module_id)
        return equations_by_module



[docs]
    def get_compound_name(self, compound_id: str) -> Optional[str]:
        """
        Look up the primary name of a KEGG compound.

        Only the text before the first ``;`` of the first ``NAME`` payload is
        used, so additional synonyms are dropped.

        :param compound_id:
            KEGG compound identifier such as ``"C00001"``.
        :type compound_id: str

        :returns:
            The stripped primary name, or ``None`` when the entry has no
            ``NAME`` field.
        :rtype: Optional[str]

        Example
        -------
        .. code-block:: python

            name = extractor.get_compound_name("C00001")
        """
        entry_text = self.client.get_text(f"get/cpd:{compound_id}")
        name_blocks = parse_kegg_field_blocks(entry_text, "NAME")
        if name_blocks:
            # Synonyms are separated by ";" -- keep only the leading one.
            primary, _, _ = name_blocks[0].strip().partition(";")
            return primary.strip()
        return None



[docs]
    def get_compound_molblock(self, compound_id: str) -> Optional[str]:
        """
        Retrieve the KEGG MOL block for a compound.

        :param compound_id:
            KEGG compound identifier.
        :type compound_id: str

        :returns:
            MOL block text when available, otherwise ``None``.
        :rtype: Optional[str]

        Example
        -------
        .. code-block:: python

            molblock = extractor.get_compound_molblock("C00001")
        """
        return self.client.get_optional_text(f"get/cpd:{compound_id}/mol")



[docs]
    def build_compound_table(
        self,
        compound_ids: list[str],
    ) -> CompoundTable:
        """
        Assemble one record per KEGG compound identifier.

        For every identifier the primary name and the MOL block are fetched,
        and the MOL block is handed to ``molblock_to_smiles`` to obtain the
        ``"smiles"`` value.

        :param compound_ids:
            KEGG compound identifiers.
        :type compound_ids: list[str]

        :returns:
            Table of the form
            ``{cid: {"id", "name", "smiles", "molblock"}}``.
        :rtype: dict[str, dict[str, Any]]

        Example
        -------
        .. code-block:: python

            compounds = extractor.build_compound_table(["C00001", "C00002"])
        """
        table: CompoundTable = {}

        for cid in compound_ids:
            # Keys are inserted in the order id, name, smiles, molblock.
            record: dict[str, Any] = {"id": cid}
            record["name"] = self.get_compound_name(cid)
            mol_text = self.get_compound_molblock(cid)
            record["smiles"] = molblock_to_smiles(mol_text)
            record["molblock"] = mol_text
            table[cid] = record

        return table



[docs]
    def build_reaction_smiles_dict(
        self,
        parsed_by_rid: Mapping[str, Any],
        compounds_by_cid: Mapping[str, Mapping[str, Any]],
    ) -> tuple[ReactionSmilesMap, MissingByReaction]:
        """
        Turn parsed KEGG equations into reaction SMILES strings.

        Each parsed equation is passed to ``reaction_smiles_from_equation``
        together with the compound table; the two values it returns are
        stored under the reaction identifier in two parallel dictionaries.

        :param parsed_by_rid:
            Parsed equation objects keyed by reaction identifier.
        :type parsed_by_rid: Mapping[str, Any]
        :param compounds_by_cid:
            Compound table keyed by KEGG compound identifier.
        :type compounds_by_cid: Mapping[str, Mapping[str, Any]]

        :returns:
            Tuple ``(reaction_smiles_by_id, missing_by_id)``.
        :rtype: tuple[dict[str, str], dict[str, dict[str, list[str]]]]

        Example
        -------
        .. code-block:: python

            rsmi_by_rid, missing = extractor.build_reaction_smiles_dict(
                parsed_by_rid,
                compounds_by_cid,
            )
        """
        # One helper call per reaction; results are split afterwards.
        outcomes = {
            rid: reaction_smiles_from_equation(parsed, compounds_by_cid)
            for rid, parsed in parsed_by_rid.items()
        }

        smiles_by_rid: ReactionSmilesMap = {
            rid: rsmi for rid, (rsmi, _) in outcomes.items()
        }
        missing_by_rid: MissingByReaction = {
            rid: missing for rid, (_, missing) in outcomes.items()
        }
        return smiles_by_rid, missing_by_rid



[docs]
    def atom_map_reactions(
        self,
        reaction_smiles_by_id: Mapping[str, str],
    ) -> dict[str, Optional[str]]:
        """
        Atom-map reaction SMILES using RXNMapper.

        :param reaction_smiles_by_id:
            Mapping ``{reaction_id: reaction_smiles}``.
        :type reaction_smiles_by_id: Mapping[str, str]

        :returns:
            Mapping ``{reaction_id: mapped_reaction_smiles_or_none}``.
        :rtype: dict[str, Optional[str]]

        Example
        -------
        .. code-block:: python

            mapped = extractor.atom_map_reactions({"R00001": "CCO>>CC=O"})
        """
        mapper = self.mapper_cls() if self.mapper_cls is not None else None
        mapped_by_id: dict[str, Optional[str]] = {}

        for reaction_id, reaction_smiles in reaction_smiles_by_id.items():
            if (
                ">>" not in reaction_smiles
                or reaction_smiles.startswith(">>")
                or reaction_smiles.endswith(">>")
            ):
                mapped_by_id[reaction_id] = None
                continue

            try:
                result = mapper.get_attention_guided_atom_maps([reaction_smiles])[0]
                mapped_by_id[reaction_id] = result.get("mapped_rxn")
            except Exception:
                mapped_by_id[reaction_id] = None

        return mapped_by_id



[docs]
    def build_missing_compound_report(
        self,
        equations_by_rid: ReactionEquationMap,
        compounds_by_cid: Mapping[str, Mapping[str, Any]],
    ) -> JSONDict:
        """
        Summarize which compounds have no SMILES and where they are used.

        A compound counts as missing when its record's ``"smiles"`` value is
        ``None`` (or absent). The reactions listed for it are those whose
        non-empty equation text mentions the compound according to
        ``get_compound_ids_from_text``.

        :param equations_by_rid:
            Reaction equations keyed by reaction identifier.
        :type equations_by_rid: dict[str, Optional[str]]
        :param compounds_by_cid:
            Compound records keyed by KEGG compound identifier.
        :type compounds_by_cid: Mapping[str, Mapping[str, Any]]

        :returns:
            Dictionary with the keys ``"missing_compounds"`` (records sorted
            by compound ID), ``"missing_compound_ids"`` and
            ``"reactions_involving_missing"`` (both sorted).
        :rtype: dict[str, Any]

        Example
        -------
        .. code-block:: python

            report = extractor.build_missing_compound_report(
                equations_by_rid,
                compounds_by_cid,
            )
        """
        # Reverse index: compound ID -> reactions whose equation mentions it.
        reactions_by_cid: dict[str, set[str]] = {}
        for rid, equation_text in equations_by_rid.items():
            if equation_text:
                for cid in get_compound_ids_from_text(equation_text):
                    reactions_by_cid.setdefault(cid, set()).add(rid)

        records: list[dict[str, Any]] = [
            {
                "id": cid,
                "name": entry.get("name"),
                "reactions": sorted(reactions_by_cid.get(cid, set())),
            }
            for cid, entry in compounds_by_cid.items()
            if entry.get("smiles") is None
        ]
        records.sort(key=lambda item: item["id"])

        # Derive the flat summaries from the per-compound records.
        missing_cids = {item["id"] for item in records}
        affected_rids: set[str] = set()
        for item in records:
            affected_rids.update(item["reactions"])

        return {
            "missing_compounds": records,
            "missing_compound_ids": sorted(missing_cids),
            "reactions_involving_missing": sorted(affected_rids),
        }



[docs]
    def build_kegg_json(
        self,
        equations_by_rid: ReactionEquationMap,
        *,
        smiles_by_rid: Optional[Mapping[str, str]] = None,
        rules_by_rid: Optional[Mapping[str, Optional[str]]] = None,
        molecules_by_cid: Optional[Mapping[str, Mapping[str, Any]]] = None,
    ) -> JSONDict:
        """
        Assemble the ``reactions`` / ``molecules`` part of a KEGG JSON block.

        Reactions are emitted in sorted ID order and only when their equation
        is non-empty. Molecules are the compounds that
        ``get_compound_ids_from_text`` finds in those equations, in sorted ID
        order; compounds without an entry in ``molecules_by_cid`` get
        ``None`` for name and SMILES.

        :param equations_by_rid:
            Reaction equations keyed by reaction identifier.
        :type equations_by_rid: dict[str, Optional[str]]
        :param smiles_by_rid:
            Optional reaction SMILES keyed by reaction identifier.
        :type smiles_by_rid: Optional[Mapping[str, str]]
        :param rules_by_rid:
            Optional atom-mapped reaction strings keyed by reaction identifier.
        :type rules_by_rid: Optional[Mapping[str, Optional[str]]]
        :param molecules_by_cid:
            Optional molecule table keyed by compound identifier.
        :type molecules_by_cid: Optional[Mapping[str, Mapping[str, Any]]]

        :returns:
            Dictionary with ``"reactions"`` and ``"molecules"`` lists.
        :rtype: dict[str, Any]

        Example
        -------
        .. code-block:: python

            data = extractor.build_kegg_json(equations_by_rid)
        """
        # Snapshot the optional lookups; a missing/empty one becomes {}.
        smiles_lookup = dict(smiles_by_rid) if smiles_by_rid else {}
        rule_lookup = dict(rules_by_rid) if rules_by_rid else {}
        molecule_lookup = dict(molecules_by_cid) if molecules_by_cid else {}

        reaction_records: list[dict[str, Any]] = []
        referenced_cids: set[str] = set()

        for rid in sorted(equations_by_rid):
            equation_text = equations_by_rid[rid]
            if equation_text:
                referenced_cids.update(get_compound_ids_from_text(equation_text))
                reaction_records.append(
                    {
                        "id": rid,
                        "reaction": equation_text,
                        "rule": rule_lookup.get(rid),
                        "smiles": smiles_lookup.get(rid),
                    }
                )

        molecule_records: list[dict[str, Any]] = []
        for cid in sorted(referenced_cids):
            # Unknown compounds fall back to a placeholder without details.
            placeholder = {"id": cid, "name": None, "smiles": None}
            entry = molecule_lookup.get(cid, placeholder)
            molecule_records.append(
                {
                    "id": cid,
                    "name": entry.get("name"),
                    "smiles": entry.get("smiles"),
                }
            )

        return {"reactions": reaction_records, "molecules": molecule_records}



[docs]
    def build_module_json(
        self,
        module_id: str,
        *,
        with_compounds: bool = True,
        with_atom_maps: bool = True,
        save_as: Optional[str] = None,
    ) -> JSONDict:
        """
        Produce the JSON block describing one KEGG module.

        The result always contains ``"module_id"``, ``"reactions"``,
        ``"molecules"`` and ``"missing"``. Compound lookup, reaction SMILES
        and the missing-compound report are only computed when
        ``with_compounds`` is true; atom mapping additionally requires
        ``with_atom_maps``. Without compounds the ``"missing"`` report is
        present but empty.

        :param module_id:
            KEGG module ID.
        :type module_id: str
        :param with_compounds:
            Whether to resolve compound names, MOL blocks, and SMILES strings.
        :type with_compounds: bool
        :param with_atom_maps:
            Whether to compute atom-mapped reactions (ignored when
            ``with_compounds`` is false).
        :type with_atom_maps: bool
        :param save_as:
            Optional output path for writing the JSON block to disk.
        :type save_as: Optional[str]

        :returns:
            Module JSON dictionary.
        :rtype: dict[str, Any]

        Example
        -------
        .. code-block:: python

            data = extractor.build_module_json(
                "M00001",
                with_compounds=True,
                with_atom_maps=False,
            )
        """
        equations_by_rid = self.get_module_equations(module_id)
        compound_ids, parsed_by_rid = get_compound_ids_from_equations(equations_by_rid)

        if with_compounds:
            compounds_by_cid = self.build_compound_table(compound_ids)
            reaction_smiles_by_rid, _ = self.build_reaction_smiles_dict(
                parsed_by_rid, compounds_by_cid
            )
        else:
            compounds_by_cid = {}
            reaction_smiles_by_rid = {}

        # Atom mapping needs reaction SMILES, hence the dependency on compounds.
        if with_compounds and with_atom_maps:
            rules_by_rid = self.atom_map_reactions(reaction_smiles_by_rid)
        else:
            rules_by_rid = {}

        # Only id/name/smiles are exported; MOL blocks stay out of the JSON.
        slim_molecules = {}
        for cid in compounds_by_cid:
            full_record = compounds_by_cid[cid]
            slim_molecules[cid] = {
                "id": cid,
                "name": full_record["name"],
                "smiles": full_record["smiles"],
            }

        data: JSONDict = {
            "module_id": module_id,
            **self.build_kegg_json(
                equations_by_rid,
                smiles_by_rid=reaction_smiles_by_rid,
                rules_by_rid=rules_by_rid,
                molecules_by_cid=slim_molecules,
            ),
        }

        if not with_compounds:
            data["missing"] = {
                "missing_compounds": [],
                "missing_compound_ids": [],
                "reactions_involving_missing": [],
            }
        else:
            data["missing"] = self.build_missing_compound_report(
                equations_by_rid,
                compounds_by_cid,
            )

        self.save_json(data, save_as)
        return data



[docs]
    def build_pathway_json(
        self,
        pathway_id: str,
        *,
        with_compounds: bool = True,
        with_atom_maps: bool = True,
        save_as: Optional[str] = None,
    ) -> JSONDict:
        """
        Produce the JSON block for a KEGG pathway, grouped by module.

        Every module found by :meth:`get_modules_from_pathway` is expanded
        with :meth:`build_module_json` (never saved individually). When
        ``with_compounds`` is true, the modules' missing-compound IDs and
        affected reaction IDs are merged into a pathway-level ``"missing"``
        summary; otherwise that key is omitted.

        :param pathway_id:
            KEGG pathway ID.
        :type pathway_id: str
        :param with_compounds:
            Whether to resolve compound records.
        :type with_compounds: bool
        :param with_atom_maps:
            Whether to compute atom-mapped reactions.
        :type with_atom_maps: bool
        :param save_as:
            Optional output path for writing the JSON block to disk.
        :type save_as: Optional[str]

        :returns:
            Pathway JSON dictionary with ``"pathway_id"``, ``"modules"``,
            ``"by_module"`` and, with compounds, ``"missing"``.
        :rtype: dict[str, Any]

        Example
        -------
        .. code-block:: python

            data = extractor.build_pathway_json(
                "hsa00010",
                with_compounds=True,
                with_atom_maps=False,
            )
        """
        modules = self.get_modules_from_pathway(pathway_id)

        by_module: dict[str, Any] = {}
        all_missing_cids: set[str] = set()
        all_affected_rids: set[str] = set()

        for module_id in modules:
            block = self.build_module_json(
                module_id,
                with_compounds=with_compounds,
                with_atom_maps=with_atom_maps,
            )
            by_module[module_id] = block

            if not with_compounds or "missing" not in block:
                continue

            # Fold this module's report into the pathway-wide totals.
            module_missing = block["missing"]
            all_missing_cids.update(module_missing.get("missing_compound_ids", []))
            all_affected_rids.update(
                module_missing.get("reactions_involving_missing", [])
            )

        data: JSONDict = {}
        data["pathway_id"] = pathway_id
        data["modules"] = modules
        data["by_module"] = by_module

        if with_compounds:
            data["missing"] = {
                "missing_compound_ids": sorted(all_missing_cids),
                "reactions_involving_missing": sorted(all_affected_rids),
            }

        self.save_json(data, save_as)
        return data