# Source code for synkit.CRN.Query.kegg_impute

from __future__ import annotations

import copy
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Set, Tuple

from .kegg_extract import KEGGExtractor
from .kegg_parse import parse_equation, reaction_smiles_from_equation

MoleculeRecord = dict[str, Any]
ReactionRecord = dict[str, Any]
FixRecord = dict[str, str]



# [docs]
@dataclass
class KEGGImputer:
    """
    Impute missing compound SMILES and repair reaction records in KEGG-style
    module or pathway JSON blocks.

    The imputer supports two fix types through the same ``fixes`` argument:

    - molecule fixes, for example
      ``{"id": "C00404a", "smiles": "..."}``
    - reaction fixes, for example
      ``{"id": "R02189", "reaction": "C00404 + C00267 <=> C00404a + C00668"}``

    A fix is treated as a reaction fix only when it carries both the reaction
    identifier key and the equation key; every other fix is handled as a
    molecule fix.

    Reaction fixes are applied first, then molecule fixes are applied, and
    finally impacted reaction SMILES, atom-mapped rules, and missing-compound
    summaries are rebuilt.

    Both :meth:`impute_module` and :meth:`impute_pathway` work on a deep copy
    of their input and return the updated copy; the dictionaries passed in by
    the caller are left untouched. Reaction fixes whose identifier does not
    match any reaction in the processed block are skipped without error.

    :param extractor:
        Optional high-level KEGG extractor used for atom-mapping utilities,
        missing-compound report generation, and saving the result. When
        omitted, a default :class:`KEGGExtractor` instance is created.
    :type extractor: Optional[KEGGExtractor]

    Example
    -------
    .. code-block:: python

        imputer = KEGGImputer()
        updated = imputer.impute_module(
            module_data,
            fixes=[
                {
                    "id": "R02189",
                    "reaction": "C00404 + C00267 <=> C00404a + C00668",
                },
                {
                    "id": "C00404a",
                    "name": "Polyphosphate fragment",
                    "smiles": "O=P(O)(O)OP(=O)(O)O",
                },
            ],
        )
    """

    # Collaborator providing ``atom_map_reactions``,
    # ``build_missing_compound_report`` and ``save_json``; ``None`` is replaced
    # by a fresh ``KEGGExtractor()`` in ``__post_init__``.
    extractor: Optional[KEGGExtractor] = None

    def __post_init__(self) -> None:
        if self.extractor is None:
            self.extractor = KEGGExtractor()

    @staticmethod
    def _restore_molecule_list(
        original_molecules: List[MoleculeRecord],
        molecules_by_id: Dict[str, MoleculeRecord],
        *,
        id_key: str = "id",
    ) -> List[MoleculeRecord]:
        """
        Restore a molecule list while preserving original order.

        Molecules present in ``molecules_by_id`` but absent from the original
        list are appended in sorted identifier order.

        :param original_molecules:
            Original molecule records.
        :type original_molecules: List[dict[str, Any]]
        :param molecules_by_id:
            Updated molecule mapping keyed by identifier.
        :type molecules_by_id: Dict[str, dict[str, Any]]
        :param id_key:
            Dictionary key used as the molecule identifier.
        :type id_key: str

        :returns:
            Restored molecule list.
        :rtype: List[dict[str, Any]]

        Example
        -------
        .. code-block:: python

            restored = KEGGImputer._restore_molecule_list(
                [{"id": "C00001"}],
                {"C00001": {"id": "C00001"}, "C00002": {"id": "C00002"}},
            )
        """
        original_ids = [
            record[id_key] for record in original_molecules if id_key in record
        ]
        restored: List[MoleculeRecord] = []

        for molecule_id in original_ids:
            if molecule_id in molecules_by_id:
                restored.append(molecules_by_id[molecule_id])

        original_id_set = set(original_ids)
        for molecule_id in sorted(molecules_by_id):
            if molecule_id not in original_id_set:
                restored.append(molecules_by_id[molecule_id])

        return restored

    @staticmethod
    def _split_fixes(
        fixes: List[FixRecord],
        *,
        reaction_id_key: str,
        equation_key: str,
    ) -> Tuple[List[FixRecord], List[FixRecord]]:
        """
        Split mixed fix records into reaction fixes and molecule fixes.

        A fix is treated as a reaction fix when it contains both the reaction
        identifier key and the equation key. All other fixes are treated as
        molecule fixes.

        :param fixes:
            Mixed fix records.
        :type fixes: List[dict[str, str]]
        :param reaction_id_key:
            Key used to identify reactions.
        :type reaction_id_key: str
        :param equation_key:
            Key used to store reaction equations.
        :type equation_key: str

        :returns:
            Tuple ``(reaction_fixes, molecule_fixes)``.
        :rtype: Tuple[List[dict[str, str]], List[dict[str, str]]]

        Example
        -------
        .. code-block:: python

            reaction_fixes, molecule_fixes = KEGGImputer._split_fixes(
                fixes,
                reaction_id_key="id",
                equation_key="reaction",
            )
        """
        reaction_fixes: List[FixRecord] = []
        molecule_fixes: List[FixRecord] = []

        for fix in fixes:
            if reaction_id_key in fix and equation_key in fix:
                reaction_fixes.append(fix)
            else:
                molecule_fixes.append(fix)

        return reaction_fixes, molecule_fixes

    @staticmethod
    def _apply_reaction_fixes(
        reactions: List[ReactionRecord],
        reaction_fixes: List[FixRecord],
        *,
        reaction_id_key: str,
        equation_key: str,
        reaction_smiles_key: str = "smiles",
        reaction_rule_key: str = "rule",
    ) -> Set[str]:
        """
        Apply reaction equation fixes in place.

        The target reaction equation is replaced for each matching reaction
        record. Existing SMILES and rule fields for edited reactions are reset
        so they can be rebuilt from the updated equation.

        :param reactions:
            Reaction records to update in place.
        :type reactions: List[dict[str, Any]]
        :param reaction_fixes:
            Reaction-level fix records.
        :type reaction_fixes: List[dict[str, str]]
        :param reaction_id_key:
            Dictionary key storing reaction identifiers.
        :type reaction_id_key: str
        :param equation_key:
            Dictionary key storing equation strings.
        :type equation_key: str
        :param reaction_smiles_key:
            Dictionary key used for reaction SMILES.
        :type reaction_smiles_key: str
        :param reaction_rule_key:
            Dictionary key used for atom-mapped rules.
        :type reaction_rule_key: str

        :returns:
            Set of edited reaction identifiers.
        :rtype: Set[str]

        :raises KeyError:
            If a reaction fix refers to a reaction identifier that is not
            present in the provided reaction list.

        Example
        -------
        .. code-block:: python

            edited = KEGGImputer._apply_reaction_fixes(
                reactions,
                [{"id": "R00001", "reaction": "C00001 => C00002"}],
                reaction_id_key="id",
                equation_key="reaction",
            )
        """
        reactions_by_id = {
            reaction.get(reaction_id_key): reaction
            for reaction in reactions
            if reaction.get(reaction_id_key)
        }

        edited_reaction_ids: Set[str] = set()

        for fix in reaction_fixes:
            reaction_id = fix.get(reaction_id_key)
            if not reaction_id:
                continue
            if reaction_id not in reactions_by_id:
                continue

            reaction = reactions_by_id[reaction_id]
            reaction[equation_key] = fix[equation_key]
            reaction[reaction_smiles_key] = None
            reaction[reaction_rule_key] = None
            edited_reaction_ids.add(reaction_id)

        return edited_reaction_ids

    @staticmethod
    def _apply_molecule_fixes(
        molecules_by_id: Dict[str, MoleculeRecord],
        molecule_fixes: List[FixRecord],
        *,
        molecule_id_key: str = "id",
    ) -> Set[str]:
        """
        Apply molecule fixes to a molecule mapping in place.

        Existing molecule records are updated, while missing identifiers are
        inserted as new molecule records.

        :param molecules_by_id:
            Molecule mapping keyed by identifier.
        :type molecules_by_id: Dict[str, dict[str, Any]]
        :param molecule_fixes:
            Molecule-level fix records.
        :type molecule_fixes: List[dict[str, str]]
        :param molecule_id_key:
            Dictionary key used as the molecule identifier.
        :type molecule_id_key: str

        :returns:
            Set of updated compound identifiers.
        :rtype: Set[str]

        Example
        -------
        .. code-block:: python

            updated_ids = KEGGImputer._apply_molecule_fixes(
                molecules_by_id,
                [{"id": "C00002", "smiles": "O"}],
            )
        """
        updated_compound_ids: Set[str] = set()

        for fix in molecule_fixes:
            compound_id = fix.get(molecule_id_key)
            if not compound_id:
                continue

            updated_compound_ids.add(compound_id)

            if compound_id not in molecules_by_id:
                molecules_by_id[compound_id] = {
                    molecule_id_key: compound_id,
                    "name": fix.get("name"),
                    "smiles": fix.get("smiles"),
                }
            else:
                if fix.get("name") is not None:
                    molecules_by_id[compound_id]["name"] = fix["name"]
                if "smiles" in fix:
                    molecules_by_id[compound_id]["smiles"] = fix.get("smiles")

        return updated_compound_ids

    @staticmethod
    def _infer_impacted_reaction_ids(
        reactions: List[Dict[str, Any]],
        updated_compound_ids: Set[str],
        *,
        reaction_id_key: str,
        equation_key: str,
    ) -> Set[str]:
        """
        Infer which reactions are affected by updated compound identifiers.

        The method first consults the existing ``missing`` block. If that block
        already records which reactions involve the updated compounds, those
        reaction identifiers are reused directly. Otherwise, the method falls
        back to scanning reaction equation strings.

        :param reactions:
            Reaction records from a module block.
        :type reactions: List[Dict[str, Any]]
        :param updated_compound_ids:
            Compound identifiers whose records were changed by imputation.
        :type updated_compound_ids: Set[str]
        :param reaction_id_key:
            Dictionary key storing reaction identifiers.
        :type reaction_id_key: str
        :param equation_key:
            Dictionary key storing equation text.
        :type equation_key: str

        :returns:
            Set of impacted reaction identifiers.
        :rtype: Set[str]

        Example
        -------
        .. code-block:: python

            impacted = KEGGImputer._infer_impacted_reaction_ids(
                reactions,
                {"C00138"},
                reaction_id_key="id",
                equation_key="reaction",
            )
        """
        impacted: Set[str] = set()

        for reaction in reactions:
            reaction_id = reaction.get(reaction_id_key)
            equation = reaction.get(equation_key, "") or ""
            if reaction_id and any(cid in equation for cid in updated_compound_ids):
                impacted.add(reaction_id)

        return impacted

    def _rebuild_reaction_fields(
        self,
        reactions: List[Dict[str, Any]],
        molecules: List[Dict[str, Any]],
        impacted_reaction_ids: Set[str],
        *,
        reaction_id_key: str = "id",
        equation_key: str = "reaction",
        reaction_smiles_key: str = "smiles",
        reaction_rule_key: str = "rule",
        molecule_id_key: str = "id",
    ) -> None:
        """
        Recompute reaction SMILES and mapped rules in place for impacted reactions.

        Only reactions listed in ``impacted_reaction_ids`` are rebuilt. Reaction
        SMILES are regenerated from the updated molecule table, then optional
        atom-mapped rules are refreshed via
        :meth:`KEGGExtractor.atom_map_reactions`.

        :param reactions:
            Reaction records to update in place.
        :type reactions: List[Dict[str, Any]]
        :param molecules:
            Molecule records containing current SMILES values.
        :type molecules: List[Dict[str, Any]]
        :param impacted_reaction_ids:
            Reaction identifiers requiring recomputation.
        :type impacted_reaction_ids: Set[str]
        :param reaction_id_key:
            Dictionary key storing reaction identifiers.
        :type reaction_id_key: str
        :param equation_key:
            Dictionary key storing reaction equation strings.
        :type equation_key: str
        :param reaction_smiles_key:
            Dictionary key used for reaction SMILES.
        :type reaction_smiles_key: str
        :param reaction_rule_key:
            Dictionary key used for atom-mapped reaction rules.
        :type reaction_rule_key: str
        :param molecule_id_key:
            Dictionary key storing molecule identifiers.
        :type molecule_id_key: str

        :returns:
            ``None``.
        :rtype: None

        Example
        -------
        .. code-block:: python

            imputer._rebuild_reaction_fields(
                reactions,
                molecules,
                {"R00001"},
                reaction_id_key="id",
                equation_key="reaction",
            )
        """
        if not impacted_reaction_ids:
            return

        molecules_by_id = {
            molecule[molecule_id_key]: molecule
            for molecule in molecules
            if molecule_id_key in molecule
        }
        compounds_by_cid = {
            compound_id: {"smiles": molecules_by_id[compound_id].get("smiles")}
            for compound_id in molecules_by_id
        }

        reactions_by_id = {
            reaction.get(reaction_id_key): reaction
            for reaction in reactions
            if reaction.get(reaction_id_key)
        }

        rebuilt_smiles: Dict[str, str] = {}

        for reaction_id in impacted_reaction_ids:
            reaction = reactions_by_id.get(reaction_id)
            if reaction is None:
                continue

            equation = reaction.get(equation_key)
            if not equation:
                continue

            parsed_equation = parse_equation(equation)
            rsmi, _ = reaction_smiles_from_equation(parsed_equation, compounds_by_cid)
            reaction[reaction_smiles_key] = rsmi
            rebuilt_smiles[reaction_id] = rsmi

        if rebuilt_smiles:
            mapped_rules = self.extractor.atom_map_reactions(rebuilt_smiles)
            for reaction_id, rule in mapped_rules.items():
                reaction = reactions_by_id.get(reaction_id)
                if reaction is not None:
                    reaction[reaction_rule_key] = rule


# [docs]
    def impute_module(
        self,
        module_data: Dict[str, Any],
        fixes: List[Dict[str, str]],
        save_as: Optional[str] = None,
        *,
        molecule_id_key: str = "id",
        reaction_id_key: str = "id",
        equation_key: str = "reaction",
        reaction_smiles_key: str = "smiles",
        reaction_rule_key: str = "rule",
    ) -> Dict[str, Any]:
        """
        Apply molecule and reaction fixes to a module JSON block.

        Reaction fixes are applied first, then molecule fixes are applied, then
        impacted reaction SMILES and atom-mapped rules are rebuilt, and finally
        the ``missing`` block is regenerated by delegating to
        :meth:`KEGGExtractor.build_missing_compound_report`.

        :param module_data:
            Module JSON dictionary.
        :type module_data: Dict[str, Any]
        :param fixes:
            List of mixed fix records. Reaction fixes must contain the reaction
            identifier key and the equation key. Molecule fixes use molecule
            identifiers and optional ``"name"`` / ``"smiles"`` fields.
        :type fixes: List[Dict[str, str]]
        :param save_as:
            Optional output path.
        :type save_as: Optional[str]
        :param molecule_id_key:
            Molecule identifier key.
        :type molecule_id_key: str
        :param reaction_id_key:
            Reaction identifier key.
        :type reaction_id_key: str
        :param equation_key:
            Equation text key.
        :type equation_key: str
        :param reaction_smiles_key:
            Reaction SMILES field key.
        :type reaction_smiles_key: str
        :param reaction_rule_key:
            Atom-mapped rule field key.
        :type reaction_rule_key: str

        :returns:
            Updated module JSON dictionary.
        :rtype: Dict[str, Any]

        Example
        -------
        .. code-block:: python

            updated = imputer.impute_module(
                module_data,
                fixes=[
                    {
                        "id": "R02189",
                        "reaction": "C00404 + C00267 <=> C00404a + C00668",
                    },
                    {
                        "id": "C00404a",
                        "name": "Polyphosphate fragment",
                        "smiles": "O=P(O)(O)OP(=O)(O)O",
                    },
                ],
            )
        """
        new_data = copy.deepcopy(module_data)

        molecules = new_data.get("molecules", []) or []
        reactions = new_data.get("reactions", []) or []

        reaction_fixes, molecule_fixes = self._split_fixes(
            fixes,
            reaction_id_key=reaction_id_key,
            equation_key=equation_key,
        )

        edited_reaction_ids = self._apply_reaction_fixes(
            reactions,
            reaction_fixes,
            reaction_id_key=reaction_id_key,
            equation_key=equation_key,
            reaction_smiles_key=reaction_smiles_key,
            reaction_rule_key=reaction_rule_key,
        )

        molecules_by_id = {
            molecule[molecule_id_key]: molecule
            for molecule in molecules
            if molecule_id_key in molecule
        }

        updated_compound_ids = self._apply_molecule_fixes(
            molecules_by_id,
            molecule_fixes,
            molecule_id_key=molecule_id_key,
        )

        new_data["molecules"] = self._restore_molecule_list(
            molecules,
            molecules_by_id,
            id_key=molecule_id_key,
        )

        impacted_reaction_ids = set(edited_reaction_ids)
        impacted_reaction_ids.update(
            self._infer_impacted_reaction_ids(
                reactions,
                updated_compound_ids,
                reaction_id_key=reaction_id_key,
                equation_key=equation_key,
            )
        )

        self._rebuild_reaction_fields(
            reactions,
            new_data["molecules"],
            impacted_reaction_ids,
            reaction_id_key=reaction_id_key,
            equation_key=equation_key,
            reaction_smiles_key=reaction_smiles_key,
            reaction_rule_key=reaction_rule_key,
            molecule_id_key=molecule_id_key,
        )

        new_data["reactions"] = reactions

        equations_by_rid = {
            reaction[reaction_id_key]: reaction.get(equation_key)
            for reaction in reactions
            if reaction.get(reaction_id_key)
        }
        compounds_by_cid = {
            molecule[molecule_id_key]: {
                "id": molecule.get(molecule_id_key),
                "name": molecule.get("name"),
                "smiles": molecule.get("smiles"),
            }
            for molecule in new_data["molecules"]
            if molecule.get(molecule_id_key)
        }

        new_data["missing"] = self.extractor.build_missing_compound_report(
            equations_by_rid,
            compounds_by_cid,
        )

        self.extractor.save_json(new_data, save_as)

        return new_data



# [docs]
    def impute_pathway(
        self,
        pathway_data: Dict[str, Any],
        fixes: List[Dict[str, str]],
        save_as: Optional[str] = None,
        *,
        molecule_id_key: str = "id",
        reaction_id_key: str = "id",
        equation_key: str = "reaction",
        reaction_smiles_key: str = "smiles",
        reaction_rule_key: str = "rule",
    ) -> Dict[str, Any]:
        """
        Apply molecule and reaction fixes across all modules in a pathway JSON
        block.

        Each module is processed independently through :meth:`impute_module`,
        then the pathway-level ``missing`` summary is rebuilt by aggregating the
        updated module summaries.

        :param pathway_data:
            Pathway JSON dictionary with ``"by_module"``.
        :type pathway_data: Dict[str, Any]
        :param fixes:
            Mixed reaction and molecule fix records.
        :type fixes: List[Dict[str, str]]
        :param save_as:
            Optional output path.
        :type save_as: Optional[str]
        :param molecule_id_key:
            Molecule identifier key.
        :type molecule_id_key: str
        :param reaction_id_key:
            Reaction identifier key.
        :type reaction_id_key: str
        :param equation_key:
            Equation text key.
        :type equation_key: str
        :param reaction_smiles_key:
            Reaction SMILES field key.
        :type reaction_smiles_key: str
        :param reaction_rule_key:
            Atom-mapped rule field key.
        :type reaction_rule_key: str

        :returns:
            Updated pathway JSON dictionary.
        :rtype: Dict[str, Any]

        Example
        -------
        .. code-block:: python

            updated = imputer.impute_pathway(
                pathway_data,
                fixes=[
                    {
                        "id": "R02189",
                        "reaction": "C00404 + C00267 <=> C00404a + C00668",
                    },
                    {
                        "id": "C00404a",
                        "name": "Polyphosphate fragment",
                        "smiles": "O=P(O)(O)OP(=O)(O)O",
                    },
                ],
            )
        """
        new_pathway = copy.deepcopy(pathway_data)
        by_module = new_pathway.get("by_module", {}) or {}

        for module_id, block in list(by_module.items()):
            by_module[module_id] = self.impute_module(
                block,
                fixes,
                save_as=None,
                molecule_id_key=molecule_id_key,
                reaction_id_key=reaction_id_key,
                equation_key=equation_key,
                reaction_smiles_key=reaction_smiles_key,
                reaction_rule_key=reaction_rule_key,
            )

        missing_compound_ids: Set[str] = set()
        missing_reaction_ids: Set[str] = set()

        for block in by_module.values():
            missing = block.get("missing", {}) or {}
            missing_compound_ids.update(missing.get("missing_compound_ids", []) or [])
            missing_reaction_ids.update(
                missing.get("reactions_involving_missing", []) or []
            )

        new_pathway["by_module"] = by_module
        new_pathway["missing"] = {
            "missing_compound_ids": sorted(missing_compound_ids),
            "reactions_involving_missing": sorted(missing_reaction_ids),
        }

        self.extractor.save_json(new_pathway, save_as)

        return new_pathway