Source code for synkit.CRN.Query.kegg_extract
from __future__ import annotations
import json
import re
from dataclasses import dataclass
from typing import Any, Optional, Mapping
try:
from rxnmapper import RXNMapper as _RXNMapper
except Exception:
_RXNMapper = None
from .kegg_api import KEGGClient
from .kegg_parse import (
equation_to_text,
get_compound_ids_from_equations,
get_compound_ids_from_text,
molblock_to_smiles,
normalize_module_id,
orient_equation_to_module,
parse_equation,
parse_kegg_field_blocks,
parse_module_reaction_directions,
reaction_smiles_from_equation,
)
# Matches a KEGG reaction identifier: the letter "R" followed by five digits.
# The pattern is unanchored, so it is meant to be used with ``findall`` on
# free-form REACTION field text.
_RID_PATTERN = re.compile(r"R\d{5}")
# Reaction ID -> equation text, or ``None`` when no equation was found.
ReactionEquationMap = dict[str, Optional[str]]
# Compound ID -> record dict (see ``KEGGExtractor.build_compound_table``).
CompoundTable = dict[str, dict[str, Any]]
# Reaction ID -> reaction SMILES string.
ReactionSmilesMap = dict[str, str]
# Reaction ID -> the "missing" structure returned by
# ``reaction_smiles_from_equation`` (a dict of string lists).
MissingByReaction = dict[str, dict[str, list[str]]]
# Generic JSON-style dictionary used for the assembled output blocks.
JSONDict = dict[str, Any]
[docs]
@dataclass(slots=True)
class KEGGExtractor:
    """
    Facade for pulling reaction data out of KEGG pathways and modules.

    The extractor ties together entry retrieval through a KEGG client,
    module membership parsing, reaction equation collection, compound-table
    construction, reaction SMILES assembly, and optional atom mapping.

    :param client:
        KEGG REST client to use. When ``None``, :meth:`__post_init__`
        replaces it with a freshly constructed :class:`KEGGClient`.
    :type client: Optional[KEGGClient]
    :param mapper_cls:
        Atom-mapper class consumed by :meth:`atom_map_reactions`. It is
        instantiated without arguments and its instances must provide
        ``get_attention_guided_atom_maps``.
    :type mapper_cls: Optional[type[Any]]

    Example
    -------
    .. code-block:: python

        extractor = KEGGExtractor()
        data = extractor.build_module_json(
            "M00001",
            with_compounds=True,
            with_atom_maps=False,
        )
    """

    client: Optional[KEGGClient] = None
    mapper_cls: Optional[type[Any]] = _RXNMapper

    def __post_init__(self) -> None:
        # A caller-supplied client is kept untouched; only a missing one is
        # replaced by the default implementation.
        if self.client is not None:
            return
        self.client = KEGGClient()
[docs]
@staticmethod
def save_json(data: Mapping[str, Any], save_as: Optional[str]) -> None:
"""
Save JSON data to disk when an output path is provided.
:param data:
JSON-serializable data to write.
:type data: Mapping[str, Any]
:param save_as:
Optional output path.
:type save_as: Optional[str]
:returns:
``None``.
:rtype: None
Example
-------
.. code-block:: python
KEGGExtractor._save_json({"x": 1}, "out.json")
"""
if save_as:
with open(save_as, "w", encoding="utf-8") as handle:
json.dump(data, handle, ensure_ascii=False, indent=2)
[docs]
def get_modules_from_pathway(self, pathway_id: str) -> list[str]:
    """
    List the module IDs referenced by a KEGG pathway entry.

    The pathway entry is fetched through the client, its ``MODULE`` field
    blocks are split on whitespace, and every token is passed through
    ``normalize_module_id``. Tokens for which that helper returns ``None``
    are dropped; the remaining identifiers are returned in encounter order
    (duplicates are not removed).

    :param pathway_id:
        KEGG pathway identifier such as ``"hsa00010"``.
    :type pathway_id: str
    :returns:
        Normalized KEGG module identifiers such as ``["M00001",
        "M00002"]``.
    :rtype: list[str]

    Example
    -------
    .. code-block:: python

        modules = extractor.get_modules_from_pathway("hsa00010")
    """
    entry_text = self.client.get_text(f"get/{pathway_id}")
    candidates = (
        normalize_module_id(word)
        for block in parse_kegg_field_blocks(entry_text, "MODULE")
        for word in block.split()
    )
    return [module_id for module_id in candidates if module_id is not None]
[docs]
def get_reaction_ids_from_module(self, module_id: str) -> list[str]:
    """
    Collect the KEGG reaction IDs referenced by a module entry.

    Two strategies are used, in order:

    1. If ``parse_module_reaction_directions`` yields a non-empty result,
       its keys are returned in the order that mapping provides them.
    2. Otherwise every ``R`` + five-digit token found in the ``REACTION``
       field blocks is collected, de-duplicated, and returned sorted.

    :param module_id:
        KEGG module identifier such as ``"M00001"``.
    :type module_id: str
    :returns:
        Reaction identifiers; direction-mapping order in case 1, sorted
        unique identifiers in case 2.
    :rtype: list[str]

    Example
    -------
    .. code-block:: python

        reaction_ids = extractor.get_reaction_ids_from_module("M00001")
    """
    entry_text = self.client.get_text(f"get/{module_id}")
    directed = parse_module_reaction_directions(entry_text)
    if directed:
        return [*directed.keys()]
    # Fallback: scrape identifiers straight from the REACTION field text.
    found = {
        rid
        for block in parse_kegg_field_blocks(entry_text, "REACTION")
        for rid in _RID_PATTERN.findall(block)
    }
    return sorted(found)
[docs]
def get_equation_for_reaction(self, reaction_id: str) -> Optional[str]:
    """
    Look up the ``EQUATION`` field of a KEGG reaction entry.

    Only the first ``EQUATION`` block of the entry is used, with
    surrounding whitespace stripped.

    :param reaction_id:
        KEGG reaction identifier such as ``"R00200"``.
    :type reaction_id: str
    :returns:
        Equation string when present, otherwise ``None``.
    :rtype: Optional[str]

    Example
    -------
    .. code-block:: python

        equation = extractor.get_equation_for_reaction("R00200")
    """
    entry_text = self.client.get_text(f"get/rn:{reaction_id}")
    blocks = parse_kegg_field_blocks(entry_text, "EQUATION")
    if not blocks:
        return None
    return blocks[0].strip()
[docs]
def get_module_equations(self, module_id: str) -> ReactionEquationMap:
    """
    Build a reaction-to-equation mapping for a KEGG module.

    The module entry is fetched once. Reaction IDs come from the parsed
    module reaction directions when available; otherwise they are scraped
    from the ``REACTION`` field blocks of the same text (sorted, unique).

    For each reaction the equation is fetched via
    :meth:`get_equation_for_reaction`. If the module supplies a direction
    for that reaction, the equation is parsed, oriented with
    ``orient_equation_to_module`` and re-rendered using the module's
    arrow; otherwise the equation is stored as fetched.

    :param module_id:
        KEGG module identifier.
    :type module_id: str
    :returns:
        Mapping from reaction identifier to equation string (``None`` for
        reactions without an equation).
    :rtype: dict[str, Optional[str]]

    Example
    -------
    .. code-block:: python

        equations = extractor.get_module_equations("M00001")
    """
    module_text = self.client.get_text(f"get/{module_id}")
    directions = parse_module_reaction_directions(module_text)
    if directions:
        reaction_ids = list(directions.keys())
    else:
        # Reuse the already-fetched entry instead of going through
        # get_reaction_ids_from_module, which would request and parse the
        # same module text a second time.
        reaction_ids = sorted(
            {
                rid
                for payload in parse_kegg_field_blocks(module_text, "REACTION")
                for rid in _RID_PATTERN.findall(payload)
            }
        )

    equations_by_rid: ReactionEquationMap = {}
    for reaction_id in reaction_ids:
        equation = self.get_equation_for_reaction(reaction_id)
        # No equation, or no module direction to orient against: store the
        # fetched value (possibly None) unchanged.
        if equation is None or reaction_id not in directions:
            equations_by_rid[reaction_id] = equation
            continue
        left_ids, right_ids, module_arrow = directions[reaction_id]
        parsed = parse_equation(equation)
        oriented = orient_equation_to_module(parsed, left_ids, right_ids)
        equations_by_rid[reaction_id] = equation_to_text(
            oriented, arrow=module_arrow
        )
    return equations_by_rid
[docs]
def get_pathway_equations(
    self,
    pathway_id: str,
) -> dict[str, ReactionEquationMap]:
    """
    Collect the equations of every module referenced by a pathway.

    Modules are resolved with :meth:`get_modules_from_pathway` and each
    one is expanded with :meth:`get_module_equations`.

    :param pathway_id:
        KEGG pathway identifier.
    :type pathway_id: str
    :returns:
        Mapping of the form ``{module_id: {reaction_id: equation}}``.
    :rtype: dict[str, dict[str, Optional[str]]]

    Example
    -------
    .. code-block:: python

        nested = extractor.get_pathway_equations("hsa00010")
    """
    nested: dict[str, ReactionEquationMap] = {}
    for module_id in self.get_modules_from_pathway(pathway_id):
        nested[module_id] = self.get_module_equations(module_id)
    return nested
[docs]
def get_compound_name(self, compound_id: str) -> Optional[str]:
    """
    Return the primary name of a KEGG compound.

    The first ``NAME`` block of the compound entry is used; when it holds
    several ``;``-separated synonyms, only the text before the first
    ``;`` is returned (whitespace-stripped).

    :param compound_id:
        KEGG compound identifier such as ``"C00001"``.
    :type compound_id: str
    :returns:
        Primary compound name if available, otherwise ``None``.
    :rtype: Optional[str]

    Example
    -------
    .. code-block:: python

        name = extractor.get_compound_name("C00001")
    """
    entry_text = self.client.get_text(f"get/cpd:{compound_id}")
    name_blocks = parse_kegg_field_blocks(entry_text, "NAME")
    if name_blocks:
        primary, _sep, _synonyms = name_blocks[0].strip().partition(";")
        return primary.strip()
    return None
[docs]
def get_compound_molblock(self, compound_id: str) -> Optional[str]:
    """
    Fetch the MOL block of a KEGG compound.

    The request goes through the client's ``get_optional_text`` and its
    result is returned unchanged.

    :param compound_id:
        KEGG compound identifier.
    :type compound_id: str
    :returns:
        MOL block text when available, otherwise ``None``.
    :rtype: Optional[str]

    Example
    -------
    .. code-block:: python

        molblock = extractor.get_compound_molblock("C00001")
    """
    mol_endpoint = f"get/cpd:{compound_id}/mol"
    return self.client.get_optional_text(mol_endpoint)
[docs]
def build_compound_table(
    self,
    compound_ids: list[str],
) -> CompoundTable:
    """
    Assemble one record per KEGG compound identifier.

    For every identifier the primary name and the MOL block are fetched,
    and the MOL block is handed to ``molblock_to_smiles`` to obtain the
    SMILES value stored in the record.

    :param compound_ids:
        KEGG compound identifiers.
    :type compound_ids: list[str]
    :returns:
        Compound table of the form
        ``{cid: {"id", "name", "smiles", "molblock"}}``.
    :rtype: dict[str, dict[str, Any]]

    Example
    -------
    .. code-block:: python

        compounds = extractor.build_compound_table(["C00001", "C00002"])
    """
    table: CompoundTable = {}
    for cid in compound_ids:
        primary_name = self.get_compound_name(cid)
        mol_text = self.get_compound_molblock(cid)
        table[cid] = {
            "id": cid,
            "name": primary_name,
            "smiles": molblock_to_smiles(mol_text),
            "molblock": mol_text,
        }
    return table
[docs]
def build_reaction_smiles_dict(
    self,
    parsed_by_rid: Mapping[str, Any],
    compounds_by_cid: Mapping[str, Mapping[str, Any]],
) -> tuple[ReactionSmilesMap, MissingByReaction]:
    """
    Convert parsed KEGG equations into reaction SMILES strings.

    Each parsed equation is passed, together with the compound table, to
    ``reaction_smiles_from_equation``; the two values it returns are
    stored under the reaction identifier in the two output mappings.

    :param parsed_by_rid:
        Parsed equation objects keyed by reaction identifier.
    :type parsed_by_rid: Mapping[str, Any]
    :param compounds_by_cid:
        Compound table keyed by KEGG compound identifier.
    :type compounds_by_cid: Mapping[str, Mapping[str, Any]]
    :returns:
        Tuple ``(reaction_smiles_by_id, missing_by_id)``.
    :rtype: tuple[dict[str, str], dict[str, dict[str, list[str]]]]

    Example
    -------
    .. code-block:: python

        rsmi_by_rid, missing = extractor.build_reaction_smiles_dict(
            parsed_by_rid,
            compounds_by_cid,
        )
    """
    smiles_out: ReactionSmilesMap = {}
    missing_out: MissingByReaction = {}
    for rid, parsed in parsed_by_rid.items():
        smiles_out[rid], missing_out[rid] = reaction_smiles_from_equation(
            parsed,
            compounds_by_cid,
        )
    return smiles_out, missing_out
[docs]
def atom_map_reactions(
    self,
    reaction_smiles_by_id: Mapping[str, str],
) -> dict[str, Optional[str]]:
    """
    Atom-map reaction SMILES using the configured mapper class.

    A reaction is considered mappable only when its SMILES is a string
    containing ``">>"`` with text on both sides of it (it neither starts
    nor ends with ``">>"``). Unmappable reactions, and reactions for which
    the mapper raises, are reported as ``None``.

    When ``mapper_cls`` is ``None`` no mapping is attempted and every
    reaction is reported as ``None``. The mapper is instantiated lazily,
    at most once per call, and only if at least one reaction is mappable.

    :param reaction_smiles_by_id:
        Mapping ``{reaction_id: reaction_smiles}``.
    :type reaction_smiles_by_id: Mapping[str, str]
    :returns:
        Mapping ``{reaction_id: mapped_reaction_smiles_or_none}``.
    :rtype: dict[str, Optional[str]]

    Example
    -------
    .. code-block:: python

        mapped = extractor.atom_map_reactions({"R00001": "CCO>>CC=O"})
    """
    if self.mapper_cls is None:
        # No mapper available: be explicit instead of relying on a
        # swallowed AttributeError from calling a method on None.
        return {reaction_id: None for reaction_id in reaction_smiles_by_id}

    mapper = None  # built on first use; construction may be expensive
    mapped_by_id: dict[str, Optional[str]] = {}
    for reaction_id, reaction_smiles in reaction_smiles_by_id.items():
        mapped_by_id[reaction_id] = None
        if (
            not isinstance(reaction_smiles, str)
            or ">>" not in reaction_smiles
            or reaction_smiles.startswith(">>")
            or reaction_smiles.endswith(">>")
        ):
            continue
        if mapper is None:
            # Kept outside the try block so that a broken mapper setup
            # still surfaces to the caller.
            mapper = self.mapper_cls()
        try:
            result = mapper.get_attention_guided_atom_maps([reaction_smiles])[0]
            mapped_by_id[reaction_id] = result.get("mapped_rxn")
        except Exception:
            # Best effort by design: a reaction the mapper cannot handle
            # must not abort the remaining ones.
            mapped_by_id[reaction_id] = None
    return mapped_by_id
[docs]
def build_missing_compound_report(
    self,
    equations_by_rid: ReactionEquationMap,
    compounds_by_cid: Mapping[str, Mapping[str, Any]],
) -> JSONDict:
    """
    Report which compounds have no SMILES and where they are used.

    A compound counts as missing when its record's ``"smiles"`` value is
    ``None``. For every missing compound the report lists the reactions
    whose (non-empty) equation text mentions it, as determined by
    ``get_compound_ids_from_text``.

    :param equations_by_rid:
        Reaction equations keyed by reaction identifier.
    :type equations_by_rid: dict[str, Optional[str]]
    :param compounds_by_cid:
        Compound records keyed by KEGG compound identifier.
    :type compounds_by_cid: Mapping[str, Mapping[str, Any]]
    :returns:
        Report containing missing compounds and per-reaction provenance.
    :rtype: dict[str, Any]

    Example
    -------
    .. code-block:: python

        report = extractor.build_missing_compound_report(
            equations_by_rid,
            compounds_by_cid,
        )
    """
    # Reverse index: compound ID -> reactions whose equation mentions it.
    reactions_by_cid: dict[str, set[str]] = {}
    for rid, equation_text in equations_by_rid.items():
        if equation_text:
            for cid in get_compound_ids_from_text(equation_text):
                reactions_by_cid.setdefault(cid, set()).add(rid)

    entries: list[dict[str, Any]] = []
    affected_reactions: set[str] = set()
    for cid, record in compounds_by_cid.items():
        if record.get("smiles") is not None:
            continue
        used_in = sorted(reactions_by_cid.get(cid, set()))
        entries.append({"id": cid, "name": record.get("name"), "reactions": used_in})
        affected_reactions.update(used_in)

    entries.sort(key=lambda entry: entry["id"])
    return {
        "missing_compounds": entries,
        # Mapping keys are unique, so the sorted entries already carry the
        # sorted set of missing identifiers.
        "missing_compound_ids": [entry["id"] for entry in entries],
        "reactions_involving_missing": sorted(affected_reactions),
    }
[docs]
def build_kegg_json(
    self,
    equations_by_rid: ReactionEquationMap,
    *,
    smiles_by_rid: Optional[Mapping[str, str]] = None,
    rules_by_rid: Optional[Mapping[str, Optional[str]]] = None,
    molecules_by_cid: Optional[Mapping[str, Mapping[str, Any]]] = None,
) -> JSONDict:
    """
    Assemble the ``reactions`` / ``molecules`` JSON block.

    Reactions are emitted in sorted identifier order; reactions without an
    equation are skipped. The molecule list covers every compound
    identifier found in the emitted equations (sorted), using
    ``molecules_by_cid`` for name and SMILES where a record exists and
    ``None`` for both otherwise.

    :param equations_by_rid:
        Reaction equations keyed by reaction identifier.
    :type equations_by_rid: dict[str, Optional[str]]
    :param smiles_by_rid:
        Optional reaction SMILES keyed by reaction identifier.
    :type smiles_by_rid: Optional[Mapping[str, str]]
    :param rules_by_rid:
        Optional atom-mapped reaction strings keyed by reaction identifier.
    :type rules_by_rid: Optional[Mapping[str, Optional[str]]]
    :param molecules_by_cid:
        Optional molecule table keyed by compound identifier.
    :type molecules_by_cid: Optional[Mapping[str, Mapping[str, Any]]]
    :returns:
        Dictionary with ``"reactions"`` and ``"molecules"`` entries.
    :rtype: dict[str, Any]

    Example
    -------
    .. code-block:: python

        data = extractor.build_kegg_json(equations_by_rid)
    """
    # Work on private copies so the optional inputs are uniform dicts.
    smiles_lookup = dict(smiles_by_rid or {})
    rule_lookup = dict(rules_by_rid or {})
    molecule_lookup = dict(molecules_by_cid or {})

    referenced_cids: set[str] = set()
    reaction_rows: list[dict[str, Any]] = []
    for rid in sorted(equations_by_rid):
        equation_text = equations_by_rid[rid]
        if equation_text:
            referenced_cids.update(get_compound_ids_from_text(equation_text))
            reaction_rows.append(
                {
                    "id": rid,
                    "reaction": equation_text,
                    "rule": rule_lookup.get(rid),
                    "smiles": smiles_lookup.get(rid),
                }
            )

    molecule_rows: list[dict[str, Any]] = []
    for cid in sorted(referenced_cids):
        # An empty record yields None for both name and SMILES.
        record = molecule_lookup.get(cid, {})
        molecule_rows.append(
            {"id": cid, "name": record.get("name"), "smiles": record.get("smiles")}
        )

    return {"reactions": reaction_rows, "molecules": molecule_rows}
[docs]
def build_module_json(
    self,
    module_id: str,
    *,
    with_compounds: bool = True,
    with_atom_maps: bool = True,
    save_as: Optional[str] = None,
) -> JSONDict:
    """
    Produce the JSON block describing one KEGG module.

    The block holds the module ID, the reactions and molecules assembled
    by :meth:`build_kegg_json`, and a ``"missing"`` report. Compound
    resolution, reaction SMILES and atom maps are only computed when
    ``with_compounds`` is true; atom maps additionally require
    ``with_atom_maps``. Without compound resolution the ``"missing"``
    report is present but empty.

    :param module_id:
        KEGG module ID.
    :type module_id: str
    :param with_compounds:
        Whether to resolve compound names, MOL blocks, and SMILES strings.
    :type with_compounds: bool
    :param with_atom_maps:
        Whether to compute atom-mapped reactions.
    :type with_atom_maps: bool
    :param save_as:
        Optional output path for writing the JSON block to disk.
    :type save_as: Optional[str]
    :returns:
        Module JSON dictionary.
    :rtype: dict[str, Any]

    Example
    -------
    .. code-block:: python

        data = extractor.build_module_json(
            "M00001",
            with_compounds=True,
            with_atom_maps=False,
        )
    """
    equations_by_rid = self.get_module_equations(module_id)
    compound_ids, parsed_by_rid = get_compound_ids_from_equations(equations_by_rid)

    if with_compounds:
        compound_table = self.build_compound_table(compound_ids)
        rsmi_by_rid, _ = self.build_reaction_smiles_dict(
            parsed_by_rid, compound_table
        )
        mapped_by_rid = (
            self.atom_map_reactions(rsmi_by_rid) if with_atom_maps else {}
        )
    else:
        compound_table, rsmi_by_rid, mapped_by_rid = {}, {}, {}

    # Only id/name/SMILES are forwarded; MOL blocks stay out of the JSON.
    molecule_view = {
        cid: {"id": cid, "name": record["name"], "smiles": record["smiles"]}
        for cid, record in compound_table.items()
    }
    data: JSONDict = {
        "module_id": module_id,
        **self.build_kegg_json(
            equations_by_rid,
            smiles_by_rid=rsmi_by_rid,
            rules_by_rid=mapped_by_rid,
            molecules_by_cid=molecule_view,
        ),
    }

    if not with_compounds:
        data["missing"] = {
            "missing_compounds": [],
            "missing_compound_ids": [],
            "reactions_involving_missing": [],
        }
    else:
        data["missing"] = self.build_missing_compound_report(
            equations_by_rid,
            compound_table,
        )

    self.save_json(data, save_as)
    return data
[docs]
def build_pathway_json(
    self,
    pathway_id: str,
    *,
    with_compounds: bool = True,
    with_atom_maps: bool = True,
    save_as: Optional[str] = None,
) -> JSONDict:
    """
    Produce the JSON block for a KEGG pathway, grouped by module.

    Every module of the pathway is expanded with
    :meth:`build_module_json` (without saving the per-module blocks).
    When ``with_compounds`` is true, the missing-compound IDs and the
    affected reaction IDs of all modules are merged into a pathway-level
    ``"missing"`` summary.

    :param pathway_id:
        KEGG pathway ID.
    :type pathway_id: str
    :param with_compounds:
        Whether to resolve compound records.
    :type with_compounds: bool
    :param with_atom_maps:
        Whether to compute atom-mapped reactions.
    :type with_atom_maps: bool
    :param save_as:
        Optional output path for writing the JSON block to disk.
    :type save_as: Optional[str]
    :returns:
        Pathway JSON dictionary.
    :rtype: dict[str, Any]

    Example
    -------
    .. code-block:: python

        data = extractor.build_pathway_json(
            "hsa00010",
            with_compounds=True,
            with_atom_maps=False,
        )
    """
    module_ids = self.get_modules_from_pathway(pathway_id)
    blocks: dict[str, Any] = {}
    all_missing_cids: set[str] = set()
    all_affected_rids: set[str] = set()

    for mid in module_ids:
        block = self.build_module_json(
            mid,
            with_compounds=with_compounds,
            with_atom_maps=with_atom_maps,
        )
        blocks[mid] = block
        if not with_compounds or "missing" not in block:
            continue
        gaps = block["missing"]
        all_missing_cids.update(gaps.get("missing_compound_ids", []))
        all_affected_rids.update(gaps.get("reactions_involving_missing", []))

    data: JSONDict = {
        "pathway_id": pathway_id,
        "modules": module_ids,
        "by_module": blocks,
    }
    if with_compounds:
        data["missing"] = {
            "missing_compound_ids": sorted(all_missing_cids),
            "reactions_involving_missing": sorted(all_affected_rids),
        }

    self.save_json(data, save_as)
    return data