# Source code for synkit.CRN.Construct.abstract
from __future__ import annotations
import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple, Union
PathLike = Union[str, Path]
ReactionSides = Tuple[List[str], List[str]]
def _split_reaction_smiles(reaction_smiles: str) -> ReactionSides:
    """
    Parse a reaction SMILES string into its reactant and product molecules.

    The string is cut at the first ``">>"``; each side is then split on
    ``"."``. Tokens are stripped of surrounding whitespace and empty tokens
    are discarded, so an empty left or right side yields an empty list.

    :param reaction_smiles:
        Reaction SMILES string in the form ``"A.B>>C.D"``.
    :type reaction_smiles: str
    :returns:
        ``(reactants, products)`` as two lists of molecule strings.
    :rtype: Tuple[List[str], List[str]]
    :raises ValueError:
        If the string contains no ``">>"`` separator.

    Example
    -------
    .. code-block:: python

        reactants, products = _split_reaction_smiles("CCO.O>>CC=O")
        # (["CCO", "O"], ["CC=O"])
    """
    text = reaction_smiles.strip()
    lhs, arrow, rhs = text.partition(">>")
    if not arrow:
        # partition() returns an empty separator when ">>" is absent.
        raise ValueError(f"Invalid reaction SMILES (missing '>>'): {reaction_smiles}")

    def molecules(side: str) -> List[str]:
        # Split one side on "." and keep only the non-blank, stripped tokens.
        stripped = (part.strip() for part in side.split("."))
        return [part for part in stripped if part]

    return molecules(lhs), molecules(rhs)
def _excel_label(index: int) -> str:
    """
    Map a zero-based index to a spreadsheet-style column label.

    The sequence runs ``A, B, ..., Z, AA, AB, ...``; for example
    ``0 -> "A"``, ``25 -> "Z"`` and ``26 -> "AA"``.

    :param index:
        Zero-based index.
    :type index: int
    :returns:
        Alphabetic label made of upper-case letters.
    :rtype: str
    :raises ValueError:
        If ``index`` is negative.

    Example
    -------
    .. code-block:: python

        label = _excel_label(27)  # "AB"
    """
    if index < 0:
        raise ValueError("Index must be non-negative")

    letters = []
    remaining = index + 1
    while remaining > 0:
        # Bijective base-26: shift by one before dividing so that the
        # digits run A..Z with no zero digit.
        remaining, offset = divmod(remaining - 1, 26)
        letters.append(chr(ord("A") + offset))
    # Letters were produced least-significant first.
    return "".join(reversed(letters))
def _normalize_abstract_side(side: str) -> List[str]:
    """
    Split one abstract reaction side on ``"+"`` and return its tokens sorted.

    Tokens are stripped of surrounding whitespace; blank tokens are dropped.

    :param side:
        Abstract reaction side such as ``"B+A+C"``.
    :type side: str
    :returns:
        Sorted list of non-empty tokens.
    :rtype: List[str]

    Example
    -------
    .. code-block:: python

        tokens = _normalize_abstract_side("B+A+C")  # ["A", "B", "C"]
    """
    tokens = []
    for raw in side.split("+"):
        cleaned = raw.strip()
        if cleaned:
            tokens.append(cleaned)
    tokens.sort()
    return tokens
def _first_present(
    record: Mapping[str, Any],
    keys: Sequence[str],
) -> Optional[Any]:
    """
    Look up several candidate keys and return the first usable value.

    A key counts as a hit only when it is present in ``record`` and its
    value is not ``None``; falsy values such as ``0`` or ``""`` are returned.

    :param record:
        Mapping to search.
    :type record: Mapping[str, Any]
    :param keys:
        Candidate keys, tried in the given order.
    :type keys: Sequence[str]
    :returns:
        Value of the first matching key, or ``None`` when no key matches.
    :rtype: Optional[Any]

    Example
    -------
    .. code-block:: python

        value = _first_present(record, ["smiles", "reaction", "rxn_smiles"])
    """
    hits = (
        record[candidate]
        for candidate in keys
        if candidate in record and record[candidate] is not None
    )
    # The generator is lazy, so lookups stop at the first hit.
    return next(hits, None)
[docs]
def deduplicate_abstract_reactions(reactions: Sequence[str]) -> List[str]:
    """
    Drop identity reactions and repeated abstract reactions.

    Each entry is split at its first ``">>"``; both sides are split on
    ``"+"``, stripped, and sorted before comparison, so the order of
    reactants or products does not matter. An entry is discarded when

    - it contains no ``">>"``,
    - its normalized sides are equal (identity reaction), or
    - an equivalent entry was already kept.

    Kept entries are returned unchanged, in input order; the first
    occurrence represents each group of equivalent reactions.

    :param reactions:
        Abstract reactions such as ``"A+B>>C+D"``.
    :type reactions: Sequence[str]
    :returns:
        Filtered abstract reactions.
    :rtype: List[str]

    Example
    -------
    .. code-block:: python

        filtered = deduplicate_abstract_reactions(
            ["A+B>>C", "B+A>>C", "A>>A"]
        )
        # ["A+B>>C"]
    """

    def canonical(side):
        # Sorted, stripped, non-blank "+"-separated tokens of one side.
        return sorted(part.strip() for part in side.split("+") if part.strip())

    kept = []
    seen_keys = set()
    for entry in reactions:
        lhs, arrow, rhs = entry.partition(">>")
        if not arrow:
            continue
        left_tokens = canonical(lhs)
        right_tokens = canonical(rhs)
        if left_tokens == right_tokens:
            continue
        key = (tuple(left_tokens), tuple(right_tokens))
        if key not in seen_keys:
            seen_keys.add(key)
            kept.append(entry)
    return kept
[docs]
@dataclass(frozen=True)
class AbstractReactionNetwork:
    """
    Symbolic abstraction of a reaction network.

    The dataclass is frozen, so its attributes cannot be rebound after
    construction. The serialisation helpers return shallow copies of the
    stored containers.

    :param molecule_pool:
        Unique molecule pool in the original full representation.
    :type molecule_pool: List[str]
    :param reactions:
        Abstract symbolic reactions such as ``"A+B>>C+D"``.
    :type reactions: List[str]
    :param templates:
        Optional mapping from reaction identifiers to rule or template strings.
    :type templates: Dict[str, str]
    :param label_to_molecule:
        Mapping from abstract labels back to original molecule strings.
    :type label_to_molecule: Dict[str, str]
    """

    molecule_pool: List[str]
    reactions: List[str]
    templates: Dict[str, str] = field(default_factory=dict)
    label_to_molecule: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """
        Return the network as a plain dictionary.

        :returns:
            Dictionary with the keys ``molecule_pool``, ``reactions``,
            ``templates`` and ``label_to_molecule``, each holding a shallow
            copy of the corresponding attribute.
        :rtype: Dict[str, Any]

        Example
        -------
        .. code-block:: python

            payload = network.to_dict()
        """
        return dict(
            molecule_pool=list(self.molecule_pool),
            reactions=list(self.reactions),
            templates=dict(self.templates),
            label_to_molecule=dict(self.label_to_molecule),
        )

    def to_json_payload(
        self, name: str = "abstract_reaction_network"
    ) -> Dict[str, Any]:
        """
        Wrap the network in a SynKit-style JSON payload.

        The payload has a ``meta`` block (``name`` plus ``version`` 1) and an
        ``examples`` list containing exactly one entry for this network.

        :param name:
            Name stored in the metadata block.
        :type name: str
        :returns:
            JSON-serializable payload.
        :rtype: Dict[str, Any]

        Example
        -------
        .. code-block:: python

            payload = network.to_json_payload(name="glycolysis_abstract")
        """
        metadata = {"name": name, "version": 1}
        example = {
            "molecule_pool": list(self.molecule_pool),
            "reactions": list(self.reactions),
            "templates": dict(self.templates),
            "label_to_molecule": dict(self.label_to_molecule),
        }
        return {"meta": metadata, "examples": [example]}

    def save_json(self, path: PathLike, *, name: Optional[str] = None) -> None:
        """
        Write the JSON payload of this network to ``path``.

        Missing parent directories are created. The file is written as UTF-8
        with an indentation of four spaces.

        :param path:
            Output JSON path.
        :type path: PathLike
        :param name:
            Optional metadata name. If omitted (or empty), the filename stem
            is used.
        :type name: Optional[str]
        :returns:
            ``None``
        :rtype: None

        Example
        -------
        .. code-block:: python

            network.save_json("abstract_network.json")
        """
        target = Path(path)
        meta_name = name if name else target.stem
        payload = self.to_json_payload(name=meta_name)
        target.parent.mkdir(parents=True, exist_ok=True)
        with target.open("w", encoding="utf-8") as stream:
            json.dump(payload, stream, indent=4)
[docs]
@dataclass
class AbstractReactionExtractor:
    """
    Build abstract symbolic reaction networks from reaction SMILES lists or
    SynKit-style module/pathway JSON blocks.

    This class supports configurable field names when extracting reactions
    from JSON-like input records. It holds no state of its own.

    Example
    -------
    .. code-block:: python

        extractor = KEGGExtractor()
        data = extractor.build_module_json(
            "M00001",
            with_compounds=True,
            with_atom_maps=False,
        )
        abstractor = AbstractReactionExtractor()
        network = abstractor.build(
            data=data,
            deduplicate=True,
            order="appearance",
        )
    """

    def iter_reaction_records(
        self,
        data: Mapping[str, Any],
    ) -> Iterable[Tuple[Mapping[str, Any], str]]:
        """
        Iterate over reaction records from a module-like or pathway-like
        JSON block.

        Supported structures include:

        - module-like: ``{"reactions": [...]}``
        - pathway-like: ``{"by_module": {"M00001": {"reactions": [...]}, ...}}``

        When ``data["by_module"]`` is a mapping, only that structure is
        walked and any top-level ``"reactions"`` entry is ignored. Module
        blocks and reaction entries that are not mappings are skipped.

        :param data:
            Input JSON-like mapping.
        :type data: Mapping[str, Any]
        :yields:
            Tuples of ``(reaction_record, module_id)``. ``module_id`` is the
            stringified ``by_module`` key, or ``""`` for module-like input.
        :rtype: Iterable[Tuple[Mapping[str, Any], str]]

        Example
        -------
        .. code-block:: python

            abstractor = AbstractReactionExtractor()
            for record, module_id in abstractor.iter_reaction_records(data):
                print(module_id, record.get("id"))
        """
        by_module = data.get("by_module")
        if isinstance(by_module, Mapping):
            for module_id, block in by_module.items():
                if not isinstance(block, Mapping):
                    continue
                # ``or []`` guards against an explicit ``"reactions": None``.
                for reaction in block.get("reactions", []) or []:
                    if isinstance(reaction, Mapping):
                        yield reaction, str(module_id)
            return

        for reaction in data.get("reactions", []) or []:
            if isinstance(reaction, Mapping):
                yield reaction, ""

    def extract_reactions_and_templates(
        self,
        reactions: Optional[Sequence[str]] = None,
        *,
        data: Optional[Mapping[str, Any]] = None,
        templates: Optional[Mapping[str, str]] = None,
        drop_missing_smiles_reactions: bool = True,
        prefix_module_in_reaction_id: bool = True,
        reaction_id_keys: Optional[Sequence[str]] = None,
        reaction_smiles_keys: Optional[Sequence[str]] = None,
        template_keys: Optional[Sequence[str]] = None,
    ) -> Tuple[List[str], Dict[str, str]]:
        """
        Extract reaction SMILES and rule-template mappings from raw inputs.

        Either a direct list of reaction SMILES or a JSON data block may be
        provided. A non-empty ``reactions`` list takes precedence: it is
        returned together with a copy of the external ``templates`` mapping,
        and ``data`` is not read at all. Otherwise the records of ``data``
        are scanned; templates found there are collected first and the
        external ``templates`` mapping is merged on top, so external entries
        win on key collisions.

        :param reactions:
            Direct reaction SMILES list.
        :type reactions: Optional[Sequence[str]]
        :param data:
            Module-like or pathway-like JSON block.
        :type data: Optional[Mapping[str, Any]]
        :param templates:
            Optional external mapping from reaction identifiers to templates.
        :type templates: Optional[Mapping[str, str]]
        :param drop_missing_smiles_reactions:
            Whether to skip records that do not contain a reaction SMILES
            string. When ``False`` such records contribute an empty string.
        :type drop_missing_smiles_reactions: bool
        :param prefix_module_in_reaction_id:
            Whether to prefix reaction identifiers with ``"<module_id>:"`` in
            pathway-style inputs.
        :type prefix_module_in_reaction_id: bool
        :param reaction_id_keys:
            Candidate keys used to find reaction identifiers in each reaction
            record, tried in order. Defaults to ``["id", "kegg_id"]``.
        :type reaction_id_keys: Optional[Sequence[str]]
        :param reaction_smiles_keys:
            Candidate keys used to find reaction SMILES strings in each
            reaction record, tried in order. Defaults to
            ``["smiles", "reaction", "rxn_smiles"]``.
        :type reaction_smiles_keys: Optional[Sequence[str]]
        :param template_keys:
            Candidate keys used to find rule or template strings in each
            reaction record, tried in order. Defaults to
            ``["rule", "template", "smirks"]``.
        :type template_keys: Optional[Sequence[str]]
        :returns:
            Tuple of reaction SMILES list and template mapping.
        :rtype: Tuple[List[str], Dict[str, str]]

        Example
        -------
        .. code-block:: python

            abstractor = AbstractReactionExtractor()
            reactions, templates = abstractor.extract_reactions_and_templates(
                data=data,
                reaction_id_keys=["id", "kegg_id", "rid"],
                reaction_smiles_keys=["smiles", "reaction", "rxn_smiles"],
                template_keys=["rule", "template", "smirks"],
            )
        """
        id_keys = list(reaction_id_keys or ["id", "kegg_id"])
        smiles_keys = list(
            reaction_smiles_keys or ["smiles", "reaction", "rxn_smiles"]
        )
        rule_keys = list(template_keys or ["rule", "template", "smirks"])

        external_templates: Dict[str, str] = dict(templates or {})
        direct_reactions = list(reactions or [])
        if direct_reactions:
            return direct_reactions, external_templates
        if not data:
            return [], external_templates

        extracted_reactions: List[str] = []
        extracted_templates: Dict[str, str] = {}
        for record, module_id in self.iter_reaction_records(data):
            reaction_id = _first_present(record, id_keys)
            reaction_smiles = _first_present(record, smiles_keys)
            template = _first_present(record, rule_keys)

            if not reaction_smiles:
                if drop_missing_smiles_reactions:
                    continue
                # Keep a placeholder so the output stays aligned with the
                # input records.
                reaction_smiles = ""
            extracted_reactions.append(str(reaction_smiles))

            if reaction_id is not None and template is not None:
                key = str(reaction_id)
                if module_id and prefix_module_in_reaction_id:
                    key = f"{module_id}:{key}"
                extracted_templates[key] = str(template)

        # External templates are applied last so they override data entries.
        extracted_templates.update(external_templates)
        return extracted_reactions, extracted_templates

    def build_molecule_pool(
        self,
        parsed_reactions: Sequence[ReactionSides],
        *,
        order: str = "appearance",
    ) -> List[str]:
        """
        Build the unique molecule pool from parsed reactions.

        :param parsed_reactions:
            Parsed reaction sides as ``(reactants, products)`` tuples.
        :type parsed_reactions: Sequence[Tuple[List[str], List[str]]]
        :param order:
            Molecule ordering mode. ``"appearance"`` keeps first-seen order
            (reactants before products within each reaction); ``"sorted"``
            returns the unique molecules in sorted order.
        :type order: str
        :returns:
            Ordered unique molecule pool.
        :rtype: List[str]
        :raises ValueError:
            If ``order`` is not supported.

        Example
        -------
        .. code-block:: python

            molecule_pool = abstractor.build_molecule_pool(
                parsed_reactions,
                order="appearance",
            )
        """
        if order not in ("appearance", "sorted"):
            raise ValueError("order must be 'appearance' or 'sorted'")

        seen: set = set()
        molecule_pool: List[str] = []
        for reactants, products in parsed_reactions:
            for molecule in (*reactants, *products):
                if molecule not in seen:
                    seen.add(molecule)
                    molecule_pool.append(molecule)
        # The pool already holds unique molecules, so sorting it is enough.
        return sorted(molecule_pool) if order == "sorted" else molecule_pool

    @staticmethod
    def _deduplicate_label_pairs(
        label_pairs: Sequence[Tuple[List[str], List[str]]],
    ) -> List[Tuple[List[str], List[str]]]:
        """
        Remove identity and duplicate reactions given as label lists.

        Two reactions are equivalent when their sorted reactant labels and
        sorted product labels match; the first occurrence is kept. A reaction
        whose sorted sides are equal is an identity reaction and is dropped.
        Working on label lists instead of rendered strings keeps the result
        independent of the join tokens used for rendering.

        :param label_pairs:
            ``(reactant_labels, product_labels)`` tuples.
        :type label_pairs: Sequence[Tuple[List[str], List[str]]]
        :returns:
            Retained pairs in input order, with their original label order.
        :rtype: List[Tuple[List[str], List[str]]]
        """
        seen: set = set()
        kept: List[Tuple[List[str], List[str]]] = []
        for reactant_labels, product_labels in label_pairs:
            left_key = tuple(sorted(reactant_labels))
            right_key = tuple(sorted(product_labels))
            if left_key == right_key or (left_key, right_key) in seen:
                continue
            seen.add((left_key, right_key))
            kept.append((reactant_labels, product_labels))
        return kept

    def build(
        self,
        reactions: Optional[Sequence[str]] = None,
        *,
        data: Optional[Mapping[str, Any]] = None,
        drop_missing_smiles_reactions: bool = True,
        deduplicate: bool = False,
        templates: Optional[Mapping[str, str]] = None,
        order: str = "appearance",
        reactant_join: str = "+",
        product_join: str = "+",
        prefix_module_in_reaction_id: bool = True,
        reaction_id_keys: Optional[Sequence[str]] = None,
        reaction_smiles_keys: Optional[Sequence[str]] = None,
        template_keys: Optional[Sequence[str]] = None,
        save_as: Optional[PathLike] = None,
    ) -> AbstractReactionNetwork:
        """
        Convert full reaction SMILES into an abstract symbolic reaction
        network.

        You may provide either a direct list of reaction SMILES or a
        module/pathway JSON block. If ``data`` is provided, field names for
        reaction identifiers, reaction SMILES strings, and templates may be
        customized. Each unique molecule receives an alphabetic label
        (``A``, ``B``, ..., ``Z``, ``AA``, ...) following the pool order.

        :param reactions:
            Direct reaction SMILES list.
        :type reactions: Optional[Sequence[str]]
        :param data:
            Module-like or pathway-like reaction JSON block.
        :type data: Optional[Mapping[str, Any]]
        :param drop_missing_smiles_reactions:
            Whether to skip records missing reaction SMILES. When ``False``,
            blank reaction strings are kept as empty placeholder reactions
            (rendered as ``">>"``) instead of being rejected; note that
            ``deduplicate=True`` removes such placeholders again because both
            of their sides are empty.
        :type drop_missing_smiles_reactions: bool
        :param deduplicate:
            Whether to remove identity and duplicate abstract reactions.
            Reactions are compared on their label lists, so the result does
            not depend on ``reactant_join`` or ``product_join``.
        :type deduplicate: bool
        :param templates:
            Optional external mapping from reaction identifiers to templates.
        :type templates: Optional[Mapping[str, str]]
        :param order:
            Molecule ordering strategy, one of ``"appearance"`` or
            ``"sorted"``.
        :type order: str
        :param reactant_join:
            Join token for abstract reactants.
        :type reactant_join: str
        :param product_join:
            Join token for abstract products.
        :type product_join: str
        :param prefix_module_in_reaction_id:
            Whether to prefix pathway reaction IDs with module IDs.
        :type prefix_module_in_reaction_id: bool
        :param reaction_id_keys:
            Candidate keys used to find reaction identifiers in each reaction
            record.
        :type reaction_id_keys: Optional[Sequence[str]]
        :param reaction_smiles_keys:
            Candidate keys used to find reaction SMILES strings in each
            reaction record.
        :type reaction_smiles_keys: Optional[Sequence[str]]
        :param template_keys:
            Candidate keys used to find rule or template strings in each
            reaction record.
        :type template_keys: Optional[Sequence[str]]
        :param save_as:
            Optional JSON output path.
        :type save_as: Optional[PathLike]
        :returns:
            Abstract symbolic reaction network.
        :rtype: AbstractReactionNetwork
        :raises ValueError:
            If a reaction string lacks ``">>"`` or ``order`` is unsupported.

        Example
        -------
        .. code-block:: python

            abstractor = AbstractReactionExtractor()
            network = abstractor.build(
                data=data,
                deduplicate=True,
                order="appearance",
                reaction_id_keys=["id", "kegg_id", "rid"],
                reaction_smiles_keys=["smiles", "reaction", "rxn_smiles"],
                template_keys=["rule", "template", "smirks"],
                save_as="M00001_abstract.json",
            )
        """
        full_reactions, template_pool = self.extract_reactions_and_templates(
            reactions=reactions,
            data=data,
            templates=templates,
            drop_missing_smiles_reactions=drop_missing_smiles_reactions,
            prefix_module_in_reaction_id=prefix_module_in_reaction_id,
            reaction_id_keys=reaction_id_keys,
            reaction_smiles_keys=reaction_smiles_keys,
            template_keys=template_keys,
        )

        parsed_reactions: List[ReactionSides] = []
        for reaction_smiles in full_reactions:
            if not drop_missing_smiles_reactions and not reaction_smiles.strip():
                # Extraction emits "" for records without SMILES when the
                # caller asked to keep them; the parser would reject that
                # string, so represent the record as an empty reaction.
                parsed_reactions.append(([], []))
                continue
            parsed_reactions.append(_split_reaction_smiles(reaction_smiles))

        molecule_pool = self.build_molecule_pool(parsed_reactions, order=order)
        molecule_to_label = {
            molecule: _excel_label(index)
            for index, molecule in enumerate(molecule_pool)
        }
        label_to_molecule = {
            label: molecule for molecule, label in molecule_to_label.items()
        }

        label_pairs = [
            (
                [molecule_to_label[mol] for mol in reactants],
                [molecule_to_label[mol] for mol in products],
            )
            for reactants, products in parsed_reactions
        ]
        if deduplicate:
            # Deduplicate before rendering: re-parsing rendered strings on
            # "+" would miss duplicates whenever a custom join token is used.
            label_pairs = self._deduplicate_label_pairs(label_pairs)

        abstracted_reactions: List[str] = [
            f"{reactant_join.join(left)}>>{product_join.join(right)}"
            for left, right in label_pairs
        ]

        network = AbstractReactionNetwork(
            molecule_pool=molecule_pool,
            reactions=abstracted_reactions,
            templates=dict(template_pool),
            label_to_molecule=label_to_molecule,
        )
        if save_as is not None:
            network.save_json(save_as)
        return network