# Source code for synkit.Synthesis.Reactor.mod_reactor

from __future__ import annotations

"""modreactor.py
=========================
A **hardened** and **typed** re-write of the original ``MODReactor`` wrapper
around the MØD toolkit. The public API remains 100 % compatible but the
internals are now:

* **Safer**  – avoids mutating inputs, validates arguments, logs diagnostics.
* **Faster** – lazy-builds the derivation graph and reaction SMILES only when first accessed.
* **Cleaner** – exhaustive doc-strings, typing everywhere, and single-purpose
  helpers. All heavy lifting lives in private methods prefixed `_`.

External behavior is unchanged:
```python
r = MODReactor("CC.O", "rule.gml", strategy="bt").run()
smiles = r.get_reaction_smiles()
```
"""
import importlib.util
from pathlib import Path
from collections import Counter
from typing import Any, List, Optional, Union


from synkit.IO.debug import setup_logging
from synkit.IO.data_io import load_gml_as_text
from synkit.IO.chem_converter import smart_to_gml
from synkit.Chem.Molecule.standardize import sanitize_and_canonicalize_smiles

from synkit.Synthesis.Reactor.strategy import Strategy
from synkit.Synthesis.reactor_utils import _deduplicateGraphs

# ──────────────────────────────────────────────────────────────────────────────
# Logging
# ──────────────────────────────────────────────────────────────────────────────
log = setup_logging(task_type="MODReactor")
if importlib.util.find_spec("mod") is None:
    # MØD is an optional dependency: bind placeholders so that the module
    # still imports, and tell the user why the reactor will not be usable.
    ruleGMLString = smiles = DG = config = None
    log.warning("Optional 'mod' package not found")
else:
    from mod import smiles, ruleGMLString, DG, config


# ──────────────────────────────────────────────────────────────────────────────
# MODReactor
# ──────────────────────────────────────────────────────────────────────────────
class MODReactor:
    """Lazy, ergonomic wrapper around the MØD toolkit’s derivation pipeline.

    Workflow
    --------
    1. Instantiate: give substrate SMILES and a rule (GML text, reaction
       SMARTS with atom maps, or a path to a GML file).
    2. Call `.run()` to execute the reaction strategy.
    3. Inspect results via `.get_reaction_smiles()`, `.product_sets`,
       `.get_dg()`, etc.

    Attributes
    ----------
    initial_smiles : List[str]
        List of SMILES strings for reactants (or products, if inverted).
    rule_file : Union[str, Path]
        Filesystem path or raw GML string or raw smart with AAM for the
        reaction rule.
    invert : bool
        If True, apply the rule in reverse (products → reactants).
    strategy : Strategy
        One of ALL, COMPONENT, or BACKTRACK.
    verbosity : int
        Verbosity level for the MØD DG.apply() call.
    print_results : bool
        If True, prints the derivation graph to stdout.
    """

    # ------------------------------------------------------------------
    # Construction
    # ------------------------------------------------------------------
    def __init__(
        self,
        substrate: Union[str, List[str]],
        rule_file: Union[str, Path],
        *,
        invert: bool = False,
        strategy: Union[str, Strategy] = Strategy.BACKTRACK,
        verbosity: int = 0,
        print_results: bool = False,
    ) -> None:
        """Store the configuration and eagerly build molecules and rule.

        Parameters
        ----------
        substrate : Union[str, List[str]]
            Either a dot-separated SMILES string or an iterable of SMILES.
        rule_file : Union[str, Path]
            Raw GML text, reaction SMARTS, or path to a GML file.
        invert : bool
            Apply the rule in reverse.
        strategy : Union[str, Strategy]
            Strategy (or its string form, resolved by `Strategy.from_string`).
        verbosity : int
            Forwarded to the DG `apply()` call.
        print_results : bool
            Print the derivation graph after it has been built.

        Raises
        ------
        ImportError
            If the optional `mod` package is not available.
        """
        # Copy the caller's container so that later changes on either side
        # cannot leak through a shared list.
        if isinstance(substrate, str):
            self.initial_smiles: List[str] = substrate.split(".")
        else:
            self.initial_smiles = list(substrate)
        self.rule_file = rule_file
        self.invert = bool(invert)
        self.strategy = Strategy.from_string(strategy)
        self.verbosity = verbosity
        self.print_results = print_results

        # Fail with a clear message instead of "'NoneType' is not callable".
        self._require_mod()

        # Molecules and rule are prepared eagerly; everything below them
        # (DG, raw results, reaction SMILES) is filled in on first use.
        self._initial_molecules: List[Any] = self._prepare_initial_molecules()
        self._reaction_rule: Any = self._parse_reaction_rule()
        self._dg: Optional[DG] = None
        self._temp_results: Optional[List[List[str]]] = None
        self._reaction_smiles: Optional[List[str]] = None

    # ------------------------------------------------------------------
    # Public high‑level API
    # ------------------------------------------------------------------
    def run(self) -> "MODReactor":
        """Execute the chosen strategy **once** and return *self* so you
        can chain:

        ```python
        r = MODReactor(...).run()
        smiles = r.get_reaction_smiles()
        ```
        """
        if self._temp_results is None:
            self._temp_results = self._predict()  # ← may build DG
        return self

    # helpers for outside world ------------------------------------------------
    def get_reaction_smiles(self) -> List[str]:
        """Retrieve the reaction SMILES strings (lazy).

        Returns
        -------
        List[str]
            List of reaction SMILES, in “A>>B” format.
        """
        return self.reaction_smiles

    def get_dg(self) -> DG:
        """Access the underlying derivation graph.

        Returns
        -------
        DG
            The MØD derivation graph constructed during `.run()`.

        Raises
        ------
        RuntimeError
            If no derivation graph has been built yet.
        """
        if self._dg is None:
            raise RuntimeError("Call `.run()` before accessing the derivation graph.")
        return self._dg

    # ------------------------------------------------------------------
    # Introspection / niceties
    # ------------------------------------------------------------------
    def __str__(self) -> str:
        return (
            f"<MODReactor n_substrate={len(self.initial_smiles)} "
            f"invert={self.invert} strategy={self.strategy.value} "
            f"predictions={self.prediction_count}>"
        )

    __repr__ = __str__

    def help(self) -> None:
        """Print a one-page summary of reactor configuration and results."""
        print("MODReactor".ljust(60, "─"))
        print(f"Rule file : {self.rule_file}")
        print(f"Substrate : {'.'.join(self.initial_smiles)}")
        print(f"Invert rule : {self.invert}")
        print(f"Strategy : {self.strategy.value}")
        print(f"Verbosity : {self.verbosity}")
        print(f"Predictions : {self.prediction_count}")
        # Only shown when reaction SMILES were already computed; printing
        # help never triggers a prediction.
        if self._reaction_smiles:
            print(f"First result : {self._reaction_smiles[0]}")
        print("─" * 60)

    # ------------------------------------------------------------------
    # Convenience properties
    # ------------------------------------------------------------------
    @property
    def dg(self) -> Optional[DG]:
        """DG or None – cached derivation graph.

        See also
        --------
        get_dg
        """
        return self._dg

    @property
    def product_sets(self) -> List[List[str]]:
        """Raw product sets (lists of SMILES) before joining into full reactions."""
        return self.temp_results

    @property
    def product_smiles(self) -> List[str]:
        """Flattened list of all product SMILES (may contain duplicates)."""
        return [s for batch in self.temp_results for s in batch]

    @property
    def prediction_count(self) -> int:
        """Number of prediction batches generated so far (0 before a run)."""
        return len(self._temp_results or [])

    # ------------------------------------------------------------------
    # Internals – lazy properties
    # ------------------------------------------------------------------
    @property
    def temp_results(self) -> List[List[str]]:
        """Lazy-loaded raw product lists.

        Returns
        -------
        List[List[str]]
        """
        if self._temp_results is None:
            self._temp_results = self._predict()
        return self._temp_results

    @property
    def reaction_smiles(self) -> List[str]:
        """Lazy-loaded reaction SMILES strings of form “A>>B”.

        Returns
        -------
        List[str]
        """
        if self._reaction_smiles is None:
            base = ".".join(self.initial_smiles)
            self._reaction_smiles = self.generate_reaction_smiles(
                self.temp_results, base, invert=self.invert
            )
        return self._reaction_smiles

    # ------------------------------------------------------------------
    # Internals – setup
    # ------------------------------------------------------------------
    @staticmethod
    def _require_mod() -> None:
        """Raise ImportError when the optional `mod` bindings are missing."""
        if smiles is None or ruleGMLString is None or DG is None or config is None:
            raise ImportError(
                "MODReactor requires the optional 'mod' (MØD) package, "
                "which is not installed."
            )

    def _prepare_initial_molecules(self) -> List[Any]:
        """Convert SMILES → MØD molecule objects, dedupe, and sort.

        Each SMILES is loaded with ``add=False``, the result is passed
        through `_deduplicateGraphs`, and the list is sorted ascending by
        the ``numVertices`` attribute (0 when the attribute is absent).

        Returns
        -------
        List[Any]
        """
        mols = [smiles(s, add=False) for s in self.initial_smiles]
        mols = _deduplicateGraphs(mols)
        mols.sort(key=lambda m: getattr(m, "numVertices", 0))
        log.debug("Prepared %d initial molecules", len(mols))
        return mols

    def _parse_reaction_rule(self) -> Any:
        """Load or parse the reaction rule.

        Three interpretations of ``rule_file`` are tried in order: raw GML
        text, reaction SMARTS (converted with `smart_to_gml`), and finally
        a path read with `load_gml_as_text`.

        Returns
        -------
        Any
            Rule object from ruleGMLString().

        Raises
        ------
        Exception
            Whatever the final (file based) attempt raised, after logging.
        """
        # 1) treat the argument as GML text
        try:
            rule = ruleGMLString(str(self.rule_file), invert=self.invert, add=False)
        except Exception:
            log.debug("Raw GML parse failed; trying SMARTS conversion", exc_info=True)
        else:
            log.debug("Parsed rule from raw text")
            return rule

        # 2) treat the argument as reaction SMARTS
        try:
            raw = smart_to_gml(self.rule_file)
            rule = ruleGMLString(raw, invert=self.invert, add=False)
        except Exception:
            log.debug("Smart parse failed; trying file load", exc_info=True)
        else:
            log.debug("Parsed smart from raw text")
            return rule

        # 3) treat the argument as a path to a GML file
        try:
            gml = load_gml_as_text(self.rule_file)
            rule = ruleGMLString(gml, invert=self.invert, add=False)
        except Exception:
            log.error(
                "Failed to load rule from text or file %s",
                self.rule_file,
                exc_info=True,
            )
            raise
        log.debug("Loaded rule from file %s", self.rule_file)
        return rule

    # ------------------------------------------------------------------
    # Internals – strategy dispatch
    # ------------------------------------------------------------------
    def _predict(self) -> List[List[str]]:
        """Dispatch to the appropriate application strategy.

        Returns
        -------
        List[List[str]]
            Raw product batches.
        """
        dispatch = {
            Strategy.ALL: self._apply_all,
            Strategy.COMPONENT: self._apply_components,
            Strategy.BACKTRACK: self._apply_backtrack,
        }
        func = dispatch[self.strategy]
        log.info("Running strategy %s", self.strategy.value)
        results = func()
        if not results:
            log.warning("No predictions generated")
        return results

    # ------------------------------------------------------------------
    # Internals – concrete strategy routines
    # ------------------------------------------------------------------
    def _build_dg(self, *, only_proper: Optional[bool] = None) -> Any:
        """Build a fresh DG, apply the rule once, and cache it on `self._dg`.

        Parameters
        ----------
        only_proper : Optional[bool]
            Forwarded as ``onlyProper`` to ``apply()``; when None the
            keyword is omitted so the MØD default is used.

        Returns
        -------
        Any
            The derivation graph that was just built.
        """
        apply_kwargs = {"verbosity": self.verbosity}
        if only_proper is not None:
            apply_kwargs["onlyProper"] = only_proper

        self._dg = DG(graphDatabase=self._initial_molecules)
        # NOTE: this is a process-wide MØD setting, not a per-instance one.
        config.dg.doRuleIsomorphismDuringBinding = False
        self._dg.build().apply(
            self._initial_molecules, self._reaction_rule, **apply_kwargs
        )
        if self.print_results:
            self._dg.print()
        return self._dg

    def _apply_components(self) -> List[List[str]]:
        """Component-aware application: no cross-CC backtracking.

        Returns
        -------
        List[List[str]]
            Product batches, one per DG edge.
        """
        dg = self._build_dg()
        return [[v.graph.smiles for v in e.targets] for e in dg.edges]

    def _apply_all(self) -> List[List[str]]:
        """Classic “ALL” strategy: VF2 with reagents included.

        Returns
        -------
        List[List[str]]
            Product batches (including unused reagents).
        """
        dg = self._build_dg(only_proper=False)

        products, educts = [], []
        for e in dg.edges:
            products.append([v.graph.smiles for v in e.targets])
            educts.append([v.graph.smiles for v in e.sources])

        # re‑attach unused reagents: multiset difference between everything
        # that was supplied and what the edge actually consumed
        base = Counter(sanitize_and_canonicalize_smiles(s) for s in self.initial_smiles)
        for batch, used in zip(products, educts):
            consumed = Counter(sanitize_and_canonicalize_smiles(s) for s in used)
            batch.extend((base - consumed).elements())
        return products

    def _apply_backtrack(self) -> List[List[str]]:
        """BACKTRACK strategy: try COMPONENT, sanitize, else fall back to ALL.

        Returns
        -------
        List[List[str]]
            Sanitized COMPONENT batches when there are any; otherwise the
            batches produced by `_apply_all`, returned unchanged.
        """
        prod = self._apply_components()
        prod = [[sanitize_and_canonicalize_smiles(s) for s in batch] for batch in prod]
        if prod:
            return prod
        log.info("Component strategy returned 0 → falling back to ALL")
        return self._apply_all()

    # ------------------------------------------------------------------
    # Static helpers
    # ------------------------------------------------------------------
    @staticmethod
    def generate_reaction_smiles(
        temp_results: List[List[str]],
        base_smiles: str,
        *,
        invert: bool = False,
        arrow: str = ">>",
        separator: str = ".",
    ) -> List[str]:
        """Build reaction SMILES of the form “A>>B”, where A and B swap
        roles if invert=True.

        Batches containing a ``None`` entry are skipped.

        Parameters
        ----------
        temp_results : List[List[str]]
            Batches of product (or reactant) SMILES.
        base_smiles : str
            The “other side” of the reaction: the reactant side when
            invert=False, or the product side when invert=True.
        invert : bool
            If False, generates “base_smiles>>joined_batch”; if True,
            generates “joined_batch>>base_smiles”.
        arrow : str
            The reaction arrow to use (default ">>").
        separator : str
            How to join multiple SMILES in a batch (default ".").

        Returns
        -------
        List[str]
            Reaction SMILES strings, one per retained batch.
        """
        reactions: List[str] = []
        for batch in temp_results:
            if any(x is None for x in batch):
                continue
            joined = separator.join(batch)  # an empty batch joins to ""
            left, right = (joined, base_smiles) if invert else (base_smiles, joined)
            reactions.append(f"{left}{arrow}{right}")
        return reactions