Source code for synkit.Chem.Molecule.descriptors

# synkit/Chem/Molecule/descriptors.py
from __future__ import annotations

from dataclasses import dataclass
from typing import (
    List,
    Tuple,
    Optional,
    Protocol,
    runtime_checkable,
    Dict,
    Any,
    Sequence,
)

from rdkit import Chem
from rdkit.Chem import AllChem, rdMolDescriptors
from rdkit import RDLogger

# Keep RDKit quiet (best effort): when this module is imported, ask RDLogger
# to disable the "rdApp.*" logs. Any exception raised by that call is ignored
# so that importing this module never fails because of logging setup.
# NOTE(review): this presumably changes RDKit logging for the whole process,
# not only for the helpers below -- confirm that is intended.
try:
    RDLogger.DisableLog("rdApp.*")
except Exception:
    pass


@runtime_checkable
class _MolLike(Protocol):
    """
    Structural type describing the small slice of the RDKit molecule API
    that this module's annotations refer to.

    Because the protocol is ``runtime_checkable``, ``isinstance(obj, _MolLike)``
    succeeds for any object that exposes the three methods below.
    """

    def GetNumAtoms(self) -> int:
        """Atom count of the molecule."""

    def GetAtoms(self):
        """Atoms of the molecule."""

    def GetAtomWithIdx(self, idx: int):
        """Atom stored at index ``idx``."""


# -----------------------------
# Internal small helpers
# -----------------------------
def compute_gasteiger_inplace(mol: Chem.Mol | Any) -> None:
    """
    Backward-compatibility shim: annotate ``mol`` with Gasteiger charges.

    Kept so that existing code importing this name from
    ``synkit.Chem.Molecule.descriptors`` continues to work. The call is
    best-effort: any exception raised while computing the charges is
    suppressed and the function simply returns.

    :param mol: RDKit Mol (or mol-like object) to annotate. Mutated in place.
    :returns: None
    """
    try:
        AllChem.ComputeGasteigerCharges(mol)
    except Exception:
        # Deliberately ignored: callers rely on this helper never raising.
        pass
    return None
def _make_copy_and_sanitize(mol: Chem.Mol | _MolLike, sanitize: bool) -> Chem.Mol:
    """
    Make a defensive copy of ``mol`` and optionally sanitize it.

    :param mol: input molecule-like object
    :param sanitize: whether to call ``Chem.SanitizeMol`` on the copy
    :returns: copied (and possibly sanitized) RDKit Mol
    """
    m = Chem.Mol(mol)
    if sanitize:
        try:
            Chem.SanitizeMol(m)
        except Exception:
            # best-effort: keep working with the copy even if sanitization fails
            pass
    return m


def _safe_get_gasteiger_value(atom: Any) -> float:
    """
    Extract a Gasteiger charge value from an RDKit atom, tolerant to API variants.

    The ``_GasteigerCharge`` property is read with ``GetProp`` first and with
    ``GetDoubleProp`` as a fallback. Non-finite results (NaN / +-inf) are
    treated like any other failure and reported as ``0.0`` so that they cannot
    poison later normalization.

    :param atom: RDKit atom-like object.
    :returns: finite float value (0.0 on failure or when the property is absent).
    """
    import math  # local import: keeps this helper self-contained

    try:
        if not atom.HasProp("_GasteigerCharge"):
            return 0.0
    except Exception:
        return 0.0

    for getter_name in ("GetProp", "GetDoubleProp"):
        try:
            value = float(getattr(atom, getter_name)("_GasteigerCharge"))
        except Exception:
            # This accessor is unavailable or returned something unparsable;
            # try the next one.
            continue
        return value if math.isfinite(value) else 0.0
    return 0.0


def _compute_gasteiger_list(m: Chem.Mol, n_atoms: int) -> List[float]:
    """
    Compute per-atom Gasteiger charges for molecule ``m``.

    Best-effort: always returns a list of length ``n_atoms`` filled with
    floats; entries that could not be computed stay at ``0.0``.

    :param m: molecule to annotate (charges are written onto its atoms).
    :param n_atoms: expected number of atoms / length of the result.
    :returns: list of per-atom charges.
    """
    gasteiger: List[float] = [0.0] * n_atoms
    try:
        AllChem.ComputeGasteigerCharges(m)
        for a in m.GetAtoms():
            try:
                idx = a.GetIdx()
            except Exception:
                continue
            # Skip indices outside the pre-sized list instead of aborting the
            # whole fill on the first out-of-range atom.
            if 0 <= idx < n_atoms:
                gasteiger[idx] = _safe_get_gasteiger_value(a)
    except Exception:
        # On any failure keep whatever was filled so far (zeros elsewhere)
        pass
    return gasteiger


def _compute_estate_list(m: Chem.Mol, n_atoms: int) -> List[float]:
    """
    Compute per-atom EState indices (best-effort).

    Returns a list of ``n_atoms`` zeros if the EState module is unavailable,
    the computation fails, or the result length does not match ``n_atoms``.

    :param m: molecule to evaluate.
    :param n_atoms: expected number of atoms / length of the result.
    :returns: list of per-atom EState indices as plain floats.
    """
    try:
        # Imported lazily so a missing/broken EState module only disables
        # this descriptor rather than the whole module.
        from rdkit.Chem.EState import EState

        estate_vals = [float(v) for v in EState.EStateIndices(m)]
        if len(estate_vals) != n_atoms:
            estate_vals = [0.0] * n_atoms
    except Exception:
        estate_vals = [0.0] * n_atoms
    return estate_vals


def _calc_crippen_contribs(m: Chem.Mol) -> Optional[List[Tuple[float, float]]]:
    """
    Version-tolerant wrapper for RDKit's Crippen contribution routine.

    Looks up ``CalcCrippenContribs`` on ``rdMolDescriptors`` and falls back to
    the underscore-prefixed ``_CalcCrippenContribs`` name.
    Kept small and used by _compute_crippen_lists.

    :param m: molecule to evaluate.
    :returns: list of per-atom ``(logP, MR)`` pairs, or None if neither
        function exists or the call fails.
    """
    fn = getattr(rdMolDescriptors, "CalcCrippenContribs", None)
    if fn is None:
        fn = getattr(rdMolDescriptors, "_CalcCrippenContribs", None)
    if fn is None:
        return None
    try:
        return list(fn(m))
    except Exception:
        return None


def _compute_crippen_lists(
    m: Chem.Mol, n_atoms: int
) -> Tuple[List[float], List[float]]:
    """
    Compute per-atom Crippen (logP, MR) lists.

    Returns two lists of length n_atoms; both are all zeros when the
    contributions are unavailable, have the wrong length, or cannot be
    converted to floats.

    :param m: molecule to evaluate.
    :param n_atoms: expected number of atoms / length of each result list.
    :returns: ``(crippen_logp, crippen_mr)``.
    """
    cr_logp: List[float] = [0.0] * n_atoms
    cr_mr: List[float] = [0.0] * n_atoms
    cr = _calc_crippen_contribs(m)
    if cr is not None and len(cr) == n_atoms:
        try:
            cr_logp = [float(x[0]) for x in cr]
            cr_mr = [float(x[1]) for x in cr]
        except Exception:
            cr_logp = [0.0] * n_atoms
            cr_mr = [0.0] * n_atoms
    return cr_logp, cr_mr


def _normalize_vector(vec: Sequence[float], method: Optional[str]) -> List[float]:
    """
    Normalize a numeric vector and return the result as a new list.

    Supported methods: None, "zscore", "minmax".

    * If ``method`` is None or ``vec`` is empty, a float copy of ``vec`` is
      returned unchanged.
    * If all elements are equal (including the single-element case), the
      result is all zeros for both methods.

    :param vec: input values.
    :param method: normalization method or None.
    :returns: new list of floats with the same length as ``vec``.
    :raises ValueError: if ``method`` is not supported (non-empty input only).
    """
    vals = [float(x) for x in vec]
    n = len(vals)
    if n == 0 or method is None:
        return vals
    if method not in ("zscore", "minmax"):
        raise ValueError(f"Unsupported normalization method: {method!r}")

    lo = min(vals)
    hi = max(vals)
    if hi == lo:
        # Constant vector. Checked up front because the rounded mean of equal
        # floats can differ from the elements (e.g. [0.1, 0.1, 0.1]), which
        # would yield a tiny non-zero std and a bogus z-score of +-1.0.
        return [0.0] * n

    if method == "minmax":
        span = hi - lo
        return [(x - lo) / span for x in vals]

    # method == "zscore" (population standard deviation)
    mean = sum(vals) / n
    var = sum((x - mean) ** 2 for x in vals) / n
    std = var**0.5
    if std == 0.0:
        # Variance underflowed to zero although the values are not identical.
        return [0.0] * n
    return [(x - mean) / std for x in vals]


def _apply_normalization_to_all(
    vectors: Dict[str, List[float]], method: Optional[str]
) -> Dict[str, List[float]]:
    """
    Apply normalization ``method`` to all vectors in the dict and return a new dict.

    With ``method=None`` every vector is still copied and converted to floats.

    :param vectors: mapping of descriptor name to per-atom values.
    :param method: normalization method ("zscore" | "minmax") or None.
    :returns: new dict with the same keys and normalized float lists.
    """
    if method is None:
        # ensure float conversion
        return {k: [float(x) for x in v] for k, v in vectors.items()}
    return {k: _normalize_vector(v, method) for k, v in vectors.items()}


# -----------------------------
# Public dataclass + builder
# -----------------------------
@dataclass(frozen=True)
class PerMolDescriptors:
    """
    Frozen record holding four per-atom descriptor lists for one molecule.

    :param gasteiger: per-atom Gasteiger charges
    :param estate: per-atom EState indices
    :param crippen_logp: per-atom Crippen logP contributions
    :param crippen_mr: per-atom Crippen MR contributions
    """

    gasteiger: List[float]
    estate: List[float]
    crippen_logp: List[float]
    crippen_mr: List[float]

    # ----------------- Convenience constructors ---------------------------
    @classmethod
    def compute(
        cls,
        mol: Chem.Mol | _MolLike,
        sanitize: bool = True,
        normalize: Optional[str] = None,
    ) -> "PerMolDescriptors":
        """
        Compute all per-atom descriptors for ``mol`` on a best-effort basis.

        The molecule is copied (and optionally sanitized) first; each
        descriptor family is then produced by its dedicated module-level
        helper and finally passed through the requested normalization.

        :param mol: RDKit molecule (or duck-typed equivalent).
        :param sanitize: try to sanitize the copied molecule (default True).
        :param normalize: normalization method: None (default), "zscore", or "minmax".
        :returns: PerMolDescriptors instance.
        """
        # Work on a private copy so the caller's molecule is left untouched.
        work = _make_copy_and_sanitize(mol, sanitize)
        atom_count = 0
        if hasattr(work, "GetNumAtoms"):
            atom_count = work.GetNumAtoms()

        # One helper per descriptor family.
        charges = _compute_gasteiger_list(work, atom_count)
        estate_indices = _compute_estate_list(work, atom_count)
        logp_parts, mr_parts = _compute_crippen_lists(work, atom_count)

        raw = {
            "gasteiger": charges,
            "estate": estate_indices,
            "crippen_logp": logp_parts,
            "crippen_mr": mr_parts,
        }
        scaled = _apply_normalization_to_all(raw, normalize)

        return cls(
            gasteiger=scaled["gasteiger"],
            estate=scaled["estate"],
            crippen_logp=scaled["crippen_logp"],
            crippen_mr=scaled["crippen_mr"],
        )

    @classmethod
    def from_smiles(
        cls, smiles: str, sanitize: bool = True, normalize: Optional[str] = None
    ) -> "PerMolDescriptors":
        """
        Build descriptors directly from a SMILES string.

        :param smiles: SMILES string to parse.
        :param sanitize: try to sanitize the parsed molecule (default True).
        :param normalize: optional normalization ("zscore" | "minmax" | None).
        :returns: PerMolDescriptors instance.
        :raises ValueError: if SMILES fails to parse.
        """
        parsed = Chem.MolFromSmiles(smiles)
        if parsed is not None:
            return cls.compute(parsed, sanitize=sanitize, normalize=normalize)
        raise ValueError(f"Failed to parse SMILES: {smiles!r}")

    # ----------------- Small helpers / serialization ---------------------
    def __repr__(self) -> str:
        # Compact summary instead of dumping every per-atom list.
        has_charges = any(v != 0.0 for v in self.gasteiger)
        return (
            f"{self.__class__.__name__}(num_atoms={self.num_atoms}, "
            f"has_gasteiger={has_charges})"
        )

    @property
    def num_atoms(self) -> int:
        """
        Atom count, taken as the length of the ``gasteiger`` list.

        :returns: inferred atom count.
        """
        return len(self.gasteiger)

    def to_dict(self) -> Dict[str, List[float]]:
        """
        Export the descriptors as a plain dictionary of list copies.

        :returns: dict with the per-atom lists.
        """
        exported: Dict[str, List[float]] = {}
        exported["gasteiger"] = list(self.gasteiger)
        exported["estate"] = list(self.estate)
        exported["crippen_logp"] = list(self.crippen_logp)
        exported["crippen_mr"] = list(self.crippen_mr)
        return exported
class PerMolDescriptorsBuilder:
    """
    Step-by-step (fluent) constructor for PerMolDescriptors.

    Usage example:

        desc = (
            PerMolDescriptorsBuilder(mol)
            .compute_gasteiger()
            .compute_estate()
            .compute_crippen()
            .normalize("zscore")
            .build()
            .descriptor
        )

    Every chainable method returns ``self``. After ``.build()`` the finished
    PerMolDescriptors is available through the ``.descriptor`` property.
    Descriptor families that were never computed are emitted as zero lists.
    """

    def __init__(self, mol: Chem.Mol | _MolLike, sanitize: bool = True):
        """
        Create a builder for the given molecule.

        :param mol: RDKit Mol or equivalent.
        :param sanitize: try to sanitize the internal copy (default True).
        """
        # Private copy: the caller's molecule is never modified.
        self._mol = Chem.Mol(mol)
        self._sanitize = sanitize
        if sanitize:
            try:
                Chem.SanitizeMol(self._mol)
            except Exception:
                # Best effort; continue with the unsanitized copy.
                pass

        if hasattr(self._mol, "GetNumAtoms"):
            atom_count = self._mol.GetNumAtoms()
        else:
            atom_count = 0
        self._n_atoms = atom_count

        # Working vectors stay None until the matching compute_* call.
        self._gasteiger: Optional[List[float]] = None
        self._estate: Optional[List[float]] = None
        self._crippen_logp: Optional[List[float]] = None
        self._crippen_mr: Optional[List[float]] = None

        self._normalized_method: Optional[str] = None
        self._built: Optional[PerMolDescriptors] = None

    # ---------- Chainable compute methods --------------------------------
    def compute_gasteiger(self) -> "PerMolDescriptorsBuilder":
        """
        Compute Gasteiger charges (best-effort) and keep them on the builder.

        :returns: self (chainable).
        """
        self._built = None  # any previously built result is now stale
        self._gasteiger = _compute_gasteiger_list(self._mol, self._n_atoms)
        return self

    def compute_estate(self) -> "PerMolDescriptorsBuilder":
        """
        Compute EState indices (best-effort) and keep them on the builder.

        :returns: self (chainable).
        """
        self._built = None  # any previously built result is now stale
        self._estate = _compute_estate_list(self._mol, self._n_atoms)
        return self

    def compute_crippen(self) -> "PerMolDescriptorsBuilder":
        """
        Compute Crippen per-atom logP / MR contributions (best-effort).

        :returns: self (chainable).
        """
        self._built = None  # any previously built result is now stale
        self._crippen_logp, self._crippen_mr = _compute_crippen_lists(
            self._mol, self._n_atoms
        )
        return self

    # ---------- Normalization --------------------------------------------
    def normalize(self, method: Optional[str]) -> "PerMolDescriptorsBuilder":
        """
        Normalize the vectors computed so far with ``method``.

        Vectors that have not been computed yet are left untouched.

        :param method: "zscore", "minmax", or None to skip.
        :returns: self (chainable).
        :raises ValueError: if ``method`` is not one of the supported values.
        """
        if method is None:
            self._normalized_method = None
            return self

        supported = ("zscore", "minmax")
        if method not in supported:
            raise ValueError(f"Unsupported normalization method: {method!r}")

        for slot in ("_gasteiger", "_estate", "_crippen_logp", "_crippen_mr"):
            current = getattr(self, slot)
            if current is not None:
                setattr(self, slot, _normalize_vector(current, method))

        self._normalized_method = method
        self._built = None
        return self

    # ---------- Finalize / retrieve -------------------------------------
    def build(self) -> "PerMolDescriptorsBuilder":
        """
        Freeze the current working vectors into a PerMolDescriptors.

        The result is cached on the builder and ``self`` is returned; read it
        through the ``.descriptor`` property.

        :returns: self
        """
        size = self._n_atoms

        def _or_zeros(values: Optional[List[float]]) -> List[float]:
            # Vectors that were never computed become zero lists.
            return values if values is not None else [0.0] * size

        self._built = PerMolDescriptors(
            gasteiger=_or_zeros(self._gasteiger),
            estate=_or_zeros(self._estate),
            crippen_logp=_or_zeros(self._crippen_logp),
            crippen_mr=_or_zeros(self._crippen_mr),
        )
        return self

    @property
    def descriptor(self) -> PerMolDescriptors:
        """
        Return the built PerMolDescriptors, building it first if necessary.

        :returns: PerMolDescriptors
        """
        if self._built is None:
            self.build()
        result = self._built
        assert result is not None
        return result