Source code for synkit.Chem.Cluster.butina

from __future__ import annotations
from typing import List, Optional

import numpy as np
from rdkit.DataStructs import cDataStructs, CreateFromBitString, BulkTanimotoSimilarity
from rdkit.ML.Cluster import Butina
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt



[docs]
class ButinaCluster:
    """Cluster chemical fingerprint vectors using the Butina algorithm from
    RDKit, with integrated t-SNE visualization of clusters.

    Key features
    ------------
    * **Butina clustering** – fast clustering of fingerprints with a distance
      cutoff (1 – Tanimoto similarity).
    * **t-SNE visualization** – 2D embedding of fingerprints, highlighting top‑k clusters.
    * **NumPy support** – accepts 2D arrays of 0/1 fingerprint data.
    * **Configurable** – user‑defined cutoff, perplexity, and top‑k highlight.

    Clustering and visualization are exposed as static methods, so no
    instance is required to use them.

    Quick start
    -----------
    >>> from synkit.Chem.Cluster.butina import ButinaCluster
    >>> clusters = ButinaCluster.cluster(arr, cutoff=0.3)
    >>> ButinaCluster.visualize(arr, clusters, k=5)
    """


[docs]
    @staticmethod
    def cluster(arr: np.ndarray, cutoff: float = 0.2) -> List[List[int]]:
        """Perform Butina clustering on fingerprint bit-vectors.

        Each row of ``arr`` is turned into an RDKit bit vector, the pairwise
        Tanimoto distances (1 – similarity) are collected into a flat list,
        and that list is passed to ``Butina.ClusterData`` as precomputed
        distance data.

        :param arr: 2D array of shape (n_samples, n_bits) with 0/1
            dtype.
        :type arr: np.ndarray
        :param cutoff: Distance cutoff (1 – similarity) to form
            clusters. Defaults to 0.2.
        :type cutoff: float
        :returns: List of clusters, each a list of sample indices.
        :rtype: list of list of int
        """
        # Convert rows to RDKit ExplicitBitVect via a "0101..." bit string.
        fps: List[cDataStructs.ExplicitBitVect] = [
            CreateFromBitString("".join(str(int(bit)) for bit in row.tolist()))
            for row in arr
        ]
        n = len(fps)

        # Build the flattened distance list in the order ClusterData consumes
        # it when isDistData=True, i.e. the row-wise *lower* triangle:
        #     for i in range(n): for j in range(i): d(i, j)
        # Emitting the upper triangle row by row ((0,1), (0,2), ..., (1,2), ...)
        # has the same length but attaches distances to the wrong sample pairs
        # as soon as n >= 4.
        distances: List[float] = []
        for i in range(1, n):
            sims = BulkTanimotoSimilarity(fps[i], fps[:i])
            distances.extend(1.0 - sim for sim in sims)

        # Cluster: ClusterData(distanceList, nPts, cutoff, isDistData)
        clusters = Butina.ClusterData(distances, n, cutoff, True)

        # ClusterData yields tuples; convert to the documented list of lists.
        return [list(members) for members in clusters]



[docs]
    @staticmethod
    def visualize(
        arr: np.ndarray,
        clusters: List[List[int]],
        k: Optional[int] = None,
        perplexity: float = 30.0,
        random_state: int = 42,
    ) -> None:
        """Visualize clusters in 2D via t-SNE embedding.

        Every sample is embedded with t-SNE and drawn as a scatter point
        coloured by its cluster. Samples that belong to no listed cluster, or
        to a cluster outside the top‑k, are drawn in gray as 'Other'. The
        figure is displayed with ``plt.show()``.

        :param arr: 2D array of shape (n_samples, n_features) with fingerprint data.
        :type arr: np.ndarray
        :param clusters: Clusters as returned by `cluster()`.
        :type clusters: list of list of int
        :param k: If provided, highlight only the top‑k largest clusters; others shown as 'Other'.
        :type k: int or None
        :param perplexity: t-SNE perplexity parameter. Defaults to 30.0. If it is
            not smaller than the number of samples it is lowered to
            ``n_samples - 1`` and a warning is issued.
        :type perplexity: float
        :param random_state: Random seed for reproducibility. Defaults to 42.
        :type random_state: int
        :returns: None
        :rtype: NoneType

        :example:
        >>> clusters = ButinaCluster.cluster(arr, cutoff=0.3)
        >>> ButinaCluster.visualize(arr, clusters, k=5)
        """
        import warnings

        n = arr.shape[0]
        # assign labels: cluster idx or -1 for 'Other'
        labels = np.full(n, -1, dtype=int)
        # sort cluster positions by cluster size, largest first
        sorted_idx = sorted(
            range(len(clusters)), key=lambda i: len(clusters[i]), reverse=True
        )
        top = set(sorted_idx[:k]) if k is not None else set(sorted_idx)
        for idx, cluster in enumerate(clusters):
            for i in cluster:
                labels[i] = idx if idx in top else -1

        # scikit-learn rejects perplexity >= n_samples, which the default of
        # 30.0 hits on small datasets; clamp instead of failing inside TSNE.
        effective_perplexity = perplexity
        if n > 1 and perplexity >= n:
            effective_perplexity = float(n - 1)
            warnings.warn(
                f"perplexity={perplexity} is not smaller than n_samples={n}; "
                f"using perplexity={effective_perplexity} instead.",
                stacklevel=2,
            )

        # compute t-SNE embedding
        tsne = TSNE(
            n_components=2,
            perplexity=effective_perplexity,
            random_state=random_state,
        )
        emb = tsne.fit_transform(arr)

        # plot one scatter series per label so each gets its own legend entry
        plt.figure(figsize=(8, 6))
        unique = sorted(set(labels))
        for lab in unique:
            mask = labels == lab
            if lab == -1:
                plt.scatter(
                    emb[mask, 0], emb[mask, 1], color="gray", alpha=0.3, label="Other"
                )
            else:
                plt.scatter(
                    emb[mask, 0], emb[mask, 1], alpha=0.7, label=f"Cluster {lab}"
                )
        # legend is anchored outside the axes so it does not cover points
        plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
        plt.title("t-SNE visualization of Butina clusters")
        plt.xlabel("t-SNE dim 1")
        plt.ylabel("t-SNE dim 2")
        plt.tight_layout()
        plt.show()


    def __str__(self) -> str:
        """Short description of the clusterer.

        :returns: Class name.
        :rtype: str
        """
        return "<ButinaCluster>"


[docs]
    def help(self) -> None:
        """Print usage summary for clustering and visualization.

        :returns: None
        :rtype: NoneType
        """
        print("ButinaCluster.cluster(arr, cutoff=0.2)")
        print("ButinaCluster.visualize(arr, clusters, k=None, perplexity=30.0)")