Source code for pegasus.tools.doublet_detection

import time
import numpy as np
import pandas as pd
from pandas.api.types import is_categorical_dtype
from sklearn.decomposition import TruncatedSVD
from sklearn.mixture import GaussianMixture

from scipy.stats import norm
from statsmodels.stats.multitest import fdrcorrection as fdr

from typing import List, Optional, Union

import logging
logger = logging.getLogger(__name__)

from pegasusio import MultimodalData
from pegasusio import timer

from .clustering import partition_cells_by_kmeans



[docs]@timer(logger=logger)
def run_scrublet(
    data: MultimodalData,
    channel_attr: Optional[str] = None, 
    expected_doublet_rate: Optional[float] = 0.1,
    nPC: Optional[int] = 30,
    output_plot_prefix: Optional[str] = None,
    random_state: Optional[int] = 0,
    verbose: Optional[bool] = True,
) -> None:
    """Calculate doublet scores using Scrublet for each channel on the current associated data.X matrix.

    This is a wrapper of `Scrublet <https://github.com/AllonKleinLab/scrublet>`_ package.

    See [Wolock18]_ for details on this method.

    Parameters
    -----------
    data: ``MultimodalData`` object.
        Annotated data matrix with rows for cells and columns for genes.

    channel_attr: ``str``, optional, default: None
        Attribute indicating sample channels. If None, consider all data as one channel.

    expected_doublet_rate: ``float``, optional, default: ``0.1``
        The expected doublet rate for the experiment.
    
    output_plot_prefix: ``str``, optional, default: None
        If this option is not None, output Scrublet histogram plots using output_plot_prefix as file name prefix.

    nPC: ``int``, optional, default: ``30``
        Number of principal components used to embed the transcriptomes prior to k-nearest-neighbor graph construction.

    random_state: ``int``, optional, default: ``0``
        Random state for doublet simulation, approximate nearest neighbor search, and PCA/TruncatedSVD if needed.

    verbose: ``bool``, optional, default: ``True``
        If True, print progress updates.

    Returns
    --------
    ``None``

    Update ``data.obs``:
        * ``data.obs['scrublet_score']``: The calculated doublet scores on cells.

    Update ``data.uns``:
        * ``data.uns['scrublet_stats']``: Overall stats during the calculation.

    If output_plot_prefix is not None, save doublet histogram as PDF files named ``output_plot_prefix.scrublet.pdf`` or ``output_plot_prefix_{channel}.scrublet.pdf``

    Examples
    --------
    >>> pg.run_scrublet(data)
    """
    def _get_scrublet_info(scrub):
        scrublet_info = dict()
        scrublet_info['threshold'] = scrub.threshold_
        scrublet_info['detected_doublet_rate'] = scrub.detected_doublet_rate_
        scrublet_info['detectable_doublet_fraction'] = scrub.detectable_doublet_fraction_
        scrublet_info['overall_doublet_rate'] = scrub.overall_doublet_rate_
        return scrublet_info

    import scrublet as scr

    if channel_attr is None:
        scrub = scr.Scrublet(data.X, expected_doublet_rate=expected_doublet_rate, random_state=random_state)
        doublet_scores, predicted_doublets = scrub.scrub_doublets(n_prin_comps=nPC, verbose=verbose)
        if output_plot_prefix is not None:
            fig, axs = scrub.plot_histogram()
            fig.savefig(f"{output_plot_prefix}.scrublet.pdf")
        data.obs['scrublet_score'] = doublet_scores
        data.uns['scrublet_stats'] = _get_scrublet_info(scrub)
    else:
        if not is_categorical_dtype(data.obs[channel_attr]):
            data.obs[channel_attr] = pd.Categorical(data.obs[channel_attr])
        scrublet_info_dict = {}
        scrublet_scores = np.zeros(data.shape[0], dtype = np.float32)
        for channel in data.obs[channel_attr].cat.categories:
            idx = (data.obs[channel_attr] == channel).values
            X_channel = data.X[idx]
            scrub = scr.Scrublet(X_channel, expected_doublet_rate=expected_doublet_rate, random_state=random_state)
            doublet_scores, predicted_doublets = scrub.scrub_doublets(n_prin_comps=nPC, verbose=verbose)
            if output_plot_prefix is not None:
                fig, axs = scrub.plot_histogram()
                fig.savefig(f"{output_plot_prefix}_{channel}.scrublet.pdf")
            scrublet_scores[idx] = doublet_scores
            scrublet_info_dict[channel] = _get_scrublet_info(scrub)
            if verbose:
                logger.info(f"Channel {channel} is processed.")
        data.obs['scrublet_score'] = scrublet_scores
        data.uns['scrublet_stats'] = scrublet_info_dict

    if verbose:
        logger.info("Scrublet is finished.")



def _one_tail_test(scores: List[float], mean: float, std: float, alpha: float = 0.05) -> List[bool]:
    idx = scores > mean
    pvals = 1.0 - norm.cdf(scores[idx], loc = mean, scale = std)
    passed, qvals = fdr(pvals, alpha = alpha)

    outliers = np.zeros(scores.size, dtype = bool)
    outliers[idx] = passed

    return outliers


def _identify_cell_doublets(scores: List[float], alpha: float = 0.05, min_dbl_rate: float = 0.01, random_state: int = 0):
    scores = np.log(scores) # log transformed
    scores_reshaped = scores.reshape(-1, 1)
    min_dbl = scores.size * min_dbl_rate
    # First fit three normal distributions
    gm = GaussianMixture(n_components = 3, random_state = random_state)
    gm.fit(scores_reshaped)
    means = gm.means_.ravel()
    stds = np.sqrt(gm.covariances_.ravel())
    pos = np.argsort(means)[1]
    prev_outliers = _one_tail_test(scores, means[pos], stds[pos], alpha)
    prev_ndbl = prev_outliers.sum()
    # Fit two normals by excluding outliers
    gm.set_params(n_components = 2)
    gm.fit(scores_reshaped[~prev_outliers])
    means = gm.means_.ravel()
    stds = np.sqrt(gm.covariances_.ravel())
    pos = np.argsort(means)[1]
    outliers = _one_tail_test(scores, means[pos], stds[pos], alpha)
    ndbl = outliers.sum()
    # Iteratively reduce false cell doublets
    gm.set_params(warm_start = True)
    while ndbl < prev_ndbl and ndbl >= min_dbl:
        gm.fit(scores_reshaped[~outliers])
        means = gm.means_.ravel()
        stds = np.sqrt(gm.covariances_.ravel())
        pos = np.argsort(means)[1]
        prev_outliers = outliers
        prev_ndbl = ndbl
        outliers = _one_tail_test(scores, means[pos], stds[pos], alpha)
        ndbl = outliers.sum()
    
    if ndbl < min_dbl:
        # Did not run until converge, roll back
        outliers = prev_outliers
        gm.set_params(warm_start = False)
        gm.fit(scores_reshaped[~outliers])
    # Predict singlets and transition cells
    preds = gm.predict(scores_reshaped[~outliers])
    if pos == 0:
        preds = 1 - preds
    # Generate labels: 0, singlets; 1, transition; 2, doublets
    labels = np.zeros(scores.size, dtype = np.int32)
    labels[outliers] = 2
    labels[~outliers] = preds

    return labels


def _identify_doublets_fisher(cluster_labels: Union[pd.Categorical, List[int]], dbl_codes: List[int], alpha: float = 0.05) -> pd.DataFrame:
    dbls = dbl_codes > 1
    df = pd.crosstab(cluster_labels, dbls)

    ndbl = df[True].sum()
    a = df[True].values.astype(np.int32)
    b = df[False].values.astype(np.int32)
    c = ndbl - a
    d = (dbl_codes.size - ndbl) - b

    avg_dblr = ndbl / dbl_codes.size
    freqs = a / (a + b)

    from pegasus.cylib.cfisher import fisher_exact
    _, pvals = fisher_exact(a, b, c, d)
    passed, qvals = fdr(pvals, alpha = alpha)

    posvec = np.where(passed)[0][freqs[passed] > avg_dblr]

    result = pd.DataFrame({'cluster': df.index[posvec], 'percentage': freqs[posvec] * 100.0, 'pval': pvals[posvec], 'qval': qvals[posvec]})
    result.sort_values('percentage', ascending = False, inplace = True)
    result.reset_index(drop=True, inplace=True)

    return result



[docs]@timer(logger=logger)
def infer_doublets(
    data: MultimodalData,
    dbl_attr: Optional[str] = 'scrublet_score',
    channel_attr: Optional[str] = None,
    clust_attr: Optional[str] = None,
    n_components: Optional[int] = 50,
    robust: Optional[bool] = True,
    n_clusters: Optional[int] = 30,
    n_clusters2: Optional[int] = 50,
    n_init: Optional[int] = 10,
    min_avg_cells_per_final_cluster: Optional[int] = 10,
    alpha: Optional[float] = 0.05,
    min_dbl_rate: Optional[float] = 0.01,
    random_state: Optional[int] = 0,
    verbose: Optional[bool] = False,
) -> None:
    """Infer doublets based on Scrublet scores.

    This implementation is inspired by [Pijuan-Sala19]_ and [Popescu19]_.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    dbl_attr: ``str``, optional, default: ``scrublet_scores``
        Attribute indicating calculated doublet scores from Scrublet.

    channel_attr: ``str``, optional, default: None
        Attribute indicating sample channels. If None, assuming cell-level and sample-level doublets are already calculated and saved in 'data.obs[pred_dbl_type]'.

    clust_attr: ``str``, optional, default: None
        Attribute indicating cluster labels. If None, does not perform cluster-level doublet detection.

    n_components: ``int``, optional, default: ``50``
        Number of PC components for sample-level doublet inference. Note that we use all genes (sparse matrix) and truncated SVD to infer PCs. Because truncated SVD does not reduce means and the first component correlates with the mean, we will use n_components + 1 in sklearn.decomposition.TruncatedSVD .

    robust: ``bool``, optional, default ``True``
        If robust == True, use algorithm = 'arpack'; otherwise, use algorithm = 'randomized'.

    n_clusters: ``int``, optional, default: ``30``
        The number of first level clusters.

    n_clusters2: ``int``, optional, default: ``50``
        The number of second level clusters.

    n_init: ``int``, optional, default: ``10``
        Number of kmeans tries for the first level clustering. Default is set to be the same as scikit-learn Kmeans function.

    min_avg_cells_per_final_cluster: ``int``, optional, default: ``10``
        Expected number of cells per cluster after the two-level KMeans.

    alpha: ``float``, optional, default: ``0.05``
        FDR significant level for statistical tests.

    min_dbl_rate: ``float``, optional, default: ``0.01``
        Minimum expected doublet rate for one channel. In some cases, the algorithm will iterate until no doublet detected, which is contradict with our expectation that at least a small percent of cells are doublets. With this parameter, the algorithm will stop before the detected ratio of doublets is less than ``min_dbl_rate``.

    random_state: ``int``, optional, default: ``0``
        Random seed for reproducing results.

    verbose: ``bool``, optional, default: ``False``
        If true, pegasus generates density plots for each channel under working directory with name channel.dbl.png. In addition, diagnostic outputs will be generated.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs['pred_dbl_type']``: Predicted singlet/doublet types.

        * ``data.uns['pred_dbl_cluster']``: Only generated if 'clust_attr' is not None. This is a dataframe with two columns, 'Cluster' and 'Qval'. Only clusters with significantly more doublets than expected will be recorded here.

    Examples
    --------
    >>> pg.infer_doublets(data, channel_attr = 'Channel', clust_attr = 'Annotation')
    """

    if channel_attr is not None:
        assert is_categorical_dtype(data.obs[channel_attr])
        from pegasus.plotting import doublet_plot

        dbl_codes = np.zeros(data.shape[0], dtype = np.int32)

        channels = data.obs[channel_attr].cat.categories
        for channel in channels:
            idx = (data.obs[channel_attr] == channel).values
            dbl_scores = data.obs.loc[idx, dbl_attr].values

            # Cell-level doublet
            dblc_codes = _identify_cell_doublets(dbl_scores, alpha = alpha, min_dbl_rate = min_dbl_rate, random_state = random_state) # dblc doublet at cell level

            idx_dblc = dblc_codes == 2
            idx_dblnc = ~idx_dblc
            ncdbl = idx_dblc.sum()
            freqs = []

            # Sample-level test
            # Truncated SVD including all genes
            if ncdbl > 0:
                tsvd = TruncatedSVD(n_components = n_components + 1, algorithm = 'arpack' if robust else 'randomized', random_state = random_state)
                X_tpca = np.ascontiguousarray(tsvd.fit_transform(data.X[idx]))

                clusters = partition_cells_by_kmeans(X_tpca, n_clusters, n_clusters2, n_init, random_state, min_avg_cells_per_final_cluster)

                sigs = _identify_doublets_fisher(clusters, dblc_codes, alpha = alpha) # significant clusters

                for cluster in sigs['cluster']:
                    idxc = clusters == cluster
                    idx_dbls = idxc & idx_dblnc
                    dblc_codes[idx_dbls] = 3
                    freqs.append(1.0 - idx_dbls.sum() / idxc.sum())

            # assign channel predictions to dbl_codes
            dbl_codes[idx] = dblc_codes

            # QC statistics
            nsdbl = (dblc_codes == 3).sum()
            min_score = dbl_scores[idx_dblc].min() if ncdbl > 0 else None
            min_freq = min(freqs) if len(freqs) > 0 else None

            if verbose:
                fig = doublet_plot(dbl_scores, dblc_codes, return_fig = True)
                fig.savefig(f"{channel}.dbl.png")
                logger.info(f"Channel {channel}: {ncdbl} cell-level doublets and {nsdbl} sample-level doublets were detected!")
                if min_score is not None:
                    logger.info(f"Doublet score cutoff for cell-level duoblets is {min_score:.3f}.")
                if min_freq is not None:
                    logger.info(f"Doublet frequency cutoff for sample-level doublets is {min_freq:.3f}.")
                logger.info(f"Density plot {channel}.dbl.png is generated.")

            logger.info(f"Channel {channel} contains {dbl_scores.size} cells and {ncdbl + nsdbl} predicted doublets. The predicted doublet rate is {(ncdbl+nsdbl)/dbl_scores.size:.2%}.")

        data.obs['pred_dbl_type'] = pd.Categorical.from_codes(dbl_codes, categories=['singlet', 'singlet-2', 'doublet-cell', 'doublet-sample'])

    if clust_attr is not None:
        clusters = data.obs[clust_attr].values
        data.uns['pred_dbl_cluster'] = _identify_doublets_fisher(clusters, dbl_codes, alpha = alpha)

    logger.info('Doublets are predicted!')



[docs]def mark_doublets(
    data: MultimodalData,
    demux_attr: Optional[str] = 'demux_type',
    dbl_clusts: Optional[str] = None,
) -> None:
    """Convert doublet prediction into doublet annotations that Pegasus can recognize.

    Must run ``infer_doublets`` first.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    demux_attr: ``str``, optional, default: ``demux_type``
        Attribute indicating singlets/doublets that Pegasus can recognize. Currently this is 'demux_type', which is also used for hashing.

    dbl_clusts: ``str``, optional, default: None
        Indicate which clusters should be marked as all doublets. It takes the format of 'clust:value1,value2,...', where 'clust' refers to the cluster attribute.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs[demux_attr]``: Singlet/doublet annotation.

    Examples
    --------
    >>> pg.mark_singlets(data, dbl_clusts='Annotation:B/T doublets')
    """
    data.obs[demux_attr] = 'singlet'
    idx = data.obs['pred_dbl_type'].map(lambda x: x.startswith('doublet'))
    data.obs.loc[idx, demux_attr] = 'doublet'
    if dbl_clusts is not None:
        cluster, value_str = dbl_clusts.split(':')
        idx = np.isin(data.obs[cluster], value_str.split(','))
        data.obs.loc[idx, demux_attr] = 'doublet'

        codes = data.obs['pred_dbl_type'].values.codes
        idx_4 = idx & (codes < 2)
        if idx_4.sum() > 0:
            codes = codes.copy()
            codes[idx_4] = 4
            data.obs['pred_dbl_type'] = pd.Categorical.from_codes(codes, categories = ['singlet', 'singlet-2', 'doublet-cell', 'doublet-sample', 'doublet-cluster'])

    data.obs[demux_attr] = pd.Categorical(data.obs[demux_attr], categories = ['singlet', 'doublet'])