Source code for pegasus.tools.signature_score

import numpy as np
import pandas as pd

from typing import Dict, List
from anndata import AnnData

import logging

logger = logging.getLogger("pegasus")
from pegasus.utils import decorators as pg_deco


[docs]@pg_deco.TimeLogger() def calc_signature_score(data: AnnData, signatures: Dict[str, List[str]], n_bins: int = 50) -> None: """Calculate signature / gene module score. This is an improved version of Livnat et al. 2018 Cell implementation. Parameters ---------- data: ``anndata.AnnData`` Annotated data matrix with rows for cells and columns for genes. signatures: ``Dict[str, List[str]]`` A dictionary containing multiple signature score calculation requests. Each score will be stored in data.obs field with key as the keyword. The value of the dict is a list of gene symbols. n_bins: ``int``, optional, default: 50 Returns ------- ``None``. Update ``data.obs``: * ``data.obs["key"]``: signature / gene module score for signature "key" Update ``data.var``: * ``data.var["mean"]``: Mean expression of each gene across all cells. Only updated if "mean" does not exist in data.var. * ``data.var["bins"]``: Bin category for each gene. Only updated if data.uns["sig_n_bins"] is updated. Update ``data.obsm``: * ``data.obsm["sig_background"]``: Expected signature score for each bin category. Only updated if data.uns["sig_n_bins"] is updated. Update ``data.uns``: * ``data.uns["sig_n_bins"]``: Number of bins to partition genes into. Only updated if "sig_n_bins" does not exist or the recorded number of bins does not match n_bins. Examples -------- >>> pg.calc_signature_score(data, {"T_cell_sig": ["CD3D", "CD3E", "CD3G", "TRAC"]}) """ if "mean" not in data.var: data.var["mean"] = data.X.mean(axis = 0).A1 if data.uns.get("sig_n_bins", 0) != n_bins: data.uns["sig_n_bins"] = n_bins mean_vec = data.var["mean"] bins = pd.qcut(mean_vec, n_bins) bins.cat.categories = bins.cat.categories.astype(str) data.var["bins"] = bins # calculate background expectations sig_background = np.zeros((data.shape[0], n_bins)) for code in range(n_bins): idx = (bins.cat.codes == code).values base = mean_vec[idx].mean() sig_background[:, code] = data.X[:, idx].mean(axis = 1).A1 - base data.obsm["sig_background"] = sig_background for key, gene_list in signatures.items(): genes = pd.Index(gene_list) idx = data.var_names.isin(genes) if idx.sum() < genes.size: omitted = ~genes.isin(data.var_names) logger.warning("For signature {}, genes {} are not in the data and thus omitted!".format(key, str(list(genes[omitted]))[1:-1])) if key in data.obs: logger.warning("Signature key {} exists in data.obs, the existing content will be overwritten!".format(key)) data.obs[key] = (data.X[:, idx].mean(axis = 1).A1 - data.var.loc[idx, "mean"].mean()) - data.obsm["sig_background"][:, data.var["bins"].cat.codes[idx]].mean(axis = 1)