import pegasus as pg


# Load the dataset from a zipped Zarr archive. The trailing bare `data`
# expression makes the notebook cell display the loaded object's summary.
data = pg.read_input("MantonBM_nonmix_subset.zarr.zip")
data

2020-10-23 18:04:32,471 - pegasusio.readwrite - INFO - zarr file 'MantonBM_nonmix_subset.zarr.zip' is loaded.
2020-10-23 18:04:32,471 - pegasusio.readwrite - INFO - Function 'read_input' finished in 0.23s.

MultimodalData object with 1 UnimodalData: 'GRCh38-rna'
    It currently binds to UnimodalData object GRCh38-rna

UnimodalData object with n_obs x n_vars = 48219 x 36601
    Genome: GRCh38; Modality: rna
    It contains 1 matrices: 'X'
    It currently binds to matrix 'X' as X

    obs: 'n_genes', 'Channel'
    var: 'featureid'
    obsm: 
    varm: 
    uns: 'genome', 'modality'


# Preprocessing, applied to `data` itself (no return values are captured):
# QC metrics -> filtering -> robust-gene identification -> log-normalization.
# NOTE(review): percent_mito=10 is presumably the maximum allowed percentage of
# mitochondrial expression per cell — confirm against the Pegasus docs.
pg.qc_metrics(data, percent_mito=10)
pg.filter_data(data)
pg.identify_robust_genes(data)
pg.log_norm(data)

2020-10-23 18:04:32,985 - pegasusio.qc_utils - INFO - After filtration, 35465 out of 48219 cell barcodes are kept in UnimodalData object GRCh38-rna.
2020-10-23 18:04:33,375 - pegasus.tools.preprocessing - INFO - After filtration, 25653/36601 genes are kept. Among 25653 genes, 17516 genes are robust.
2020-10-23 18:04:34,213 - pegasus.tools.preprocessing - INFO - Function 'log_norm' finished in 0.84s.


# Tally how many observations remain in each 'Channel' after filtering.
data.obs['Channel'].value_counts()

MantonBM2_HiSeq_1    4935
MantonBM6_HiSeq_1    4665
MantonBM8_HiSeq_1    4511
MantonBM7_HiSeq_1    4452
MantonBM1_HiSeq_1    4415
MantonBM3_HiSeq_1    4225
MantonBM4_HiSeq_1    4172
MantonBM5_HiSeq_1    4090
Name: Channel, dtype: int64


# Baseline (no batch correction): work on a separate copy of `data` and select
# highly variable features with consider_batch=False.
data_baseline = data.copy()
pg.highly_variable_features(data_baseline, consider_batch=False)

2020-10-23 18:04:34,980 - pegasus.tools.hvf_selection - INFO - 2000 highly variable features have been selected.
2020-10-23 18:04:34,980 - pegasus.tools.hvf_selection - INFO - Function 'highly_variable_features' finished in 0.51s.


# Baseline analysis on the uncorrected copy: PCA, nearest-neighbor search,
# Louvain clustering and a UMAP embedding, followed by a UMAP scatter plot
# colored by 'louvain_labels' and by 'Channel'.
# NOTE(review): the exact effect of robust=True is defined by pg.pca — see the
# Pegasus docs rather than inferring it from the name.
pg.pca(data_baseline, robust=True)
pg.neighbors(data_baseline)
pg.louvain(data_baseline)
pg.umap(data_baseline)
pg.scatter(data_baseline, attrs=['louvain_labels', 'Channel'], basis='umap')

2020-10-23 18:04:41,214 - pegasus.tools.preprocessing - INFO - Function 'pca' finished in 6.23s.
2020-10-23 18:04:45,138 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 3.92s.
2020-10-23 18:04:46,152 - pegasus.tools.nearest_neighbors - INFO - Function 'calculate_affinity_matrix' finished in 1.01s.
2020-10-23 18:04:47,375 - pegasus.tools.graph_operations - INFO - Function 'construct_graph' finished in 1.22s.
2020-10-23 18:05:03,271 - pegasus.tools.clustering - INFO - Louvain clustering is done. Get 19 clusters.
2020-10-23 18:05:03,410 - pegasus.tools.clustering - INFO - Function 'louvain' finished in 17.26s.
2020-10-23 18:05:03,421 - pegasus.tools.visualization - INFO - UMAP(min_dist=0.5, random_state=0, verbose=True)
2020-10-23 18:05:03,422 - pegasus.tools.visualization - INFO - Construct fuzzy simplicial set
2020-10-23 18:05:05,123 - pegasus.tools.visualization - INFO - Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
2020-10-23 18:05:25,755 - pegasus.tools.visualization - INFO - UMAP is calculated. Time spent = 22.34s.
2020-10-23 18:05:25,756 - pegasus.tools.visualization - INFO - Function 'umap' finished in 22.34s.


# Select highly variable features on `data` with consider_batch=True; this runs
# before the per-method copies below are taken with data.copy().
pg.highly_variable_features(data, consider_batch=True)

2020-10-23 18:05:27,699 - pegasus.tools.hvf_selection - INFO - Estimation on feature statistics per channel is finished. Time spent = 0.67s.
2020-10-23 18:05:27,740 - pegasus.tools.hvf_selection - INFO - 2000 highly variable features have been selected.
2020-10-23 18:05:27,741 - pegasus.tools.hvf_selection - INFO - Function 'highly_variable_features' finished in 0.72s.


# Harmony branch: take a separate copy of `data` and compute PCA on it before
# running Harmony.
data_harmony = data.copy()
pg.pca(data_harmony, robust=True)

2020-10-23 18:05:34,017 - pegasus.tools.preprocessing - INFO - Function 'pca' finished in 6.16s.


# Run Harmony integration. The return value is kept because it is passed as
# `rep=` to the neighbors / louvain / umap calls that follow.
harmony_key = pg.run_harmony(data_harmony)

2020-10-23 18:05:34,383 - pegasus.tools.batch_correction - INFO - Start integration using Harmony.
	Initialization is completed.
	Completed 1 / 10 iteration(s).
	Completed 2 / 10 iteration(s).
	Completed 3 / 10 iteration(s).
	Completed 4 / 10 iteration(s).
	Completed 5 / 10 iteration(s).
	Completed 6 / 10 iteration(s).
	Completed 7 / 10 iteration(s).
	Completed 8 / 10 iteration(s).
	Completed 9 / 10 iteration(s).
	Completed 10 / 10 iteration(s).
Reach convergence after 10 iteration(s).
2020-10-23 18:05:53,669 - pegasus.tools.batch_correction - INFO - Function 'run_harmony' finished in 19.65s.


# Nearest neighbors, Louvain clustering and UMAP, all computed on the
# representation named by harmony_key (the value returned by pg.run_harmony).
pg.neighbors(data_harmony, rep=harmony_key)
pg.louvain(data_harmony, rep=harmony_key)
pg.umap(data_harmony, rep=harmony_key)

2020-10-23 18:05:58,429 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 4.75s.
2020-10-23 18:05:59,403 - pegasus.tools.nearest_neighbors - INFO - Function 'calculate_affinity_matrix' finished in 0.97s.
2020-10-23 18:06:00,821 - pegasus.tools.graph_operations - INFO - Function 'construct_graph' finished in 1.42s.
2020-10-23 18:06:23,741 - pegasus.tools.clustering - INFO - Louvain clustering is done. Get 16 clusters.
2020-10-23 18:06:23,875 - pegasus.tools.clustering - INFO - Function 'louvain' finished in 24.47s.
2020-10-23 18:06:23,889 - pegasus.tools.visualization - INFO - UMAP(min_dist=0.5, random_state=0, verbose=True)
2020-10-23 18:06:23,890 - pegasus.tools.visualization - INFO - Construct fuzzy simplicial set
2020-10-23 18:06:24,040 - pegasus.tools.visualization - INFO - Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
2020-10-23 18:06:44,098 - pegasus.tools.visualization - INFO - UMAP is calculated. Time spent = 20.22s.
2020-10-23 18:06:44,099 - pegasus.tools.visualization - INFO - Function 'umap' finished in 20.22s.


# UMAP scatter plot of the Harmony result, colored by cluster label and by channel.
pg.scatter(data_harmony, attrs=['louvain_labels', 'Channel'], basis='umap')


# L/S adjustment branch: take a separate copy of `data` and run pg.correct_batch
# with features='highly_variable_features'.
data_ls = data.copy()
pg.correct_batch(data_ls, features='highly_variable_features')

2020-10-23 18:06:45,433 - pegasus.tools.batch_correction - INFO - Adjustment parameters are estimated.
2020-10-23 18:06:45,786 - pegasus.tools.batch_correction - INFO - Features are selected.
2020-10-23 18:06:46,375 - pegasus.tools.batch_correction - INFO - Batch correction is finished. Time spent = 0.60s.


# Inspect the shape of the matrix stored under uns['fmat_highly_variable_features'].
# NOTE(review): presumably this is the batch-corrected matrix written by
# pg.correct_batch (observations x selected features) — confirm in the Pegasus docs.
data_ls.uns['fmat_highly_variable_features'].shape

(35465, 2000)


# Same downstream steps as the baseline (PCA, nearest neighbors, Louvain, UMAP),
# now run on the L/S-adjusted copy; no explicit `rep=` is passed here.
pg.pca(data_ls, robust=True)
pg.neighbors(data_ls)
pg.louvain(data_ls)
pg.umap(data_ls)

2020-10-23 18:06:53,062 - pegasus.tools.preprocessing - INFO - Function 'pca' finished in 6.68s.
2020-10-23 18:06:57,386 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 4.32s.
2020-10-23 18:06:58,346 - pegasus.tools.nearest_neighbors - INFO - Function 'calculate_affinity_matrix' finished in 0.96s.
2020-10-23 18:06:59,606 - pegasus.tools.graph_operations - INFO - Function 'construct_graph' finished in 1.26s.
2020-10-23 18:07:15,849 - pegasus.tools.clustering - INFO - Louvain clustering is done. Get 17 clusters.
2020-10-23 18:07:15,983 - pegasus.tools.clustering - INFO - Function 'louvain' finished in 17.64s.
2020-10-23 18:07:15,997 - pegasus.tools.visualization - INFO - UMAP(min_dist=0.5, random_state=0, verbose=True)
2020-10-23 18:07:15,998 - pegasus.tools.visualization - INFO - Construct fuzzy simplicial set
2020-10-23 18:07:16,158 - pegasus.tools.visualization - INFO - Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
2020-10-23 18:07:36,045 - pegasus.tools.visualization - INFO - UMAP is calculated. Time spent = 20.06s.
2020-10-23 18:07:36,048 - pegasus.tools.visualization - INFO - Function 'umap' finished in 20.06s.


# UMAP scatter plot of the L/S result, colored by cluster label and by channel.
pg.scatter(data_ls, attrs=['louvain_labels', 'Channel'], basis='umap')


# Scanorama branch: take a separate copy of `data` and run Scanorama integration.
# The return value is kept because it is passed as `rep=` to the downstream calls.
data_scan = data.copy()
scan_key = pg.run_scanorama(data_scan)

2020-10-23 18:07:37,368 - pegasus.tools.batch_correction - INFO - Start integration using Scanorama.
Found 2000 genes among all datasets
[[0.         0.70328426 0.29183432 0.48765572 0.51271394 0.45594855
  0.57168743 0.40345821]
 [0.         0.         0.14177515 0.67092199 0.3190709  0.60688956
  0.66828774 0.21813345]
 [0.         0.         0.         0.17689358 0.61609467 0.08681672
  0.17230769 0.55384615]
 [0.         0.         0.         0.         0.45647922 0.64415863
  0.55848514 0.42629129]
 [0.         0.         0.         0.         0.         0.40953545
  0.5        0.68166704]
 [0.         0.         0.         0.         0.         0.
  0.6846731  0.33204716]
 [0.         0.         0.         0.         0.         0.
  0.         0.60518732]
 [0.         0.         0.         0.         0.         0.
  0.         0.        ]]
Processing datasets (0, 1)
Processing datasets (5, 6)
Processing datasets (4, 7)
Processing datasets (1, 3)
Processing datasets (1, 6)
Processing datasets (3, 5)
Processing datasets (2, 4)
Processing datasets (1, 5)
Processing datasets (6, 7)
Processing datasets (0, 6)
Processing datasets (3, 6)
Processing datasets (2, 7)
Processing datasets (0, 4)
Processing datasets (4, 6)
Processing datasets (0, 3)
Processing datasets (3, 4)
Processing datasets (0, 5)
Processing datasets (3, 7)
Processing datasets (4, 5)
Processing datasets (0, 7)
Processing datasets (5, 7)
Processing datasets (1, 4)
Processing datasets (0, 2)
Processing datasets (1, 7)
Processing datasets (2, 3)
Processing datasets (2, 6)
Processing datasets (1, 2)
2020-10-23 18:09:01,923 - pegasus.tools.batch_correction - INFO - Function 'run_scanorama' finished in 84.57s.


# Nearest neighbors, Louvain clustering and UMAP, all computed on the
# representation named by scan_key (the value returned by pg.run_scanorama).
pg.neighbors(data_scan, rep=scan_key)
pg.louvain(data_scan, rep=scan_key)
pg.umap(data_scan, rep=scan_key)

2020-10-23 18:09:06,700 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 4.77s.
2020-10-23 18:09:07,692 - pegasus.tools.nearest_neighbors - INFO - Function 'calculate_affinity_matrix' finished in 0.99s.
2020-10-23 18:09:09,013 - pegasus.tools.graph_operations - INFO - Function 'construct_graph' finished in 1.32s.
2020-10-23 18:09:36,264 - pegasus.tools.clustering - INFO - Louvain clustering is done. Get 18 clusters.
2020-10-23 18:09:36,419 - pegasus.tools.clustering - INFO - Function 'louvain' finished in 28.73s.
2020-10-23 18:09:36,434 - pegasus.tools.visualization - INFO - UMAP(min_dist=0.5, random_state=0, verbose=True)
2020-10-23 18:09:36,435 - pegasus.tools.visualization - INFO - Construct fuzzy simplicial set
2020-10-23 18:09:36,620 - pegasus.tools.visualization - INFO - Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
2020-10-23 18:09:57,459 - pegasus.tools.visualization - INFO - UMAP is calculated. Time spent = 21.04s.
2020-10-23 18:09:57,461 - pegasus.tools.visualization - INFO - Function 'umap' finished in 21.04s.


# UMAP scatter plot of the Scanorama result, colored by cluster label and by channel.
pg.scatter(data_scan, attrs=['louvain_labels', 'Channel'], basis='umap')


# kBET for each method, computed on the 'umap' representation with 'Channel'
# as the attribute. calc_kBET returns three values; only the third is kept.
# NOTE(review): the kept value is later plotted as the "kBET acceptance rate";
# the meaning of the two discarded values should be checked in the Pegasus docs.
_, _, kBET_baseline = pg.calc_kBET(data_baseline, attr='Channel', rep='umap')
_, _, kBET_harmony = pg.calc_kBET(data_harmony, attr='Channel', rep='umap')
_, _, kBET_ls = pg.calc_kBET(data_ls, attr='Channel', rep='umap')
_, _, kBET_scan = pg.calc_kBET(data_scan, attr='Channel', rep='umap')

2020-10-23 18:10:00,592 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 2.00s.
2020-10-23 18:10:11,082 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 2.14s.
2020-10-23 18:10:15,824 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 2.17s.
2020-10-23 18:10:20,507 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 2.14s.


import pandas as pd
import numpy as np

# Read the per-barcode cell-type annotation (indexed by 'barcodekey') and attach
# it as obs['cell_types'] to every result object. Before each assignment, check
# that the annotation index matches the object's obs_names position by position.
df_celltypes = pd.read_csv("cell_types.csv", index_col='barcodekey')

for annotated in (data_baseline, data_harmony, data_ls, data_scan):
    mismatched = df_celltypes.index != annotated.obs_names
    assert np.sum(mismatched) == 0
    annotated.obs['cell_types'] = df_celltypes['cell_types']


# kSIM for each method, computed on the 'umap' representation with the
# 'cell_types' annotation as the attribute. calc_kSIM returns two values; only
# the second is kept.
# NOTE(review): the kept value is later plotted as the "kSIM acceptance rate";
# the meaning of the discarded value should be checked in the Pegasus docs.
_, kSIM_baseline = pg.calc_kSIM(data_baseline, attr='cell_types', rep='umap')
_, kSIM_harmony = pg.calc_kSIM(data_harmony, attr='cell_types', rep='umap')
_, kSIM_ls = pg.calc_kSIM(data_ls, attr='cell_types', rep='umap')
_, kSIM_scan = pg.calc_kSIM(data_scan, attr='cell_types', rep='umap')

2020-10-23 18:10:23,153 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2020-10-23 18:10:23,154 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.
2020-10-23 18:10:23,166 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2020-10-23 18:10:23,167 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.
2020-10-23 18:10:23,179 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2020-10-23 18:10:23,179 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.
2020-10-23 18:10:23,188 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2020-10-23 18:10:23,189 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.


import seaborn as sns
import matplotlib.pyplot as plt

# One row per method, holding its kBET and kSIM values.
df_plot = pd.DataFrame({
    'method': ['Baseline', 'Harmony', 'L/S', 'Scanorama'],
    'kBET': [kBET_baseline, kBET_harmony, kBET_ls, kBET_scan],
    'kSIM': [kSIM_baseline, kSIM_harmony, kSIM_ls, kSIM_scan],
})

plt.figure(dpi=100)
ax = sns.scatterplot(x='kSIM', y='kBET', hue='method', data=df_plot, legend=False)

# Label each point with its method name. 'Baseline' is placed to the left of
# its marker (right-aligned); every other label goes to the right (left-aligned).
for method_name, ksim_rate, kbet_rate in zip(df_plot['method'], df_plot['kSIM'], df_plot['kBET']):
    if method_name == 'Baseline':
        text_x = ksim_rate - 0.003
        anchor = 'right'
    else:
        text_x = ksim_rate + 0.003
        anchor = 'left'
    ax.text(text_x, kbet_rate, method_name, ha=anchor, size='medium', color='black')

plt.xlabel('kSIM acceptance rate')
plt.ylabel('kBET acceptance rate')

Text(0, 0.5, 'kBET acceptance rate')

Batch Correction Tutorial

Dataset

Sections

Preprocessing

Clustering without Correcting Batch Effects

Batch Correction Methods

Harmony Algorithm

L/S Adjustment Method

Scanorama Algorithm

Comparing Batch Correction Methods

kBET Acceptance Rate

kSIM Acceptance Rate

Performance Plot