import pegasus as pg


data = pg.read_input("MantonBM_nonmix_subset.zarr.zip")
data

2021-06-24 00:14:59,926 - pegasusio.readwrite - INFO - zarr file 'MantonBM_nonmix_subset.zarr.zip' is loaded.
2021-06-24 00:14:59,927 - pegasusio.readwrite - INFO - Function 'read_input' finished in 0.48s.

MultimodalData object with 1 UnimodalData: 'GRCh38-rna'
    It currently binds to UnimodalData object GRCh38-rna

UnimodalData object with n_obs x n_vars = 48219 x 36601
    Genome: GRCh38; Modality: rna
    It contains 1 matrix: 'X'
    It currently binds to matrix 'X' as X

    obs: 'n_genes', 'Channel'
    var: 'featureid'
    obsm: 
    varm: 
    uns: 'genome', 'modality'


data.X

<48219x36601 sparse matrix of type '<class 'numpy.int32'>'
	with 39997174 stored elements in Compressed Sparse Row format>


data.obs.head()


data.obs['Channel'].value_counts()

MantonBM6_HiSeq_1    6748
MantonBM8_HiSeq_1    6092
MantonBM4_HiSeq_1    6068
MantonBM7_HiSeq_1    6025
MantonBM5_HiSeq_1    5963
MantonBM2_HiSeq_1    5930
MantonBM1_HiSeq_1    5837
MantonBM3_HiSeq_1    5556
Name: Channel, dtype: int64


data.var.head()


data.uns['genome']

'GRCh38'


data.uns['modality']

'rna'


pg.qc_metrics(data, min_genes=500, max_genes=6000, mito_prefix='MT-', percent_mito=10)


df_qc = pg.get_filter_stats(data)
df_qc


pg.qcviolin(data, plot_type='gene', dpi=100)


pg.qcviolin(data, plot_type='count', dpi=100)


pg.qcviolin(data, plot_type='mito', dpi=100)


pg.filter_data(data)

2021-06-24 00:15:02,379 - pegasusio.qc_utils - INFO - After filtration, 35465 out of 48219 cell barcodes are kept in UnimodalData object GRCh38-rna.
2021-06-24 00:15:02,379 - pegasus.tools.preprocessing - INFO - Function 'filter_data' finished in 0.22s.


pg.identify_robust_genes(data)

2021-06-24 00:15:03,341 - pegasus.tools.preprocessing - INFO - After filtration, 25653/36601 genes are kept. Among 25653 genes, 17516 genes are robust.
2021-06-24 00:15:03,341 - pegasus.tools.preprocessing - INFO - Function 'identify_robust_genes' finished in 0.96s.


data.obs['Channel'].value_counts()

MantonBM2_HiSeq_1    4935
MantonBM6_HiSeq_1    4665
MantonBM8_HiSeq_1    4511
MantonBM7_HiSeq_1    4452
MantonBM1_HiSeq_1    4415
MantonBM3_HiSeq_1    4225
MantonBM4_HiSeq_1    4172
MantonBM5_HiSeq_1    4090
Name: Channel, dtype: int64


pg.log_norm(data)

2021-06-24 00:15:03,882 - pegasus.tools.preprocessing - INFO - Function 'log_norm' finished in 0.53s.


data_trial = data.copy()


pg.highly_variable_features(data_trial, consider_batch=False)

2021-06-24 00:15:05,111 - pegasus.tools.hvf_selection - INFO - Function 'estimate_feature_statistics' finished in 0.45s.
2021-06-24 00:15:05,148 - pegasus.tools.hvf_selection - INFO - 2000 highly variable features have been selected.
2021-06-24 00:15:05,149 - pegasus.tools.hvf_selection - INFO - Function 'highly_variable_features' finished in 0.49s.


data_trial.var.loc[data_trial.var['highly_variable_features']].sort_values(by='hvf_rank')


pg.hvfplot(data_trial, dpi=200)


pg.pca(data_trial)

2021-06-24 00:15:12,181 - pegasus.tools.preprocessing - INFO - Function 'pca' finished in 5.49s.


coord_pc1 = data_trial.uns['PCs'][:, 0]
coord_pc1

array([ 0.02221761,  0.01772111, -0.00582949, ..., -0.00050337,
        0.04850996,  0.03549923], dtype=float32)


data_trial.var.loc[data_trial.var['highly_variable_features']].index.values

array(['HES4', 'ISG15', 'TNFRSF18', ..., 'RPS4Y2', 'MT-CO1', 'MT-CO3'],
      dtype=object)


data_trial.obsm['X_pca'].shape

(35465, 50)


pg.neighbors(data_trial)

2021-06-24 00:15:16,530 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 4.30s.
2021-06-24 00:15:17,551 - pegasus.tools.nearest_neighbors - INFO - Function 'calculate_affinity_matrix' finished in 1.02s.


print(f"Get {data_trial.uns['pca_knn_indices'].shape[1]} nearest neighbors (excluding itself) for each cell.")
data_trial.uns['pca_knn_indices']

Get 99 nearest neighbors (excluding itself) for each cell.

array([[30526, 27514, 28825, ..., 18019, 22273, 32915],
       [29723,  2651,  5282, ..., 29922, 14100, 30101],
       [35262, 20170, 30032, ..., 34146,  3880,  2412],
       ...,
       [ 6096, 30824, 18992, ..., 14345, 34251, 20801],
       [34252, 34709, 17569, ..., 32455,  8648,  6047],
       [ 5379, 35401, 31722, ...,  1585, 32585, 16009]])


data_trial.uns['pca_knn_distances']

array([[ 4.984169 ,  5.486617 ,  5.531816 , ...,  6.568183 ,  6.570032 ,
         6.5835915],
       [ 8.275161 ,  8.679711 ,  8.974715 , ..., 10.486567 , 10.48813  ,
        10.531117 ],
       [ 4.793777 ,  5.1250052,  5.205284 , ...,  6.08579  ,  6.0924664,
         6.0957317],
       ...,
       [ 7.3680816,  7.4144983,  7.4873137, ...,  9.415309 ,  9.418577 ,
         9.421833 ],
       [ 8.759402 ,  8.7638645,  9.546374 , ..., 11.836472 , 11.852649 ,
        11.85586  ],
       [ 7.1296797,  7.2653217,  7.413504 , ..., 11.326677 , 11.330711 ,
        11.331037 ]], dtype=float32)


pg.louvain(data_trial)

2021-06-24 00:15:18,801 - pegasus.tools.graph_operations - INFO - Function 'construct_graph' finished in 1.22s.
2021-06-24 00:15:36,113 - pegasus.tools.clustering - INFO - Louvain clustering is done. Get 19 clusters.
2021-06-24 00:15:36,246 - pegasus.tools.clustering - INFO - Function 'louvain' finished in 18.68s.


data_trial.obs['louvain_labels'].value_counts()

1     5623
2     4320
3     3594
4     2889
5     2851
6     2660
7     2227
8     1927
9     1625
10    1432
11    1298
12    1032
13     938
14     897
15     550
16     430
17     407
18     386
19     379
Name: louvain_labels, dtype: int64


pg.compo_plot(data_trial, 'louvain_labels', 'Channel')


pg.tsne(data_trial)

2021-06-24 00:16:04,307 - pegasus.tools.visualization - INFO - Function 'tsne' finished in 27.27s.


pg.scatter(data_trial, attrs=['louvain_labels', 'Channel'], basis='tsne')


pg.highly_variable_features(data, consider_batch=True)
pg.pca(data)
pca_key = pg.run_harmony(data)

2021-06-24 00:16:05,692 - pegasus.tools.hvf_selection - INFO - Function 'estimate_feature_statistics' finished in 0.22s.
2021-06-24 00:16:05,730 - pegasus.tools.hvf_selection - INFO - 2000 highly variable features have been selected.
2021-06-24 00:16:05,730 - pegasus.tools.hvf_selection - INFO - Function 'highly_variable_features' finished in 0.26s.
2021-06-24 00:16:09,669 - pegasus.tools.preprocessing - INFO - Function 'pca' finished in 3.94s.
2021-06-24 00:16:10,190 - pegasus.tools.batch_correction - INFO - Start integration using Harmony.
	Initialization is completed.
	Completed 1 / 10 iteration(s).
	Completed 2 / 10 iteration(s).
	Completed 3 / 10 iteration(s).
	Completed 4 / 10 iteration(s).
	Completed 5 / 10 iteration(s).
	Completed 6 / 10 iteration(s).
	Completed 7 / 10 iteration(s).
	Completed 8 / 10 iteration(s).
Reach convergence after 8 iteration(s).
2021-06-24 00:16:28,917 - pegasus.tools.batch_correction - INFO - Function 'run_harmony' finished in 19.25s.


data.obsm['X_pca_harmony'].shape

(35465, 50)


pg.neighbors(data, rep=pca_key)
pg.louvain(data, rep=pca_key)

2021-06-24 00:16:33,131 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 4.20s.
2021-06-24 00:16:34,168 - pegasus.tools.nearest_neighbors - INFO - Function 'calculate_affinity_matrix' finished in 1.04s.
2021-06-24 00:16:35,502 - pegasus.tools.graph_operations - INFO - Function 'construct_graph' finished in 1.33s.
2021-06-24 00:16:50,876 - pegasus.tools.clustering - INFO - Louvain clustering is done. Get 16 clusters.
2021-06-24 00:16:51,012 - pegasus.tools.clustering - INFO - Function 'louvain' finished in 16.84s.


pg.compo_plot(data, 'louvain_labels', 'Channel')


pg.tsne(data, rep=pca_key)

2021-06-24 00:17:16,622 - pegasus.tools.visualization - INFO - Function 'tsne' finished in 24.95s.


pg.scatter(data, attrs=['louvain_labels', 'Channel'], basis='tsne')


pg.umap(data, rep=pca_key)

2021-06-24 00:17:17,947 - pegasus.tools.nearest_neighbors - INFO - Found cached kNN results, no calculation is required.
2021-06-24 00:17:17,947 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 0.00s.
2021-06-24 00:17:17,965 - pegasus.tools.visualization - INFO - UMAP(dens_frac=0.0, dens_lambda=0.0, min_dist=0.5, random_state=0, verbose=True)
2021-06-24 00:17:17,966 - pegasus.tools.visualization - INFO - Construct fuzzy simplicial set
2021-06-24 00:17:20,230 - pegasus.tools.visualization - INFO - Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
2021-06-24 00:17:41,603 - pegasus.tools.visualization - INFO - Function 'umap' finished in 23.66s.


pg.scatter(data, attrs=['louvain_labels', 'Channel'], basis='umap')


pg.de_analysis(data, cluster='louvain_labels')

2021-06-24 00:17:43,825 - pegasus.tools.diff_expr - INFO - CSR matrix is converted to CSC matrix. Time spent = 0.9813s.
2021-06-24 00:18:15,259 - pegasus.tools.diff_expr - INFO - MWU test and AUROC calculation are finished. Time spent = 31.4329s.
2021-06-24 00:18:15,706 - pegasus.tools.diff_expr - INFO - Sufficient statistics are collected. Time spent = 0.4479s.
2021-06-24 00:18:15,804 - pegasus.tools.diff_expr - INFO - Differential expression analysis is finished.
2021-06-24 00:18:15,806 - pegasus.tools.diff_expr - INFO - Function 'de_analysis' finished in 33.13s.


marker_dict = pg.markers(data)


marker_dict['1']['up'].sort_values(by='log2FC', ascending=False)


pg.volcano(data, cluster_id = '1', dpi=200)


pg.write_results_to_excel(marker_dict, "MantonBM_subset.de.xlsx")

2021-06-24 00:18:54,006 - pegasus.tools.diff_expr - INFO - Excel spreadsheet is written.
2021-06-24 00:18:54,142 - pegasus.tools.diff_expr - INFO - Function 'write_results_to_excel' finished in 16.90s.


celltype_dict = pg.infer_cell_types(data, markers = 'human_immune')
cluster_names = pg.infer_cluster_names(celltype_dict)


celltype_dict['1']

[name: T cell; score: 1.00; average marker percentage: 65.21%; strong support: (CD3D+,75.12%),(CD3E+,69.03%),(CD3G+,43.72%),(TRAC+,72.99%)]


pg.annotate(data, name='anno', based_on='louvain_labels', anno_dict=cluster_names)
data.obs['anno'].value_counts()

Naive T cell                   6290
CD14+ Monocyte                 5177
B cell                         4332
T helper cell                  3272
Cytotoxic T cell               2953
Natural killer cell            2600
Cytotoxic T cell-2             2596
Cytotoxic T cell-3             1739
Erythroid cells                1629
Hematopoietic stem cell        1441
Pre B cell                      933
CD14+ Monocyte-2                685
CD1C+ dendritic cell            550
CD16+ Monocyte                  469
Plasma cell                     408
Plasmacytoid dendritic cell     391
Name: anno, dtype: int64


pg.scatter(data, attrs='anno', basis='tsne', dpi=100)


pg.scatter(data, attrs='anno', basis='umap', legend_loc='on data', dpi=150)


data

MultimodalData object with 1 UnimodalData: 'GRCh38-rna'
    It currently binds to UnimodalData object GRCh38-rna

UnimodalData object with n_obs x n_vars = 35465 x 25653
    Genome: GRCh38; Modality: rna
    It contains 2 matrices: 'X', 'raw.X'
    It currently binds to matrix 'X' as X

    obs: 'n_genes', 'Channel', 'n_counts', 'percent_mito', 'scale', 'Group', 'louvain_labels', 'anno'
    var: 'featureid', 'n_cells', 'percent_cells', 'robust', 'highly_variable_features', 'mean', 'var', 'hvf_loess', 'hvf_rank'
    obsm: 'X_pca', 'X_pca_harmony', 'X_tsne', 'X_umap'
    varm: 'means', 'partial_sum', 'gmeans', 'gstds', 'de_res'
    uns: 'genome', 'modality', 'df_qcplot', 'norm_count', 'Channels', 'Groups', 'ncells', 'gncells', 'c2gid', 'stdzn_mean', 'stdzn_std', 'stdzn_max_value', '_tmp_fmat_highly_variable_features', 'PCs', 'pca', 'pca_features', 'pca_harmony_knn_indices', 'pca_harmony_knn_distances', 'W_pca_harmony', 'louvain_resolution'


data.select_matrix('raw.X')
data

MultimodalData object with 1 UnimodalData: 'GRCh38-rna'
    It currently binds to UnimodalData object GRCh38-rna

UnimodalData object with n_obs x n_vars = 35465 x 25653
    Genome: GRCh38; Modality: rna
    It contains 2 matrices: 'X', 'raw.X'
    It currently binds to matrix 'raw.X' as X

    obs: 'n_genes', 'Channel', 'n_counts', 'percent_mito', 'scale', 'Group', 'louvain_labels', 'anno'
    var: 'featureid', 'n_cells', 'percent_cells', 'robust', 'highly_variable_features', 'mean', 'var', 'hvf_loess', 'hvf_rank'
    obsm: 'X_pca', 'X_pca_harmony', 'X_tsne', 'X_umap'
    varm: 'means', 'partial_sum', 'gmeans', 'gstds', 'de_res'
    uns: 'genome', 'modality', 'df_qcplot', 'norm_count', 'Channels', 'Groups', 'ncells', 'gncells', 'c2gid', 'stdzn_mean', 'stdzn_std', 'stdzn_max_value', '_tmp_fmat_highly_variable_features', 'PCs', 'pca', 'pca_features', 'pca_harmony_knn_indices', 'pca_harmony_knn_distances', 'W_pca_harmony', 'louvain_resolution'


data.select_matrix('X')


pg.diffmap(data, rep=pca_key)

2021-06-24 00:18:55,040 - pegasus.tools.diffusion_map - INFO - Calculating connected components is done.
2021-06-24 00:18:55,338 - pegasus.tools.diffusion_map - INFO - Calculating normalized affinity matrix is done.
2021-06-24 00:19:04,777 - pegasus.tools.diffusion_map - INFO - Detected knee point at t = 196.
2021-06-24 00:19:04,838 - pegasus.tools.diffusion_map - INFO - Function 'diffmap' finished in 9.90s.


data.obsm['X_diffmap'].shape

(35465, 99)


pg.fle(data)

2021-06-24 00:19:08,278 - pegasus.tools.nearest_neighbors - INFO - Function 'get_neighbors' finished in 3.43s.
2021-06-24 00:19:09,071 - pegasus.tools.nearest_neighbors - INFO - Function 'calculate_affinity_matrix' finished in 0.79s.
2021-06-24 00:19:09,580 - pegasus.tools.graph_operations - INFO - Function 'construct_graph' finished in 0.51s.
2021-06-24 00:23:53,207 - pegasus.tools.visualization - INFO - Function 'fle' finished in 288.36s.


pg.scatter(data, attrs='anno', basis='fle')


pg.write_output(data, "result.zarr.zip")

2021-06-24 00:23:55,585 - pegasusio.readwrite - INFO - zarr.zip file 'result.zarr.zip' is written.
2021-06-24 00:23:55,585 - pegasusio.readwrite - INFO - Function 'write_output' finished in 1.71s.

	featureid
featurekey
MIR1302-2HG	ENSG00000243485
FAM138A	ENSG00000237613
OR4F5	ENSG00000186092
AL627309.1	ENSG00000238009
AL627309.3	ENSG00000239945

	kept	median_n_genes	median_n_umis	median_percent_mito	filt	total	median_n_genes_before	median_n_umis_before	median_percent_mito_before
Channel
MantonBM5_HiSeq_1	4090	770	2795.0	3.136190	1873	5963	650.0	2139.0	3.399615
MantonBM4_HiSeq_1	4172	790	2278.5	3.271181	1896	6068	672.0	1764.0	3.519009
MantonBM3_HiSeq_1	4225	779	2621.0	3.274451	1331	5556	715.0	2229.0	3.449398
MantonBM1_HiSeq_1	4415	790	2533.0	3.713331	1422	5837	723.0	2149.0	3.855422
MantonBM7_HiSeq_1	4452	745	2403.5	3.053718	1573	6025	679.0	2053.0	3.177570
MantonBM8_HiSeq_1	4511	735	2561.0	3.520510	1581	6092	671.5	2212.0	3.706849
MantonBM6_HiSeq_1	4665	852	2700.0	3.032258	2083	6748	741.0	2129.0	3.345829
MantonBM2_HiSeq_1	4935	801	2486.0	3.514056	995	5930	756.0	2261.5	3.534756

	featureid	n_cells	percent_cells	robust	highly_variable_features	mean	var	hvf_loess	hvf_rank
featurekey
LYZ	ENSG00000090382	8566	24.153391	True	True	1.526394	8.110589	3.775874	3
S100A9	ENSG00000163220	8182	23.070633	True	True	1.423049	7.649132	3.657402	5
S100A8	ENSG00000143546	7674	21.638235	True	True	1.328664	7.228290	3.463029	7
HLA-DRA	ENSG00000204287	14836	41.832793	True	True	2.242150	7.513039	4.208062	12
GNLY	ENSG00000115523	5196	14.651064	True	True	0.882395	4.859677	2.504143	13
...	...	...	...	...	...	...	...	...	...
TPK1	ENSG00000196511	917	2.585648	True	True	0.085252	0.289441	0.268601	5554
AL355916.1	ENSG00000232774	99	0.279148	True	True	0.010097	0.037130	0.032563	5557
MEI1	ENSG00000167077	1813	5.112082	True	True	0.178823	0.611417	0.574647	5559
AL035701.1	ENSG00000231769	222	0.625969	True	True	0.021673	0.078620	0.070820	5563
KIAA1324	ENSG00000116299	155	0.437051	True	True	0.015310	0.054929	0.048948	5564

	log2Mean	log2Mean_other	log2FC	percentage	percentage_other	percentage_fold_change	auroc	mwu_U	mwu_pval	mwu_qval
feature
LTB	6.078218	2.750911	3.327308	88.982513	41.377892	2.150485e+00	0.752276	138050716.5	0.000000	0.000000
TRAC	4.584767	1.860424	2.724343	72.988869	29.610968	2.464927e+00	0.715238	131253949.5	0.000000	0.000000
BCL11B	3.803313	1.143754	2.659559	64.610497	20.260496	3.188989e+00	0.731830	134298667.0	0.000000	0.000000
CD3D	4.584280	2.014685	2.569595	75.119240	32.243359	2.329759e+00	0.701153	128669176.5	0.000000	0.000000
CD3E	4.084703	1.814133	2.270570	69.030205	30.365038	2.273345e+00	0.686962	126064947.0	0.000000	0.000000
...	...	...	...	...	...	...	...	...	...	...
AL513008.1	0.001603	0.000000	0.001603	0.031797	0.000000	1.000000e+30	0.500159	91784550.0	0.002321	0.005037
AC141002.1	0.001573	0.000000	0.001573	0.031797	0.000000	1.000000e+30	0.500159	91784550.0	0.002321	0.005037
AP001972.3	0.001547	0.000000	0.001547	0.031797	0.000000	1.000000e+30	0.500159	91784550.0	0.002321	0.005037
AC093797.1	0.001542	0.000000	0.001542	0.031797	0.000000	1.000000e+30	0.500159	91784550.0	0.002321	0.005037
AC023051.1	0.001522	0.000000	0.001522	0.031797	0.000000	1.000000e+30	0.500159	91784550.0	0.002321	0.005037

Pegasus Tutorial¶

Count Matrix File¶

Preprocessing¶

Filtration¶

Normalization and Logarithmic Transformation¶

Highly Variable Gene Selection¶

Principal Component Analysis¶

Nearest Neighbors¶

Clustering and Visualization¶

Batch Correction¶

Repeat Previous Steps on the Corrected Data¶

Visualization¶

tSNE Plot¶

UMAP Plot¶

Differential Expression Analysis¶

Cell Type Annotation¶

Raw Count vs Log-norm Count¶

Cell Development Trajectory and Diffusion Map¶

Save Result to File¶

Read More...¶

	n_genes	Channel
barcodekey
MantonBM1_HiSeq_1-AAACCTGAGCAGGTCA	816	MantonBM1_HiSeq_1
MantonBM1_HiSeq_1-AAACCTGCACACTGCG	716	MantonBM1_HiSeq_1
MantonBM1_HiSeq_1-AAACCTGCACCGGAAA	554	MantonBM1_HiSeq_1
MantonBM1_HiSeq_1-AAACCTGCATAGACTC	967	MantonBM1_HiSeq_1
MantonBM1_HiSeq_1-AAACCTGCATCGATGT	1704	MantonBM1_HiSeq_1