1. 构建new_cell_data_set对象
Usage
new_cell_data_set(expression_data, cell_metadata = https://www.it610.com/article/NULL, gene_metadata = NULL)
Arguments
expression_data |
expression data matrix for an experiment, can be a sparseMatrix. |
cell_metadata |
【r语言|monocle3包分析单细胞转录组数据】data frame containing attributes of individual cells, where row.names(cell_metadata) = colnames(expression_data) . |
gene_metadata |
data frame containing attributes of features (e.g. genes), where row.names(gene_metadata) = row.names(expression_data) . |
# 设置工作目录
setwd("your/working/path")# 载入包
library(monocle3)
# packageVersion('monocle3') # monocle版本
# ls("package:monocle3")
library(ggplot2)
library(dplyr)# 1.分别读入表达矩阵,细胞注释数据框以及基因注释数据框,并构建new_cell_data_set对象
# Load the data
expression_matrix <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_expression.rds"))
cell_metadata <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_colData.rds"))
gene_annotation <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_rowData.rds"))# Make the CDS object
cds <- new_cell_data_set(expression_matrix,
cell_metadata = https://www.it610.com/article/cell_metadata,
gene_metadata = gene_annotation)# 如果表达矩阵不是"dgCMatrix",要转为sparseMatrix,加快下游的数据运算。
# cds <- new_cell_data_set(as(umi_matrix, "sparseMatrix"),
#cell_metadata = https://www.it610.com/article/cell_metadata,
#gene_metadata = gene_metadata)## 2. 从CellRanger的输出构建new_cell_data_set对象
## Provide the path to the Cell Ranger output.
#cds <- load_cellranger_data("~/Downloads/10x_data")
#cds <- load_mm_data(mat_path = "~/Downloads/matrix.mtx",
#feature_anno_path = "~/Downloads/features.tsv",
#cell_anno_path = "~/Downloads/barcodes.tsv")## 3. 合并new_cell_data_set对象
## combine cds objects
# make a fake second cds object for demonstration
#cds2 <- cds[1:100,]
#big_cds <- combine_cds(list(cds, cds2))
2. 数据预处理,降维 可以先降维看是否有批次效应,消除批次效应后要重新进行降维运算。
Usage
plot_cells( cds, x = 1, y = 2, reduction_method = c("UMAP", "tSNE", "PCA", "LSI", "Aligned"), color_cells_by = "cluster", group_cells_by = c("cluster", "partition"), genes = NULL, show_trajectory_graph = TRUE, trajectory_graph_color = "grey28", trajectory_graph_segment_size = 0.75, norm_method = c("log", "size_only"), label_cell_groups = TRUE, label_groups_by_cluster = TRUE, group_label_size = 2, labels_per_group = 1, label_branch_points = TRUE, label_roots = TRUE, label_leaves = TRUE, graph_label_size = 2, cell_size = 0.35, cell_stroke = I(cell_size/2), alpha = 1, min_expr = 0.1, rasterize = FALSE, scale_to_range = FALSE, label_principal_points = FALSE )
# 表型数据
pData(cds)
# 表达矩阵
exprs(cds)# preprocess_cds首先通过log和size factor (或仅仅size factor )对数据进行标准化,以解决深度差# 异。接下来,preprocess_cds计算一个低维空间,该空间将用作进一步降维的输入,如tSNE和UMAP。
cds <- preprocess_cds(cds, num_dim = 100)# whether the PCs capture most of the variation
# in gene expression across all the cells in the data set?
plot_pc_variance_explained(cds)# reduce_dimension,用pca(n=100)数据降维# 单细胞数量 >10,000, 设置umap.fast_sgd=TRUE,加快计算速度
cds <- reduce_dimension(cds)
#cds <- reduce_dimension(cds,reduction_method="tSNE")#获得降纬度后的数据矩阵
reducedDim(cds, "PCA")
reducedDim(cds, "UMAP")# 根据UMAP作图
plot_cells(cds)
#plot_cells(cds,reduction_method="tSNE")#check for batch effects
plot_cells(cds, color_cells_by="plate", label_cell_groups=FALSE)# 去除批次效应
cds = align_cds(cds, num_dim = 100, alignment_group = "plate")
plot_cells(cds, color_cells_by="plate", label_cell_groups=FALSE)# 根据表型加上颜色
#colnames(pData(cds))
plot_cells(cds, color_cells_by="cao_cell_type")
## 不加label
#plot_cells(cds, color_cells_by="cao_cell_type",label_cell_groups=FALSE)#plot_cells(cds, reduction_method="tSNE",color_cells_by="cao_cell_type") # 显示各细胞中特定基因的表达量
plot_cells(cds, genes=c("cpna-2", "egl-21", "ram-2", "inos-1"))
3. 聚类分析
cluster_cells( cds, reduction_method = c("UMAP", "tSNE", "PCA", "LSI", "Aligned"), k = 20, cluster_method = c("leiden", "louvain"), num_iter = 2, partition_qval = 0.05, weight = FALSE, resolution = NULL, random_seed = NULL, verbose = F, ... )
### 聚类
cds = cluster_cells(cds, resolution=1e-5)
# 根据cluster标记颜色
plot_cells(cds)
# cds = cluster_cells(cds, reduction_method="tSNE",resolution=1e-5)
# plot_cells(cds,reduction_method="tSNE")# 根据partition标记颜色
plot_cells(cds, color_cells_by="partition", group_cells_by="partition")# plot_cells(cds, color_cells_by="cao_cell_type")### Find marker genes expressed by each cluster
marker_test_res <- top_markers(cds, group_cells_by="partition",
reference_cells=1000, cores=8)
# colnames(marker_test_res)
top_specific_markers <- marker_test_res %>%
filter(fraction_expressing >= 0.10) %>%
group_by(cell_group) %>%
top_n(1, pseudo_R2)top_specific_marker_ids <- unique(top_specific_markers %>% pull(gene_id))
# plot the expression and fraction of cells that express each marker in each group
# 比较不同partition/cluster中marker基因的平均表达量
plot_genes_by_group(cds,
top_specific_marker_ids,
group_cells_by="partition",
ordering_type="maximal_on_diag",
max.size=3)
4. 细胞类型注释 细胞类型的注释是研究中最难的部分,可能含有新发现的细胞亚群,需要专家知识。细胞亚群往往需要进一步聚类,注释,最后合并一张图上(包含所有的细胞群及精细的注释信息)。
### 1. 手工注释(根据marker基因的表达...)。本例中仅仅把colData中的细胞类型注释到不同的cluster
colData(cds)$assigned_cell_type <- as.character(partitions(cds))
# rename细胞类型
colData(cds)$assigned_cell_type = dplyr::recode(colData(cds)$assigned_cell_type,
"1"="Germline",
"2"="Body wall muscle",
"3"="Unclassified neurons",
"4"="Vulval precursors",
"5"="Failed QC",
"6"="Seam cells",
"7"="Pharyngeal epithelia",
"8"="Coelomocytes",
"9"="Am/PH sheath cells",
"10"="Failed QC",
"11"="Touch receptor neurons",
"12"="Intestinal/rectal muscle",
"13"="Pharyngeal neurons",
"14"="NA",
"15"="flp-1(+) interneurons",
"16"="Canal associated neurons",
"17"="Ciliated sensory neurons",
"18"="Other interneurons",
"19"="Pharyngeal gland",
"20"="Failed QC",
"21"="Ciliated sensory neurons",
"22"="Oxygen sensory neurons",
"23"="Ciliated sensory neurons",
"24"="Ciliated sensory neurons",
"25"="Ciliated sensory neurons",
"26"="Ciliated sensory neurons",
"27"="Oxygen sensory neurons",
"28"="Ciliated sensory neurons",
"29"="Unclassified neurons",
"30"="Socket cells",
"31"="Failed QC",
"32"="Pharyngeal gland",
"33"="Ciliated sensory neurons",
"34"="Ciliated sensory neurons",
"35"="Ciliated sensory neurons",
"36"="Failed QC",
"37"="Ciliated sensory neurons",
"38"="Pharyngeal muscle")plot_cells(cds, group_cells_by="partition", color_cells_by="assigned_cell_type")# 交互选择子细胞群
cds_subset <- choose_cells(cds)## 对子细胞群进行聚类
cds_subset = cluster_cells(cds_subset, resolution=1e-2)
plot_cells(cds_subset, color_cells_by="cluster")
plot_cells(cds, color_cells_by="cluster")# 注释子细胞群中的细胞类型
colData(cds_subset)$assigned_cell_type <- as.character(clusters(cds_subset))
colData(cds_subset)$assigned_cell_type <- dplyr::recode(colData(cds_subset)$assigned_cell_type,
"1"="Somatic gonad precursors",
"2"="Somatic gonad precursors",
"3"="Vulval precursors",
"4"="Sex myoblasts",
"5"="Sex myoblasts",
"6"="Vulval precursors",
"7"="Failed QC",
"8"="Vulval precursors",
"10"="Unclassified neurons",
"11"="Distal tip cells")
plot_cells(cds_subset, group_cells_by="cluster",
color_cells_by="assigned_cell_type")# 在所有细胞群图中显示子细胞群的注释信息
colData(cds)[colnames(cds_subset),]$assigned_cell_type <- colData(cds_subset)$assigned_cell_type
cds <- cds[,colData(cds)$assigned_cell_type != "Failed QC" | is.na(colData(cds)$assigned_cell_type )]
plot_cells(cds, group_cells_by="partition",
color_cells_by="assigned_cell_type",
labels_per_group=5)### 2. Automated annotation with Garnett
assigned_type_marker_test_res <- top_markers(cds,
group_cells_by="assigned_cell_type",
reference_cells=1000,
cores=8)
# Require that markers have at least JS specificty score > 0.5 and
# be significant in the logistic test for identifying their cell type:
garnett_markers <- assigned_type_marker_test_res %>%
filter(marker_test_q_value < 0.01 & specificity >= 0.5) %>%
group_by(cell_group) %>%
top_n(5, marker_score)
# Exclude genes that are good markers for more than one cell type:
garnett_markers <- garnett_markers %>%
group_by(gene_short_name) %>%
filter(n() == 1)# 生成用于细胞类型注释的文件,可以根据专业知识修改用于细胞类型识别的基因
generate_garnett_marker_file(garnett_markers, file="./marker_file.txt")
colData(cds)$garnett_cluster <- clusters(cds)# 安装相应的包
BiocManager::install(c("org.Mm.eg.db", "org.Hs.eg.db")) # garnett依赖的包
devtools::install_github("cole-trapnell-lab/garnett", ref="monocle3")
BiocManager::install("org.Ce.eg.db")
library(garnett)# 训练分类器,训练好的分类器可以保存起来用于其他相似数据的注释
worm_classifier <- train_cell_classifier(cds = cds,
marker_file = "./marker_file.txt",
db=org.Ce.eg.db::org.Ce.eg.db,
cds_gene_id_type = "ENSEMBL",
num_unknown = 50,
marker_file_gene_id_type = "SYMBOL",
cores=8)#saveRDS(worm_classifier,"worm_classifier.RDS")
#worm_classifier <- readRDS("worm_classifier.RDS")# 根据分类器进行细胞类型注释
cds <- classify_cells(cds, worm_classifier,
db = org.Ce.eg.db::org.Ce.eg.db,
cluster_extend = TRUE,
cds_gene_id_type = "ENSEMBL")
plot_cells(cds,
group_cells_by="partition",
color_cells_by="cluster_ext_type")
5.差异表达基因分析 monocle3的差异分析有两种方法:
回归分析:使用fit_models(),您可以评估每个基因是否取决于时间、治疗等变量。
图形自相关分析:使用Graph_test(),您可以找到在轨迹上或簇之间变化的基因。
# 演示数据,取子集,对部分基因做差异表达分析
ciliated_genes <- c("che-1",
"hlh-17",
"nhr-6",
"dmd-6",
"ceh-36",
"ham-1")
cds_subset <- cds[rowData(cds)$gene_short_name %in% ciliated_genes,]### 1.回归分析差异表达基因
gene_fits <- fit_models(cds_subset, model_formula_str = "~embryo.time")
# fit_models:该函数适用于细胞数据集中每个基因的广义线性模型。
#可以提供公式来解释额外的协变量 ~embryo.time + batch
# (例如收集的天数、细胞基因型、培养基条件等)。# 主要参数1: model_formula_str 可以是~cluster or ~partition等colData列名
# 主要参数2:expression_family:Specifies the family function used for expression responses.
#Can be one of "quasipoisson", "negbinomial", "poisson", "binomial", "gaussian",
#"zipoisson", or "zinegbinomial". Default is "quasipoisson".class(gene_fits) # "tbl_df""tbl""data.frame" fit_coefs <- coefficient_table(gene_fits)
emb_time_terms <- fit_coefs %>% filter(term == "embryo.time")emb_time_terms %>% filter (q_value < 0.05) %>%
select(gene_short_name, term, q_value, estimate)plot_genes_violin(cds_subset, group_cells_by="embryo.time.bin", ncol=2) +
theme(axis.text.x=element_text(angle=45, hjust=1))# control batch effect
gene_fits <- fit_models(cds_subset, model_formula_str = "~embryo.time + batch")
# gene_fits2 <- fit_models(cds_subset, model_formula_str = "~cluster+ batch")
fit_coefs <- coefficient_table(gene_fits)
fit_coefs %>% filter(term != "(Intercept)") %>%
select(gene_short_name, term, q_value, estimate)evaluate_fits(gene_fits)# 模型比较,是否存在批次效应
time_batch_models <- fit_models(cds_subset,
model_formula_str = "~embryo.time + batch",
expression_family="negbinomial")
time_models <- fit_models(cds_subset,
model_formula_str = "~embryo.time",
expression_family="negbinomial")
compare_models(time_batch_models, time_models) %>% select(gene_short_name, q_value)# 比较结果:所有基因的似然比检验都是显著的,这表明数据中存在大量的批量效应。
# 因此,我们有理由将批处理项添加到我们的模型中。### 2.图形自相关分析差异表达基因
# 依据子细胞群在低维空间的位置,运用Moran’s I 检验,检测空间相关的差异表达基因
pr_graph_test_res <- graph_test(cds_subset, neighbor_graph="knn", cores=8)
pr_deg_ids <- row.names(subset(pr_graph_test_res,
morans_I > 0.01 & q_value < 0.05))
#将基因聚集成跨细胞共表达的模块
gene_module_df <- find_gene_modules(cds_subset[pr_deg_ids,], resolution=1e-3)
plot_cells(cds_subset, genes=gene_module_df,
show_trajectory_graph=FALSE,
label_cell_groups=FALSE)
6. 单细胞伪时间轨迹分析
####### 单细胞伪时间轨迹分析#######
###1.构建cell_data_set对象
expression_matrix <- readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_expression.rds"))
cell_metadata <- readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_colData.rds"))
gene_annotation <- readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_rowData.rds"))cds <- new_cell_data_set(expression_matrix,
cell_metadata = https://www.it610.com/article/cell_metadata,
gene_metadata = gene_annotation)### 2.预处理
#normalization and dimension reduction.cds <- preprocess_cds(cds, num_dim = 50)
# plot_pc_variance_explained(cds) # 查看pca的分量解释多少变异# remove batch effects
head(colData(cds))
cds <- align_cds(cds, alignment_group ="batch",
residual_model_formula_str = "~ bg.300.loading + bg.400.loading + bg.500.1.loading + bg.500.2.loading + bg.r17.loading + bg.b01.loading + bg.b02.loading")# alignment_group:指定用于对齐单元格组的colData列的字符串。
# 指定的列必须是一个因子。对齐可用于以非线性方式减去批次效果。
# 要更正连续效果,请使用残差模型公式。默认值为空。# residual_model_formula_str:NULL或字符串模型公式,
# 指定在降维之前从数据中减去的任何影响。使用线性模型减去效果。
# 对于非线性效果,请使用对齐组。默认值为空。# 使用pca处理的数据进一步降维以便聚类和轨迹推断
cds <- reduce_dimension(cds)
plot_cells(cds, label_groups_by_cluster=FALSE,color_cells_by = "cell.type")# 图中显示基因表达情况
ciliated_genes <- c("che-1",
"hlh-17",
"nhr-6",
"dmd-6",
"ceh-36",
"ham-1")plot_cells(cds,
genes=ciliated_genes,
label_cell_groups=FALSE,
show_trajectory_graph=FALSE)### 3.单细胞聚类
cds <- cluster_cells(cds)
plot_cells(cds, color_cells_by = "partition")
# partition和cluster有区别
# plot_cells(cds, color_cells_by = "cluster")### 4. 学习伪时间轨迹图
# 伪时间是衡量单个细胞在细胞分化等过程中所取得进展的指标。
cds <- learn_graph(cds)
plot_cells(cds,
color_cells_by = "cell.type",
label_groups_by_cluster=FALSE,
label_leaves=FALSE,
label_branch_points=FALSE)# 在轨迹图标记胚胎时间段,
# 黑线表示图形的结构,黑圈表示分支节点,浅灰色圆圈表示细胞状态
plot_cells(cds,
color_cells_by = "embryo.time.bin",
label_cell_groups=FALSE,
label_leaves=TRUE,
label_branch_points=TRUE,
graph_label_size=1.5)
# 交互选择根节点(最初状态的细胞)
cds <- order_cells(cds)# 伪时间轨迹图作图,灰色表示无限的伪时间,因为从拾取的根节点无法访问它们。
# 一般一个partition选择一个root,就能解决无限伪时间问题
plot_cells(cds,
color_cells_by = "pseudotime",
label_cell_groups=FALSE,
label_leaves=FALSE,
label_branch_points=FALSE,
graph_label_size=1.5)# # 不显示轨迹图
# plot_cells(cds,
#color_cells_by = "pseudotime",
#label_cell_groups=FALSE,
#label_leaves=FALSE,
#label_branch_points=FALSE,
#show_trajectory_graph = FALSE)## 自动选择根节点(root)
# a helper function to identify the root principal points:
get_earliest_principal_node <- function(cds, time_bin="130-170"){
cell_ids <- which(colData(cds)[, "embryo.time.bin"] == time_bin)closest_vertex <-
cds@principal_graph_aux[["UMAP"]]$pr_graph_cell_proj_closest_vertex
closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])
root_pr_nodes <-
igraph::V(principal_graph(cds)[["UMAP"]])$name[as.numeric(names
(which.max(table(closest_vertex[cell_ids,]))))]root_pr_nodes
}
cds <- order_cells(cds, root_pr_nodes=get_earliest_principal_node(cds))plot_cells(cds,
color_cells_by = "pseudotime",
label_cell_groups=FALSE,
label_leaves=FALSE,
label_branch_points=FALSE,
show_trajectory_graph = FALSE)# 沿轨迹图交互选择起始节点以及终节点的单细胞
cds_sub <- choose_graph_segments(cds)### 5. 单细胞轨迹图上显示差异表达基因的表达量
ciliated_cds_pr_test_res <- graph_test(cds, neighbor_graph="principal_graph", cores=4)
pr_deg_ids <- row.names(subset(ciliated_cds_pr_test_res, q_value < 0.05))
plot_cells(cds, genes=c("hlh-4", "gcy-8", "dac-1", "oig-8"),
show_trajectory_graph=FALSE,
label_cell_groups=FALSE,
label_leaves=FALSE)
### 6. 将沿单细胞轨迹图的差异表达基因进行模块聚类分析,共调节模块分析
gene_module_df <- find_gene_modules(cds[pr_deg_ids,], resolution=c(0,10^seq(-6,-1)))cell_group_df <- tibble::tibble(cell=row.names(colData(cds)),
cell_group=colData(cds)$cell.type)
# 聚合表达矩阵:行为module,列为cell.type
agg_mat <- aggregate_gene_expression(cds, gene_module_df, cell_group_df)
row.names(agg_mat) <- stringr::str_c("Module ", row.names(agg_mat))
# 可视化:热图
pheatmap::pheatmap(agg_mat,
scale="column", clustering_method="ward.D2")# 比较不同module的基因聚合表达值
plot_cells(cds,
genes=gene_module_df %>% filter(module %in% c(27, 10, 7, 30)),
label_cell_groups=FALSE,
show_trajectory_graph=FALSE)
# 特定基因在特定细胞类型中的伪时间轨迹图中的表达量作图
AFD_genes <- c("gcy-8", "dac-1", "oig-8")
AFD_lineage_cds <- cds[rowData(cds)$gene_short_name %in% AFD_genes,
colData(cds)$cell.type %in% c("AFD")]plot_genes_in_pseudotime(AFD_lineage_cds,
color_cells_by="embryo.time.bin",
min_expr=0.5)### 7. 3d单细胞伪时间轨迹图
cds_3d <- reduce_dimension(cds, max_components = 3)
cds_3d <- cluster_cells(cds_3d)
cds_3d <- learn_graph(cds_3d)
cds_3d <- order_cells(cds_3d, root_pr_nodes=get_earliest_principal_node(cds))cds_3d_plot_obj <- plot_cells_3d(cds_3d, color_cells_by="partition")
参考:https://cole-trapnell-lab.github.io/monocle3/docs/clustering/
推荐阅读
- Monocle3
- R|解决R 3.6.0下rvcheck版本过高导致安装clusterProfiler失败
- R语言|R 语言读写文件
- R语言|R语言基于ggplot绘制多条ROC曲线(2)
- 机器学习|NTU20220813-数据结构化和深度学习-WSL音频转发
- r语言|R语言 —— 多元线性回归
- 科研人生|刘小乐教授(我与生物信息学的不解之缘)
- R语言观察日志(part25)--将某列设置为行名
- r语言|统计学--基于R(第3版)(基于R应用的统计学丛书)作者(贾俊平 习题答案 第九章)