r语言|monocle3包分析单细胞转录组数据 r语言|生物信息学

1. 构建new_cell_data_set对象 Usage

new_cell_data_set(expression_data, cell_metadata = https://www.it610.com/article/NULL, gene_metadata = NULL)

Arguments

`expression_data`	expression data matrix for an experiment, can be a sparseMatrix.
`cell_metadata`	【r语言\|monocle3包分析单细胞转录组数据】data frame containing attributes of individual cells, where `row.names(cell_metadata) = colnames(expression_data)`.
`gene_metadata`	data frame containing attributes of features (e.g. genes), where `row.names(gene_metadata) = row.names(expression_data)`.

# 设置工作目录 setwd("your/working/path")# 载入包 library(monocle3) # packageVersion('monocle3') # monocle版本 # ls("package:monocle3") library(ggplot2) library(dplyr)# 1.分别读入表达矩阵，细胞注释数据框以及基因注释数据框，并构建new_cell_data_set对象 # Load the data expression_matrix <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_expression.rds")) cell_metadata <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_colData.rds")) gene_annotation <- readRDS(url("http://staff.washington.edu/hpliner/data/cao_l2_rowData.rds"))# Make the CDS object cds <- new_cell_data_set(expression_matrix, cell_metadata = https://www.it610.com/article/cell_metadata, gene_metadata = gene_annotation)# 如果表达矩阵不是"dgCMatrix"，要转为sparseMatrix，加快下游的数据运算。 # cds <- new_cell_data_set(as(umi_matrix, "sparseMatrix"), #cell_metadata = https://www.it610.com/article/cell_metadata, #gene_metadata = gene_metadata)## 2. 从CellRanger的输出构建new_cell_data_set对象 ## Provide the path to the Cell Ranger output. #cds <- load_cellranger_data("~/Downloads/10x_data") #cds <- load_mm_data(mat_path = "~/Downloads/matrix.mtx", #feature_anno_path = "~/Downloads/features.tsv", #cell_anno_path = "~/Downloads/barcodes.tsv")## 3. 合并new_cell_data_set对象 ## combine cds objects # make a fake second cds object for demonstration #cds2 <- cds[1:100,] #big_cds <- combine_cds(list(cds, cds2))

2. 数据预处理，降维可以先降维看是否有批次效应，消除批次效应后要重新进行降维运算。

Usage

plot_cells( cds, x = 1, y = 2, reduction_method = c("UMAP", "tSNE", "PCA", "LSI", "Aligned"), color_cells_by = "cluster", group_cells_by = c("cluster", "partition"), genes = NULL, show_trajectory_graph = TRUE, trajectory_graph_color = "grey28", trajectory_graph_segment_size = 0.75, norm_method = c("log", "size_only"), label_cell_groups = TRUE, label_groups_by_cluster = TRUE, group_label_size = 2, labels_per_group = 1, label_branch_points = TRUE, label_roots = TRUE, label_leaves = TRUE, graph_label_size = 2, cell_size = 0.35, cell_stroke = I(cell_size/2), alpha = 1, min_expr = 0.1, rasterize = FALSE, scale_to_range = FALSE, label_principal_points = FALSE )

# 表型数据 pData(cds) # 表达矩阵 exprs(cds)# preprocess_cds首先通过log和size factor (或仅仅size factor )对数据进行标准化，以解决深度差# 异。接下来，preprocess_cds计算一个低维空间，该空间将用作进一步降维的输入，如tSNE和UMAP。 cds <- preprocess_cds(cds, num_dim = 100)# whether the PCs capture most of the variation # in gene expression across all the cells in the data set? plot_pc_variance_explained(cds)# reduce_dimension,用pca（n=100）数据降维# 单细胞数量 >10,000, 设置umap.fast_sgd=TRUE，加快计算速度 cds <- reduce_dimension(cds) #cds <- reduce_dimension(cds,reduction_method="tSNE")#获得降纬度后的数据矩阵 reducedDim(cds, "PCA") reducedDim(cds, "UMAP")# 根据UMAP作图 plot_cells(cds) #plot_cells(cds,reduction_method="tSNE")#check for batch effects plot_cells(cds, color_cells_by="plate", label_cell_groups=FALSE)# 去除批次效应 cds = align_cds(cds, num_dim = 100, alignment_group = "plate") plot_cells(cds, color_cells_by="plate", label_cell_groups=FALSE)# 根据表型加上颜色 #colnames(pData(cds)) plot_cells(cds, color_cells_by="cao_cell_type") ## 不加label #plot_cells(cds, color_cells_by="cao_cell_type",label_cell_groups=FALSE)#plot_cells(cds, reduction_method="tSNE",color_cells_by="cao_cell_type") # 显示各细胞中特定基因的表达量 plot_cells(cds, genes=c("cpna-2", "egl-21", "ram-2", "inos-1"))

3. 聚类分析

cluster_cells( cds, reduction_method = c("UMAP", "tSNE", "PCA", "LSI", "Aligned"), k = 20, cluster_method = c("leiden", "louvain"), num_iter = 2, partition_qval = 0.05, weight = FALSE, resolution = NULL, random_seed = NULL, verbose = F, ... )

### 聚类 cds = cluster_cells(cds, resolution=1e-5) # 根据cluster标记颜色 plot_cells(cds) # cds = cluster_cells(cds, reduction_method="tSNE",resolution=1e-5) # plot_cells(cds,reduction_method="tSNE")# 根据partition标记颜色 plot_cells(cds, color_cells_by="partition", group_cells_by="partition")# plot_cells(cds, color_cells_by="cao_cell_type")### Find marker genes expressed by each cluster marker_test_res <- top_markers(cds, group_cells_by="partition", reference_cells=1000, cores=8) # colnames(marker_test_res) top_specific_markers <- marker_test_res %>% filter(fraction_expressing >= 0.10) %>% group_by(cell_group) %>% top_n(1, pseudo_R2)top_specific_marker_ids <- unique(top_specific_markers %>% pull(gene_id)) # plot the expression and fraction of cells that express each marker in each group # 比较不同partition/cluster中marker基因的平均表达量 plot_genes_by_group(cds, top_specific_marker_ids, group_cells_by="partition", ordering_type="maximal_on_diag", max.size=3)

4. 细胞类型注释细胞类型的注释是研究中最难的部分，可能含有新发现的细胞亚群，需要专家知识。细胞亚群往往需要进一步聚类，注释，最后合并一张图上（包含所有的细胞群及精细的注释信息）。

### 1. 手工注释（根据marker基因的表达...）。本例中仅仅把colData中的细胞类型注释到不同的cluster colData(cds)$assigned_cell_type <- as.character(partitions(cds)) # rename细胞类型 colData(cds)$assigned_cell_type = dplyr::recode(colData(cds)$assigned_cell_type, "1"="Germline", "2"="Body wall muscle", "3"="Unclassified neurons", "4"="Vulval precursors", "5"="Failed QC", "6"="Seam cells", "7"="Pharyngeal epithelia", "8"="Coelomocytes", "9"="Am/PH sheath cells", "10"="Failed QC", "11"="Touch receptor neurons", "12"="Intestinal/rectal muscle", "13"="Pharyngeal neurons", "14"="NA", "15"="flp-1(+) interneurons", "16"="Canal associated neurons", "17"="Ciliated sensory neurons", "18"="Other interneurons", "19"="Pharyngeal gland", "20"="Failed QC", "21"="Ciliated sensory neurons", "22"="Oxygen sensory neurons", "23"="Ciliated sensory neurons", "24"="Ciliated sensory neurons", "25"="Ciliated sensory neurons", "26"="Ciliated sensory neurons", "27"="Oxygen sensory neurons", "28"="Ciliated sensory neurons", "29"="Unclassified neurons", "30"="Socket cells", "31"="Failed QC", "32"="Pharyngeal gland", "33"="Ciliated sensory neurons", "34"="Ciliated sensory neurons", "35"="Ciliated sensory neurons", "36"="Failed QC", "37"="Ciliated sensory neurons", "38"="Pharyngeal muscle")plot_cells(cds, group_cells_by="partition", color_cells_by="assigned_cell_type")# 交互选择子细胞群 cds_subset <- choose_cells(cds)## 对子细胞群进行聚类 cds_subset = cluster_cells(cds_subset, resolution=1e-2) plot_cells(cds_subset, color_cells_by="cluster") plot_cells(cds, color_cells_by="cluster")# 注释子细胞群中的细胞类型 colData(cds_subset)$assigned_cell_type <- as.character(clusters(cds_subset)) colData(cds_subset)$assigned_cell_type <- dplyr::recode(colData(cds_subset)$assigned_cell_type, "1"="Somatic gonad precursors", "2"="Somatic gonad precursors", "3"="Vulval precursors", "4"="Sex myoblasts", "5"="Sex myoblasts", "6"="Vulval precursors", "7"="Failed QC", "8"="Vulval precursors", "10"="Unclassified neurons", "11"="Distal tip cells") plot_cells(cds_subset, group_cells_by="cluster", color_cells_by="assigned_cell_type")# 在所有细胞群图中显示子细胞群的注释信息 colData(cds)[colnames(cds_subset),]$assigned_cell_type <- colData(cds_subset)$assigned_cell_type cds <- cds[,colData(cds)$assigned_cell_type != "Failed QC" | is.na(colData(cds)$assigned_cell_type )] plot_cells(cds, group_cells_by="partition", color_cells_by="assigned_cell_type", labels_per_group=5)### 2. Automated annotation with Garnett assigned_type_marker_test_res <- top_markers(cds, group_cells_by="assigned_cell_type", reference_cells=1000, cores=8) # Require that markers have at least JS specificty score > 0.5 and # be significant in the logistic test for identifying their cell type: garnett_markers <- assigned_type_marker_test_res %>% filter(marker_test_q_value < 0.01 & specificity >= 0.5) %>% group_by(cell_group) %>% top_n(5, marker_score) # Exclude genes that are good markers for more than one cell type: garnett_markers <- garnett_markers %>% group_by(gene_short_name) %>% filter(n() == 1)# 生成用于细胞类型注释的文件，可以根据专业知识修改用于细胞类型识别的基因 generate_garnett_marker_file(garnett_markers, file="./marker_file.txt") colData(cds)$garnett_cluster <- clusters(cds)# 安装相应的包 BiocManager::install(c("org.Mm.eg.db", "org.Hs.eg.db")) # garnett依赖的包 devtools::install_github("cole-trapnell-lab/garnett", ref="monocle3") BiocManager::install("org.Ce.eg.db") library(garnett)# 训练分类器，训练好的分类器可以保存起来用于其他相似数据的注释 worm_classifier <- train_cell_classifier(cds = cds, marker_file = "./marker_file.txt", db=org.Ce.eg.db::org.Ce.eg.db, cds_gene_id_type = "ENSEMBL", num_unknown = 50, marker_file_gene_id_type = "SYMBOL", cores=8)#saveRDS(worm_classifier,"worm_classifier.RDS") #worm_classifier <- readRDS("worm_classifier.RDS")# 根据分类器进行细胞类型注释 cds <- classify_cells(cds, worm_classifier, db = org.Ce.eg.db::org.Ce.eg.db, cluster_extend = TRUE, cds_gene_id_type = "ENSEMBL") plot_cells(cds, group_cells_by="partition", color_cells_by="cluster_ext_type")

5.差异表达基因分析 monocle3的差异分析有两种方法：
回归分析：使用fit_models（），您可以评估每个基因是否取决于时间、治疗等变量。
图形自相关分析：使用Graph_test（），您可以找到在轨迹上或簇之间变化的基因。

# 演示数据，取子集，对部分基因做差异表达分析 ciliated_genes <- c("che-1", "hlh-17", "nhr-6", "dmd-6", "ceh-36", "ham-1") cds_subset <- cds[rowData(cds)$gene_short_name %in% ciliated_genes,]### 1.回归分析差异表达基因 gene_fits <- fit_models(cds_subset, model_formula_str = "~embryo.time") # fit_models:该函数适用于细胞数据集中每个基因的广义线性模型。 #可以提供公式来解释额外的协变量 ~embryo.time + batch # （例如收集的天数、细胞基因型、培养基条件等）。# 主要参数1: model_formula_str 可以是~cluster or ~partition等colData列名 # 主要参数2:expression_family:Specifies the family function used for expression responses. #Can be one of "quasipoisson", "negbinomial", "poisson", "binomial", "gaussian", #"zipoisson", or "zinegbinomial". Default is "quasipoisson".class(gene_fits) # "tbl_df""tbl""data.frame" fit_coefs <- coefficient_table(gene_fits) emb_time_terms <- fit_coefs %>% filter(term == "embryo.time")emb_time_terms %>% filter (q_value < 0.05) %>% select(gene_short_name, term, q_value, estimate)plot_genes_violin(cds_subset, group_cells_by="embryo.time.bin", ncol=2) + theme(axis.text.x=element_text(angle=45, hjust=1))# control batch effect gene_fits <- fit_models(cds_subset, model_formula_str = "~embryo.time + batch") # gene_fits2 <- fit_models(cds_subset, model_formula_str = "~cluster+ batch") fit_coefs <- coefficient_table(gene_fits) fit_coefs %>% filter(term != "(Intercept)") %>% select(gene_short_name, term, q_value, estimate)evaluate_fits(gene_fits)# 模型比较，是否存在批次效应 time_batch_models <- fit_models(cds_subset, model_formula_str = "~embryo.time + batch", expression_family="negbinomial") time_models <- fit_models(cds_subset, model_formula_str = "~embryo.time", expression_family="negbinomial") compare_models(time_batch_models, time_models) %>% select(gene_short_name, q_value)# 比较结果：所有基因的似然比检验都是显著的，这表明数据中存在大量的批量效应。 # 因此，我们有理由将批处理项添加到我们的模型中。### 2.图形自相关分析差异表达基因 # 依据子细胞群在低维空间的位置，运用Moran’s I 检验，检测空间相关的差异表达基因 pr_graph_test_res <- graph_test(cds_subset, neighbor_graph="knn", cores=8) pr_deg_ids <- row.names(subset(pr_graph_test_res, morans_I > 0.01 & q_value < 0.05)) #将基因聚集成跨细胞共表达的模块 gene_module_df <- find_gene_modules(cds_subset[pr_deg_ids,], resolution=1e-3) plot_cells(cds_subset, genes=gene_module_df, show_trajectory_graph=FALSE, label_cell_groups=FALSE)

6. 单细胞伪时间轨迹分析

####### 单细胞伪时间轨迹分析####### ###1.构建cell_data_set对象 expression_matrix <- readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_expression.rds")) cell_metadata <- readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_colData.rds")) gene_annotation <- readRDS(url("http://staff.washington.edu/hpliner/data/packer_embryo_rowData.rds"))cds <- new_cell_data_set(expression_matrix, cell_metadata = https://www.it610.com/article/cell_metadata, gene_metadata = gene_annotation)### 2.预处理 #normalization and dimension reduction.cds <- preprocess_cds(cds, num_dim = 50) # plot_pc_variance_explained(cds) # 查看pca的分量解释多少变异# remove batch effects head(colData(cds)) cds <- align_cds(cds, alignment_group ="batch", residual_model_formula_str = "~ bg.300.loading + bg.400.loading + bg.500.1.loading + bg.500.2.loading + bg.r17.loading + bg.b01.loading + bg.b02.loading")# alignment_group:指定用于对齐单元格组的colData列的字符串。 # 指定的列必须是一个因子。对齐可用于以非线性方式减去批次效果。 # 要更正连续效果，请使用残差模型公式。默认值为空。# residual_model_formula_str:NULL或字符串模型公式， # 指定在降维之前从数据中减去的任何影响。使用线性模型减去效果。 # 对于非线性效果，请使用对齐组。默认值为空。# 使用pca处理的数据进一步降维以便聚类和轨迹推断 cds <- reduce_dimension(cds) plot_cells(cds, label_groups_by_cluster=FALSE,color_cells_by = "cell.type")# 图中显示基因表达情况 ciliated_genes <- c("che-1", "hlh-17", "nhr-6", "dmd-6", "ceh-36", "ham-1")plot_cells(cds, genes=ciliated_genes, label_cell_groups=FALSE, show_trajectory_graph=FALSE)### 3.单细胞聚类 cds <- cluster_cells(cds) plot_cells(cds, color_cells_by = "partition") # partition和cluster有区别 # plot_cells(cds, color_cells_by = "cluster")### 4. 学习伪时间轨迹图 # 伪时间是衡量单个细胞在细胞分化等过程中所取得进展的指标。 cds <- learn_graph(cds) plot_cells(cds, color_cells_by = "cell.type", label_groups_by_cluster=FALSE, label_leaves=FALSE, label_branch_points=FALSE)# 在轨迹图标记胚胎时间段， # 黑线表示图形的结构，黑圈表示分支节点，浅灰色圆圈表示细胞状态 plot_cells(cds, color_cells_by = "embryo.time.bin", label_cell_groups=FALSE, label_leaves=TRUE, label_branch_points=TRUE, graph_label_size=1.5) # 交互选择根节点（最初状态的细胞） cds <- order_cells(cds)# 伪时间轨迹图作图，灰色表示无限的伪时间，因为从拾取的根节点无法访问它们。 # 一般一个partition选择一个root，就能解决无限伪时间问题 plot_cells(cds, color_cells_by = "pseudotime", label_cell_groups=FALSE, label_leaves=FALSE, label_branch_points=FALSE, graph_label_size=1.5)# # 不显示轨迹图 # plot_cells(cds, #color_cells_by = "pseudotime", #label_cell_groups=FALSE, #label_leaves=FALSE, #label_branch_points=FALSE, #show_trajectory_graph = FALSE)## 自动选择根节点（root） # a helper function to identify the root principal points: get_earliest_principal_node <- function(cds, time_bin="130-170"){ cell_ids <- which(colData(cds)[, "embryo.time.bin"] == time_bin)closest_vertex <- cds@principal_graph_aux[["UMAP"]]$pr_graph_cell_proj_closest_vertex closest_vertex <- as.matrix(closest_vertex[colnames(cds), ]) root_pr_nodes <- igraph::V(principal_graph(cds)[["UMAP"]])$name[as.numeric(names (which.max(table(closest_vertex[cell_ids,]))))]root_pr_nodes } cds <- order_cells(cds, root_pr_nodes=get_earliest_principal_node(cds))plot_cells(cds, color_cells_by = "pseudotime", label_cell_groups=FALSE, label_leaves=FALSE, label_branch_points=FALSE, show_trajectory_graph = FALSE)# 沿轨迹图交互选择起始节点以及终节点的单细胞 cds_sub <- choose_graph_segments(cds)### 5. 单细胞轨迹图上显示差异表达基因的表达量 ciliated_cds_pr_test_res <- graph_test(cds, neighbor_graph="principal_graph", cores=4) pr_deg_ids <- row.names(subset(ciliated_cds_pr_test_res, q_value < 0.05)) plot_cells(cds, genes=c("hlh-4", "gcy-8", "dac-1", "oig-8"), show_trajectory_graph=FALSE, label_cell_groups=FALSE, label_leaves=FALSE) ### 6. 将沿单细胞轨迹图的差异表达基因进行模块聚类分析，共调节模块分析 gene_module_df <- find_gene_modules(cds[pr_deg_ids,], resolution=c(0,10^seq(-6,-1)))cell_group_df <- tibble::tibble(cell=row.names(colData(cds)), cell_group=colData(cds)$cell.type) # 聚合表达矩阵:行为module，列为cell.type agg_mat <- aggregate_gene_expression(cds, gene_module_df, cell_group_df) row.names(agg_mat) <- stringr::str_c("Module ", row.names(agg_mat)) # 可视化:热图 pheatmap::pheatmap(agg_mat, scale="column", clustering_method="ward.D2")# 比较不同module的基因聚合表达值 plot_cells(cds, genes=gene_module_df %>% filter(module %in% c(27, 10, 7, 30)), label_cell_groups=FALSE, show_trajectory_graph=FALSE) # 特定基因在特定细胞类型中的伪时间轨迹图中的表达量作图 AFD_genes <- c("gcy-8", "dac-1", "oig-8") AFD_lineage_cds <- cds[rowData(cds)$gene_short_name %in% AFD_genes, colData(cds)$cell.type %in% c("AFD")]plot_genes_in_pseudotime(AFD_lineage_cds, color_cells_by="embryo.time.bin", min_expr=0.5)### 7. 3d单细胞伪时间轨迹图 cds_3d <- reduce_dimension(cds, max_components = 3) cds_3d <- cluster_cells(cds_3d) cds_3d <- learn_graph(cds_3d) cds_3d <- order_cells(cds_3d, root_pr_nodes=get_earliest_principal_node(cds))cds_3d_plot_obj <- plot_cells_3d(cds_3d, color_cells_by="partition")

参考：https://cole-trapnell-lab.github.io/monocle3/docs/clustering/