写在前面:经常做转录组分析,就是把差异基因搞个火山图和Venn图看各组差异基因的共有和特有情况。看见有个比较好的选择,能直观比较各种处理带来的影响,如下:
image.png
来自Nature plants的一篇文章
Ref:
https://github.com/YulongNiu/MPIPZ_microbe-host_homeostasis
https://www.nature.com/articles/s41477-021-00920-2
这个图很牛逼啊,表示的信息量很全,值得学习。去扒作者的代码,复现出了大部分
所需文件:
总的基因丰度表,即各个基因在每个样品中的丰度
image.png
各个样品的基因差异表达情况
举一个例子,这是Deseq2算出来的:
image.png
1. 导入数据并做一些处理
setwd("C:/Users/yjk/Desktop/cluster_DEGs")
library("dplyr")
library("ComplexHeatmap")
library("tibble")
library("RColorBrewer")
library("ggplot2")
my_theme()
all_genes <- read.table("all_genes.txt", header = TRUE) # fpkm
# DESeq2获得的差异表达基因(DEGs), |log2foldchange| > 1, padj < 0.05
HF12N_Rs <- read.table("HF12N_Rs.txt", header = TRUE, sep = "\t")
HF12N_S <- read.table("HF12N_S.txt", header = TRUE, sep = "\t")
HF12Rs_N_S <- read.table("HF12Rs_N_S.txt", header = TRUE, sep = "\t")
HF12S_Rs <- read.table("HF12S_Rs.txt", header = TRUE, sep = "\t")
HG64N_Rs <- read.table("HG64N_Rs.txt", header = TRUE, sep = "\t")
HG64N_S <- read.table("HG64N_S.txt", header = TRUE, sep = "\t")
HG64Rs_N_S <- read.table("HG64Rs_N_S.txt", header = TRUE, sep = "\t")
HG64S_Rs <- read.table("HG64S_Rs.txt", header = TRUE, sep = "\t")
# 合并差异表达和基因丰度
all <- HF12N_Rs %>%
full_join(HF12N_S) %>%
full_join(HF12Rs_N_S) %>%
full_join(HF12S_Rs) %>%
full_join(HG64N_Rs) %>%
full_join(HG64N_S) %>%
full_join(HG64Rs_N_S) %>%
full_join(HG64S_Rs) %>%
left_join(all_genes)
all[is.na(all)] <- "No"
## mean func
meanGp <- function(v) {
require('magrittr')
res <- v %>%
split(rep(1 : 8, each = 3)) %>%
sapply(mean, na.rm = TRUE)
return(res)
}
all_for_cluster <- select(all, -contains("vs")) # 只选择原始fpkm数据
rownames(all_for_cluster) <- all_for_cluster$id
all_for_cluster <- all_for_cluster[-1]
## sample name
sampleN <- c("HG64NRs","HG64SRs","HG64N","HG64S",
"HF12NRs","HF12SRs","HF12N","HF12S")
meanCount <- all_for_cluster %>%
apply(1, meanGp) %>%
t
colnames(meanCount) <- sampleN
## scale
scaleCount <- meanCount %>%
t %>%
scale %>%
t
scaleCount %<>% .[complete.cases(.), ]
2. 然后要先计算多少个cluster合适:
# set.seed(123) 聚类结果一致
set.seed(123)
## choose cluster num
## 1. sum of squared error
wss <- (nrow(scaleCount) - 1) * sum(apply(scaleCount, 2, var))
for (i in 2:20) {
wss[i] <- sum(kmeans(scaleCount,
centers=i,
algorithm = 'MacQueen')$withinss)
}
ggplot(tibble(k = 1:20, wss = wss), aes(k, wss)) +
geom_point(colour = '#D55E00', size = 3) +
geom_line(linetype = 'dashed') +
xlab('Number of clusters') +
ylab('Sum of squared error')
# ggsave("Sum_of_squared_error.pdf", height = 3, width = 4)
## 2. Akaike information criterion
kmeansAIC = function(fit){
m = ncol(fit$centers)
n = length(fit$cluster)
k = nrow(fit$centers)
D = fit$tot.withinss
return(D + 2*m*k)
}
aic <- numeric(20)
for (i in 1:20) {
fit <- kmeans(x = scaleCount, centers = i, algorithm = 'MacQueen')
aic[i] <- kmeansAIC(fit)
}
ggplot(tibble(k = 1:20, aic = aic), aes(k, wss)) +
geom_point(colour = '#009E73', size = 3) +
geom_line(linetype = 'dashed') +
xlab('Number of clusters') +
ylab('Akaike information criterion')
# ggsave("Akaike_information_criterion.pdf", height = 3, width = 4)
# choose cluster 10
withinss: Vector of within-cluster sum of squares, one component per cluster.
我们上面计算的是withinss,即cluster内的误差平方和,同一个cluster趋势越一致则越小。所以cluster越多则这个值越小,这和我们认知一致。但是,cluster太多了信息很杂,太少了就成了生拉硬扯成cluster了
image.png
可以看出选则10比较合适
利用另外一个参数:赤池信息量准则(Akaike information criterion,简称AIC)来衡量
AIC介绍,https://www.biaodianfu.com/aic-bic.html
理论上来讲,增加自由参数的数目可以提高拟合的优良性,但是过多,模型过于复杂容易造成过拟合现象。因此,AIC不仅要提高模型拟合度(极大似然),而且引入了惩罚项,使模型参数尽可能少,有助于降低过拟合的可能性。
image.png
可以看出,两种方法,同样结果。选择10
3. 然后聚类:
kclust10 <- kmeans(scaleCount, centers = 10, algorithm = "MacQueen", nstart = 1000, iter.max = 20)
cl <- as.data.frame(kclust10$cluster) %>%
rownames_to_column("id") %>%
set_colnames(c("id","cl"))
heat_all <- all %>% left_join(cl)
# 把所有DEGs的cluster信息保存,为后面富集分析所用
degs_cl <- heat_all %>%
select(c("id","cl"))
write.table(degs_cl, "./enrichment/degs_cl.txt", sep = "\t", quote = FALSE)
scaleC <- heat_all %>%
select(-contains("vs")) %>%
select(-c("id","cl")) %>%
t %>%
scale %>%
t %>%
as_tibble %>%
bind_cols(heat_all %>% select(id, cl))
# 排序
scaleC <- scaleC[,c("HG64S1", "HG64S2", "HG64S3",
"HG64N1", "HG64N2", "HG64N3",
"HG64SRs1", "HG64SRs2", "HG64SRs3",
"HG64NRs1", "HG64NRs2", "HG64NRs3",
"HF12S1", "HF12S2", "HF12S3",
"HF12N1", "HF12N2", "HF12N3",
"HF12SRs1", "HF12SRs2", "HF12SRs3",
"HF12NRs1", "HF12NRs2", "HF12NRs3",
"id", "cl")]
## col annotation
NatSoil <- HeatmapAnnotation(NatSoil = c(rep(rep(c("No", "Yes", "No", "Yes"), each = 3),2)),
col = list(NatSoil = c("Yes" = "grey", "No" = "white")),
gp = gpar(col = "black"))
Rs <- HeatmapAnnotation(Rs = c(rep(rep(c("No", "Yes"), each = 6),2)),
col = list(Rs = c("Yes" = "grey", "No" = "white")),
gp = gpar(col = "black"))
## DEG annotation
deg <- heat_all %>% select(matches("vs"))
# order
deg <- deg[,c("HG64N_vs_HG64S", "HF12N_vs_HF12S", "HG64NRs_vs_HG64SRs", "HF12RsN_vs_HF12RsS",
"HG64NRs_vs_HG64N", "HF12NRs_vs_HF12N", "HG64SRs_vs_HG64S", "HF12SRs_vs_HF12S")]
Heatmap(matrix = scaleC[1:24],
name = 'Scaled Counts',
row_split = scaleC$cl,
row_gap = unit(2, "mm"),
column_order = 1:24,
# column_split = rep(c("HG64S","HG64N","HG64SRs","HG64NRs",
# "HF12S","HF12N","HF12SRs","HF12NRs"), each = 3),
column_split = factor(rep(c("HG64","HF12"), each = 12), levels = c("HG64","HF12")),
show_column_names = FALSE,
col = colorRampPalette(rev(brewer.pal(n = 10, name = 'Spectral'))[c(-3,-8)])(100),
top_annotation = c(NatSoil, Rs),
row_title_gp = gpar(fontsize = 10),
use_raster = FALSE) +
Heatmap(deg,
col = c('Down' = '#00bbf9', 'No' = 'white', 'Up' = '#f94144'),
column_names_gp = gpar(fontsize = 6),
heatmap_legend_param = list(title = 'DEGs'),
cluster_columns = FALSE,
column_names_rot = 45,
use_raster = FALSE)
image.png
4. 然后每个cluster有什么功能呢?富集分析
#####################
setwd("C:/Users/yjk/Desktop/cluster_DEGs/enrichment")
library("clusterProfiler")
library("magrittr")
library("tidyverse")
library("RColorBrewer")
library("AnnotationHub")
my_theme()
# > packageVersion("AnnotationHub")
# [1] ‘3.0.2’
# packageVersion("clusterProfiler")
# [1] ‘4.0.5’
hub <- AnnotationHub()
# snapshotDate(): 2021-05-18
query(hub, "solanum")
# AnnotationHub with 9 records
# # snapshotDate(): 2021-05-18
# # $dataprovider: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/, Inparanoid8, WikiPathways
# # $species: Solanum lycopersicum, Solanum tuberosum, Solanum pennellii, Solanum pennelli, Sola...
# # $rdataclass: OrgDb, Inparanoid8Db, Tibble
# # additional mcols(): taxonomyid, genome, description, coordinate_1_based, maintainer,
# # rdatadateadded, preparerclass, tags, rdatapath, sourceurl, sourcetype
# # retrieve records with, e.g., 'object[["AH10593"]]'
#
# title
# AH10593 | hom.Solanum_lycopersicum.inp8.sqlite
# AH10606 | hom.Solanum_tuberosum.inp8.sqlite
# AH91815 | wikipathways_Solanum_lycopersicum_metabolites.rda
# AH94101 | org.Solanum_pennellii.eg.sqlite
# AH94102 | org.Solanum_pennelli.eg.sqlite
# AH94160 | org.Solanum_tuberosum.eg.sqlite
# AH94246 | org.Solanum_esculentum.eg.sqlite
# AH94247 | org.Solanum_lycopersicum.eg.sqlite
# AH94248 | org.Solanum_lycopersicum_var._humboldtii.eg.sqlite
sly.db <- hub[["AH94247"]]
这里如果遇到提示
hub <- AnnotationHub()
snapshotDate(): 2021-05-18
Warning message:DEPRECATION: As of AnnotationHub (>2.23.2), default caching location has changed.
Problematic cache: C:\Users\yjk\AppData\Local/AnnotationHub/AnnotationHub/Cache
See https://bioconductor.org/packages/devel/bioc/vignettes/AnnotationHub/inst/doc/TroubleshootingTheCache.html#default-caching-location-update
运行下面的命令即可
moveFiles<-function(package){
olddir <- path.expand(rappdirs::user_cache_dir(appname=package))
newdir <- tools::R_user_dir(package, which="cache")
dir.create(path=newdir, recursive=TRUE)
files <- list.files(olddir, full.names =TRUE)
moveres <- vapply(files,
FUN=function(fl){
filename = basename(fl)
newname = file.path(newdir, filename)
file.rename(fl, newname)
},
FUN.VALUE = logical(1))
if(all(moveres)) unlink(olddir, recursive=TRUE)
}
下面就可以进行GO和KEGG富集分析了
kmeansRes <- read.table("degs_cl.txt")
prefix <- 'kmeans10'
savepath <- "C:/Users/yjk/Desktop/cluster_DEGs/enrichment"
for (i in kmeansRes$cl %>% unique) {
## BP
goBP <- enrichGO(gene = kmeansRes %>% filter(cl == i) %>% .$id,
OrgDb = sly.db,
keyType= 'SYMBOL',
ont = 'BP',
pAdjustMethod = 'BH',
pvalueCutoff = 0.05,
qvalueCutoff = 0.1)
goBPSim <- clusterProfiler::simplify(goBP,
cutoff = 0.5,
by = 'p.adjust',
select_fun = min)
## check and plot
write.table(as.data.frame(goBPSim),
paste0(prefix, '_cl', i, '_cp_BP.txt') %>% file.path(savepath, .),
quote = FALSE,
sep = "\t")
## KEGG
kk2 <- enrichKEGG(gene = kmeansRes %>% filter(cl == i) %>% .$id %>%
bitr(.,"SYMBOL", "ENTREZID", sly.db) %>% dplyr::select("ENTREZID") %>%
unlist(),
organism = 'sly',
pvalueCutoff = 0.05)
write.table(as.data.frame(kk2),
paste0(prefix, '_cl', i, '_cp_KEGG.txt') %>% file.path(savepath, .),
quote = FALSE,
sep = "\t")
}
kall <- lapply(kmeansRes$cl %>% unique, function(x) {
eachG <- kmeansRes %>% filter(cl == x) %>% .$id
return(eachG)
}) %>%
set_names(kmeansRes$cl %>% unique %>% paste0('cl', .))
kallGOBP <- compareCluster(geneCluster = kall,
fun = 'enrichGO',
OrgDb = sly.db,
keyType= 'SYMBOL',
ont = 'BP',
pAdjustMethod = 'BH',
pvalueCutoff=0.01,
qvalueCutoff=0.1)
kallGOBPSim <- clusterProfiler::simplify(kallGOBP,
cutoff = 0.9,
by = 'p.adjust',
select_fun = min)
dotplot(kallGOBPSim, showCategory = 10) +
ggtitle("Biological process")
# ggsave("kallGOBPSim.pdf", width = 6.4, height = 5.4)
kallGOCC <- compareCluster(geneCluster = kall,
fun = 'enrichGO',
OrgDb = sly.db,
keyType= 'SYMBOL',
ont = "CC",
pAdjustMethod = 'BH',
pvalueCutoff=0.01,
qvalueCutoff=0.1)
kallGOCCSim <- clusterProfiler::simplify(kallGOCC,
cutoff = 0.9,
by = 'p.adjust',
select_fun = min)
dotplot(kallGOCCSim, showCategory = 20) +
ggtitle("Cellular component")
# ggsave("kallGOCCSim.pdf", width = 6.4, height = 4)
kallGOMF <- compareCluster(geneCluster = kall,
fun = 'enrichGO',
OrgDb = sly.db,
keyType= 'SYMBOL',
ont = "MF",
pAdjustMethod = 'BH',
pvalueCutoff=0.01,
qvalueCutoff=0.1)
kallGOMFSim <- clusterProfiler::simplify(kallGOMF,
cutoff = 0.9,
by = 'p.adjust',
select_fun = min)
dotplot(kallGOMFSim, showCategory = 10) +
ggtitle("Molecular function")
ggsave("kallGOMFSim.pdf", width = 6.9, height = 4)
kallGOBP %>%
as.data.frame %>%
write.table('kmeans10_GOBP.txt', quote = FALSE, sep = "\t")
emapplot(kallGOBP,
showCategory = 5,
pie='count',
pie_scale=1,
layout='kk')
kallKEGG_input <- lapply(kmeansRes$cl %>% unique, function(x) {
eachG <- kmeansRes %>% filter(cl == x) %>%
.$id %>%
bitr(.,"SYMBOL", "ENTREZID", sly.db) %>%
dplyr::select("ENTREZID") %>%
unlist()
return(eachG)
}) %>%
set_names(kmeansRes$cl %>% unique %>% paste0('cl', .))
kallKEGG <- compareCluster(geneCluster = kallKEGG_input,
fun = 'enrichKEGG',
organism = "sly",
pvalueCutoff = 0.05)
dotplot(kallKEGG)
# ggsave('kmeans10_KEGGALL.pdf', width = 8, height = 4)
kallKEGG %>%
as.data.frame %>%
write.table('kmeans10_KEGG.txt', quote = FALSE, sep = "\t")
列一个例子,GO富集的生物学过程
image.png
到此结束