众所周知,基因集富集分析的图片不够美观,这么多年的时间依旧没有改进,直接用在文章中显得即臃肿占地,又缺乏美感,因此本文尝试还原文献中对GSEA可视化的方法,并用R语言ggplot2包复现。

原图解析

首先,让我们看一下文献中是如何绘制的。

首先我们观察到图片从上到下分为三部分:

  • 第一部分为上方的折线图,该图的数据实际为GSEA中某个基因集的Enrichment score;
  • 第二部分为中间的线条图,每个线条代表基因集中的一个基因,其X值为基因在所有基因中的Rank(排序/位置);
  • 第三部分为最底部的colorbar, 在GSEA官方软件中,其可视化为所有基因的rank metric score, 而在此处应该是借鉴了clusterprofile包中可视化采用的rank累计法,即展示当前基因集在正/负向调控中的偏移密集度。

需要什么数据

我们观察到我们只需要下载某个基因集的数据即可,我们需要的排序,富集分数都在里面。

代码阶段

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
library(ggplot2)
library(readr)
library(dplyr)
library(RColorBrewer)

# ============================================================
# 0. 手动设置区域
# ============================================================

pathway_file <- "example_pathway.tsv" # 单个 pathway 的 GSEA 结果文件

pathway_name <- "Necroptosis"
NES_value <- -1.353
p_value <- 0

# pathway 文件中用于绘图的列
# 你之前用的是 select(4:6),分别对应 rank, metric, score
pathway_rank_col <- 4
pathway_metric_col <- 5
pathway_score_col <- 6

# 完整 ranked gene list 的总基因数
# 只用 pathway 文件时,必须手动提供这个值
# 如果不知道,可以先用 max(pathway rank) 近似,但不推荐
n_total_gene <- 40727

# ============================================================
# 1. 读取 pathway 文件:用于 ES 曲线和 hit 竖线
# pathway_df: 包含rank,metric,score的三列数据,核心数据框
# es_df: 用于绘制折线图
# hit_df: 用于绘制竖线图
# color_block_df: 用于绘制底部colorbar
# ============================================================

pathway_df <- read_tsv(pathway_file, show_col_types = FALSE)

pathway_df <- pathway_df %>%
select(
rank = all_of(pathway_rank_col),
metric_hit = all_of(pathway_metric_col),
score = all_of(pathway_score_col)
) %>%
mutate(
rank = as.numeric(rank),
metric_hit = as.numeric(metric_hit),
score = as.numeric(score)
) %>%
filter(
!is.na(rank),
!is.na(score)
) %>%
arrange(rank)

es_df <- pathway_df %>%
transmute(
x = rank,
ES = score
)

hit_df <- pathway_df %>%
transmute(
x = rank
)

# ============================================================
# 2. enrichplot::gseaplot2() 风格的底部色块逻辑
# 不使用所有基因 metric,只使用 pathway hit rank
# ============================================================

n <- n_total_gene

position <- rep(0, n)
hit_rank <- pathway_df$rank
hit_rank <- hit_rank[hit_rank >= 1 & hit_rank <= n]
position[hit_rank] <- 1

# enrichplot::gseaplot2() 风格色块逻辑
n_col <- 10

v <- seq(1, sum(position), length.out = n_col)

inv <- findInterval(
rev(cumsum(position)),
v,
all.inside = TRUE
)

col <- c(
rev(RColorBrewer::brewer.pal(5, "Blues")),
RColorBrewer::brewer.pal(5, "Reds")
)

block_id <- unique(inv)

xmin <- which(!duplicated(inv))
xmax <- xmin + as.numeric(table(inv)[as.character(block_id)])

color_block_df <- tibble(
xmin = xmin,
xmax = xmax,
col = col[block_id]
) %>%
mutate(
xmin = pmax(xmin, 1),
xmax = pmin(xmax, n)
)

# 防止 xmax 超出 n
color_block_df <- color_block_df %>%
mutate(
xmin = pmax(xmin, 1),
xmax = pmin(xmax, n)
)

# ============================================================
# 3. 自动设置画图范围和标注位置
# ============================================================

es_min <- min(es_df$ES, na.rm = TRUE)
es_max <- max(es_df$ES, na.rm = TRUE)

# ES 区域下方额外留空间,避免 ES 曲线和 hit 竖线重叠
extra_bottom <- 0.13

y_min <- es_min - extra_bottom
y_max <- es_max + 0.05

# 底部 colorbar:加厚
bar_ymin <- y_min + 0.010
bar_ymax <- y_min + 0.035

# hit 竖线:放在 colorbar 上方,但不要碰到 ES 曲线
hit_ymin <- bar_ymax + 0.008
hit_ymax <- hit_ymin + 0.070

# 在 ES 区域和 hit 区域之间加一条横线,模仿第二张图
sep_y <- hit_ymax + 0.008

# NES 和 P value 的默认位置
nes_x <- n * 0.07
nes_y <- es_min + (es_max - es_min) * 0.18

p_x <- n * 0.60
p_y <- es_max - (es_max - es_min) * 0.18

# y 轴刻度只显示 ES 曲线附近的刻度,不显示底部装饰区域
y_breaks <- pretty(c(es_min, es_max), n = 4)
y_breaks <- y_breaks[y_breaks >= es_min & y_breaks <= es_max]

# ============================================================
# 4. 作图
# ============================================================

ggplot() +

# enrichplot::gseaplot2() 风格底部色块,加厚
geom_rect(
data = color_block_df,
aes(
xmin = xmin,
xmax = xmax,
ymin = bar_ymin,
ymax = bar_ymax
),
fill = color_block_df$col,
color = NA
) +

# hit 竖线:独立放在 ES 曲线下方
geom_segment(
data = hit_df,
aes(
x = x,
xend = x,
y = hit_ymin,
yend = hit_ymax
),
linewidth = 0.35,
color = "black"
) +

# ES 区域和 hit 区域之间的分隔线
geom_hline(
yintercept = sep_y,
linewidth = 1.6,
color = "black"
) +

# ES running score 曲线
geom_line(
data = es_df,
aes(x = x, y = ES),
color = "#8a2be2",
linewidth = 0.75
) +

annotate(
"text",
x = nes_x,
y = nes_y,
label = paste0("NES = ", NES_value),
hjust = 0,
vjust = 0,
size = 7.5,
color = "black"
) +

annotate(
"text",
x = p_x,
y = p_y,
label = paste0("italic(P) == ", p_value),
hjust = 0,
size = 6.6,
color = "black",
parse = TRUE
) +

scale_x_continuous(
limits = c(1, n),
expand = c(0, 0),
breaks = pretty_breaks(n = 8)
) +

scale_y_continuous(
limits = c(y_min, y_max),
breaks = y_breaks,
labels = function(x) sprintf("%.1f", x),
expand = c(0, 0)
) +

labs(
title = pathway_name,
x = NULL,
y = NULL
) +

theme_classic(base_size = 24) +
theme(
panel.background = element_rect(fill = "white", color = NA),
plot.background = element_rect(fill = "white", color = NA),

plot.title = element_text(
hjust = 0.5,
size = 34,
face = "plain",
margin = margin(b = 2)
),

axis.text.x = element_blank(),
axis.ticks.x = element_blank(),

axis.text.y = element_text(size = 28, color = "black"),
axis.ticks.y = element_line(linewidth = 1.1, color = "black"),
axis.ticks.length = unit(0.18, "cm"),

panel.border = element_rect(
color = "black",
fill = NA,
linewidth = 2.2
),
axis.line = element_blank(),

panel.grid.major.x = element_line(
color = "grey88",
linewidth = 0.5
),
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),

plot.margin = margin(5, 8, 5, 8)
)

ggsave("GSEA_Necroptosis.pdf",width=6.61,height = 2.85)

大功告成!!