16 dplyr verbs for manipulating enrichment result
16.1 filter
filter(x, p.adjust < .05, qvalue < 0.2)
## #
## # over-representation test
## #
## #...@organism Homo sapiens
## #...@ontology DO
## #...@keytype ENTREZID
## #...@gene chr [1:100] "4312" "8318" "10874" "55143" "55388" "991" "6280" "2305" ...
## #...pvalues adjusted by 'BH' with cutoff <0.05
## #...28 enriched terms found
## 'data.frame': 28 obs. of 9 variables:
## $ ID : chr "DOID:0060071" "DOID:5295" "DOID:8719" "DOID:3007" ...
## $ Description: chr "pre-malignant neoplasm" "intestinal disease" "in situ carcinoma" "breast ductal carcinoma" ...
## $ GeneRatio : chr "5/77" "9/77" "4/77" "4/77" ...
## $ BgRatio : chr "22/8007" "157/8007" "18/8007" "29/8007" ...
## $ pvalue : num 1.67e-06 1.76e-05 2.18e-05 1.56e-04 2.08e-04 ...
## $ p.adjust : num 0.00064 0.00279 0.00279 0.0136 0.0136 ...
## $ qvalue : num 0.000461 0.002008 0.002008 0.009796 0.009796 ...
## $ geneID : chr "6280/6278/10232/332/4321" "4312/6279/3627/10563/4283/890/366/4902/3620" "6280/6278/10232/332" "6280/6279/4751/6286" ...
## $ Count : int 5 9 4 4 13 6 13 5 5 6 ...
## #...Citation
## Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an
## R/Bioconductor package for Disease Ontology Semantic and Enrichment
## analysis. Bioinformatics 2015, 31(4):608-609
16.2 arrange
mutate(x, geneRatio = parse_ratio(GeneRatio)) %>%
arrange(desc(geneRatio))
## #
## # over-representation test
## #
## #...@organism Homo sapiens
## #...@ontology DO
## #...@keytype ENTREZID
## #...@gene chr [1:100] "4312" "8318" "10874" "55143" "55388" "991" "6280" "2305" ...
## #...pvalues adjusted by 'BH' with cutoff <0.05
## #...28 enriched terms found
## 'data.frame': 28 obs. of 10 variables:
## $ ID : chr "DOID:3908" "DOID:120" "DOID:2394" "DOID:3459" ...
## $ Description: chr "non-small cell lung carcinoma" "female reproductive organ cancer" "ovarian cancer" "breast carcinoma" ...
## $ GeneRatio : chr "13/77" "13/77" "10/77" "10/77" ...
## $ BgRatio : chr "431/8007" "455/8007" "312/8007" "383/8007" ...
## $ pvalue : num 2.08e-04 3.52e-04 7.54e-04 3.48e-03 1.76e-05 ...
## $ p.adjust : num 0.0136 0.01928 0.02625 0.04756 0.00279 ...
## $ qvalue : num 0.0098 0.01389 0.0189 0.03424 0.00201 ...
## $ geneID : chr "6280/2305/9133/6279/7153/6278/6241/11065/10232/332/6286/3002/9212" "4312/6279/7153/3627/820/983/10232/6362/332/6286/9212/4321/6790" "4312/820/983/10232/6362/332/6286/9212/4321/6790" "4312/6280/6279/7153/4751/890/4085/332/6286/6790" ...
## $ Count : int 13 13 10 10 9 7 7 6 6 6 ...
## $ geneRatio : num 0.169 0.169 0.13 0.13 0.117 ...
## #...Citation
## Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an
## R/Bioconductor package for Disease Ontology Semantic and Enrichment
## analysis. Bioinformatics 2015, 31(4):608-609
16.3 select
## ID Description GeneRatio BgRatio
## DOID:0060071 DOID:0060071 pre-malignant neoplasm 5/77 22/8007
## DOID:5295 DOID:5295 intestinal disease 9/77 157/8007
## DOID:8719 DOID:8719 in situ carcinoma 4/77 18/8007
## DOID:3007 DOID:3007 breast ductal carcinoma 4/77 29/8007
## DOID:3908 DOID:3908 non-small cell lung carcinoma 13/77 431/8007
## DOID:0050589 DOID:0050589 inflammatory bowel disease 6/77 90/8007
## pvalue p.adjust qvalue Count
## DOID:0060071 1.671524e-06 0.0006401937 0.0004609887 5
## DOID:5295 1.759049e-05 0.0027885022 0.0020079362 9
## DOID:8719 2.184205e-05 0.0027885022 0.0020079362 4
## DOID:3007 1.564603e-04 0.0136037018 0.0097957122 4
## DOID:3908 2.075001e-04 0.0136037018 0.0097957122 13
## DOID:0050589 2.131128e-04 0.0136037018 0.0097957122 6
16.4 mutate
# k/M
y <- mutate(x, richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))
y
## #
## # over-representation test
## #
## #...@organism Homo sapiens
## #...@ontology DO
## #...@keytype ENTREZID
## #...@gene chr [1:100] "4312" "8318" "10874" "55143" "55388" "991" "6280" "2305" ...
## #...pvalues adjusted by 'BH' with cutoff <0.05
## #...28 enriched terms found
## 'data.frame': 28 obs. of 10 variables:
## $ ID : chr "DOID:0060071" "DOID:5295" "DOID:8719" "DOID:3007" ...
## $ Description: chr "pre-malignant neoplasm" "intestinal disease" "in situ carcinoma" "breast ductal carcinoma" ...
## $ GeneRatio : chr "5/77" "9/77" "4/77" "4/77" ...
## $ BgRatio : chr "22/8007" "157/8007" "18/8007" "29/8007" ...
## $ pvalue : num 1.67e-06 1.76e-05 2.18e-05 1.56e-04 2.08e-04 ...
## $ p.adjust : num 0.00064 0.00279 0.00279 0.0136 0.0136 ...
## $ qvalue : num 0.000461 0.002008 0.002008 0.009796 0.009796 ...
## $ geneID : chr "6280/6278/10232/332/4321" "4312/6279/3627/10563/4283/890/366/4902/3620" "6280/6278/10232/332" "6280/6279/4751/6286" ...
## $ Count : int 5 9 4 4 13 6 13 5 5 6 ...
## $ richFactor : num 0.2273 0.0573 0.2222 0.1379 0.0302 ...
## #...Citation
## Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an
## R/Bioconductor package for Disease Ontology Semantic and Enrichment
## analysis. Bioinformatics 2015, 31(4):608-609
library(ggplot2)
library(forcats)
library(enrichplot)
ggplot(y, showCategory = 20,
aes(richFactor, fct_reorder(Description, richFactor))) +
geom_segment(aes(xend=0, yend = Description)) +
geom_point(aes(color=p.adjust, size = Count)) +
scale_color_viridis_c(guide=guide_colorbar(reverse=TRUE)) +
scale_size_continuous(range=c(2, 10)) +
theme_minimal() +
xlab("rich factor") +
ylab(NULL) +
ggtitle("Enriched Disease Ontology")
A very similar concept is Fold Enrichment, which is defined as the ratio of two proportions, (k/n) / (M/N). Using mutate
to add the fold enrichment variable is also easy:
mutate(x, FoldEnrichment = parse_ratio(GeneRatio) / parse_ratio(BgRatio))
16.5 slice
We can use slice
to choose rows by their ordinal position in the enrichment result. Grouped result use the ordinal position with the group.
In the following example, a GSEA result of Reactome pathway was sorted by the absolute values of NES and the result was grouped by the sign of NES. We then extracted first 5 rows of each groups. The result was displayed in Figure 16.2.
library(ReactomePA)
x <- gsePathway(geneList)
y <- arrange(x, abs(NES)) %>%
group_by(sign(NES)) %>%
slice(1:5)
library(forcats)
library(ggplot2)
library(ggstance)
library(enrichplot)
ggplot(y, aes(NES, fct_reorder(Description, NES), fill=qvalues), showCategory=10) +
geom_col(orientation='y') +
scale_fill_continuous(low='red', high='blue', guide=guide_colorbar(reverse=TRUE)) +
theme_minimal() + ylab(NULL)
16.6 summarise
pbar <- function(x) {
pi=seq(0, 1, length.out=11)
mutate(x, pp = cut(p.adjust, pi)) %>%
group_by(pp) %>%
summarise(cnt = n()) %>%
ggplot(aes(pp, cnt)) + geom_col() +
theme_minimal() +
xlab("p value intervals") +
ylab("Frequency") +
ggtitle("p value distribution")
}
x <- enrichDO(de, pvalueCutoff=1, qvalueCutoff=1)
set.seed(2020-09-10)
random_genes <- sample(names(geneList), 100)
y <- enrichDO(random_genes, pvalueCutoff=1, qvalueCutoff=1)
p1 <- pbar(x)
p2 <- pbar(y)
cowplot::plot_grid(p1, p2, ncol=1, labels = LETTERS[1:2])