14  dplyr verbs for manipulating enrichment result

library(DOSE)
data(geneList)
de = names(geneList)[1:100]
x = enrichDO(de)

14.1 filter

filter(x, p.adjust < .05, qvalue < 0.2)
#
# over-representation test
#
#...@organism    Homo sapiens 
#...@ontology    HDO 
#...@keytype     ENTREZID 
#...@gene    chr [1:100] "4312" "8318" "10874" "55143" "55388" "991" "6280" "2305" ...
#...pvalues adjusted by 'BH' with cutoff <0.05 
#...13 enriched terms found
'data.frame':   13 obs. of  12 variables:
 $ ID            : chr  "DOID:11054" "DOID:2799" "DOID:14004" "DOID:3996" ...
 $ Description   : chr  "urinary bladder cancer" "bronchiolitis obliterans" "thoracic aortic aneurysm" "urinary system cancer" ...
 $ GeneRatio     : chr  "9/57" "4/57" "4/57" "10/57" ...
 $ BgRatio       : chr  "180/7865" "26/7865" "36/7865" "370/7865" ...
 $ RichFactor    : num  0.05 0.1538 0.1111 0.027 0.0327 ...
 $ FoldEnrichment: num  6.9 21.23 15.33 3.73 4.51 ...
 $ zScore        : num  6.84 8.83 7.36 4.59 4.76 ...
 $ pvalue        : num  4.93e-06 3.29e-05 1.23e-04 2.80e-04 3.49e-04 ...
 $ p.adjust      : num  0.00186 0.00622 0.01548 0.02637 0.02637 ...
 $ qvalue        : num  0.0015 0.00501 0.01246 0.02122 0.02122 ...
 $ geneID        : chr  "4312/6280/6279/7153/6241/983/332/2146/6790" "3627/6373/4283/3002" "4312/3627/4283/4321" "4312/6280/6279/7153/6241/983/332/2146/4321/6790" ...
 $ Count         : int  9 4 4 10 8 5 5 4 6 6 ...
#...Citation
Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics. 2015, 31(4):608-609 

14.2 arrange

mutate(x, geneRatio = parse_ratio(GeneRatio)) %>%
  arrange(desc(geneRatio))
#
# over-representation test
#
#...@organism    Homo sapiens 
#...@ontology    HDO 
#...@keytype     ENTREZID 
#...@gene    chr [1:100] "4312" "8318" "10874" "55143" "55388" "991" "6280" "2305" ...
#...pvalues adjusted by 'BH' with cutoff <0.05 
#...13 enriched terms found
'data.frame':   13 obs. of  13 variables:
 $ ID            : chr  "DOID:3996" "DOID:11054" "DOID:10534" "DOID:5041" ...
 $ Description   : chr  "urinary system cancer" "urinary bladder cancer" "stomach cancer" "esophageal cancer" ...
 $ GeneRatio     : chr  "10/57" "9/57" "8/57" "6/57" ...
 $ BgRatio       : chr  "370/7865" "180/7865" "245/7865" "155/7865" ...
 $ RichFactor    : num  0.027 0.05 0.0327 0.0387 0.0368 ...
 $ FoldEnrichment: num  3.73 6.9 4.51 5.34 5.08 ...
 $ zScore        : num  4.59 6.84 4.76 4.66 4.5 ...
 $ pvalue        : num  2.80e-04 4.93e-06 3.49e-04 8.42e-04 1.09e-03 ...
 $ p.adjust      : num  0.02637 0.00186 0.02637 0.03536 0.04137 ...
 $ qvalue        : num  0.0212 0.0015 0.0212 0.0285 0.0333 ...
 $ geneID        : chr  "4312/6280/6279/7153/6241/983/332/2146/4321/6790" "4312/6280/6279/7153/6241/983/332/2146/6790" "4312/2305/10403/259266/8140/81930/332/2146" "4312/3868/8140/7850/2146/4321" ...
 $ Count         : int  10 9 8 6 6 5 5 4 4 4 ...
 $ geneRatio     : num  0.175 0.158 0.14 0.105 0.105 ...
#...Citation
Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics. 2015, 31(4):608-609 

14.3 select

select(x, -geneID) %>% head
                   ID              Description GeneRatio  BgRatio RichFactor
DOID:11054 DOID:11054   urinary bladder cancer      9/57 180/7865 0.05000000
DOID:2799   DOID:2799 bronchiolitis obliterans      4/57  26/7865 0.15384615
DOID:14004 DOID:14004 thoracic aortic aneurysm      4/57  36/7865 0.11111111
DOID:3996   DOID:3996    urinary system cancer     10/57 370/7865 0.02702703
DOID:10534 DOID:10534           stomach cancer      8/57 245/7865 0.03265306
DOID:3627   DOID:3627          aortic aneurysm      5/57  89/7865 0.05617978
           FoldEnrichment   zScore       pvalue    p.adjust      qvalue Count
DOID:11054       6.899123 6.840550 4.925937e-06 0.001862004 0.001498522     9
DOID:2799       21.228070 8.826735 3.291471e-05 0.006220880 0.005006500     4
DOID:14004      15.331384 7.363351 1.228827e-04 0.015483224 0.012460740     4
DOID:3996        3.729256 4.594609 2.797046e-04 0.026365852 0.021218967    10
DOID:10534       4.505550 4.762675 3.487547e-04 0.026365852 0.021218967     8
DOID:3627        7.751823 5.473022 4.365904e-04 0.027505194 0.022135898     5

14.4 mutate

# k/M
y <- mutate(x, richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))
y
#
# over-representation test
#
#...@organism    Homo sapiens 
#...@ontology    HDO 
#...@keytype     ENTREZID 
#...@gene    chr [1:100] "4312" "8318" "10874" "55143" "55388" "991" "6280" "2305" ...
#...pvalues adjusted by 'BH' with cutoff <0.05 
#...13 enriched terms found
'data.frame':   13 obs. of  13 variables:
 $ ID            : chr  "DOID:11054" "DOID:2799" "DOID:14004" "DOID:3996" ...
 $ Description   : chr  "urinary bladder cancer" "bronchiolitis obliterans" "thoracic aortic aneurysm" "urinary system cancer" ...
 $ GeneRatio     : chr  "9/57" "4/57" "4/57" "10/57" ...
 $ BgRatio       : chr  "180/7865" "26/7865" "36/7865" "370/7865" ...
 $ RichFactor    : num  0.05 0.1538 0.1111 0.027 0.0327 ...
 $ FoldEnrichment: num  6.9 21.23 15.33 3.73 4.51 ...
 $ zScore        : num  6.84 8.83 7.36 4.59 4.76 ...
 $ pvalue        : num  4.93e-06 3.29e-05 1.23e-04 2.80e-04 3.49e-04 ...
 $ p.adjust      : num  0.00186 0.00622 0.01548 0.02637 0.02637 ...
 $ qvalue        : num  0.0015 0.00501 0.01246 0.02122 0.02122 ...
 $ geneID        : chr  "4312/6280/6279/7153/6241/983/332/2146/6790" "3627/6373/4283/3002" "4312/3627/4283/4321" "4312/6280/6279/7153/6241/983/332/2146/4321/6790" ...
 $ Count         : int  9 4 4 10 8 5 5 4 6 6 ...
 $ richFactor    : num  0.05 0.1538 0.1111 0.027 0.0327 ...
#...Citation
Guangchuang Yu, Li-Gen Wang, Guang-Rong Yan, Qing-Yu He. DOSE: an R/Bioconductor package for Disease Ontology Semantic and Enrichment analysis. Bioinformatics. 2015, 31(4):608-609 
library(ggplot2)
library(forcats)
library(enrichplot)

ggplot(y, showCategory = 20, 
  aes(richFactor, fct_reorder(Description, richFactor))) + 
  geom_segment(aes(xend=0, yend = Description)) +
  geom_point(aes(color=p.adjust, size = Count)) +
  scale_color_viridis_c(guide=guide_colorbar(reverse=TRUE)) +
  scale_size_continuous(range=c(2, 10)) +
  theme_minimal() + 
  xlab("rich factor") +
  ylab(NULL) + 
  ggtitle("Enriched Disease Ontology")
Figure 14.1: Visualizing rich factor of enriched terms using lolliplot.

A very similar concept is Fold Enrichment, which is defined as the ratio of two proportions, (k/n) / (M/N). Using mutate to add the fold enrichment variable is also easy:

mutate(x, FoldEnrichment = parse_ratio(GeneRatio) / parse_ratio(BgRatio))

Here, the calculation of rich factor and fold enrichment is only for demonstration purposes. The enrichplot package provides the dotplot function that can directly visualize these two values without adding them to the enrichment result.

14.5 slice

We can use slice to choose rows by their ordinal position in the enrichment result. Grouped result use the ordinal position with the group.

In the following example, a GSEA result of Reactome pathway was sorted by the absolute values of NES and the result was grouped by the sign of NES. We then extracted first 5 rows of each groups. The result was displayed in Figure 14.2.

library(ReactomePA)
x <- gsePathway(geneList)


y <- arrange(x, abs(NES)) %>% 
        group_by(sign(NES)) %>% 
        slice(1:5)

library(forcats)
library(ggplot2)
library(enrichplot)

ggplot(y, aes(NES, fct_reorder(Description, NES), fill=qvalue), showCategory=10) + 
    geom_col(orientation='y') + 
    scale_fill_continuous(low='red', high='blue', guide=guide_colorbar(reverse=TRUE)) + 
    theme_minimal() + ylab(NULL)
Figure 14.2: Choose pathways by ordinal positions.

14.6 summarise

library(ggplot2)

pbar <- function(x) {
  pi=seq(0, 1, length.out=11)

  mutate(x, pp = cut(p.adjust, pi)) |>
    group_by(pp) |>
    summarise(cnt = n()) |> 
    ggplot(aes(pp, cnt)) + geom_col() + 
    theme_minimal() +
    xlab("p value intervals") +
    ylab("Frequency") + 
    ggtitle("p value distribution")
}    

x <- enrichDO(de, pvalueCutoff=1, qvalueCutoff=1)
set.seed(2020-09-10)
random_genes <- sample(names(geneList), 100)
y <- enrichDO(random_genes, pvalueCutoff=1, qvalueCutoff=1)
p1 <- pbar(x)
p2 <- pbar(y)
aplot::plot_list(p1, p2, ncol=1, tag_levels = 'A')