1 Basic: preprocessing and clustering

We created Seurat-like functions to help you get started effortlessly.

1.1 Read 10X data

library(sclet)

dir <- "filtered_gene_bc_matrices/hg19"
pbmc <- Read10X(dir)

1.2 QC

# colData(pbmc)$percent.mt <- PercentageFeatureSet(pbmc, "^MT-")
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, "^MT-")
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"))

subset_feature(pbmc, 10)

## ************************************************** >=0 | 32738 (100%)
## ************************* >=1 | 16634 (51%)
## ********************** >=2 | 14702 (45%)
## ********************* >=3 | 13714 (42%)
## ******************** >=4 | 13034 (40%)
## ******************* >=5 | 12572 (38%)
## ******************* >=6 | 12198 (37%)
## ****************** >=7 | 11876 (36%)
## ****************** >=8 | 11627 (36%)
## ***************** >=9 | 11379 (35%)
## ***************** >=10 | 11139 (34%)

pbmc <- subset_feature(pbmc, mincell = 3, peek=FALSE)

pbmc <- subset(pbmc, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
pbmc

## class: SingleCellExperiment 
## dim: 13714 2638 
## metadata(1): Samples
## assays(1): counts
## rownames(13714): AL627309.1 AP006222.2 ...
##   PNRC2_ENSG00000215700 SRSF10_ENSG00000215699
## rowData names(2): ID Symbol
## colnames: NULL
## colData names(5): Sample Barcode nFeature_RNA
##   nCount_RNA percent.mt
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

library(aplot)
plot_list(plot1, plot2)

Users can also use plotColData() to visualize the QC data.

# VlnPlot-like
p1 <- plotColData(pbmc, y = 'nCount_RNA')

# FeatureScatter-like
p2 <- plotColData(pbmc, x = "nCount_RNA", y = "percent.mt")
plot_list(p1, p2)

1.3 Variable features

pbmc <- NormalizeData(pbmc)
pbmc <- FindVariableFeatures(pbmc)
top10 <- head(VariableFeatures(pbmc), 10)
top10

##  [1] "PPBP"   "LYZ"    "S100A9" "IGLL5"  "GNLY"   "FTL"   
##  [7] "PF4"    "FTH1"   "GNG11"  "S100A8"

VariableFeaturePlot(pbmc, label = top10)

1.4 Dimensional reduction

pbmc <- ScaleData(pbmc)
pbmc <- runPCA(pbmc, subset_row = VariableFeatures(pbmc), exprs_values = "scaled")
ElbowPlot(pbmc)

1.5 Clustering

set.seed(2024-10-01)
pbmc <- FindNeighbors(pbmc, dims = 1:10)
pbmc <- FindClusters(pbmc)

head(Idents(pbmc), 5)

## AAACATACAACCAC-1 AAACATTGAGCTAC-1 AAACATTGATCAGC-1 
##                1                2                3 
## AAACCGTGCTTCCG-1 AAACCGTGTATGCG-1 
##                4                5 
## Levels: 1 2 3 4 5 6 7 8

1.6 UMAP

pbmc <- RunUMAP(pbmc, 1:10)

library(ggsc)
library(aplot)
p1 <- sc_dim(pbmc, reduction="PCA")
p2 <- sc_dim(pbmc, reduction="UMAP")

plot_list(gglist = list(PCA=p1, UMAP=p2))

1.7 Find Markers

# find all markers of cluster 2
cluster2.markers <- FindMarkers(pbmc, ident.1 = 2)
head(cluster2.markers, n = 5)

##                   pval          padj avg_log2FC pct.1 pct.2
## CD79A     0.000000e+00  0.000000e+00   6.888332 0.923 0.041
## MS4A1     0.000000e+00  0.000000e+00   5.689795 0.840 0.053
## TCL1A    6.735525e-271 3.079033e-267   6.827811 0.617 0.021
## CD79B    6.080891e-269 2.084833e-265   4.609790 0.903 0.142
## HLA-DQA1 4.615369e-267 1.265904e-263   3.960900 0.886 0.116

# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3))
head(cluster5.markers, n = 5)

##                 pval          padj avg_log2FC pct.1 pct.2
## GZMB   1.469274e-130 2.014962e-126   8.447240 0.955 0.033
## NKG7   1.545205e-114 1.059547e-110   7.562537 1.000 0.142
## GNLY   4.489308e-113 2.052212e-109   7.791358 0.975 0.120
## FGFBP2 2.694631e-112 9.238541e-109   7.771659 0.860 0.035
## PRF1   1.983747e-111 5.441020e-108   6.136492 0.936 0.087

# find markers for every cluster compared to all remaining cells, 
# report only the positive ones
library(dplyr)
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE)

topN_marker <- function(markers, n) {
    markers %>%
        group_by(cluster) %>%
        dplyr::filter(avg_log2FC > 1) %>%
        slice_head(n = n) %>%
        ungroup()
}

top10 <- topN_marker(pbmc.markers, 10)
split(top10$gene, top10$cluster)

## $`1`
##  [1] "CCL5"  "GZMK"  "NKG7"  "GZMA"  "CST7"  "CD8A"  "CTSW" 
##  [8] "IL32"  "LYAR"  "KLRG1"
## 
## $`2`
##  [1] "CD79A"     "MS4A1"     "TCL1A"     "CD79B"    
##  [5] "HLA-DQA1"  "LINC00926" "VPREB3"    "HLA-DQB1" 
##  [9] "CD74"      "HLA-DRA"  
## 
## $`3`
##  [1] "LTB"     "IL7R"    "IL32"    "LDHB"    "CD3D"   
##  [6] "TNFRSF4" "AQP3"    "CD2"     "CD3E"    "CD40LG" 
## 
## $`4`
##  [1] "LGALS2" "S100A8" "FCN1"   "S100A9" "MS4A6A" "CST3"  
##  [7] "TYROBP" "CD14"   "LYZ"    "TYMP"  
## 
## $`5`
##  [1] "GZMB"   "FGFBP2" "SPON2"  "PRF1"   "GNLY"   "XCL2"  
##  [7] "AKR1C3" "CLIC3"  "KLRD1"  "CST7"  
## 
## $`6`
##  [1] "CDKN1C"        "HES4"          "RP11-290F20.3"
##  [4] "MS4A7"         "FCGR3A"        "CKB"          
##  [7] "LILRA3"        "IFITM3"        "LRRC25"       
## [10] "LST1"         
## 
## $`7`
##  [1] "CCR7"      "LDHB"      "LEF1"      "FHIT"     
##  [5] "PRKCQ-AS1" "CD3E"      "NOSIP"     "PIK3IP1"  
##  [9] "CD27"      "LINC00176"
## 
## $`8`
##  [1] "GP9"                   "ITGA2B"               
##  [3] "AP001189.4"            "LY6G6F"               
##  [5] "TMEM40"                "SEPT5"                
##  [7] "HGD"                   "TREML1"               
##  [9] "ITGB3_ENSG00000259207" "SPARC"

1.7.1 Marker gene information

top1 <- topN_marker(pbmc.markers, 1)

gene_table <- gene_summary_table(top1, gene_col = 'gene', 
    cluster_col = 'cluster', keyType='SYMBOL', output = "data.frame")

gene_table

##   ENTREZID   gene          pval          padj avg_log2FC
## 1     1028 CDKN1C 9.252218e-233 1.268849e-228   5.835202
## 2     1236   CCR7 7.265363e-105 1.423388e-101   2.579844
## 3     2815    GP9  0.000000e+00  0.000000e+00  11.419366
## 4     3002   GZMB 7.268536e-269 9.968070e-265   6.001673
## 5     3957 LGALS2  0.000000e+00  0.000000e+00   6.351912
## 6     4050    LTB 4.712630e-115 6.462900e-111   1.484305
## 7     6352   CCL5 1.016869e-271 1.394534e-267   3.820950
## 8      973  CD79A  0.000000e+00  0.000000e+00   6.888332
##   pct.1 pct.2 cluster   name
## 1 0.531 0.008       6 CDKN1C
## 2 0.526 0.116       7   CCR7
## 3 0.917 0.002       8    GP9
## 4 0.955 0.068       5   GZMB
## 5 0.906 0.047       4 LGALS2
## 6 0.987 0.631       3    LTB
## 7 0.982 0.201       1   CCL5
## 8 0.923 0.041       2  CD79A
##                            description
## 1 cyclin dependent kinase inhibitor 1C
## 2       C-C motif chemokine receptor 7
## 3             glycoprotein IX platelet
## 4                           granzyme B
## 5                           galectin 2
## 6                     lymphotoxin beta
## 7         C-C motif chemokine ligand 5
## 8                       CD79a molecule
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       summary
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     This gene is imprinted, with preferential expression of the maternal allele. The encoded protein is a tight-binding, strong inhibitor of several G1 cyclin/Cdk complexes and a negative regulator of cell proliferation. Mutations in this gene are implicated in sporadic cancers and Beckwith-Wiedemann syndorome, suggesting that this gene is a tumor suppressor candidate. Three transcript variants encoding two different isoforms have been found for this gene. [provided by RefSeq, Oct 2010]
## 2                                                                                                      The protein encoded by this gene is a member of the G protein-coupled receptor family. This receptor was identified as a gene induced by the Epstein-Barr virus (EBV), and is thought to be a mediator of EBV effects on B lymphocytes. This receptor is expressed in various lymphoid tissues and activates B and T lymphocytes. It has been shown to control the migration of memory T cells to inflamed tissues, as well as stimulate dendritic cell maturation. The chemokine (C-C motif) ligand 19 (CCL19/ECL) has been reported to be a specific ligand of this receptor. Signals mediated by this receptor regulate T cell homeostasis in lymph nodes, and may also function in the activation and polarization of T cells, and in chronic inflammation pathogenesis. Alternative splicing of this gene results in multiple transcript variants. [provided by RefSeq, Sep 2014]
## 3                                                                                                                                                                                                                                                                                                                                      This gene encodes a small membrane glycoprotein found on the surface of human platelets. It forms a 1-to-1 noncovalent complex with glycoprotein Ib, a platelet surface membrane glycoprotein complex that functions as a receptor for von Willebrand factor. The complete receptor complex includes noncovalent association of the alpha and beta subunits with the protein encoded by this gene and platelet glycoprotein V. Defects in this gene are a cause of Bernard-Soulier syndrome, also known as giant platelet disease. These patients have unusually large platelets and have a clinical bleeding tendency. [provided by RefSeq, Oct 2008]
## 4                                                                                                                                                                                                                                                                                                                                                                                This gene encodes a member of the granzyme subfamily of proteins, part of the peptidase S1 family of serine proteases. The encoded preproprotein is secreted by natural killer (NK) cells and cytotoxic T lymphocytes (CTLs) and proteolytically processed to generate the active protease, which induces target cell apoptosis. This protein also processes cytokines and degrades extracellular matrix proteins, and these roles are implicated in chronic inflammation and wound healing. Expression of this gene may be elevated in human patients with cardiac fibrosis. [provided by RefSeq, Sep 2016]
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       The protein encoded by this gene is a soluble beta-galactoside binding lectin. The encoded protein is found as a homodimer and can bind to lymphotoxin-alpha. A single nucleotide polymorphism in an intron of this gene can alter the transcriptional level of the protein, with a resultant increased risk of myocardial infarction. [provided by RefSeq, Jul 2008]
## 6                                                                                                                                                                         Lymphotoxin beta is a type II membrane protein of the TNF family. It anchors lymphotoxin-alpha to the cell surface through heterotrimer formation. The predominant form on the lymphocyte surface is the lymphotoxin-alpha 1/beta 2 complex (e.g. 1 molecule alpha/2 molecules beta) and this complex is the primary ligand for the lymphotoxin-beta receptor. The minor complex is lymphotoxin-alpha 2/beta 1. LTB is an inducer of the inflammatory response system and involved in normal development of lymphoid tissue. Lymphotoxin-beta isoform b is unable to complex with lymphotoxin-alpha suggesting a function for lymphotoxin-beta which is independent of lympyhotoxin-alpha. Alternative splicing results in multiple transcript variants encoding different isoforms. [provided by RefSeq, Jul 2008]
## 7 This gene is one of several chemokine genes clustered on the q-arm of chromosome 17. Chemokines form a superfamily of secreted proteins involved in immunoregulatory and inflammatory processes. The superfamily is divided into four subfamilies based on the arrangement of the N-terminal cysteine residues of the mature peptide. This chemokine, a member of the CC subfamily, functions as a chemoattractant for blood monocytes, memory T helper cells and eosinophils. It causes the release of histamine from basophils and activates eosinophils. This cytokine is one of the major HIV-suppressive factors produced by CD8+ cells. It functions as one of the natural ligands for the chemokine receptor chemokine (C-C motif) receptor 5 (CCR5), and it suppresses in vitro replication of the R5 strains of HIV-1, which use CCR5 as a coreceptor. Alternative splicing results in multiple transcript variants that encode different isoforms. [provided by RefSeq, Jul 2013]
## 8                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  The B lymphocyte antigen receptor is a multimeric complex that includes the antigen-specific component, surface immunoglobulin (Ig). Surface Ig non-covalently associates with two other proteins, Ig-alpha and Ig-beta, which are necessary for expression and function of the B-cell antigen receptor. This gene encodes the Ig-alpha protein of the B-cell antigen component. Alternatively spliced transcript variants encoding different isoforms have been described. [provided by RefSeq, Jul 2008]

1.8 Cell cluster annotation

new_labels <- c("CD8+ T","B","Memory CD4+","CD14+ Mono",
    "NK","FCGR3A+ Mono","Native CD4+ T","Platelet")
pbmc <- RenameIdents(pbmc, new_labels)
sc_dim(pbmc, reduction="UMAP") + sc_dim_geom_label()