3 Basic Preprocessing and Clustering

This chapter is the fastest way to get a feel for the package. If you are new to sclet, think of it as the equivalent of the classic PBMC tutorial: read the data, perform quality control, normalize, identify highly variable genes, reduce dimensions, cluster cells, and inspect marker genes.

The difference is that everything happens directly on top of SingleCellExperiment, with sclet providing a coherent high-level analysis interface. The goal here is not only to reproduce a standard workflow, but also to show how little boilerplate is needed once the state-aware design is in place.

3.1 Read 10X data

library(sclet)
pbmc_10x <- TENxPBMCData::TENxPBMCData("pbmc3k")

## see ?TENxPBMCData and browseVignettes('TENxPBMCData') for documentation

## loading from cache

pbmc_10x

## class: SingleCellExperiment 
## dim: 32738 2700 
## metadata(0):
## assays(1): counts
## rownames(32738): ENSG00000243485 ENSG00000237613 ...
##   ENSG00000215616 ENSG00000215611
## rowData names(3): ENSEMBL_ID Symbol_TENx Symbol
## colnames: NULL
## colData names(11): Sample Barcode ... Individual
##   Date_published
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):

If you are starting from a local Cell Ranger directory instead, the corresponding sclet call is simply pbmc <- Read10X("filtered_feature_bc_matrix/").

3.2 QC

# colData(pbmc)$percent.mt <- PercentageFeatureSet(pbmc, "^MT-")
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, "^MT-")
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"))

subset_feature(pbmc, 10)

## ************************************************** >=0 | 13714 (100%)
## ************************************************** >=1 | 13713 (100%)
## ************************************************** >=2 | 13706 (100%)
## ************************************************** >=3 | 13656 (100%)
## *********************************************** >=4 | 12972 (95%)
## ********************************************** >=5 | 12519 (91%)
## ******************************************** >=6 | 12138 (89%)
## ******************************************* >=7 | 11813 (86%)
## ****************************************** >=8 | 11564 (84%)
## ***************************************** >=9 | 11319 (83%)
## **************************************** >=10 | 11095 (81%)

pbmc <- subset_feature(pbmc, mincell = 3, peek=FALSE)

keep <- colData(pbmc)$nFeature_RNA > 200 &
    colData(pbmc)$nFeature_RNA < 2500 &
    colData(pbmc)$percent.mt < 5
pbmc <- pbmc[, keep, drop = FALSE]
pbmc

## class: SingleCellExperiment 
## dim: 13656 2638 
## metadata(1): sclet
## assays(3): counts logcounts scaled
## rownames(13656): AL627309.1 AP006222.2 ...
##   PNRC2_ENSG00000215700 SRSF10_ENSG00000215699
## rowData names(9): ENSEMBL_ID Symbol_TENx ... p.value
##   FDR
## colnames: NULL
## colData names(16): Sample Barcode ... label ident
## reducedDimNames(3): PCA .dimred UMAP
## mainExpName: NULL
## altExpNames(0):

plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

library(aplot)
plot_list(plot1, plot2)

Users can also use plotColData() to visualize the QC data.

# VlnPlot-like
p1 <- plotColData(pbmc, y = 'nCount_RNA')

# FeatureScatter-like
p2 <- plotColData(pbmc, x = "nCount_RNA", y = "percent.mt")
plot_list(p1, p2)

3.3 Variable features

pbmc <- NormalizeData(pbmc)
pbmc <- FindVariableFeatures(pbmc, method = "scran")
top10 <- head(VariableFeatures(pbmc, method = "scran"), 10)
top10

##  [1] "LYZ"     "S100A9"  "HLA-DRA" "CST3"    "TYROBP" 
##  [6] "S100A8"  "CD74"    "S100A4"  "NKG7"    "FTL"

VariableFeaturePlot(pbmc, label = top10, method = "scran")

3.4 Dimensional reduction

pbmc <- ScaleData(pbmc)
pbmc <- RunPCA(pbmc, subset_row = VariableFeatures(pbmc, method = "scran"), layer = "scaled")
ElbowPlot(pbmc)

Here the PCA step is intentionally pointed to the logical scaled layer. In current sclet, this is the recommended way to declare which expression view should drive the downstream reduction, instead of hard-coding the underlying assay name.

3.5 Clustering

set.seed(2024-10-01)
pbmc <- FindNeighbors(pbmc, dims = 1:10)
pbmc <- FindClusters(pbmc, resolution = 0.88)

head(Idents(pbmc), 5)

## [1] 1 2 3 4 5
## Levels: 1 2 3 4 5 6 7 8 9

3.6 UMAP

pbmc <- RunUMAP(pbmc, 1:10)

library(aplot)
# DimPlot delegates to ggsc::sc_dim
p1 <- DimPlot(pbmc, reduction="PCA")
p2 <- DimPlot(pbmc, reduction="UMAP")

plot_list(gglist = list(PCA=p1, UMAP=p2))

3.7 Find Markers

# find all markers of cluster 2
cluster2.markers <- FindMarkers(pbmc, ident.1 = 2)

## Using 'logcounts' for marker testing instead of active layer 'scaled'.

head(cluster2.markers, n = 5)

##                    pval          padj avg_log2FC pct.1
## CD79A      0.000000e+00  0.000000e+00   6.902666 0.931
## MS4A1      0.000000e+00  0.000000e+00   5.704095 0.847
## TCL1A     9.482714e-274 4.316532e-270   6.842121 0.622
## CD79B     2.249658e-272 7.680334e-269   4.624108 0.911
## LINC00926 1.410438e-269 3.852188e-266   7.365360 0.559
##           pct.2
## CD79A     0.041
## MS4A1     0.053
## TCL1A     0.021
## CD79B     0.141
## LINC00926 0.009

# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0, 3))

## Using 'logcounts' for marker testing instead of active layer 'scaled'.

head(cluster5.markers, n = 5)

##                 pval          padj avg_log2FC pct.1 pct.2
## GZMB   4.579849e-132 6.254242e-128   8.274930 0.968 0.038
## PRF1   7.445905e-117 5.084064e-113   6.214512 0.961 0.085
## FGFBP2 2.674185e-116 1.217289e-112   7.832385 0.877 0.034
## NKG7   1.793180e-114 6.121915e-111   7.423024 1.000 0.141
## GNLY   2.752842e-109 7.518562e-106   7.651881 0.961 0.125

# find markers for every cluster compared to all remaining cells, 
# report only the positive ones
library(dplyr)
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE)

## Using 'logcounts' for marker testing instead of active layer 'scaled'.

## Calculating markers for all clusters using presto...

topN_marker <- function(markers, n) {
    markers %>%
        group_by(cluster) %>%
        dplyr::filter(avg_log2FC > 1) %>%
        slice_head(n = n) %>%
        ungroup()
}

top10 <- topN_marker(pbmc.markers, 10)
split(top10$gene, top10$cluster)

## $`1`
## [1] "CCL5" "GZMK" "NKG7" "CST7" "GZMA" "CD8A" "CTSW" "IL32"
## [9] "CD3D"
## 
## $`2`
##  [1] "CD79A"     "MS4A1"     "TCL1A"     "CD79B"    
##  [5] "LINC00926" "HLA-DQA1"  "VPREB3"    "HLA-DQB1" 
##  [9] "CD74"      "HLA-DRA"  
## 
## $`3`
## [1] "LTB"  "IL32" "IL7R" "LDHB" "CD3D"
## 
## $`4`
##  [1] "RP11-290F20.3" "FCGR3A"        "MS4A7"        
##  [4] "IFITM3"        "LRRC25"        "LST1"         
##  [7] "HCK"           "SERPINA1"      "FCER1G"       
## [10] "PILRA"        
## 
## $`5`
##  [1] "GZMB"   "FGFBP2" "SPON2"  "PRF1"   "GNLY"   "CLIC3" 
##  [7] "XCL2"   "KLRD1"  "CST7"   "GZMA"  
## 
## $`6`
## [1] "LDHB" "CD3D"
## 
## $`7`
##  [1] "S100A8" "LGALS2" "S100A9" "CD14"   "FCN1"   "TYROBP"
##  [7] "MS4A6A" "LYZ"    "CST3"   "TYMP"  
## 
## $`8`
##  [1] "FCER1A"   "CLEC10A"  "HLA-DQA2" "CPVL"     "HLA-DQA1"
##  [6] "HLA-DQB1" "HLA-DRB5" "HLA-DMB"  "HLA-DPA1" "CST3"    
## 
## $`9`
##  [1] "GP9"        "AP001189.4" "ITGA2B"     "LY6G6F"    
##  [5] "TMEM40"     "SEPT5"      "TREML1"     "CLDN5"     
##  [9] "HGD"        "PTCRA"

3.7.1 Marker gene information

top1 <- topN_marker(pbmc.markers, 1)

gene_table <- gene_summary_table(top1, gene_col = 'gene', 
    cluster_col = 'cluster', keyType='SYMBOL', output = "data.frame")

## Please cite:
## 
## D Wang, G Chen, L Li, S Wen, Z Xie, X Luo, L
## Zhan, S Xu, J Li, R Wang, Q Wang, G Yu. Reducing language
## barriers, promoting information absorption, and
## communication using fanyi. Chinese Medical Journal. 2024,
## 137(16):1950-1956. doi: 10.1097/CM9.0000000000003242

##

##

## 'select()' returned 1:1 mapping between keys and
## columns

gene_table

##   ENTREZID        gene          pval          padj
## 1     2205      FCER1A 1.406360e-225 1.920525e-221
## 2     2815         GP9  0.000000e+00  0.000000e+00
## 3     3002        GZMB 2.105904e-271 2.875822e-267
## 4     3945        LDHB 5.301429e-100  6.033026e-97
## 5     4050         LTB 4.642685e-109 6.340050e-105
## 6     6279      S100A8  0.000000e+00  0.000000e+00
## 7     6352        CCL5 8.548851e-238 1.167431e-233
## 8      973       CD79A  0.000000e+00  0.000000e+00
## 9     <NA> RP11-290F20 2.549479e-214 1.740784e-210
##   avg_log2FC pct.1 pct.2 cluster  avgExpr statistic
## 1   2.230133 0.862 0.011       8 2.251988   70201.0
## 2   3.143334 0.923 0.002       9 3.146220   32806.5
## 3   3.466439 0.968 0.068       5 3.611869  373837.0
## 4   1.147535 0.934 0.607       6 2.507821  902735.5
## 5   1.544459 0.976 0.632       3 3.235589  924181.5
## 6   3.744559 0.982 0.130       7 3.988260  978634.5
## 7   3.163484 0.966 0.217       1 3.665564  763745.5
## 8   2.703317 0.931 0.041       2 2.770227  765272.5
## 9   1.459260 0.810 0.062       4 1.568584  406468.0
##         auc   name                     description
## 1 0.9278360 FCER1A          Fc epsilon receptor Ia
## 2 0.9613626    GP9        glycoprotein IX platelet
## 3 0.9772597   GZMB                      granzyme B
## 4 0.7892583   LDHB         lactate dehydrogenase B
## 5 0.8026087    LTB                lymphotoxin beta
## 6 0.9852703 S100A8 S100 calcium binding protein A8
## 7 0.9491389   CCL5    C-C motif chemokine ligand 5
## 8 0.9626348  CD79A                  CD79a molecule
## 9 0.8781643   <NA>                            <NA>
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       summary
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  The immunoglobulin epsilon receptor (IgE receptor) is the initiator of the allergic response. When two or more high-affinity IgE receptors are brought together by allergen-bound IgE molecules, mediators such as histamine that are responsible for allergy symptoms are released. This receptor is comprised of an alpha subunit, a beta subunit, and two gamma subunits. The protein encoded by this gene represents the alpha subunit. [provided by RefSeq, Aug 2011]
## 2                                                                                                                                                                                                                                                                                                                                      This gene encodes a small membrane glycoprotein found on the surface of human platelets. It forms a 1-to-1 noncovalent complex with glycoprotein Ib, a platelet surface membrane glycoprotein complex that functions as a receptor for von Willebrand factor. The complete receptor complex includes noncovalent association of the alpha and beta subunits with the protein encoded by this gene and platelet glycoprotein V. Defects in this gene are a cause of Bernard-Soulier syndrome, also known as giant platelet disease. These patients have unusually large platelets and have a clinical bleeding tendency. [provided by RefSeq, Oct 2008]
## 3                                                                                                                                                                                                                                                                                                                                                                                This gene encodes a member of the granzyme subfamily of proteins, part of the peptidase S1 family of serine proteases. The encoded preproprotein is secreted by natural killer (NK) cells and cytotoxic T lymphocytes (CTLs) and proteolytically processed to generate the active protease, which induces target cell apoptosis. This protein also processes cytokines and degrades extracellular matrix proteins, and these roles are implicated in chronic inflammation and wound healing. Expression of this gene may be elevated in human patients with cardiac fibrosis. [provided by RefSeq, Sep 2016]
## 4                                                                                                                                                                                                                                                                                  This gene encodes the B subunit of lactate dehydrogenase enzyme, which catalyzes the interconversion of pyruvate and lactate with concomitant interconversion of NADH and NAD+ in a post-glycolysis process. Alternatively spliced transcript variants have been found for this gene. Recent studies have shown that a C-terminally extended isoform is produced by use of an alternative in-frame translation termination codon via a stop codon readthrough mechanism, and that this isoform is localized in the peroxisomes. Mutations in this gene are associated with lactate dehydrogenase B deficiency. Pseudogenes have been identified on chromosomes X, 5 and 13. [provided by RefSeq, Feb 2016]
## 5                                                                                                                                                                         Lymphotoxin beta is a type II membrane protein of the TNF family. It anchors lymphotoxin-alpha to the cell surface through heterotrimer formation. The predominant form on the lymphocyte surface is the lymphotoxin-alpha 1/beta 2 complex (e.g. 1 molecule alpha/2 molecules beta) and this complex is the primary ligand for the lymphotoxin-beta receptor. The minor complex is lymphotoxin-alpha 2/beta 1. LTB is an inducer of the inflammatory response system and involved in normal development of lymphoid tissue. Lymphotoxin-beta isoform b is unable to complex with lymphotoxin-alpha suggesting a function for lymphotoxin-beta which is independent of lympyhotoxin-alpha. Alternative splicing results in multiple transcript variants encoding different isoforms. [provided by RefSeq, Jul 2008]
## 6                                                                                                                                                                                                                                                                     The protein encoded by this gene is a member of the S100 family of proteins containing 2 EF-hand calcium-binding motifs. S100 proteins are localized in the cytoplasm and/or nucleus of a wide range of cells, and involved in the regulation of a number of cellular processes such as cell cycle progression and differentiation. S100 genes include at least 13 members which are located as a cluster on chromosome 1q21. This protein may function in the inhibition of casein kinase and as a cytokine. Altered expression of this protein is associated with the disease cystic fibrosis. Multiple transcript variants encoding different isoforms have been found for this gene. [provided by RefSeq, Jan 2016]
## 7 This gene is one of several chemokine genes clustered on the q-arm of chromosome 17. Chemokines form a superfamily of secreted proteins involved in immunoregulatory and inflammatory processes. The superfamily is divided into four subfamilies based on the arrangement of the N-terminal cysteine residues of the mature peptide. This chemokine, a member of the CC subfamily, functions as a chemoattractant for blood monocytes, memory T helper cells and eosinophils. It causes the release of histamine from basophils and activates eosinophils. This cytokine is one of the major HIV-suppressive factors produced by CD8+ cells. It functions as one of the natural ligands for the chemokine receptor chemokine (C-C motif) receptor 5 (CCR5), and it suppresses in vitro replication of the R5 strains of HIV-1, which use CCR5 as a coreceptor. Alternative splicing results in multiple transcript variants that encode different isoforms. [provided by RefSeq, Jul 2013]
## 8                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  The B lymphocyte antigen receptor is a multimeric complex that includes the antigen-specific component, surface immunoglobulin (Ig). Surface Ig non-covalently associates with two other proteins, Ig-alpha and Ig-beta, which are necessary for expression and function of the B-cell antigen receptor. This gene encodes the Ig-alpha protein of the B-cell antigen component. Alternatively spliced transcript variants encoding different isoforms have been described. [provided by RefSeq, Jul 2008]
## 9                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        <NA>

3.8 Cell cluster annotation

new_labels <- c(
    "1" = "Naive CD4+ T", 
    "2" = "B", 
    "3" = "Memory CD4+", 
    "4" = "CD14+ Mono", 
    "5" = "NK", 
    "6" = "CD8+ T", 
    "7" = "Stress Response / Malignant Cells", 
    "8" = "FCGR3A+ Mono", 
    "9" = "Myeloid / Neutrophil"
)
pbmc <- RenameIdents(pbmc, new_labels)
DimPlot(pbmc, reduction="UMAP") + ggsc::sc_dim_geom_label()

3.9 Automatic Annotation (SingleR)

sclet provides RunSingleR to easily perform automatic annotation using SingleR.

# This requires SingleR and celldex packages.
hpca <- celldex::HumanPrimaryCellAtlasData()
pbmc2 <- RunSingleR(pbmc, ref = hpca, labels = hpca$label.main)

## Using 'logcounts' for SingleR instead of active layer 'scaled'.

## Annotation added to colData columns: 'SingleR_labels' and 'SingleR_pruned.labels'

# or be explicit about the expression layer used for annotation
pbmc2 <- RunSingleR(pbmc, ref = hpca, labels = hpca$label.main, layer = "logcounts")

## Annotation added to colData columns: 'SingleR_labels' and 'SingleR_pruned.labels'

# View the results
head(pbmc2$SingleR_labels)

## [1] "T_cells"  "B_cell"   "T_cells"  "Monocyte" "NK_cell" 
## [6] "T_cells"

get_mapping(pbmc2)

## $id
## [1] "singler"
## 
## $type
## [1] "mapping"
## 
## $status
## [1] "completed"
## 
## $method
## [1] "SingleR"
## 
## $inputs
## $inputs$assay
## [1] "logcounts"
## 
## $inputs$layer
## [1] "logcounts"
## 
## $inputs$reference_class
## [1] "SummarizedExperiment"
## 
## $inputs$mode
## [1] "label_transfer"
## 
## 
## $artifacts
## $artifacts$labels_col
## [1] "SingleR_labels"
## 
## $artifacts$mapping_type
## [1] "reference_mapping"
## 
## 
## $params
## list()
## 
## $summary
## list()
## 
## $created_at
## [1] "2026-05-31 03:49:48 UTC"

# Visualize
DimPlot(pbmc2, reduction="UMAP") + 
    ggplot2::aes(color = SingleR_labels)

## Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'

## Also defined by 'alabaster.base'

## Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'

## Also defined by 'alabaster.base'

RunSingleR() now follows the same layer contract as the rest of the package. If you do not specify layer, it resolves from DefaultLayer(pbmc) and falls back to a non-scaled expression source when needed.

In addition to writing annotation labels back to colData, it also records a lightweight reference-mapping state that you can inspect with get_mapping(pbmc).