Last updated: 2025-02-24

Checks: 7 0

Knit directory: muse/

Read HDF5 files into a list.

# Collect the full paths of every file under data/ whose name matches "5k_Human".
hdf5_files <- list.files(path = "data", pattern = "5k_Human", full.names = TRUE)
[1] "data/5k_Human_Donor1_PBMC_3p_gem-x_5k_Human_Donor1_PBMC_3p_gem-x_count_sample_filtered_feature_bc_matrix.h5"
[2] "data/5k_Human_Donor2_PBMC_3p_gem-x_5k_Human_Donor2_PBMC_3p_gem-x_count_sample_filtered_feature_bc_matrix.h5"
[3] "data/5k_Human_Donor3_PBMC_3p_gem-x_5k_Human_Donor3_PBMC_3p_gem-x_count_sample_filtered_feature_bc_matrix.h5"
[4] "data/5k_Human_Donor4_PBMC_3p_gem-x_5k_Human_Donor4_PBMC_3p_gem-x_count_sample_filtered_feature_bc_matrix.h5"

Read raw counts into a list of matrices.

# Read each HDF5 file into a sparse count matrix and prefix its column names
# with "donor<i>_" (i = position of the file in hdf5_files) so that cell names
# remain unique once the matrices are combined.
mats <- purrr::map(seq_along(hdf5_files), function(x) {
  my_mat <- Seurat::Read10X_h5(hdf5_files[x])
  colnames(my_mat) <- paste0('donor', x, '_', colnames(my_mat))
  # Return the matrix explicitly: the value of the `colnames<-` assignment is
  # only the vector of new names, not the matrix.
  my_mat
})
str(mats, max.level = 1)
List of 4
 $ :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
 $ :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
 $ :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
 $ :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots

Seurat object

Create Seurat object from the list of matrices.

# Build one Seurat object from the list of matrices, applying the min.cells
# and min.features filters below. Each list element becomes its own counts
# layer.
pbmc20k <- CreateSeuratObject(
  counts = mats,
  min.cells = 3,
  min.features = 200
)
An object of class Seurat 
27385 features across 22061 samples within 1 assay 
Active assay: RNA (27385 features, 0 variable features)
 4 layers present: counts.1, counts.2, counts.3, counts.4

Create one count layer.

# Merge the per-donor layers (counts.1 ... counts.4) into a single counts layer.
pbmc20k <- JoinLayers(pbmc20k)
An object of class Seurat 
27385 features across 22061 samples within 1 assay 
Active assay: RNA (27385 features, 0 variable features)
 1 layer present: counts

Donor information in orig.ident.

                          orig.ident nCount_RNA nFeature_RNA
donor1_AAACCAAAGGTGACGA-1     donor1      42833         7079
donor1_AAACCCTGTGACGAGT-1     donor1       4890         2102
donor1_AAACGAATCAGGCTAC-1     donor1      12498         3564
donor1_AAACGACAGATTGACT-1     donor1      22193         4366
donor1_AAACGATGTCTTGAAC-1     donor1      10305         2945
donor1_AAACGATGTGCGCGAA-1     donor1      15947         4160

Use {BPCells} to convert the matrices in your already created Seurat objects to on-disk matrices. Note that this is only possible for V5 assays. Convert the counts matrix of the RNA assay to a BPCells matrix.

  # Convert the counts to unsigned 32-bit integers and write them to a BPCells
  # matrix directory, replacing any directory already at that path.
  BPCells::write_matrix_dir(
    mat = BPCells::convert_matrix_type(matrix = pbmc20k@assays$RNA$counts, type = "uint32_t"),
    dir = 'data/pbmc20k',
    overwrite = TRUE
  )
27385 x 22061 IterableMatrix object with class MatrixDir

Row names: ENSG00000238009, ENSG00000239945 ... AMELY

Data type: uint32_t
Storage order: column major

Queued Operations:
1. Load compressed matrix from directory /home/rstudio/muse/data/pbmc20k
# Open the matrix directory at data/pbmc20k as a BPCells matrix object.
pbmc20k.mat <- open_matrix_dir(dir = "data/pbmc20k")

# Replace the counts layer of the RNA assay with the matrix opened from disk.
pbmc20k@assays$RNA$counts <- pbmc20k.mat
27385 x 22061 IterableMatrix object with class RenameDims

Row names: ENSG00000238009, ENSG00000239945 ... AMELY

Data type: uint32_t
Storage order: column major

Queued Operations:
1. Load compressed matrix from directory /home/rstudio/muse/data/pbmc20k
2. Reset dimnames

Seurat workflow

Process with the Seurat 4 workflow.

# Set the future.globals.maxSize option to 2 * 1024^3 bytes (2 GiB).
options(future.globals.maxSize = 2 * 1024^3)

# Prepare the feature-by-cell matrix used for dimensional reduction.
#
# Looks up the layer matching `layer`, drops requested features that have no
# variance or are absent from that layer, and returns the layer subset to the
# remaining features. Unlike a plain apply()-based variance, the variance of an
# on-disk BPCells IterableMatrix is computed with BPCells::matrix_stats().
#
# Arguments:
#   object   Object holding the layers (passed to SeuratObject::Layers() and
#            SeuratObject::LayerData()).
#   features Character vector of features to keep; defaults to the object's
#            variable features when NULL.
#   layer    Search pattern for the layer to use; only the first element is
#            used.
#   verbose  If TRUE, warn about features dropped for having zero variance.
#
# Returns the layer data restricted to the usable features (all columns kept).
# Stops if no layer matches, if there are no features to use, or if none of
# the requested features has any variance.
fixed_PrepDR5 <- function(object, features = NULL, layer = 'scale.data', verbose = TRUE) {
  layer <- layer[1L]
  olayer <- layer
  layer <- SeuratObject::Layers(object = object, search = layer)
  if (is.null(layer)) {
    # stop() rather than an unqualified abort(): this function is defined
    # outside the Seurat namespace, so rlang's abort() may not be resolvable.
    stop(
      "No layer matching pattern '", olayer, "' found. Please run ScaleData and retry",
      call. = FALSE
    )
  }
  data.use <- SeuratObject::LayerData(object = object, layer = layer)
  features <- features %||% SeuratObject::VariableFeatures(object = object)
  if (!length(x = features)) {
    stop("No variable features, run FindVariableFeatures() or provide a vector of features", call. = FALSE)
  }
  if (methods::is(data.use, "IterableMatrix")) {
    features.var <- BPCells::matrix_stats(matrix = data.use, row_stats = "variance")$row_stats["variance", ]
  } else {
    features.var <- apply(X = data.use, MARGIN = 1L, FUN = stats::var)
  }
  # Match variances to features by row name, not by position: the layer's rows
  # are not guaranteed to be in the same order (or of the same length) as
  # `features`. Features absent from the layer are reported further down.
  names(features.var) <- rownames(data.use)
  no.var <- names(features.var)[is.na(features.var) | features.var <= 0]
  features.keep <- features[!features %in% no.var]
  if (!length(x = features.keep)) {
    stop("None of the requested features have any variance", call. = FALSE)
  } else if (length(x = features.keep) < length(x = features)) {
    exclude <- setdiff(x = features, y = features.keep)
    if (isTRUE(x = verbose)) {
      warning(
        "The following ",
        length(x = exclude),
        " features requested have zero variance; running reduction without them: ",
        paste(exclude, collapse = ', '),
        call. = FALSE,
        immediate. = TRUE
      )
    }
  }
  features <- features.keep
  features <- features[!is.na(x = features)]
  features.use <- features[features %in% rownames(data.use)]
  if (!isTRUE(all.equal(features, features.use))) {
    missing_features <- setdiff(features, features.use)
    if (length(missing_features) > 0) {
      warning_message <- paste0(
        "The following features were not available: ",
        paste(missing_features, collapse = ", "),
        "."
      )
      warning(warning_message, immediate. = TRUE)
    }
  }
  data.use <- data.use[features.use, ]
  return(data.use)
}

# Replace the PrepDR5 binding inside the Seurat namespace with fixed_PrepDR5,
# which computes feature variances via BPCells when the layer is an
# IterableMatrix.
assignInNamespace('PrepDR5', fixed_PrepDR5, 'Seurat')

# Run a normalise -> variable features -> scale -> PCA -> Harmony -> UMAP
# workflow on a Seurat object and return the processed object.
#
# Arguments:
#   seurat_obj   Seurat object to process.
#   scale_factor Scale factor passed to NormalizeData().
#   num_features Number of variable features to select.
#   num_pcs      Number of Harmony dimensions used for the UMAP.
#   cluster_res  Not used by this workflow; retained so existing calls that
#                supply it keep working.
#   debug_flag   Passed as `verbose` to the Seurat steps.
#
# Harmony integrates on the "orig.ident" metadata column.
seurat_wf_v4 <- function(seurat_obj, scale_factor = 1e4, num_features = 2000, num_pcs = 30, cluster_res = 0.5, debug_flag = FALSE) {
  seurat_obj <- NormalizeData(
    seurat_obj,
    normalization.method = "LogNormalize",
    scale.factor = scale_factor,
    verbose = debug_flag
  )
  seurat_obj <- FindVariableFeatures(
    seurat_obj,
    selection.method = 'vst',
    nfeatures = num_features,
    verbose = debug_flag
  )
  seurat_obj <- ScaleData(seurat_obj, verbose = debug_flag)
  seurat_obj <- RunPCA(seurat_obj, verbose = debug_flag)
  seurat_obj <- RunHarmony(seurat_obj, "orig.ident")
  # seq_len() instead of 1:num_pcs so that num_pcs = 0 does not yield c(1, 0).
  seurat_obj <- RunUMAP(
    seurat_obj,
    reduction = "harmony",
    dims = seq_len(num_pcs),
    verbose = debug_flag
  )

  seurat_obj
}

# Run the workflow on the combined object with its default parameters.
pbmc20k <- seurat_wf_v4(pbmc20k)
Transposing data matrix
Initializing state using k-means centroids initialization
Harmony 1/10
Harmony 2/10
Harmony 3/10
Harmony converged after 3 iterations
Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session

Normalised and scaled data are stored as IterableMatrix objects.

27385 x 22061 IterableMatrix object with class RenameDims

Row names: ENSG00000238009, ENSG00000239945 ... AMELY

Data type: double
Storage order: column major

Queued Operations:
1. Load compressed matrix from directory /home/rstudio/muse/data/pbmc20k
2. Reset dimnames
3. Convert type from uint32_t to double
4. Scale by 1e+04
5. Scale columns by 2.33e-05, 0.000204 ... 7.7e-05
6. Transform log1p
7. Reset dimnames
2000 x 22061 IterableMatrix object with class RenameDims

Row names: HES4, ISG15 ... ENSG00000265995

Data type: double
Storage order: column major

Queued Operations:
1. Load compressed matrix from directory /home/rstudio/muse/data/pbmc20k
2. Select rows: 19, 20 ... 27288 and cols: all
3. Reset dimnames
4. Convert type from uint32_t to double
5. Scale by 1e+04
6. Scale columns by 2.33e-05, 0.000204 ... 7.7e-05
7. Transform log1p
8. Select rows: 489, 1810 ... 438 and cols: all
9. Reset dimnames
10. Transform min by row: 2.8, 2.49 ... 1
11. Scale rows by 3.59, 4.04 ... 10.1
12. Shift rows by -0.0687, -0.0609 ... -0.12
13. Select rows: 1243, 443 ... 1553 and cols: all
14. Reset dimnames


# UMAP coloured by donor (orig.ident).
DimPlot(pbmc20k, reduction = "umap", group.by = "orig.ident", pt.size = 0.1)

Version Author Date
f6301ec Dave Tang 2025-02-22


Annotate using {SingleR}.

# Fetch version "2024-02-26" of the "monaco_immune" reference dataset.
monaco_immune <- fetchReference("monaco_immune", "2024-02-26")
class: SummarizedExperiment 
dim: 46077 114 
assays(1): logcounts
rownames(46077): A1BG A1BG-AS1 ... ZYX ZZEF1
rowData names(0):
colnames(114): DZQV_CD8_naive DZQV_CD8_CM ... G4YW_Neutrophils
colData names(3): label.main label.fine label.ont
# Annotate every cell against the Monaco immune reference using its main labels.
# The normalised layer is converted to an in-memory dgCMatrix: coercing to the
# virtual class "sparseMatrix" went through a dense intermediate (see the
# 4.5 GiB allocation warning in the output).
# NOTE(review): the ref/labels arguments were truncated in this copy; label.main
# matches the labels shown in the output — confirm against the original analysis.
pbmc20k.anno <- SingleR(
  test = as(pbmc20k@assays$RNA$data, "dgCMatrix"),
  ref = monaco_immune,
  labels = monaco_immune$label.main
)
Warning: Converting to a dense matrix may use excessive memory
This message is displayed once every 8 hours.
Warning in asMethod(object): sparse->dense coercion: allocating vector of size
4.5 GiB
DataFrame with 6 rows and 4 columns
                                                  scores       labels
                                                <matrix>  <character>
donor1_AAACCAAAGGTGACGA-1 0.324853:0.258408:0.483121:...      T cells
donor1_AAACCCTGTGACGAGT-1 0.151325:0.124804:0.304293:... CD4+ T cells
donor1_AAACGAATCAGGCTAC-1 0.252303:0.235463:0.436783:... CD4+ T cells
donor1_AAACGACAGATTGACT-1 0.318234:0.338696:0.122619:...    Monocytes
donor1_AAACGATGTCTTGAAC-1 0.230288:0.199487:0.418062:... CD4+ T cells
donor1_AAACGATGTGCGCGAA-1 0.470569:0.243399:0.252875:...      B cells
                           <numeric>   <character>
donor1_AAACCAAAGGTGACGA-1  0.0820203       T cells
donor1_AAACCCTGTGACGAGT-1  0.0984110  CD4+ T cells
donor1_AAACGAATCAGGCTAC-1  0.0642314  CD4+ T cells
donor1_AAACGACAGATTGACT-1  0.1762658     Monocytes
donor1_AAACGATGTCTTGAAC-1  0.0913007  CD4+ T cells
donor1_AAACGATGTGCGCGAA-1  0.1458069       B cells

Add annotations to metadata.

# NOTE(review): the original statement is truncated to ") ->" in this copy.
# This stores the SingleR labels in a 'labels' metadata column, which the
# annotated UMAP groups by — confirm against the original analysis.
# Guard against a cell-order mismatch before copying the labels across.
stopifnot(identical(rownames(pbmc20k.anno), colnames(pbmc20k)))
pbmc20k$labels <- pbmc20k.anno$labels

UMAP with annotations.

# UMAP coloured by the 'labels' metadata column, with repelled text labels.
DimPlot(
  pbmc20k,
  reduction = "umap",
  group.by = "labels",
  pt.size = 0.1,
  label = TRUE,
  repel = TRUE
)

Version Author Date
ffad125 Dave Tang 2025-02-23

Saving and loading

If you save your object and load it in the future, Seurat will access the on-disk matrices by their path, which is stored in the assay level data. To make it easy to ensure these are saved in the same place, we provide new functionality to the SaveSeuratRds() function. In this function, you specify your filename. The pointer to the path in the Seurat object will change to the current directory.

This also makes it easy to share your Seurat objects with BPCells matrices by sharing a folder that contains both the object and the BPCells directory.

Make sure you use a different directory than where the on-disk matrices are stored or they will be recursively copied.

output_dir <- "data/pbmc20k_seurat"

# Make sure the destination exists before saving; this is a no-op when the
# directory is already there.
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

# Save the object as an .rds file inside output_dir.
SaveSeuratRds(
  object = pbmc20k,
  file = file.path(output_dir, "pbmc20k.rds")
)
Warning: Trying to move '/home/rstudio/muse/data/pbmc20k' to itself, skipping
Trying to move '/home/rstudio/muse/data/pbmc20k' to itself, skipping
Trying to move '/home/rstudio/muse/data/pbmc20k' to itself, skipping
[1] "pbmc20k"     "pbmc20k.rds"
# List the contents of the matrix directory inside the output directory.
list.files(file.path(output_dir, "pbmc20k"))
 [1] "col_names"         "idxptr"            "index_data"       
 [4] "index_idx"         "index_idx_offsets" "index_starts"     
 [7] "row_names"         "shape"             "storage_order"    
[10] "val_data"          "val_idx"           "val_idx_offsets"  
[13] "version"          

Use LoadSeuratRds() to load the object; otherwise none of the layers will be imported.

# Load the saved object back from the output directory.
pbmc20k_import <- LoadSeuratRds(file.path(output_dir, "pbmc20k.rds"))
An object of class Seurat 
27385 features across 22061 samples within 1 assay 
Active assay: RNA (27385 features, 2000 variable features)
 3 layers present: counts, data, scale.data
 3 dimensional reductions calculated: pca, harmony, umap
27385 x 22061 IterableMatrix object with class RenameDims

Row names: ENSG00000238009, ENSG00000239945 ... AMELY

Data type: uint32_t
Storage order: column major

Queued Operations:
1. Load compressed matrix from directory /home/rstudio/muse/data/pbmc20k
2. Reset dimnames

R version 4.4.1 (2024-06-14)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.5 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/ 
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/;  LAPACK version 3.10.0

 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] celldex_1.16.0              SingleR_2.8.0              
 [3] SummarizedExperiment_1.36.0 Biobase_2.66.0             
 [5] GenomicRanges_1.58.0        GenomeInfoDb_1.42.3        
 [7] IRanges_2.40.1              S4Vectors_0.44.0           
 [9] BiocGenerics_0.52.0         MatrixGenerics_1.18.1      
[11] matrixStats_1.4.1           BPCells_0.3.0              
[13] Seurat_5.1.0                SeuratObject_5.0.2         
[15] sp_2.1-4                    harmony_1.2.1              
[17] Rcpp_1.0.13                 patchwork_1.3.0            
[19] lubridate_1.9.3             forcats_1.0.0              
[21] stringr_1.5.1               dplyr_1.1.4                
[23] purrr_1.0.2                 readr_2.1.5                
[25] tidyr_1.3.1                 tibble_3.2.1               
[27] ggplot2_3.5.1               tidyverse_2.0.0            
[29] workflowr_1.7.1            

loaded via a namespace (and not attached):
  [1] fs_1.6.4                  spatstat.sparse_3.1-0    
  [3] httr_1.4.7                RColorBrewer_1.1-3       
  [5] tools_4.4.1               sctransform_0.4.1        
  [7] alabaster.base_1.6.1      utf8_1.2.4               
  [9] R6_2.5.1                  HDF5Array_1.34.0         
 [11] lazyeval_0.2.2            uwot_0.2.2               
 [13] rhdf5filters_1.18.0       withr_3.0.2              
 [15] gridExtra_2.3             progressr_0.15.0         
 [17] cli_3.6.3                 spatstat.explore_3.3-3   
 [19] fastDummies_1.7.4         labeling_0.4.3           
 [21] alabaster.se_1.6.0        sass_0.4.9               
 [23] spatstat.data_3.1-2       ggridges_0.5.6           
 [25] pbapply_1.7-2             parallelly_1.38.0        
 [27] rstudioapi_0.17.1         RSQLite_2.3.7            
 [29] generics_0.1.3            ica_1.0-3                
 [31] spatstat.random_3.3-2     Matrix_1.7-0             
 [33] fansi_1.0.6               abind_1.4-8              
 [35] lifecycle_1.0.4           whisker_0.4.1            
 [37] yaml_2.3.10               rhdf5_2.50.2             
 [39] SparseArray_1.6.1         BiocFileCache_2.14.0     
 [41] Rtsne_0.17                grid_4.4.1               
 [43] blob_1.2.4                promises_1.3.0           
 [45] ExperimentHub_2.14.0      crayon_1.5.3             
 [47] miniUI_0.1.1.1            lattice_0.22-6           
 [49] beachmat_2.22.0           cowplot_1.1.3            
 [51] KEGGREST_1.46.0           pillar_1.9.0             
 [53] knitr_1.48                future.apply_1.11.3      
 [55] codetools_0.2-20          leiden_0.4.3.1           
 [57] glue_1.8.0                getPass_0.2-4            
 [59] spatstat.univar_3.0-1     data.table_1.16.2        
 [61] vctrs_0.6.5               png_0.1-8                
 [63] gypsum_1.2.0              spam_2.11-0              
 [65] gtable_0.3.6              cachem_1.1.0             
 [67] xfun_0.48                 S4Arrays_1.6.0           
 [69] mime_0.12                 survival_3.6-4           
 [71] fitdistrplus_1.2-1        ROCR_1.0-11              
 [73] nlme_3.1-164              bit64_4.5.2              
 [75] alabaster.ranges_1.6.0    filelock_1.0.3           
 [77] RcppAnnoy_0.0.22          rprojroot_2.0.4          
 [79] bslib_0.8.0               irlba_2.3.5.1            
 [81] KernSmooth_2.23-24        colorspace_2.1-1         
 [83] DBI_1.2.3                 tidyselect_1.2.1         
 [85] processx_3.8.4            bit_4.5.0                
 [87] compiler_4.4.1            curl_5.2.3               
 [89] git2r_0.35.0              httr2_1.0.5              
 [91] BiocNeighbors_2.0.1       hdf5r_1.3.11             
 [93] DelayedArray_0.32.0       plotly_4.10.4            
 [95] scales_1.3.0              lmtest_0.9-40            
 [97] callr_3.7.6               rappdirs_0.3.3           
 [99] digest_0.6.37             goftest_1.2-3            
[101] spatstat.utils_3.1-0      alabaster.matrix_1.6.1   
[103] rmarkdown_2.28            RhpcBLASctl_0.23-42      
[105] XVector_0.46.0            htmltools_0.5.8.1        
[107] pkgconfig_2.0.3           sparseMatrixStats_1.18.0 
[109] highr_0.11                dbplyr_2.5.0             
[111] fastmap_1.2.0             rlang_1.1.4              
[113] htmlwidgets_1.6.4         UCSC.utils_1.2.0         
[115] shiny_1.9.1               DelayedMatrixStats_1.28.1
[117] farver_2.1.2              jquerylib_0.1.4          
[119] zoo_1.8-12                jsonlite_1.8.9           
[121] BiocParallel_1.40.0       BiocSingular_1.22.0      
[123] magrittr_2.0.3            GenomeInfoDbData_1.2.13  
[125] dotCall64_1.2             Rhdf5lib_1.28.0          
[127] munsell_0.5.1             reticulate_1.39.0        
[129] stringi_1.8.4             alabaster.schemas_1.6.0  
[131] zlibbioc_1.52.0           MASS_7.3-60.2            
[133] AnnotationHub_3.14.0      plyr_1.8.9               
[135] parallel_4.4.1            listenv_0.9.1            
[137] ggrepel_0.9.6             deldir_2.0-4             
[139] Biostrings_2.74.1         splines_4.4.1            
[141] tensor_1.5                hms_1.1.3                
[143] ps_1.8.1                  igraph_2.1.1             
[145] spatstat.geom_3.3-3       RcppHNSW_0.6.0           
[147] reshape2_1.4.4            ScaledMatrix_1.14.0      
[149] BiocVersion_3.20.0        evaluate_1.0.1           
[151] BiocManager_1.30.25       tzdb_0.4.0               
[153] httpuv_1.6.15             RANN_2.6.2               
[155] polyclip_1.10-7           future_1.34.0            
[157] scattermore_1.2           rsvd_1.0.5               
[159] xtable_1.8-4              RSpectra_0.16-2          
[161] later_1.3.2               viridisLite_0.4.2        
[163] memoise_2.0.1             AnnotationDbi_1.68.0     
[165] cluster_2.1.6             timechange_0.3.0         
[167] globals_0.16.3