Last updated: 2025-02-28

The results in this page were generated with repository version 9db5d31.

Create matrix with 99% zeros.

# Dimensions of the simulated matrix: one row per gene, one column per cell.
ngenes <- 16000
ncells <- 10000

# Every entry is an independent Bernoulli(0.01) draw, so about 99% of the
# entries are expected to be zero.
my_mat <- matrix(
  data = rbinom(n = ngenes * ncells, size = 1, prob = 0.01),
  nrow = ngenes,
  ncol = ncells
)

# theoretical number of zeros
[1] 158400000
# number of zeros
sum(my_mat == 0)
[1] 158400459

Dense matrix object size.

640000216 bytes

Export dense matrix as rds.

# Path of the RDS file that will hold the dense matrix.
dense_matrix <- "my_mat.rds"
# Serialise the dense matrix to that path.
saveRDS(my_mat, dense_matrix)

Size of dense matrix file.

paste0(file.size(dense_matrix) / 1024 / 1024, " MBs")
[1] "4.30414867401123 MBs"

Convert to sparse matrix.

my_mat_sparse <- as(my_mat, "sparseMatrix")
[1] "dgCMatrix"
[1] "Matrix"

Sparse matrix object size.

19236000 bytes

Export sparse matrix as rds.

# Path of the RDS file that will hold the sparse matrix.
sparse_matrix <- "my_mat_sparse.rds"
# Serialise the sparse matrix to that path.
saveRDS(my_mat_sparse, sparse_matrix)

Size of sparse matrix file.

paste0(file.size(sparse_matrix) / 1024 / 1024, " MBs")
[1] "3.51174449920654 MBs"

Clean up.

[1] TRUE
[1] TRUE


Export as HDF5.


hdf5_file <- "my_mat.h5"
# Open the HDF5 file for writing.
file.h5 <- H5File$new(hdf5_file, mode = "w")
# Create the "data" group that the "data/matrix" dataset is stored under.
file.h5$create_group("data")

Class: H5Group
Filename: /home/rstudio/muse/my_mat.h5
Group: /data
file.h5[["data/matrix"]] <- my_mat

## Close the file at the end
## the 'close' method closes only the file-id, but leaves objects inside the file open
## This may prevent re-opening of the file. 'close_all' closes the file and all objects in it
file.h5$close_all()

Size of HDF5 file.

paste0(file.size(hdf5_file) / 1024 / 1024, " MBs")
[1] "10.5204658508301 MBs"


## now re-open it
## (the handle is re-assigned to the same variable name used for writing)
file.h5 <- H5File$new(hdf5_file, mode="r+")

## Leaving both indices empty in `[,]` selects every row and column, so the
## whole "data/matrix" dataset is read back into memory.
my_mat_import <- file.h5[["data/matrix"]][,]
[1] "matrix" "array" 
identical(my_mat, my_mat_import)
[1] TRUE

Clean up.

[1] TRUE


Wrap the steps above in a function to check how the file sizes change as the proportion of zeros changes.

#' Compare on-disk sizes of a random 0/1 matrix saved in three formats
#'
#' Simulates an `ngenes` x `ncells` matrix of Bernoulli(`prob`) draws and
#' writes it to the working directory as a dense RDS file, a sparse-matrix
#' RDS file, and an HDF5 file. The files are left on disk; removing them is
#' up to the caller.
#'
#' @param prob Probability that an entry is 1 (so `1 - prob` is the expected
#'   proportion of zeros). Also used in the output file names.
#' @param ngenes Number of rows of the simulated matrix.
#' @param ncells Number of columns of the simulated matrix.
#' @return A one-row tibble with columns `prob`, `dense_size`, `sparse_size`
#'   and `hdf5_size`; the sizes are character strings in MBs.
file_size_wf <- function(prob, ngenes = 16000, ncells = 10000) {
  my_mat <- matrix(
    data = rbinom(n = ngenes * ncells, size = 1, prob = prob),
    nrow = ngenes,
    ncol = ncells
  )
  my_mat_sparse <- as(my_mat, "sparseMatrix")

  dense_matrix <- paste0("my_mat_", prob, ".rds")
  saveRDS(object = my_mat, file = dense_matrix)

  sparse_matrix <- paste0("my_mat_sparse_", prob, ".rds")
  saveRDS(object = my_mat_sparse, file = sparse_matrix)

  hdf5_file <- paste0("my_mat_", prob, ".h5")
  file.h5 <- H5File$new(hdf5_file, mode = "w")
  # Create the parent group first, as the top-level workflow does.
  file.h5$create_group("data")
  file.h5[["data/matrix"]] <- my_mat
  # Close the file and every object in it before measuring, so the size read
  # below is that of the finished file and no handle is left open.
  file.h5$close_all()

  tibble::tibble(
    prob = prob,
    dense_size = paste0(file.size(dense_matrix) / 1024 / 1024, " MBs"),
    sparse_size = paste0(file.size(sparse_matrix) / 1024 / 1024, " MBs"),
    hdf5_size = paste0(file.size(hdf5_file) / 1024 / 1024, " MBs")
  )
}

# Run the workflow at increasing densities of ones and stack the per-run
# results row-wise. `map()` + `list_rbind()` replaces the superseded
# `map_df()`.
c(0.01, 0.05, 0.25, 0.5) |>
  purrr::map(\(x) file_size_wf(x)) |>
  purrr::list_rbind()
# A tibble: 4 × 4
   prob dense_size           sparse_size          hdf5_size           
  <dbl> <chr>                <chr>                <chr>               
1  0.01 4.30162048339844 MBs 3.50793647766113 MBs 10.5156774520874 MBs
2  0.05 12.7624349594116 MBs 16.807599067688 MBs  23.2547760009766 MBs
3  0.25 32.0998592376709 MBs 79.5521965026855 MBs 46.7966976165771 MBs
4  0.5  38.3305673599243 MBs 144.26503276825 MBs  51.6437215805054 MBs

R version 4.4.1 (2024-06-14)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.5 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/ 
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/;  LAPACK version 3.10.0

 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            

time zone: Etc/UTC
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] hdf5r_1.3.11    Matrix_1.7-0    workflowr_1.7.1

loaded via a namespace (and not attached):
 [1] bit_4.5.0         jsonlite_1.8.9    dplyr_1.1.4       compiler_4.4.1   
 [5] promises_1.3.0    tidyselect_1.2.1  Rcpp_1.0.13       stringr_1.5.1    
 [9] git2r_0.35.0      callr_3.7.6       later_1.3.2       jquerylib_0.1.4  
[13] yaml_2.3.10       fastmap_1.2.0     lattice_0.22-6    R6_2.5.1         
[17] generics_0.1.3    knitr_1.48        tibble_3.2.1      rprojroot_2.0.4  
[21] bslib_0.8.0       pillar_1.9.0      rlang_1.1.4       utf8_1.2.4       
[25] cachem_1.1.0      stringi_1.8.4     httpuv_1.6.15     xfun_0.48        
[29] getPass_0.2-4     fs_1.6.4          sass_0.4.9        bit64_4.5.2      
[33] cli_3.6.3         magrittr_2.0.3    ps_1.8.1          grid_4.4.1       
[37] digest_0.6.37     processx_3.8.4    rstudioapi_0.17.1 lifecycle_1.0.4  
[41] vctrs_0.6.5       evaluate_1.0.1    glue_1.8.0        whisker_0.4.1    
[45] fansi_1.0.6       purrr_1.0.2       rmarkdown_2.28    httr_1.4.7       
[49] tools_4.4.1       pkgconfig_2.0.3   htmltools_0.5.8.1