Skip to content
This repository has been archived by the owner on Dec 8, 2020. It is now read-only.

Commit

Permalink
Merge pull request #12 from vib-singlecell-nf/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
dweemx committed May 14, 2020
2 parents cf6b730 + 0685372 commit 0eaee03
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 36 deletions.
6 changes: 3 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ RUN apt-get -y update && \
apt-get install -y libcurl4-openssl-dev libxml2-dev zlib1g-dev libhdf5-dev && \
apt-get install -y libssl-dev && \
# png.h: No such file or directory
apt-get install -y libpng-dev && \
apt-get install -y libpng-dev && \
R -e "install.packages('doFuture')" && \
R -e "install.packages('doRNG')" && \
R -e "install.packages('optparse')" && \
R -e "install.packages('foreach')" && \
R -e "install.packages('doMC')" && \
R -e "install.packages('dismo')" && \
R -e "devtools::install_github(repo = 'aertslab/SCopeLoomR')" && \
# Need to run ps
Expand Down
78 changes: 50 additions & 28 deletions bin/run_pca_cv.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@ print("##################################################")
print("# Calculating Optimal Number of PCs using PCA CV #")
print("##################################################")

# Loading dependencies scripts

# matrix <- GetDataFromS4ObjectByAccessPath(object = combined, path = "@assays$integrated@scale.data")
# e.g.: Rscript /ddn1/vol1/staging/leuven/stg_00002/lcb/dwmax/documents/aertslab/scripts/src_dwmax/ngs_digest/bin/scrna_seq/dim_reduction_pca_cv.R -i 10x_REW_Seurat_v3_Integrate_MaxFeatures.Rds -a "@assays\$integrated@scale.data"
# This a R implementation of the algorithm described at:
# - https://stats.stackexchange.com/questions/93845/how-to-perform-leave-one-out-cross-validation-for-pca-to-determine-the-number-of

library("optparse")
parser <- OptionParser(
Expand Down Expand Up @@ -64,12 +62,15 @@ parser <- add_option(
)
parser <- add_option(
parser,
c("-m", "--max-iters"), action="store", default = 500, help="Maximum number of iteration. [default %default]")
c("-m", "--max-iters"),
action="store",
default = 500,
help="Maximum number of iteration. [default %default]")
parser <- add_option(
parser,
c("-s", "--seed"),
action = "store",
default = 617,
default = NULL,
help="Seed. [default %default]"
)
parser <- add_option(
Expand Down Expand Up @@ -142,46 +143,68 @@ RunPCACV <- function(
n.cores = 1,
verbose = T,
default.svd = F) {


# Load the libraries required for parallelization
library(doFuture)
library(doRNG)

if(is.null(seed))
stop("No seed is set, this will likely give none reproducible results. Please set one.")

# Required by irlba::irlba for reproducibility
if(!is.null(seed)) {
set.seed(seed)
} else {
warnings("No seed is set!")
}
library(foreach)
library(doMC)
registerDoMC(cores = n.cores)
library(irlba)

set.seed(seed)

# Setup the parallelization
print(paste0("Number of workers used: ", n.cores))
registerDoFuture()
plan(multiprocess, workers = n.cores)
registerDoRNG(as.numeric(seed))

if(default.svd) {
print("Default SVD is using for estimating the number PCs.")
print("Default singular value decomposition (SVD) is used for estimating the number PCs.")
}

pc <- seq(from = from, to = to, by = by)

# Init the error matrices
error <- matrix(0, nrow = length(c(1:k)), ncol = length(x = pc))

# Partition the data into K-folds
print(paste0(k,"-fold paritioning..."))
# K-Fold Partitioning
data_kfold <- dismo::kfold(x = Matrix::t(x = data), k=k)
# Reference: https://stats.stackexchange.com/questions/93845/how-to-perform-leave-one-out-cross-validation-for-pca-to-determine-the-number-of

# Perform PCA cross-validation
for(i in c(1:k)) {
print(paste0("k:",i))
print(paste0("Processing ", i, "th fold..."))
data_train <- Matrix::t(x = data[, data_kfold!=i])
data_test <- Matrix::t(x = data[, data_kfold==i])
print("Running SVD...")
if(!default.svd) {
pca_results <- irlba::irlba(A = data_train, nv = to, maxit = maxit, verbose = verbose)
print("...fast truncated SVD")
pca_results <- irlba::irlba(
A = data_train
, nv = to
, maxit = maxit
, verbose = verbose
)
} else {
pca_results <- base::svd(x = data_train, nv = to)
print("...regular SVD")
pca_results <- base::svd(
x = data_train
, nv = to
)
}
print("==========================================")
gl <- pca_results$v
res <- foreach(j=1:length(x = pc), .combine='rbind') %dopar% {
print(paste0("Ndims:", pc[j]))
res <- foreach(j=1:length(x = pc), .combine='rbind') %do% {
# for(j in 1:length(x = pc)) {
print(paste0("...with ", pc[j], " PCs."))
P <- gl[,1:pc[j]]%*%Matrix::t(gl[,1:pc[j]])
err <- data_test %*% (diag(x = dim(x = P)[1]) - P + diag(x = diag(x = P)))
data.frame("i"=i, "j"=j, err=sum(err^2))
return (
data.frame("i"=i, "j"=j, err=sum(err^2))
)
}
apply(X = res, MARGIN = 1, FUN = function(r) {
error[r[["i"]],r[["j"]]] <<- r[["err"]]
Expand All @@ -190,7 +213,6 @@ RunPCACV <- function(
errors <- colSums(x = error)
return (data.frame("PC"=pc,"error"=log(x = errors)))
}

cat("Parameters: \n")
print(args)

Expand All @@ -199,7 +221,7 @@ input_ext <- tools::file_ext(args$`input-file`)
if(input_ext == "loom") {
print("Input type: Loom...")
# Get the data matrix from loom
warnings("Assumption: Expecting a scaled data matrix in the main layer of the loom file!")
warnings("ASSUMPTION: Expecting a scaled data matrix in the main layer of the loom file!")
loom <- SCopeLoomR::open_loom(file.path = args$`input-file`)
data <- SCopeLoomR::get_dgem(loom = loom)
} else if(input_ext == "Rds") {
Expand Down
2 changes: 1 addition & 1 deletion pcacv.config
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
params {

pcacv {
container = "vibsinglecellnf/pcacv:0.1.0"
container = "vibsinglecellnf/pcacv:0.2.0"
find_optimal_npcs {
accessor = '@assays$RNA@scale.data'
// useVariableFeatures = true // or false
Expand Down
16 changes: 12 additions & 4 deletions processes/runPCACV.nf
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,27 @@ process PCACV__FIND_OPTIMAL_NPCS {

container params.pcacv.container
publishDir "${params.global.outdir}/data/intermediate", mode: 'symlink'
clusterOptions "-l nodes=1:ppn=${params.global.threads} -l walltime=1:00:00 -A ${params.global.qsubaccount}"
clusterOptions "-l nodes=1:ppn=${processParams.nCores} -l walltime=1:00:00 -A ${params.global.qsubaccount}"

input:
tuple val(sampleId), path(f)
tuple \
val(sampleId), \
path(f)

output:
tuple val(sampleId), stdout, emit: optimalNumberPC
tuple val(sampleId), path("${sampleId}.PCACV__FIND_OPTIMAL_NPCS.*")
tuple \
val(sampleId), \
stdout, \
emit: optimalNumberPC
tuple \
val(sampleId), \
path("${sampleId}.PCACV__FIND_OPTIMAL_NPCS.*")

script:
def sampleParams = params.parseConfig(sampleId, params.global, params.pcacv.find_optimal_npcs)
processParams = sampleParams.local
"""
export OPENBLAS_NUM_THREADS=${processParams.nCores}
${binDir}/run_pca_cv.R \
--input-file ${f} \
--seed ${params.global.seed} \
Expand Down

0 comments on commit 0eaee03

Please sign in to comment.