stemangiola · stemangiola · Jun 17, 2020 · Jun 17, 2020 · Jun 17, 2020 · Jun 17, 2020
diff --git a/.gitignore b/.gitignore
@@ -13,5 +13,9 @@ dev/false_positive_study.RData
 dev/false_positive_study.pdf
 dev/*rds
 dev/*rda
+dev/*csv
+dev/*xlsx
+dev/*RData
+dev/*txt
 dev/approximation_bias_correction_cache*
 temp*
diff --git a/R/ppcseq.R b/R/ppcseq.R
@@ -64,7 +64,8 @@ identify_outliers = function(.data,
 														 tol_rel_obj = 0.01,
 														 just_discovery = F,
 														 seed = sample(1:99999, size = 1),
-														 adj_prob_theshold_2 = NULL
+														 adj_prob_theshold_2 = NULL,
+														 return_fit = FALSE
 ) {
 	# Prepare column same enquo
 	.sample = enquo(.sample)
@@ -281,23 +282,32 @@ identify_outliers = function(.data,
 	merge_results(
 
 		# Calculate CI 2 for discovery for plotting
-		res_discovery %>%
-			left_join(
-				(.) %>%
-					attr("fit") %>%
-					fit_to_counts_rng_approximated(adj_prob_theshold_2, how_many_posterior_draws_2, truncation_compensation = 0.7352941, cores) %>%
-					select(S, G, .lower_1 = .lower, .upper_1 = .upper)
-			),
-		res_test, formula,
+		res_discovery,
+
+		# Just used for article comparative analyses between 1 and 2 steps
+		# %>%
+		# 	left_join(
+		# 		(.) %>%
+		# 			attr("fit") %>%
+		# 			fit_to_counts_rng_approximated(adj_prob_theshold_2, how_many_posterior_draws_2, truncation_compensation = 0.7352941, cores) %>%
+		# 			select(S, G, .lower_1 = .lower, .upper_1 = .upper)
+		# 	),
+
+		res_test,
+		formula,
 		!!.transcript,
 		!!.abundance,
 		!!.sample,
 		do_check_only_on_detrimental
 	) %>%
 
-		# Add fit attribute if any
-		add_attr(res_discovery %>% attr("fit"), "fit 1") %>%
-		add_attr(res_test %>% attr("fit"), "fit 2") %>%
+		# If return_fit
+		when(
+			return_fit ~ (.) %>%
+				add_attr(res_discovery %>% attr("fit"), "fit 1") %>%
+				add_attr(res_test %>% attr("fit"), "fit 2"),
+			~ (.)
+		)	%>%
 
 		# Add total draws
 		add_attr(res_test %>% attr("total_draws"), "total_draws")
@@ -483,7 +493,7 @@ do_inference = function(my_df,
 		as_matrix() %>% t
 
 	# Get the matrix of the idexes of the outlier data points
-	# to explude from the model if it is the second passage
+	# to exclude from the model if it is the second passage
 	to_exclude_MPI = get_outlier_data_to_exlude(counts_MPI, to_exclude, shards)
 
 	# Package data
@@ -547,7 +557,7 @@ do_inference = function(my_df,
 
 		ifelse_pipe(
 			approximate_posterior_analysis,
-			~ .x %>% fit_to_counts_rng_approximated(adj_prob_theshold, how_many_posterior_draws * 10, truncation_compensation, cores),
+			~ .x %>% fit_to_counts_rng_approximated(adj_prob_theshold, how_many_posterior_draws * 10, truncation_compensation, cores, how_many_to_check),
 			~ .x %>% fit_to_counts_rng(adj_prob_theshold)
 		) %>%
 

diff --git a/R/utilities.R b/R/utilities.R
@@ -448,8 +448,8 @@ merge_results = function(res_discovery, res_test, formula, .transcript, .abundan
 			!!.abundance,
 			!!.sample,
 			mean,
-			`.lower_1`,
-			`.upper_1`,
+			# `.lower_1`,
+			# `.upper_1`,
 			`exposure rate`,
 			slope_1 = slope,
 			one_of(parse_formula(formula))
@@ -525,8 +525,6 @@ format_results = function(.data, formula, .transcript, .abundance, .sample, do_c
 		)
 }
 
-
-
 #' Select only significant genes plus background for efficient normalisation
 #'
 #' @importFrom rstan sampling
@@ -564,7 +562,6 @@ select_to_check_and_house_keeping = function(.data, .do_check, .significance, .t
 		}
 }
 
-
 #' add_exposure_rate
 #'
 #' @importFrom tidyr separate
@@ -655,49 +652,123 @@ fit_to_counts_rng = function(fit, adj_prob_theshold){
 #'
 #' @export
 
-fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_posterior_draws, truncation_compensation, cores){
+fit_to_counts_rng_approximated = function(fit, adj_prob_theshold, how_many_posterior_draws, truncation_compensation, cores, how_many_to_check){
 
 	writeLines(sprintf("executing %s", "fit_to_counts_rng_approximated"))
 
-	draws_mu =
-		fit %>% extract("lambda_log_param") %>% `[[` (1) %>% as.data.frame() %>% setNames(sprintf("mu.%s", colnames(.))) %>%
-		as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, mu, -.draw) %>% separate(par, c("par", "S", "G"), sep="\\.") %>% select(-par)
-	draws_sigma =
-		fit %>% extract("sigma_raw") %>% `[[` (1) %>% as.data.frame() %>% setNames(sprintf("sigma.%s", colnames(.) %>% gsub("V", "", .))) %>%
-		as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, sigma, -.draw) %>% separate(par, c("par", "G"), sep="\\.") %>% select(-par)
-	draws_exposure =
-		fit %>% extract("exposure_rate") %>% `[[` (1) %>% as.data.frame() %>% setNames(sprintf("exposure.%s", colnames(.) %>% gsub("V", "", .))) %>%
-		as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, exposure, -.draw) %>% separate(par, c("par", "S"), sep="\\.") %>% select(-par)
-
-	draws_mu %>%
-		left_join(draws_sigma, by = c(".draw", "G")) %>%
-		left_join(draws_exposure, by = c(".draw", "S")) %>%
-		nest(data = -c(S, G)) %>%
+	draws_mu = fit %>% extract("lambda_log_param") %>% .[[1]] %>% .[,,1:how_many_to_check]
+
+	# %>% as.data.frame() %>% setNames(sprintf("mu.%s", colnames(.))) %>%
+	# 	as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, mu, -.draw) %>% separate(par, c("par", "S", "G"), sep="\\.") %>% select(-par)
+
+	draws_sigma = fit %>% extract("sigma_raw") %>% .[[1]] %>% .[,1:how_many_to_check]
+
+	# %>% as.data.frame() %>% setNames(sprintf("sigma.%s", colnames(.) %>% gsub("V", "", .))) %>%
+	# 	as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, sigma, -.draw) %>% separate(par, c("par", "G"), sep="\\.") %>% select(-par)
+
+	draws_exposure = 	fit %>% extract("exposure_rate") %>% .[[1]]
+
+	# %>% as.data.frame() %>% setNames(sprintf("exposure.%s", colnames(.) %>% gsub("V", "", .))) %>%
+	# 	as_tibble() %>% mutate(.draw = 1:n()) %>% gather(par, exposure, -.draw) %>% separate(par, c("par", "S"), sep="\\.") %>% select(-par)
+
+	expand_grid(S = 1:(dim(draws_mu)[2]), G = 1:(dim(draws_mu)[3])) %>%
+		mutate(truncation_compensation = !!truncation_compensation) %>%
 		mutate(
-			CI = map(
-				data,
-				~ {
-					.x_supersampled = .x %>%	sample_n(how_many_posterior_draws, replace = T)
-					draws = rnbinom(n =how_many_posterior_draws,	mu = exp(.x_supersampled$mu + .x_supersampled$exposure),	size = 1/exp(.x_supersampled$sigma) * truncation_compensation	)
-					draws %>%
-						# Process quantile
-						quantile(c(adj_prob_theshold, 1 - adj_prob_theshold)) %>%
-				  	tibble::as_tibble(rownames="prop") %>%
-						tidyr::spread(prop, value) %>%
-						setNames(c(".lower", ".upper")) %>%
-						# Add mean and sd
-						dplyr::mutate(mean = mean(draws), sd = sd(draws))
-				}
+			CI = pmap(
+				list(  S,G, truncation_compensation),
+				~ tibble(
+					mu = draws_mu[,..1, ..2],
+					sigma = draws_sigma[,..2]	,
+					exposure = draws_exposure[,..1],
+					truncation_compensation = ..3
+				) %>%
+				get_CI_semi_analytically_pnbinom(	adj_prob_theshold	) %>%
+
+				# Add mean and sd
+				dplyr::mutate(
+					mean = mean(exp(draws_mu[,..1, ..2] + draws_exposure[,..1])),
+					sd = mean(1/exp(draws_sigma[,..2]) * truncation_compensation)
+				)
 			)
 		) %>%
-		select(-data) %>%
-		unnest(CI) %>%
-
-		# Adapt to old dataset
 		mutate(.variable = "counts_rng") %>%
-		mutate(S = as.integer(S), G = as.integer(G))
+		unnest(CI)
+
+	# draws_mu %>%
+	# 	left_join(draws_sigma, by = c(".draw", "G")) %>%
+	# 	left_join(draws_exposure, by = c(".draw", "S")) %>%
+	# 	nest(data = -c(S, G)) %>%
+	# 	mutate(
+	# 		CI = future_map(
+	# 			data,
+	# 			~ {
+	# 				get_CI_semi_analytically_pnbinom(
+	# 					.x,
+	# 					adj_prob_theshold
+	# 				) %>%
+	# 					# Add mean and sd
+	# 					dplyr::mutate(
+	# 						mean = mean(exp(.x$mu + .x$exposure)),
+	# 						sd = mean(1/exp(.x$sigma) * truncation_compensation)
+	# 					)
+	# 			}
+	# 		)
+	# 	) %>%
+	# 	select(-data) %>%
+	# 	unnest(CI) %>%
+	#
+	# 	# Adapt to old dataset
+	# 	mutate(.variable = "counts_rng") %>%
+	# 	mutate(S = as.integer(S), G = as.integer(G))
 
+}
+
+get_CI_semi_analytically_pnbinom_core = function(.x, .quantile){
+	ab<-range(
+		qnbinom(
+			.quantile,
+			mu = exp(.x$mu + .x$exposure),
+			size = 1/exp(.x$sigma) * .x$truncation_compensation
+		)
+	);
+
+	if(sum(ab) == 0) return(0);
+
+	opt<-optim(
+		mean(ab),
+		function (x)
+			sapply(
+				x,
+				function(p)
+					((.quantile)-mean(pnbinom(p, 	mu = exp(.x$mu + .x$exposure),
+																		size = 1/exp(.x$sigma) * .x$truncation_compensation )))^2
+			),
+		method='Brent',
+		lower=ab[1],
+		upper=ab[2]
+	)
+	opt$par
+}
+
+get_CI_semi_analytically_pnbinom = function(.x, .quantile){
+
+	tibble(
+		.lower = .x %>% get_CI_semi_analytically_pnbinom_core(.quantile),
+		.upper = .x %>% get_CI_semi_analytically_pnbinom_core(1-.quantile)
+	)
+
+
+}
 
+get_CI_semi_analytically_rnbinom = function(.x, .quantile, how_many_posterior_draws){
+	.x_supersampled = .x %>%	sample_n(how_many_posterior_draws, replace = T)
+	draws = rnbinom(n =how_many_posterior_draws,	mu = exp(.x_supersampled$mu + .x_supersampled$exposure),	size = 1/exp(.x_supersampled$sigma) * .x$truncation_compensation	)
+	draws %>%
+		# Process quantile
+		quantile(c(.quantile, 1 - .quantile)) %>%
+		tibble::as_tibble(rownames="prop") %>%
+		tidyr::spread(prop, value) %>%
+		setNames(c(".lower", ".upper"))
 }
 
 save_generated_quantities_in_case = function(.data, fit, save_generated_quantities){

diff --git a/inst/stan/negBinomial_MPI.stan b/inst/stan/negBinomial_MPI.stan
@@ -112,6 +112,7 @@ functions{
   }
 
 matrix merge_coefficients(row_vector intercept, row_vector alpha_sub_1, matrix alpha_2,  int C, int S, int G){
+	// Here I build the coefficient matrix appending the house keeping genes
 	matrix[C,G] my_alpha;