Please uncomment the below and run
#setting seed to ensure reproducibility
set.seed(123)
#params
n_genes <- 1000
sample_counts <- c(5, 10, 20, 50, 100)
n_reps <- 3
#' Function records run time and RAM peak usage
#'
#' @param n_genes number of genes
#' @param n_samples
#'
#' @return run time and RAM peak usage
#' @export
time_ram_single_run <- function(n_genes, n_samples) {
elapsed <- NA_real_
pr <- peakRAM({
expr_matrix <- matrix(
rlnorm(n = n_genes * n_samples, meanlog = 2, sdlog = 1),
nrow = n_genes,
ncol = n_samples
)
expr_df <- as.data.frame(expr_matrix)
rownames(expr_df) <- paste0("Gene", seq_len(n_genes))
colnames(expr_df) <- paste0("Sample", seq_len(n_samples))
t0 <- Sys.time()
res <- surprisal_analysis(expr_df)
t1 <- Sys.time()
elapsed <- as.numeric(difftime(t1, t0, units = "secs"))
})
list(
time_sec = elapsed,
peak_ram_mib = pr$Peak_RAM_Used_MiB[1]
)
}
#store info
results_df <- data.frame(
n_samples = integer(length(sample_counts)),
mean_time = numeric(length(sample_counts)),
sd_time = numeric(length(sample_counts)),
ci_lower = numeric(length(sample_counts)),
ci_upper = numeric(length(sample_counts)),
mean_peak_ram_mib = numeric(length(sample_counts)),
sd_peak_ram_mib = numeric(length(sample_counts)),
ci_lower_peak_ram_mib = numeric(length(sample_counts)),
ci_upper_peak_ram_mib = numeric(length(sample_counts)),
stringsAsFactors = FALSE
)
ci_halfwidth_fun <- function(x, n) {
sdev <- sd(x)
se <- sdev / sqrt(n)
qt(0.975, df = n - 1) * se
}
for (i in seq_along(sample_counts)) {
s <- sample_counts[i]
rep_times <- numeric(n_reps)
rep_ram <- numeric(n_reps)
for (r in seq_len(n_reps)) {
gc()
tr <- time_ram_single_run(n_genes = n_genes, n_samples = s)
rep_times[r] <- tr$time_sec
rep_ram[r] <- tr$peak_ram_mib
}
mu_time <- mean(rep_times)
sd_time <- sd(rep_times)
hw_time <- ci_halfwidth_fun(rep_times, n_reps)
mu_ram <- mean(rep_ram)
sd_ram <- sd(rep_ram)
hw_ram <- ci_halfwidth_fun(rep_ram, n_reps)
results_df[i, ] <- list(
s,
mu_time, sd_time, mu_time - hw_time, mu_time + hw_time,
mu_ram, sd_ram, mu_ram - hw_ram, mu_ram + hw_ram
)
print(i)
message(sprintf(
"n_samples = %3d : time = %.4f s (sd=%.4f, 95%% CI=[%.4f, %.4f]); peak RAM = %.2f MiB (sd=%.2f, 95%% CI=[%.2f, %.2f])",
s, mu_time, sd_time, mu_time - hw_time, mu_time + hw_time,
mu_ram, sd_ram, mu_ram - hw_ram, mu_ram + hw_ram
))
}
print(results_df)
model_cubic <- lm(mean_time ~ poly(n_samples, 3), data = results_df)
summary(model_cubic)
results_df$pred_cubic <- predict(model_cubic, newdata = results_df)
ggplot(results_df, aes(x = n_samples, y = mean_time)) +
geom_ribbon(aes(ymin = mean_time - sd_time, ymax = mean_time + sd_time),
fill = "steelblue", alpha = 0.2) +
geom_point(color = "steelblue4", size = 2, shape=8, stroke=1.5) +
geom_line(color = "steelblue4") +
stat_smooth(method = "lm",
formula = y ~ poly(x, 3),
se = FALSE,
color = "firebrick",
linetype = "dashed",
size = 1) +
labs(
x = "Number of samples",
y = "Average elapsed time (seconds)",
title = sprintf("SurprisalAnalysis Runtime vs. Number of Samples\n(gene count = %d; mean ± 95%% CI over %d runs)", n_genes, n_reps), tag="A"
) +
theme_minimal(base_size = 12)
is_bytes <- max(results_df$mean_peak_ram_mib, na.rm = TRUE) > 1e5
scale_factor <- if (is_bytes) 1/(1024^2) else 1
use_ci <- all(c("ci_lower_peak_ram_mib", "ci_upper_peak_ram_mib") %in% names(results_df))
df_plot <- within(results_df, {
mean_ram_plot <- mean_peak_ram_mib * scale_factor
lower_ram_plot <- (if (use_ci) ci_lower_peak_ram_mib else mean_peak_ram_mib - sd_peak_ram_mib) * scale_factor
upper_ram_plot <- (if (use_ci) ci_upper_peak_ram_mib else mean_peak_ram_mib + sd_peak_ram_mib) * scale_factor
})
ggplot(df_plot, aes(x = n_samples, y = mean_ram_plot)) +
geom_ribbon(aes(ymin = lower_ram_plot, ymax = upper_ram_plot),
fill = "steelblue", alpha = 0.2) +
geom_point(color = "steelblue4", size = 2, shape=8, stroke = 1.5) +
geom_line(color = "steelblue4") +
scale_x_continuous(breaks = sample_counts) +
labs(
title = sprintf("SurprisalAnalysis Peak RAM vs. Number of Samples\n(gene count = %d; mean ± 95%% CI over %d runs)", n_genes, n_reps),
x = "Number of Samples",
y = "Peak RAM (MiB)", tag="B"
) +
theme_minimal(base_size = 12)
#write.csv(results_df, "~/Downloads/runtime_altering_samples_try_2.csv")
####Altering the number of genes
#params
n_samples_fixed <- 50
gene_counts <- c(100, 500, 1000, 2000, 5000)
n_reps <- 3
results_genes_df <- data.frame(
n_genes = integer(length(gene_counts)),
mean_time = numeric(length(gene_counts)),
sd_time = numeric(length(gene_counts)),
ci_lower = numeric(length(gene_counts)),
ci_upper = numeric(length(gene_counts)),
mean_peak_ram_mib = numeric(length(gene_counts)),
sd_peak_ram_mib = numeric(length(gene_counts)),
ci_lower_peak_ram_mib = numeric(length(gene_counts)),
ci_upper_peak_ram_mib = numeric(length(gene_counts)),
stringsAsFactors = FALSE
)
ci_halfwidth_fun <- function(x, n) {
sdev <- sd(x); se <- sdev / sqrt(n); qt(0.975, df = n - 1) * se
}
for (i in seq_along(gene_counts)) {
g <- gene_counts[i]
rep_times <- numeric(n_reps)
rep_ram <- numeric(n_reps)
for (r in seq_len(n_reps)) {
gc()
tr <- time_ram_single_run(n_genes = g, n_samples = n_samples_fixed)
rep_times[r] <- tr$time_sec
rep_ram[r] <- tr$peak_ram_mib
}
mu_time <- mean(rep_times); sd_time <- sd(rep_times); hw_time <- ci_halfwidth_fun(rep_times, n_reps)
mu_ram <- mean(rep_ram); sd_ram <- sd(rep_ram); hw_ram <- ci_halfwidth_fun(rep_ram, n_reps)
results_genes_df[i, ] <- list(
g,
mu_time, sd_time, mu_time - hw_time, mu_time + hw_time,
mu_ram, sd_ram, mu_ram - hw_ram, mu_ram + hw_ram
)
message(sprintf(
"n_genes = %5d : time = %.4f s (sd=%.4f, 95%% CI=[%.4f, %.4f]); peak RAM = %.2f MiB (sd=%.2f, 95%% CI=[%.2f, %.2f])",
g, mu_time, sd_time, mu_time - hw_time, mu_time + hw_time,
mu_ram, sd_ram, mu_ram - hw_ram, mu_ram + hw_ram
))
}
print(results_genes_df)
ggplot(results_genes_df, aes(x = n_genes, y = mean_time)) +
geom_ribbon(aes(ymin = ci_lower, ymax = ci_upper),
fill = "steelblue", alpha = 0.2) +
geom_point(color = "steelblue4", size = 2, shape = 8, stroke = 1.5) +
geom_line(color = "steelblue4") +
geom_smooth(method = "lm", se = FALSE, color = "firebrick", size = 1) +
scale_x_continuous(breaks = gene_counts) +
labs(
x = "Number of genes",
y = "Average elapsed time (seconds)",
title = sprintf(
"SurprisalAnalysis Runtime vs. Number of Genes\n(sample count = %d; mean ± 95%% CI over %d runs)",
n_samples_fixed, n_reps
),
tag = "C"
) +
theme_minimal(base_size = 12)
is_bytes_g <- max(results_genes_df$mean_peak_ram_mib, na.rm = TRUE) > 1e5
scale_factor_g <- if (is_bytes_g) 1/(1024^2) else 1
use_ci_g <- all(c("ci_lower_peak_ram_mib", "ci_upper_peak_ram_mib") %in% names(results_genes_df))
df_plot_genes <- within(results_genes_df, {
mean_ram_plot_g <- mean_peak_ram_mib * scale_factor_g
lower_ram_plot_g <- (if (use_ci_g) ci_lower_peak_ram_mib else mean_peak_ram_mib - sd_peak_ram_mib) * scale_factor_g
upper_ram_plot_g <- (if (use_ci_g) ci_upper_peak_ram_mib else mean_peak_ram_mib + sd_peak_ram_mib) * scale_factor_g
})
ggplot(df_plot_genes, aes(x = n_genes, y = mean_ram_plot_g)) +
geom_ribbon(aes(ymin = lower_ram_plot_g, ymax = upper_ram_plot_g),
fill = "steelblue", alpha = 0.2) +
geom_point(color = "steelblue4", size = 2, shape = 8, stroke = 1.5) +
geom_line(color = "steelblue4") +
scale_x_continuous(breaks = gene_counts) +
labs(
title = sprintf("SurprisalAnalysis Peak RAM vs. Number of Genes\n(sample count = %d; mean ± 95%% CI over %d runs)", n_samples_fixed, n_reps),
x = "Number of genes",
y = "Peak RAM (MiB)",
tag = "D"
) +
theme_minimal(base_size = 12)
#write.csv(results_genes_df, "~/Downloads/runtime_altering_genes_try_2.csv")