library(SplitKnockoff)Author : Yuxuan Chen, Haoxue Wang, Yang Cao, Xinwei Sun, Yuan Yao
Split Knockoff is a data adaptive variable selection framework for controlling the (directional) false discovery rate (FDR) in structural sparsity, where variable selection on linear transformation of parameters is of concern. This proposed scheme relaxes the linear subspace constraint to its neighborhood, often known as variable splitting in optimization, which leads to better incoherence and possible performance improvement in both power and FDR. This vignette illustrates the usage of SplitKnockoff with simulation experiments in Cao et al. (2023) <arXiv.2310.07605> and will help you apply the split knockoff method in a light way.
install.packages("SplitKnockoff") # just one line code to install our packageThis is a R implement on the Matlab version of Split Knockoffs. This R package is more convenient as glmnet can be used directly by
install.packages("glmnet") # just one line code to install the glmnet toolPlease update your glmnet package to the latest version for smooth usage of this package, the examples illustrated here are tested with glmnet 4.1-8.
For more information, please see the manual inside this package.
sk.filter(X, D, y, option) : the main function, Split Knockoff filter, for variable selection in structural sparsity problem.
sk.create(X, y, D, nu, option): generate the split knockoff copy for structural sparsity problem.
sk.W_path(X, D, y, nu, option): generate the \(W\) statistics for split knockoff on a split LASSO path.
sk.W_fixed(X, D, y, nu, option): generate the \(W\) statistics for split knockoff based on a fixed \(\beta(\lambda) = \hat{\beta}\).
select(W, q, method): this function is for variable selection based on \(W\) statistics.
install.packages("SplitKnockoff") # install our package
library(latex2exp)
library(ggplot2)
library(Matrix)
library(glmnet)
library(SplitKnockoff)k <- 20 # sparsity level
A <- 1 # magnitude
n <- 500 # sample size
p <- 100 # dimension of variables
c <- 0.5 # feature correlation
sigma <-1 # noise level
option <- list()
# the target (directional) FDR control
option$q <- 0.2
# whether to normalize the dataset
option$normalize <- 'true'
# fraction of data used to estimate \beta(\lambda)
option$frac = 2/5
# choice on the set of regularization parameters for split LASSO path
option$lambda <- 10.^seq(0, -6, by=-0.01)
option$lambda_cv <- 10.^seq(0, -6, by=-0.01)
# choice of nu for split knockoffs
expo = seq(0, 2, by = 0.2)
option$nu <- 10.^expo
option$nu_cv <- 10.^expo
num_nu <- length(option$nu)
# set random seed
option$seed = 1
# the number of simulation instances
option$tests = 200
option$W = 's'D_G = matrix(0, p-1, p)
for (i in 1:(p-1)){
D_G[i, i] = 1
D_G[i, i+1] = -1
}
# generate D1, D2, and D3
D_1 = diag(p)
D_2 = D_G
D_3 = rbind(diag(p), D_G)
D_s = list(D_1 = D_1, D_2 = D_2, D_3 = D_3)# generate Sigma
Sigma = matrix(0, p, p)
for( i in 1: p){
for(j in 1: p){
Sigma[i, j] <- c^(abs(i - j))
}
}library(mvtnorm) # package mvtnorm needed for this generation
set.seed(100)
X <- rmvnorm(n,matrix(0, p, 1), Sigma) # generate Xbeta_true <- matrix(0, p, 1)
for( i in 1: k){
beta_true[i, 1] = A
if ( i%%3 == 1){
beta_true[i, 1] = 0
}
}Split knockoff for all \(\nu\)
# choice on the way generating the W statistics
option$beta = 'path'
# save Z t_Z r t_Z for making plots
rawvalue = list()
# save y
Y = list()
tests = option$tests # the number of experiments
# create matrices to store results
fdr_split = array(0, dim = c(3, tests, num_nu, 2))
power_split = array(0, dim = c(3, tests, num_nu, 2))
for (test in 1:tests){
# generate varepsilon
set.seed(test)
# generate noise and y
varepsilon = matrix(rnorm(n), ncol = 1) * sqrt(sigma)
y = X %*% beta_true + varepsilon
Y[[test]] = y
raw = list()
for (j in 1:3){
# choose the respective D for each example
D = D_s[[j]]
gamma_true <- D %*% beta_true
sk_results = sk.filter(X, D, y, option)
results = sk_results$results
raw[[j]] = sk_results$stats
for (i in 1:num_nu){
r_sign = sk_results$stats[[i]]$r
result = results[[i]]$sk
fdr_split[j, test, i, 1] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1)
power_split[j, test, i, 1] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0)
result = results[[i]]$sk_plus
fdr_split[j, test, i, 2] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1)
power_split[j, test, i, 2] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0)
}
}
rawvalue[[test]] = raw
}Split knockoff for \(\nu\) chosen by cross validation
# choice on the way generating the W statistics
option$beta = 'cv_all'
# create matrices to store results
fdr_cv = array(0, dim = c(3, tests, 2))
power_cv = array(0, dim = c(3, tests, 2))
for (test in 1:tests){
y <- Y[[test]]
for (j in 1:3){
# choose the respective D for each example
D = D_s[[j]]
gamma_true <- D %*% beta_true
sk_results = sk.filter(X, D, y, option)
results = sk_results$results
r_sign = sk_results$stats$r
result = results$sk
fdr_cv[j, test, 1] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1)
power_cv[j, test, 1] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0)
result = results$sk_plus
fdr_cv[j, test, 2] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1)
power_cv[j, test, 2] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0)
}
}
mean_fdr_cv <- apply(fdr_cv, c(1, 3), mean)
sd_fdr_cv <- apply(fdr_cv, c(1, 3), sd)
mean_power_cv <- apply(power_cv, c(1, 3), mean)
sd_power_cv <- apply(power_cv, c(1, 3), sd)Knockoff
library(knockoff) # package knockoff needed
# create matrices to store results
fdr_knockoff = array(0, dim = c(2, tests, 2))
power_knockoff = array(0, dim = c(2, tests, 2))
D = D_s[[2]]
# Calculate X_bar, y_bar
Z <- t(D) %*% solve(D %*% t(D))
XF <- X %*% rep(1, p)
U_X <- XF / norm(XF, "F")
UU_X <- cbind(U_X, matrix(0, n, n-1))
qrresult <- qr(UU_X)
Qreslt <- qr.Q(qrresult)
UX_perp <- Qreslt[,2:n]
y_bar <- t(UX_perp) %*% y
X_bar <- t(UX_perp) %*% X %*% Z
for (test in 1:tests){
y <- Y[[test]]
# choose D_1 for each example
D = D_s[[1]]
gamma_true <- D %*% beta_true
k_results = knockoff.filter(X, y, knockoffs = {function(x) create.fixed(x, y=y, method='equi')}, statistic = stat.lasso_lambdasmax)
W_k = k_results$statistic
result = select(W_k, option$q, 'knockoff')
fdr_knockoff[1, test, 1] = sum(gamma_true[result] == 0) / max(length(result), 1)
power_knockoff[1, test, 1] = sum(gamma_true[result] != 0) / sum(gamma_true != 0)
result = select(W_k, option$q, 'knockoff+')
fdr_knockoff[1, test, 2] = sum(gamma_true[result] == 0) / max(length(result), 1)
power_knockoff[1, test, 2] = sum(gamma_true[result] != 0) / sum(gamma_true != 0)
# choose the D_2 for each example
D = D_s[[2]]
gamma_true <- D %*% beta_true
k_results = knockoff.filter(X_bar, y_bar, knockoffs = {function(x) create.fixed(x, y=y_bar, method='equi')}, statistic = stat.lasso_lambdasmax)
W_k = k_results$statistic
result = select(W_k, option$q, 'knockoff')
fdr_knockoff[2, test, 1] = sum(gamma_true[result] == 0) / max(length(result), 1)
power_knockoff[2, test, 1] = sum(gamma_true[result] != 0) / sum(gamma_true != 0)
result = select(W_k, option$q, 'knockoff+')
fdr_knockoff[2, test, 2] = sum(gamma_true[result] == 0) / max(length(result), 1)
power_knockoff[2, test, 2] = sum(gamma_true[result] != 0) / sum(gamma_true != 0)
}Plot figure 3
x <- expo
t_value = qt(c(0.1, 0.9), tests - 1)
lower_bound = t_value[1]
upper_bound = t_value[2]
fdr_knockoff_mean <- apply(fdr_knockoff, c(1, 3), mean)
power_knockoff_mean <- apply(power_knockoff, c(1, 3), mean)
fdr_knockoff_sd <- apply(fdr_knockoff, c(1, 3), sd)
power_knockoff_sd <- apply(power_knockoff, c(1, 3), sd)
fdr_split_mean <- apply(fdr_split, c(1, 3, 4), mean)
fdr_split_sd <- apply(fdr_split, c(1, 3, 4), sd)
power_split_mean <- apply(power_split, c(1, 3, 4), mean)
power_split_sd <- apply(power_split, c(1, 3, 4), sd)
fdr_split_top <- fdr_split_mean + fdr_split_sd * upper_bound
fdr_split_bot <- fdr_split_mean + fdr_split_sd * lower_bound
power_split_top <- power_split_mean + power_split_sd * upper_bound
power_split_bot <- power_split_mean + power_split_sd * lower_bound
## for D_1
## plot for FDR
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_31_fdr.png', height=1000, width=1000)
plot_data <- data.frame(
x = rep(x, 4),
y = c(fdr_split_mean[1, , 1], fdr_split_mean[1, , 2], rep(fdr_knockoff_mean[1, 1], num_nu), rep(fdr_knockoff_mean[1, 2], num_nu)),
type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu)
)
# Plotting
fig <- ggplot() +
geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
geom_ribbon(aes(x = expo, ymin = fdr_split_bot[1, , 1], ymax = fdr_split_top[1, , 1]), fill = "red", alpha = 0.05) +
geom_ribbon(aes(x = expo, ymin = fdr_split_bot[1, , 2], ymax = fdr_split_top[1, , 2]), fill = "blue", alpha = 0.05) +
geom_abline(intercept = option$q, slope=0, linetype="dashed") +
labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) +
ggtitle(expression('D'[1])) +
coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) +
theme_light() +
theme(plot.title = element_text(hjust = 0.5))
print(fig)
dev.off()
## plot for Power
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_31_power.png', height=1000, width=1000)
plot_data <- data.frame(
x = rep(x, 4),
y = c(power_split_mean[1, , 1], power_split_mean[1, , 2], rep(power_knockoff_mean[1, 1], num_nu), rep(power_knockoff_mean[1, 2], num_nu)),
type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu)
)
# Plotting
fig <- ggplot() +
geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
geom_ribbon(aes(x = expo, ymin = power_split_bot[1, , 1], ymax = power_split_top[1, , 1]), fill = "red", alpha = 0.05) +
geom_ribbon(aes(x = expo, ymin = power_split_bot[1, , 2], ymax = power_split_top[1, , 2]), fill = "blue", alpha = 0.05) +
labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") +
ggtitle(expression('D'[1])) +
coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) +
theme_light() +
theme(plot.title = element_text(hjust = 0.5))
print(fig)
dev.off()
## for D_2
## plot for FDR
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_32_fdr.png', height=1000, width=1000)
plot_data <- data.frame(
x = rep(x, 4),
y = c(fdr_split_mean[2, , 1], fdr_split_mean[2, , 2], rep(fdr_knockoff_mean[2, 1], num_nu), rep(fdr_knockoff_mean[2, 2], num_nu)),
type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu)
)
# Plotting
fig <- ggplot() +
geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
geom_ribbon(aes(x = expo, ymin = fdr_split_bot[2, , 1], ymax = fdr_split_top[2, , 1]), fill = "red", alpha = 0.05) +
geom_ribbon(aes(x = expo, ymin = fdr_split_bot[2, , 2], ymax = fdr_split_top[2, , 2]), fill = "blue", alpha = 0.05) +
geom_abline(intercept = option$q, slope=0, linetype="dashed") +
labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) +
ggtitle(expression('D'[2])) +
coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) +
theme_light() +
theme(plot.title = element_text(hjust = 0.5))
print(fig)
dev.off()
## plot for Power
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_32_power.png', height=1000, width=1000)
plot_data <- data.frame(
x = rep(x, 4),
y = c(power_split_mean[2, , 1], power_split_mean[2, , 2], rep(power_knockoff_mean[2, 1], num_nu), rep(power_knockoff_mean[2, 2], num_nu)),
type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu)
)
# Plotting
fig <- ggplot() +
geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
geom_ribbon(aes(x = expo, ymin = power_split_bot[2, , 1], ymax = power_split_top[2, , 1]), fill = "red", alpha = 0.05) +
geom_ribbon(aes(x = expo, ymin = power_split_bot[2, , 2], ymax = power_split_top[2, , 2]), fill = "blue", alpha = 0.05) +
labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") +
ggtitle(expression('D'[2])) +
coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) +
theme_light() +
theme(plot.title = element_text(hjust = 0.5))
print(fig)
dev.off()
## for D_3
## plot for FDR
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_33_fdr.png', height=1000, width=1000)
plot_data <- data.frame(
x = rep(x, 2),
y = c(fdr_split_mean[3, , 1], fdr_split_mean[3, , 2]),
type = rep(c("Split Knockoff", "Split Knockoff+"), each = num_nu)
)
# Plotting
fig <- ggplot() +
geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
geom_ribbon(aes(x = expo, ymin = fdr_split_bot[3, , 1], ymax = fdr_split_top[3, , 1]), fill = "red", alpha = 0.05) +
geom_ribbon(aes(x = expo, ymin = fdr_split_bot[3, , 2], ymax = fdr_split_top[3, , 2]), fill = "blue", alpha = 0.05) +
geom_abline(intercept = option$q, slope=0, linetype="dashed") +
labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) +
ggtitle(expression('D'[3])) +
coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue")) +
theme_light() +
theme(plot.title = element_text(hjust = 0.5))
print(fig)
dev.off()
## plot for Power
png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_33_power.png', height=1000, width=1000)
plot_data <- data.frame(
x = rep(x, 2),
y = c(power_split_mean[3, , 1], power_split_mean[3, , 2]),
type = rep(c("Split Knockoff", "Split Knockoff+"), each = num_nu)
)
# Plotting
fig <- ggplot() +
geom_line(data = plot_data, aes(x = x, y = y, color = type)) +
geom_ribbon(aes(x = expo, ymin = power_split_bot[3, , 1], ymax = power_split_top[3, , 1]), fill = "red", alpha = 0.05) +
geom_ribbon(aes(x = expo, ymin = power_split_bot[3, , 2], ymax = power_split_top[3, , 2]), fill = "blue", alpha = 0.05) +
labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") +
ggtitle(expression('D'[3])) +
coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) +
scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue")) +
theme_light() +
theme(plot.title = element_text(hjust = 0.5))
print(fig)
dev.off()saveRDS(list(mean_fdr_D1_knockoff = fdr_knockoff_mean[1, 1],
sd_fdr_D1_knockoff = fdr_knockoff_sd[1, 1],
mean_power_D1_knockoff = power_knockoff_mean[1, 1],
sd_power_D1_knockoff = power_knockoff_sd[1, 1],
mean_fdr_D2_knockoff = fdr_knockoff_mean[2, 1],
sd_fdr_D2_knockoff = fdr_knockoff_sd[2, 1],
mean_power_D2_knockoff = power_knockoff_mean[2, 1],
sd_power_D2_knockoff = power_knockoff_sd[2, 1],
mean_fdr_D1_sk = mean_fdr_cv[1, 1],
sd_fdr_D1_sk = sd_fdr_cv[1, 1],
mean_power_D1_sk = mean_power_cv[1, 1],
sd_power_D1_sk = sd_power_cv[1, 1],
mean_fdr_D2_sk = mean_fdr_cv[2, 1],
sd_fdr_D2_sk = sd_fdr_cv[2, 1],
mean_power_D2_sk = mean_power_cv[2, 1],
sd_power_D2_sk = sd_power_cv[2, 1],
mean_fdr_D3_sk = mean_fdr_cv[3, 1],
sd_fdr_D3_sk = sd_fdr_cv[3, 1],
mean_power_D3_sk = mean_power_cv[3, 1],
sd_power_D3_sk = sd_power_cv[3, 1],
mean_fdr_D1_knockoff_pl = fdr_knockoff_mean[1, 2],
sd_fdr_D1_knockoff_pl = fdr_knockoff_sd[1, 2],
mean_power_D1_knockoff_pl = power_knockoff_mean[1, 2],
sd_power_D1_knockoff_pl = power_knockoff_sd[1, 2],
mean_fdr_D2_knockoff_pl = fdr_knockoff_mean[2, 2],
sd_fdr_D2_knockoff_pl = fdr_knockoff_sd[2, 2],
mean_power_D2_knockoff_pl = power_knockoff_mean[2, 2],
sd_power_D2_knockoff_pl = power_knockoff_sd[2, 2],
mean_fdr_D1_sk_pl = mean_fdr_cv[1, 2],
sd_fdr_D1_sk_pl = sd_fdr_cv[1, 2],
mean_power_D1_sk_pl = mean_power_cv[1, 2],
sd_power_D1_sk_pl = sd_power_cv[1, 2],
mean_fdr_D2_sk_pl = mean_fdr_cv[2, 2],
sd_fdr_D2_sk_pl = sd_fdr_cv[2, 2],
mean_power_D2_sk_pl = mean_power_cv[2, 2],
sd_power_D2_sk_pl = sd_power_cv[2, 2],
mean_fdr_D3_sk_pl = mean_fdr_cv[3, 2],
sd_fdr_D3_sk_pl = sd_fdr_cv[3, 2],
mean_power_D3_sk_pl = mean_power_cv[3, 2],
sd_power_D3_sk_pl = sd_power_cv[3, 2]),
file = 'D:/SplitKnockoff_results/simu_experiments/Table/table1.rds')