Quick Start Example

Load Demo Data

# Load demonstration dataset
data(demo)

# The demo data contains:
# S - Study/site indicators (7 hospitals)
# Z - Group indicators (2 groups: IDC vs ILC)
# X - Patient covariates (clinical variables)
# Y - Outcomes (8 gene expression measurements)
# naturalGroupProp - Known population group proportions

Run Analysis

# Estimate causal effects
result <- causal.estimate(
  S = S,                           # Study indicators
  Z = Z,                           # Group indicators
  X = X,                           # Covariates
  Y = Y,                           # Outcomes
  B = 20,                          # Bootstrap samples (use 100+ in practice)
  method = "IC",                   # Weighting method
  naturalGroupProp = naturalGroupProp
)
#> *************************
#> Bootstrap: 1 
#> *************************
#> *************************
#> Bootstrap: 2 
#> *************************
#> *************************
#> Bootstrap: 3 
#> *************************
#> *************************
#> Bootstrap: 4 
#> *************************
#> *************************
#> Bootstrap: 5 
#> *************************
#> *************************
#> Bootstrap: 6 
#> *************************
#> *************************
#> Bootstrap: 7 
#> *************************
#> *************************
#> Bootstrap: 8 
#> *************************
#> *************************
#> Bootstrap: 9 
#> *************************
#> *************************
#> Bootstrap: 10 
#> *************************
#> *************************
#> Bootstrap: 11 
#> *************************
#> *************************
#> Bootstrap: 12 
#> *************************
#> *************************
#> Bootstrap: 13 
#> *************************
#> *************************
#> Bootstrap: 14 
#> *************************
#> *************************
#> Bootstrap: 15 
#> *************************
#> *************************
#> Bootstrap: 16 
#> *************************
#> *************************
#> Bootstrap: 17 
#> *************************
#> *************************
#> Bootstrap: 18 
#> *************************
#> *************************
#> Bootstrap: 19 
#> *************************
#> *************************
#> Bootstrap: 20 
#> *************************

View Results

# Print summary
summary(result)
#> ===========================================
#>      Summary of WMAP Causal Estimates
#> ===========================================
#> Method:                      IC 
#> Bootstrap samples:           20 
#> Effective sample size (ESS): 39.44% 
#> 
#> Mean differences with 95% CI:
#> ------------------------------------------- 
#> Outcome         Estimate (95% CI)
#> ------------------------------------------- 
#> Y 1               0.12 (-0.32,  0.08)
#> Y 2              -0.76 (-0.82, -0.34)
#> Y 3              -0.88 (-1.07, -0.57)
#> Y 4              -0.20 (-0.21,  0.26)
#> Y 5               0.33 ( 0.19,  0.45)
#> Y 6              -0.53 (-0.53, -0.14)
#> Y 7               0.01 (-0.22,  0.18)
#> Y 8              -0.33 (-0.89, -0.36)
#> 
#> Standard Deviation Ratios with 95% CI:
#> ------------------------------------------- 
#> Outcome         Ratio (95% CI)
#> ------------------------------------------- 
#> Y 1               1.61 ( 1.39,  1.85)
#> Y 2               1.43 ( 0.88,  1.37)
#> Y 3               1.40 ( 0.97,  1.55)
#> Y 4               1.68 ( 1.02,  1.50)
#> Y 5               1.76 ( 1.21,  2.07)
#> Y 6               1.08 ( 0.95,  1.32)
#> Y 7               0.73 ( 0.29,  3.35)
#> Y 8               1.43 ( 0.57,  0.99)

# Visualize bootstrap ESS distribution
plot(result)

Understanding the Output

Effective Sample Size (ESS): The Percent ESS indicates how much information remains after weighting:

Mean Differences: Shows estimated differences between Group 1 and Group 2 with 95% confidence intervals.

SD Ratios: Ratios of standard deviations between groups.

Advanced Features

Choosing a Weighting Method

WMAP provides three methods:

IC (Integrative Combined)
IGO (Integrative Generalized Overlap)
FLEXOR (Flexible Overlap)

# Compare methods
result_ic <- causal.estimate(S, Z, X, Y, B = 100, method = "IC", naturalGroupProp)
result_igo <- causal.estimate(S, Z, X, Y, B = 100, method = "IGO", naturalGroupProp)
result_flexor <- causal.estimate(S, Z, X, Y, B = 100, method = "FLEXOR", naturalGroupProp)

# Compare ESS
result_ic$percentESS
result_igo$percentESS
result_flexor$percentESS

Outcome Model

By default, causal.estimate uses observed outcomes directly (outcome_model = FALSE). Set outcome_model = TRUE to fit a random forest on X, S, Z and substitute predicted outcomes for observed Y in all moment calculations (mean, SD, median).

# Default: use observed outcomes directly
result <- causal.estimate(S, Z, X, Y, B = 100, method = "IC",
                          naturalGroupProp = naturalGroupProp)

# RF predictions substituted for observed Y
result <- causal.estimate(S, Z, X, Y, B = 100, method = "IC",
                          naturalGroupProp = naturalGroupProp,
                          outcome_model = TRUE)

When outcome_model = TRUE, a random forest is fit per outcome per bootstrap iteration, which increases runtime.

Cross-fitting

By default (crossfit_folds = NULL), the random forest is fit on the full sample. Set crossfit_folds to an integer >= 2 for K-fold cross-fitting:

# Enable 5-fold cross-fitting
result <- causal.estimate(S, Z, X, Y, B = 100, method = "IC",
                          naturalGroupProp = naturalGroupProp,
                          outcome_model = TRUE,
                          crossfit_folds = 5)

Bootstrap Options

Adjust Bootstrap Sample Size

# Minimum for stable CIs
result <- causal.estimate(S, Z, X, Y, B = 50, method = "IC", naturalGroupProp)

# Recommended for publication
result <- causal.estimate(S, Z, X, Y, B = 200, method = "IC", naturalGroupProp)

# High precision
result <- causal.estimate(S, Z, X, Y, B = 500, method = "IC", naturalGroupProp)

Trade-off: More bootstrap samples = better precision but longer computation time

Fast Point Estimates (No Bootstrap)

Skip bootstrap for quick exploration:

# Point estimates only (no confidence intervals)
result <- causal.estimate(S, Z, X, Y, B = 0, method = "IC", naturalGroupProp)
summary(result)  # Shows point estimates without CIs

Parallel Bootstrap

Speed up computation using multiple cores:

# Install required packages first
install.packages(c("future", "future.apply"))

# Define n_cores based on your system (e.g., 4)
result <- causal.estimate(
  S, Z, X, Y,
  B = 100,
  method = "IC",
  naturalGroupProp = naturalGroupProp,
  parallel = TRUE,
  n_cores = 4
)

The parallelization strategy depends on the environment. On Mac/Linux, multicore is used: workers are created by duplicating the current R process, so data is already in memory and no copying is required. On Windows, multisession is used: each worker is a fresh R process that receives a copy of the data, which has higher startup overhead. RStudio disables multicore regardless of platform, so multisession will be used when running inside RStudio. For best performance on Mac/Linux, run your script from the terminal using Rscript or an interactive R session.

Diagnostic Warnings

WMAP automatically checks for common problems and warns you:

Low ESS Warning

# Example warning:
# Warning message:
# Low effective sample size: 3.1% (threshold: 5%).

Extreme Weights Warning

# Example warning:
# Warning message:
# Extreme weights: max/mean ratio = 45.2 (threshold: 20).

Positivity Violations

# Example warning:
# Warning message:
# Sparse study-group cells: 2 cell(s) with < 5 observations.

# Example warning:
# Warning message:
# Near-zero propensity scores: 6.3% of subjects truncated.

Common Workflows

Standard Analysis Workflow

# 1. Load and check data
data(demo)
table(S, Z)  # Check all cells have observations
sum(is.na(cbind(S, Z, X, Y)))  # Check for missing values

# 2. Run analysis
result <- causal.estimate(
  S, Z, X, Y,
  B = 100,
  method = "FLEXOR",
  naturalGroupProp = naturalGroupProp
)

# 3. Check diagnostics
summary(result)
plot(result)

Method Comparison Workflow

# Compare all three methods
set.seed(123)

results_ic <- causal.estimate(S, Z, X, Y, B = 100, method = "IC", naturalGroupProp)
results_igo <- causal.estimate(S, Z, X, Y, B = 100, method = "IGO", naturalGroupProp)
results_flexor <- causal.estimate(S, Z, X, Y, B = 100, method = "FLEXOR", naturalGroupProp)

# Compare ESS
cat("IC ESS:", results_ic$percentESS, "%\n")
cat("IGO ESS:", results_igo$percentESS, "%\n")
cat("FLEXOR ESS:", results_flexor$percentESS, "%\n")

# Compare estimates
summary(results_ic)
summary(results_igo)
summary(results_flexor)

Sensitivity Analysis Workflow

# Test sensitivity to bootstrap sample size
result_b50 <- causal.estimate(S, Z, X, Y, B = 50, method = "IGO", naturalGroupProp)
result_b100 <- causal.estimate(S, Z, X, Y, B = 100, method = "IGO", naturalGroupProp)
result_b200 <- causal.estimate(S, Z, X, Y, B = 200, method = "IGO", naturalGroupProp)

# Compare CI widths
summary(result_b50)
summary(result_b100)
summary(result_b200)

Working with Weights Only

Extract balancing weights without running full causal analysis:

# Get weights
weights <- balancing.weights(
  S, Z, X,
  method = "IC",
  naturalGroupProp = naturalGroupProp
)

# Examine weight distribution
summary(weights)

# Access weight vector for custom analysis
head(weights$wt.v)

Getting Started with WMAP

Introduction

Installation