Main script
The file diabetes_and_stroke.R
provides the code to run the entire example
## An example use case of the Sima framework constructing a simulator for
## modelling interventions regarding salt use in a synthetic population with
## Finnish population characteristics in the year 2017. Also includes
## interventions that aim to reduce systolic blood pressure in the population.
# Uncomment to install packages
# install.packages(c("data.table", "ggplot2", "bestNormalize", "devtools"))
# devtools::install_github("santikka/Sima")
library(Sima)
library(data.table)
library(ggplot2)
rm(list = ls())
# The file_path should be set to the location of the R-scripts, such that
# the data-directory is present in this location
# file_path <- ""
# setwd(file_path)
# Verify that all data files are available
if (!file.exists("data/vaesto_3112_2017.csv")) {
stop("Data file vaesto_3112_2017.csv not found in data directory.")
}
if (!file.exists("data/kuol_007_201700.csv")) {
stop("Data file kuol_007_201700 not found in data directory.")
}
if (!file.exists("data/form04_3.txt")) {
stop("Data file form04_3.txt not found in data directory.")
}
#####################
# Utility Functions #
#####################
source("functions.R")
##########
# Events #
##########
source("events.R")
######################
# Initial Population #
######################
source("data.R")
##################
# Initialization #
##################
# Main simulator, set n to desired population size
sim <- Simulator$new(
initializer = initializer,
man_events = all_events,
seeds = 242,
keep_init = TRUE,
n = 3.6e6
)
########################
# Parallel computation #
########################
## Uncomment to enable a parallel cluster for remaining computations
# if (require(doParallel)) {
# nc <- parallel::detectCores()
# cl <- parallel::makeCluster(nc)
# doParallel::registerDoParallel(cl)
# ## User made functions/objects have to be manually exported, list them in 'export'
# sim$start_cluster(cl, nc, export = c("logit", "expit", "rbern", "rbern2"))
# }
###############
# Calibration #
###############
# Uncomment to perform calibration
# calib_result <- optim(par = sqrt(c(13.51056, 161.12780, 5.41859)),
# fn = mortality_objective,
# method = "Nelder-Mead",
# calib_sim = sim)
# Using a population of 1 million individuals, the following optimum is found:
# c(12.03600, 171.32261, 12.52546)
# Configure the simulator to use the optimal parameters
# Here, only 1st parameter is configured for stroke mortality
sim$reconfigure(list(
"Other Mortality" = list(shape = 12.03600, scale = 171.32261),
"Stroke Mortality" = list(initial = 12.52546)
))
# Visualize and compare simulated mortality to official statistics
sim$run(364)
sim$stop_cluster()
mortality <- read.csv(file = "data/kuol_007_201700.csv", header = TRUE, sep = ",")
mortality[,3] <- mortality[,3]/1000
n <- nrow(mortality)
mortality <- mortality[mortality$Age >= 31,]
mortality_sim <- mortality_df(sim$get_status(), min_age = 30)$out
out_df <- data.frame(
age = rep(mortality_sim$age_group, 2),
mortality = c(mortality_sim$deaths/mortality_sim$size, mortality[,3]),
group = gl(2, nrow(mortality), labels = c("Simulated", "Reported")))
# Plot comparing mortality in the synthetic population to the official statistics
g <- ggplot(out_df, aes(x = age, y = mortality, linetype = group)) +
geom_line(lwd = 0.5) +
theme_bw(base_size = 16) +
theme(
legend.position = c(0.190, 0.85),
legend.key.width = unit(1.75, "cm"),
legend.key.size = unit(1.5, "line"),
legend.text = element_text(size = 14),
plot.margin = unit(c(5.5, 12, 5.5, 5.5), "points"),
legend.background = element_rect(fill = "white", size = 2, linetype = "solid")
) +
scale_x_continuous(limits = c(30, 100), breaks = 3:10 * 10, expand = c(0,0)) +
scale_y_continuous(limits = c(0, 0.4), expand = c(0.025, 0)) +
scale_linetype_manual(name = NULL, values = c("longdash", "solid")) +
labs(x = "Age", y = "Mortality", title = NULL)
g
######################
# Selective sampling #
######################
# Create a study population corresponding to the target sample
sim_sample <- Simulator$new(
initializer = initializer,
man_events = all_events,
keep_init = TRUE,
seeds = 242,
n = 3.6e6
)
# Configure the simulator to use the optimal parameters
# Here, only 1st parameter is configured for stroke mortality
sim_sample$reconfigure(list(
"Other Mortality" = list(shape = 12.03600, scale = 171.32261),
"Stroke Mortality" = list(initial = 12.52546)
))
missingness <- function(dt) {
beta <- c(-9.484330, -0.311690, 0.113614, -0.090696, 0.036383, 0.689770)
X <- as.matrix(dt[ ,.(const = 1, sex, age, bmi, waist_circumference, smoking)])
rbern(expit(as.numeric(X %*% beta)))
}
pop <- data.table::copy(sim_sample$get_status())
m <- min(1e4, nrow(pop))
baseline_sample <- pop$id %in% sample(1:m, m, replace = FALSE)
baseline_nonresp <- missingness(pop[baseline_sample,])
baseline_stroke <- pop[baseline_sample,stroke]
## Uncomment to enable a parallel cluster for remaining computations
# if (require(doParallel)) {
# nc <- parallel::detectCores()
# cl <- parallel::makeCluster(nc)
# doParallel::registerDoParallel(cl)
# ## User made functions/objects have to be manually exported, list them in 'export'
# sim_sample$start_cluster(cl, nc, export = c("logit", "expit", "rbern", "rbern2"))
# }
sim_sample$run(10 * 365)
sim_sample$stop_cluster()
# Update stroke status after 10 years
pop_10y <- sim_sample$get_status()[order(id),]
pop[baseline_sample, stroke := pop_10y[baseline_sample,stroke]]
# Comparing two subsets: one with dropout (miss), one without (full)
# (and one with the entire population, if included, otherwise same as full)
included_miss <- baseline_sample & baseline_nonresp == 0 & baseline_stroke == 0
included_full <- baseline_sample & baseline_stroke == 0
# Separate models for men and women
model_miss_m <- glm(stroke ~ age + smoking + blood_pressure + hdl_cholesterol +
diabetes + parent_stroke,
data = pop[included_miss & sex == 0, ], family = binomial())
model_miss_f <- glm(stroke ~ age + smoking + blood_pressure + hdl_cholesterol +
diabetes + parent_stroke,
data = pop[included_miss & sex == 1, ], family = binomial())
model_full_m <- glm(stroke ~ age + smoking + blood_pressure + hdl_cholesterol +
diabetes + parent_stroke,
data = pop[included_full & sex == 0, ], family = binomial())
model_full_f <- glm(stroke ~ age + smoking + blood_pressure + hdl_cholesterol +
diabetes + parent_stroke,
data = pop[included_full & sex == 1, ], family = binomial())
# model_pop_m <- glm(stroke ~ age + smoking + blood_pressure + hdl_cholesterol +
# diabetes + parent_stroke,
# data = pop[included_pop & sex == 0, ], family = binomial())
# model_pop_f <- glm(stroke ~ age + smoking + blood_pressure + hdl_cholesterol +
# diabetes + parent_stroke,
# data = pop[included_pop & sex == 1, ], family = binomial())
summary(model_miss_m)
summary(model_miss_f)
summary(model_full_m)
summary(model_full_f)
# summary(model_pop_f)
# summary(model_pop_m)
round(exp(coef(model_miss_m)), 2)
round(exp(coef(model_miss_f)), 2)
round(exp(coef(model_full_m)), 2)
round(exp(coef(model_full_f)), 2)
# round(exp(coef(model_pop_m)), 2)
# round(exp(coef(model_pop_f)), 2)
round(exp(confint(model_miss_m)), 2)
round(exp(confint(model_miss_f)), 2)
round(exp(confint(model_full_m)), 2)
round(exp(confint(model_full_f)), 2)
# round(exp(confint(model_pop_m)), 2)
# round(exp(confint(model_pop_f)), 2)
#################
# Interventions #
#################
# Set the interventions for each scenario
# the events 'industry_salt' and 'doctor_advice' are defined in events.R
interventions <- list(
list(), # No intervention, baseline
list(industry_salt),
list(doctor_advice)
)
# Define the relevant parameters for each scenario
intervention_pars <- list(
list(),
list(effect = 0.97),
list(
adherence = 0.5,
effect = -2.0
)
)
# Apply each scenario and record the results
# -- N : simulation time for each scenario
# -- M : number of replications for each scenario
N <- 10 * 365
M <- 100
I <- length(interventions)
init_stroke <- sum(sim$get_status()$stroke)
new_stroke <- matrix(0, M, I)
for (m in 1:M) {
for (i in 1:I) {
sim$reset(seeds = 10 * m + i)
if (length(interventions[[i]])) {
interventions[[i]][[1]]$set_parameters(intervention_pars[[i]])
sim$intervene(man_events = interventions[[i]])
}
sim$run(N, show_progress = FALSE)
new_stroke[m,i] <- sum(sim$get_status()$stroke) - init_stroke
}
}