Create a synthetic population of young people attending primary mental health services
Categories:
This below section renders an R Markdown program. The following alternative options may provide improved viewing experience, more contextual information and access to more useful code formats:
- view the HTML in the dataset containing its outputs (includes contextual information) from that article; and;
- view the HTML along with the current release of its R Markdown code (useful if you plan to run the code) and
- view the HTML along with the current development version of its R Markdown code (useful if you wish to copy or edit the code)
Introduction
This program generates a purely synthetic (i.e. fake - no trace of any real records) population that is reasonably representative of the input data we used for the utility mapping study described in the article https://doi.org/10.1101/2021.07.07.21260129.
No access to the real data is required in order to use this program - it is based on summary statistics (e.g. means and standard deviations of variables, correlation matrices). It should be noted however, that a different (and simpler) workflow can be implemented when you do have access to the source dataset (for example, by using the syn
function from the synthpop
package).
The output of this program is very similar but not identical to a fake dataset created by an earlier version of this program and which is saved in the “ymh_clinical_dict_r3.RDS” file from the https://doi.org/10.7910/DVN/HJXYKQ data repository.
Install required R packages
If you do not have the following packages already installed, uncomment and run the following lines.
# install.packages("faux")
# devtools::install_github("ready4-dev/ready4)
# devtools::install_github("ready4-dev/youthvars)
# devtools::install_github("ready4-dev/scorz)
# devtools::install_github("ready4-dev/specific")
# devtools::install_github("ready4-dev/TTU")
# devtools::install_github("ready4-dev/youthu")
Load the ready4 framework package.
Specify parameters to generate outcome fake data
AQoL item response parameters
The first set of input data are the proportions for each allowed response for each of the twenty AQOL-6D questions at both baseline and followup.
aqol_items_prpns_tbs_ls <- list(bl_answer_props_tb = tibble::tribble(
~Question, ~Answer_1, ~Answer_2, ~Answer_3, ~Answer_4, ~Answer_5, ~Answer_6,
"Q1", 0.35, 0.38, 0.16, 0.03, NA_real_,100, # Check item 5 in real data.
"Q2", 0.28, 0.38, 0.18, 0.08, 0.04,100,
"Q3", 0.78, 0.18, 0.03, 0.01, 0.0, 100,
"Q4", 0.64, 0.23, 0.09, 0.0, 100, NA_real_,
"Q5", 0.3, 0.48, 0.12, 0.05, 100, NA_real_,
"Q6", 0.33, 0.48, 0.15, 100, NA_real_,NA_real_,
"Q7", 0.44, 0.27, 0.11, 100, NA_real_, NA_real_,
"Q8", 0.18, 0.29, 0.23, 0.21, 100, NA_real_,
"Q9", 0.07, 0.27, 0.19, 0.37, 100, NA_real_,
"Q10", 0.04, 0.15, 0.4, 0.25, 100, NA_real_,
"Q11", 0.03, 0.13, 0.52, 0.25, 100, NA_real_,
"Q12", 0.06, 0.21, 0.25, 0.34, 100, NA_real_,
"Q13", 0.05, 0.25, 0.31, 0.28, 100, NA_real_,
"Q14", 0.05, 0.3, 0.34, 0.25, 100, NA_real_,
"Q15", 0.57, 0.25, 0.12, 100, NA_real_,NA_real_,
"Q16", 0.48, 0.42, 0.06, 100, NA_real_, NA_real_,
"Q17", 0.44, 0.3, 0.16, 0.07, 100, NA_real_,
"Q18", 0.33, 0.38, 0.25, 0.04, 0.0, 100,
"Q19", 0.33, 0.49, 0.16, 0.02, 0.0, 100,
"Q20", 0.67, 0.21, 0.02, 100, NA_real_,NA_real_),
fup_answer_props_tb = tibble::tribble(
~Question, ~Answer_1, ~Answer_2, ~Answer_3, ~Answer_4, ~Answer_5, ~Answer_6,
"Q1", 0.51, 0.33, 0.12, 0.02, NA_real_, 100,
"Q2", 0.36, 0.38, 0.16, 0.06, 0.02,100,
"Q3", 0.81, 0.15, 0.04, 0.00, 0.0, 100,
"Q4", 0.73, 0.18, 0.09, 0.0, 100, NA_real_,
"Q5", 0.36, 0.42, 0.12, 0.05, 100, NA_real_,
"Q6", 0.48, 0.40, 0.11, 100, NA_real_,NA_real_,
"Q7", 0.57, 0.25, 0.09, 100, NA_real_, NA_real_,
"Q8", 0.31, 0.33, 0.17, 0.12, 100, NA_real_,
"Q9", 0.13, 0.35, 0.19, 0.23, 100, NA_real_,
"Q10", 0.1, 0.21, 0.43, 0.16, 100, NA_real_,
"Q11", 0.06, 0.25, 0.48, 0.18, 100, NA_real_,
"Q12", 0.08, 0.27, 0.26, 0.25, 100, NA_real_,
"Q13", 0.07, 0.37, 0.31, 0.19, 100, NA_real_,
"Q14", 0.08, 0.37, 0.34, 0.15, 100, NA_real_,
"Q15", 0.62, 0.23, 0.09, 100, NA_real_,NA_real_,
"Q16", 0.52, 0.40, 0.06, 100, NA_real_, NA_real_,
"Q17", 0.51, 0.28, 0.15, 0.06, 100, NA_real_,
"Q18", 0.37, 0.35, 0.25, 0.03, 0.0, 100,
"Q19", 0.43, 0.40, 0.16, 0.01, 0.0, 100,
"Q20", 0.77, 0.21, 0.02, 100, NA_real_,NA_real_)) %>%
youthvars::make_complete_prpns_tbs_ls()
Outcome variable correlation parameters
First we specify the names of variables we will be creating as outcome variables.
var_names_chr <- c("aqol6d_total_w","phq9_total","bads_total",
"gad7_total","oasis_total","scared_total","k6_total")
The next step is to specify the correlations between outcome variables (variables assumed to be ordered as in previous step) at baseline and follow-up timepoints.
cor_mat_ls <- list(matrix(c(1,-0.78,0.72,-0.67,-0.71,-0.65,-0.67,
NA,1,-0.73,0.69,0.66,0.63,0.71,
NA,NA,1,-.57,-0.64,-0.57,-0.65,
NA,NA,NA,1,0.74,0.70,0.63,
NA,NA,NA,NA,1,0.7,0.59,
NA,NA,NA,NA,NA,1,0.55,
NA,NA,NA,NA,NA,NA,1),7,7),
matrix(c(1,-0.81,0.72,-0.71,-0.73,-0.64,-0.68,
NA,1,-0.72,0.69,0.68,0.61,0.68,
NA,NA,1,-0.59,-0.61,-0.51,-0.61,
NA,NA,NA,1,0.75,0.71,0.6,
NA,NA,NA,NA,1,0.68,0.59,
NA,NA,NA,NA,NA,1,0.52,
NA,NA,NA,NA,NA,NA,1),7,7))
We now specify the univariate distribution parameters for each of the outcome variables.
synth_data_spine_ls <- list(cor_mat_ls = cor_mat_ls,
nbr_obs_dbl = c(1068,643),
timepoint_nms_chr = c("BL","FUP"),
means_ls = list(c(0.6,12.8,78.2, 10.4,8.1,34.2,12.2),
c(0.7,9.8,89.4, 7.9,6.3,28.8,9.8)),
sds_ls = list(c(0.2,6.6,24.8,5.7,4.7,17.9,5.8),
c(0.2,6.5,24.4,5.5,4.3,17.8,5.9)),
missing_ls = list(c(0,4,10,6,7,7,4),
c(0,5,2,2,1,2,2)),
min_max_ls = list(c(0.03,1),
c(0,27),
c(0,150),
c(0,21),
c(0,20),
c(0,82),
c(0,24)),
discrete_lgl = c(F,rep(T,6)),
var_names_chr = var_names_chr,
aqol_tots_var_nms_chr = c(cumulative = "aqol6d_total_c",
weighted = "aqol6d_total_w"))
Generate fake data
Create fake outcome variable datasets
We now use the parameters we have just specified to create baseline and follow-up datasets with fake data for our nominated outcome variables.
aqol_scores_pars_ls <- list(means_dbl = c(44.5,40.6),
sds_dbl = c(9.9,9.8),
corr_dbl = -0.95)
aqol6d_adol_pop_tbs_ls <- aqol_items_prpns_tbs_ls %>%
scorz::make_aqol6d_adol_pop_tbs_ls(aqol_scores_pars_ls = aqol_scores_pars_ls,
series_names_chr = c("bl_outcomes_tb",
"fup_outcomes_tb"),
synth_data_spine_ls = synth_data_spine_ls,
temporal_cors_ls = list(aqol6d_total_w = 0.85))
#> Joining with `by = join_by(id, match_var_chr)`
#> Joining with `by = join_by(id)`
#> Joining with `by = join_by(id, match_var_chr)`
#> Joining with `by = join_by(id)`
Create fake descriptive variables
We now specify the names and statistical parameters of the variables we will be using in descriptive statistics. For this analysis we are not interested in capturing the joint distribution between these variables, so we only use univariate parameters.
descriptives_BL_tb <- tibble::tibble(fkClientID = aqol6d_adol_pop_tbs_ls$bl_outcomes_tb$fkClientID,
round = c(1) ,
d_age = rnorm(1068,18.1,3.3) %>%
purrr::map_dbl(~min(.x,25) %>%
max(12)),
d_gender = c(rep(1,653),
rep(2,359),
rep(3,39),
rep(NA_real_,17)) %>%
specific::scramble_xx() %>%
factor(labels = c("Female","Male","Other")),
d_sexual_ori_s = c(rep(1,738),
rep(2,289),
rep(NA_real_,41)) %>%
specific::scramble_xx() %>%
factor(labels = c("Straight","Other")),
Region = c(rep(1,671),
rep(2,397)) %>%
specific::scramble_xx() %>%
factor(labels = c("Metro","Regional")),
CALD = c(rep(T,759),
rep(F,169),
rep(NA,140)) %>%
specific::scramble_xx(),
d_studying_working = c(rep(1,405),
rep(2,167),
rep(3,305),
rep(4,159),
rep(NA_real_,32)) %>%
specific::scramble_xx() %>%
factor(labels = c("Studying only",
"Working only",
"Studying and working",
"Not studying or working")),
c_p_diag_s = c(rep(1,182),
rep(2,264),
rep(3,332),
rep(4,237),
rep(NA_real_,53)) %>%
specific::scramble_xx() %>%
factor(labels = c("Depression", "Anxiety","Depression and Anxiety", "Other")),
c_clinical_staging_s = c(rep(1,625),
rep(2,326),
rep(3,86),
rep(NA_real_,31)) %>%
specific::scramble_xx() %>%
factor(labels = c("0-1a","1b","2-4")),
c_sofas = c(rnorm(1068-30,65.2,9.5),
rep(NA_real_,30)) %>%
purrr::map_dbl(~min(.x,100) %>%
max(0)) %>%
specific::scramble_xx(),
s_centre = NA_character_,
d_agegroup = NA_character_,
d_sex_birth_s = NA_character_,
d_country_bir_s = NA_character_,
d_ATSI = NA_character_,
d_english_home = NA,
d_english_native = NA,
d_relation_s = c(rep(1,325),
rep(2,426),
rep(3,286),
rep(NA_real_,31)) %>%
specific::scramble_xx() %>%
factor(labels = c("REPLACE_ME_1",
"REPLACE_ME_2",
"REPLACE_ME_3"))) %>%
dplyr::mutate(d_sex_birth_s = dplyr::case_when(is.na(d_gender) ~ NA_integer_,
as.integer(d_gender) %in%
c(1L,2L) &
runif(1068)>0.995 ~ as.integer(d_gender) %>%
purrr::map_int(~ ifelse(is.na(.x),
.x,
switch(.x,2L,1L,3L))),
as.integer(d_gender) == 3 ~ sample(c(1L,2L),
1068,
replace = T),
TRUE ~ as.integer(d_gender)
) %>%
factor(labels = c("Female","Male")))
descriptives_FUP_tb <- descriptives_BL_tb %>%
dplyr::filter(fkClientID %in%
aqol6d_adol_pop_tbs_ls$fup_outcomes_tb$fkClientID) %>%
dplyr::mutate(round = 2,
d_age = d_age + 0.25,
Region = Region %>%
specific::randomise_changes_in_fct_lvls(0.98),
d_studying_working = d_studying_working %>%
specific::randomise_changes_in_fct_lvls(0.9),
c_p_diag_s = c_p_diag_s %>%
specific::randomise_changes_in_fct_lvls(0.90),
c_clinical_staging_s = c_clinical_staging_s %>%
specific::randomise_changes_in_fct_lvls(0.8),
c_sofas = c_sofas + rnorm(643,4.7,10) %>%
purrr::map_dbl(~min(.x,100) %>% max(0)))
bl_tb <- dplyr::inner_join(descriptives_BL_tb,
aqol6d_adol_pop_tbs_ls$bl_outcomes_tb)
#> Joining with `by = join_by(fkClientID)`
fup_tb <- dplyr::inner_join(descriptives_FUP_tb,
aqol6d_adol_pop_tbs_ls$fup_outcomes_tb)
#> Joining with `by = join_by(fkClientID)`
We make some adjustments to ensure that the c_sofas
variable is correlated with our aqol6d_total_w
variable at both baseline and follow-up.
bl_tb <- bl_tb %>%
dplyr::mutate(c_sofas = faux::rnorm_pre(bl_tb$aqol6d_total_w %>%
as.vector(),
mu = 65.2,
sd = 9.5,
r = 0.5,
empirical = T) %>%
purrr::map_dbl(~min(.x,100) %>% max(0)))
fup_tb <- fup_tb %>%
dplyr::mutate(c_sofas = faux::rnorm_pre(fup_tb$aqol6d_total_w %>%
as.vector(),
mu = 69.9,
sd = 10,
r = 0.5,
empirical = T) %>%
purrr::map_dbl(~min(.x,100) %>% max(0)))
Combine datasets
We now add the fake outcome variables dataset to the fake descriptive variables dataset.
composite_tb <- dplyr::bind_rows(bl_tb, fup_tb) %>%
dplyr::mutate(d_age = floor(d_age)) %>%
dplyr::mutate(d_gender = d_gender %>% as.character() %>%
purrr::map_chr(~ifelse(.x=="Other",
sample(c("Genderqueer/gender nonconforming/agender",
"Transgender"),1),
.x)),
s_centre = Region %>% as.character() %>%
purrr::map_chr(~ifelse(.x=="Metro",
sample(c("Canberra","Southport","Knox"),1),
"Regional Centre")),
d_country_bir_s = CALD %>%
purrr::map_chr(~ifelse(.x,
"Other",
"Australia")),
d_ATSI = CALD %>%
purrr::map_chr(~ifelse(.x,
"Yes",
"No")),
d_english_home = CALD %>%
purrr::map_chr(~ifelse(.x,
"No",
"Yes")),
d_english_native = CALD %>%
purrr::map_chr(~ifelse(.x,
"No",
"Yes"))
) %>%
dplyr::select(-CALD) %>%
dplyr::select(-Region)
composite_tb <- composite_tb %>%
dplyr::select(-setdiff(names(composite_tb)[startsWith(names(composite_tb),
"aqol6d_")],
names(composite_tb)[startsWith(names(composite_tb),
"aqol6d_q")]))
composite_tb <- composite_tb %>%
dplyr::mutate(c_sofas = as.integer(round(c_sofas,0))) %>%
dplyr::mutate(round = factor(round, labels = c("Baseline",
"Follow-up"))) %>%
dplyr::mutate(d_relation_s = dplyr::case_when(d_relation_s %in%
c("REPLACE_ME_1","REPLACE_ME_2") ~
"Not in a relationship",
T ~ "In a relationship")) %>%
youthu::add_dates_from_dstr(bl_start_date_dtm = Sys.Date() - lubridate::days(600),##
bl_end_date_dtm = Sys.Date() - lubridate::days(420),
duration_args_ls = list(a = 60, b = 140, mean = 90, sd = 10),
duration_fn = truncnorm::rtruncnorm,
date_var_nm_1L_chr = "d_interview_date") %>%
dplyr::select(-duration_prd) %>%
youthvars::transform_raw_ds_for_analysis() %>%
dplyr::rename(phq9_total = PHQ9,
bads_total = BADS,
gad7_total = GAD7,
oasis_total = OASIS,
scared_total = SCARED,
k6_total = K6,
c_sofas = SOFAS) %>%
dplyr::select(-c("d_agegroup","Gender", "CALD", "Region"))