This R Markdown document illustrates example usage of the RESIDE package using the IST dataset.
Load the RESIDE package and set a seed for reproducibility and store the folder directory for export / import.
Select the variables of interest and summarise. Selection of variables is optional, if the variables are known. They may not be known until the marginal distributions have been received.
# Select variables of interest from the IST dataset.
IST_original <- IST |> dplyr::select(
AGE, # AGE at Randomisation
SEX, # SEX M/F
RATRIAL, # Atrial Fibrillation Y/N at Randomisation
# (not coded for 984 patients in the pilot phase)
RSBP, # Systolic Blood Pressure at Randomisation
STRK14 # Indicator of Any Stroke at 14 days
)
# Convert the character variables to factors (to allow for summary)
IST_original <- IST_original |> dplyr::mutate_if(is.character, factor)
# Produce a summary of the variables
summary(IST_original)
#> AGE SEX RATRIAL RSBP STRK14
#> Min. :16.00 F: 9028 : 984 Min. : 70.0 Min. :0.00000
#> 1st Qu.:65.00 M:10407 N:15282 1st Qu.:140.0 1st Qu.:0.00000
#> Median :73.00 Y: 3169 Median :160.0 Median :0.00000
#> Mean :71.72 Mean :160.2 Mean :0.04152
#> 3rd Qu.:80.00 3rd Qu.:180.0 3rd Qu.:0.00000
#> Max. :99.00 Max. :295.0 Max. :1.00000# Load survival and dplyr libraries
library(survival) # For Cox PH model
library(dplyr) # For data manipulation
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Stroke event is measured at 14 days, so set this for patients
IST_original$DAY <- 14
# Illustrate the 984 missing values
sum(IST_original$RATRIAL == "")
#> [1] 984
# Remove the missing values
IST_original <- IST_original[!IST_original$RATRIAL == "",]
# Drop the factor name for the missing values
IST_original$RATRIAL <- droplevels(IST_original$RATRIAL)
# Summarise the variable to show there are no longer missing values
summary(IST_original$RATRIAL)
#> N Y
#> 15282 3169
# Fit a Cox PH model
cox.ph <- coxph(Surv(DAY, STRK14) ~ AGE + SEX + RATRIAL + RSBP, data = IST_original)
# Output the summary of the Cox PH Model
cox.ph
#> Call:
#> coxph(formula = Surv(DAY, STRK14) ~ AGE + SEX + RATRIAL + RSBP,
#> data = IST_original)
#>
#> coef exp(coef) se(coef) z p
#> AGE 0.005237 1.005251 0.003387 1.546 0.1220
#> SEXM 0.077489 1.080570 0.074388 1.042 0.2976
#> RATRIALY 0.231692 1.260732 0.091818 2.523 0.0116
#> RSBP 0.000994 1.000995 0.001310 0.759 0.4479
#>
#> Likelihood ratio test=11.8 on 4 df, p=0.01893
#> n= 18451, number of events= 764Use the get_marginal_distributions() function to get the
marginal distributions, additionally selecting which variables using the
variables parameter.
Export the marginal distributions using the
export_marginal_distributions() function, using the
force parameter to override any existing files.
# Export the Marginal Distributions
export_marginal_distributions(marginals,
folder_path = folder_path,
force = TRUE)
#> Exporting categorical to: /tmp/RtmpM4aCTq/categorical_variables.csv
#> Exporting binary to: /tmp/RtmpM4aCTq/binary_variables.csv
#> Exporting continuous to: /tmp/RtmpM4aCTq/continuous_variables.csv
#> Exporting quantiles to: /tmp/RtmpM4aCTq/continuous_quantiles.csv
#> Exporting summary to: /tmp/RtmpM4aCTq/summary.csvImport the exported marginal distributions using the
import_marginal_distributions() function
Synthesise data from the imported marginals using the
synthesise_data function.
Summarise the simulated data
# Convert any Character variables to Factors
sim_df <- sim_df |> dplyr::mutate_if(is.character, factor)
# Summarise the synthesised data
summary(sim_df)
#> id SEX RATRIAL STRK14 AGE
#> Min. : 1 F: 9086 : 982 Min. :0.00000 Min. :17.00
#> 1st Qu.: 4860 M:10349 N:15266 1st Qu.:0.00000 1st Qu.:65.00
#> Median : 9718 Y: 3187 Median :0.00000 Median :73.00
#> Mean : 9718 Mean :0.04229 Mean :71.57
#> 3rd Qu.:14576 3rd Qu.:0.00000 3rd Qu.:80.00
#> Max. :19435 Max. :1.00000 Max. :98.00
#> RSBP
#> Min. : 79.0
#> 1st Qu.:141.0
#> Median :160.0
#> Mean :159.8
#> 3rd Qu.:176.0
#> Max. :290.0Fit the same cox model as earlier except this time on the simulated data.
# As before the events are measured at day 14
sim_df$DAY <- 14
# Show that the missing observations are in the data
sum(sim_df$RATRIAL == "")
#> [1] 982
# Remove the missing observations
sim_df <- sim_df[!sim_df$RATRIAL == "",]
# Remove the missing factor name
sim_df$RATRIAL <- droplevels(sim_df$RATRIAL)
# Show that there are no missing observations
summary(sim_df$RATRIAL)
#> N Y
#> 15266 3187
# Fit the model on the synthesised data
cox.ph.sim <- coxph(Surv(DAY, STRK14) ~ AGE + SEX + RATRIAL + RSBP, data = sim_df)
# Show a summary of the model
cox.ph.sim
#> Call:
#> coxph(formula = Surv(DAY, STRK14) ~ AGE + SEX + RATRIAL + RSBP,
#> data = sim_df)
#>
#> coef exp(coef) se(coef) z p
#> AGE 0.006539 1.006561 0.003154 2.073 0.0381
#> SEXM -0.128197 0.879680 0.072031 -1.780 0.0751
#> RATRIALY -0.001382 0.998619 0.095327 -0.014 0.9884
#> RSBP -0.002892 0.997113 0.001323 -2.186 0.0288
#>
#> Likelihood ratio test=12.41 on 4 df, p=0.01452
#> n= 18453, number of events= 771Synthesise data from the imported marginals with correlations, by
exporting an empty correlation matrix, then reimporting it and using the
correlation_matrix parameter to specify the correlation
matrix. Note the matrix was edited in R for this example but users would
be expected to edit the correlation matrix in a program such as
excel.
# Export an empty correlation matrix
export_empty_cor_matrix(
imported_marginals,
folder_path = folder_path
)
# Reimport the matrix
cor_matrix <- import_cor_matrix(file.path(folder_path, "correlation_matrix.csv"))
# Add assumed correlations (note the symmetry)
cor_matrix["RATRIAL_N", "STRK14"] <- -0.02
cor_matrix["STRK14", "RATRIAL_N"] <- -0.02
# Synthesise data specifying the correlation matrix
sim_df_cor <- synthesise_data(imported_marginals, correlation_matrix = cor_matrix)Summarise the synthesised data (with correlations)
# Convert to Factors from Character variables
sim_df_cor <- sim_df_cor |> dplyr::mutate_if(is.character, factor)
# Summarise the synthesised dataset
summary(sim_df_cor)
#> id SEX RATRIAL STRK14 AGE
#> Min. : 1 F: 4130 : 151 Min. :0.00000 Min. :16.0
#> 1st Qu.: 4860 M:15305 N:18620 1st Qu.:0.00000 1st Qu.:65.0
#> Median : 9718 Y: 664 Median :0.00000 Median :73.0
#> Mean : 9718 Mean :0.03828 Mean :71.6
#> 3rd Qu.:14576 3rd Qu.:0.00000 3rd Qu.:80.0
#> Max. :19435 Max. :1.00000 Max. :99.0
#> RSBP
#> Min. : 77
#> 1st Qu.:141
#> Median :160
#> Mean :160
#> 3rd Qu.:176
#> Max. :291Using the synthesised data (with correlations) fit a Cox PH model, with the same parameters as earlier.
# Again events are measured at 14 days
sim_df_cor$DAY <- 14
# Again check that the missing values where added
sum(sim_df_cor$RATRIAL == "")
#> [1] 151
# Again remove the missing values
sim_df_cor <- sim_df_cor[!sim_df_cor$RATRIAL == "",]
# Again drop the missing factor
sim_df_cor$RATRIAL <- droplevels(sim_df_cor$RATRIAL)
# Show there are no missing values
summary(sim_df_cor$RATRIAL)
#> N Y
#> 18620 664
# Fit the model on the synthesised data (with correlations)
cox.ph.sim.cor <- coxph(Surv(DAY, STRK14) ~ AGE + SEX + RATRIAL + RSBP, data = sim_df_cor)
# Show a summary of the model
cox.ph.sim.cor
#> Call:
#> coxph(formula = Surv(DAY, STRK14) ~ AGE + SEX + RATRIAL + RSBP,
#> data = sim_df_cor)
#>
#> coef exp(coef) se(coef) z p
#> AGE 0.003287 1.003292 0.003197 1.028 0.304
#> SEXM 0.026197 1.026543 0.090866 0.288 0.773
#> RATRIALY 0.182798 1.200571 0.186449 0.980 0.327
#> RSBP 0.000207 1.000207 0.001336 0.155 0.877
#>
#> Likelihood ratio test=2.06 on 4 df, p=0.7241
#> n= 19284, number of events= 735