1 |
#' Quick ggplot2 barchart visualization of partially observed/missing variables |
|
2 |
#' |
|
3 |
#' @description |
|
4 |
#' This function takes a dataframe and outputs a nicely formatted |
|
5 |
#' ggplot2 vertical barchart plot that visualizes the proportion |
|
6 |
#' missing for a given variable (vector) or all existent missing |
|
7 |
#' variables. Results can also be stratified by another variable |
|
8 |
#' in which case the proportion missing refers to the amount of |
|
9 |
#' patients in the respective stratum. |
|
10 |
#' |
|
11 |
#' Important: Function assumes the data is in a one-row-per-patient format. |
|
12 |
#' |
|
13 |
#' @param data dataframe or tibble object with partially observed/missing variables. Assumes a a one-row-per-patient format |
|
14 |
#' @param covar character covariate or covariate vector with partially observed variable/column name(s) to investigate. If NULL, the function automatically includes all columns with at least one missing observation |
|
15 |
#' @param strata character name of variable/column by which results should be stratified |
|
16 |
#' |
|
17 |
#' @return returns ggplot2 graph displaying selected or automatically identified variables by percent missing |
|
18 |
#' |
|
19 |
#' @importFrom magrittr '%>%' |
|
20 |
#' @importFrom forcats fct_reorder |
|
21 |
#' @importFrom ggplot2 aes |
|
22 |
#' @importFrom ggplot2 coord_flip |
|
23 |
#' @importFrom ggplot2 facet_wrap |
|
24 |
#' @importFrom ggplot2 geom_bar |
|
25 |
#' @importFrom ggplot2 geom_text |
|
26 |
#' @importFrom ggplot2 ggplot |
|
27 |
#' @importFrom ggplot2 labs |
|
28 |
#' @importFrom ggplot2 scale_y_continuous |
|
29 |
#' @importFrom ggplot2 theme_bw |
|
30 |
#' @importFrom glue glue |
|
31 |
#' |
|
32 |
#' @export |
|
33 |
#' |
|
34 |
#' @examples |
|
35 |
#' library(smdi) |
|
36 |
#' |
|
37 |
#' smdi_vis(data = smdi_data) |
|
38 |
#' |
|
39 | ||
40 |
smdi_vis <- function(data = NULL, |
|
41 |
covar = NULL, |
|
42 |
strata = NULL |
|
43 |
){ |
|
44 | ||
45 |
# initializing new variables |
|
46 |
# tip: https://www.r-bloggers.com/2019/08/no-visible-binding-for-global-variable/ |
|
47 | 3x |
n_miss <- covariate <- prop_miss <- prop_miss_label <- .data <- NULL |
48 | ||
49 |
# check if data is provided |
|
50 | 1x |
if(is.null(data)){stop("No dataframe provided.")} |
51 | ||
52 |
# run smdi_summary to run check on covariates and |
|
53 |
# compute the # and % missingness for <covar> and potentially by <strata> |
|
54 | 2x |
data_summary <- smdi_summarize( |
55 | 2x |
data = data, |
56 | 2x |
covar = covar, |
57 | 2x |
strata = strata |
58 |
) |
|
59 | ||
60 |
# plot missingness |
|
61 | 2x |
perc_max <- max(data_summary$prop_miss) |
62 | ||
63 | 2x |
plot_summary <- data_summary %>% |
64 | 2x |
dplyr::mutate(covariate = forcats::fct_reorder(covariate, prop_miss)) %>% |
65 | 2x |
ggplot2::ggplot(ggplot2::aes(x = covariate, y = prop_miss)) + |
66 | 2x |
ggplot2::geom_bar(stat="identity", fill = "firebrick4", color = "black") + |
67 | 2x |
ggplot2::coord_flip() + |
68 | 2x |
ggplot2::scale_y_continuous(limits = c(0, perc_max + (perc_max/10))) + |
69 | 2x |
ggplot2::geom_text(ggplot2::aes(label= prop_miss_label), nudge_y = perc_max/15) + |
70 | 2x |
ggplot2::labs( |
71 | 2x |
x = "", |
72 | 2x |
y = "% missing" |
73 |
) + |
|
74 | 2x |
ggplot2::theme_bw() |
75 | ||
76 |
# if <strata> is specified plot faceted and add notes |
|
77 | 2x |
if(!is.null(strata)){ |
78 | ||
79 | 1x |
plot_summary <- plot_summary + |
80 | 1x |
ggplot2::facet_wrap({{strata}}) + |
81 | 1x |
ggplot2::labs( |
82 | 1x |
subtitle = glue::glue("Results stratified by {strata}"), |
83 | 1x |
caption = glue::glue("% refer to the number of observations in each stratum of {strata}.") |
84 |
) |
|
85 | ||
86 |
} |
|
87 | ||
88 | 2x |
return(plot_summary) |
89 | ||
90 |
} |
|
91 | ||
92 |
# Other visualizations as re-exports for missing data patterns #1: upsetR plot |
|
93 |
#' @export |
|
94 |
#' @importFrom naniar gg_miss_upset |
|
95 |
naniar::gg_miss_upset |
|
96 | ||
97 |
# Other visualizations as re-exports for missing data patterns #1: mice md.pattern matrix |
|
98 |
#' @export |
|
99 |
#' @importFrom mice md.pattern |
|
100 |
mice::md.pattern |
1 |
#' Computes three group missing data summary diagnostics |
|
2 |
#' |
|
3 |
#' @description |
|
4 |
#' This function bundles and calls all three group diagnostics and returns the most important summary metrics. |
|
5 |
#' For more information and details, please refer to the individual functions. |
|
6 |
#' |
|
7 |
#' Important: don't include variables like ID variables, ZIP codes, dates, etc. |
|
8 |
#' |
|
9 |
#' @details |
|
10 |
#' Wrapper for individual diagnostics function. |
|
11 |
#' |
|
12 |
#' @seealso |
|
13 |
#' \code{\link{smdi_asmd}} |
|
14 |
#' \code{\link{smdi_hotelling}} |
|
15 |
#' \code{\link{smdi_little}} |
|
16 |
#' \code{\link{smdi_rf}} |
|
17 |
#' \code{\link{smdi_outcome}} |
|
18 |
#' |
|
19 |
#' @references |
|
20 |
#' TBD |
|
21 |
#' |
|
22 |
#' @param data dataframe or tibble object with partially observed/missing variables |
|
23 |
#' @param covar character covariate or covariate vector with partially observed variable/column name(s) to investigate. If NULL, the function automatically includes all columns with at least one missing observation and all remaining covariates will be used as predictors |
|
24 |
#' @param median logical if the median (= TRUE; recommended default) or mean of all absolute standardized mean differences (asmd) should be computed (smdi_asmd()) |
|
25 |
#' @param includeNA logical, should missingness of other partially observed covariates be explicitly modeled for computation of absolute standardized mean differences (default is FALSE) |
|
26 |
#' @param ntree integer, number of trees for random forest missingness prediction model (defaults to 1000 trees) |
|
27 |
#' @param train_test_ratio numeric vector to indicate the test/train split ratio for random forest missingness prediction model, e.g. c(.7, .3) is the default |
|
28 |
#' @param set_seed seed for reproducibility of random forest missingness prediction model, defaults to 42 |
|
29 |
#' @param n_cores integer, if >1, computations will be parallelized across amount of cores specified in n_cores (only UNIX systems) |
|
30 |
#' @param model character describing which outcome model to fit to assess the association between covar missingness indicator and outcome. Currently supported are models of type logistic, linear and cox (see smdi_outcome) |
|
31 |
#' @param form_lhs string specifying the left-hand side of the outcome formula (see smdi_outcome) |
|
32 |
#' @param exponentiated logical, should results of outcome regression to assess association between missingness and outcome be exponentiated (default is FALSE) |
|
33 |
#' |
|
34 |
#' @return smdi object including a summary table of all three smdi group diagnostics: |
|
35 |
#' |
|
36 |
#' **Group 1 diagnostic:** |
|
37 |
#' |
|
38 |
#' - \code{asmd_mean} or \code{asmd_median}: average/median absolute standardized mean difference (and min, max) of patient characteristics between those without (1) and with (0) observed covariate |
|
39 |
#' |
|
40 |
#' - hotteling_p: p-value of hotelling test. Rejecting the H0 means that Hotelling's test detects a significant difference in the distribution between patients without (1) and with (0) the observed covariate |
|
41 |
#' |
|
42 |
#' **Group 2 diagnostic:** |
|
43 |
#' |
|
44 |
#' - \code{rf_auc}: The area under the receiver operating curve (AUC) as a measure of the ability to predict the missingness of the partially observed covariate |
|
45 |
#' |
|
46 |
#' |
|
47 |
#' **Group 3 diagnostic:** |
|
48 |
#' |
|
49 |
#' - \code{estimate_univariate}: univariate association between missingness indicator of covar and outcome |
|
50 |
#' |
|
51 |
#' - \code{estimate_adjusted}: association between missingness indicator of covar and outcome conditional on other fully observed covariates and missing indicator variables of other partially observed covariates |
|
52 |
#' |
|
53 |
#' @importFrom dplyr filter |
|
54 |
#' @importFrom dplyr left_join |
|
55 |
#' |
|
56 |
#' @export |
|
57 |
#' |
|
58 |
#' @examples |
|
59 |
#' library(smdi) |
|
60 |
#' |
|
61 |
#' smdi_diagnose( |
|
62 |
#' data = smdi_data, |
|
63 |
#' covar = "egfr_cat", |
|
64 |
#' model = "cox", |
|
65 |
#' form_lhs = "Surv(eventtime, status)" |
|
66 |
#' ) |
|
67 |
#' |
|
68 |
smdi_diagnose <- function(data = NULL, |
|
69 |
covar = NULL, |
|
70 | ||
71 |
median = TRUE, |
|
72 |
includeNA = FALSE, |
|
73 | ||
74 |
train_test_ratio = c(.7, .3), |
|
75 |
set_seed = 42, |
|
76 |
ntree = 1000, |
|
77 |
n_cores = 1, |
|
78 | ||
79 |
model = c("logistic", "linear", "cox"), |
|
80 |
form_lhs = NULL, |
|
81 |
exponentiated = FALSE |
|
82 |
){ |
|
83 | ||
84 |
# initialize |
|
85 |
#covariate <- `1 vs 2` <- term <- estimate <- conf.low <- conf.high <- NULL |
|
86 | ||
87 |
# check for missing covariates |
|
88 | 7x |
covar_miss <- smdi::smdi_check_covar( |
89 | 7x |
data = data, |
90 | 7x |
covar = covar |
91 |
) |
|
92 | ||
93 |
# asmd -------------------------------------------------------------------- |
|
94 | 7x |
asmd_out <- smdi::smdi_asmd( |
95 | 7x |
data = data, |
96 | 7x |
covar = covar_miss, |
97 | 7x |
median = median, |
98 | 7x |
includeNA = includeNA |
99 |
) |
|
100 | ||
101 | 7x |
tbl_asmd <- summary(asmd_out) |
102 | 7x |
tbl_asmd[[paste0(colnames(tbl_asmd)[[2]], "_min_max")]] <- paste0(tbl_asmd[[2]], " (", tbl_asmd[[3]], ", ", tbl_asmd[[4]], ")") |
103 | 7x |
tbl_asmd <- tbl_asmd[, c(1,5)] |
104 | ||
105 | ||
106 |
# hotelling --------------------------------------------------------------- |
|
107 | 7x |
hotelling_out <- smdi::smdi_hotelling( |
108 | 7x |
data = data, |
109 | 7x |
covar = covar_miss |
110 |
) |
|
111 | ||
112 | 7x |
tbl_hotelling <- summary(hotelling_out) |
113 | ||
114 | ||
115 |
# little ------------------------------------------------------------------ |
|
116 | 7x |
little_out <- smdi::smdi_little( |
117 | 7x |
data = data |
118 |
) |
|
119 | ||
120 | 7x |
tbl_little <- glue::glue("p_little: {ifelse(little_out$p.value < 0.001, '<.001', formatC(little_out$p.value, format = 'f', digits = 3))}") |
121 | ||
122 |
# random forest ----------------------------------------------------------- |
|
123 | 7x |
rf_out <- smdi::smdi_rf( |
124 | 7x |
data = data, |
125 | 7x |
covar = covar_miss, |
126 | 7x |
train_test_ratio = train_test_ratio, |
127 | 7x |
set_seed = set_seed, |
128 | 7x |
ntree = ntree |
129 |
) |
|
130 | ||
131 | 7x |
tbl_rf <- summary(rf_out) |
132 | ||
133 | ||
134 |
# outcome regression ------------------------------------------------------ |
|
135 | 7x |
tbl_outcome <- smdi::smdi_outcome( |
136 | 7x |
data = data, |
137 | 7x |
model = model, |
138 | 7x |
form_lhs = form_lhs, |
139 | 7x |
exponentiated = exponentiated |
140 |
) |
|
141 | ||
142 |
# combine ----------------------------------------------------------------- |
|
143 | 7x |
smdi_tbl_out <- tbl_asmd %>% |
144 | 7x |
dplyr::left_join(tbl_hotelling, by = "covariate") %>% |
145 | 7x |
dplyr::left_join(tbl_rf, by = "covariate") %>% |
146 | 7x |
dplyr::left_join(tbl_outcome, by = "covariate") |
147 | ||
148 | 7x |
smdi_out <- list( |
149 | 7x |
smdi_tbl = smdi_tbl_out, |
150 | 7x |
p_little = tbl_little |
151 |
) |
|
152 | ||
153 | 7x |
class(smdi_out) <- "smdi" |
154 | ||
155 | 7x |
return(smdi_out) |
156 | ||
157 |
} |
|
158 | ||
159 |
# generic print ----------------------------------------------------------- |
|
160 | ||
161 |
#' @export |
|
162 |
print.smdi <- function(x, ...){ |
|
163 | ||
164 | 7x |
cat("smdi summary table:") |
165 | 7x |
cat("\n") |
166 | 7x |
print(x$smdi_tbl) |
167 | 7x |
cat("\n") |
168 | 7x |
cat(x$p_little) |
169 | ||
170 |
} |
|
171 | ||
172 |
# generic summary --------------------------------------------------------- |
|
173 | ||
174 |
#' @export |
|
175 |
summary.smdi <- function(object, ...){ |
|
176 | ||
177 | 7x |
tbl <- (object$smdi_tbl) |
178 | ||
179 | 7x |
return(tbl) |
180 | ||
181 |
} |
1 |
#' Computes hotelling's multivariate t-test |
|
2 |
#' |
|
3 |
#' @description |
|
4 |
#' Hotelling's multivariate t-test, which examines variable |
|
5 |
#' differences conditional on having an observed covariate value or not. |
|
6 |
#' As the power of statistical hypothesis tests can be influenced by |
|
7 |
#' sample size, the combined investigation along with \code{\link{smdi_asmd}} is highly recommended. |
|
8 |
#' |
|
9 |
#' Important: don't include variables like ID variables, ZIP codes, dates, etc. |
|
10 |
#' |
|
11 |
#' @details |
|
12 |
#' CAVE: Hotelling's and Little's show high susceptibility with large sample sizes and |
|
13 |
#' it is recommended to always interpret the results along with the other diagnostics. |
|
14 |
#' |
|
15 |
#' @seealso |
|
16 |
#' \code{\link[Hotelling]{hotelling.test}} |
|
17 |
#' |
|
18 |
#' @references |
|
19 |
#' Hotelling H. The Generalization of Student’s Ratio. Ann Math Stat. 1931;2(3):360-378. |
|
20 |
#' |
|
21 |
#' |
|
22 |
#' @param data dataframe or tibble object with partially observed/missing variables |
|
23 |
#' @param covar character covariate or covariate vector with partially observed variable/column name(s) to investigate. If NULL, the function automatically includes all columns with at least one missing observation and all remaining covariates will be used as predictors |
|
24 |
#' @param n_cores integer, if >1, computations will be parallelized across amount of cores specified in n_cores (only UNIX systems) |
|
25 |
#' |
|
26 |
#' @return returns a hotelling object with statistics on hotellings test by covariate. That is, for each covar, the following outputs are provided: |
|
27 |
#' |
|
28 |
#' - stats: hotelling test statistics (for more information see \code{\link[Hotelling]{hotelling.test}}) |
|
29 |
#' |
|
30 |
#' - pval: p-value of hotelling test |
|
31 |
#' |
|
32 |
#' @importFrom magrittr '%>%' |
|
33 |
#' @importFrom dplyr all_of |
|
34 |
#' @importFrom dplyr arrange |
|
35 |
#' @importFrom dplyr filter |
|
36 |
#' @importFrom dplyr mutate |
|
37 |
#' @importFrom dplyr select |
|
38 |
#' @importFrom fastDummies dummy_cols |
|
39 |
#' @importFrom Hotelling hotelling.test |
|
40 |
#' @importFrom parallel detectCores |
|
41 |
#' @importFrom parallel mclapply |
|
42 |
#' @importFrom tibble rownames_to_column |
|
43 | ||
44 |
#' |
|
45 |
#' @export |
|
46 |
#' |
|
47 |
#' @examples |
|
48 |
#' library(smdi) |
|
49 |
#' |
|
50 |
#' smdi_hotelling(data = smdi_data) |
|
51 |
#' |
|
52 | ||
53 |
smdi_hotelling <- function(data = NULL, |
|
54 |
covar = NULL, |
|
55 |
n_cores = 1 |
|
56 |
){ |
|
57 | ||
58 | ||
59 |
# initialize variables |
|
60 | 56x |
.data <- V1 <- NULL |
61 | ||
62 |
# pre-checks |
|
63 | 7x |
if(is.null(data)){stop("No dataframe provided.")} |
64 | ||
65 |
# n_cores on windows |
|
66 | 49x |
if(Sys.info()[["sysname"]]=="Windows"){ |
67 | ! |
message("Windows does not support parallelization based on forking. <n_cores> will be automatically set to 1.") |
68 | ! |
n_cores = 1 |
69 |
} |
|
70 | ||
71 |
# more cores than available |
|
72 | 49x |
if(n_cores > parallel::detectCores()){ |
73 | ! |
message("You specified more <n_cores> than you have available. The function will use all cores available to it.") |
74 |
} |
|
75 | ||
76 |
# check for missing covariate of interest |
|
77 | 49x |
covar_miss <- smdi::smdi_check_covar( |
78 | 49x |
data = data, |
79 | 49x |
covar = covar |
80 |
) |
|
81 | ||
82 |
# apply smdi_na_indicator for datqset to create missing |
|
83 |
# indicator variables; |
|
84 |
# needs to be done for all variables with at least one NA |
|
85 | 49x |
data_encoded <- smdi::smdi_na_indicator( |
86 | 49x |
data = data, |
87 | 49x |
covar = smdi::smdi_check_covar(data = data), |
88 | 49x |
drop_NA_col = TRUE |
89 |
) |
|
90 | ||
91 |
# for hotelling we need to one-hot-encode categorical variables |
|
92 |
# applicable to multi-categorical variables if exist |
|
93 | 49x |
if(any(sapply(data_encoded, function(.x) is.factor(.x) || is.character(.x)))){ |
94 | ||
95 | ! |
data_encoded <- data_encoded %>% |
96 | ! |
fastDummies::dummy_columns( |
97 | ! |
remove_most_frequent_dummy = TRUE, |
98 |
# ignore_na = FALSE will make a dummy column for value_NA and |
|
99 |
# give a 1 in any row which has a NA value. |
|
100 | ! |
ignore_na = FALSE, |
101 | ! |
remove_selected_columns = TRUE |
102 |
) |
|
103 |
} |
|
104 | ||
105 |
# expect no column with missing values (all columns should be complete and NA's should be dummy variables after pre-processing) |
|
106 | 49x |
if(any(sapply(data_encoded, function(.x) is.na(.x)))){ |
107 | ||
108 | ! |
stop("After pre-processing the data, there are columns with NA. Please write an issue if you see this error.") |
109 | ||
110 |
} |
|
111 | ||
112 |
# start applying smd computation over all partially observed covariates |
|
113 | 49x |
hotelling_loop <- function(i){ |
114 | ||
115 |
# create strata variable |
|
116 | 79x |
strata_var <- paste0(i, "_NA") |
117 | ||
118 |
# create matrices |
|
119 | 79x |
hotelling_matrix_missing <- data_encoded %>% |
120 | 79x |
dplyr::filter(.data[[strata_var]] == 1) %>% |
121 | 79x |
dplyr::select(-dplyr::all_of(strata_var)) %>% |
122 | 79x |
as.matrix() |
123 | ||
124 | 79x |
hotelling_matrix_complete <- data_encoded %>% |
125 | 79x |
dplyr::filter(.data[[strata_var]] == 0) %>% |
126 | 79x |
dplyr::select(-dplyr::all_of(strata_var)) %>% |
127 | 79x |
as.matrix() |
128 | ||
129 | 79x |
hotelling <- Hotelling::hotelling.test(hotelling_matrix_missing, hotelling_matrix_complete) |
130 | ||
131 | 79x |
return(hotelling) |
132 | ||
133 |
} |
|
134 | ||
135 | 49x |
hotelling_out <- parallel::mclapply(covar_miss, FUN = hotelling_loop, mc.cores = n_cores) |
136 | 47x |
names(hotelling_out) <- covar_miss |
137 | ||
138 | 47x |
class(hotelling_out) <- "hotelling" |
139 | ||
140 | 47x |
return(hotelling_out) |
141 | ||
142 |
} |
|
143 | ||
144 | ||
145 | ||
146 |
# generic print ----------------------------------------------------------- |
|
147 | ||
148 |
#' @export |
|
149 |
print.hotelling <- function(x, ...){ |
|
150 | ||
151 |
# initialize |
|
152 | 10x |
V1 <- NULL |
153 | ||
154 | 10x |
tbl <- do.call(rbind, lapply(x,'[[',2)) %>% |
155 | 10x |
as.data.frame() %>% |
156 | 10x |
tibble::rownames_to_column(var = "covariate") %>% |
157 | 10x |
dplyr::mutate(hotteling_p = ifelse(V1 < 0.001, "<.001", formatC(V1, format = "f", digits = 3))) %>% |
158 | 10x |
dplyr::select(-V1) |
159 | ||
160 | 10x |
return(print(tbl)) |
161 | ||
162 |
} |
|
163 | ||
164 | ||
165 |
# generic summary --------------------------------------------------------- |
|
166 | ||
167 |
#' @export |
|
168 |
summary.hotelling <- function(object, ...){ |
|
169 | ||
170 |
# initialize |
|
171 | 17x |
V1 <- NULL |
172 | ||
173 | 17x |
tbl <- do.call(rbind, lapply(object,'[[',2)) %>% |
174 | 17x |
as.data.frame() %>% |
175 | 17x |
tibble::rownames_to_column(var = "covariate") %>% |
176 | 17x |
dplyr::mutate(hotteling_p = ifelse(V1 < 0.001, "<.001", formatC(V1, format = "f", digits = 3))) %>% |
177 | 17x |
dplyr::select(-V1) |
178 | ||
179 | 17x |
return(tbl) |
180 | ||
181 |
} |
|
182 | ||
183 |
1 |
#' Computes random forest-based AUC |
|
2 |
#' |
|
3 |
#' @description |
|
4 |
#' The function trains and fits a random forest model to assess the ability to predict missingness for |
|
5 |
#' the specified covariate(s). If missing indicator can be predicted as a function of observed covariates, |
|
6 |
#' MAR may be a likely scenario and would imply that imputation may be feasible. |
|
7 |
#' |
|
8 |
#' Important: don't include variables like ID variables, ZIP codes, dates, etc. |
|
9 |
#' |
|
10 |
#' @details |
|
11 |
#' The random forest utilizes the \link[randomForest]{randomForest} engine. |
|
12 |
#' |
|
13 |
#' CAVE: If the missingness indicator variables of other partially observed covariates (indicated by suffix _NA) have an extremely high variable importance (combined with an unusually high AUC), |
|
14 |
#' this might be an indicator of a monotone missing data pattern. In this case it is advisable to exclude other partially observed covariates and run missingness diagnostics separately. |
|
15 |
#' |
|
16 |
#' @seealso |
|
17 |
#' \code{\link[randomForest]{randomForest}} |
|
18 |
#' |
|
19 |
#' @references |
|
20 |
#' Sondhi A, Weberpals J, Yerram P, Jiang C, Taylor M, Samant M, Cherng S. A systematic approach towards missing lab data in electronic health records: A case study in non-small cell lung cancer and multiple myeloma. CPT Pharmacometrics Syst Pharmacol. 2023 Jun 15. <doi: 10.1002/psp4.12998.> Epub ahead of print. PMID: 37322818. |
|
21 |
#' |
|
22 |
#' @param data dataframe or tibble object with partially observed/missing variables |
|
23 |
#' @param covar character covariate or covariate vector with partially observed variable/column name(s) to investigate. If NULL, the function automatically includes all columns with at least one missing observation and all remaining covariates will be used as predictors |
|
24 |
#' @param ntree integer, number of trees (defaults to 1000 trees) |
|
25 |
#' @param train_test_ratio numeric vector to indicate the test/train split ratio, e.g. c(.7, .3) which is the default |
|
26 |
#' @param tune logical,if TRUE, a 5-fold cross validation is performed combined with a random search for the optimal number of optimal number of variables randomly sampled as candidates at each split (mtry). FALSE is the default due to potentially extensive computation times. |
|
27 |
#' @param set_seed seed for reproducibility, defaults to 42 |
|
28 |
#' @param n_cores integer, if >1, computations will be parallelized across amount of cores specified in n_cores (only UNIX systems) |
|
29 |
#' |
|
30 |
#' @return returns an rf object which comes as a list that contains the ROC AUC value and corresponding variable importance in training dataset (latter as ggplot object). That is, for each covar, the following outputs are provided: |
|
31 |
#' |
|
32 |
#' - rf_table: The area under the receiver operating curve (AUC) as a measure of the ability to predict the missingness of the partially observed covariate |
|
33 |
#' |
|
34 |
#' - rf_plot: ggplot object illustrating the variable importance for the prediction made expressed by the mean decrease in accuracy per predictor. |
|
35 |
#' That is how much would the accuracy of the prediction (# of correct predictions/Total # of predictions made) decrease, had we left out this specific predictor. |
|
36 |
#' |
|
37 |
#' - OOB: estimated OOB error for each investigated partially observed confounder (indicates the performance of the random forest model for data points that are not used in training a tree.) |
|
38 |
#' |
|
39 |
#' @importFrom caret trainControl train |
|
40 |
#' @importFrom dplyr mutate |
|
41 |
#' @importFrom dplyr select |
|
42 |
#' @importFrom dplyr all_of |
|
43 |
#' @importFrom forcats fct_reorder |
|
44 |
#' @importFrom ggplot2 aes |
|
45 |
#' @importFrom ggplot2 coord_flip |
|
46 |
#' @importFrom ggplot2 geom_point |
|
47 |
#' @importFrom ggplot2 ggplot |
|
48 |
#' @importFrom ggplot2 labs |
|
49 |
#' @importFrom ggplot2 scale_color_identity |
|
50 |
#' @importFrom ggplot2 theme_bw |
|
51 |
#' @importFrom glue glue |
|
52 |
#' @importFrom naniar mcar_test |
|
53 |
#' @importFrom parallel detectCores |
|
54 |
#' @importFrom parallel mclapply |
|
55 |
#' @importFrom pROC roc |
|
56 |
#' @importFrom randomForest randomForest |
|
57 |
#' @importFrom stats predict |
|
58 |
#' @importFrom stringr str_remove |
|
59 |
#' @importFrom tibble tibble |
|
60 |
#' @importFrom tibble rownames_to_column |
|
61 |
#' @export |
|
62 |
#' |
|
63 |
#' @examples |
|
64 |
#' library(smdi) |
|
65 |
#' |
|
66 |
#' smdi_rf(data = smdi_data, covar = "ecog_cat") |
|
67 |
#' |
|
68 | ||
69 |
smdi_rf <- function(data = NULL, |
|
70 |
covar = NULL, |
|
71 |
train_test_ratio = c(.7, .3), |
|
72 |
tune = FALSE, |
|
73 |
set_seed = 42, |
|
74 |
ntree = 1000, |
|
75 |
n_cores = 1 |
|
76 |
){ |
|
77 | ||
78 |
# initialize |
|
79 | 21x |
.data <- MeanDecreaseAccuracy <- V1 <- covariate <- rf_auc <- imp_tmp <- . <- NULL |
80 | ||
81 |
# pre-checks |
|
82 | 3x |
if(is.null(data)){stop("No dataframe provided.")} |
83 | ||
84 |
# n_cores on windows |
|
85 | 18x |
if(Sys.info()[["sysname"]]=="Windows"){ |
86 | ! |
message("Windows does not support parallelization based on forking. <n_cores> will be set to 1.") |
87 | ! |
n_cores = 1 |
88 |
} |
|
89 | ||
90 |
# more cores than available |
|
91 | 18x |
if(n_cores > parallel::detectCores()){ |
92 | ! |
message("You specified more <n_cores> than you have available. The function will use all cores available to it.") |
93 |
} |
|
94 | ||
95 |
# check for missing covariate of interest |
|
96 | 18x |
covar_miss <- smdi::smdi_check_covar( |
97 | 18x |
data = data, |
98 | 18x |
covar = covar |
99 |
) |
|
100 | ||
101 |
# apply smdi_na_indicator for datset to create missing |
|
102 |
# indicator variables; |
|
103 |
# needs to be done for all variables with at least one NA |
|
104 | 18x |
data_encoded <- smdi::smdi_na_indicator( |
105 | 18x |
data = data, |
106 | 18x |
covar = smdi::smdi_check_covar(data = data), |
107 | 18x |
drop_NA_col = TRUE |
108 |
) |
|
109 | ||
110 |
# start applying smd computation over all partially observed covariates |
|
111 | 18x |
rf_loop <- function(i){ |
112 | ||
113 |
# create strata variable |
|
114 | 22x |
target_var <- paste0(i, "_NA") |
115 | ||
116 |
# format missing indicator (target_var) variable correctly |
|
117 | 22x |
data_encoded <- data_encoded %>% |
118 | 22x |
dplyr::mutate(target_var = as.factor(.data[[target_var]])) %>% |
119 | 22x |
dplyr::select(-dplyr::all_of(target_var)) |
120 | ||
121 |
# use x% of dataset as training set and y% as test set |
|
122 | 22x |
set.seed(set_seed) |
123 | 22x |
sample <- sample(c(TRUE, FALSE), nrow(data_encoded), replace = TRUE, prob = train_test_ratio) |
124 | 22x |
train <- data_encoded[sample, ] |
125 | 22x |
test <- data_encoded[!sample, ] |
126 | ||
127 |
# tune mtry if desired |
|
128 | 22x |
if(tune){ |
129 | ||
130 |
# cv 5 folds repeat 1 time & random search for mtry |
|
131 | 1x |
control <- caret::trainControl( |
132 | 1x |
method = 'repeatedcv', |
133 | 1x |
number = 5, |
134 | 1x |
repeats = 1, |
135 | 1x |
search = "random" |
136 |
) |
|
137 | ||
138 | 1x |
set.seed(set_seed) |
139 | 1x |
rf_train <- caret::train( |
140 | 1x |
as.factor(target_var) ~ . , |
141 | 1x |
data = train, |
142 | 1x |
ntree = ntree, |
143 | 1x |
method = "rf", |
144 | 1x |
metric = 'Accuracy', |
145 | 1x |
trControl = control |
146 |
) |
|
147 | ||
148 | 1x |
set.seed(set_seed) |
149 | 1x |
rf <- randomForest::randomForest( |
150 | 1x |
as.factor(target_var) ~ ., |
151 | 1x |
data = train, |
152 | 1x |
ntree = ntree, |
153 | 1x |
mtry = rf_train$bestTune$mtry |
154 |
) |
|
155 | ||
156 |
# compute OOB for cross-validated rf model |
|
157 | 1x |
conf <- rf$confusion[,-ncol(rf$confusion)] |
158 | 1x |
oob <- glue::glue("Estimated OOB error for {i}: {formatC((1 - (sum(diag(conf))/sum(conf)))*100, 4)}%") |
159 | ||
160 |
}else{ |
|
161 | ||
162 | 21x |
set.seed(set_seed) |
163 | 21x |
rf <- randomForest::randomForest(as.factor(target_var) ~ ., data = train, ntree = ntree, importance = TRUE) |
164 | ||
165 |
# compute OOB |
|
166 | 21x |
conf <- rf$confusion[,-ncol(rf$confusion)] |
167 | 21x |
oob <- glue::glue("Estimated OOB error for {i}: {formatC((1 - (sum(diag(conf))/sum(conf)))*100, 4)}%") |
168 | ||
169 |
} |
|
170 | ||
171 |
# evaluate on test set |
|
172 | 22x |
rf_test <- stats::predict(rf, newdata = test, type = "response", importance = T) |
173 | ||
174 | 22x |
rf.roc <- pROC::roc(response = test$target_var, predictor = as.numeric(as.character(rf_test)), quiet = TRUE) |
175 | 22x |
auc <- pROC::auc(rf.roc)[[1]] |
176 | ||
177 | 22x |
rf_tbl_out <- tibble::tibble( |
178 | 22x |
covariate = stringr::str_remove(target_var, "_NA"), |
179 | 22x |
rf_auc = auc |
180 |
) %>% |
|
181 | 22x |
dplyr::mutate(rf_auc = formatC(rf_auc, format = "f", digits = 3)) |
182 | ||
183 | 22x |
rf_plot_out <- rf$importance %>% |
184 | 22x |
as.data.frame() %>% |
185 | 22x |
tibble::rownames_to_column(var = "covariate") %>% |
186 | 22x |
{{. ->> imp_tmp}} %>% |
187 | 22x |
ggplot2::ggplot(ggplot2::aes(x = forcats::fct_reorder(as.factor(covariate), MeanDecreaseAccuracy), y = MeanDecreaseAccuracy)) + |
188 | 22x |
ggplot2::geom_point(size = 3, color = "darkblue") + |
189 | 22x |
ggplot2::labs( |
190 | 22x |
x = "Covariate", |
191 | 22x |
y = "Mean decrease in accuracy", |
192 | 22x |
title = glue::glue("Covariate importance for predicting {stringr::str_remove(target_var, '_NA')} (training set)") |
193 |
) + |
|
194 | 22x |
ggplot2::coord_flip() + |
195 | 22x |
ggplot2::theme_bw() |
196 | ||
197 |
# we add a message for very high AUC values |
|
198 |
# to make analyst aware to check for monotonicity |
|
199 |
# we choose AUC of .9 as cut-off |
|
200 | 22x |
if(auc > 0.9){ |
201 | 6x |
cat("\n") |
202 | 6x |
message("Important note: \n") |
203 | 6x |
message(glue::glue("AUC for predicting covariate {rf_tbl_out$covariate} is very high (>0.9).")) |
204 | ||
205 |
# determine most important predictor |
|
206 | 6x |
imp_var_message <- imp_tmp %>% |
207 | 6x |
dplyr::filter(MeanDecreaseAccuracy == max(MeanDecreaseAccuracy, na.rm=T)) %>% |
208 | 6x |
dplyr::pull(covariate) |
209 | ||
210 | 6x |
message(glue::glue("Predictor with highest importance: {imp_var_message}.")) |
211 | 6x |
message("Check for potentially underlying monotone missing data pattern. \n") |
212 | 6x |
cat("\n") |
213 |
} |
|
214 | ||
215 | 22x |
rf_out <- list( |
216 | 22x |
rf_table = rf_tbl_out, |
217 | 22x |
rf_plot = rf_plot_out, |
218 | 22x |
OOB = oob |
219 |
) |
|
220 | ||
221 | 22x |
return(rf_out) |
222 | ||
223 |
} |
|
224 | ||
225 |
# run the function for each covariate |
|
226 | 18x |
rf_out <- parallel::mclapply(covar_miss, FUN = rf_loop, mc.cores = n_cores) |
227 | 16x |
names(rf_out) <- covar_miss |
228 | ||
229 |
# assign class |
|
230 | 16x |
class(rf_out) <- "rf" |
231 | ||
232 | 16x |
return(rf_out) |
233 | ||
234 |
} |
|
235 | ||
236 |
# generic print ----------------------------------------------------------- |
|
237 | ||
238 |
#' @export |
|
239 |
print.rf <- function(x, ...){ |
|
240 | ||
241 | 4x |
tbl <- do.call(rbind, lapply(x,'[[',1)) |
242 | ||
243 | 4x |
return(print(tbl)) |
244 | ||
245 |
} |
|
246 | ||
247 |
# generic summary --------------------------------------------------------- |
|
248 | ||
249 |
#' @export |
|
250 |
summary.rf <- function(object, ...){ |
|
251 | ||
252 | 11x |
tbl <- do.call(rbind, lapply(object,'[[',1)) |
253 | ||
254 | 11x |
return(tbl) |
255 | ||
256 |
} |
1 |
#' Computes mean/median absolute standardized mean differences between observed and missing observations |
|
2 |
#' |
|
3 |
#' @description |
|
4 |
#' This function takes a dataframe with covariates which are partially observed/missing and returns the |
|
5 |
#' median/average absolute standardized mean difference (asmd) and more details for every specified covariate in covar |
|
6 |
#' (if NULL all covariates with at least one NA are considered). |
|
7 |
#' |
|
8 |
#' Important: don't include variables like ID variables, ZIP codes, dates, etc. |
|
9 |
#' |
|
10 |
#' @details |
|
11 |
#' The asmd may be one indicator as to how much patient characteristics differ between patients with and without an observed |
|
12 |
#' value for a partially observed covariate. If the median/average asmd is above a certain threshold this may indicate |
|
13 |
#' imbalance in patient covariate distributions which may be indicative of the partially observed covariate following a |
|
14 |
#' missing at random (MAR) mechanims, i.e. the missingness is explainable by other observed covariates. Similarly, |
|
15 |
#' no imbalance between observed covariates may be indicative that missingness cannot be explained with observed |
|
16 |
#' covariates and the underlying missingness mechanism may be completely at random (MCAR) or not at random (e.g. |
|
17 |
#' missingness is only associated with unobserved factors or through the partially observed covariate itself). |
|
18 |
#' |
|
19 |
#' A clear cut-off is hard to determine and analogues to propensity scores, |
|
20 |
#' some researchers have proposed that a standardized difference of 0.1 (10 per cent) |
|
21 |
#' denotes meaningful imbalance in the baseline covariate. |
|
22 |
#' |
|
23 |
#' The asmd is computed for every covariate one-by-one and not jointly. If there is multivariate |
|
24 |
#' missingness, i.e. more than just one missing covariate exists, you can decide what should |
|
25 |
#' happen with the other partially observed 'predictor' covariates using the includeNA parameter. |
|
26 |
#' That is, if includeNA is set to FALSE (default), only the asmd between observed cases will be computed, |
|
27 |
#' and if includeNA is set to TRUE, missingness is modeled as an explicit category (categorical covariates only). |
|
28 |
#' |
|
29 |
#' If any other behavior is desired, data transformations for example with the \code{\link{smdi_na_indicator}} function, may make sense |
|
30 |
#' before calling the function. |
|
31 |
#' |
|
32 |
#' The dataframe should generally consist of the exposure variable, the outcome variable(s), the partially observed covariates |
|
33 |
#' and all other fully observed covariates which are deemed important for the final modeling |
|
34 |
#' and (optionally) which could be considered as auxiliary variables. If no partially observed covariates are provided, |
|
35 |
#' the function automatically looks for all variables/columns with NA (powered by the \code{\link{smdi_summarize}} function) |
|
36 |
#' |
|
37 |
#' @seealso |
|
38 |
#' \code{\link[tableone]{CreateTableOne}} |
|
39 |
#' |
|
40 |
#' @references |
|
41 |
#' Austin PC. Balance diagnostics for comparing the distribution of baseline covariates between treatment groups in propensity-score matched samples. Stat Med. 2009 Nov 10;28(25):3083-107. |
|
42 |
#' |
|
43 |
#' Normand SLT, Landrum MB, Guadagnoli E, Ayanian JZ, Ryan TJ, Cleary PD, McNeil BJ. Validating recommendations for coronary angiography following an acute myocardial infarction in the elderly: a matched analysis using propensity scores. Journal of Clinical Epidemiology. 2001;54:387–398. |
|
44 |
#' |
|
45 |
#' @param data dataframe or tibble object with partially observed/missing variables |
|
46 |
#' @param covar character covariate or covariate vector with partially observed variable/column name(s) to investigate. If NULL, the function automatically includes all columns with at least one missing observation and all remaining covariates will be used as predictors |
|
47 |
#' @param median logical if the median (= TRUE; recommended default) or mean of all absolute standardized mean differences (asmd) should be computed |
|
48 |
#' @param includeNA logical, should missingness of other partially observed covariates be explicitly modeled (default is FALSE) |
|
49 |
#' @param n_cores integer, if >1, computations will be parallelized across amount of cores specified in n_cores (only UNIX systems) |
|
50 |
#' |
|
51 |
#' @return returns an asmd object with average/median absolute standardized mean differences. That is, for each covar, the following outputs are provided: |
|
52 |
#' |
|
53 |
#' - asmd_covar: name of covariate investigated |
|
54 |
#' |
|
55 |
#' - asmd_table1: detailed "table 1" illustrating distributions and differences of patient characteristics between those without (1) and with (0) observed covariate |
|
56 |
#' |
|
57 |
#' - asmd_plot: plot of absolute standardized mean differences (asmd) between patients without (1) and with (0) observed covariate (sorted by asmd) |
|
58 |
#' |
|
59 |
#' - asmd_aggregate: average/median absolute standardized mean difference (and min, max) of patient characteristics between those without (1) and with (0) observed covariate |
|
60 |
#' |
|
61 |
#' @importFrom magrittr '%>%' |
|
62 |
#' @importFrom dplyr across |
|
63 |
#' @importFrom dplyr arrange |
|
64 |
#' @importFrom dplyr filter |
|
65 |
#' @importFrom dplyr mutate |
|
66 |
#' @importFrom dplyr pull |
|
67 |
#' @importFrom dplyr summarize_all |
|
68 |
#' @importFrom dplyr where |
|
69 |
#' @importFrom fastDummies dummy_cols |
|
70 |
#' @importFrom forcats fct_reorder |
|
71 |
#' @importFrom ggplot2 aes |
|
72 |
#' @importFrom ggplot2 geom_point |
|
73 |
#' @importFrom ggplot2 ggplot |
|
74 |
#' @importFrom ggplot2 labs |
|
75 |
#' @importFrom ggplot2 scale_color_identity |
|
76 |
#' @importFrom ggplot2 theme_bw |
|
77 |
#' @importFrom glue glue |
|
78 |
#' @importFrom parallel detectCores |
|
79 |
#' @importFrom parallel mclapply |
|
80 |
#' @importFrom stats median |
|
81 |
#' @importFrom tableone CreateTableOne |
|
82 |
#' @importFrom tableone ExtractSmd |
|
83 |
#' @importFrom tibble tibble |
|
84 |
#' @importFrom tibble rownames_to_column |
|
85 |
#' |
|
86 |
#' @export |
|
87 |
#' |
|
88 |
#' @examples |
|
89 |
#' library(smdi) |
|
90 |
#' library(dplyr) |
|
91 |
#' |
|
92 |
#' # S3 print method |
|
93 |
#' asmd <- smdi_asmd(data = smdi_data) |
|
94 |
#' asmd |
|
95 |
#' |
|
96 |
#' # let's look at the first variable |
|
97 |
#' # we can check the complete covariate distribution |
|
98 |
#' asmd$pdl1_num$asmd_table1 |
|
99 |
#' |
|
100 | ||
101 |
smdi_asmd <- function(data = NULL, |
|
102 |
covar = NULL, |
|
103 |
median = TRUE, |
|
104 |
includeNA = FALSE, |
|
105 |
n_cores = 1 |
|
106 |
){ |
|
107 | ||
108 | ||
109 | 44x |
covariate <- `1 vs 2` <- NULL |
110 | ||
111 |
# pre-checks |
|
112 | 7x |
if(is.null(data)){stop("No dataframe provided.")} |
113 | ||
114 |
# n_cores on windows |
|
115 | 37x |
if(Sys.info()[["sysname"]]=="Windows"){ |
116 | ! |
message("Windows does not support parallelization based on forking. <n_cores> will be set to 1.") |
117 | ! |
n_cores = 1 |
118 |
} |
|
119 | ||
120 |
# more cores than available |
|
121 | 37x |
if(n_cores > parallel::detectCores()){ |
122 | ! |
message("You specified more <n_cores> than you have available. The function will use all cores available to it.") |
123 |
} |
|
124 | ||
125 |
# pick missing indicator columns/partially observed covariates |
|
126 |
# check for missing covariates |
|
127 | 37x |
covar_miss <- smdi::smdi_check_covar( |
128 | 37x |
data = data, |
129 | 37x |
covar = covar |
130 |
) |
|
131 | ||
132 |
# start applying smd computation over all partially observed covariates |
|
133 | 37x |
smd_loop <- function(i){ |
134 | ||
135 |
# apply smdi_na_indicator for i to create missing |
|
136 |
# indicator variable |
|
137 | 51x |
strata_df <- smdi::smdi_na_indicator( |
138 | 51x |
data = data, |
139 | 51x |
covar = i, |
140 | 51x |
drop_NA_col = TRUE |
141 |
) |
|
142 | ||
143 |
# create strata variable |
|
144 | 51x |
strata_var <- paste0(i, "_NA") |
145 | ||
146 |
# create tableone |
|
147 | 51x |
tbl1 <- tableone::CreateTableOne( |
148 | 51x |
data = strata_df, |
149 |
# all covariates except strata covariate |
|
150 | 51x |
vars = names(strata_df)[ !names(strata_df) == strata_var], |
151 |
# strata covariate |
|
152 | 51x |
strata = strata_var, |
153 |
# if multiple variables are missing, NA is an own category |
|
154 | 51x |
includeNA = includeNA, |
155 | 51x |
smd = TRUE |
156 |
) |
|
157 | ||
158 |
# extract smd and compute median/mean |
|
159 | 51x |
smd <- tableone::ExtractSmd(tbl1) %>% |
160 | 51x |
as.data.frame() %>% |
161 | 51x |
tibble::rownames_to_column(var = "covariate") %>% |
162 | 51x |
dplyr::filter(covariate != i) %>% |
163 | 51x |
dplyr::rename(smd = `1 vs 2`) |
164 | ||
165 |
# asmd plot |
|
166 | 51x |
asmd_plot <- smd %>% |
167 | 51x |
ggplot2::ggplot( |
168 | 51x |
ggplot2::aes( |
169 | 51x |
y = forcats::fct_reorder(covariate, smd), |
170 | 51x |
x = smd, |
171 | 51x |
color = ifelse(smd < 0.1, "blue", "orange") |
172 |
) |
|
173 |
) + |
|
174 | 51x |
ggplot2::geom_point(size = 3) + |
175 | 51x |
ggplot2::labs( |
176 | 51x |
title = glue::glue("ASMD plot for covariate {i}"), |
177 | 51x |
x = "Absolute standardized mean difference [ASMD]", |
178 | 51x |
y = "", |
179 | 51x |
color = "asmd < 0.1", |
180 | 51x |
caption = glue::glue("ASMD is computed as the asmd between patients with and without observed '{i}'") |
181 |
) + |
|
182 | 51x |
ggplot2::scale_color_identity() + |
183 | 51x |
ggplot2::theme_bw() |
184 | ||
185 | 51x |
if(isTRUE(median)){ |
186 | ||
187 | 37x |
asmd_aggregate <- tibble::tibble( |
188 | 37x |
covariate = paste(i), |
189 | 37x |
asmd_median = stats::median(smd$smd), |
190 | 37x |
asmd_min = min(smd$smd), |
191 | 37x |
asmd_max = max(smd$smd) |
192 |
) |
|
193 | ||
194 |
}else{ |
|
195 | ||
196 | 14x |
asmd_aggregate <- tibble::tibble( |
197 | 14x |
covariate = paste(i), |
198 | 14x |
asmd_mean = mean(smd$smd), |
199 | 14x |
asmd_min = min(smd$smd), |
200 | 14x |
asmd_max = max(smd$smd) |
201 |
) |
|
202 | ||
203 |
} |
|
204 | ||
205 |
# finally, round asmd_aggregate to three decimals |
|
206 | 51x |
asmd_aggregate <- asmd_aggregate %>% |
207 | 51x |
dplyr::mutate( |
208 | 51x |
dplyr::across( |
209 | 51x |
dplyr::where(is.numeric), |
210 | 51x |
~formatC(.x, format = "f", digits = 3) |
211 |
) |
|
212 |
) |
|
213 | ||
214 |
# assemble lapply output object |
|
215 | 51x |
return( |
216 | 51x |
list( |
217 | 51x |
asmd_covar = i, |
218 | 51x |
asmd_table1 = print(tbl1, smd = TRUE, printToggle = FALSE), |
219 | 51x |
asmd_plot = asmd_plot, |
220 | 51x |
asmd_aggregate = asmd_aggregate |
221 |
) |
|
222 |
) |
|
223 | ||
224 | 37x |
} # loop ends |
225 | ||
226 |
# iterate above analyses over all specified |
|
227 |
# partially observed covariates |
|
228 | 37x |
asmd_out <- parallel::mclapply(covar_miss, FUN = smd_loop, mc.cores = n_cores) |
229 | 35x |
names(asmd_out) <- covar_miss |
230 | ||
231 | 35x |
class(asmd_out) <- "asmd" |
232 | ||
233 | 35x |
return(asmd_out) |
234 | ||
235 |
} |
|
236 | ||
237 |
# generic print ----------------------------------------------------------- |
|
238 | ||
239 |
#' @export |
|
240 |
print.asmd <- function(x, ...){ |
|
241 | ||
242 | ! |
tbl <- do.call(rbind, lapply(x,'[[',4)) |
243 | ||
244 | ! |
return(print(tbl)) |
245 | ||
246 |
} |
|
247 | ||
248 |
# generic summary --------------------------------------------------------- |
|
249 | ||
250 |
#' @export |
|
251 |
summary.asmd <- function(object, ...){ |
|
252 | ||
253 | 7x |
tbl <- do.call(rbind, lapply(object,'[[',4)) |
254 | ||
255 | 7x |
return(tbl) |
256 | ||
257 |
} |
|
258 | ||
259 |
1 |
#' Computes association between missingness and outcome |
|
2 |
#' |
|
3 |
#' @description |
|
4 |
#' This function fits outcome models with a covariate missingness indicator(s) of the covariates specified with *covar*. |
|
5 |
#' The estimates are computed by univariate and adjusted models on all other prognostic covariates |
|
6 |
#' in the dataset. Based on the underlying missingness mechanism, the estimate for the covariate missingness indicator |
|
7 |
#' may indicate a meaningful difference in the outcome between patients with vs w/o |
|
8 |
#' the observed confounder conditional on other covariates that could explain that difference. |
|
9 |
#' |
|
10 |
#' @param model `r lifecycle::badge("deprecated")` `model = "logistic"` is no |
|
11 |
#' longer supported in favor of model = "glm" and the new glm_family parameter |
|
12 |
#' which can be used to specify any glm model. |
|
13 |
#' |
|
14 |
#' Important: don't include variables like ID variables, ZIP codes, dates, etc. |
|
15 |
#' |
|
16 |
#' @details |
|
17 |
#' The function automatically fits a univariate and adjusted outcome model. The currently supported models are glm (glm), linear (lm) and cox (survival). |
|
18 |
#' For adjusted models, the function uses all available covariates found in the dataset specified with the data parameter. If covariates should not |
|
19 |
#' be include in the outcome model, these covariates should be dropped beforehand (as with all other functions in the smdi package). |
|
20 |
#' |
|
21 |
#' The left-hand side of the formula (form_lhs) needs to specify the outcome in one of the following ways: |
|
22 |
#' |
|
23 |
#' - glm (binary): character of column name with binary outcome, e.g. "MACE" |
|
24 |
#' |
|
25 |
#' - lm (continuous): character of column name with binary outcome, e.g. "WEIGHT_LOSS" |
|
26 |
#' |
|
27 |
#' - cox (time-to-event): LHS specifying time-to-event outcome, e.g. "Surv(TIME, STATUS)" |
|
28 |
#' |
|
29 |
#' @references |
|
30 |
#' ... |
|
31 |
#' |
|
32 |
#' @param data dataframe or tibble object with partially observed/missing variables |
|
33 |
#' @param covar character covariate or covariate vector with partially observed variable/column name(s) to investigate. If NULL, the function automatically includes all columns with at least one missing observation and all remaining covariates will be used as predictors |
|
34 |
#' @param model character describing which outcome model to fit to assess the association between covar missingness indicator and outcome. Currently supported are models of type glm, linear and cox |
|
35 |
#' @param glm_family glm family object to specify a certain family of generalized linear models (e.g., binomial, gaussian, Gamma, poisson, etc.). For all options see ?stats::family |
|
36 |
#' @param form_lhs string specifying the left-hand side of the outcome formula (see details) |
|
37 |
#' @param exponentiated logical, should results be exponentiated (default is FALSE) |
|
38 |
#' @param n_cores integer, if >1, computations will be parallelized across amount of cores specified in n_cores (only UNIX systems) |
|
39 |
#' |
|
40 |
#' @seealso |
|
41 |
#' \code{\link[stats]{glm}} |
|
42 |
#' \code{\link[survival]{coxph}} |
|
43 |
#' |
|
44 |
#' @return returns a tibble with univariate and adjusted estimates for each partially observed covar: |
|
45 |
#' |
|
46 |
#' - estimate_univariate: univariate association between missingness indicator of covar and outcome |
|
47 |
#' |
|
48 |
#' - estimate_adjusted: association between missingness indicator of covar and outcome conditional on other fully observed covariates and missing indicator variables of other partially observed covariates |
|
49 |
#' |
|
50 |
#' @importFrom broom tidy |
|
51 |
#' @importFrom dplyr across |
|
52 |
#' @importFrom dplyr filter |
|
53 |
#' @importFrom dplyr left_join |
|
54 |
#' @importFrom dplyr mutate |
|
55 |
#' @importFrom dplyr select |
|
56 |
#' @importFrom glue glue |
|
57 |
#' @importFrom parallel detectCores |
|
58 |
#' @importFrom parallel mclapply |
|
59 |
#' @importFrom stats as.formula |
|
60 |
#' @importFrom stats binomial gaussian glm lm Gamma inverse.gaussian poisson quasi quasibinomial quasipoisson |
|
61 |
#' @importFrom stringr str_remove |
|
62 |
#' @importFrom survival coxph |
|
63 |
#' @importFrom survival Surv |
|
64 |
#' |
|
65 |
#' @export |
|
66 |
#' |
|
67 |
#' @examples |
|
68 |
#' library(smdi) |
|
69 |
#' |
|
70 |
#' smdi_outcome( |
|
71 |
#' data = smdi_data, |
|
72 |
#' model = "cox", |
|
73 |
#' form_lhs = "Surv(eventtime, status)" |
|
74 |
#' ) |
|
75 |
#' |
|
76 |
#' |
|
77 |
smdi_outcome <- function(data = NULL, |
|
78 |
covar = NULL, |
|
79 |
model = c("glm", "linear", "cox"), |
|
80 |
glm_family = NULL, |
|
81 |
form_lhs = NULL, |
|
82 |
exponentiated = FALSE, |
|
83 |
n_cores = 1 |
|
84 |
){ |
|
85 | ||
86 |
# initialize |
|
87 | 47x |
term <- covariate <- estimate <- conf.low <- conf.high <- estimate_univariate <- estimate_adjusted <- V1 <- NULL |
88 | ||
89 |
# superseded 'model' parameter |
|
90 | 47x |
if (model == "logistic") { |
91 | 3x |
lifecycle::deprecate_stop( |
92 | 3x |
when = "0.3.0", |
93 | 3x |
what = "smdi_outcome(model = 'logistic')", |
94 | 3x |
details = "'logistic' is no longer supported and was superseded with 'glm' along with the <glm_family> parameter to specify any glm model supported by stats::glm family." |
95 |
) |
|
96 |
} |
|
97 | ||
98 |
# pre-checks |
|
99 | ! |
if(is.null(data)){stop("No dataframe provided.")} |
100 | ! |
if(is.null(form_lhs)){stop("No <form_lhs> provided.")} |
101 | 3x |
if(is.null(model) || !model %in% c("glm", "linear", "cox")){stop("<model> either not specified or not of type glm, linear or cox")} |
102 | 6x |
if(model %in% c("linear", "cox") & !is.null(glm_family)){stop("<glm_family> is specified although <model> is either 'linear' or 'cox'")} |
103 | 3x |
if(any(is.na(data[[form_lhs]]))){stop("Outcome variable <form_lhs> contains missing observations for which smdi_outcome is not suited.")} |
104 | ||
105 |
# if model is glm but no family is specified |
|
106 |
# logistic regression is default if nothing is specified |
|
107 | 27x |
if(model == "glm" & is.null(glm_family)){ |
108 | ||
109 | 6x |
glm_family <- binomial(link = "logit") |
110 | 6x |
message("<glm_family> not specified. Using 'binomial(link = 'logit')' as default.") |
111 | ||
112 |
} |
|
113 | ||
114 |
# n_cores on windows |
|
115 | 27x |
if(Sys.info()[["sysname"]]=="Windows"){ |
116 | ! |
warning("Windows does not support parallelization based on forking. <n_cores> will be set to 1.") |
117 | ! |
n_cores = 1 |
118 |
} |
|
119 | ||
120 |
# more cores than available |
|
121 | 27x |
if(n_cores > parallel::detectCores()){ |
122 | ! |
warning("You specified more <n_cores> than you have available. The function will use all cores available to it.") |
123 |
} |
|
124 | ||
125 |
# check for missing covariate of interest |
|
126 | 27x |
covar_miss <- smdi::smdi_check_covar( |
127 | 27x |
data = data, |
128 | 27x |
covar = covar |
129 |
) |
|
130 | ||
131 |
# apply smdi_na_indicator for datset to create missing |
|
132 |
# indicator variables; |
|
133 |
# needs to be done for all variables with at least one NA |
|
134 | 27x |
data_encoded <- smdi::smdi_na_indicator( |
135 | 27x |
data = data, |
136 | 27x |
covar = smdi::smdi_check_covar(data = data), |
137 | 27x |
drop_NA_col = TRUE |
138 |
) |
|
139 | ||
140 |
# univariate outcome results --------------------------------------------------- |
|
141 | 27x |
univariate_loop <- function(i){ |
142 | ||
143 |
# create strata variable |
|
144 | 24x |
target_var <- paste0(i, "_NA") |
145 | ||
146 | 24x |
form_univariate <- stats::as.formula(paste(form_lhs, "~", target_var)) |
147 | ||
148 |
# fit model |
|
149 | 24x |
if(model == "glm"){ |
150 | ||
151 | 9x |
univariate_fit <- stats::glm(form_univariate, family = glm_family, data = data_encoded) |
152 | ||
153 | 15x |
}else if(model == "linear"){ |
154 | ||
155 | 5x |
univariate_fit <- stats::lm(form_univariate, data = data_encoded) |
156 | ||
157 | 10x |
}else if(model == "cox"){ |
158 | ||
159 | 10x |
form_univariate <- stats::as.formula(paste("survival::", form_lhs, "~", target_var)) |
160 | 10x |
univariate_fit <- survival::coxph(form_univariate, data = data_encoded) |
161 | ||
162 |
} |
|
163 | ||
164 | 24x |
univariate_result <- univariate_fit %>% |
165 | 24x |
broom::tidy(exponentiate = exponentiated, conf.int = TRUE) %>% |
166 |
# filter out intercept term |
|
167 | 24x |
dplyr::filter(term != "(Intercept)") %>% |
168 |
# go back to initial covariate name to be consistent |
|
169 | 24x |
dplyr::mutate(covariate = stringr::str_remove(term, "_NA")) %>% |
170 |
# summarize estimate |
|
171 | 24x |
dplyr::mutate(dplyr::across(c(estimate, conf.low, conf.high), ~ formatC(.x, format = "f", digits = 2))) %>% |
172 | 24x |
dplyr::mutate(estimate_univariate = glue::glue("{estimate} (95% CI {conf.low}, {conf.high})")) %>% |
173 | 24x |
dplyr::select(covariate, estimate_univariate) |
174 | ||
175 | 24x |
return(univariate_result) |
176 | ||
177 |
} |
|
178 | ||
179 |
# collect results over all covar_miss |
|
180 | 27x |
univariate_out <- do.call(rbind, parallel::mclapply(covar_miss, FUN = univariate_loop, mc.cores = n_cores)) |
181 | ||
182 | ||
183 |
# adjusted outcome results ------------------------------------------------ |
|
184 | 25x |
form_adjusted <- as.formula(paste(form_lhs, "~ .")) |
185 | ||
186 | 25x |
if(model == "glm"){ |
187 | ||
188 | 9x |
adjusted_fit <- stats::glm(form_adjusted, family = glm_family, data = data_encoded) |
189 | ||
190 | 16x |
}else if(model == "linear"){ |
191 | ||
192 | 6x |
adjusted_fit <- stats::lm(form_adjusted, data = data_encoded) |
193 | ||
194 | 10x |
}else if(model == "cox"){ |
195 | ||
196 | 10x |
form_adjusted <- stats::as.formula(paste("survival::", form_lhs, "~ .")) |
197 | 10x |
adjusted_fit <- survival::coxph(form_adjusted, data = data_encoded) |
198 | ||
199 |
} |
|
200 | ||
201 | 25x |
adjusted_out <- adjusted_fit %>% |
202 | 25x |
broom::tidy(exponentiate = exponentiated, conf.int = TRUE) %>% |
203 |
# go back to initial covariate name to be consistent |
|
204 | 25x |
dplyr::mutate(covariate = stringr::str_remove(term, "_NA")) %>% |
205 |
# filter out intercept term and other covariate estimates |
|
206 | 25x |
dplyr::filter(covariate %in% c(covar_miss)) %>% |
207 | 25x |
dplyr::mutate(dplyr::across(c(estimate, conf.low, conf.high), ~ formatC(.x, format = "f", digits = 2))) %>% |
208 | 25x |
dplyr::mutate(estimate_adjusted = glue::glue("{estimate} (95% CI {conf.low}, {conf.high})")) %>% |
209 | 25x |
dplyr::select(covariate, estimate_adjusted) |
210 | ||
211 | ||
212 |
# combine univariate and adjusted results |
|
213 | 25x |
results_out <- univariate_out %>% |
214 | 25x |
dplyr::left_join(adjusted_out, by = "covariate") |
215 | ||
216 | 25x |
return(results_out) |
217 | ||
218 |
} |
1 |
#' This is a utility function to help check input data and covariates provided |
|
2 |
#' |
|
3 |
#' @param data dataframe or tibble object with partially observed/missing variables |
|
4 |
#' @param covar character covariate or covariate vector with partially observed variable/column name(s) to investigate. If NULL, the function automatically looks for and includes all columns with at least one missing observation |
|
5 |
#' |
|
6 |
#' @return returns the covariate vector for subsequent tasks or warnings/errors |
|
7 |
#' |
|
8 |
#' @importFrom magrittr '%>%' |
|
9 |
#' @importFrom dplyr pull |
|
10 |
#' @importFrom dplyr pull |
|
11 |
#' @importFrom glue glue |
|
12 |
#' @importFrom dplyr where |
|
13 |
#' @importFrom dplyr all_of |
|
14 |
#' |
|
15 |
#' @export |
|
16 |
#' |
|
17 | ||
18 |
smdi_check_covar <- function(data = NULL, |
|
19 |
covar = NULL |
|
20 |
){ |
|
21 | ||
22 |
# check if data is provided |
|
23 | 7x |
if(is.null(data)){stop("No dataframe provided.")} |
24 | ||
25 | ||
26 |
# if <covar> is specified -> select specified NA variables (and make sure all are NA), if not automatically find all that have at least one NA value --- |
|
27 | ||
28 |
# if <covar> is specified: |
|
29 | 459x |
if(!is.null(covar)){ |
30 | ||
31 |
# give an error if not all covariates specified in <covar> are present in <data> |
|
32 | 218x |
if(any(!covar %in% colnames(data))){ |
33 | ||
34 | 7x |
stop("Not all covariates specified in <covar> are present in the <data> provided.") |
35 | ||
36 |
}else{ # if all covariates in <covar> are present |
|
37 | ||
38 |
# check if there are variables specified in <covar> which are NOT missing |
|
39 | 211x |
covar_fully_obs <- data %>% |
40 | 211x |
dplyr::select(dplyr::all_of(covar)) %>% |
41 | 211x |
dplyr::select(dplyr::where(~sum(is.na(.x)) == 0)) |
42 | ||
43 |
# give warning if not all of the specified <covar> variables have a missing => select only the ones with missing and return message |
|
44 | 211x |
if(ncol(covar_fully_obs) > 0){ |
45 | ||
46 | 25x |
covar_fully_obs_collapse <- paste(names(covar_fully_obs), collapse = ", ") |
47 | ||
48 | 25x |
warning(glue::glue("<{covar_fully_obs_collapse}> specified as part of <covar> but does/do not contain any missing value. Please check that missing values are coded as <NA>. <{covar_fully_obs_collapse}> will not be considered as missing <covar>.")) |
49 | ||
50 |
# drop fully observed covariates |
|
51 | 25x |
covar_miss <- data %>% |
52 | 25x |
dplyr::select(dplyr::all_of(covar)) %>% |
53 | 25x |
dplyr::select(-dplyr::all_of(names(covar_fully_obs))) %>% |
54 | 25x |
names() |
55 | ||
56 |
}else{ # if all <covar> are present and all have at least one NA |
|
57 | ||
58 |
# return covariates specified in <covar> for subsequent operations |
|
59 | 186x |
covar_miss <- covar |
60 | ||
61 |
} # end: all covariates are present and have at least one NA |
|
62 | ||
63 |
} # end: if all covariates are present |
|
64 | ||
65 |
# if covar is not specified, i.e. is null |
|
66 |
}else{ |
|
67 | ||
68 |
# select all covariates that have at least one NA value |
|
69 | 241x |
covar_miss <- data %>% |
70 | 241x |
dplyr::select(dplyr::where(~sum(is.na(.x)) > 0)) %>% |
71 | 241x |
names() |
72 | ||
73 |
} # end: if no <covar> are specified |
|
74 | ||
75 |
# give error if there are no covariates with at least one NA value |
|
76 | 452x |
if(length(covar_miss)==0){ |
77 | ||
78 | 17x |
stop("Found no covariates with missing values. Please check that missing values are coded as <NA>.") |
79 | ||
80 |
} |
|
81 | ||
82 |
# return covariate vector |
|
83 | 435x |
return(covar_miss) |
84 | ||
85 |
} |
1 |
#' Create binary missing indicator variables by two different strategies |
|
2 |
#' |
|
3 |
#' @description |
|
4 |
#'This function takes a dataframe and creates binary missing indicator variable. This can be realized with two |
|
5 |
#'different approaches: |
|
6 |
#' |
|
7 |
#'Approach 1 (drop_NA_col = FALSE): creates a binary missing indicator variable for partially observed variables and retains both original and indicator variables. |
|
8 |
#' |
|
9 |
#'Approach 2 (drop_NA_col = TRUE): creates a binary missing indicator variable for partially observed variables and only retains indicator variables (and drops the original variables). |
|
10 |
#' |
|
11 |
#'Important: Make sure you have your variables format correct and avoid to include variables like ID variables, ZIP codes, dates, etc. |
|
12 |
#' |
|
13 |
#' @param data dataframe or tibble object with partially observed/missing variables |
|
14 |
#' @param covar character covariate or covariate vector with partially observed variable/column name(s) to investigate. If NULL, the function automatically includes all columns with at least one missing observation. |
|
15 |
#' @param drop_NA_col logical, drop specified columns with NA (default) or retain those columns |
|
16 |
#' |
|
17 |
#' @return returns the dataframe with missing indicator variables (column names are ending on "_NA") |
|
18 |
#' |
|
19 |
#' @importFrom magrittr '%>%' |
|
20 |
#' @importFrom dplyr across |
|
21 |
#' @importFrom dplyr all_of |
|
22 |
#' |
|
23 |
#' @export |
|
24 |
#' |
|
25 |
#' @examples |
|
26 |
#'library(smdi) |
|
27 |
#'library(dplyr) |
|
28 |
#' |
|
29 |
#' smdi_data %>% |
|
30 |
#' smdi_na_indicator(drop_NA_col = FALSE) %>% |
|
31 |
#' glimpse() |
|
32 |
#' |
|
33 |
#' smdi_data %>% |
|
34 |
#' smdi_na_indicator(drop_NA_col = TRUE) %>% |
|
35 |
#' glimpse() |
|
36 |
#' |
|
37 |
smdi_na_indicator <- function(data = NULL, |
|
38 |
covar = NULL, |
|
39 |
drop_NA_col = TRUE |
|
40 |
){ |
|
41 | ||
42 |
# initializing new variables |
|
43 | ||
44 |
# pre-checks |
|
45 | 5x |
if(is.null(data)){stop("No dataframe provided.")} |
46 | ||
47 |
# check for missing covariates |
|
48 | 175x |
covar_miss <- smdi::smdi_check_covar( |
49 | 175x |
data = data, |
50 | 175x |
covar = covar |
51 |
) |
|
52 | ||
53 | ||
54 |
# Strategy 1: columns with NA will be retained ---------------------------- |
|
55 | 175x |
data_encoded <- data %>% |
56 |
# first we create a NA category for continuous covariates and then median impute the missings |
|
57 | 175x |
dplyr::mutate(dplyr::across(dplyr::all_of(covar_miss), ~ ifelse(is.na(.x), 1, 0), .names = "{.col}_NA")) |
58 | ||
59 |
# Strategy 2: columns with NA will be dropped ----------------------------- |
|
60 | 175x |
if(isTRUE(drop_NA_col)){ |
61 | ||
62 | 160x |
data_encoded <- data_encoded %>% |
63 | 160x |
dplyr::select(-dplyr::all_of(covar_miss)) |
64 | ||
65 |
} |
|
66 | ||
67 | ||
68 | 175x |
return(data_encoded) |
69 | ||
70 |
} |
|
71 |
1 |
#' Computes Little's test |
|
2 |
#' |
|
3 |
#' @description |
|
4 |
#' Little’s chi-squared test takes into account possible patterns of missingness across all variables in the dataset. |
|
5 |
#' Rejection of the null hypothesis of this test would provide sufficient evidence to indicate that the data are (globally) not MCAR. |
|
6 |
#' Please note that compared to \link{smdi_hotelling}, this function tests for MCAR globally across all missing covariates. |
|
7 |
#' |
|
8 |
#'#' #' Important: don't include variables like ID variables, ZIP codes, dates, etc. |
|
9 |
#' |
|
10 |
#' @details |
|
11 |
#' CAVE: Hotelling's and Little's show high susceptibility with large sample sizes and it is recommended to always interpret the results along with the other diagnostics. |
|
12 |
#' |
|
13 |
#' @seealso |
|
14 |
#' \code{\link[naniar]{mcar_test}} |
|
15 |
#' |
|
16 |
#' @references |
|
17 |
#' Little RJA. A Test of Missing Completely at Random for Multivariate Data with Missing Values. |
|
18 |
#' J Am Stat Assoc. 1988;83(404):1198-1202. |
|
19 |
#' |
|
20 |
#' |
|
21 |
#' @param data dataframe or tibble object with partially observed/missing variables |
|
22 |
#' |
|
23 |
#' @return returns a little object with statistics on little's test globally. |
|
24 |
#' |
|
25 |
#' @importFrom naniar mcar_test |
|
26 |
#' |
|
27 |
#' @export |
|
28 |
#' |
|
29 |
#' @examples |
|
30 |
#' library(smdi) |
|
31 |
#' library(dplyr) |
|
32 |
#' |
|
33 |
#' smdi_data %>% |
|
34 |
#' smdi_little() |
|
35 |
#' |
|
36 | ||
37 |
smdi_little <- function(data = NULL |
|
38 |
){ |
|
39 | ||
40 |
# pre-checks |
|
41 | 5x |
if(is.null(data)){stop("No dataframe provided.")} |
42 | ||
43 |
# for little we need to one-hot-encode categorical variables |
|
44 |
# multi-categorical variables if exist |
|
45 | 22x |
if(any(sapply(data, function(.x) is.factor(.x) || is.character(.x)))){ |
46 | ||
47 | ! |
data_encoded <- data %>% |
48 | ! |
fastDummies::dummy_columns( |
49 | ! |
remove_most_frequent_dummy = TRUE, |
50 |
# as opposed to smdi_hotelling, for |
|
51 |
# smdi_little ignore_na is set to true |
|
52 |
# since for mcar_test, there need to be missing cells |
|
53 |
# (ignore_na = FALSE would create a missing indicator column) |
|
54 | ! |
ignore_na = TRUE, |
55 | ! |
remove_selected_columns = TRUE |
56 |
) |
|
57 | ||
58 |
}else{ |
|
59 | ||
60 | 22x |
data_encoded <- data |
61 | ||
62 |
} |
|
63 | ||
64 |
# expect at least one column with missing values |
|
65 | 22x |
if(!any(sapply(data, function(.x) is.na(.x)))){ |
66 | ||
67 | 5x |
stop("<data> does not contain any column with missing values.") |
68 | ||
69 |
} |
|
70 | ||
71 | 17x |
little <- naniar::mcar_test(data = data_encoded) |
72 | ||
73 | 17x |
class(little) <- "little" |
74 | ||
75 | 17x |
return(little) |
76 | ||
77 |
} |
1 |
#' Takes an object of class smdi and styles it to a publication-ready gt table |
|
2 |
#' |
|
3 |
#' `r lifecycle::badge("experimental")` |
|
4 |
#' @description |
|
5 |
#' This function takes either an object of class smdi or data.frame or tibble as |
|
6 |
#' input and styles it to a publication-ready table based on the gt package. |
|
7 |
#' The output is of class gt and can take further gt-based arguments for |
|
8 |
#' customization. |
|
9 |
#' |
|
10 |
#' @param smdi_object object of class "smdi" or data.frame/tibble |
|
11 |
#' @param include_little can be logical (TRUE/FALSE) for displaying Little's p-value that is part of an "smdi" object or a separate object of class "little" |
|
12 |
#' @param font_size integer to determine table font size |
|
13 |
#' @param tbl_width integer to determine table width |
|
14 |
#' |
|
15 |
#' |
|
16 |
#' @return returns a formatted gt table object |
|
17 |
#' |
|
18 |
#' @importFrom magrittr '%>%' |
|
19 |
#' @importFrom gt gt tab_footnote html cells_column_labels cols_label tab_options px |
|
20 |
#' @importFrom glue glue |
|
21 |
#' |
|
22 |
#' @seealso |
|
23 |
#' \code{\link[gt]{gt}} |
|
24 |
#' |
|
25 |
#' @export |
|
26 |
#' |
|
27 |
#' @examples |
|
28 |
#'library(smdi) |
|
29 |
#'library(dplyr) |
|
30 |
#' |
|
31 |
#' smdi_diagnose( |
|
32 |
#' data = smdi_data, |
|
33 |
#' covar = "egfr_cat", |
|
34 |
#' model = "cox", |
|
35 |
#' form_lhs = "Surv(eventtime, status)" |
|
36 |
#' ) %>% |
|
37 |
#' smdi_style_gt() |
|
38 |
#' |
|
39 | ||
40 |
smdi_style_gt <- function(smdi_object = NULL, |
|
41 |
include_little = TRUE, |
|
42 |
font_size = 13, |
|
43 |
tbl_width = 800 |
|
44 |
){ |
|
45 | ||
46 | ||
47 | 6x |
asmd_median_min_max <- hotteling_p <- rf_auc <- estimate_univariate <- estimate_adjusted <- NULL |
48 | ||
49 |
# check if smdi object or table |
|
50 | 6x |
if(inherits(smdi_object, "smdi")){ |
51 | ||
52 | 2x |
smdi_table <- smdi_object$smdi_tbl |
53 | ||
54 | 4x |
}else if(any(class(smdi_object) %in% c("data.frame", "tibble", "tbl_df", "tbl"))){ |
55 | ||
56 | 2x |
smdi_table <- smdi_object |
57 | ||
58 |
}else{ |
|
59 | ||
60 | 2x |
stop("<smdi_object> is not of type smdi, data.frame or tibble") |
61 | ||
62 |
} |
|
63 | ||
64 |
# little checks |
|
65 | 4x |
if(isTRUE(include_little) & !inherits(smdi_object, "smdi")){ |
66 | ||
67 | 1x |
warning("If include_little = TRUE, <smdi_object> needs to be of class 'smdi' (not dataframe or tibble). No p-value for Little displayed.") |
68 | ||
69 |
} |
|
70 | ||
71 |
# little (if specified) |
|
72 | 4x |
if(isTRUE(include_little) & inherits(smdi_object, "smdi")){ |
73 | ||
74 |
# in smdi object, the p-value is already formatted |
|
75 | 1x |
little_foot <- glue::glue("{stringr::str_replace(smdi_object$p_little, '_', ' ')}, ") |
76 | ||
77 | 3x |
}else if(inherits(include_little, "little")){ |
78 | ||
79 |
# same formatting as in smdi_diagnose(): |
|
80 | 1x |
p_little_value <- ifelse(include_little$p.value < 0.001, '<.001', formatC(include_little$p.value, format = 'f', digits = 3)) |
81 | 1x |
little_foot <- glue::glue("p little: {p_little_value}, ") |
82 | ||
83 |
}else{ |
|
84 | ||
85 | 2x |
little_foot <- "" |
86 | ||
87 |
} |
|
88 | ||
89 |
# general abbrevations |
|
90 | 4x |
foot_abbr <- "ASMD = Median absolute standardized mean difference across all covariates, AUC = Area under the curve, beta = beta coefficient, CI = Confidence interval, max = Maximum, min = Minimum" |
91 | ||
92 | 4x |
smdi_gt <- smdi_table %>% |
93 | ||
94 |
# make into a gt table |
|
95 | 4x |
gt::gt() %>% |
96 | ||
97 | 4x |
gt::tab_footnote( |
98 | 4x |
footnote = gt::md(glue::glue("{little_foot} Abbreviations: {foot_abbr}")) |
99 |
) %>% |
|
100 | ||
101 | 4x |
gt::cols_label( |
102 | 4x |
covariate = "Covariate", |
103 | 4x |
asmd_median_min_max= "ASMD (min/max)", |
104 | 4x |
hotteling_p = gt::md("p Hotelling"), |
105 | 4x |
rf_auc = "AUC", |
106 | 4x |
estimate_univariate = gt::md("beta univariate (95% CI)"), |
107 | 4x |
estimate_adjusted = gt::md("beta (95% CI)") |
108 |
) %>% |
|
109 | ||
110 |
# add footnotes describing three group diagnostics |
|
111 | 4x |
gt::tab_footnote( |
112 | 4x |
footnote = "Group 1 diagnostic: Differences in patient characteristics between patients with and without covariate", |
113 | 4x |
locations = gt::cells_column_labels( |
114 | 4x |
columns = c(asmd_median_min_max, hotteling_p) |
115 |
) |
|
116 |
) %>% |
|
117 | ||
118 | 4x |
gt::tab_footnote( |
119 | 4x |
footnote = "Group 2 diagnostic: Ability to predict missingness", |
120 | 4x |
locations = gt::cells_column_labels( |
121 | 4x |
columns = rf_auc |
122 |
) |
|
123 |
) %>% |
|
124 | ||
125 | 4x |
gt::tab_footnote( |
126 | 4x |
footnote = "Group 3 diagnostic: Assessment if missingness is associated with the outcome (univariate, adjusted)", |
127 | 4x |
locations = gt::cells_column_labels( |
128 | 4x |
columns = c(estimate_univariate, estimate_adjusted) |
129 |
) |
|
130 |
) %>% |
|
131 | ||
132 | 4x |
gt::tab_options( |
133 | 4x |
table.width = gt::px(tbl_width), |
134 | 4x |
data_row.padding = gt::px(3), |
135 | 4x |
table.font.size = font_size |
136 |
) %>% |
|
137 | ||
138 | 4x |
gt::cols_align(align = "left") |
139 | ||
140 | ||
141 | 4x |
return(smdi_gt) |
142 | ||
143 |
} |
|
144 |
1 |
#' Utility helper to give a light summary of partially observed covariates |
|
2 |
#' |
|
3 |
#' @description |
|
4 |
#' This function takes a dataframe and automatically returns the amount and proportion of |
|
5 |
#' missing for partially observed covariates assuming a one-row-per-patient |
|
6 |
#' dataframe. This is an important utility function for other functions in this package. |
|
7 |
#' Results can also be stratified by another variable |
|
8 |
#' in which case the proportion missing refers to the amount of |
|
9 |
#' patients in the respective stratum. |
|
10 |
#' |
|
11 |
#' @param data dataframe or tibble object with partially observed/missing variables. Assumes a a one-row-per-patient format. |
|
12 |
#' @param covar character covariate or covariate vector with partially observed variable/column name(s) to investigate. If NULL, the function automatically includes all columns with at least one missing observation. |
|
13 |
#' @param strata character name of variable/column by which results should be stratified |
|
14 |
#' |
|
15 |
#' @return returns count and proportion of missing values. If strata is specified, the returned proportion refers to the amount of |
|
16 |
#' patients in the respective stratum. |
|
17 |
#' |
|
18 |
#' @importFrom magrittr '%>%' |
|
19 |
#' @importFrom dplyr all_of |
|
20 |
#' @importFrom dplyr across |
|
21 |
#' @importFrom dplyr arrange |
|
22 |
#' @importFrom dplyr desc |
|
23 |
#' @importFrom dplyr group_by |
|
24 |
#' @importFrom dplyr mutate |
|
25 |
#' @importFrom dplyr n |
|
26 |
#' @importFrom dplyr select |
|
27 |
#' @importFrom dplyr summarize |
|
28 |
#' @importFrom tidyr pivot_longer |
|
29 |
#' |
|
30 |
#' @export |
|
31 |
#' |
|
32 |
#' @examples |
|
33 |
#' library(smdi) |
|
34 |
#' |
|
35 |
#' smdi_vis(data = smdi_data) |
|
36 |
#' |
|
37 | ||
38 |
smdi_summarize <- function(data = NULL, |
|
39 |
covar = NULL, |
|
40 |
strata = NULL |
|
41 |
){ |
|
42 | ||
43 |
# initializing new variables |
|
44 |
# tip: https://www.r-bloggers.com/2019/08/no-visible-binding-for-global-variable/ |
|
45 | 11x |
n_miss <- covariate <- prop_miss <- prop_miss_label <- .data <- NULL |
46 | ||
47 |
# checks |
|
48 | 1x |
if(is.null(data)){stop("No dataframe provided.")} |
49 | ||
50 |
# check for specified/missing covariates |
|
51 | 10x |
covar_miss <- smdi::smdi_check_covar( |
52 | 10x |
data = data, |
53 | 10x |
covar = covar |
54 |
) |
|
55 | ||
56 |
# checks and grouping in case strata is specified |
|
57 | 7x |
if(!is.null(strata)){ |
58 | ||
59 |
# check if strata variable is present in <data> |
|
60 | 1x |
if(!strata %in% names(data)){stop("Strata variable not present in data.")} |
61 | ||
62 |
# raise warning to user if strata variable is not a character or factor or has more than 10 unique levels (in this case the variable likely has wrong encoding) |
|
63 | 4x |
if(!class(data[[strata]]) %in% c("factor", "character") | nlevels(factor(data[[strata]])) > 10){ |
64 | ||
65 | 1x |
warning("Strata variable is not a character/factor or has > 10 unique levels. Consider re-categorizing.") |
66 | ||
67 |
} |
|
68 | ||
69 |
# return a warning if strata variable itself has NA values |
|
70 | 4x |
if(sum(is.na(data[[strata]])) > 0){ |
71 | ||
72 |
# assign an 'unknown' strata level |
|
73 | 1x |
data[[strata]] <- ifelse(is.na(data[[strata]]), "Unknown", data[[strata]]) |
74 | ||
75 |
# remove strata from covar_miss |
|
76 | 1x |
covar_miss <- covar_miss[!covar_miss == strata] |
77 | ||
78 | 1x |
warning("Strata variable has NA. Additional 'Unknown' stratum level was added.") |
79 | ||
80 |
} |
|
81 | ||
82 |
# group data for summarizing NA by stratum |
|
83 | 4x |
data <- data %>% |
84 | 4x |
dplyr::group_by(.data[[strata]]) |
85 | ||
86 |
} |
|
87 | ||
88 |
# now compute count and percentage missing |
|
89 | 6x |
data_summary <- data %>% |
90 | 6x |
dplyr::summarize( |
91 | 6x |
dplyr::across( |
92 | 6x |
dplyr::all_of(covar_miss), |
93 | 6x |
.fns = list( |
94 | 6x |
n_miss = ~ sum(is.na(.x)), |
95 | 6x |
prop_miss = ~ sum(is.na(.x))/dplyr::n()*100 # n is denominator size => stratum (if grouped) or total (if ungrouped) |
96 |
), |
|
97 | 6x |
.names = "{.col}_{.fn}"), |
98 | 6x |
.groups = "drop" |
99 |
) %>% |
|
100 | 6x |
tidyr::pivot_longer( |
101 | 6x |
cols = -{{strata}}, |
102 | 6x |
names_to = c("covariate", ".value"), |
103 | 6x |
names_pattern = "(.+)_(n_miss|prop_miss)" |
104 |
) %>% |
|
105 | 6x |
dplyr::mutate(prop_miss_label = paste0(formatC(prop_miss, format = 'f', digits = 2), "%")) %>% |
106 |
# sort by prop missing and covariate |
|
107 | 6x |
dplyr::arrange(dplyr::desc(prop_miss), covariate) |
108 | ||
109 | 6x |
return(data_summary) |
110 | ||
111 |
} |