diff --git a/bodyfat.Rmd b/bodyfat.Rmd index 4c0afd4..e363b63 100644 --- a/bodyfat.Rmd +++ b/bodyfat.Rmd @@ -168,8 +168,11 @@ height as two central IVs [independent variables] for estimating body fat proportion, and will not subject these two to variable selection.'' We subject all variables to selection. ```{r, results='hide'} +doParallel::registerDoParallel(getOption("mc.cores", 1)) fitrhs_cvvs <- cv_varsel(fitrhs, method = 'forward', cv_method = 'loo', - nloo = n, verbose = FALSE) + seed = SEED, verbose = FALSE, parallel = TRUE) +doParallel::stopImplicitCluster() +foreach::registerDoSEQ() ``` And the estimated predictive performance of smaller models compared to the full model. @@ -183,7 +186,7 @@ get a PSIS-LOO [@Vehtari+etal:PSIS-LOO:2017] based recommendation for the model size to choose. ```{r} (nsel <- suggest_size(fitrhs_cvvs, alpha=0.1)) -(vsel <- solution_terms(fitrhs_cvvs)[1:nsel]) +(vsel <- ranking(fitrhs_cvvs)$fulldata[1:nsel]) ``` Based on this recommendation we continue with two variables `abdomen` @@ -193,7 +196,7 @@ Dunkler (2018) had seven variables `height` (fixed), `abdomen` (fixed), Form the projected posterior for the selected model. ```{r} -projrhs <- project(fitrhs_cvvs, nv = nsel, ns = 4000) +projrhs <- project(fitrhs_cvvs, nterms = nsel, ndraws = 4000) ``` Plot the marginals of the projected posterior. @@ -212,38 +215,13 @@ of the selection method they used. Before looking at the corresponding bootstrap results we can look at the stability of selection process based on the LOO-CV selection paths -computed by `cv_varsel` (the code to make the following plot will be -included in __projpred__ package). +computed by `cv_varsel`. ```{r} -source("projpredpct.R") -rows <- nrow(fitrhs_cvvs[["pct_solution_terms_cv"]]) -col <- nrow(fitrhs_cvvs[["pct_solution_terms_cv"]]) -pctch <- round(fitrhs_cvvs[["pct_solution_terms_cv"]], 2) -colnames(pctch)[1] <- ".size" -pct <- get_pct_arr(pctch, 13) -col_brks <- get_col_brks() -pct$val_grp <- as.character(sapply(pct$val, function(x) sum(x >= col_brks$breaks))) -if (identical(rows, 0)) rows <- pct$var[1] -pct$sel <- (pct$.size == col) & (pct$var %in% rows) -brks <- sort(unique(as.numeric(pct$val_grp)) + 1) -ggplot(pct, aes_(x = ~.size, y = ~var)) + - geom_tile(aes_(fill = ~val_grp, color = ~sel), - width = 1, height = 0.9, size = 1) + - geom_text(aes_(label = ~val, fontface = ~sel+1)) + - coord_cartesian(expand = FALSE) + - scale_y_discrete(limits = rev(levels(pct$var))) + - scale_x_discrete(limits = seq(1,col)) + - scale_color_manual(values = c("white", "black")) + - labs(x = "Model size", y = "", - title = "Fraction of cv-folds that select the given variable") + - scale_fill_manual(breaks = brks, values = col_brks$pal[brks]) + - theme_proj() + - theme(legend.position = "none", - axis.text.y = element_text(angle = 45)) +plot(cv_proportions(fitrhs_cvvs, cumulate = TRUE)) ``` -For model sizes 1-3 selection paths in different LOO-CV cases are always the same `abdomen?, `weight`, and `wrist`. For larger model sizes there are some small variation, but mostly the order is quite consistent. +For model sizes 1-3 selection paths in different LOO-CV cases are always the same `abdomen`, `weight`, and `wrist`. For larger model sizes there are some small variation, but mostly the order is quite consistent. Running `stan_glm` with `prior=hs()` and `cv_varsel` do not take much time when run only once, but for a notebook running them 1000 times would take hours. The code for running the above variable selection procedure for 100 different bootstrapped datasets is as follows. ```{r} @@ -468,10 +446,13 @@ tau0 <- p0/(p-p0) * 1/sqrt(n) hs_prior <- hs(global_scale=tau0) fitrhs2 <- stan_glm(formula2, data = dfr, prior = hs_prior, QR = TRUE, seed=SEED, refresh=0) +doParallel::registerDoParallel(getOption("mc.cores", 1)) fitrhs2_cvvs <- cv_varsel(fitrhs2, method = 'forward', cv_method = 'loo', - nloo=n, verbose = FALSE) + seed = SEED, verbose = FALSE, parallel = TRUE) +doParallel::stopImplicitCluster() +foreach::registerDoSEQ() nsel2 <- suggest_size(fitrhs2_cvvs, alpha=0.1) -vsel2 <- solution_terms(fitrhs2_cvvs)[1:nsel2] +vsel2 <- ranking(fitrhs2_cvvs)$fulldata[1:nsel2] loormse_full2 <- sqrt(mean((df$siri-fitrhs2_cvvs$summaries$ref$mu)^2)) loormse_proj2 <- sqrt(mean((df$siri-fitrhs2_cvvs$summaries$sub[[nsel2]]$mu)^2)) print(paste('PSIS-LOO RMSE Bayesian full model: ', round(loormse_full2,1))) diff --git a/bodyfat_bootstrap.R b/bodyfat_bootstrap.R index c77d4ab..d6a3a17 100644 --- a/bodyfat_bootstrap.R +++ b/bodyfat_bootstrap.R @@ -8,14 +8,18 @@ library(dplyr) #' Bootstrap iterations for projpred for bodyfat df <- read.table(here("bodyfat.txt"), header = T, sep = ";") df[,4:19] <- scale(df[,4:19]) +# no-one can have 0% body fat +df <- df[df$siri>0,] df <- as.data.frame(df) n <- nrow(df) -colnames(df[c("weight_kg", "height")]) <- c("weight", "height") +### There is already a variable called `weight` with almost the same values: +# colnames(df[c("weight_kg")]) <- c("weight") +### pred <- c("age", "weight", "height", "neck", "chest", "abdomen", "hip", "thigh", "knee", "ankle", "biceps", "forearm", "wrist") target <- "siri" -formula <- paste("siri~", paste(pred, collapse = "+")) +formula_obj <- as.formula(paste("siri~", paste(pred, collapse = "+"))) p <- length(pred) p0 <- 2 # prior guess for the number of relevant variables @@ -31,14 +35,14 @@ for (i in 1:bootnum) { set.seed(5437854+i) data_id <- sample(1:dim(df)[1], replace = T) bbn[i,] <- length(unique(data_id)) - fitb <- stan_glm(formula, data = df[data_id, ], + fitb <- stan_glm(formula_obj, data = df[data_id, ], prior=rhs_prior, QR=TRUE, seed=i, refresh=0) - bcvvs <- cv_varsel(fitb, method='forward', cv_method='LOO', nloo=n, - verbose = FALSE) + bcvvs <- cv_varsel(fitb, method='forward', cv_method='LOO', + seed = 5437854-i, verbose = FALSE) print(nv <- suggest_size(bcvvs,alpha=0.1)) boot_nvs[i,] <- nv print(bcvvs$vind[1:nv]) - projb <- project(bcvvs, nv = nv, ns = 4000) + projb <- project(bcvvs, nterms = nv, ndraws = 4000) boot_est[i, colnames(as.matrix(projb)[,-(nv+2)])] <- colMeans(as.matrix(projb)[,-(nv+2)]) } boot_01 <- (boot_est != 0) * 1 @@ -47,7 +51,7 @@ boot_01 <- data.frame(boot_01) bn <- data.frame(boot_nvs) %>% group_by_all() %>% count(sort=TRUE) bd <- boot_01 %>% group_by_at(vars(-X.Intercept.)) %>% count(sort=TRUE) -boot_inclusion <- boot_inclusion %>% tibble::rownames_to_column(var="variable") %>% filter(variable != "X.Intercept.") %>% arrange(-projpred_incp) +boot_inclusion <- boot_inclusion %>% tibble::rownames_to_column(var="variable") %>% filter(variable != "(Intercept)") %>% arrange(-projpred_incp) boot_inclusion$steplm_incp <- c(100, 28, 98, 100, 85, 63, 51, 48, 34, 43, 54, 41, 18) boot_inclusion <- boot_inclusion %>% rename(projpred=projpred_incp, steplm=steplm_incp) cumsum(bd$n) diff --git a/bodyfat_kfoldcv.R b/bodyfat_kfoldcv.R index 7eb4fc4..004de18 100644 --- a/bodyfat_kfoldcv.R +++ b/bodyfat_kfoldcv.R @@ -2,55 +2,35 @@ df <- read.table(here("bodyfat.txt"), header = T, sep = ";") df[,4:19] <- scale(df[,4:19]) +# no-one can have 0% body fat +df <- df[df$siri>0,] df <- as.data.frame(df) n <- nrow(df) -colnames(df[c("weight_kg", "height")]) <- c("weight", "height") +### There is already a variable called `weight` with almost the same values: +# colnames(df[c("weight_kg")]) <- c("weight") +### pred <- c("age", "weight", "height", "neck", "chest", "abdomen", "hip", "thigh", "knee", "ankle", "biceps", "forearm", "wrist") target <- "siri" -formula <- paste("siri~", paste(pred, collapse = "+")) +formula_obj <- as.formula(paste("siri~", paste(pred, collapse = "+"))) p <- length(pred) p0 <- 5 # prior guess for the number of relevant variables tau0 <- p0/(p-p0) * 1/sqrt(n) rhs_prior <- hs(global_scale=tau0) -fitrhs <- stan_glm(formula, data = df, prior=rhs_prior, QR=TRUE, +fitrhs <- stan_glm(formula_obj, data = df, prior=rhs_prior, QR=TRUE, seed=1513306866, refresh=0) -set.seed(5437854) -perm <- sample.int(n) -K <- 20 -idx <- ceiling(seq(from = 1, to = n, length.out = K + 1)) -bin <- .bincode(perm, breaks = idx, right = FALSE, include.lowest = TRUE) - -muss <- list() -vsmuss <- list() -vsnvss <- list() -vsnvss2 <- list() -for (k in 1:K) { - message("Fitting model ", k, " out of ", K) - omitted <- which(bin == k) - fit_k <- update( - object = fitrhs, - data = df[-omitted,, drop=FALSE], - weights = NULL, - refresh = 0 - ) - fit_cvvs_k <- cv_varsel(fit_k, method='forward', cv_method='LOO', - nloo=nrow(df[-omitted,, drop=FALSE])) - nvk <- suggest_size(fit_cvvs_k,alpha=0.1) - vsnvss[[k]] <- nvk - proj_k <- project(fit_cvvs_k, nv = nvk, ns = 4000) - muss[[k]] <- - colMeans(posterior_linpred(fit_k, - newdata = df[omitted, , drop = FALSE])) - vsmuss[[k]] <- - colMeans(proj_linpred(proj_k, xnew = df[omitted, , drop = FALSE])) -} -mus<-unlist(muss)[order(as.integer(names(unlist(muss))))] -vsmus<-unlist(vsmuss)[order(as.integer(names(unlist(vsmuss))))] -vsnvs <- unlist(vsnvss) -rmse_full <- sqrt(mean((df$siri-mus)^2)) -rmse_proj <- sqrt(mean((df$siri-vsmus)^2)) -save(vsnvs, rmse_full, rmse_proj, file = "bodyfat_kfoldcv.RData") +# NOTE: In contrast to the previous code where `ndraws = 4000` was used in +# project(), the following cv_varsel() call uses `ndraws_pred = 400` by default. +# Of course, `ndraws_pred = 4000` could be used instead, but that would increase +# runtime considerably, because `ndraws_pred = 4000` not only applies to the +# suggested model size, but all model sizes. +fit_cvvs <- cv_varsel(fitrhs, method='forward', cv_method='kfold', K = 20, + seed = 1513306866 + 1) +nv <- suggest_size(fit_cvvs,alpha=0.1) +perfs <- performances(fit_cvvs, stats = "rmse", nterms_max = nv) +rmse_full <- unname(perfs$reference_model["rmse"]) +rmse_proj <- perfs$submodels$rmse[perfs$submodels$size == nv] +save(nv, rmse_full, rmse_proj, file = "bodyfat_kfoldcv.RData") diff --git a/bodyfat_kfoldcv2.R b/bodyfat_kfoldcv2.R index 4007988..dac32d2 100644 --- a/bodyfat_kfoldcv2.R +++ b/bodyfat_kfoldcv2.R @@ -2,69 +2,43 @@ df <- read.table(here("bodyfat.txt"), header = T, sep = ";") df[,4:19] <- scale(df[,4:19]) +# no-one can have 0% body fat +df <- df[df$siri>0,] df <- as.data.frame(df) n <- nrow(df) -colnames(df[c("weight_kg", "height")]) <- c("weight", "height") +### There is already a variable called `weight` with almost the same values: +# colnames(df[c("weight_kg")]) <- c("weight") +### pred <- c("age", "weight", "height", "neck", "chest", "abdomen", "hip", "thigh", "knee", "ankle", "biceps", "forearm", "wrist") target <- "siri" -formula <- paste("siri~", paste(pred, collapse = "+")) +formula_chr <- paste("siri~", paste(pred, collapse = "+")) set.seed(1513306866) noise <- array(rnorm(87*n), c(n,87)) dfr<-cbind(df,noise=noise) -formula2<-paste(formula,"+",paste(colnames(dfr[,20:106]), collapse = "+")) +formula_obj2<-as.formula( + paste(formula_chr,"+",paste(colnames(dfr[,20:106]), collapse = "+")) +) p <- 100 p0 <- 5 # prior guess for the number of relevant variables tau0 <- p0/(p-p0) * 1/sqrt(n) rhs_prior <- hs(global_scale=tau0) -set.seed(1513306866) -perm <- sample.int(n) -K <- 20 -idx <- ceiling(seq(from = 1, to = n, length.out = K + 1)) -bin <- .bincode(perm, breaks = idx, right = FALSE, include.lowest = TRUE) - -fitrhs2 <- stan_glm(formula2, data = dfr, prior=rhs_prior, QR=TRUE, +fitrhs2 <- stan_glm(formula_obj2, data = dfr, prior=rhs_prior, QR=TRUE, seed=1513306866, refresh=0) -muss <- list() -vsmuss <- list() -vsnvss <- list() -vsnvss2 <- list() -vsnvss3 <- list() -fitcvs <- list() -for (k in 1:K) { - message("Fitting model ", k, " out of ", K) - omitted <- which(bin == k) - fit_k <- update( - object = fitrhs2, - data = dfr[-omitted,, drop=FALSE], - weights = NULL, - refresh = 0 - ) - muss[[k]] <- - colMeans(posterior_linpred(fit_k, - newdata = dfr[omitted, , drop = FALSE])) - fit_cvvs_k <- cv_varsel(fit_k, method='forward', cv_method='LOO', - nloo = length(which(bin != k)), nv_max=10, - verbose = FALSE) - fitcvs[[k]] <- fit_cvvs_k -} -for (k in 1:K) { - omitted <- which(bin == k) - fit_cvvs_k <- fitcvs[[k]] - print(nvk <- suggest_size(fit_cvvs_k, alpha=0.1)) - vsnvss[[k]] <- nvk - fit_cvvs_k$vind[1:nvk] - proj_k <- project(fit_cvvs_k, nv = nvk, ns = 4000) - vsmuss[[k]] <- - colMeans(proj_linpred(proj_k, xnew = dfr[omitted, , drop = FALSE])) -} -mus<-unlist(muss)[order(as.integer(names(unlist(muss))))] -vsmus<-unlist(vsmuss)[order(as.integer(names(unlist(vsmuss))))] -vsnvs2 <- unlist(vsnvss) -rmse_full2 <- sqrt(mean((df$siri-mus)^2)) -(rmse_proj2 <- sqrt(mean((df$siri-vsmus)^2))) -save(vsnvs2, rmse_full2, rmse_proj2, file = "bodyfat_kfoldcv2.RData") + +# NOTE: In contrast to the previous code where `ndraws = 4000` was used in +# project(), the following cv_varsel() call uses `ndraws_pred = 400` by default. +# Of course, `ndraws_pred = 4000` could be used instead, but that would increase +# runtime considerably, because `ndraws_pred = 4000` not only applies to the +# suggested model size, but all model sizes. +fit_cvvs2 <- cv_varsel(fitrhs2, method='forward', cv_method='kfold', K = 20, + nterms_max=10, seed = 1513306866 + 1, verbose = FALSE) +print(nv2 <- suggest_size(fit_cvvs2, alpha=0.1)) +perfs2 <- performances(fit_cvvs2, stats = "rmse", nterms_max = nv2) +rmse_full2 <- unname(perfs2$reference_model["rmse"]) +rmse_proj2 <- perfs2$submodels$rmse[perfs2$submodels$size == nv2] +save(nv2, rmse_full2, rmse_proj2, file = "bodyfat_kfoldcv2.RData") diff --git a/projpredpct.R b/projpredpct.R deleted file mode 100644 index 022d110..0000000 --- a/projpredpct.R +++ /dev/null @@ -1,20 +0,0 @@ -library(tidyverse) -library(RColorBrewer) - -get_pct_arr <- function(pctch, nv) { - as_tibble(pctch[1:nv, 1:(nv + 1)]) %>% - gather("var", "val", colnames(pctch)[1:nv+1], factor_key = T) -} - -get_col_brks <- function() { - list(breaks = seq(5e-3, 1-5e-3, length.out = 7), - pal = brewer.pal(11, "RdBu")[3:10]) -} - -theme_proj <- function() { - theme(axis.text = element_text(size = 15), - axis.title = element_text(size = 15), - strip.text = element_text(size = 15), - plot.title = element_text(size = 15, face = "bold", hjust = 0.6)) -} -