From 45039b1515d6b7f61d5eb09c7fd99478708cd650 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Tue, 10 May 2022 16:41:21 +0200 Subject: [PATCH 01/12] Omit argument `nloo` because subsampled LOO CV is still experimental in projpred (see stan-dev/projpred#94). Setting `nloo = n` should not change results compared to omitting argument `nloo`, but I guess it's better not to point to an experimental feature where not needed. --- bodyfat.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bodyfat.Rmd b/bodyfat.Rmd index 4c0afd4..966182a 100644 --- a/bodyfat.Rmd +++ b/bodyfat.Rmd @@ -169,7 +169,7 @@ fat proportion, and will not subject these two to variable selection.'' We subject all variables to selection. ```{r, results='hide'} fitrhs_cvvs <- cv_varsel(fitrhs, method = 'forward', cv_method = 'loo', - nloo = n, verbose = FALSE) + verbose = FALSE) ``` And the estimated predictive performance of smaller models compared to the full model. @@ -469,7 +469,7 @@ hs_prior <- hs(global_scale=tau0) fitrhs2 <- stan_glm(formula2, data = dfr, prior = hs_prior, QR = TRUE, seed=SEED, refresh=0) fitrhs2_cvvs <- cv_varsel(fitrhs2, method = 'forward', cv_method = 'loo', - nloo=n, verbose = FALSE) + verbose = FALSE) nsel2 <- suggest_size(fitrhs2_cvvs, alpha=0.1) vsel2 <- solution_terms(fitrhs2_cvvs)[1:nsel2] loormse_full2 <- sqrt(mean((df$siri-fitrhs2_cvvs$summaries$ref$mu)^2)) From 661650d88da3e80345b34e63f9a39546482b7de0 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Tue, 10 May 2022 21:05:06 +0200 Subject: [PATCH 02/12] Update arguments of `project()`. --- bodyfat.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bodyfat.Rmd b/bodyfat.Rmd index 966182a..f93e9a8 100644 --- a/bodyfat.Rmd +++ b/bodyfat.Rmd @@ -193,7 +193,7 @@ Dunkler (2018) had seven variables `height` (fixed), `abdomen` (fixed), Form the projected posterior for the selected model. ```{r} -projrhs <- project(fitrhs_cvvs, nv = nsel, ns = 4000) +projrhs <- project(fitrhs_cvvs, nterms = nsel, ndraws = 4000) ``` Plot the marginals of the projected posterior. From ad4d06c58b5d74d308388b8476f96ef80a0d8b3f Mon Sep 17 00:00:00 2001 From: fweber144 Date: Tue, 10 May 2022 21:19:41 +0200 Subject: [PATCH 03/12] Fix typo. --- bodyfat.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bodyfat.Rmd b/bodyfat.Rmd index f93e9a8..aa99ad6 100644 --- a/bodyfat.Rmd +++ b/bodyfat.Rmd @@ -243,7 +243,7 @@ ggplot(pct, aes_(x = ~.size, y = ~var)) + axis.text.y = element_text(angle = 45)) ``` -For model sizes 1-3 selection paths in different LOO-CV cases are always the same `abdomen?, `weight`, and `wrist`. For larger model sizes there are some small variation, but mostly the order is quite consistent. +For model sizes 1-3 selection paths in different LOO-CV cases are always the same `abdomen`, `weight`, and `wrist`. For larger model sizes there are some small variation, but mostly the order is quite consistent. Running `stan_glm` with `prior=hs()` and `cv_varsel` do not take much time when run only once, but for a notebook running them 1000 times would take hours. The code for running the above variable selection procedure for 100 different bootstrapped datasets is as follows. ```{r} From 5e2069ca30b1d9177f1dbdbf45ed5a799776f8a9 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Tue, 10 May 2022 21:56:44 +0200 Subject: [PATCH 04/12] Convert `fitrhs_cvvs[["pct_solution_terms_cv"]]` from projpred v2.1.0 back to the matrix that `fitrhs_cvvs[["pct_solution_terms_cv"]]` delivered before. --- bodyfat.Rmd | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bodyfat.Rmd b/bodyfat.Rmd index aa99ad6..e77df96 100644 --- a/bodyfat.Rmd +++ b/bodyfat.Rmd @@ -217,9 +217,11 @@ included in __projpred__ package). ```{r} source("projpredpct.R") -rows <- nrow(fitrhs_cvvs[["pct_solution_terms_cv"]]) -col <- nrow(fitrhs_cvvs[["pct_solution_terms_cv"]]) -pctch <- round(fitrhs_cvvs[["pct_solution_terms_cv"]], 2) +pct_cum <- apply(fitrhs_cvvs[["pct_solution_terms_cv"]][, -1], 2, cumsum) +pct_cum <- cbind(fitrhs_cvvs[["pct_solution_terms_cv"]][, 1, drop = FALSE], pct_cum) +rows <- nrow(pct_cum) +col <- nrow(pct_cum) +pctch <- round(pct_cum, 2) colnames(pctch)[1] <- ".size" pct <- get_pct_arr(pctch, 13) col_brks <- get_col_brks() From 6ba747649dc9d9930965e6757c7a69aeb0603c9e Mon Sep 17 00:00:00 2001 From: fweber144 Date: Tue, 10 May 2022 21:59:16 +0200 Subject: [PATCH 05/12] Avoid the ggplot2 warning ``` Warning message: Continuous limits supplied to discrete scale. Did you mean `limits = factor(...)` or `scale_*_continuous()`? ``` --- bodyfat.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bodyfat.Rmd b/bodyfat.Rmd index e77df96..e963bad 100644 --- a/bodyfat.Rmd +++ b/bodyfat.Rmd @@ -235,7 +235,7 @@ ggplot(pct, aes_(x = ~.size, y = ~var)) + geom_text(aes_(label = ~val, fontface = ~sel+1)) + coord_cartesian(expand = FALSE) + scale_y_discrete(limits = rev(levels(pct$var))) + - scale_x_discrete(limits = seq(1,col)) + + scale_x_discrete(limits = factor(seq(1,col))) + scale_color_manual(values = c("white", "black")) + labs(x = "Model size", y = "", title = "Fraction of cv-folds that select the given variable") + From d869c34f2f7093df7de4c3ad3664503aec904b88 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Tue, 10 May 2022 22:12:09 +0200 Subject: [PATCH 06/12] Set seeds where necessary for projpred (actually, argument `seed` does not need to be set after a call to `set.seed()`, but I add it here so that the `cv_varsel()` command can be re-run easily without having to re-run all the code from the previous `set.seed()` call on). --- bodyfat.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bodyfat.Rmd b/bodyfat.Rmd index e963bad..1144ee8 100644 --- a/bodyfat.Rmd +++ b/bodyfat.Rmd @@ -169,7 +169,7 @@ fat proportion, and will not subject these two to variable selection.'' We subject all variables to selection. ```{r, results='hide'} fitrhs_cvvs <- cv_varsel(fitrhs, method = 'forward', cv_method = 'loo', - verbose = FALSE) + seed = SEED, verbose = FALSE) ``` And the estimated predictive performance of smaller models compared to the full model. @@ -471,7 +471,7 @@ hs_prior <- hs(global_scale=tau0) fitrhs2 <- stan_glm(formula2, data = dfr, prior = hs_prior, QR = TRUE, seed=SEED, refresh=0) fitrhs2_cvvs <- cv_varsel(fitrhs2, method = 'forward', cv_method = 'loo', - verbose = FALSE) + seed = SEED, verbose = FALSE) nsel2 <- suggest_size(fitrhs2_cvvs, alpha=0.1) vsel2 <- solution_terms(fitrhs2_cvvs)[1:nsel2] loormse_full2 <- sqrt(mean((df$siri-fitrhs2_cvvs$summaries$ref$mu)^2)) From 155d90641c57ed1c1a44e1b199c5ad09c4b00b0e Mon Sep 17 00:00:00 2001 From: fweber144 Date: Tue, 10 May 2022 22:39:48 +0200 Subject: [PATCH 07/12] Update `bodyfat_bootstrap.R` in the same way as `bodyfat.Rmd`. --- bodyfat_bootstrap.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bodyfat_bootstrap.R b/bodyfat_bootstrap.R index c77d4ab..ae936bb 100644 --- a/bodyfat_bootstrap.R +++ b/bodyfat_bootstrap.R @@ -33,12 +33,12 @@ for (i in 1:bootnum) { bbn[i,] <- length(unique(data_id)) fitb <- stan_glm(formula, data = df[data_id, ], prior=rhs_prior, QR=TRUE, seed=i, refresh=0) - bcvvs <- cv_varsel(fitb, method='forward', cv_method='LOO', nloo=n, - verbose = FALSE) + bcvvs <- cv_varsel(fitb, method='forward', cv_method='LOO', + seed = 5437854-i, verbose = FALSE) print(nv <- suggest_size(bcvvs,alpha=0.1)) boot_nvs[i,] <- nv print(bcvvs$vind[1:nv]) - projb <- project(bcvvs, nv = nv, ns = 4000) + projb <- project(bcvvs, nterms = nv, ndraws = 4000) boot_est[i, colnames(as.matrix(projb)[,-(nv+2)])] <- colMeans(as.matrix(projb)[,-(nv+2)]) } boot_01 <- (boot_est != 0) * 1 From 790a15cb3a0c7b14275ecfd722567d5021205e3f Mon Sep 17 00:00:00 2001 From: fweber144 Date: Tue, 10 May 2022 22:39:59 +0200 Subject: [PATCH 08/12] Update `bodyfat_kfoldcv.R` in the same way as `bodyfat.Rmd`. This introduces a "TODO" with respect to `nloo`. --- bodyfat_kfoldcv.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bodyfat_kfoldcv.R b/bodyfat_kfoldcv.R index 7eb4fc4..dd57b68 100644 --- a/bodyfat_kfoldcv.R +++ b/bodyfat_kfoldcv.R @@ -38,10 +38,13 @@ for (k in 1:K) { refresh = 0 ) fit_cvvs_k <- cv_varsel(fit_k, method='forward', cv_method='LOO', - nloo=nrow(df[-omitted,, drop=FALSE])) + # TODO: This usage of `nloo` is probably not doing what it's + # supposed to, at least in the current CRAN version of projpred: + nloo=nrow(df[-omitted,, drop=FALSE]), + seed = 1513306866 + k) nvk <- suggest_size(fit_cvvs_k,alpha=0.1) vsnvss[[k]] <- nvk - proj_k <- project(fit_cvvs_k, nv = nvk, ns = 4000) + proj_k <- project(fit_cvvs_k, nterms = nvk, ndraws = 4000) muss[[k]] <- colMeans(posterior_linpred(fit_k, newdata = df[omitted, , drop = FALSE])) From f7824eda52fb3d3c18d7700dfcf7c152cb18eb5a Mon Sep 17 00:00:00 2001 From: fweber144 Date: Tue, 10 May 2022 22:40:20 +0200 Subject: [PATCH 09/12] Update `bodyfat_kfoldcv2.R` in the same way as `bodyfat.Rmd`. This introduces a "TODO" with respect to `nloo`. --- bodyfat_kfoldcv2.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bodyfat_kfoldcv2.R b/bodyfat_kfoldcv2.R index 4007988..6994baf 100644 --- a/bodyfat_kfoldcv2.R +++ b/bodyfat_kfoldcv2.R @@ -48,8 +48,11 @@ for (k in 1:K) { colMeans(posterior_linpred(fit_k, newdata = dfr[omitted, , drop = FALSE])) fit_cvvs_k <- cv_varsel(fit_k, method='forward', cv_method='LOO', - nloo = length(which(bin != k)), nv_max=10, - verbose = FALSE) + # TODO: This usage of `nloo` is probably not doing what it's + # supposed to, at least in the current CRAN version of projpred: + nloo = length(which(bin != k)), + nterms_max=10, + seed = 1513306866 + k, verbose = FALSE) fitcvs[[k]] <- fit_cvvs_k } for (k in 1:K) { @@ -58,7 +61,7 @@ for (k in 1:K) { print(nvk <- suggest_size(fit_cvvs_k, alpha=0.1)) vsnvss[[k]] <- nvk fit_cvvs_k$vind[1:nvk] - proj_k <- project(fit_cvvs_k, nv = nvk, ns = 4000) + proj_k <- project(fit_cvvs_k, nterms = nvk, ndraws = 4000) vsmuss[[k]] <- colMeans(proj_linpred(proj_k, xnew = dfr[omitted, , drop = FALSE])) } From ad2b35ff65ae65623a0a2dc09527e17e2ca14524 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Wed, 11 May 2022 11:48:31 +0200 Subject: [PATCH 10/12] For `bodyfat_kfoldcv.R`, use the K-fold CV which is natively implemented in projpred. This resolves the `nloo` "TODO" added earlier. --- bodyfat_kfoldcv.R | 55 ++++++++++++++--------------------------------- 1 file changed, 16 insertions(+), 39 deletions(-) diff --git a/bodyfat_kfoldcv.R b/bodyfat_kfoldcv.R index dd57b68..eb50f4e 100644 --- a/bodyfat_kfoldcv.R +++ b/bodyfat_kfoldcv.R @@ -18,42 +18,19 @@ rhs_prior <- hs(global_scale=tau0) fitrhs <- stan_glm(formula, data = df, prior=rhs_prior, QR=TRUE, seed=1513306866, refresh=0) -set.seed(5437854) -perm <- sample.int(n) -K <- 20 -idx <- ceiling(seq(from = 1, to = n, length.out = K + 1)) -bin <- .bincode(perm, breaks = idx, right = FALSE, include.lowest = TRUE) - -muss <- list() -vsmuss <- list() -vsnvss <- list() -vsnvss2 <- list() -for (k in 1:K) { - message("Fitting model ", k, " out of ", K) - omitted <- which(bin == k) - fit_k <- update( - object = fitrhs, - data = df[-omitted,, drop=FALSE], - weights = NULL, - refresh = 0 - ) - fit_cvvs_k <- cv_varsel(fit_k, method='forward', cv_method='LOO', - # TODO: This usage of `nloo` is probably not doing what it's - # supposed to, at least in the current CRAN version of projpred: - nloo=nrow(df[-omitted,, drop=FALSE]), - seed = 1513306866 + k) - nvk <- suggest_size(fit_cvvs_k,alpha=0.1) - vsnvss[[k]] <- nvk - proj_k <- project(fit_cvvs_k, nterms = nvk, ndraws = 4000) - muss[[k]] <- - colMeans(posterior_linpred(fit_k, - newdata = df[omitted, , drop = FALSE])) - vsmuss[[k]] <- - colMeans(proj_linpred(proj_k, xnew = df[omitted, , drop = FALSE])) -} -mus<-unlist(muss)[order(as.integer(names(unlist(muss))))] -vsmus<-unlist(vsmuss)[order(as.integer(names(unlist(vsmuss))))] -vsnvs <- unlist(vsnvss) -rmse_full <- sqrt(mean((df$siri-mus)^2)) -rmse_proj <- sqrt(mean((df$siri-vsmus)^2)) -save(vsnvs, rmse_full, rmse_proj, file = "bodyfat_kfoldcv.RData") +# NOTE: In contrast to the previous code where `ndraws = 4000` was used in +# project(), the following cv_varsel() call uses `ndraws_pred = 400` by default. +# Of course, `ndraws_pred = 4000` could be used instead, but that would increase +# runtime considerably, because `ndraws_pred = 4000` not only applies to the +# suggested model size, but all model sizes. +fit_cvvs <- cv_varsel(fitrhs, method='forward', cv_method='kfold', K = 20, + seed = 1513306866 + 1) +nv <- suggest_size(fit_cvvs,alpha=0.1) +# Currently (projpred v2.1.1), we need to use the internal projpred function +# .tabulate_stats() to obtain the reference model's performance: +rmse_full <- projpred:::.tabulate_stats(fit_cvvs, stats = "rmse") +rmse_full <- rmse_full$value[rmse_full$size == Inf] +# For the submodel, this is the way to go: +smmry <- summary(fit_cvvs, stats = "rmse", nterms_max = nv) +rmse_proj <- smmry$selection$rmse.kfold[smmry$selection$size == nv] +save(nv, rmse_full, rmse_proj, file = "bodyfat_kfoldcv.RData") From 9e65616c6bf9e234747b2e97a3f602cb5c264c92 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Wed, 11 May 2022 11:55:57 +0200 Subject: [PATCH 11/12] For `bodyfat_kfoldcv2.R`, use the K-fold CV which is natively implemented in projpred. This resolves the `nloo` "TODO" added earlier. --- bodyfat_kfoldcv2.R | 65 ++++++++++++---------------------------------- 1 file changed, 17 insertions(+), 48 deletions(-) diff --git a/bodyfat_kfoldcv2.R b/bodyfat_kfoldcv2.R index 6994baf..ed14e24 100644 --- a/bodyfat_kfoldcv2.R +++ b/bodyfat_kfoldcv2.R @@ -21,53 +21,22 @@ p0 <- 5 # prior guess for the number of relevant variables tau0 <- p0/(p-p0) * 1/sqrt(n) rhs_prior <- hs(global_scale=tau0) -set.seed(1513306866) -perm <- sample.int(n) -K <- 20 -idx <- ceiling(seq(from = 1, to = n, length.out = K + 1)) -bin <- .bincode(perm, breaks = idx, right = FALSE, include.lowest = TRUE) - fitrhs2 <- stan_glm(formula2, data = dfr, prior=rhs_prior, QR=TRUE, seed=1513306866, refresh=0) -muss <- list() -vsmuss <- list() -vsnvss <- list() -vsnvss2 <- list() -vsnvss3 <- list() -fitcvs <- list() -for (k in 1:K) { - message("Fitting model ", k, " out of ", K) - omitted <- which(bin == k) - fit_k <- update( - object = fitrhs2, - data = dfr[-omitted,, drop=FALSE], - weights = NULL, - refresh = 0 - ) - muss[[k]] <- - colMeans(posterior_linpred(fit_k, - newdata = dfr[omitted, , drop = FALSE])) - fit_cvvs_k <- cv_varsel(fit_k, method='forward', cv_method='LOO', - # TODO: This usage of `nloo` is probably not doing what it's - # supposed to, at least in the current CRAN version of projpred: - nloo = length(which(bin != k)), - nterms_max=10, - seed = 1513306866 + k, verbose = FALSE) - fitcvs[[k]] <- fit_cvvs_k -} -for (k in 1:K) { - omitted <- which(bin == k) - fit_cvvs_k <- fitcvs[[k]] - print(nvk <- suggest_size(fit_cvvs_k, alpha=0.1)) - vsnvss[[k]] <- nvk - fit_cvvs_k$vind[1:nvk] - proj_k <- project(fit_cvvs_k, nterms = nvk, ndraws = 4000) - vsmuss[[k]] <- - colMeans(proj_linpred(proj_k, xnew = dfr[omitted, , drop = FALSE])) -} -mus<-unlist(muss)[order(as.integer(names(unlist(muss))))] -vsmus<-unlist(vsmuss)[order(as.integer(names(unlist(vsmuss))))] -vsnvs2 <- unlist(vsnvss) -rmse_full2 <- sqrt(mean((df$siri-mus)^2)) -(rmse_proj2 <- sqrt(mean((df$siri-vsmus)^2))) -save(vsnvs2, rmse_full2, rmse_proj2, file = "bodyfat_kfoldcv2.RData") + +# NOTE: In contrast to the previous code where `ndraws = 4000` was used in +# project(), the following cv_varsel() call uses `ndraws_pred = 400` by default. +# Of course, `ndraws_pred = 4000` could be used instead, but that would increase +# runtime considerably, because `ndraws_pred = 4000` not only applies to the +# suggested model size, but all model sizes. +fit_cvvs2 <- cv_varsel(fitrhs2, method='forward', cv_method='kfold', K = 20, + nterms_max=10, seed = 1513306866 + 1, verbose = FALSE) +print(nv2 <- suggest_size(fit_cvvs2, alpha=0.1)) +# Currently (projpred v2.1.1), we need to use the internal projpred function +# .tabulate_stats() to obtain the reference model's performance: +rmse_full2 <- projpred:::.tabulate_stats(fit_cvvs2, stats = "rmse") +rmse_full2 <- rmse_full2$value[rmse_full2$size == Inf] +# For the submodel, this is the way to go: +smmry2 <- summary(fit_cvvs2, stats = "rmse", nterms_max = nv2) +(rmse_proj2 <- smmry2$selection$rmse.kfold[smmry2$selection$size == nv2]) +save(nv2, rmse_full2, rmse_proj2, file = "bodyfat_kfoldcv2.RData") From 1f010f1a70eedc0a127ef141d83d06481b565dc3 Mon Sep 17 00:00:00 2001 From: fweber144 Date: Tue, 5 Dec 2023 06:08:43 +0100 Subject: [PATCH 12/12] Update to projpred v2.8.0 Changes not related to projpred v2.8.0: 1) Object `df` (files `bodyfat_bootstrap.R`, `bodyfat_kfoldcv.R`, and `bodyfat_kfoldcv2.R`) is modified for consistency with file `bodyfat.Rmd`. 2) Objects consisting of a model formula are coerced to `formula`s because otherwise, `projpred:::get_refmodel.stanreg()` throws an error. --- bodyfat.Rmd | 45 ++++++++++++--------------------------------- bodyfat_bootstrap.R | 12 ++++++++---- bodyfat_kfoldcv.R | 20 ++++++++++---------- bodyfat_kfoldcv2.R | 24 +++++++++++++----------- projpredpct.R | 20 -------------------- 5 files changed, 43 insertions(+), 78 deletions(-) delete mode 100644 projpredpct.R diff --git a/bodyfat.Rmd b/bodyfat.Rmd index 1144ee8..e363b63 100644 --- a/bodyfat.Rmd +++ b/bodyfat.Rmd @@ -168,8 +168,11 @@ height as two central IVs [independent variables] for estimating body fat proportion, and will not subject these two to variable selection.'' We subject all variables to selection. ```{r, results='hide'} +doParallel::registerDoParallel(getOption("mc.cores", 1)) fitrhs_cvvs <- cv_varsel(fitrhs, method = 'forward', cv_method = 'loo', - seed = SEED, verbose = FALSE) + seed = SEED, verbose = FALSE, parallel = TRUE) +doParallel::stopImplicitCluster() +foreach::registerDoSEQ() ``` And the estimated predictive performance of smaller models compared to the full model. @@ -183,7 +186,7 @@ get a PSIS-LOO [@Vehtari+etal:PSIS-LOO:2017] based recommendation for the model size to choose. ```{r} (nsel <- suggest_size(fitrhs_cvvs, alpha=0.1)) -(vsel <- solution_terms(fitrhs_cvvs)[1:nsel]) +(vsel <- ranking(fitrhs_cvvs)$fulldata[1:nsel]) ``` Based on this recommendation we continue with two variables `abdomen` @@ -212,37 +215,10 @@ of the selection method they used. Before looking at the corresponding bootstrap results we can look at the stability of selection process based on the LOO-CV selection paths -computed by `cv_varsel` (the code to make the following plot will be -included in __projpred__ package). +computed by `cv_varsel`. ```{r} -source("projpredpct.R") -pct_cum <- apply(fitrhs_cvvs[["pct_solution_terms_cv"]][, -1], 2, cumsum) -pct_cum <- cbind(fitrhs_cvvs[["pct_solution_terms_cv"]][, 1, drop = FALSE], pct_cum) -rows <- nrow(pct_cum) -col <- nrow(pct_cum) -pctch <- round(pct_cum, 2) -colnames(pctch)[1] <- ".size" -pct <- get_pct_arr(pctch, 13) -col_brks <- get_col_brks() -pct$val_grp <- as.character(sapply(pct$val, function(x) sum(x >= col_brks$breaks))) -if (identical(rows, 0)) rows <- pct$var[1] -pct$sel <- (pct$.size == col) & (pct$var %in% rows) -brks <- sort(unique(as.numeric(pct$val_grp)) + 1) -ggplot(pct, aes_(x = ~.size, y = ~var)) + - geom_tile(aes_(fill = ~val_grp, color = ~sel), - width = 1, height = 0.9, size = 1) + - geom_text(aes_(label = ~val, fontface = ~sel+1)) + - coord_cartesian(expand = FALSE) + - scale_y_discrete(limits = rev(levels(pct$var))) + - scale_x_discrete(limits = factor(seq(1,col))) + - scale_color_manual(values = c("white", "black")) + - labs(x = "Model size", y = "", - title = "Fraction of cv-folds that select the given variable") + - scale_fill_manual(breaks = brks, values = col_brks$pal[brks]) + - theme_proj() + - theme(legend.position = "none", - axis.text.y = element_text(angle = 45)) +plot(cv_proportions(fitrhs_cvvs, cumulate = TRUE)) ``` For model sizes 1-3 selection paths in different LOO-CV cases are always the same `abdomen`, `weight`, and `wrist`. For larger model sizes there are some small variation, but mostly the order is quite consistent. @@ -470,10 +446,13 @@ tau0 <- p0/(p-p0) * 1/sqrt(n) hs_prior <- hs(global_scale=tau0) fitrhs2 <- stan_glm(formula2, data = dfr, prior = hs_prior, QR = TRUE, seed=SEED, refresh=0) +doParallel::registerDoParallel(getOption("mc.cores", 1)) fitrhs2_cvvs <- cv_varsel(fitrhs2, method = 'forward', cv_method = 'loo', - seed = SEED, verbose = FALSE) + seed = SEED, verbose = FALSE, parallel = TRUE) +doParallel::stopImplicitCluster() +foreach::registerDoSEQ() nsel2 <- suggest_size(fitrhs2_cvvs, alpha=0.1) -vsel2 <- solution_terms(fitrhs2_cvvs)[1:nsel2] +vsel2 <- ranking(fitrhs2_cvvs)$fulldata[1:nsel2] loormse_full2 <- sqrt(mean((df$siri-fitrhs2_cvvs$summaries$ref$mu)^2)) loormse_proj2 <- sqrt(mean((df$siri-fitrhs2_cvvs$summaries$sub[[nsel2]]$mu)^2)) print(paste('PSIS-LOO RMSE Bayesian full model: ', round(loormse_full2,1))) diff --git a/bodyfat_bootstrap.R b/bodyfat_bootstrap.R index ae936bb..d6a3a17 100644 --- a/bodyfat_bootstrap.R +++ b/bodyfat_bootstrap.R @@ -8,14 +8,18 @@ library(dplyr) #' Bootstrap iterations for projpred for bodyfat df <- read.table(here("bodyfat.txt"), header = T, sep = ";") df[,4:19] <- scale(df[,4:19]) +# no-one can have 0% body fat +df <- df[df$siri>0,] df <- as.data.frame(df) n <- nrow(df) -colnames(df[c("weight_kg", "height")]) <- c("weight", "height") +### There is already a variable called `weight` with almost the same values: +# colnames(df[c("weight_kg")]) <- c("weight") +### pred <- c("age", "weight", "height", "neck", "chest", "abdomen", "hip", "thigh", "knee", "ankle", "biceps", "forearm", "wrist") target <- "siri" -formula <- paste("siri~", paste(pred, collapse = "+")) +formula_obj <- as.formula(paste("siri~", paste(pred, collapse = "+"))) p <- length(pred) p0 <- 2 # prior guess for the number of relevant variables @@ -31,7 +35,7 @@ for (i in 1:bootnum) { set.seed(5437854+i) data_id <- sample(1:dim(df)[1], replace = T) bbn[i,] <- length(unique(data_id)) - fitb <- stan_glm(formula, data = df[data_id, ], + fitb <- stan_glm(formula_obj, data = df[data_id, ], prior=rhs_prior, QR=TRUE, seed=i, refresh=0) bcvvs <- cv_varsel(fitb, method='forward', cv_method='LOO', seed = 5437854-i, verbose = FALSE) @@ -47,7 +51,7 @@ boot_01 <- data.frame(boot_01) bn <- data.frame(boot_nvs) %>% group_by_all() %>% count(sort=TRUE) bd <- boot_01 %>% group_by_at(vars(-X.Intercept.)) %>% count(sort=TRUE) -boot_inclusion <- boot_inclusion %>% tibble::rownames_to_column(var="variable") %>% filter(variable != "X.Intercept.") %>% arrange(-projpred_incp) +boot_inclusion <- boot_inclusion %>% tibble::rownames_to_column(var="variable") %>% filter(variable != "(Intercept)") %>% arrange(-projpred_incp) boot_inclusion$steplm_incp <- c(100, 28, 98, 100, 85, 63, 51, 48, 34, 43, 54, 41, 18) boot_inclusion <- boot_inclusion %>% rename(projpred=projpred_incp, steplm=steplm_incp) cumsum(bd$n) diff --git a/bodyfat_kfoldcv.R b/bodyfat_kfoldcv.R index eb50f4e..004de18 100644 --- a/bodyfat_kfoldcv.R +++ b/bodyfat_kfoldcv.R @@ -2,20 +2,24 @@ df <- read.table(here("bodyfat.txt"), header = T, sep = ";") df[,4:19] <- scale(df[,4:19]) +# no-one can have 0% body fat +df <- df[df$siri>0,] df <- as.data.frame(df) n <- nrow(df) -colnames(df[c("weight_kg", "height")]) <- c("weight", "height") +### There is already a variable called `weight` with almost the same values: +# colnames(df[c("weight_kg")]) <- c("weight") +### pred <- c("age", "weight", "height", "neck", "chest", "abdomen", "hip", "thigh", "knee", "ankle", "biceps", "forearm", "wrist") target <- "siri" -formula <- paste("siri~", paste(pred, collapse = "+")) +formula_obj <- as.formula(paste("siri~", paste(pred, collapse = "+"))) p <- length(pred) p0 <- 5 # prior guess for the number of relevant variables tau0 <- p0/(p-p0) * 1/sqrt(n) rhs_prior <- hs(global_scale=tau0) -fitrhs <- stan_glm(formula, data = df, prior=rhs_prior, QR=TRUE, +fitrhs <- stan_glm(formula_obj, data = df, prior=rhs_prior, QR=TRUE, seed=1513306866, refresh=0) # NOTE: In contrast to the previous code where `ndraws = 4000` was used in @@ -26,11 +30,7 @@ fitrhs <- stan_glm(formula, data = df, prior=rhs_prior, QR=TRUE, fit_cvvs <- cv_varsel(fitrhs, method='forward', cv_method='kfold', K = 20, seed = 1513306866 + 1) nv <- suggest_size(fit_cvvs,alpha=0.1) -# Currently (projpred v2.1.1), we need to use the internal projpred function -# .tabulate_stats() to obtain the reference model's performance: -rmse_full <- projpred:::.tabulate_stats(fit_cvvs, stats = "rmse") -rmse_full <- rmse_full$value[rmse_full$size == Inf] -# For the submodel, this is the way to go: -smmry <- summary(fit_cvvs, stats = "rmse", nterms_max = nv) -rmse_proj <- smmry$selection$rmse.kfold[smmry$selection$size == nv] +perfs <- performances(fit_cvvs, stats = "rmse", nterms_max = nv) +rmse_full <- unname(perfs$reference_model["rmse"]) +rmse_proj <- perfs$submodels$rmse[perfs$submodels$size == nv] save(nv, rmse_full, rmse_proj, file = "bodyfat_kfoldcv.RData") diff --git a/bodyfat_kfoldcv2.R b/bodyfat_kfoldcv2.R index ed14e24..dac32d2 100644 --- a/bodyfat_kfoldcv2.R +++ b/bodyfat_kfoldcv2.R @@ -2,26 +2,32 @@ df <- read.table(here("bodyfat.txt"), header = T, sep = ";") df[,4:19] <- scale(df[,4:19]) +# no-one can have 0% body fat +df <- df[df$siri>0,] df <- as.data.frame(df) n <- nrow(df) -colnames(df[c("weight_kg", "height")]) <- c("weight", "height") +### There is already a variable called `weight` with almost the same values: +# colnames(df[c("weight_kg")]) <- c("weight") +### pred <- c("age", "weight", "height", "neck", "chest", "abdomen", "hip", "thigh", "knee", "ankle", "biceps", "forearm", "wrist") target <- "siri" -formula <- paste("siri~", paste(pred, collapse = "+")) +formula_chr <- paste("siri~", paste(pred, collapse = "+")) set.seed(1513306866) noise <- array(rnorm(87*n), c(n,87)) dfr<-cbind(df,noise=noise) -formula2<-paste(formula,"+",paste(colnames(dfr[,20:106]), collapse = "+")) +formula_obj2<-as.formula( + paste(formula_chr,"+",paste(colnames(dfr[,20:106]), collapse = "+")) +) p <- 100 p0 <- 5 # prior guess for the number of relevant variables tau0 <- p0/(p-p0) * 1/sqrt(n) rhs_prior <- hs(global_scale=tau0) -fitrhs2 <- stan_glm(formula2, data = dfr, prior=rhs_prior, QR=TRUE, +fitrhs2 <- stan_glm(formula_obj2, data = dfr, prior=rhs_prior, QR=TRUE, seed=1513306866, refresh=0) # NOTE: In contrast to the previous code where `ndraws = 4000` was used in @@ -32,11 +38,7 @@ fitrhs2 <- stan_glm(formula2, data = dfr, prior=rhs_prior, QR=TRUE, fit_cvvs2 <- cv_varsel(fitrhs2, method='forward', cv_method='kfold', K = 20, nterms_max=10, seed = 1513306866 + 1, verbose = FALSE) print(nv2 <- suggest_size(fit_cvvs2, alpha=0.1)) -# Currently (projpred v2.1.1), we need to use the internal projpred function -# .tabulate_stats() to obtain the reference model's performance: -rmse_full2 <- projpred:::.tabulate_stats(fit_cvvs2, stats = "rmse") -rmse_full2 <- rmse_full2$value[rmse_full2$size == Inf] -# For the submodel, this is the way to go: -smmry2 <- summary(fit_cvvs2, stats = "rmse", nterms_max = nv2) -(rmse_proj2 <- smmry2$selection$rmse.kfold[smmry2$selection$size == nv2]) +perfs2 <- performances(fit_cvvs2, stats = "rmse", nterms_max = nv2) +rmse_full2 <- unname(perfs2$reference_model["rmse"]) +rmse_proj2 <- perfs2$submodels$rmse[perfs2$submodels$size == nv2] save(nv2, rmse_full2, rmse_proj2, file = "bodyfat_kfoldcv2.RData") diff --git a/projpredpct.R b/projpredpct.R deleted file mode 100644 index 022d110..0000000 --- a/projpredpct.R +++ /dev/null @@ -1,20 +0,0 @@ -library(tidyverse) -library(RColorBrewer) - -get_pct_arr <- function(pctch, nv) { - as_tibble(pctch[1:nv, 1:(nv + 1)]) %>% - gather("var", "val", colnames(pctch)[1:nv+1], factor_key = T) -} - -get_col_brks <- function() { - list(breaks = seq(5e-3, 1-5e-3, length.out = 7), - pal = brewer.pal(11, "RdBu")[3:10]) -} - -theme_proj <- function() { - theme(axis.text = element_text(size = 15), - axis.title = element_text(size = 15), - strip.text = element_text(size = 15), - plot.title = element_text(size = 15, face = "bold", hjust = 0.6)) -} -