Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 14 additions & 33 deletions bodyfat.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,11 @@ height as two central IVs [independent variables] for estimating body
fat proportion, and will not subject these two to variable
selection.'' We subject all variables to selection.
```{r, results='hide'}
doParallel::registerDoParallel(getOption("mc.cores", 1))
fitrhs_cvvs <- cv_varsel(fitrhs, method = 'forward', cv_method = 'loo',
nloo = n, verbose = FALSE)
seed = SEED, verbose = FALSE, parallel = TRUE)
doParallel::stopImplicitCluster()
foreach::registerDoSEQ()
```

And the estimated predictive performance of smaller models compared to the full model.
Expand All @@ -183,7 +186,7 @@ get a PSIS-LOO [@Vehtari+etal:PSIS-LOO:2017] based recommendation for
the model size to choose.
```{r}
(nsel <- suggest_size(fitrhs_cvvs, alpha=0.1))
(vsel <- solution_terms(fitrhs_cvvs)[1:nsel])
(vsel <- ranking(fitrhs_cvvs)$fulldata[1:nsel])
```

Based on this recommendation we continue with two variables `abdomen`
Expand All @@ -193,7 +196,7 @@ Dunkler (2018) had seven variables `height` (fixed), `abdomen` (fixed),

Form the projected posterior for the selected model.
```{r}
projrhs <- project(fitrhs_cvvs, nv = nsel, ns = 4000)
projrhs <- project(fitrhs_cvvs, nterms = nsel, ndraws = 4000)
```

Plot the marginals of the projected posterior.
Expand All @@ -212,38 +215,13 @@ of the selection method they used.

Before looking at the corresponding bootstrap results we can look at the
stability of selection process based on the LOO-CV selection paths
computed by `cv_varsel` (the code to make the following plot will be
included in __projpred__ package).
computed by `cv_varsel`.

```{r}
source("projpredpct.R")
rows <- nrow(fitrhs_cvvs[["pct_solution_terms_cv"]])
col <- nrow(fitrhs_cvvs[["pct_solution_terms_cv"]])
pctch <- round(fitrhs_cvvs[["pct_solution_terms_cv"]], 2)
colnames(pctch)[1] <- ".size"
pct <- get_pct_arr(pctch, 13)
col_brks <- get_col_brks()
pct$val_grp <- as.character(sapply(pct$val, function(x) sum(x >= col_brks$breaks)))
if (identical(rows, 0)) rows <- pct$var[1]
pct$sel <- (pct$.size == col) & (pct$var %in% rows)
brks <- sort(unique(as.numeric(pct$val_grp)) + 1)
ggplot(pct, aes_(x = ~.size, y = ~var)) +
geom_tile(aes_(fill = ~val_grp, color = ~sel),
width = 1, height = 0.9, size = 1) +
geom_text(aes_(label = ~val, fontface = ~sel+1)) +
coord_cartesian(expand = FALSE) +
scale_y_discrete(limits = rev(levels(pct$var))) +
scale_x_discrete(limits = seq(1,col)) +
scale_color_manual(values = c("white", "black")) +
labs(x = "Model size", y = "",
title = "Fraction of cv-folds that select the given variable") +
scale_fill_manual(breaks = brks, values = col_brks$pal[brks]) +
theme_proj() +
theme(legend.position = "none",
axis.text.y = element_text(angle = 45))
plot(cv_proportions(fitrhs_cvvs, cumulate = TRUE))
```

For model sizes 1-3 selection paths in different LOO-CV cases are always the same `abdomen?, `weight`, and `wrist`. For larger model sizes there are some small variation, but mostly the order is quite consistent.
For model sizes 1-3 selection paths in different LOO-CV cases are always the same `abdomen`, `weight`, and `wrist`. For larger model sizes there are some small variation, but mostly the order is quite consistent.

Running `stan_glm` with `prior=hs()` and `cv_varsel` do not take much time when run only once, but for a notebook running them 1000 times would take hours. The code for running the above variable selection procedure for 100 different bootstrapped datasets is as follows.
```{r}
Expand Down Expand Up @@ -468,10 +446,13 @@ tau0 <- p0/(p-p0) * 1/sqrt(n)
hs_prior <- hs(global_scale=tau0)
fitrhs2 <- stan_glm(formula2, data = dfr, prior = hs_prior, QR = TRUE,
seed=SEED, refresh=0)
doParallel::registerDoParallel(getOption("mc.cores", 1))
fitrhs2_cvvs <- cv_varsel(fitrhs2, method = 'forward', cv_method = 'loo',
nloo=n, verbose = FALSE)
seed = SEED, verbose = FALSE, parallel = TRUE)
doParallel::stopImplicitCluster()
foreach::registerDoSEQ()
nsel2 <- suggest_size(fitrhs2_cvvs, alpha=0.1)
vsel2 <- solution_terms(fitrhs2_cvvs)[1:nsel2]
vsel2 <- ranking(fitrhs2_cvvs)$fulldata[1:nsel2]
loormse_full2 <- sqrt(mean((df$siri-fitrhs2_cvvs$summaries$ref$mu)^2))
loormse_proj2 <- sqrt(mean((df$siri-fitrhs2_cvvs$summaries$sub[[nsel2]]$mu)^2))
print(paste('PSIS-LOO RMSE Bayesian full model: ', round(loormse_full2,1)))
Expand Down
18 changes: 11 additions & 7 deletions bodyfat_bootstrap.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,18 @@ library(dplyr)
#' Bootstrap iterations for projpred for bodyfat
df <- read.table(here("bodyfat.txt"), header = T, sep = ";")
df[,4:19] <- scale(df[,4:19])
# no-one can have 0% body fat
df <- df[df$siri>0,]
df <- as.data.frame(df)
n <- nrow(df)
colnames(df[c("weight_kg", "height")]) <- c("weight", "height")
### There is already a variable called `weight` with almost the same values:
# colnames(df[c("weight_kg")]) <- c("weight")
###

pred <- c("age", "weight", "height", "neck", "chest", "abdomen", "hip",
"thigh", "knee", "ankle", "biceps", "forearm", "wrist")
target <- "siri"
formula <- paste("siri~", paste(pred, collapse = "+"))
formula_obj <- as.formula(paste("siri~", paste(pred, collapse = "+")))
p <- length(pred)

p0 <- 2 # prior guess for the number of relevant variables
Expand All @@ -31,14 +35,14 @@ for (i in 1:bootnum) {
set.seed(5437854+i)
data_id <- sample(1:dim(df)[1], replace = T)
bbn[i,] <- length(unique(data_id))
fitb <- stan_glm(formula, data = df[data_id, ],
fitb <- stan_glm(formula_obj, data = df[data_id, ],
prior=rhs_prior, QR=TRUE, seed=i, refresh=0)
bcvvs <- cv_varsel(fitb, method='forward', cv_method='LOO', nloo=n,
verbose = FALSE)
bcvvs <- cv_varsel(fitb, method='forward', cv_method='LOO',
seed = 5437854-i, verbose = FALSE)
print(nv <- suggest_size(bcvvs,alpha=0.1))
boot_nvs[i,] <- nv
print(bcvvs$vind[1:nv])
projb <- project(bcvvs, nv = nv, ns = 4000)
projb <- project(bcvvs, nterms = nv, ndraws = 4000)
boot_est[i, colnames(as.matrix(projb)[,-(nv+2)])] <- colMeans(as.matrix(projb)[,-(nv+2)])
}
boot_01 <- (boot_est != 0) * 1
Expand All @@ -47,7 +51,7 @@ boot_01 <- data.frame(boot_01)

bn <- data.frame(boot_nvs) %>% group_by_all() %>% count(sort=TRUE)
bd <- boot_01 %>% group_by_at(vars(-X.Intercept.)) %>% count(sort=TRUE)
boot_inclusion <- boot_inclusion %>% tibble::rownames_to_column(var="variable") %>% filter(variable != "X.Intercept.") %>% arrange(-projpred_incp)
boot_inclusion <- boot_inclusion %>% tibble::rownames_to_column(var="variable") %>% filter(variable != "(Intercept)") %>% arrange(-projpred_incp)
boot_inclusion$steplm_incp <- c(100, 28, 98, 100, 85, 63, 51, 48, 34, 43, 54, 41, 18)
boot_inclusion <- boot_inclusion %>% rename(projpred=projpred_incp, steplm=steplm_incp)
cumsum(bd$n)
Expand Down
58 changes: 19 additions & 39 deletions bodyfat_kfoldcv.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,55 +2,35 @@

df <- read.table(here("bodyfat.txt"), header = T, sep = ";")
df[,4:19] <- scale(df[,4:19])
# no-one can have 0% body fat
df <- df[df$siri>0,]
df <- as.data.frame(df)
n <- nrow(df)
colnames(df[c("weight_kg", "height")]) <- c("weight", "height")
### There is already a variable called `weight` with almost the same values:
# colnames(df[c("weight_kg")]) <- c("weight")
###

pred <- c("age", "weight", "height", "neck", "chest", "abdomen", "hip",
"thigh", "knee", "ankle", "biceps", "forearm", "wrist")
target <- "siri"
formula <- paste("siri~", paste(pred, collapse = "+"))
formula_obj <- as.formula(paste("siri~", paste(pred, collapse = "+")))
p <- length(pred)

p0 <- 5 # prior guess for the number of relevant variables
tau0 <- p0/(p-p0) * 1/sqrt(n)
rhs_prior <- hs(global_scale=tau0)
fitrhs <- stan_glm(formula, data = df, prior=rhs_prior, QR=TRUE,
fitrhs <- stan_glm(formula_obj, data = df, prior=rhs_prior, QR=TRUE,
seed=1513306866, refresh=0)

set.seed(5437854)
perm <- sample.int(n)
K <- 20
idx <- ceiling(seq(from = 1, to = n, length.out = K + 1))
bin <- .bincode(perm, breaks = idx, right = FALSE, include.lowest = TRUE)

muss <- list()
vsmuss <- list()
vsnvss <- list()
vsnvss2 <- list()
for (k in 1:K) {
message("Fitting model ", k, " out of ", K)
omitted <- which(bin == k)
fit_k <- update(
object = fitrhs,
data = df[-omitted,, drop=FALSE],
weights = NULL,
refresh = 0
)
fit_cvvs_k <- cv_varsel(fit_k, method='forward', cv_method='LOO',
nloo=nrow(df[-omitted,, drop=FALSE]))
nvk <- suggest_size(fit_cvvs_k,alpha=0.1)
vsnvss[[k]] <- nvk
proj_k <- project(fit_cvvs_k, nv = nvk, ns = 4000)
muss[[k]] <-
colMeans(posterior_linpred(fit_k,
newdata = df[omitted, , drop = FALSE]))
vsmuss[[k]] <-
colMeans(proj_linpred(proj_k, xnew = df[omitted, , drop = FALSE]))
}
mus<-unlist(muss)[order(as.integer(names(unlist(muss))))]
vsmus<-unlist(vsmuss)[order(as.integer(names(unlist(vsmuss))))]
vsnvs <- unlist(vsnvss)
rmse_full <- sqrt(mean((df$siri-mus)^2))
rmse_proj <- sqrt(mean((df$siri-vsmus)^2))
save(vsnvs, rmse_full, rmse_proj, file = "bodyfat_kfoldcv.RData")
# NOTE: In contrast to the previous code where `ndraws = 4000` was used in
# project(), the following cv_varsel() call uses `ndraws_pred = 400` by default.
# Of course, `ndraws_pred = 4000` could be used instead, but that would increase
# runtime considerably, because `ndraws_pred = 4000` not only applies to the
# suggested model size, but all model sizes.
fit_cvvs <- cv_varsel(fitrhs, method='forward', cv_method='kfold', K = 20,
seed = 1513306866 + 1)
nv <- suggest_size(fit_cvvs,alpha=0.1)
perfs <- performances(fit_cvvs, stats = "rmse", nterms_max = nv)
rmse_full <- unname(perfs$reference_model["rmse"])
rmse_proj <- perfs$submodels$rmse[perfs$submodels$size == nv]
save(nv, rmse_full, rmse_proj, file = "bodyfat_kfoldcv.RData")
72 changes: 23 additions & 49 deletions bodyfat_kfoldcv2.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,69 +2,43 @@

df <- read.table(here("bodyfat.txt"), header = T, sep = ";")
df[,4:19] <- scale(df[,4:19])
# no-one can have 0% body fat
df <- df[df$siri>0,]
df <- as.data.frame(df)
n <- nrow(df)
colnames(df[c("weight_kg", "height")]) <- c("weight", "height")
### There is already a variable called `weight` with almost the same values:
# colnames(df[c("weight_kg")]) <- c("weight")
###

pred <- c("age", "weight", "height", "neck", "chest", "abdomen", "hip",
"thigh", "knee", "ankle", "biceps", "forearm", "wrist")
target <- "siri"
formula <- paste("siri~", paste(pred, collapse = "+"))
formula_chr <- paste("siri~", paste(pred, collapse = "+"))

set.seed(1513306866)
noise <- array(rnorm(87*n), c(n,87))
dfr<-cbind(df,noise=noise)
formula2<-paste(formula,"+",paste(colnames(dfr[,20:106]), collapse = "+"))
formula_obj2<-as.formula(
paste(formula_chr,"+",paste(colnames(dfr[,20:106]), collapse = "+"))
)
p <- 100

p0 <- 5 # prior guess for the number of relevant variables
tau0 <- p0/(p-p0) * 1/sqrt(n)
rhs_prior <- hs(global_scale=tau0)

set.seed(1513306866)
perm <- sample.int(n)
K <- 20
idx <- ceiling(seq(from = 1, to = n, length.out = K + 1))
bin <- .bincode(perm, breaks = idx, right = FALSE, include.lowest = TRUE)

fitrhs2 <- stan_glm(formula2, data = dfr, prior=rhs_prior, QR=TRUE,
fitrhs2 <- stan_glm(formula_obj2, data = dfr, prior=rhs_prior, QR=TRUE,
seed=1513306866, refresh=0)
muss <- list()
vsmuss <- list()
vsnvss <- list()
vsnvss2 <- list()
vsnvss3 <- list()
fitcvs <- list()
for (k in 1:K) {
message("Fitting model ", k, " out of ", K)
omitted <- which(bin == k)
fit_k <- update(
object = fitrhs2,
data = dfr[-omitted,, drop=FALSE],
weights = NULL,
refresh = 0
)
muss[[k]] <-
colMeans(posterior_linpred(fit_k,
newdata = dfr[omitted, , drop = FALSE]))
fit_cvvs_k <- cv_varsel(fit_k, method='forward', cv_method='LOO',
nloo = length(which(bin != k)), nv_max=10,
verbose = FALSE)
fitcvs[[k]] <- fit_cvvs_k
}
for (k in 1:K) {
omitted <- which(bin == k)
fit_cvvs_k <- fitcvs[[k]]
print(nvk <- suggest_size(fit_cvvs_k, alpha=0.1))
vsnvss[[k]] <- nvk
fit_cvvs_k$vind[1:nvk]
proj_k <- project(fit_cvvs_k, nv = nvk, ns = 4000)
vsmuss[[k]] <-
colMeans(proj_linpred(proj_k, xnew = dfr[omitted, , drop = FALSE]))
}
mus<-unlist(muss)[order(as.integer(names(unlist(muss))))]
vsmus<-unlist(vsmuss)[order(as.integer(names(unlist(vsmuss))))]
vsnvs2 <- unlist(vsnvss)
rmse_full2 <- sqrt(mean((df$siri-mus)^2))
(rmse_proj2 <- sqrt(mean((df$siri-vsmus)^2)))
save(vsnvs2, rmse_full2, rmse_proj2, file = "bodyfat_kfoldcv2.RData")

# NOTE: In contrast to the previous code where `ndraws = 4000` was used in
# project(), the following cv_varsel() call uses `ndraws_pred = 400` by default.
# Of course, `ndraws_pred = 4000` could be used instead, but that would increase
# runtime considerably, because `ndraws_pred = 4000` not only applies to the
# suggested model size, but all model sizes.
fit_cvvs2 <- cv_varsel(fitrhs2, method='forward', cv_method='kfold', K = 20,
nterms_max=10, seed = 1513306866 + 1, verbose = FALSE)
print(nv2 <- suggest_size(fit_cvvs2, alpha=0.1))
perfs2 <- performances(fit_cvvs2, stats = "rmse", nterms_max = nv2)
rmse_full2 <- unname(perfs2$reference_model["rmse"])
rmse_proj2 <- perfs2$submodels$rmse[perfs2$submodels$size == nv2]
save(nv2, rmse_full2, rmse_proj2, file = "bodyfat_kfoldcv2.RData")
20 changes: 0 additions & 20 deletions projpredpct.R

This file was deleted.