From 90c136edd9ac85b93f94f0d60e1e771ed416a533 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 15 Oct 2024 16:15:59 -0500 Subject: [PATCH 1/2] Initial refactor of kernel interface --- DESCRIPTION | 12 ++-- R/cpp11.R | 8 +++ R/kernel.R | 115 ++++++++++++++++++++------------ include/stochtree/container.h | 3 + include/stochtree/ensemble.h | 49 +++++++++++++- include/stochtree/kernel.h | 2 +- include/stochtree/tree.h | 4 ++ man/computeForestLeafIndices.Rd | 55 ++++++++------- src/container.cpp | 13 ++++ src/cpp11.cpp | 16 +++++ src/kernel.cpp | 40 +++++++++++ src/tree.cpp | 16 +++++ tools/debug/r_kernel.R | 40 +++++++++++ vignettes/EnsembleKernel.Rmd | 37 ++++++---- 14 files changed, 317 insertions(+), 93 deletions(-) create mode 100644 tools/debug/r_kernel.R diff --git a/DESCRIPTION b/DESCRIPTION index 6e35bc0f..71157a8d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,17 +17,17 @@ RoxygenNote: 7.3.1 LinkingTo: cpp11 Suggests: + doParallel, + foreach, + ggplot2, knitr, - rmarkdown, + latex2exp, Matrix, - tgp, MASS, mvtnorm, - ggplot2, - latex2exp, + rmarkdown, testthat (>= 3.0.0), - foreach, - doParallel + tgp VignetteBuilder: knitr SystemRequirements: C++17 Imports: diff --git a/R/cpp11.R b/R/cpp11.R index 4fc4c598..77e801d5 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -344,6 +344,10 @@ forest_kernel_cpp <- function() { .Call(`_stochtree_forest_kernel_cpp`) } +forest_container_get_max_leaf_index_cpp <- function(forest_container, forest_num) { + .Call(`_stochtree_forest_container_get_max_leaf_index_cpp`, forest_container, forest_num) +} + forest_kernel_compute_leaf_indices_train_cpp <- function(forest_kernel, covariates_train, forest_container, forest_num) { invisible(.Call(`_stochtree_forest_kernel_compute_leaf_indices_train_cpp`, forest_kernel, covariates_train, forest_container, forest_num)) } @@ -368,6 +372,10 @@ forest_kernel_compute_kernel_train_test_cpp <- function(forest_kernel, covariate .Call(`_stochtree_forest_kernel_compute_kernel_train_test_cpp`, forest_kernel, covariates_train, covariates_test, forest_container, forest_num) } +compute_leaf_indices_cpp <- function(forest_container, covariates, forest_nums) { + .Call(`_stochtree_compute_leaf_indices_cpp`, forest_container, covariates, forest_nums) +} + sample_gfr_one_iteration_cpp <- function(data, residual, forest_samples, tracker, split_prior, rng, feature_types, cutpoint_grid_size, leaf_model_scale_input, variable_weights, a_forest, b_forest, global_variance, leaf_model_int, pre_initialized) { invisible(.Call(`_stochtree_sample_gfr_one_iteration_cpp`, data, residual, forest_samples, tracker, split_prior, rng, feature_types, cutpoint_grid_size, leaf_model_scale_input, variable_weights, a_forest, b_forest, global_variance, leaf_model_int, pre_initialized)) } diff --git a/R/kernel.R b/R/kernel.R index 2675d2b3..234db6c4 100644 --- a/R/kernel.R +++ b/R/kernel.R @@ -200,53 +200,82 @@ computeForestKernels <- function(bart_model, X_train, X_test=NULL, forest_num=NU #' mapped column index that corresponds to a single leaf of a single tree (i.e. #' if tree 1 has 3 leaves, its column indices range from 0 to 2, and then tree 2's #' leaf indices begin at 3, etc...). -#' Users may pass a single dataset (which we refer to here as a "training set") -#' or two datasets (which we refer to as "training and test sets"). This verbiage -#' hints that one potential use-case for a matrix of leaf indices is to define a -#' ensemble-based kernel for kriging. #' -#' @param bart_model Object of type `bartmodel` corresponding to a BART model with at least one sample -#' @param X_train Matrix of "training" data. In a traditional Gaussian process kriging context, this -#' corresponds to the observations for which outcomes are observed. -#' @param X_test (Optional) Matrix of "test" data. In a traditional Gaussian process kriging context, this -#' corresponds to the observations for which outcomes are unobserved and must be estimated -#' based on the kernels k(X_test,X_test), k(X_test,X_train), and k(X_train,X_train). If not provided, -#' this function will only compute k(X_train, X_train). -#' @param forest_num (Optional) Index of the forest sample to use for kernel computation. If not provided, -#' this function will use the last forest. -#' @param forest_type (Optional) Whether to compute the kernel from the mean or variance forest. Default: "mean". Specify "variance" for the variance forest. -#' All other inputs are invalid. Must have sampled the relevant forest or an error will occur. -#' @return List of vectors. If `X_test = NULL`, the list contains -#' one vector of length `n_train * num_trees`, where `n_train = nrow(X_train)` -#' and `num_trees` is the number of trees in `bart_model`. If `X_test` is not `NULL`, -#' the list contains another vector of length `n_test * num_trees`. +#' @param model_object Object of type `bartmodel` or `bcf` corresponding to a BART / BCF model with at least one forest sample +#' @param covariates Covariates to use for prediction. Must have the same dimensions / column types as the data used to train a forest. +#' @param forest_type Which forest to use from `model_object`. +#' Valid inputs depend on the model type, and whether or not a +#' +#' **1. BART** +#' +#' - `'mean'`: Extracts leaf indices for the mean forest +#' - `'variance'`: Extracts leaf indices for the variance forest +#' +#' **2. BCF** +#' +#' - `'prognostic'`: Extracts leaf indices for the prognostic forest +#' - `'treatment'`: Extracts leaf indices for the treatment effect forest +#' - `'variance'`: Extracts leaf indices for the variance forest +#' +#' @param forest_inds (Optional) Indices of the forest sample(s) for which to compute leaf indices. If not provided, +#' this function will return leaf indices for every sample of a forest. +#' This function uses 1-indexing, so the first forest sample corresponds to `forest_num = 1`, and so on. +#' @return List of vectors. Each vector is of size `num_obs * num_trees`, where `num_obs = nrow(covariates)` +#' and `num_trees` is the number of trees in the relevant forest of `model_object`. #' @export -computeForestLeafIndices <- function(bart_model, X_train, X_test=NULL, forest_num=NULL, forest_type="mean") { - stopifnot(class(bart_model)=="bartmodel") - if (forest_type=="mean") { - if (!bart_model$model_params$include_mean_forest) { - stop("Mean forest was not sampled in the bart model provided") - } - } else if (forest_type=="variance") { - if (!bart_model$model_params$include_variance_forest) { - stop("Variance forest was not sampled in the bart model provided") +computeForestLeafIndices <- function(model_object, covariates, forest_type, forest_inds=NULL) { + # Extract relevant forest container + stopifnot(class(model_object) %in% c("bartmodel", "bcf")) + model_type <- ifelse(class(model_object)=="bartmodel", "bart", "bcf") + if (model_type == "bart") { + stopifnot(forest_type %in% c("mean", "variance")) + if (forest_type=="mean") { + if (!model_object$model_params$include_mean_forest) { + stop("Mean forest was not sampled in the bart model provided") + } + forest_container <- model_object$mean_forests + } else if (forest_type=="variance") { + if (!model_object$model_params$include_variance_forest) { + stop("Variance forest was not sampled in the bart model provided") + } + forest_container <- model_object$variance_forests } } else { - stop("Must provide either 'mean' or 'variance' for the `forest_type` parameter") + stopifnot(forest_type %in% c("prognostic", "treatment", "variance")) + if (forest_type=="prognostic") { + forest_container <- model_object$forests_mu + } else if (forest_type=="treatment") { + forest_container <- model_object$forests_tau + } else if (forest_type=="variance") { + if (!model_object$model_params$include_variance_forest) { + stop("Variance forest was not sampled in the bcf model provided") + } + forest_container <- model_object$variance_forests + } } - forest_kernel <- createForestKernel() - num_samples <- bart_model$model_params$num_samples - stopifnot(forest_num <= num_samples) - sample_index <- ifelse(is.null(forest_num), num_samples-1, forest_num-1) - if (forest_type == "mean") { - return(forest_kernel$compute_leaf_indices( - covariates_train = X_train, covariates_test = X_test, - forest_container = bart_model$mean_forests, forest_num = sample_index - )) - } else if (forest_type == "variance") { - return(forest_kernel$compute_leaf_indices( - covariates_train = X_train, covariates_test = X_test, - forest_container = bart_model$variance_forests, forest_num = sample_index - )) + + # Preprocess covariates + if ((!is.data.frame(covariates)) && (!is.matrix(covariates))) { + stop("covariates must be a matrix or dataframe") + } + train_set_metadata <- model_object$train_set_metadata + covariates_processed <- preprocessPredictionData(covariates, train_set_metadata) + + # Preprocess forest indices + num_forests <- forest_container$num_samples() + if (is.null(forest_inds)) { + forest_inds <- as.integer(1:num_forests - 1) + } else { + stopifnot(all(forest_inds <= num_forests)) + stopifnot(all(forest_inds >= 1)) + forest_inds <- as.integer(forest_inds - 1) } + + # Compute leaf indices + leaf_ind_matrix <- compute_leaf_indices_cpp( + forest_container$forest_container_ptr, + covariates_processed, forest_inds + ) + + return(leaf_ind_matrix) } diff --git a/include/stochtree/container.h b/include/stochtree/container.h index 7b7719da..94121ee3 100644 --- a/include/stochtree/container.h +++ b/include/stochtree/container.h @@ -36,6 +36,9 @@ class ForestContainer { void PredictInPlace(ForestDataset& dataset, std::vector& output); void PredictRawInPlace(ForestDataset& dataset, std::vector& output); void PredictRawInPlace(ForestDataset& dataset, int forest_num, std::vector& output); + void PredictLeafIndicesInplace(Eigen::Map>& covariates, + Eigen::Map>& output, + std::vector& forest_indices, int num_trees, data_size_t n); inline TreeEnsemble* GetEnsemble(int i) {return forests_[i].get();} inline int32_t NumSamples() {return num_samples_;} diff --git a/include/stochtree/ensemble.h b/include/stochtree/ensemble.h index 6ed52c63..bbec292c 100644 --- a/include/stochtree/ensemble.h +++ b/include/stochtree/ensemble.h @@ -241,6 +241,20 @@ class TreeEnsemble { } } + /*! + * \brief Obtain a 0-based "maximum" leaf index for an ensemble, which is equivalent to the sum of the + * number of leaves in each tree. This is used in conjunction with `PredictLeafIndicesInplace`, + * which returns an observation-specific leaf index for every observation-tree pair. + */ + int GetMaxLeafIndex() { + int max_leaf = 0; + for (int j = 0; j < num_trees_; j++) { + auto &tree = *trees_[j]; + max_leaf += tree.NumLeaves(); + } + return max_leaf; + } + /*! * \brief Obtain a 0-based leaf index for every tree in an ensemble and for each * observation in a ForestDataset. Internally, trees are stored as essentially @@ -274,7 +288,7 @@ class TreeEnsemble { * * Note: this assumes the creation of a vector of column indices of size * `dataset.NumObservations()` x `ensemble.NumTrees()` - * \param ForestDataset Dataset with which to predict leaf indices from the tree + * \param covariates Matrix of covariates * \param output Vector of length num_trees*n which stores the leaf node prediction * \param num_trees Number of trees in an ensemble * \param n Size of dataset @@ -292,6 +306,39 @@ class TreeEnsemble { } } + /*! + * \brief Obtain a 0-based leaf index for every tree in an ensemble and for each + * observation in a ForestDataset. Internally, trees are stored as essentially + * vectors of node information, and the leaves_ vector gives us node IDs for every + * leaf in the tree. Here, we would like to know, for every observation in a dataset, + * which leaf number it is mapped to. Since the leaf numbers themselves + * do not carry any information, we renumber them from 0 to `leaves_.size()-1`. + * We compute this at the tree-level and coordinate this computation at the + * ensemble level. + * + * Note: this assumes the creation of a matrix of column indices with `num_trees*n` rows + * and as many columns as forests that were requested from R / Python + * \param covariates Matrix of covariates + * \param output Matrix with num_trees*n rows and as many columns as forests that were requested from R / Python + * \param column_ind Index of column in `output` into which the result should be unpacked + * \param num_trees Number of trees in an ensemble + * \param n Size of dataset + */ + void PredictLeafIndicesInplace(Eigen::Map>& covariates, + Eigen::Map>& output, + int column_ind, int num_trees, data_size_t n) { + CHECK_GE(output.size(), num_trees*n); + int offset = 0; + int max_leaf = 0; + for (int j = 0; j < num_trees; j++) { + auto &tree = *trees_[j]; + int num_leaves = tree.NumLeaves(); + tree.PredictLeafIndexInplace(covariates, output, column_ind, offset, max_leaf); + offset += n; + max_leaf += num_leaves; + } + } + /*! * \brief Obtain a 0-based leaf index for every tree in an ensemble and for each * observation in a ForestDataset. Internally, trees are stored as essentially diff --git a/include/stochtree/kernel.h b/include/stochtree/kernel.h index 26c98446..747b953c 100644 --- a/include/stochtree/kernel.h +++ b/include/stochtree/kernel.h @@ -252,4 +252,4 @@ class ForestKernel { } // namespace StochTree -#endif // STOCHTREE_TREE_KERNEL_H_ +#endif // STOCHTREE_TREE_KERNEL_H_ \ No newline at end of file diff --git a/include/stochtree/tree.h b/include/stochtree/tree.h index 2e2f7345..d027c4e6 100644 --- a/include/stochtree/tree.h +++ b/include/stochtree/tree.h @@ -693,6 +693,10 @@ class Tree { */ void PredictLeafIndexInplace(Eigen::Map>& covariates, std::vector& output, int32_t offset, int32_t max_leaf); + void PredictLeafIndexInplace(Eigen::Map>& covariates, + Eigen::Map>& output, + int column_ind, int32_t offset, int32_t max_leaf); + // Node info std::vector node_type_; std::vector parent_; diff --git a/man/computeForestLeafIndices.Rd b/man/computeForestLeafIndices.Rd index 2c32b951..7b9b788a 100644 --- a/man/computeForestLeafIndices.Rd +++ b/man/computeForestLeafIndices.Rd @@ -11,42 +11,43 @@ first tree in an ensemble, the next \code{n} elements correspond to predictions the second tree and so on. The "data" for each element corresponds to a uniquely mapped column index that corresponds to a single leaf of a single tree (i.e. if tree 1 has 3 leaves, its column indices range from 0 to 2, and then tree 2's -leaf indices begin at 3, etc...). -Users may pass a single dataset (which we refer to here as a "training set") -or two datasets (which we refer to as "training and test sets"). This verbiage -hints that one potential use-case for a matrix of leaf indices is to define a -ensemble-based kernel for kriging.} +leaf indices begin at 3, etc...).} \usage{ computeForestLeafIndices( - bart_model, - X_train, - X_test = NULL, - forest_num = NULL, - forest_type = "mean" + model_object, + covariates, + forest_type, + forest_inds = NULL ) } \arguments{ -\item{bart_model}{Object of type \code{bartmodel} corresponding to a BART model with at least one sample} +\item{model_object}{Object of type \code{bartmodel} or \code{bcf} corresponding to a BART / BCF model with at least one forest sample} + +\item{covariates}{Covariates to use for prediction. Must have the same dimensions / column types as the data used to train a forest.} -\item{X_train}{Matrix of "training" data. In a traditional Gaussian process kriging context, this -corresponds to the observations for which outcomes are observed.} +\item{forest_type}{Which forest to use from \code{model_object}. +Valid inputs depend on the model type, and whether or not a -\item{X_test}{(Optional) Matrix of "test" data. In a traditional Gaussian process kriging context, this -corresponds to the observations for which outcomes are unobserved and must be estimated -based on the kernels k(X_test,X_test), k(X_test,X_train), and k(X_train,X_train). If not provided, -this function will only compute k(X_train, X_train).} +\strong{1. BART} +\itemize{ +\item \code{'mean'}: Extracts leaf indices for the mean forest +\item \code{'variance'}: Extracts leaf indices for the variance forest +} -\item{forest_num}{(Optional) Index of the forest sample to use for kernel computation. If not provided, -this function will use the last forest.} +\strong{2. BCF} +\itemize{ +\item \code{'prognostic'}: Extracts leaf indices for the prognostic forest +\item \code{'treatment'}: Extracts leaf indices for the treatment effect forest +\item \code{'variance'}: Extracts leaf indices for the variance forest +}} -\item{forest_type}{(Optional) Whether to compute the kernel from the mean or variance forest. Default: "mean". Specify "variance" for the variance forest. -All other inputs are invalid. Must have sampled the relevant forest or an error will occur.} +\item{forest_inds}{(Optional) Indices of the forest sample(s) for which to compute leaf indices. If not provided, +this function will return leaf indices for every sample of a forest. +This function uses 1-indexing, so the first forest sample corresponds to \code{forest_num = 1}, and so on.} } \value{ -List of vectors. If \code{X_test = NULL}, the list contains -one vector of length \code{n_train * num_trees}, where \code{n_train = nrow(X_train)} -and \code{num_trees} is the number of trees in \code{bart_model}. If \code{X_test} is not \code{NULL}, -the list contains another vector of length \code{n_test * num_trees}. +List of vectors. Each vector is of size \code{num_obs * num_trees}, where \code{num_obs = nrow(covariates)} +and \code{num_trees} is the number of trees in the relevant forest of \code{model_object}. } \description{ Compute and return a vector representation of a forest's leaf predictions for @@ -59,8 +60,4 @@ the second tree and so on. The "data" for each element corresponds to a uniquely mapped column index that corresponds to a single leaf of a single tree (i.e. if tree 1 has 3 leaves, its column indices range from 0 to 2, and then tree 2's leaf indices begin at 3, etc...). -Users may pass a single dataset (which we refer to here as a "training set") -or two datasets (which we refer to as "training and test sets"). This verbiage -hints that one potential use-case for a matrix of leaf indices is to define a -ensemble-based kernel for kriging. } diff --git a/src/container.cpp b/src/container.cpp index d5adfe51..1112db62 100644 --- a/src/container.cpp +++ b/src/container.cpp @@ -123,6 +123,19 @@ void ForestContainer::PredictRawInPlace(ForestDataset& dataset, int forest_num, forests_[forest_num]->PredictRawInplace(dataset, output, 0, num_trees, offset); } +void ForestContainer::PredictLeafIndicesInplace( + Eigen::Map>& covariates, + Eigen::Map>& output, + std::vector& forest_indices, int num_trees, data_size_t n +) { + int num_forests = forest_indices.size(); + int forest_id; + for (int i = 0; i < num_forests; i++) { + forest_id = forest_indices[i]; + forests_[forest_id]->PredictLeafIndicesInplace(covariates, output, i, num_trees, n); + } +} + /*! \brief Save to JSON */ json ForestContainer::to_json() { json result_obj; diff --git a/src/cpp11.cpp b/src/cpp11.cpp index 4eaa105a..b001582e 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -640,6 +640,13 @@ extern "C" SEXP _stochtree_forest_kernel_cpp() { END_CPP11 } // kernel.cpp +int forest_container_get_max_leaf_index_cpp(cpp11::external_pointer forest_container, int forest_num); +extern "C" SEXP _stochtree_forest_container_get_max_leaf_index_cpp(SEXP forest_container, SEXP forest_num) { + BEGIN_CPP11 + return cpp11::as_sexp(forest_container_get_max_leaf_index_cpp(cpp11::as_cpp>>(forest_container), cpp11::as_cpp>(forest_num))); + END_CPP11 +} +// kernel.cpp void forest_kernel_compute_leaf_indices_train_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::external_pointer forest_container, int forest_num); extern "C" SEXP _stochtree_forest_kernel_compute_leaf_indices_train_cpp(SEXP forest_kernel, SEXP covariates_train, SEXP forest_container, SEXP forest_num) { BEGIN_CPP11 @@ -683,6 +690,13 @@ extern "C" SEXP _stochtree_forest_kernel_compute_kernel_train_test_cpp(SEXP fore return cpp11::as_sexp(forest_kernel_compute_kernel_train_test_cpp(cpp11::as_cpp>>(forest_kernel), cpp11::as_cpp>>(covariates_train), cpp11::as_cpp>>(covariates_test), cpp11::as_cpp>>(forest_container), cpp11::as_cpp>(forest_num))); END_CPP11 } +// kernel.cpp +cpp11::writable::integers_matrix<> compute_leaf_indices_cpp(cpp11::external_pointer forest_container, cpp11::doubles_matrix<> covariates, cpp11::integers forest_nums); +extern "C" SEXP _stochtree_compute_leaf_indices_cpp(SEXP forest_container, SEXP covariates, SEXP forest_nums) { + BEGIN_CPP11 + return cpp11::as_sexp(compute_leaf_indices_cpp(cpp11::as_cpp>>(forest_container), cpp11::as_cpp>>(covariates), cpp11::as_cpp>(forest_nums))); + END_CPP11 +} // sampler.cpp void sample_gfr_one_iteration_cpp(cpp11::external_pointer data, cpp11::external_pointer residual, cpp11::external_pointer forest_samples, cpp11::external_pointer tracker, cpp11::external_pointer split_prior, cpp11::external_pointer rng, cpp11::integers feature_types, int cutpoint_grid_size, cpp11::doubles_matrix<> leaf_model_scale_input, cpp11::doubles variable_weights, double a_forest, double b_forest, double global_variance, int leaf_model_int, bool pre_initialized); extern "C" SEXP _stochtree_sample_gfr_one_iteration_cpp(SEXP data, SEXP residual, SEXP forest_samples, SEXP tracker, SEXP split_prior, SEXP rng, SEXP feature_types, SEXP cutpoint_grid_size, SEXP leaf_model_scale_input, SEXP variable_weights, SEXP a_forest, SEXP b_forest, SEXP global_variance, SEXP leaf_model_int, SEXP pre_initialized) { @@ -984,6 +998,7 @@ static const R_CallMethodDef CallEntries[] = { {"_stochtree_adjust_residual_forest_container_cpp", (DL_FUNC) &_stochtree_adjust_residual_forest_container_cpp, 7}, {"_stochtree_all_roots_forest_container_cpp", (DL_FUNC) &_stochtree_all_roots_forest_container_cpp, 2}, {"_stochtree_average_max_depth_forest_container_cpp", (DL_FUNC) &_stochtree_average_max_depth_forest_container_cpp, 1}, + {"_stochtree_compute_leaf_indices_cpp", (DL_FUNC) &_stochtree_compute_leaf_indices_cpp, 3}, {"_stochtree_create_column_vector_cpp", (DL_FUNC) &_stochtree_create_column_vector_cpp, 1}, {"_stochtree_create_forest_dataset_cpp", (DL_FUNC) &_stochtree_create_forest_dataset_cpp, 0}, {"_stochtree_create_rfx_dataset_cpp", (DL_FUNC) &_stochtree_create_rfx_dataset_cpp, 0}, @@ -999,6 +1014,7 @@ static const R_CallMethodDef CallEntries[] = { {"_stochtree_forest_container_cpp", (DL_FUNC) &_stochtree_forest_container_cpp, 4}, {"_stochtree_forest_container_from_json_cpp", (DL_FUNC) &_stochtree_forest_container_from_json_cpp, 2}, {"_stochtree_forest_container_from_json_string_cpp", (DL_FUNC) &_stochtree_forest_container_from_json_string_cpp, 2}, + {"_stochtree_forest_container_get_max_leaf_index_cpp", (DL_FUNC) &_stochtree_forest_container_get_max_leaf_index_cpp, 2}, {"_stochtree_forest_dataset_add_basis_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_basis_cpp, 2}, {"_stochtree_forest_dataset_add_covariates_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_covariates_cpp, 2}, {"_stochtree_forest_dataset_add_weights_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_weights_cpp, 2}, diff --git a/src/kernel.cpp b/src/kernel.cpp index a301cbde..6a9428a0 100644 --- a/src/kernel.cpp +++ b/src/kernel.cpp @@ -8,6 +8,8 @@ #include typedef Eigen::Map> KernelMatrixType; +typedef Eigen::Map> DoubleMatrixType; +typedef Eigen::Map> IntMatrixType; [[cpp11::register]] cpp11::external_pointer forest_kernel_cpp() { @@ -18,6 +20,11 @@ cpp11::external_pointer forest_kernel_cpp() { return cpp11::external_pointer(forest_kernel_ptr_.release()); } +[[cpp11::register]] +int forest_container_get_max_leaf_index_cpp(cpp11::external_pointer forest_container, int forest_num) { + return forest_container->GetEnsemble(forest_num)->GetMaxLeafIndex(); +} + [[cpp11::register]] void forest_kernel_compute_leaf_indices_train_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::external_pointer forest_container, int forest_num) { @@ -151,3 +158,36 @@ cpp11::list forest_kernel_compute_kernel_train_test_cpp( result.push_back(kernel_test); return result; } + +[[cpp11::register]] +cpp11::writable::integers_matrix<> compute_leaf_indices_cpp( + cpp11::external_pointer forest_container, + cpp11::doubles_matrix<> covariates, cpp11::integers forest_nums +) { + // Wrap an Eigen Map around the raw data of the covariate matrix + StochTree::data_size_t num_obs = covariates.nrow(); + int num_covariates = covariates.ncol(); + double* covariate_data_ptr = REAL(PROTECT(covariates)); + DoubleMatrixType covariates_eigen(covariate_data_ptr, num_obs, num_covariates); + + // Extract other output dimensions + int num_trees = forest_container->NumTrees(); + int num_samples = forest_nums.size(); + + // Declare outputs + cpp11::writable::integers_matrix<> output_matrix(num_obs*num_trees, num_samples); + + // Wrap Eigen Maps around kernel and kernel inverse matrices + int* output_data_ptr = INTEGER(PROTECT(output_matrix)); + IntMatrixType output_eigen(output_data_ptr, num_obs*num_trees, num_samples); + + // Compute leaf indices + std::vector forest_indices(forest_nums.begin(), forest_nums.end()); + forest_container->PredictLeafIndicesInplace(covariates_eigen, output_eigen, forest_indices, num_trees, num_obs); + + // Unprotect pointers to R data + UNPROTECT(2); + + // Return matrix + return output_matrix; +} diff --git a/src/tree.cpp b/src/tree.cpp index 896083f4..6eb43910 100644 --- a/src/tree.cpp +++ b/src/tree.cpp @@ -451,6 +451,22 @@ void Tree::PredictLeafIndexInplace(Eigen::Map>& covariates, + Eigen::Map>& output, + int column_ind, int32_t offset, int32_t max_leaf) { + int n = covariates.rows(); + CHECK_GE(output.size(), offset + n); + std::map renumber_map; + for (int i = 0; i < leaves_.size(); i++) { + renumber_map.insert({leaves_[i], i}); + } + int32_t node_id, remapped_node; + for (int i = 0; i < n; i++) { + node_id = EvaluateTree(*this, covariates, i); + output(offset + i, column_ind) = max_leaf + renumber_map.at(node_id); + } +} + void TreeNodeVectorsToJson(json& obj, Tree* tree) { // Initialize a map with names of the node vectors and empty json arrays std::map tree_array_map; diff --git a/tools/debug/r_kernel.R b/tools/debug/r_kernel.R new file mode 100644 index 00000000..859b689f --- /dev/null +++ b/tools/debug/r_kernel.R @@ -0,0 +1,40 @@ +library(stochtree) +library(tgp) + +# Generate the data, add many "noise variables" +n <- 500 +p_extra <- 10 +friedman.df <- friedman.1.data(n=n) +train_inds <- sort(sample(1:n, floor(0.8*n), replace = FALSE)) +test_inds <- (1:n)[!((1:n) %in% train_inds)] +X <- as.matrix(friedman.df)[,1:10] +X <- cbind(X, matrix(runif(n*p_extra), ncol = p_extra)) +y <- as.matrix(friedman.df)[,12] + rnorm(n,0,1)*(sd(as.matrix(friedman.df)[,11])/2) +X_train <- X[train_inds,] +X_test <- X[test_inds,] +y_train <- y[train_inds] +y_test <- y[test_inds] + +# Run BART on the data +X_train <- as.data.frame(X_train) +X_test <- as.data.frame(X_test) +bart_params <- list(num_trees_mean=200, num_trees_variance=50) +bart_model <- bart(X_train=X_train, y_train=y_train, X_test=X_test, params = bart_params, num_mcmc=1000) + +# Compute leaf indices for selected samples from the mean forest +leaf_mat <- computeForestLeafIndices(bart_model, X_test, forest_type = "mean", + forest_inds = c(99,100)) + +# Compute leaf indices for all samples from the mean forest +leaf_mat <- computeForestLeafIndices(bart_model, X_test, forest_type = "mean") + +# Construct sparse matrix of leaf membership +W <- Matrix::sparseMatrix(i=rep(1:length(y_test),200), j=leaf_mat[,forest_num] + 1, x=1) +tcrossprod(W) + +# Compute leaf indices for selected samples from the variance forest +leaf_mat <- computeForestLeafIndices(bart_model, X_test, forest_type = "variance", + forest_inds = c(99,100)) + +# Compute leaf indices for all samples from the variance forest +leaf_mat <- computeForestLeafIndices(bart_model, X_test, forest_type = "variance") \ No newline at end of file diff --git a/vignettes/EnsembleKernel.Rmd b/vignettes/EnsembleKernel.Rmd index 04fab08c..e808163d 100644 --- a/vignettes/EnsembleKernel.Rmd +++ b/vignettes/EnsembleKernel.Rmd @@ -48,6 +48,7 @@ To begin, load the `stochtree` package and the `tgp` package which will serve as library(stochtree) library(tgp) library(MASS) +library(Matrix) library(mvtnorm) ``` @@ -96,20 +97,25 @@ sigma_leaf <- 1/num_trees X_train <- as.data.frame(X_train) X_test <- as.data.frame(X_test) colnames(X_train) <- colnames(X_test) <- "x1" -bart_params <- list(num_trees_mean=num_trees) +bart_params <- list(num_trees_mean=num_trees, sigma_leaf_init=sigma_leaf) bart_model <- bart(X_train=X_train, y_train=y_train, X_test=X_test, params = bart_params) # Extract kernels needed for kriging -result_kernels <- computeForestKernels(bart_model=bart_model, X_train=X_train, X_test=X_test) -Sigma_11 <- result_kernels$kernel_test -Sigma_12 <- result_kernels$kernel_test_train -Sigma_22 <- result_kernels$kernel_train -Sigma_22_inv <- ginv(Sigma_22) +leaf_mat_train <- computeForestLeafIndices(bart_model, X_train, forest_type = "mean", + forest_inds = bart_model$model_params$num_samples) +leaf_mat_test <- computeForestLeafIndices(bart_model, X_test, forest_type = "mean", + forest_inds = bart_model$model_params$num_samples) +W_train <- sparseMatrix(i=rep(1:length(y_train),num_trees), j=leaf_mat_train + 1, x=1) +W_test <- sparseMatrix(i=rep(1:length(y_test),num_trees), j=leaf_mat_test + 1, x=1) +Sigma_11 <- tcrossprod(W_test) / num_trees +Sigma_12 <- tcrossprod(W_test, W_train) / num_trees +Sigma_22 <- tcrossprod(W_train) / num_trees +Sigma_22_inv <- ginv(as.matrix(Sigma_22)) Sigma_21 <- t(Sigma_12) # Compute mean and covariance for the test set posterior mu_tilde <- Sigma_12 %*% Sigma_22_inv %*% y_train -Sigma_tilde <- (sigma_leaf)*(Sigma_11 - Sigma_12 %*% Sigma_22_inv %*% Sigma_21) +Sigma_tilde <- as.matrix((sigma_leaf)*(Sigma_11 - Sigma_12 %*% Sigma_22_inv %*% Sigma_21)) # Sample from f(X_test) | X_test, X_train, f(X_train) gp_samples <- mvtnorm::rmvnorm(1000, mean = mu_tilde, sigma = Sigma_tilde) @@ -172,16 +178,21 @@ bart_params <- list(num_trees_mean=num_trees) bart_model <- bart(X_train=X_train, y_train=y_train, X_test=X_test, params = bart_params) # Extract kernels needed for kriging -result_kernels <- computeForestKernels(bart_model=bart_model, X_train=X_train, X_test=X_test) -Sigma_11 <- result_kernels$kernel_test -Sigma_12 <- result_kernels$kernel_test_train -Sigma_22 <- result_kernels$kernel_train -Sigma_22_inv <- ginv(Sigma_22) +leaf_mat_train <- computeForestLeafIndices(bart_model, X_train, forest_type = "mean", + forest_inds = bart_model$model_params$num_samples) +leaf_mat_test <- computeForestLeafIndices(bart_model, X_test, forest_type = "mean", + forest_inds = bart_model$model_params$num_samples) +W_train <- sparseMatrix(i=rep(1:length(y_train),num_trees), j=leaf_mat_train + 1, x=1) +W_test <- sparseMatrix(i=rep(1:length(y_test),num_trees), j=leaf_mat_test + 1, x=1) +Sigma_11 <- tcrossprod(W_test) / num_trees +Sigma_12 <- tcrossprod(W_test, W_train) / num_trees +Sigma_22 <- tcrossprod(W_train) / num_trees +Sigma_22_inv <- ginv(as.matrix(Sigma_22)) Sigma_21 <- t(Sigma_12) # Compute mean and covariance for the test set posterior mu_tilde <- Sigma_12 %*% Sigma_22_inv %*% y_train -Sigma_tilde <- (sigma_leaf)*(Sigma_11 - Sigma_12 %*% Sigma_22_inv %*% Sigma_21) +Sigma_tilde <- as.matrix((sigma_leaf)*(Sigma_11 - Sigma_12 %*% Sigma_22_inv %*% Sigma_21)) # Sample from f(X_test) | X_test, X_train, f(X_train) gp_samples <- mvtnorm::rmvnorm(1000, mean = mu_tilde, sigma = Sigma_tilde) From e4004edf0a0542bbcc4d36b832afcc5f7eab6d51 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Tue, 15 Oct 2024 16:30:29 -0500 Subject: [PATCH 2/2] Refactored the earlier "ForestKernel" interface from R and C++ --- NAMESPACE | 3 +- R/cpp11.R | 28 ---- R/kernel.R | 267 ++++++++++-------------------------- _pkgdown.yml | 6 +- include/stochtree/kernel.h | 255 ---------------------------------- man/ForestKernel.Rd | 110 --------------- man/computeForestKernels.Rd | 46 ------- man/computeMaxLeafIndex.Rd | 39 ++++++ man/createForestKernel.Rd | 14 -- src/cpp11.cpp | 58 -------- src/kernel.cpp | 145 -------------------- src/stochtree_types.h | 1 - tools/debug/kernel_debug.R | 46 ------- 13 files changed, 117 insertions(+), 901 deletions(-) delete mode 100644 include/stochtree/kernel.h delete mode 100644 man/ForestKernel.Rd delete mode 100644 man/computeForestKernels.Rd create mode 100644 man/computeMaxLeafIndex.Rd delete mode 100644 man/createForestKernel.Rd delete mode 100644 tools/debug/kernel_debug.R diff --git a/NAMESPACE b/NAMESPACE index c2d9c133..640d7fe3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,8 +7,8 @@ S3method(predict,bcf) export(bart) export(bcf) export(calibrate_inverse_gamma_error_variance) -export(computeForestKernels) export(computeForestLeafIndices) +export(computeMaxLeafIndex) export(convertBARTModelToJson) export(convertBCFModelToJson) export(createBARTModelFromCombinedJson) @@ -26,7 +26,6 @@ export(createForestContainer) export(createForestCovariates) export(createForestCovariatesFromMetadata) export(createForestDataset) -export(createForestKernel) export(createForestModel) export(createOutcome) export(createRNG) diff --git a/R/cpp11.R b/R/cpp11.R index 77e801d5..177b80d5 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -340,38 +340,10 @@ predict_forest_raw_single_forest_cpp <- function(forest_samples, dataset, forest .Call(`_stochtree_predict_forest_raw_single_forest_cpp`, forest_samples, dataset, forest_num) } -forest_kernel_cpp <- function() { - .Call(`_stochtree_forest_kernel_cpp`) -} - forest_container_get_max_leaf_index_cpp <- function(forest_container, forest_num) { .Call(`_stochtree_forest_container_get_max_leaf_index_cpp`, forest_container, forest_num) } -forest_kernel_compute_leaf_indices_train_cpp <- function(forest_kernel, covariates_train, forest_container, forest_num) { - invisible(.Call(`_stochtree_forest_kernel_compute_leaf_indices_train_cpp`, forest_kernel, covariates_train, forest_container, forest_num)) -} - -forest_kernel_compute_leaf_indices_train_test_cpp <- function(forest_kernel, covariates_train, covariates_test, forest_container, forest_num) { - invisible(.Call(`_stochtree_forest_kernel_compute_leaf_indices_train_test_cpp`, forest_kernel, covariates_train, covariates_test, forest_container, forest_num)) -} - -forest_kernel_get_train_leaf_indices_cpp <- function(forest_kernel) { - .Call(`_stochtree_forest_kernel_get_train_leaf_indices_cpp`, forest_kernel) -} - -forest_kernel_get_test_leaf_indices_cpp <- function(forest_kernel) { - .Call(`_stochtree_forest_kernel_get_test_leaf_indices_cpp`, forest_kernel) -} - -forest_kernel_compute_kernel_train_cpp <- function(forest_kernel, covariates_train, forest_container, forest_num) { - .Call(`_stochtree_forest_kernel_compute_kernel_train_cpp`, forest_kernel, covariates_train, forest_container, forest_num) -} - -forest_kernel_compute_kernel_train_test_cpp <- function(forest_kernel, covariates_train, covariates_test, forest_container, forest_num) { - .Call(`_stochtree_forest_kernel_compute_kernel_train_test_cpp`, forest_kernel, covariates_train, covariates_test, forest_container, forest_num) -} - compute_leaf_indices_cpp <- function(forest_container, covariates, forest_nums) { .Call(`_stochtree_compute_leaf_indices_cpp`, forest_container, covariates, forest_nums) } diff --git a/R/kernel.R b/R/kernel.R index 234db6c4..947683da 100644 --- a/R/kernel.R +++ b/R/kernel.R @@ -1,195 +1,3 @@ -#' Class that provides functionality for statistical kernel definition and -#' computation based on shared leaf membership of observations in a tree ensemble. -#' -#' @description -#' Computes leaf membership internally as a sparse matrix and also calculates a -#' (dense) kernel based on the sparse matrix all in C++. - -ForestKernel <- R6::R6Class( - classname = "ForestKernel", - cloneable = FALSE, - public = list( - - #' @field forest_kernel_ptr External pointer to a C++ StochTree::ForestKernel class - forest_kernel_ptr = NULL, - - #' @description - #' Create a new ForestKernel object. - #' @return A new `ForestKernel` object. - initialize = function() { - # Initialize forest kernel and return external pointer to the class - self$forest_kernel_ptr <- forest_kernel_cpp() - }, - - #' @description - #' Compute the leaf indices of each tree in the ensemble for every observation in a dataset. - #' Stores the result internally, which can be extracted from the class via a call to `get_leaf_indices`. - #' @param covariates_train Matrix of training set covariates at which to compute leaf indices - #' @param covariates_test (Optional) Matrix of test set covariates at which to compute leaf indices - #' @param forest_container Object of type `ForestSamples` - #' @param forest_num Index of the forest in forest_container to be assessed - #' @return List of vectors. If `covariates_test = NULL` the list has one element (train set leaf indices), and - #' otherwise the list has two elements (train and test set leaf indices). - compute_leaf_indices = function(covariates_train, covariates_test = NULL, forest_container, forest_num) { - # Convert to matrix format if not provided as such - if ((is.null(dim(covariates_train))) && (!is.null(covariates_train))) { - covariates_train <- as.matrix(covariates_train) - } - if ((is.null(dim(covariates_test))) && (!is.null(covariates_test))) { - covariates_test <- as.matrix(covariates_test) - } - - # Compute the leaf indices - result = list() - if (is.null(covariates_test)) { - forest_kernel_compute_leaf_indices_train_cpp( - self$forest_kernel_ptr, covariates_train, - forest_container$forest_container_ptr, forest_num - ) - result[["leaf_indices_train"]] = forest_kernel_get_train_leaf_indices_cpp(self$forest_kernel_ptr) - } else { - forest_kernel_compute_leaf_indices_train_test_cpp( - self$forest_kernel_ptr, covariates_train, covariates_test, - forest_container$forest_container_ptr, forest_num - ) - result[["leaf_indices_train"]] = forest_kernel_get_train_leaf_indices_cpp(self$forest_kernel_ptr) - result[["leaf_indices_test"]] = forest_kernel_get_test_leaf_indices_cpp(self$forest_kernel_ptr) - } - return(result) - }, - - #' @description - #' Compute the kernel implied by a tree ensemble. This function calls `compute_leaf_indices`, - #' so it is not necessary to call both. `compute_leaf_indices` is exposed at the class level - #' to allow for extracting the vector of leaf indices for an ensemble directly in R. - #' @param covariates_train Matrix of training set covariates at which to assess ensemble kernel - #' @param covariates_test (Optional) Matrix of test set covariates at which to assess ensemble kernel - #' @param forest_container Object of type `ForestSamples` - #' @param forest_num Index of the forest in forest_container to be assessed - #' @return List of matrices. If `covariates_test = NULL`, the list contains - #' one `n_train` x `n_train` matrix, where `n_train = nrow(covariates_train)`. - #' This matrix is the kernel defined by `W_train %*% t(W_train)` where `W_train` - #' is a matrix with `n_train` rows and as many columns as there are total leaves in an ensemble. - #' If `covariates_test` is not `NULL`, the list contains two more matrices defined by - #' `W_test %*% t(W_train)` and `W_test %*% t(W_test)`. - compute_kernel = function(covariates_train, covariates_test = NULL, forest_container, forest_num) { - # Convert to matrix format if not provided as such - if ((is.null(dim(covariates_train))) && (!is.null(covariates_train))) { - covariates_train <- as.matrix(covariates_train) - } - if ((is.null(dim(covariates_test))) && (!is.null(covariates_test))) { - covariates_test <- as.matrix(covariates_test) - } - - # Compute the kernels - num_trees <- forest_container$num_trees() - n_train = nrow(covariates_train) - kernel_train = matrix(0., nrow = n_train, ncol = n_train) - inverse_kernel_train = matrix(0., nrow = n_train, ncol = n_train) - if (is.null(covariates_test)) { - result = forest_kernel_compute_kernel_train_cpp( - self$forest_kernel_ptr, covariates_train, - forest_container$forest_container_ptr, forest_num - ) - names(result) <- c("kernel_train") - } else { - n_test = nrow(covariates_test) - kernel_test_train = matrix(0., nrow = n_test, ncol = n_train) - kernel_test = matrix(0., nrow = n_test, ncol = n_test) - result = forest_kernel_compute_kernel_train_test_cpp( - self$forest_kernel_ptr, covariates_train, covariates_test, - forest_container$forest_container_ptr, forest_num - ) - names(result) <- c("kernel_train", "kernel_test_train", "kernel_test") - } - - # Divide each matrix by num_trees - for (i in 1:length(result)) {result[[i]] <- result[[i]] / num_trees} - - return(result) - } - ) -) - -#' Create a `ForestKernel` object -#' -#' @return `ForestKernel` object -#' @export -createForestKernel <- function() { - return(invisible(( - ForestKernel$new() - ))) -} - -#' Compute a kernel from a tree ensemble, defined by the fraction -#' of trees of an ensemble in which two observations fall into the -#' same leaf. -#' -#' @param bart_model Object of type `bartmodel` corresponding to a BART model with at least one sample -#' @param X_train "Training" dataframe. In a traditional Gaussian process kriging context, this -#' corresponds to the observations for which outcomes are observed. -#' @param X_test (Optional) "Test" dataframe. In a traditional Gaussian process kriging context, this -#' corresponds to the observations for which outcomes are unobserved and must be estimated -#' based on the kernels k(X_test,X_test), k(X_test,X_train), and k(X_train,X_train). If not provided, -#' this function will only compute k(X_train, X_train). -#' @param forest_num (Optional) Index of the forest sample to use for kernel computation. If not provided, -#' this function will use the last forest. -#' @param forest_type (Optional) Whether to compute the kernel from the mean or variance forest. Default: "mean". Specify "variance" for the variance forest. -#' All other inputs are invalid. Must have sampled the relevant forest or an error will occur. -#' @return List of kernel matrices. If `X_test = NULL`, the list contains -#' one `n_train` x `n_train` matrix, where `n_train = nrow(X_train)`. -#' This matrix is the kernel defined by `W_train %*% t(W_train)` where `W_train` -#' is a matrix with `n_train` rows and as many columns as there are total leaves in an ensemble. -#' If `X_test` is not `NULL`, the list contains two more matrices defined by -#' `W_test %*% t(W_train)` and `W_test %*% t(W_test)`. -#' @export -computeForestKernels <- function(bart_model, X_train, X_test=NULL, forest_num=NULL, forest_type="mean") { - stopifnot(class(bart_model)=="bartmodel") - if (forest_type=="mean") { - if (!bart_model$model_params$include_mean_forest) { - stop("Mean forest was not sampled in the bart model provided") - } - } else if (forest_type=="variance") { - if (!bart_model$model_params$include_variance_forest) { - stop("Variance forest was not sampled in the bart model provided") - } - } else { - stop("Must provide either 'mean' or 'variance' for the `forest_type` parameter") - } - - # Preprocess covariates - if (!is.data.frame(X_train)) { - stop("X_train must be a dataframe") - } - if (!is.data.frame(X_test)) { - stop("X_test must be a dataframe") - } - train_set_metadata <- bart_model$train_set_metadata - X_train <- preprocessPredictionDataFrame(X_train, train_set_metadata) - X_test <- preprocessPredictionDataFrame(X_test, train_set_metadata) - - # Data checks - stopifnot(bart_model$model_params$num_covariates == ncol(X_train)) - stopifnot(bart_model$model_params$num_covariates == ncol(X_test)) - - # Initialize and compute kernel - forest_kernel <- createForestKernel() - num_samples <- bart_model$model_params$num_samples - stopifnot(forest_num <= num_samples) - sample_index <- ifelse(is.null(forest_num), num_samples-1, forest_num-1) - if (forest_type=="mean") { - return(forest_kernel$compute_kernel( - covariates_train = X_train, covariates_test = X_test, - forest_container = bart_model$mean_forests, forest_num = sample_index - )) - } else if (forest_type=="variance") { - return(forest_kernel$compute_kernel( - covariates_train = X_train, covariates_test = X_test, - forest_container = bart_model$variance_forests, forest_num = sample_index - )) - } -} - #' Compute and return a vector representation of a forest's leaf predictions for #' every observation in a dataset. #' The vector has a "column-major" format that can be easily re-represented as @@ -279,3 +87,78 @@ computeForestLeafIndices <- function(model_object, covariates, forest_type, fore return(leaf_ind_matrix) } + +#' Compute and return the largest possible leaf index computable by `computeForestLeafIndices` for the forests in a designated forest sample container. +#' +#' @param model_object Object of type `bartmodel` or `bcf` corresponding to a BART / BCF model with at least one forest sample +#' @param covariates Covariates to use for prediction. Must have the same dimensions / column types as the data used to train a forest. +#' @param forest_type Which forest to use from `model_object`. +#' Valid inputs depend on the model type, and whether or not a +#' +#' **1. BART** +#' +#' - `'mean'`: Extracts leaf indices for the mean forest +#' - `'variance'`: Extracts leaf indices for the variance forest +#' +#' **2. BCF** +#' +#' - `'prognostic'`: Extracts leaf indices for the prognostic forest +#' - `'treatment'`: Extracts leaf indices for the treatment effect forest +#' - `'variance'`: Extracts leaf indices for the variance forest +#' +#' @param forest_inds (Optional) Indices of the forest sample(s) for which to compute leaf indices. If not provided, +#' this function will return leaf indices for every sample of a forest. +#' This function uses 1-indexing, so the first forest sample corresponds to `forest_num = 1`, and so on. +#' @return Vector containing the largest possible leaf index computable by `computeForestLeafIndices` for the forests in a designated forest sample container. +#' @export +computeMaxLeafIndex <- function(model_object, covariates, forest_type, forest_inds=NULL) { + # Extract relevant forest container + stopifnot(class(model_object) %in% c("bartmodel", "bcf")) + model_type <- ifelse(class(model_object)=="bartmodel", "bart", "bcf") + if (model_type == "bart") { + stopifnot(forest_type %in% c("mean", "variance")) + if (forest_type=="mean") { + if (!model_object$model_params$include_mean_forest) { + stop("Mean forest was not sampled in the bart model provided") + } + forest_container <- model_object$mean_forests + } else if (forest_type=="variance") { + if (!model_object$model_params$include_variance_forest) { + stop("Variance forest was not sampled in the bart model provided") + } + forest_container <- model_object$variance_forests + } + } else { + stopifnot(forest_type %in% c("prognostic", "treatment", "variance")) + if (forest_type=="prognostic") { + forest_container <- model_object$forests_mu + } else if (forest_type=="treatment") { + forest_container <- model_object$forests_tau + } else if (forest_type=="variance") { + if (!model_object$model_params$include_variance_forest) { + stop("Variance forest was not sampled in the bcf model provided") + } + forest_container <- model_object$variance_forests + } + } + + # Preprocess forest indices + num_forests <- forest_container$num_samples() + if (is.null(forest_inds)) { + forest_inds <- as.integer(1:num_forests - 1) + } else { + stopifnot(all(forest_inds <= num_forests)) + stopifnot(all(forest_inds >= 1)) + forest_inds <- as.integer(forest_inds - 1) + } + + # Compute leaf indices + output <- rep(NA, length(forest_inds)) + for (i in 1:length(forest_inds)) { + output[i] <- forest_container_get_max_leaf_index_cpp( + forest_container$forest_container_ptr,forest_inds[i] + ) + } + + return(output) +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 8845ca48..f469d4f9 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -81,13 +81,13 @@ reference: - createForestModel - ForestSamples - createForestContainer - - ForestKernel - - createForestKernel - CppRNG - createRNG - calibrate_inverse_gamma_error_variance - preprocessBartParams - preprocessBcfParams + - computeForestLeafIndices + - computeMaxLeafIndex - subtitle: Random Effects desc: > @@ -105,8 +105,6 @@ reference: - sample_sigma2_one_iteration - sample_tau_one_iteration - sample_tau_one_iteration - - computeForestKernels - - computeForestLeafIndices - title: Package info desc: > diff --git a/include/stochtree/kernel.h b/include/stochtree/kernel.h deleted file mode 100644 index 747b953c..00000000 --- a/include/stochtree/kernel.h +++ /dev/null @@ -1,255 +0,0 @@ -/*! - * Copyright (c) 2024 stochtree authors. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifndef STOCHTREE_TREE_KERNEL_H_ -#define STOCHTREE_TREE_KERNEL_H_ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace StochTree { - -typedef Eigen::Map> KernelMatrixType; - -class ForestKernel { - public: - ForestKernel() {} - ~ForestKernel() {} - - void ComputeLeafIndices(Eigen::MatrixXd& covariates, TreeEnsemble& forest) { - num_train_observations_ = covariates.rows(); - num_trees_ = forest.NumTrees(); - train_leaf_index_vector_.resize(num_train_observations_*num_trees_); - forest.PredictLeafIndicesInplace(covariates, train_leaf_index_vector_, num_trees_, num_train_observations_); - int max_cols = *std::max_element(train_leaf_index_vector_.begin(), train_leaf_index_vector_.end()); - train_leaf_index_matrix_ = Eigen::SparseMatrix(num_train_observations_,max_cols+1); - int col_num; - for (data_size_t i = 0; i < num_train_observations_; i++) { - for (int j = 0; j < num_trees_; j++) { - col_num = train_leaf_index_vector_.at(j*num_train_observations_ + i); - train_leaf_index_matrix_.insert(i,col_num) = 1.; - } - } - train_leaf_indices_stored_ = true; - } - - void ComputeLeafIndices(KernelMatrixType& covariates, TreeEnsemble& forest) { - num_train_observations_ = covariates.rows(); - num_trees_ = forest.NumTrees(); - train_leaf_index_vector_.resize(num_train_observations_*num_trees_); - forest.PredictLeafIndicesInplace(covariates, train_leaf_index_vector_, num_trees_, num_train_observations_); - int max_cols = *std::max_element(train_leaf_index_vector_.begin(), train_leaf_index_vector_.end()); - train_leaf_index_matrix_ = Eigen::SparseMatrix(num_train_observations_,max_cols+1); - int col_num; - for (data_size_t i = 0; i < num_train_observations_; i++) { - for (int j = 0; j < num_trees_; j++) { - col_num = train_leaf_index_vector_.at(j*num_train_observations_ + i); - train_leaf_index_matrix_.insert(i,col_num) = 1.; - } - } - train_leaf_indices_stored_ = true; - } - - void ComputeLeafIndices(Eigen::MatrixXd& covariates_train, Eigen::MatrixXd& covariates_test, TreeEnsemble& forest) { - CHECK_EQ(covariates_train.cols(), covariates_test.cols()); - num_train_observations_ = covariates_train.rows(); - num_test_observations_ = covariates_test.rows(); - num_trees_ = forest.NumTrees(); - train_leaf_index_vector_.resize(num_train_observations_*num_trees_); - test_leaf_index_vector_.resize(num_test_observations_*num_trees_); - forest.PredictLeafIndicesInplace(covariates_train, train_leaf_index_vector_, num_trees_, num_train_observations_); - forest.PredictLeafIndicesInplace(covariates_test, test_leaf_index_vector_, num_trees_, num_test_observations_); - int max_cols_train = *std::max_element(train_leaf_index_vector_.begin(), train_leaf_index_vector_.end()); - int max_cols_test = *std::max_element(test_leaf_index_vector_.begin(), test_leaf_index_vector_.end()); - int max_cols = max_cols_train > max_cols_test ? max_cols_train : max_cols_test; - train_leaf_index_matrix_ = Eigen::SparseMatrix(num_train_observations_,max_cols+1); - test_leaf_index_matrix_ = Eigen::SparseMatrix(num_test_observations_,max_cols+1); - int col_num; - for (data_size_t i = 0; i < num_train_observations_; i++) { - for (int j = 0; j < num_trees_; j++) { - col_num = train_leaf_index_vector_.at(j*num_train_observations_ + i); - train_leaf_index_matrix_.insert(i,col_num) = 1.; - } - } - train_leaf_indices_stored_ = true; - for (data_size_t i = 0; i < num_test_observations_; i++) { - for (int j = 0; j < num_trees_; j++) { - col_num = test_leaf_index_vector_.at(j*num_test_observations_ + i); - test_leaf_index_matrix_.insert(i,col_num) = 1.; - } - } - test_leaf_indices_stored_ = true; - } - - void ComputeLeafIndices(KernelMatrixType& covariates_train, KernelMatrixType& covariates_test, TreeEnsemble& forest) { - CHECK_EQ(covariates_train.cols(), covariates_test.cols()); - num_train_observations_ = covariates_train.rows(); - num_test_observations_ = covariates_test.rows(); - num_trees_ = forest.NumTrees(); - train_leaf_index_vector_.resize(num_train_observations_*num_trees_); - test_leaf_index_vector_.resize(num_test_observations_*num_trees_); - forest.PredictLeafIndicesInplace(covariates_train, train_leaf_index_vector_, num_trees_, num_train_observations_); - forest.PredictLeafIndicesInplace(covariates_test, test_leaf_index_vector_, num_trees_, num_test_observations_); - int max_cols_train = *std::max_element(train_leaf_index_vector_.begin(), train_leaf_index_vector_.end()); - int max_cols_test = *std::max_element(test_leaf_index_vector_.begin(), test_leaf_index_vector_.end()); - int max_cols = max_cols_train > max_cols_test ? max_cols_train : max_cols_test; - train_leaf_index_matrix_ = Eigen::SparseMatrix(num_train_observations_,max_cols+1); - test_leaf_index_matrix_ = Eigen::SparseMatrix(num_test_observations_,max_cols+1); - int col_num; - for (data_size_t i = 0; i < num_train_observations_; i++) { - for (int j = 0; j < num_trees_; j++) { - col_num = train_leaf_index_vector_.at(j*num_train_observations_ + i); - train_leaf_index_matrix_.insert(i,col_num) = 1.; - } - } - train_leaf_indices_stored_ = true; - for (data_size_t i = 0; i < num_test_observations_; i++) { - for (int j = 0; j < num_trees_; j++) { - col_num = test_leaf_index_vector_.at(j*num_test_observations_ + i); - test_leaf_index_matrix_.insert(i,col_num) = 1.; - } - } - test_leaf_indices_stored_ = true; - } - - void ComputeKernel(Eigen::MatrixXd& covariates, TreeEnsemble& forest) { - ComputeLeafIndices(covariates, forest); - tree_kernel_train_ = train_leaf_index_matrix_ * train_leaf_index_matrix_.transpose(); - train_kernel_stored_ = true; - } - - void ComputeKernel(KernelMatrixType& covariates, TreeEnsemble& forest) { - ComputeLeafIndices(covariates, forest); - tree_kernel_train_ = train_leaf_index_matrix_ * train_leaf_index_matrix_.transpose(); - train_kernel_stored_ = true; - } - - void ComputeKernelExternal(Eigen::MatrixXd& covariates, TreeEnsemble& forest, KernelMatrixType& kernel_map) { - ComputeLeafIndices(covariates, forest); - kernel_map = train_leaf_index_matrix_ * train_leaf_index_matrix_.transpose(); - } - - void ComputeKernelExternal(KernelMatrixType& covariates, TreeEnsemble& forest, KernelMatrixType& kernel_map) { - ComputeLeafIndices(covariates, forest); - kernel_map = train_leaf_index_matrix_ * train_leaf_index_matrix_.transpose(); - } - - void ComputeKernel(Eigen::MatrixXd& covariates_train, Eigen::MatrixXd& covariates_test, TreeEnsemble& forest) { - ComputeLeafIndices(covariates_train, covariates_test, forest); - tree_kernel_train_ = train_leaf_index_matrix_ * train_leaf_index_matrix_.transpose(); - train_kernel_stored_ = true; - tree_kernel_test_train_ = test_leaf_index_matrix_ * train_leaf_index_matrix_.transpose(); - tree_kernel_test_ = test_leaf_index_matrix_ * test_leaf_index_matrix_.transpose(); - test_kernel_stored_ = true; - } - - void ComputeKernel(KernelMatrixType& covariates_train, KernelMatrixType& covariates_test, TreeEnsemble& forest) { - ComputeLeafIndices(covariates_train, covariates_test, forest); - tree_kernel_train_ = train_leaf_index_matrix_ * train_leaf_index_matrix_.transpose(); - train_kernel_stored_ = true; - tree_kernel_test_train_ = test_leaf_index_matrix_ * train_leaf_index_matrix_.transpose(); - tree_kernel_test_ = test_leaf_index_matrix_ * test_leaf_index_matrix_.transpose(); - test_kernel_stored_ = true; - } - - void ComputeKernelExternal(Eigen::MatrixXd& covariates_train, Eigen::MatrixXd& covariates_test, TreeEnsemble& forest, - KernelMatrixType& kernel_map_train, KernelMatrixType& kernel_map_test_train, KernelMatrixType& kernel_map_test) { - ComputeLeafIndices(covariates_train, covariates_test, forest); - kernel_map_train = train_leaf_index_matrix_ * train_leaf_index_matrix_.transpose(); - kernel_map_test_train = test_leaf_index_matrix_ * train_leaf_index_matrix_.transpose(); - kernel_map_test = test_leaf_index_matrix_ * test_leaf_index_matrix_.transpose(); - } - - void ComputeKernelExternal(KernelMatrixType& covariates_train, KernelMatrixType& covariates_test, TreeEnsemble& forest, - KernelMatrixType& kernel_map_train, KernelMatrixType& kernel_map_test_train, KernelMatrixType& kernel_map_test) { - ComputeLeafIndices(covariates_train, covariates_test, forest); - kernel_map_train = train_leaf_index_matrix_ * train_leaf_index_matrix_.transpose(); - kernel_map_test_train = test_leaf_index_matrix_ * train_leaf_index_matrix_.transpose(); - kernel_map_test = test_leaf_index_matrix_ * test_leaf_index_matrix_.transpose(); - } - - std::vector& GetTrainLeafIndices() { - CHECK(train_leaf_indices_stored_); - return train_leaf_index_vector_; - } - - std::vector& GetTestLeafIndices() { - CHECK(test_leaf_indices_stored_); - return test_leaf_index_vector_; - } - - Eigen::MatrixXd& GetTrainKernel() { - CHECK(train_kernel_stored_); - return tree_kernel_train_; - } - - Eigen::MatrixXd& GetTestTrainKernel() { - CHECK(test_kernel_stored_); - return tree_kernel_test_train_; - } - - Eigen::MatrixXd& GetTestKernel() { - CHECK(test_kernel_stored_); - return tree_kernel_test_; - } - - data_size_t NumTrainObservations() { - return num_train_observations_; - } - - data_size_t NumTestObservations() { - return num_test_observations_; - } - - int NumTrees() { - return num_trees_; - } - - bool HasTrainLeafIndices() { - return train_leaf_indices_stored_; - } - - bool HasTestLeafIndices() { - return test_leaf_indices_stored_; - } - - bool HasTrainKernel() { - return train_kernel_stored_; - } - - bool HasTestKernel() { - return test_kernel_stored_; - } - - private: - data_size_t num_train_observations_{0}; - data_size_t num_test_observations_{0}; - int num_trees_{0}; - std::vector train_leaf_index_vector_; - std::vector test_leaf_index_vector_; - Eigen::SparseMatrix train_leaf_index_matrix_; - Eigen::SparseMatrix test_leaf_index_matrix_; - Eigen::MatrixXd tree_kernel_train_; - Eigen::MatrixXd tree_kernel_test_train_; - Eigen::MatrixXd tree_kernel_test_; - bool train_leaf_indices_stored_{false}; - bool test_leaf_indices_stored_{false}; - bool train_kernel_stored_{false}; - bool test_kernel_stored_{false}; -}; - -} // namespace StochTree - -#endif // STOCHTREE_TREE_KERNEL_H_ \ No newline at end of file diff --git a/man/ForestKernel.Rd b/man/ForestKernel.Rd deleted file mode 100644 index a721b369..00000000 --- a/man/ForestKernel.Rd +++ /dev/null @@ -1,110 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/kernel.R -\name{ForestKernel} -\alias{ForestKernel} -\title{Class that provides functionality for statistical kernel definition and -computation based on shared leaf membership of observations in a tree ensemble.} -\description{ -Computes leaf membership internally as a sparse matrix and also calculates a -(dense) kernel based on the sparse matrix all in C++. -} -\section{Public fields}{ -\if{html}{\out{
}} -\describe{ -\item{\code{forest_kernel_ptr}}{External pointer to a C++ StochTree::ForestKernel class} -} -\if{html}{\out{
}} -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-ForestKernel-new}{\code{ForestKernel$new()}} -\item \href{#method-ForestKernel-compute_leaf_indices}{\code{ForestKernel$compute_leaf_indices()}} -\item \href{#method-ForestKernel-compute_kernel}{\code{ForestKernel$compute_kernel()}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-ForestKernel-new}{}}} -\subsection{Method \code{new()}}{ -Create a new ForestKernel object. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{ForestKernel$new()}\if{html}{\out{
}} -} - -\subsection{Returns}{ -A new \code{ForestKernel} object. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-ForestKernel-compute_leaf_indices}{}}} -\subsection{Method \code{compute_leaf_indices()}}{ -Compute the leaf indices of each tree in the ensemble for every observation in a dataset. -Stores the result internally, which can be extracted from the class via a call to \code{get_leaf_indices}. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{ForestKernel$compute_leaf_indices( - covariates_train, - covariates_test = NULL, - forest_container, - forest_num -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{covariates_train}}{Matrix of training set covariates at which to compute leaf indices} - -\item{\code{covariates_test}}{(Optional) Matrix of test set covariates at which to compute leaf indices} - -\item{\code{forest_container}}{Object of type \code{ForestSamples}} - -\item{\code{forest_num}}{Index of the forest in forest_container to be assessed} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -List of vectors. If \code{covariates_test = NULL} the list has one element (train set leaf indices), and -otherwise the list has two elements (train and test set leaf indices). -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-ForestKernel-compute_kernel}{}}} -\subsection{Method \code{compute_kernel()}}{ -Compute the kernel implied by a tree ensemble. This function calls \code{compute_leaf_indices}, -so it is not necessary to call both. \code{compute_leaf_indices} is exposed at the class level -to allow for extracting the vector of leaf indices for an ensemble directly in R. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{ForestKernel$compute_kernel( - covariates_train, - covariates_test = NULL, - forest_container, - forest_num -)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{covariates_train}}{Matrix of training set covariates at which to assess ensemble kernel} - -\item{\code{covariates_test}}{(Optional) Matrix of test set covariates at which to assess ensemble kernel} - -\item{\code{forest_container}}{Object of type \code{ForestSamples}} - -\item{\code{forest_num}}{Index of the forest in forest_container to be assessed} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -List of matrices. If \code{covariates_test = NULL}, the list contains -one \code{n_train} x \code{n_train} matrix, where \code{n_train = nrow(covariates_train)}. -This matrix is the kernel defined by \code{W_train \%*\% t(W_train)} where \code{W_train} -is a matrix with \code{n_train} rows and as many columns as there are total leaves in an ensemble. -If \code{covariates_test} is not \code{NULL}, the list contains two more matrices defined by -\code{W_test \%*\% t(W_train)} and \code{W_test \%*\% t(W_test)}. -} -} -} diff --git a/man/computeForestKernels.Rd b/man/computeForestKernels.Rd deleted file mode 100644 index f9a02062..00000000 --- a/man/computeForestKernels.Rd +++ /dev/null @@ -1,46 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/kernel.R -\name{computeForestKernels} -\alias{computeForestKernels} -\title{Compute a kernel from a tree ensemble, defined by the fraction -of trees of an ensemble in which two observations fall into the -same leaf.} -\usage{ -computeForestKernels( - bart_model, - X_train, - X_test = NULL, - forest_num = NULL, - forest_type = "mean" -) -} -\arguments{ -\item{bart_model}{Object of type \code{bartmodel} corresponding to a BART model with at least one sample} - -\item{X_train}{"Training" dataframe. In a traditional Gaussian process kriging context, this -corresponds to the observations for which outcomes are observed.} - -\item{X_test}{(Optional) "Test" dataframe. In a traditional Gaussian process kriging context, this -corresponds to the observations for which outcomes are unobserved and must be estimated -based on the kernels k(X_test,X_test), k(X_test,X_train), and k(X_train,X_train). If not provided, -this function will only compute k(X_train, X_train).} - -\item{forest_num}{(Optional) Index of the forest sample to use for kernel computation. If not provided, -this function will use the last forest.} - -\item{forest_type}{(Optional) Whether to compute the kernel from the mean or variance forest. Default: "mean". Specify "variance" for the variance forest. -All other inputs are invalid. Must have sampled the relevant forest or an error will occur.} -} -\value{ -List of kernel matrices. If \code{X_test = NULL}, the list contains -one \code{n_train} x \code{n_train} matrix, where \code{n_train = nrow(X_train)}. -This matrix is the kernel defined by \code{W_train \%*\% t(W_train)} where \code{W_train} -is a matrix with \code{n_train} rows and as many columns as there are total leaves in an ensemble. -If \code{X_test} is not \code{NULL}, the list contains two more matrices defined by -\code{W_test \%*\% t(W_train)} and \code{W_test \%*\% t(W_test)}. -} -\description{ -Compute a kernel from a tree ensemble, defined by the fraction -of trees of an ensemble in which two observations fall into the -same leaf. -} diff --git a/man/computeMaxLeafIndex.Rd b/man/computeMaxLeafIndex.Rd new file mode 100644 index 00000000..c941acb4 --- /dev/null +++ b/man/computeMaxLeafIndex.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/kernel.R +\name{computeMaxLeafIndex} +\alias{computeMaxLeafIndex} +\title{Compute and return the largest possible leaf index computable by \code{computeForestLeafIndices} for the forests in a designated forest sample container.} +\usage{ +computeMaxLeafIndex(model_object, covariates, forest_type, forest_inds = NULL) +} +\arguments{ +\item{model_object}{Object of type \code{bartmodel} or \code{bcf} corresponding to a BART / BCF model with at least one forest sample} + +\item{covariates}{Covariates to use for prediction. Must have the same dimensions / column types as the data used to train a forest.} + +\item{forest_type}{Which forest to use from \code{model_object}. +Valid inputs depend on the model type, and whether or not a + +\strong{1. BART} +\itemize{ +\item \code{'mean'}: Extracts leaf indices for the mean forest +\item \code{'variance'}: Extracts leaf indices for the variance forest +} + +\strong{2. BCF} +\itemize{ +\item \code{'prognostic'}: Extracts leaf indices for the prognostic forest +\item \code{'treatment'}: Extracts leaf indices for the treatment effect forest +\item \code{'variance'}: Extracts leaf indices for the variance forest +}} + +\item{forest_inds}{(Optional) Indices of the forest sample(s) for which to compute leaf indices. If not provided, +this function will return leaf indices for every sample of a forest. +This function uses 1-indexing, so the first forest sample corresponds to \code{forest_num = 1}, and so on.} +} +\value{ +Vector containing the largest possible leaf index computable by \code{computeForestLeafIndices} for the forests in a designated forest sample container. +} +\description{ +Compute and return the largest possible leaf index computable by \code{computeForestLeafIndices} for the forests in a designated forest sample container. +} diff --git a/man/createForestKernel.Rd b/man/createForestKernel.Rd deleted file mode 100644 index 36cf73cb..00000000 --- a/man/createForestKernel.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/kernel.R -\name{createForestKernel} -\alias{createForestKernel} -\title{Create a \code{ForestKernel} object} -\usage{ -createForestKernel() -} -\value{ -\code{ForestKernel} object -} -\description{ -Create a \code{ForestKernel} object -} diff --git a/src/cpp11.cpp b/src/cpp11.cpp index b001582e..e9145f26 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -633,13 +633,6 @@ extern "C" SEXP _stochtree_predict_forest_raw_single_forest_cpp(SEXP forest_samp END_CPP11 } // kernel.cpp -cpp11::external_pointer forest_kernel_cpp(); -extern "C" SEXP _stochtree_forest_kernel_cpp() { - BEGIN_CPP11 - return cpp11::as_sexp(forest_kernel_cpp()); - END_CPP11 -} -// kernel.cpp int forest_container_get_max_leaf_index_cpp(cpp11::external_pointer forest_container, int forest_num); extern "C" SEXP _stochtree_forest_container_get_max_leaf_index_cpp(SEXP forest_container, SEXP forest_num) { BEGIN_CPP11 @@ -647,50 +640,6 @@ extern "C" SEXP _stochtree_forest_container_get_max_leaf_index_cpp(SEXP forest_c END_CPP11 } // kernel.cpp -void forest_kernel_compute_leaf_indices_train_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::external_pointer forest_container, int forest_num); -extern "C" SEXP _stochtree_forest_kernel_compute_leaf_indices_train_cpp(SEXP forest_kernel, SEXP covariates_train, SEXP forest_container, SEXP forest_num) { - BEGIN_CPP11 - forest_kernel_compute_leaf_indices_train_cpp(cpp11::as_cpp>>(forest_kernel), cpp11::as_cpp>>(covariates_train), cpp11::as_cpp>>(forest_container), cpp11::as_cpp>(forest_num)); - return R_NilValue; - END_CPP11 -} -// kernel.cpp -void forest_kernel_compute_leaf_indices_train_test_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::doubles_matrix<> covariates_test, cpp11::external_pointer forest_container, int forest_num); -extern "C" SEXP _stochtree_forest_kernel_compute_leaf_indices_train_test_cpp(SEXP forest_kernel, SEXP covariates_train, SEXP covariates_test, SEXP forest_container, SEXP forest_num) { - BEGIN_CPP11 - forest_kernel_compute_leaf_indices_train_test_cpp(cpp11::as_cpp>>(forest_kernel), cpp11::as_cpp>>(covariates_train), cpp11::as_cpp>>(covariates_test), cpp11::as_cpp>>(forest_container), cpp11::as_cpp>(forest_num)); - return R_NilValue; - END_CPP11 -} -// kernel.cpp -cpp11::writable::integers forest_kernel_get_train_leaf_indices_cpp(cpp11::external_pointer forest_kernel); -extern "C" SEXP _stochtree_forest_kernel_get_train_leaf_indices_cpp(SEXP forest_kernel) { - BEGIN_CPP11 - return cpp11::as_sexp(forest_kernel_get_train_leaf_indices_cpp(cpp11::as_cpp>>(forest_kernel))); - END_CPP11 -} -// kernel.cpp -cpp11::writable::integers forest_kernel_get_test_leaf_indices_cpp(cpp11::external_pointer forest_kernel); -extern "C" SEXP _stochtree_forest_kernel_get_test_leaf_indices_cpp(SEXP forest_kernel) { - BEGIN_CPP11 - return cpp11::as_sexp(forest_kernel_get_test_leaf_indices_cpp(cpp11::as_cpp>>(forest_kernel))); - END_CPP11 -} -// kernel.cpp -cpp11::list forest_kernel_compute_kernel_train_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::external_pointer forest_container, int forest_num); -extern "C" SEXP _stochtree_forest_kernel_compute_kernel_train_cpp(SEXP forest_kernel, SEXP covariates_train, SEXP forest_container, SEXP forest_num) { - BEGIN_CPP11 - return cpp11::as_sexp(forest_kernel_compute_kernel_train_cpp(cpp11::as_cpp>>(forest_kernel), cpp11::as_cpp>>(covariates_train), cpp11::as_cpp>>(forest_container), cpp11::as_cpp>(forest_num))); - END_CPP11 -} -// kernel.cpp -cpp11::list forest_kernel_compute_kernel_train_test_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::doubles_matrix<> covariates_test, cpp11::external_pointer forest_container, int forest_num); -extern "C" SEXP _stochtree_forest_kernel_compute_kernel_train_test_cpp(SEXP forest_kernel, SEXP covariates_train, SEXP covariates_test, SEXP forest_container, SEXP forest_num) { - BEGIN_CPP11 - return cpp11::as_sexp(forest_kernel_compute_kernel_train_test_cpp(cpp11::as_cpp>>(forest_kernel), cpp11::as_cpp>>(covariates_train), cpp11::as_cpp>>(covariates_test), cpp11::as_cpp>>(forest_container), cpp11::as_cpp>(forest_num))); - END_CPP11 -} -// kernel.cpp cpp11::writable::integers_matrix<> compute_leaf_indices_cpp(cpp11::external_pointer forest_container, cpp11::doubles_matrix<> covariates, cpp11::integers forest_nums); extern "C" SEXP _stochtree_compute_leaf_indices_cpp(SEXP forest_container, SEXP covariates, SEXP forest_nums) { BEGIN_CPP11 @@ -1019,13 +968,6 @@ static const R_CallMethodDef CallEntries[] = { {"_stochtree_forest_dataset_add_covariates_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_covariates_cpp, 2}, {"_stochtree_forest_dataset_add_weights_cpp", (DL_FUNC) &_stochtree_forest_dataset_add_weights_cpp, 2}, {"_stochtree_forest_dataset_update_basis_cpp", (DL_FUNC) &_stochtree_forest_dataset_update_basis_cpp, 2}, - {"_stochtree_forest_kernel_compute_kernel_train_cpp", (DL_FUNC) &_stochtree_forest_kernel_compute_kernel_train_cpp, 4}, - {"_stochtree_forest_kernel_compute_kernel_train_test_cpp", (DL_FUNC) &_stochtree_forest_kernel_compute_kernel_train_test_cpp, 5}, - {"_stochtree_forest_kernel_compute_leaf_indices_train_cpp", (DL_FUNC) &_stochtree_forest_kernel_compute_leaf_indices_train_cpp, 4}, - {"_stochtree_forest_kernel_compute_leaf_indices_train_test_cpp", (DL_FUNC) &_stochtree_forest_kernel_compute_leaf_indices_train_test_cpp, 5}, - {"_stochtree_forest_kernel_cpp", (DL_FUNC) &_stochtree_forest_kernel_cpp, 0}, - {"_stochtree_forest_kernel_get_test_leaf_indices_cpp", (DL_FUNC) &_stochtree_forest_kernel_get_test_leaf_indices_cpp, 1}, - {"_stochtree_forest_kernel_get_train_leaf_indices_cpp", (DL_FUNC) &_stochtree_forest_kernel_get_train_leaf_indices_cpp, 1}, {"_stochtree_forest_tracker_cpp", (DL_FUNC) &_stochtree_forest_tracker_cpp, 4}, {"_stochtree_get_forest_split_counts_forest_container_cpp", (DL_FUNC) &_stochtree_get_forest_split_counts_forest_container_cpp, 3}, {"_stochtree_get_granular_split_count_array_forest_container_cpp", (DL_FUNC) &_stochtree_get_granular_split_count_array_forest_container_cpp, 2}, diff --git a/src/kernel.cpp b/src/kernel.cpp index 6a9428a0..3a39dbb5 100644 --- a/src/kernel.cpp +++ b/src/kernel.cpp @@ -1,164 +1,19 @@ #include #include "stochtree_types.h" -#include #include #include #include #include #include -typedef Eigen::Map> KernelMatrixType; typedef Eigen::Map> DoubleMatrixType; typedef Eigen::Map> IntMatrixType; -[[cpp11::register]] -cpp11::external_pointer forest_kernel_cpp() { - // Create smart pointer to newly allocated object - std::unique_ptr forest_kernel_ptr_ = std::make_unique(); - - // Release management of the pointer to R session - return cpp11::external_pointer(forest_kernel_ptr_.release()); -} - [[cpp11::register]] int forest_container_get_max_leaf_index_cpp(cpp11::external_pointer forest_container, int forest_num) { return forest_container->GetEnsemble(forest_num)->GetMaxLeafIndex(); } -[[cpp11::register]] -void forest_kernel_compute_leaf_indices_train_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, - cpp11::external_pointer forest_container, int forest_num) { - // Wrap an Eigen Map around the raw data of the covariate_train matrix - StochTree::data_size_t n_train = covariates_train.nrow(); - int num_covariates_train = covariates_train.ncol(); - double* covariate_train_data_ptr = REAL(PROTECT(covariates_train)); - KernelMatrixType covariates_train_eigen(covariate_train_data_ptr, n_train, num_covariates_train); - - // Compute leaf indices - forest_kernel->ComputeLeafIndices(covariates_train_eigen, *(forest_container->GetEnsemble(forest_num))); - - // Unprotect pointers to R data - UNPROTECT(1); -} - -[[cpp11::register]] -void forest_kernel_compute_leaf_indices_train_test_cpp(cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, cpp11::doubles_matrix<> covariates_test, - cpp11::external_pointer forest_container, int forest_num) { - // Wrap an Eigen Map around the raw data of the covariate_train matrix - StochTree::data_size_t n_train = covariates_train.nrow(); - int num_covariates_train = covariates_train.ncol(); - double* covariate_train_data_ptr = REAL(PROTECT(covariates_train)); - KernelMatrixType covariates_train_eigen(covariate_train_data_ptr, n_train, num_covariates_train); - - // Wrap an Eigen Map around the raw data of the covariate_test matrix - StochTree::data_size_t n_test = covariates_test.nrow(); - int num_covariates_test = covariates_test.ncol(); - double* covariate_test_data_ptr = REAL(PROTECT(covariates_test)); - KernelMatrixType covariates_test_eigen(covariate_test_data_ptr, n_test, num_covariates_test); - - // Compute leaf indices - forest_kernel->ComputeLeafIndices(covariates_train_eigen, covariates_test_eigen, *(forest_container->GetEnsemble(forest_num))); - - // Unprotect pointers to R data - UNPROTECT(2); -} - -[[cpp11::register]] -cpp11::writable::integers forest_kernel_get_train_leaf_indices_cpp(cpp11::external_pointer forest_kernel) { - if (!forest_kernel->HasTrainLeafIndices()) { - cpp11::writable::integers output; - return output; - } - std::vector train_indices = forest_kernel->GetTrainLeafIndices(); - cpp11::writable::integers output(train_indices.begin(), train_indices.end()); - return output; -} - -[[cpp11::register]] -cpp11::writable::integers forest_kernel_get_test_leaf_indices_cpp(cpp11::external_pointer forest_kernel) { - if (!forest_kernel->HasTestLeafIndices()) { - cpp11::writable::integers output; - return output; - } - std::vector test_indices = forest_kernel->GetTestLeafIndices(); - cpp11::writable::integers output(test_indices.begin(), test_indices.end()); - return output; -} - -[[cpp11::register]] -cpp11::list forest_kernel_compute_kernel_train_cpp( - cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, - cpp11::external_pointer forest_container, int forest_num -) { - // Wrap an Eigen Map around the raw data of the covariate_train matrix - StochTree::data_size_t n_train = covariates_train.nrow(); - int num_covariates_train = covariates_train.ncol(); - double* covariate_train_data_ptr = REAL(PROTECT(covariates_train)); - KernelMatrixType covariates_train_eigen(covariate_train_data_ptr, n_train, num_covariates_train); - - // Declare outputs - cpp11::writable::doubles_matrix<> kernel_train(n_train, n_train); - - // Wrap Eigen Maps around kernel and kernel inverse matrices - double* kernel_data_ptr = REAL(PROTECT(kernel_train)); - KernelMatrixType kernel_eigen(kernel_data_ptr, n_train, n_train); - - // Compute kernel terms - forest_kernel->ComputeKernelExternal(covariates_train_eigen, *(forest_container->GetEnsemble(forest_num)), kernel_eigen); - - // Unprotect pointers to R data - UNPROTECT(2); - - // Return list of vectors - cpp11::writable::list result; - result.push_back(kernel_train); - return result; -} - -[[cpp11::register]] -cpp11::list forest_kernel_compute_kernel_train_test_cpp( - cpp11::external_pointer forest_kernel, cpp11::doubles_matrix<> covariates_train, - cpp11::doubles_matrix<> covariates_test, cpp11::external_pointer forest_container, int forest_num -) { - // Wrap an Eigen Map around the raw data of the covariate_train matrix - StochTree::data_size_t n_train = covariates_train.nrow(); - int num_covariates_train = covariates_train.ncol(); - double* covariate_train_data_ptr = REAL(PROTECT(covariates_train)); - KernelMatrixType covariates_train_eigen(covariate_train_data_ptr, n_train, num_covariates_train); - - // Wrap an Eigen Map around the raw data of the covariate_test matrix - StochTree::data_size_t n_test = covariates_test.nrow(); - int num_covariates_test = covariates_test.ncol(); - double* covariate_test_data_ptr = REAL(PROTECT(covariates_test)); - KernelMatrixType covariates_test_eigen(covariate_test_data_ptr, n_test, num_covariates_test); - - // Declare outputs - cpp11::writable::doubles_matrix<> kernel_train(n_train, n_train); - cpp11::writable::doubles_matrix<> kernel_test_train(n_test, n_train); - cpp11::writable::doubles_matrix<> kernel_test(n_test, n_test); - - // Wrap Eigen Maps around kernel and kernel inverse matrices - double* kernel_data_ptr = REAL(PROTECT(kernel_train)); - double* kernel_test_train_data_ptr = REAL(PROTECT(kernel_test_train)); - double* kernel_test_data_ptr = REAL(PROTECT(kernel_test)); - KernelMatrixType kernel_train_eigen(kernel_data_ptr, n_train, n_train); - KernelMatrixType kernel_test_train_eigen(kernel_test_train_data_ptr, n_test, n_train); - KernelMatrixType kernel_test_eigen(kernel_test_data_ptr, n_test, n_test); - - // Compute kernel terms - forest_kernel->ComputeKernelExternal(covariates_train_eigen, covariates_test_eigen, *(forest_container->GetEnsemble(forest_num)), kernel_train_eigen, kernel_test_train_eigen, kernel_test_eigen); - - // Unprotect pointers to R data - UNPROTECT(5); - - // Return list of vectors - cpp11::writable::list result; - result.push_back(kernel_train); - result.push_back(kernel_test_train); - result.push_back(kernel_test); - return result; -} - [[cpp11::register]] cpp11::writable::integers_matrix<> compute_leaf_indices_cpp( cpp11::external_pointer forest_container, diff --git a/src/stochtree_types.h b/src/stochtree_types.h index 096dc58d..33a8fb61 100644 --- a/src/stochtree_types.h +++ b/src/stochtree_types.h @@ -1,6 +1,5 @@ #include #include -#include #include #include #include diff --git a/tools/debug/kernel_debug.R b/tools/debug/kernel_debug.R deleted file mode 100644 index 756a2393..00000000 --- a/tools/debug/kernel_debug.R +++ /dev/null @@ -1,46 +0,0 @@ -library(stochtree) - -# Generate the data -X_train <- seq(0,20,length=100) -X_test <- seq(0,20,length=99) -y_train <- (sin(pi*X_train/5) + 0.2*cos(4*pi*X_train/5)) * (X_train <= 9.6) -lin_train <- X_train>9.6; -y_train[lin_train] <- -1 + X_train[lin_train]/10 -y_train <- y_train + rnorm(length(y_train), sd=0.1) -y_test <- (sin(pi*X_test/5) + 0.2*cos(4*pi*X_test/5)) * (X_test <= 9.6) -lin_test <- X_test>9.6; -y_test[lin_test] <- -1 + X_test[lin_test]/10 - -bart_model <- bart(X_train=X_train, y_train=y_train, X_test=X_test) -forest_kernel <- createForestKernel() -result_inds <- forest_kernel$compute_leaf_indices( - covariates_train = X_train, covariates_test = X_test, - forest_container = bart_model$forests, - forest_num = bart_model$model_params$num_samples - 1 -) -result_kernels <- forest_kernel$compute_kernel( - covariates_train = X_train, covariates_test = X_test, - forest_container = bart_model$forests, - forest_num = bart_model$model_params$num_samples - 1 -) - -# GP -mu_tilde <- result_kernels$kernel_test_train %*% MASS::ginv(result_kernels$kernel_train) %*% y_train -Sigma_tilde <- result_kernels$kernel_test - result_kernels$kernel_test_train %*% MASS::ginv(result_kernels$kernel_train) %*% t(result_kernels$kernel_test_train) - -B <- mvtnorm::rmvnorm(1000, mean = mu_tilde, sigma = Sigma_tilde) -yhat_mean_test <- colMeans(B) -plot(yhat_mean_test, y_test, xlab = "predicted", ylab = "actual", main = "Gaussian process") -abline(0,1,lwd=2.5,lty=3,col="red") - -# m <- 200 -# n <- length(X_train) -# dummies <- result[["leaf_indices_train"]] -# max_ix <- max(dummies) -# leaf_train <- Matrix::sparseMatrix(i=rep(1:n, m), -# j = dummies+1, -# x = rep(1, n*m), -# dims = c(n, max_ix+1), -# ) -# leaf_train <- as.matrix(leaf_train) -# check <- leaf_train %*% t(leaf_train)