diff --git a/NAMESPACE b/NAMESPACE
index 7c746a36..47ba8bcc 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -12,6 +12,7 @@ export(computeForestLeafVariances)
export(computeMaxLeafIndex)
export(convertBARTModelToJson)
export(convertBCFModelToJson)
+export(convertPreprocessorToJson)
export(createBARTModelFromCombinedJson)
export(createBARTModelFromCombinedJsonString)
export(createBARTModelFromJson)
@@ -31,6 +32,8 @@ export(createForestCovariatesFromMetadata)
export(createForestDataset)
export(createForestModel)
export(createOutcome)
+export(createPreprocessorFromJson)
+export(createPreprocessorFromJsonString)
export(createRNG)
export(createRandomEffectSamples)
export(createRandomEffectsDataset)
@@ -69,6 +72,7 @@ export(saveBARTModelToJsonFile)
export(saveBARTModelToJsonString)
export(saveBCFModelToJsonFile)
export(saveBCFModelToJsonString)
+export(savePreprocessorToJsonString)
importFrom(R6,R6Class)
importFrom(stats,coef)
importFrom(stats,lm)
diff --git a/R/bart.R b/R/bart.R
index 841d7ee6..25699152 100644
--- a/R/bart.R
+++ b/R/bart.R
@@ -1215,6 +1215,12 @@ convertBARTModelToJson <- function(object){
jsonobj$add_string_vector("rfx_unique_group_ids", object$rfx_unique_group_ids)
}
+ # Add covariate preprocessor metadata
+ preprocessor_metadata_string <- savePreprocessorToJsonString(
+ object$train_set_metadata
+ )
+ jsonobj$add_string("preprocessor_metadata", preprocessor_metadata_string)
+
return(jsonobj)
}
@@ -1322,7 +1328,7 @@ saveBARTModelToJsonFile <- function(object, filename){
#' Convert the persistent aspects of a BART model to (in-memory) JSON string
#'
#' @param object Object of type `bartmodel` containing draws of a BART model and associated sampling outputs.
-#' @return JSON string
+#' @return in-memory JSON string
#' @export
#'
#' @examples
@@ -1460,6 +1466,12 @@ createBARTModelFromJson <- function(json_object){
output[["rfx_samples"]] <- loadRandomEffectSamplesJson(json_object, 0)
}
+ # Unpack covariate preprocessor
+ preprocessor_metadata_string <- json_object$get_string("preprocessor_metadata")
+ output[["train_set_metadata"]] <- createPreprocessorFromJsonString(
+ preprocessor_metadata_string
+ )
+
class(output) <- "bartmodel"
return(output)
}
@@ -1686,6 +1698,12 @@ createBARTModelFromCombinedJson <- function(json_object_list){
output[["rfx_samples"]] <- loadRandomEffectSamplesCombinedJson(json_object_list, 0)
}
+    # Unpack covariate preprocessor from the first JSON object in the list
+    # (`json_object` is not an argument of this function)
+    json_object_first <- json_object_list[[1]]
+    preprocessor_metadata_string <- json_object_first$get_string("preprocessor_metadata")
+    output[["train_set_metadata"]] <- createPreprocessorFromJsonString(preprocessor_metadata_string)
+
class(output) <- "bartmodel"
return(output)
}
@@ -1832,6 +1850,12 @@ createBARTModelFromCombinedJsonString <- function(json_string_list){
output[["rfx_samples"]] <- loadRandomEffectSamplesCombinedJson(json_object_list, 0)
}
+    # Unpack covariate preprocessor from the first parsed JSON object
+    # (`json_object` is not an argument of this function)
+    json_object_first <- json_object_list[[1]]
+    preprocessor_metadata_string <- json_object_first$get_string("preprocessor_metadata")
+    output[["train_set_metadata"]] <- createPreprocessorFromJsonString(preprocessor_metadata_string)
+
class(output) <- "bartmodel"
return(output)
}
diff --git a/R/bcf.R b/R/bcf.R
index bc5b9d5f..ed00d25e 100644
--- a/R/bcf.R
+++ b/R/bcf.R
@@ -1708,6 +1708,12 @@ convertBCFModelToJson <- function(object){
jsonobj$add_string("bart_propensity_model", bart_propensity_string)
}
+ # Add covariate preprocessor metadata
+ preprocessor_metadata_string <- savePreprocessorToJsonString(
+ object$train_set_metadata
+ )
+ jsonobj$add_string("preprocessor_metadata", preprocessor_metadata_string)
+
return(jsonobj)
}
@@ -1716,7 +1722,7 @@ convertBCFModelToJson <- function(object){
#' @param object Object of type `bcf` containing draws of a Bayesian causal forest model and associated sampling outputs.
#' @param filename String of filepath, must end in ".json"
#'
-#' @return NULL
+#' @return NULL (the model JSON is written to the file at `filename`)
#' @export
#'
#' @examples
@@ -2018,6 +2024,12 @@ createBCFModelFromJson <- function(json_object){
)
}
+ # Unpack covariate preprocessor
+ preprocessor_metadata_string <- json_object$get_string("preprocessor_metadata")
+ output[["train_set_metadata"]] <- createPreprocessorFromJsonString(
+ preprocessor_metadata_string
+ )
+
class(output) <- "bcf"
return(output)
}
@@ -2393,6 +2405,12 @@ createBCFModelFromCombinedJsonString <- function(json_string_list){
output[["rfx_samples"]] <- loadRandomEffectSamplesCombinedJson(json_object_list, 0)
}
+ # Unpack covariate preprocessor
+ preprocessor_metadata_string <- json_object_default$get_string("preprocessor_metadata")
+ output[["train_set_metadata"]] <- createPreprocessorFromJsonString(
+ preprocessor_metadata_string
+ )
+
class(output) <- "bcf"
return(output)
}
diff --git a/R/cpp11.R b/R/cpp11.R
index bf6345b9..bc411e89 100644
--- a/R/cpp11.R
+++ b/R/cpp11.R
@@ -612,6 +612,14 @@ json_add_double_cpp <- function(json_ptr, field_name, field_value) {
invisible(.Call(`_stochtree_json_add_double_cpp`, json_ptr, field_name, field_value))
}
+json_add_integer_subfolder_cpp <- function(json_ptr, subfolder_name, field_name, field_value) {
+ invisible(.Call(`_stochtree_json_add_integer_subfolder_cpp`, json_ptr, subfolder_name, field_name, field_value))
+}
+
+json_add_integer_cpp <- function(json_ptr, field_name, field_value) {
+ invisible(.Call(`_stochtree_json_add_integer_cpp`, json_ptr, field_name, field_value))
+}
+
json_add_bool_subfolder_cpp <- function(json_ptr, subfolder_name, field_name, field_value) {
invisible(.Call(`_stochtree_json_add_bool_subfolder_cpp`, json_ptr, subfolder_name, field_name, field_value))
}
@@ -628,6 +636,14 @@ json_add_vector_cpp <- function(json_ptr, field_name, field_vector) {
invisible(.Call(`_stochtree_json_add_vector_cpp`, json_ptr, field_name, field_vector))
}
+json_add_integer_vector_subfolder_cpp <- function(json_ptr, subfolder_name, field_name, field_vector) {
+ invisible(.Call(`_stochtree_json_add_integer_vector_subfolder_cpp`, json_ptr, subfolder_name, field_name, field_vector))
+}
+
+json_add_integer_vector_cpp <- function(json_ptr, field_name, field_vector) {
+ invisible(.Call(`_stochtree_json_add_integer_vector_cpp`, json_ptr, field_name, field_vector))
+}
+
json_add_string_vector_subfolder_cpp <- function(json_ptr, subfolder_name, field_name, field_vector) {
invisible(.Call(`_stochtree_json_add_string_vector_subfolder_cpp`, json_ptr, subfolder_name, field_name, field_vector))
}
@@ -660,6 +676,14 @@ json_extract_double_cpp <- function(json_ptr, field_name) {
.Call(`_stochtree_json_extract_double_cpp`, json_ptr, field_name)
}
+json_extract_integer_subfolder_cpp <- function(json_ptr, subfolder_name, field_name) {
+ .Call(`_stochtree_json_extract_integer_subfolder_cpp`, json_ptr, subfolder_name, field_name)
+}
+
+json_extract_integer_cpp <- function(json_ptr, field_name) {
+ .Call(`_stochtree_json_extract_integer_cpp`, json_ptr, field_name)
+}
+
json_extract_bool_subfolder_cpp <- function(json_ptr, subfolder_name, field_name) {
.Call(`_stochtree_json_extract_bool_subfolder_cpp`, json_ptr, subfolder_name, field_name)
}
@@ -684,6 +708,14 @@ json_extract_vector_cpp <- function(json_ptr, field_name) {
.Call(`_stochtree_json_extract_vector_cpp`, json_ptr, field_name)
}
+json_extract_integer_vector_subfolder_cpp <- function(json_ptr, subfolder_name, field_name) {
+ .Call(`_stochtree_json_extract_integer_vector_subfolder_cpp`, json_ptr, subfolder_name, field_name)
+}
+
+json_extract_integer_vector_cpp <- function(json_ptr, field_name) {
+ .Call(`_stochtree_json_extract_integer_vector_cpp`, json_ptr, field_name)
+}
+
json_extract_string_vector_subfolder_cpp <- function(json_ptr, subfolder_name, field_name) {
.Call(`_stochtree_json_extract_string_vector_subfolder_cpp`, json_ptr, subfolder_name, field_name)
}
diff --git a/R/serialization.R b/R/serialization.R
index 24205f9e..bca7f23f 100644
--- a/R/serialization.R
+++ b/R/serialization.R
@@ -81,6 +81,20 @@ CppJson <- R6::R6Class(
}
},
+ #' @description
+ #' Add a scalar to the json object under the name "field_name" (with optional subfolder "subfolder_name")
+ #' @param field_name The name of the field to be added to json
+ #' @param field_value Integer value of the field to be added to json
+ #' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which to place the value
+ #' @return NULL
+        add_integer = function(field_name, field_value, subfolder_name = NULL) {
+            if (!is.null(subfolder_name)) { # store the value under the named subfolder
+                json_add_integer_subfolder_cpp(self$json_ptr, subfolder_name, field_name, field_value)
+            } else { # no subfolder given: store the value at the top level
+                json_add_integer_cpp(self$json_ptr, field_name, field_value)
+            }
+        },
+
#' @description
#' Add a boolean value to the json object under the name "field_name" (with optional subfolder "subfolder_name")
#' @param field_name The name of the field to be added to json
@@ -110,7 +124,7 @@ CppJson <- R6::R6Class(
},
#' @description
- #' Add an array to the json object under the name "field_name" (with optional subfolder "subfolder_name")
+ #' Add a vector to the json object under the name "field_name" (with optional subfolder "subfolder_name")
#' @param field_name The name of the field to be added to json
#' @param field_vector Vector to be stored in json
#' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which to place the value
@@ -124,6 +138,21 @@ CppJson <- R6::R6Class(
}
},
+ #' @description
+ #' Add an integer vector to the json object under the name "field_name" (with optional subfolder "subfolder_name")
+ #' @param field_name The name of the field to be added to json
+ #' @param field_vector Vector to be stored in json
+ #' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which to place the value
+ #' @return NULL
+        add_integer_vector = function(field_name, field_vector, subfolder_name = NULL) {
+            field_vector <- as.integer(field_vector) # the C++ binding takes cpp11::integers, so pass an R integer vector
+            if (is.null(subfolder_name)) {
+                json_add_integer_vector_cpp(self$json_ptr, field_name, field_vector)
+            } else {
+                json_add_integer_vector_subfolder_cpp(self$json_ptr, subfolder_name, field_name, field_vector)
+            }
+        },
+
#' @description
#' Add an array to the json object under the name "field_name" (with optional subfolder "subfolder_name")
#' @param field_name The name of the field to be added to json
@@ -184,6 +213,22 @@ CppJson <- R6::R6Class(
return(result)
},
+        #' @description
+        #' Retrieve an integer value from the json object under the name "field_name" (with optional subfolder "subfolder_name")
+        #' @param field_name The name of the field to be accessed from json
+        #' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which the field is stored
+        #' @return Integer value stored under "field_name"
+        get_integer = function(field_name, subfolder_name = NULL) {
+            # stopifnot() raises an R error unless the matching
+            # json_contains_field*_cpp check returns TRUE
+            if (is.null(subfolder_name)) {
+                stopifnot(json_contains_field_cpp(self$json_ptr, field_name))
+                return(json_extract_integer_cpp(self$json_ptr, field_name))
+            }
+            stopifnot(json_contains_field_subfolder_cpp(self$json_ptr, subfolder_name, field_name))
+            return(json_extract_integer_subfolder_cpp(self$json_ptr, subfolder_name, field_name))
+        },
+
#' @description
#' Retrieve a boolean value from the json object under the name "field_name" (with optional subfolder "subfolder_name")
#' @param field_name The name of the field to be accessed from json
@@ -232,6 +277,22 @@ CppJson <- R6::R6Class(
return(result)
},
+        #' @description
+        #' Retrieve an integer vector from the json object under the name "field_name" (with optional subfolder "subfolder_name")
+        #' @param field_name The name of the field to be accessed from json
+        #' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which the field is stored
+        #' @return Integer vector stored under "field_name"
+        get_integer_vector = function(field_name, subfolder_name = NULL) {
+            # stopifnot() raises an R error unless the matching
+            # json_contains_field*_cpp check returns TRUE
+            if (is.null(subfolder_name)) {
+                stopifnot(json_contains_field_cpp(self$json_ptr, field_name))
+                return(json_extract_integer_vector_cpp(self$json_ptr, field_name))
+            }
+            stopifnot(json_contains_field_subfolder_cpp(self$json_ptr, subfolder_name, field_name))
+            return(json_extract_integer_vector_subfolder_cpp(self$json_ptr, subfolder_name, field_name))
+        },
+
#' @description
#' Retrieve a character vector from the json object under the name "field_name" (with optional subfolder "subfolder_name")
#' @param field_name The name of the field to be accessed from json
diff --git a/R/utils.R b/R/utils.R
index a1fc12a8..ea96794b 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -359,6 +359,116 @@ preprocessPredictionDataFrame <- function(input_df, metadata) {
return(X)
}
+#' Convert the persistent aspects of a covariate preprocessor to (in-memory) C++ JSON object
+#'
+#' @param object List containing information on variables, including train set
+#' categories for categorical variables
+#'
+#' @return wrapper around in-memory C++ JSON object
+#' @export
+#'
+#' @examples
+#' cov_mat <- matrix(1:12, ncol = 3)
+#' preprocess_list <- preprocessTrainData(cov_mat)
+#' preprocessor_json <- convertPreprocessorToJson(preprocess_list$metadata)
+convertPreprocessorToJson <- function(object) {
+    preprocessor_json <- createCppJson()
+    # A NULL `feature_types` entry is treated as "preprocessor not fit"
+    if (is.null(object$feature_types)) stop("This covariate preprocessor has not yet been fit")
+
+    # Counts of each variable type
+    preprocessor_json$add_integer("num_numeric_vars", object$num_numeric_vars)
+    preprocessor_json$add_integer("num_ordered_cat_vars", object$num_ordered_cat_vars)
+    preprocessor_json$add_integer("num_unordered_cat_vars", object$num_unordered_cat_vars)
+
+    # Vectors that are always written
+    preprocessor_json$add_vector("feature_types", object$feature_types)
+    preprocessor_json$add_vector("original_var_indices", object$original_var_indices)
+    # Names (and levels) are only written for variable types with a positive count
+    if (object$num_numeric_vars > 0) {
+        preprocessor_json$add_string_vector("numeric_vars", object$numeric_vars)
+    }
+    if (object$num_ordered_cat_vars > 0) {
+        preprocessor_json$add_string_vector("ordered_cat_vars", object$ordered_cat_vars)
+        preprocessor_json$add_string_vector("ordered_unique_levels", object$ordered_unique_levels)
+    }
+    if (object$num_unordered_cat_vars > 0) {
+        preprocessor_json$add_string_vector("unordered_cat_vars", object$unordered_cat_vars)
+        preprocessor_json$add_string_vector("unordered_unique_levels", object$unordered_unique_levels)
+    }
+
+    return(preprocessor_json)
+}
+
+#' Convert the persistent aspects of a covariate preprocessor to (in-memory) JSON string
+#'
+#' @param object List containing information on variables, including train set
+#' categories for categorical variables
+#'
+#' @return in-memory JSON string
+#' @export
+#'
+#' @examples
+#' cov_mat <- matrix(1:12, ncol = 3)
+#' preprocess_list <- preprocessTrainData(cov_mat)
+#' preprocessor_json_string <- savePreprocessorToJsonString(preprocess_list$metadata)
+savePreprocessorToJsonString <- function(object) {
+    # Build the in-memory JSON wrapper, then dump it to a string
+    preprocessor_json <- convertPreprocessorToJson(object)
+    json_string <- preprocessor_json$return_json_string()
+
+    return(json_string)
+}
+
+#' Reload a covariate preprocessor object from an in-memory JSON object containing a serialized preprocessor
+#'
+#' @param json_object in-memory wrapper around JSON C++ object containing covariate preprocessor metadata
+#'
+#' @return Preprocessor object that can be used with the `preprocessPredictionData` function
+#' @export
+createPreprocessorFromJson <- function(json_object){
+    metadata <- list()
+
+    # Unpack the counts of each variable type
+    metadata[["num_numeric_vars"]] <- json_object$get_integer("num_numeric_vars")
+    metadata[["num_ordered_cat_vars"]] <- json_object$get_integer("num_ordered_cat_vars")
+    metadata[["num_unordered_cat_vars"]] <- json_object$get_integer("num_unordered_cat_vars")
+
+    # Unpack the vectors that are always read
+    metadata[["feature_types"]] <- json_object$get_vector("feature_types")
+    metadata[["original_var_indices"]] <- json_object$get_vector("original_var_indices")
+    # Names (and levels) are only read for variable types with a positive count
+    if (metadata$num_numeric_vars > 0) {
+        metadata[["numeric_vars"]] <- json_object$get_string_vector("numeric_vars")
+    }
+    if (metadata$num_ordered_cat_vars > 0) {
+        metadata[["ordered_cat_vars"]] <- json_object$get_string_vector("ordered_cat_vars")
+        metadata[["ordered_unique_levels"]] <- json_object$get_string_vector("ordered_unique_levels")
+    }
+    if (metadata$num_unordered_cat_vars > 0) {
+        metadata[["unordered_cat_vars"]] <- json_object$get_string_vector("unordered_cat_vars")
+        metadata[["unordered_unique_levels"]] <- json_object$get_string_vector("unordered_unique_levels")
+    }
+
+    return(metadata)
+}
+
+#' Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor
+#'
+#' @param json_string in-memory JSON string containing covariate preprocessor metadata
+#'
+#' @return Preprocessor object that can be used with the `preprocessPredictionData` function
+#' @export
+createPreprocessorFromJsonString <- function(json_string){
+    # Parse the string into a `CppJson` object
+    preprocessor_json <- createCppJsonString(json_string)
+
+    # Rebuild the covariate preprocessor metadata from the parsed JSON
+    preprocessor_object <- createPreprocessorFromJson(preprocessor_json)
+
+    return(preprocessor_object)
+}
+
#' Preprocess a dataframe of covariate values, converting categorical variables
#' to integers and one-hot encoding if need be. Returns a list including a
#' matrix of preprocessed covariate values and associated tracking.
diff --git a/man/CppJson.Rd b/man/CppJson.Rd
index a7f7e448..65f1edec 100644
--- a/man/CppJson.Rd
+++ b/man/CppJson.Rd
@@ -32,16 +32,20 @@ Wrapper around a C++ container of tree ensembles
\item \href{#method-CppJson-add_forest}{\code{CppJson$add_forest()}}
\item \href{#method-CppJson-add_random_effects}{\code{CppJson$add_random_effects()}}
\item \href{#method-CppJson-add_scalar}{\code{CppJson$add_scalar()}}
+\item \href{#method-CppJson-add_integer}{\code{CppJson$add_integer()}}
\item \href{#method-CppJson-add_boolean}{\code{CppJson$add_boolean()}}
\item \href{#method-CppJson-add_string}{\code{CppJson$add_string()}}
\item \href{#method-CppJson-add_vector}{\code{CppJson$add_vector()}}
+\item \href{#method-CppJson-add_integer_vector}{\code{CppJson$add_integer_vector()}}
\item \href{#method-CppJson-add_string_vector}{\code{CppJson$add_string_vector()}}
\item \href{#method-CppJson-add_list}{\code{CppJson$add_list()}}
\item \href{#method-CppJson-add_string_list}{\code{CppJson$add_string_list()}}
\item \href{#method-CppJson-get_scalar}{\code{CppJson$get_scalar()}}
+\item \href{#method-CppJson-get_integer}{\code{CppJson$get_integer()}}
\item \href{#method-CppJson-get_boolean}{\code{CppJson$get_boolean()}}
\item \href{#method-CppJson-get_string}{\code{CppJson$get_string()}}
\item \href{#method-CppJson-get_vector}{\code{CppJson$get_vector()}}
+\item \href{#method-CppJson-get_integer_vector}{\code{CppJson$get_integer_vector()}}
\item \href{#method-CppJson-get_string_vector}{\code{CppJson$get_string_vector()}}
\item \href{#method-CppJson-get_numeric_list}{\code{CppJson$get_numeric_list()}}
\item \href{#method-CppJson-get_string_list}{\code{CppJson$get_string_list()}}
@@ -120,6 +124,30 @@ Add a scalar to the json object under the name "field_name" (with optional subfo
\item{\code{field_value}}{Numeric value of the field to be added to json}
+\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which to place the value}
+}
+\if{html}{\out{}}
+}
+\subsection{Returns}{
+NULL
+}
+}
+\if{html}{\out{
}}
+\if{html}{\out{}}
+\if{latex}{\out{\hypertarget{method-CppJson-add_integer}{}}}
+\subsection{Method \code{add_integer()}}{
+Add a scalar to the json object under the name "field_name" (with optional subfolder "subfolder_name")
+\subsection{Usage}{
+\if{html}{\out{}}\preformatted{CppJson$add_integer(field_name, field_value, subfolder_name = NULL)}\if{html}{\out{
}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{}}
+\describe{
+\item{\code{field_name}}{The name of the field to be added to json}
+
+\item{\code{field_value}}{Integer value of the field to be added to json}
+
\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which to place the value}
}
\if{html}{\out{
}}
@@ -180,7 +208,7 @@ NULL
\if{html}{\out{}}
\if{latex}{\out{\hypertarget{method-CppJson-add_vector}{}}}
\subsection{Method \code{add_vector()}}{
-Add an array to the json object under the name "field_name" (with optional subfolder "subfolder_name")
+Add a vector to the json object under the name "field_name" (with optional subfolder "subfolder_name")
\subsection{Usage}{
\if{html}{\out{}}\preformatted{CppJson$add_vector(field_name, field_vector, subfolder_name = NULL)}\if{html}{\out{
}}
}
@@ -192,6 +220,30 @@ Add an array to the json object under the name "field_name" (with optional subfo
\item{\code{field_vector}}{Vector to be stored in json}
+\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which to place the value}
+}
+\if{html}{\out{}}
+}
+\subsection{Returns}{
+NULL
+}
+}
+\if{html}{\out{
}}
+\if{html}{\out{}}
+\if{latex}{\out{\hypertarget{method-CppJson-add_integer_vector}{}}}
+\subsection{Method \code{add_integer_vector()}}{
+Add an integer vector to the json object under the name "field_name" (with optional subfolder "subfolder_name")
+\subsection{Usage}{
+\if{html}{\out{}}\preformatted{CppJson$add_integer_vector(field_name, field_vector, subfolder_name = NULL)}\if{html}{\out{
}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{}}
+\describe{
+\item{\code{field_name}}{The name of the field to be added to json}
+
+\item{\code{field_vector}}{Vector to be stored in json}
+
\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which to place the value}
}
\if{html}{\out{
}}
@@ -282,6 +334,28 @@ Retrieve a scalar value from the json object under the name "field_name" (with o
\describe{
\item{\code{field_name}}{The name of the field to be accessed from json}
+\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which the field is stored}
+}
+\if{html}{\out{}}
+}
+\subsection{Returns}{
+NULL
+}
+}
+\if{html}{\out{
}}
+\if{html}{\out{}}
+\if{latex}{\out{\hypertarget{method-CppJson-get_integer}{}}}
+\subsection{Method \code{get_integer()}}{
+Retrieve a integer value from the json object under the name "field_name" (with optional subfolder "subfolder_name")
+\subsection{Usage}{
+\if{html}{\out{}}\preformatted{CppJson$get_integer(field_name, subfolder_name = NULL)}\if{html}{\out{
}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{}}
+\describe{
+\item{\code{field_name}}{The name of the field to be accessed from json}
+
\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which the field is stored}
}
\if{html}{\out{
}}
@@ -348,6 +422,28 @@ Retrieve a vector from the json object under the name "field_name" (with optiona
\describe{
\item{\code{field_name}}{The name of the field to be accessed from json}
+\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which the field is stored}
+}
+\if{html}{\out{}}
+}
+\subsection{Returns}{
+NULL
+}
+}
+\if{html}{\out{
}}
+\if{html}{\out{}}
+\if{latex}{\out{\hypertarget{method-CppJson-get_integer_vector}{}}}
+\subsection{Method \code{get_integer_vector()}}{
+Retrieve an integer vector from the json object under the name "field_name" (with optional subfolder "subfolder_name")
+\subsection{Usage}{
+\if{html}{\out{}}\preformatted{CppJson$get_integer_vector(field_name, subfolder_name = NULL)}\if{html}{\out{
}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{}}
+\describe{
+\item{\code{field_name}}{The name of the field to be accessed from json}
+
\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which the field is stored}
}
\if{html}{\out{
}}
diff --git a/man/convertPreprocessorToJson.Rd b/man/convertPreprocessorToJson.Rd
new file mode 100644
index 00000000..49716050
--- /dev/null
+++ b/man/convertPreprocessorToJson.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{convertPreprocessorToJson}
+\alias{convertPreprocessorToJson}
+\title{Convert the persistent aspects of a covariate preprocessor to (in-memory) C++ JSON object}
+\usage{
+convertPreprocessorToJson(object)
+}
+\arguments{
+\item{object}{List containing information on variables, including train set
+categories for categorical variables}
+}
+\value{
+wrapper around in-memory C++ JSON object
+}
+\description{
+Convert the persistent aspects of a covariate preprocessor to (in-memory) C++ JSON object
+}
+\examples{
+cov_mat <- matrix(1:12, ncol = 3)
+preprocess_list <- preprocessTrainData(cov_mat)
+preprocessor_json <- convertPreprocessorToJson(preprocess_list$metadata)
+}
diff --git a/man/createPreprocessorFromJson.Rd b/man/createPreprocessorFromJson.Rd
new file mode 100644
index 00000000..3edca354
--- /dev/null
+++ b/man/createPreprocessorFromJson.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{createPreprocessorFromJson}
+\alias{createPreprocessorFromJson}
+\title{Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor}
+\usage{
+createPreprocessorFromJson(json_object)
+}
+\arguments{
+\item{json_object}{in-memory wrapper around JSON C++ object containing covariate preprocessor metadata}
+}
+\value{
+Preprocessor object that can be used with the \code{preprocessPredictionData} function
+}
+\description{
+Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor
+}
diff --git a/man/createPreprocessorFromJsonString.Rd b/man/createPreprocessorFromJsonString.Rd
new file mode 100644
index 00000000..00974b83
--- /dev/null
+++ b/man/createPreprocessorFromJsonString.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{createPreprocessorFromJsonString}
+\alias{createPreprocessorFromJsonString}
+\title{Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor}
+\usage{
+createPreprocessorFromJsonString(json_string)
+}
+\arguments{
+\item{json_string}{in-memory JSON string containing covariate preprocessor metadata}
+}
+\value{
+Preprocessor object that can be used with the \code{preprocessPredictionData} function
+}
+\description{
+Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor
+}
diff --git a/man/saveBCFModelToJsonFile.Rd b/man/saveBCFModelToJsonFile.Rd
index f7685c48..5a0f1512 100644
--- a/man/saveBCFModelToJsonFile.Rd
+++ b/man/saveBCFModelToJsonFile.Rd
@@ -11,6 +11,9 @@ saveBCFModelToJsonFile(object, filename)
\item{filename}{String of filepath, must end in ".json"}
}
+\value{
+in-memory JSON string
+}
\description{
Convert the persistent aspects of a BCF model to (in-memory) JSON and save to a file
}
diff --git a/man/savePreprocessorToJsonString.Rd b/man/savePreprocessorToJsonString.Rd
new file mode 100644
index 00000000..83c54d72
--- /dev/null
+++ b/man/savePreprocessorToJsonString.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{savePreprocessorToJsonString}
+\alias{savePreprocessorToJsonString}
+\title{Convert the persistent aspects of a covariate preprocessor to (in-memory) JSON string}
+\usage{
+savePreprocessorToJsonString(object)
+}
+\arguments{
+\item{object}{List containing information on variables, including train set
+categories for categorical variables}
+}
+\value{
+in-memory JSON string
+}
+\description{
+Convert the persistent aspects of a covariate preprocessor to (in-memory) JSON string
+}
+\examples{
+cov_mat <- matrix(1:12, ncol = 3)
+preprocess_list <- preprocessTrainData(cov_mat)
+preprocessor_json_string <- savePreprocessorToJsonString(preprocess_list$metadata)
+}
diff --git a/src/cpp11.cpp b/src/cpp11.cpp
index 0091dffd..d9c352c3 100644
--- a/src/cpp11.cpp
+++ b/src/cpp11.cpp
@@ -1134,6 +1134,22 @@ extern "C" SEXP _stochtree_json_add_double_cpp(SEXP json_ptr, SEXP field_name, S
END_CPP11
}
// serialization.cpp
+void json_add_integer_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name, int field_value);
+extern "C" SEXP _stochtree_json_add_integer_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name, SEXP field_value) {
+ BEGIN_CPP11
+ json_add_integer_subfolder_cpp(cpp11::as_cpp>>(json_ptr), cpp11::as_cpp>(subfolder_name), cpp11::as_cpp>(field_name), cpp11::as_cpp>(field_value));
+ return R_NilValue;
+ END_CPP11
+}
+// serialization.cpp
+void json_add_integer_cpp(cpp11::external_pointer json_ptr, std::string field_name, int field_value);
+extern "C" SEXP _stochtree_json_add_integer_cpp(SEXP json_ptr, SEXP field_name, SEXP field_value) {
+ BEGIN_CPP11
+ json_add_integer_cpp(cpp11::as_cpp>>(json_ptr), cpp11::as_cpp>(field_name), cpp11::as_cpp>(field_value));
+ return R_NilValue;
+ END_CPP11
+}
+// serialization.cpp
void json_add_bool_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name, bool field_value);
extern "C" SEXP _stochtree_json_add_bool_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name, SEXP field_value) {
BEGIN_CPP11
@@ -1166,6 +1182,22 @@ extern "C" SEXP _stochtree_json_add_vector_cpp(SEXP json_ptr, SEXP field_name, S
END_CPP11
}
// serialization.cpp
+void json_add_integer_vector_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name, cpp11::integers field_vector);
+extern "C" SEXP _stochtree_json_add_integer_vector_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name, SEXP field_vector) {
+ BEGIN_CPP11
+ json_add_integer_vector_subfolder_cpp(cpp11::as_cpp>>(json_ptr), cpp11::as_cpp>(subfolder_name), cpp11::as_cpp>(field_name), cpp11::as_cpp>(field_vector));
+ return R_NilValue;
+ END_CPP11
+}
+// serialization.cpp
+void json_add_integer_vector_cpp(cpp11::external_pointer json_ptr, std::string field_name, cpp11::integers field_vector);
+extern "C" SEXP _stochtree_json_add_integer_vector_cpp(SEXP json_ptr, SEXP field_name, SEXP field_vector) {
+ BEGIN_CPP11
+ json_add_integer_vector_cpp(cpp11::as_cpp>>(json_ptr), cpp11::as_cpp>(field_name), cpp11::as_cpp>(field_vector));
+ return R_NilValue;
+ END_CPP11
+}
+// serialization.cpp
void json_add_string_vector_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name, cpp11::strings field_vector);
extern "C" SEXP _stochtree_json_add_string_vector_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name, SEXP field_vector) {
BEGIN_CPP11
@@ -1226,6 +1258,20 @@ extern "C" SEXP _stochtree_json_extract_double_cpp(SEXP json_ptr, SEXP field_nam
END_CPP11
}
// serialization.cpp
+int json_extract_integer_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name);
+extern "C" SEXP _stochtree_json_extract_integer_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name) {
+ BEGIN_CPP11
+ return cpp11::as_sexp(json_extract_integer_subfolder_cpp(cpp11::as_cpp>>(json_ptr), cpp11::as_cpp>(subfolder_name), cpp11::as_cpp>(field_name)));
+ END_CPP11
+}
+// serialization.cpp
+int json_extract_integer_cpp(cpp11::external_pointer json_ptr, std::string field_name);
+extern "C" SEXP _stochtree_json_extract_integer_cpp(SEXP json_ptr, SEXP field_name) {
+ BEGIN_CPP11
+ return cpp11::as_sexp(json_extract_integer_cpp(cpp11::as_cpp>>(json_ptr), cpp11::as_cpp>(field_name)));
+ END_CPP11
+}
+// serialization.cpp
bool json_extract_bool_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name);
extern "C" SEXP _stochtree_json_extract_bool_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name) {
BEGIN_CPP11
@@ -1268,6 +1314,20 @@ extern "C" SEXP _stochtree_json_extract_vector_cpp(SEXP json_ptr, SEXP field_nam
END_CPP11
}
// serialization.cpp
+cpp11::writable::integers json_extract_integer_vector_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name);
+extern "C" SEXP _stochtree_json_extract_integer_vector_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name) {  // .Call shim: convert SEXP args, forward, wrap the integer vector
+  BEGIN_CPP11
+    return cpp11::as_sexp(json_extract_integer_vector_subfolder_cpp(cpp11::as_cpp<cpp11::decay_t<cpp11::external_pointer<nlohmann::json>>>(json_ptr), cpp11::as_cpp<cpp11::decay_t<std::string>>(subfolder_name), cpp11::as_cpp<cpp11::decay_t<std::string>>(field_name)));
+  END_CPP11
+}
+// serialization.cpp
+cpp11::writable::integers json_extract_integer_vector_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string field_name);
+extern "C" SEXP _stochtree_json_extract_integer_vector_cpp(SEXP json_ptr, SEXP field_name) {  // .Call shim: convert SEXP args, forward, wrap the integer vector
+  BEGIN_CPP11
+    return cpp11::as_sexp(json_extract_integer_vector_cpp(cpp11::as_cpp<cpp11::decay_t<cpp11::external_pointer<nlohmann::json>>>(json_ptr), cpp11::as_cpp<cpp11::decay_t<std::string>>(field_name)));
+  END_CPP11
+}
+// serialization.cpp
cpp11::writable::strings json_extract_string_vector_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name);
extern "C" SEXP _stochtree_json_extract_string_vector_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name) {
BEGIN_CPP11
@@ -1415,6 +1475,10 @@ static const R_CallMethodDef CallEntries[] = {
{"_stochtree_json_add_double_cpp", (DL_FUNC) &_stochtree_json_add_double_cpp, 3},
{"_stochtree_json_add_double_subfolder_cpp", (DL_FUNC) &_stochtree_json_add_double_subfolder_cpp, 4},
{"_stochtree_json_add_forest_cpp", (DL_FUNC) &_stochtree_json_add_forest_cpp, 2},
+ {"_stochtree_json_add_integer_cpp", (DL_FUNC) &_stochtree_json_add_integer_cpp, 3},
+ {"_stochtree_json_add_integer_subfolder_cpp", (DL_FUNC) &_stochtree_json_add_integer_subfolder_cpp, 4},
+ {"_stochtree_json_add_integer_vector_cpp", (DL_FUNC) &_stochtree_json_add_integer_vector_cpp, 3},
+ {"_stochtree_json_add_integer_vector_subfolder_cpp", (DL_FUNC) &_stochtree_json_add_integer_vector_subfolder_cpp, 4},
{"_stochtree_json_add_rfx_container_cpp", (DL_FUNC) &_stochtree_json_add_rfx_container_cpp, 2},
{"_stochtree_json_add_rfx_groupids_cpp", (DL_FUNC) &_stochtree_json_add_rfx_groupids_cpp, 2},
{"_stochtree_json_add_rfx_label_mapper_cpp", (DL_FUNC) &_stochtree_json_add_rfx_label_mapper_cpp, 2},
@@ -1430,6 +1494,10 @@ static const R_CallMethodDef CallEntries[] = {
{"_stochtree_json_extract_bool_subfolder_cpp", (DL_FUNC) &_stochtree_json_extract_bool_subfolder_cpp, 3},
{"_stochtree_json_extract_double_cpp", (DL_FUNC) &_stochtree_json_extract_double_cpp, 2},
{"_stochtree_json_extract_double_subfolder_cpp", (DL_FUNC) &_stochtree_json_extract_double_subfolder_cpp, 3},
+ {"_stochtree_json_extract_integer_cpp", (DL_FUNC) &_stochtree_json_extract_integer_cpp, 2},
+ {"_stochtree_json_extract_integer_subfolder_cpp", (DL_FUNC) &_stochtree_json_extract_integer_subfolder_cpp, 3},
+ {"_stochtree_json_extract_integer_vector_cpp", (DL_FUNC) &_stochtree_json_extract_integer_vector_cpp, 2},
+ {"_stochtree_json_extract_integer_vector_subfolder_cpp", (DL_FUNC) &_stochtree_json_extract_integer_vector_subfolder_cpp, 3},
{"_stochtree_json_extract_string_cpp", (DL_FUNC) &_stochtree_json_extract_string_cpp, 2},
{"_stochtree_json_extract_string_subfolder_cpp", (DL_FUNC) &_stochtree_json_extract_string_subfolder_cpp, 3},
{"_stochtree_json_extract_string_vector_cpp", (DL_FUNC) &_stochtree_json_extract_string_vector_cpp, 2},
diff --git a/src/py_stochtree.cpp b/src/py_stochtree.cpp
index f8cf32b7..ee25e586 100644
--- a/src/py_stochtree.cpp
+++ b/src/py_stochtree.cpp
@@ -1243,6 +1243,27 @@ class JsonCpp {
}
}
+    void AddInteger(std::string field_name, int field_value) {
+        if (!json_->contains(field_name)) {  // new key: insert the (name, value) pair
+            json_->emplace(std::pair(field_name, field_value));
+            return;
+        }
+        json_->at(field_name) = field_value;  // existing key: overwrite in place
+    }
+
+    void AddIntegerSubfolder(std::string subfolder_name, std::string field_name, int field_value) {
+        // Writes field_value at json[subfolder_name][field_name], creating either level if absent.
+        if (!json_->contains(subfolder_name)) {
+            json_->emplace(std::pair(subfolder_name, nlohmann::json::object()));
+        }
+        auto& subfolder = json_->at(subfolder_name);
+        if (!subfolder.contains(field_name)) {
+            subfolder.emplace(std::pair(field_name, field_value));
+        } else {
+            subfolder.at(field_name) = field_value;
+        }
+    }
+
void AddBool(std::string field_name, bool field_value) {
if (json_->contains(field_name)) {
json_->at(field_name) = field_value;
@@ -1325,6 +1346,46 @@ class JsonCpp {
}
}
+    void AddIntegerVector(std::string field_name, py::array_t<int> field_vector) {
+        // Read-only 1-d view: the input is only read, so it need not be writeable.
+        auto accessor = field_vector.unchecked<1>();
+        py::ssize_t vec_length = accessor.shape(0);
+        // Reset an existing field, or create an empty array for a new one.
+        if (json_->contains(field_name)) {
+            json_->at(field_name).clear();
+        } else {
+            json_->emplace(std::pair(field_name, nlohmann::json::array()));
+        }
+        auto& target = json_->at(field_name);
+        for (py::ssize_t i = 0; i < vec_length; i++) {
+            target.emplace_back(accessor(i));
+        }
+    }
+
+    void AddIntegerVectorSubfolder(std::string subfolder_name, std::string field_name, py::array_t<int> field_vector) {
+        // Stores a 1-d integer array at json[subfolder_name][field_name], replacing
+        // any previous contents of that field.
+        // Read-only view: the input is only read, so it need not be writeable.
+        auto accessor = field_vector.unchecked<1>();
+        py::ssize_t vec_length = accessor.shape(0);
+        // Create the subfolder object on first use.
+        if (!json_->contains(subfolder_name)) {
+            json_->emplace(std::pair(subfolder_name, nlohmann::json::object()));
+        }
+        auto& subfolder = json_->at(subfolder_name);
+        // Reset an existing field, or create an empty array for a new one.
+        if (subfolder.contains(field_name)) {
+            subfolder.at(field_name).clear();
+        } else {
+            subfolder.emplace(std::pair(field_name, nlohmann::json::array()));
+        }
+        // Append every element in order.
+        auto& target = subfolder.at(field_name);
+        for (py::ssize_t i = 0; i < vec_length; i++) {
+            target.emplace_back(accessor(i));
+        }
+    }
+
void AddStringVector(std::string field_name, std::vector& field_vector) {
int vec_length = field_vector.size();
if (json_->contains(field_name)) {
@@ -1391,6 +1452,14 @@ class JsonCpp {
return json_->at(subfolder_name).at(field_name);
}
+    int ExtractInteger(std::string field_name) {
+        return json_->at(field_name).get<int>();  // explicit form of the implicit json -> int conversion
+    }
+
+    int ExtractIntegerSubfolder(std::string subfolder_name, std::string field_name) {
+        return json_->at(subfolder_name).at(field_name).get<int>();  // explicit form of the implicit json -> int conversion
+    }
+
bool ExtractBool(std::string field_name) {
return json_->at(field_name);
}
@@ -1429,6 +1498,28 @@ class JsonCpp {
return result;
}
+    py::array_t<int> ExtractIntegerVector(std::string field_name) {
+        const auto& json_vec = json_->at(field_name);  // bind by reference: plain `auto` would deep-copy the JSON array
+        py::ssize_t json_vec_length = json_vec.size();
+        auto result = py::array_t<int>(py::detail::any_container<py::ssize_t>({json_vec_length}));
+        auto accessor = result.mutable_unchecked<1>();
+        for (size_t i = 0; i < json_vec.size(); i++) {  // unsigned bound avoids a signed/unsigned comparison
+            accessor(i) = json_vec.at(i).get<int>();
+        }
+        return result;
+    }
+
+    py::array_t<int> ExtractIntegerVectorSubfolder(std::string subfolder_name, std::string field_name) {
+        const auto& json_vec = json_->at(subfolder_name).at(field_name);  // bind by reference: plain `auto` would deep-copy the JSON array
+        py::ssize_t json_vec_length = json_vec.size();
+        auto result = py::array_t<int>(py::detail::any_container<py::ssize_t>({json_vec_length}));
+        auto accessor = result.mutable_unchecked<1>();
+        for (size_t i = 0; i < json_vec.size(); i++) {  // unsigned bound avoids a signed/unsigned comparison
+            accessor(i) = json_vec.at(i).get<int>();
+        }
+        return result;
+    }
+
std::vector ExtractStringVector(std::string field_name) {
auto json_vec = json_->at(field_name);
py::ssize_t json_vec_length = json_->at(field_name).size();
@@ -1472,12 +1563,16 @@ PYBIND11_MODULE(stochtree_cpp, m) {
.def("DumpJson", &JsonCpp::DumpJson)
.def("AddDouble", &JsonCpp::AddDouble)
.def("AddDoubleSubfolder", &JsonCpp::AddDoubleSubfolder)
+ .def("AddInteger", &JsonCpp::AddInteger)
+ .def("AddIntegerSubfolder", &JsonCpp::AddIntegerSubfolder)
.def("AddBool", &JsonCpp::AddBool)
.def("AddBoolSubfolder", &JsonCpp::AddBoolSubfolder)
.def("AddString", &JsonCpp::AddString)
.def("AddStringSubfolder", &JsonCpp::AddStringSubfolder)
.def("AddDoubleVector", &JsonCpp::AddDoubleVector)
.def("AddDoubleVectorSubfolder", &JsonCpp::AddDoubleVectorSubfolder)
+ .def("AddIntegerVector", &JsonCpp::AddIntegerVector)
+ .def("AddIntegerVectorSubfolder", &JsonCpp::AddIntegerVectorSubfolder)
.def("AddStringVector", &JsonCpp::AddStringVector)
.def("AddStringVectorSubfolder", &JsonCpp::AddStringVectorSubfolder)
.def("AddForest", &JsonCpp::AddForest)
@@ -1485,12 +1580,16 @@ PYBIND11_MODULE(stochtree_cpp, m) {
.def("ContainsFieldSubfolder", &JsonCpp::ContainsFieldSubfolder)
.def("ExtractDouble", &JsonCpp::ExtractDouble)
.def("ExtractDoubleSubfolder", &JsonCpp::ExtractDoubleSubfolder)
+ .def("ExtractInteger", &JsonCpp::ExtractInteger)
+ .def("ExtractIntegerSubfolder", &JsonCpp::ExtractIntegerSubfolder)
.def("ExtractBool", &JsonCpp::ExtractBool)
.def("ExtractBoolSubfolder", &JsonCpp::ExtractBoolSubfolder)
.def("ExtractString", &JsonCpp::ExtractString)
.def("ExtractStringSubfolder", &JsonCpp::ExtractStringSubfolder)
.def("ExtractDoubleVector", &JsonCpp::ExtractDoubleVector)
.def("ExtractDoubleVectorSubfolder", &JsonCpp::ExtractDoubleVectorSubfolder)
+ .def("ExtractIntegerVector", &JsonCpp::ExtractIntegerVector)
+ .def("ExtractIntegerVectorSubfolder", &JsonCpp::ExtractIntegerVectorSubfolder)
.def("ExtractStringVector", &JsonCpp::ExtractStringVector)
.def("ExtractStringVectorSubfolder", &JsonCpp::ExtractStringVectorSubfolder)
.def("SubsetJsonForest", &JsonCpp::SubsetJsonForest);
diff --git a/src/serialization.cpp b/src/serialization.cpp
index 3593f1a5..749395e8 100644
--- a/src/serialization.cpp
+++ b/src/serialization.cpp
@@ -48,6 +48,29 @@ void json_add_double_cpp(cpp11::external_pointer json_ptr, std::
}
}
+[[cpp11::register]]
+void json_add_integer_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name, int field_value) {
+    // Create the subfolder object on first use, then upsert the integer inside it.
+    if (!json_ptr->contains(subfolder_name)) {
+        json_ptr->emplace(std::pair(subfolder_name, nlohmann::json::object()));
+    }
+    nlohmann::json& subfolder = json_ptr->at(subfolder_name);
+    if (subfolder.contains(field_name)) {
+        subfolder.at(field_name) = field_value;
+    } else {
+        subfolder.emplace(std::pair(field_name, field_value));
+    }
+}
+
+[[cpp11::register]]
+void json_add_integer_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string field_name, int field_value) {
+    if (json_ptr->contains(field_name)) {  // upsert: overwrite an existing key, otherwise insert the pair
+        json_ptr->at(field_name) = field_value;
+    } else {
+        json_ptr->emplace(std::pair(field_name, field_value));
+    }
+}
+
[[cpp11::register]]
void json_add_bool_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name, bool field_value) {
if (json_ptr->contains(subfolder_name)) {
@@ -111,6 +134,46 @@ void json_add_vector_cpp(cpp11::external_pointer json_ptr, std::
}
}
+[[cpp11::register]]
+void json_add_integer_vector_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name, cpp11::integers field_vector) {
+    // Stores field_vector as a JSON array at json[subfolder_name][field_name],
+    // creating the subfolder and/or the field when absent and replacing the
+    // previous contents of the field when present.
+    int vec_length = field_vector.size();
+    // Create the subfolder object on first use.
+    if (!json_ptr->contains(subfolder_name)) {
+        json_ptr->emplace(std::pair(subfolder_name, nlohmann::json::object()));
+    }
+    nlohmann::json& subfolder = json_ptr->at(subfolder_name);
+    // Reset an existing field, or create an empty array for a new one.
+    if (subfolder.contains(field_name)) {
+        subfolder.at(field_name).clear();
+    } else {
+        subfolder.emplace(std::pair(field_name, nlohmann::json::array()));
+    }
+    // Append every element in order.
+    nlohmann::json& target = subfolder.at(field_name);
+    for (int i = 0; i < vec_length; i++) {
+        target.emplace_back(field_vector.at(i));
+    }
+}
+
+[[cpp11::register]]
+void json_add_integer_vector_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string field_name, cpp11::integers field_vector) {
+    int vec_length = field_vector.size();
+    // Reset an existing field, or create an empty array for a new one.
+    if (json_ptr->contains(field_name)) {
+        json_ptr->at(field_name).clear();
+    } else {
+        json_ptr->emplace(std::pair(field_name, nlohmann::json::array()));
+    }
+    // Append every element in order to the (now empty) array.
+    nlohmann::json& target = json_ptr->at(field_name);
+    for (int i = 0; i < vec_length; i++) {
+        target.emplace_back(field_vector.at(i));
+    }
+}
+
[[cpp11::register]]
void json_add_string_vector_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name, cpp11::strings field_vector) {
int vec_length = field_vector.size();
@@ -206,6 +269,16 @@ double json_extract_double_cpp(cpp11::external_pointer json_ptr,
return json_ptr->at(field_name);
}
+[[cpp11::register]]
+int json_extract_integer_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name) {
+    return json_ptr->at(subfolder_name).at(field_name).get<int>();  // at() throws on a missing key; get<int>() makes the conversion explicit
+}
+
+[[cpp11::register]]
+int json_extract_integer_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string field_name) {
+    return json_ptr->at(field_name).get<int>();  // at() throws on a missing key; get<int>() makes the conversion explicit
+}
+
[[cpp11::register]]
bool json_extract_bool_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name) {
return json_ptr->at(subfolder_name).at(field_name);
@@ -242,6 +315,22 @@ cpp11::writable::doubles json_extract_vector_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name) {
+ cpp11::writable::integers output;
+ int vec_length = json_ptr->at(subfolder_name).at(field_name).size();
+ for (int i = 0; i < vec_length; i++) output.push_back((json_ptr->at(subfolder_name).at(field_name).at(i)));
+ return output;
+}
+
+[[cpp11::register]]
+cpp11::writable::integers json_extract_integer_vector_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string field_name) {
+    const nlohmann::json& json_vec = json_ptr->at(field_name);  // look the field up once instead of once per element
+    int vec_length = json_vec.size();
+    cpp11::writable::integers output;
+    for (int i = 0; i < vec_length; i++) output.push_back(json_vec.at(i).get<int>());
+    return output;
+}
+
[[cpp11::register]]
cpp11::writable::strings json_extract_string_vector_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name) {
int vec_length = json_ptr->at(subfolder_name).at(field_name).size();
diff --git a/stochtree/__init__.py b/stochtree/__init__.py
index 95b49ae3..8e3cc643 100644
--- a/stochtree/__init__.py
+++ b/stochtree/__init__.py
@@ -3,7 +3,7 @@
from .calibration import calibrate_global_error_variance
from .data import Dataset, Residual
from .forest import ForestContainer, Forest
-from .preprocessing import CovariateTransformer
+from .preprocessing import CovariatePreprocessor
from .sampler import RNG, ForestSampler, GlobalVarianceModel, LeafVarianceModel
from .serialization import JSONSerializer
from .utils import NotSampledError
@@ -15,7 +15,7 @@
'Residual',
'ForestContainer',
'Forest',
- 'CovariateTransformer',
+ 'CovariatePreprocessor',
'RNG',
'ForestSampler',
'GlobalVarianceModel',
diff --git a/stochtree/bart.py b/stochtree/bart.py
index 01733c0a..0159fc92 100644
--- a/stochtree/bart.py
+++ b/stochtree/bart.py
@@ -1,6 +1,7 @@
"""
Bayesian Additive Regression Trees (BART) module
"""
+import warnings
from numbers import Number, Integral
from math import log
import numpy as np
@@ -8,7 +9,7 @@
from typing import Optional, Dict, Any, Union
from .data import Dataset, Residual
from .forest import ForestContainer, Forest
-from .preprocessing import CovariateTransformer, _preprocess_params
+from .preprocessing import CovariatePreprocessor, _preprocess_params
from .sampler import ForestSampler, RNG, GlobalVarianceModel, LeafVarianceModel
from .serialization import JSONSerializer
from .utils import NotSampledError
@@ -52,7 +53,8 @@ def __init__(self) -> None:
self.sampled = False
self.rng = np.random.default_rng()
- def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = None, X_test: np.array = None, basis_test: np.array = None,
+ def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basis_train: np.array = None,
+ X_test: Union[np.array, pd.DataFrame] = None, basis_test: np.array = None,
num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, general_params: Optional[Dict[str, Any]] = None,
mean_forest_params: Optional[Dict[str, Any]] = None, variance_forest_params: Optional[Dict[str, Any]] = None) -> None:
"""Runs a BART sampler on provided training set. Predictions will be cached for the training set and (if provided) the test set.
@@ -301,13 +303,13 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N
variable_weights_variance = variable_weights
# Covariate preprocessing
- self._covariate_transformer = CovariateTransformer()
- self._covariate_transformer.fit(X_train)
- X_train_processed = self._covariate_transformer.transform(X_train)
+ self._covariate_preprocessor = CovariatePreprocessor()
+ self._covariate_preprocessor.fit(X_train)
+ X_train_processed = self._covariate_preprocessor.transform(X_train)
if X_test is not None:
- X_test_processed = self._covariate_transformer.transform(X_test)
- feature_types = np.asarray(self._covariate_transformer._processed_feature_types)
- original_var_indices = self._covariate_transformer.fetch_original_feature_indices()
+ X_test_processed = self._covariate_preprocessor.transform(X_test)
+ feature_types = np.asarray(self._covariate_preprocessor._processed_feature_types)
+ original_var_indices = self._covariate_preprocessor.fetch_original_feature_indices()
# Determine whether a test set is provided
self.has_test = X_test is not None
@@ -718,7 +720,7 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N
else:
self.sigma2_x_test = sigma_x_test_raw*self.sigma2_init*self.y_std*self.y_std
- def predict(self, covariates: np.array, basis: np.array = None) -> Union[np.array, tuple]:
+ def predict(self, covariates: Union[np.array, pd.DataFrame], basis: np.array = None) -> Union[np.array, tuple]:
"""Return predictions from every forest sampled (either / both of mean and variance).
Return type is either a single array of predictions, if a BART model only includes a
mean or variance term, or a tuple of prediction arrays, if a BART model includes both.
@@ -744,22 +746,44 @@ def predict(self, covariates: np.array, basis: np.array = None) -> Union[np.arra
)
raise NotSampledError(msg)
+ # Data checks
+ if not isinstance(covariates, pd.DataFrame) and not isinstance(covariates, np.ndarray):
+ raise ValueError("covariates must be a pandas dataframe or numpy array")
+ if basis is not None:
+ if not isinstance(basis, np.ndarray):
+ raise ValueError("basis must be a numpy array")
+ if basis.shape[0] != covariates.shape[0]:
+ raise ValueError("covariates and basis must have the same number of rows")
+
# Convert everything to standard shape (2-dimensional)
- if covariates.ndim == 1:
- covariates = np.expand_dims(covariates, 1)
+ if isinstance(covariates, np.ndarray):
+ if covariates.ndim == 1:
+ covariates = np.expand_dims(covariates, 1)
if basis is not None:
if basis.ndim == 1:
basis = np.expand_dims(basis, 1)
- # Data checks
- if basis is not None:
- if basis.shape[0] != covariates.shape[0]:
- raise ValueError("covariates and basis must have the same number of rows")
+        # Covariate preprocessing
+        if not self._covariate_preprocessor._check_is_fitted():
+            if not isinstance(covariates, np.ndarray):
+                raise ValueError("Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe.")
+            else:
+                warnings.warn("This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.", RuntimeWarning)
+                if not np.issubdtype(covariates.dtype, np.floating) and not np.issubdtype(covariates.dtype, np.integer):
+                    raise ValueError("Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.")
+                covariates_processed = covariates
+        else:
+            # Reuse the preprocessor fitted in sample() or restored by from_json();
+            # refitting on prediction data would overwrite the training-time encoding.
+            covariates_processed = self._covariate_preprocessor.transform(covariates)
+ # Dataset construction
pred_dataset = Dataset()
- pred_dataset.add_covariates(covariates)
+ pred_dataset.add_covariates(covariates_processed)
if basis is not None:
pred_dataset.add_basis(basis)
+
+ # Forest predictions
if self.include_mean_forest:
mean_pred_raw = self.forest_container_mean.forest_container_cpp.Predict(pred_dataset.dataset_cpp)
mean_pred = mean_pred_raw*self.y_std + self.y_bar
@@ -808,22 +832,44 @@ def predict_mean(self, covariates: np.array, basis: np.array = None) -> np.array
)
raise NotSampledError(msg)
+ # Data checks
+ if not isinstance(covariates, pd.DataFrame) and not isinstance(covariates, np.ndarray):
+ raise ValueError("covariates must be a pandas dataframe or numpy array")
+ if basis is not None:
+ if not isinstance(basis, np.ndarray):
+ raise ValueError("basis must be a numpy array")
+ if basis.shape[0] != covariates.shape[0]:
+ raise ValueError("covariates and basis must have the same number of rows")
+
# Convert everything to standard shape (2-dimensional)
- if covariates.ndim == 1:
- covariates = np.expand_dims(covariates, 1)
+ if isinstance(covariates, np.ndarray):
+ if covariates.ndim == 1:
+ covariates = np.expand_dims(covariates, 1)
if basis is not None:
if basis.ndim == 1:
basis = np.expand_dims(basis, 1)
- # Data checks
- if basis is not None:
- if basis.shape[0] != covariates.shape[0]:
- raise ValueError("covariates and basis must have the same number of rows")
+        # Covariate preprocessing
+        if not self._covariate_preprocessor._check_is_fitted():
+            if not isinstance(covariates, np.ndarray):
+                raise ValueError("Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe.")
+            else:
+                warnings.warn("This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.", RuntimeWarning)
+                if not np.issubdtype(covariates.dtype, np.floating) and not np.issubdtype(covariates.dtype, np.integer):
+                    raise ValueError("Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.")
+                covariates_processed = covariates
+        else:
+            # Reuse the preprocessor fitted in sample() or restored by from_json();
+            # refitting on prediction data would overwrite the training-time encoding.
+            covariates_processed = self._covariate_preprocessor.transform(covariates)
+ # Dataset construction
pred_dataset = Dataset()
- pred_dataset.add_covariates(covariates)
+ pred_dataset.add_covariates(covariates_processed)
if basis is not None:
pred_dataset.add_basis(basis)
+
+ # Mean forest predictions
mean_pred_raw = self.forest_container_mean.forest_container_cpp.Predict(pred_dataset.dataset_cpp)
mean_pred = mean_pred_raw*self.y_std + self.y_bar
@@ -856,12 +902,42 @@ def predict_variance(self, covariates: np.array) -> np.array:
)
raise NotSampledError(msg)
+        # Data checks
+        if not isinstance(covariates, pd.DataFrame) and not isinstance(covariates, np.ndarray):
+            raise ValueError("covariates must be a pandas dataframe or numpy array")
+        # `basis` is deliberately not validated here: predict_variance accepts
+        # only `covariates`, so the name `basis` is undefined in this scope and
+        # any reference to it raises NameError before a prediction is made.
+        # Type and row-count checks on a basis belong to the prediction methods
+        # whose signatures include one.
+
         # Convert everything to standard shape (2-dimensional)
-        if covariates.ndim == 1:
-            covariates = np.expand_dims(covariates, 1)
+        if isinstance(covariates, np.ndarray):
+            if covariates.ndim == 1:
+                covariates = np.expand_dims(covariates, 1)
+        # Only covariates are reshaped: the prediction dataset built below
+        # receives covariates and no basis, so there is no basis array to
+        # standardize in this method.
+
+        # Covariate preprocessing
+        if not self._covariate_preprocessor._check_is_fitted():
+            if not isinstance(covariates, np.ndarray):
+                raise ValueError("Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe.")
+            else:
+                warnings.warn("This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.", RuntimeWarning)
+                if not np.issubdtype(covariates.dtype, np.floating) and not np.issubdtype(covariates.dtype, np.integer):
+                    raise ValueError("Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.")
+                covariates_processed = covariates
+        else:
+            # Reuse the preprocessor fitted in sample() or restored by from_json();
+            # refitting on prediction data would overwrite the training-time encoding.
+            covariates_processed = self._covariate_preprocessor.transform(covariates)
+ # Dataset construction
pred_dataset = Dataset()
- pred_dataset.add_covariates(covariates)
+ pred_dataset.add_covariates(covariates_processed)
+
+ # Variance forest predictions
variance_pred_raw = self.forest_container_variance.forest_container_cpp.Predict(pred_dataset.dataset_cpp)
if self.sample_sigma_global:
variance_pred = variance_pred_raw
@@ -920,6 +996,10 @@ def to_json(self) -> str:
if self.sample_sigma_leaf:
bart_json.add_numeric_vector("sigma2_leaf_samples", self.leaf_scale_samples, "parameters")
+ # Add covariate preprocessor
+ covariate_preprocessor_string = self._covariate_preprocessor.to_json()
+ bart_json.add_string("covariate_preprocessor", covariate_preprocessor_string)
+
return bart_json.return_json_string()
def from_json(self, json_string: str) -> None:
@@ -971,6 +1051,11 @@ def from_json(self, json_string: str) -> None:
if self.sample_sigma_leaf:
self.leaf_scale_samples = bart_json.get_numeric_vector("sigma2_leaf_samples", "parameters")
+ # Unpack covariate preprocessor
+ covariate_preprocessor_string = bart_json.get_string("covariate_preprocessor")
+ self._covariate_preprocessor = CovariatePreprocessor()
+ self._covariate_preprocessor.from_json(covariate_preprocessor_string)
+
# Mark the deserialized model as "sampled"
self.sampled = True
diff --git a/stochtree/bcf.py b/stochtree/bcf.py
index 8c7ca21c..4f24234b 100644
--- a/stochtree/bcf.py
+++ b/stochtree/bcf.py
@@ -8,7 +8,7 @@
from .bart import BARTModel
from .data import Dataset, Residual
from .forest import ForestContainer, Forest
-from .preprocessing import CovariateTransformer, _preprocess_params
+from .preprocessing import CovariatePreprocessor, _preprocess_params
from .sampler import ForestSampler, RNG, GlobalVarianceModel, LeafVarianceModel
from .serialization import JSONSerializer
from .utils import NotSampledError
@@ -38,7 +38,7 @@ class BCFModel:
\begin{aligned}
y &= a(X) + b_z(X) + \epsilon\\
b_z(X) &= (b_1 Z + b_0 (1-Z)) t(X)\\
- b_0, b_1 &\sim N(0, \frac{1}{2})\\\\
+ b_0, b_1 &\sim N\left(0, \frac{1}{2}\right)\\\\
a(X) &\sim \text{BART}()\\
t(X) &\sim \text{BART}()\\
\epsilon &\sim N(0, \sigma^2)\\
@@ -663,13 +663,13 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr
variable_subset_variance = [i for i in range(X_train.shape[1])]
# Covariate preprocessing
- self._covariate_transformer = CovariateTransformer()
- self._covariate_transformer.fit(X_train)
- X_train_processed = self._covariate_transformer.transform(X_train)
+ self._covariate_preprocessor = CovariatePreprocessor()
+ self._covariate_preprocessor.fit(X_train)
+ X_train_processed = self._covariate_preprocessor.transform(X_train)
if X_test is not None:
- X_test_processed = self._covariate_transformer.transform(X_test)
- feature_types = np.asarray(self._covariate_transformer._processed_feature_types)
- original_var_indices = self._covariate_transformer.fetch_original_feature_indices()
+ X_test_processed = self._covariate_preprocessor.transform(X_test)
+ feature_types = np.asarray(self._covariate_preprocessor._processed_feature_types)
+ original_var_indices = self._covariate_preprocessor.fetch_original_feature_indices()
# Determine whether a test set is provided
self.has_test = X_test is not None
@@ -1420,6 +1420,10 @@ def to_json(self) -> str:
bart_propensity_string = self.bart_propensity_model.to_json()
bcf_json.add_string("bart_propensity_model", bart_propensity_string)
+ # Add covariate preprocessor
+ covariate_preprocessor_string = self._covariate_preprocessor.to_json()
+ bcf_json.add_string("covariate_preprocessor", covariate_preprocessor_string)
+
return bcf_json.return_json_string()
def from_json(self, json_string: str) -> None:
@@ -1482,6 +1486,11 @@ def from_json(self, json_string: str) -> None:
self.bart_propensity_model = BARTModel()
self.bart_propensity_model.from_json(bart_propensity_string)
+ # Unpack covariate preprocessor
+ covariate_preprocessor_string = bcf_json.get_string("covariate_preprocessor")
+ self._covariate_preprocessor = CovariatePreprocessor()
+ self._covariate_preprocessor.from_json(covariate_preprocessor_string)
+
# Mark the deserialized model as "sampled"
self.sampled = True
diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py
index a586afd8..35633264 100644
--- a/stochtree/preprocessing.py
+++ b/stochtree/preprocessing.py
@@ -7,7 +7,9 @@
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import numpy as np
import pandas as pd
+from scipy import sparse
import warnings
+from .serialization import JSONSerializer
def _preprocess_params(default_params: Dict[str, Any], user_params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
if user_params:
@@ -129,20 +131,44 @@ def _preprocess_bcf_params(params: Optional[Dict[str, Any]] = None) -> Dict[str,
return processed_params
-class CovariateTransformer:
+def _map_to_integer(values: Union[np.array, list], uniques: Union[np.array, list]) -> np.array:
+    r"""
+    Encode each entry of `values` as the position of that entry within `uniques`. Lightly adapted from the [scikit-learn helper](https://github.com/scikit-learn/scikit-learn/blob/43d440f1f874ac2117ed848b10a6f07d9083488d/sklearn/utils/_encode.py#L170) of the same name.
+    An entry of `values` that does not appear in `uniques` raises `KeyError`.
+
+    Parameters
+    ----------
+    values : np.array or list
+        Entries to encode (for example string or categorical column values).
+    uniques : np.array or list
+        The distinct categories; an entry's position here becomes its integer code.
     """
-    Class that transforms covariates to a format that can be used to define tree splits.
-    Modeled after the [scikit-learn preprocessing classes](https://scikit-learn.org/1.5/modules/preprocessing.html).
+    index_of = {category: position for position, category in enumerate(uniques)}
+    return np.asarray([index_of[value] for value in values])
+
+
+class CovariatePreprocessor:
+ r"""
+ Preprocessing engine for covariates provided as either `np.array` or `pd.DataFrame`, which standardizes inputs as a `np.array`.
+
+ `CovariatePreprocessor` uses [column dtypes](https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes) in provided
+ dataframes to convert string / categorical variables to numeric variables, either by mapping ordinal variables to integers
+ or by one-hot encoding unordered categorical variables.
+
+ This class is modeled after the [scikit-learn preprocessing classes](https://scikit-learn.org/1.5/modules/preprocessing.html).
"""
def __init__(self) -> None:
self._is_fitted = False
- self._ordinal_encoders = []
- self._onehot_encoders = []
- self._ordinal_feature_index = []
- self._onehot_feature_index = []
- self._processed_feature_types = []
- self._original_feature_types = []
- self._original_feature_indices = []
+ self._num_ordinal_features = 0
+ self._num_onehot_features = 0
+ self._num_original_features = 0
+ self._ordinal_categories_list = []
+ self._onehot_categories_list = []
+ self._ordinal_feature_index = None
+ self._onehot_feature_index = None
+ self._processed_feature_types = None
+ self._original_feature_types = None
+ self._original_feature_indices = None
def _check_is_numeric_dtype(self, dtype: np.dtype) -> bool:
if dtype.kind == "b" or dtype.kind == "i" or dtype.kind == "u" or dtype.kind == "f":
@@ -150,27 +176,41 @@ def _check_is_numeric_dtype(self, dtype: np.dtype) -> bool:
else:
return False
- def _process_unordered_categorical(self, covariate: pd.Series) -> int:
- num_onehot = len(self._onehot_encoders)
- category_list = covariate.array.categories.to_list()
- enc = OneHotEncoder(categories=[category_list], sparse_output=False)
- enc.fit(pd.DataFrame(covariate))
- self._onehot_encoders.append(enc)
- return num_onehot
+    def _extract_categories_unordered_categorical(self, covariate: pd.Series) -> int:
+        """
+        Stores the categories of a covariate that will be one-hot encoded.
+
+        Parameters
+        ----------
+        covariate : pd.Series
+            Unordered categorical column, or a string column (both are one-hot encoded).
+
+        Returns
+        -------
+        int
+            Current value of `self._num_onehot_features`, i.e. the position at which the
+            categories were appended to `self._onehot_categories_list` provided the caller
+            increments that counter after each call.
+        """
+        if isinstance(covariate.dtype, pd.CategoricalDtype):
+            covariate_categories = covariate.array.categories.to_numpy()
+        else:
+            # Non-categorical (e.g. string) columns have no `categories` attribute on their
+            # array, so fall back to the sorted distinct non-missing values
+            covariate_categories = np.sort(np.asarray(covariate.dropna().unique(), dtype=object))
+        self._onehot_categories_list.append(covariate_categories)
+        return self._num_onehot_features
+
+    def _extract_categories_ordered_categorical(self, covariate: pd.Series) -> int:
+        """
+        Stores the categories of an ordered categorical covariate.
+
+        The category array of `covariate` is appended to `self._ordinal_categories_list`
+        and the current value of `self._num_ordinal_features` is returned.
+        """
+        self._ordinal_categories_list.append(covariate.array.categories.to_numpy())
+        return self._num_ordinal_features
+
+    def _transform_unordered_categorical(self, covariate: pd.Series, covariate_categories: np.array) -> np.array:
+        """
+        One-hot encodes `covariate` against `covariate_categories`, returning a dense float64 array
+        with one row per observation and one column per category.
+
+        Adapted from https://github.com/scikit-learn/scikit-learn/blob/8f2c1cab50262bcf4a1ade070446c40028ee27f4/sklearn/preprocessing/_encoders.py#L1000
+        """
+        raw_values = covariate.array.to_numpy()
+        num_rows = len(raw_values)
+        column_indices = _map_to_integer(raw_values, covariate_categories)
+        # CSR layout with exactly one stored entry per row: row r owns position r of the data / index arrays
+        indptr = np.arange(num_rows + 1, dtype=int)
+        stored_ones = np.ones(indptr[-1])
+        encoded = sparse.csr_matrix(
+            (stored_ones, column_indices, indptr),
+            shape=(num_rows, len(covariate_categories)),
+            dtype=np.float64,
+        )
+        return encoded.toarray()
- def _process_ordered_categorical(self, covariate: pd.Series) -> int:
- num_ord = len(self._ordinal_encoders)
- category_list = covariate.array.categories.to_list()
- enc = OrdinalEncoder(categories=[category_list])
- enc.fit(pd.DataFrame(covariate))
- self._ordinal_encoders.append(enc)
- return num_ord
+    def _transform_ordered_categorical(self, covariate: pd.Series, covariate_categories: np.array) -> np.array:
+        """
+        Encodes an ordered categorical covariate as the index of each value within `covariate_categories`.
+        """
+        observed_values = covariate.array.to_numpy()
+        ordinal_codes = _map_to_integer(observed_values, covariate_categories)
+        return ordinal_codes
def _fit_pandas(self, covariates: pd.DataFrame) -> None:
self._num_original_features = covariates.shape[1]
- self._ordinal_feature_index = [-1 for i in range(self._num_original_features)]
- self._onehot_feature_index = [-1 for i in range(self._num_original_features)]
- self._original_feature_types = [-1 for i in range(self._num_original_features)]
+ self._ordinal_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int)
+ self._onehot_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int)
+ original_feature_types = [-1 for i in range(self._num_original_features)]
datetime_types = covariates.apply(lambda x: pd.api.types.is_datetime64_any_dtype(x))
object_types = covariates.apply(lambda x: pd.api.types.is_object_dtype(x))
interval_types = covariates.apply(lambda x: isinstance(x.dtype, pd.IntervalDtype))
@@ -214,36 +254,45 @@ def _fit_pandas(self, covariates: pd.DataFrame) -> None:
warn_msg = "The following columns are a type unsupported by stochtree (object) and will be ignored: {}"
warnings.warn(warn_msg.format(object_cols))
+ processed_feature_types = []
for i in range(covariates.shape[1]):
covariate = covariates.iloc[:,i]
if categorical_types.iloc[i]:
- self._original_feature_types[i] = "category"
+ original_feature_types[i] = "category"
if covariate.array.ordered:
- ord_index = self._process_ordered_categorical(covariate)
+ ord_index = self._extract_categories_ordered_categorical(covariate)
self._ordinal_feature_index[i] = ord_index
- self._processed_feature_types.append(1)
+ processed_feature_types.append(1)
+ self._num_ordinal_features += 1
else:
- onehot_index = self._process_unordered_categorical(covariate)
+ onehot_index = self._extract_categories_unordered_categorical(covariate)
self._onehot_feature_index[i] = onehot_index
feature_ones = np.repeat(1, len(covariate.array.categories)).tolist()
- self._processed_feature_types.extend(feature_ones)
+ processed_feature_types.extend(feature_ones)
+ self._num_onehot_features += 1
elif string_types.iloc[i]:
- self._original_feature_types[i] = "string"
- onehot_index = self._process_unordered_categorical(covariate)
+ original_feature_types[i] = "string"
+ onehot_index = self._extract_categories_unordered_categorical(covariate)
self._onehot_feature_index[i] = onehot_index
- feature_ones = np.repeat(1, len(self._onehot_encoders[onehot_index].categories_[0])).tolist()
- self._processed_feature_types.extend(feature_ones)
+ # Size the one-hot block from the stored category array (self._onehot_encoders no longer exists)
+ feature_ones = np.repeat(1, len(self._onehot_categories_list[onehot_index])).tolist()
+ processed_feature_types.extend(feature_ones)
+ # Advance the counter so the next one-hot column receives a distinct index, as in the categorical branch
+ self._num_onehot_features += 1
elif bool_types.iloc[i]:
- self._original_feature_types[i] = "boolean"
- self._processed_feature_types.append(1)
+ original_feature_types[i] = "boolean"
+ processed_feature_types.append(1)
elif integer_types.iloc[i]:
- self._original_feature_types[i] = "integer"
- self._processed_feature_types.append(0)
+ original_feature_types[i] = "integer"
+ processed_feature_types.append(0)
elif float_types.iloc[i]:
- self._original_feature_types[i] = "float"
- self._processed_feature_types.append(0)
+ original_feature_types[i] = "float"
+ processed_feature_types.append(0)
else:
- self._original_feature_types[i] = "unsupported"
+ original_feature_types[i] = "unsupported"
+
+ self._processed_feature_types = np.array(processed_feature_types, dtype=int)
+ self._original_feature_types = np.array(original_feature_types)
def _fit_numpy(self, covariates: np.array) -> None:
if covariates.ndim == 1:
@@ -252,9 +298,9 @@ def _fit_numpy(self, covariates: np.array) -> None:
raise ValueError("Covariates passed as a numpy array must be 1d or 2d")
self._num_original_features = covariates.shape[1]
- self._ordinal_feature_index = [-1 for i in range(self._num_original_features)]
- self._onehot_feature_index = [-1 for i in range(self._num_original_features)]
- self._original_feature_types = ["float" for i in range(self._num_original_features)]
+ self._ordinal_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int)
+ self._onehot_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int)
+ self._original_feature_types = np.array(["float" for i in range(self._num_original_features)])
# Check whether the array is numeric
cov_dtype = covariates.dtype
@@ -269,12 +315,16 @@ def _fit_numpy(self, covariates: np.array) -> None:
raise ValueError("Covariates passed as np.array must all be simple numeric types (bool, integer, unsigned integer, floating point)")
# Scan for binary columns
+ processed_feature_types = []
for i in range(self._num_original_features):
num_unique = np.unique(covariates[:,i]).size
if num_unique == 2:
- self._processed_feature_types.append(1)
+ processed_feature_types.append(1)
else:
- self._processed_feature_types.append(0)
+ processed_feature_types.append(0)
+ # TODO: Convert to integer if not passed as integer
+
+ self._processed_feature_types = np.array(processed_feature_types, dtype=int)
def _fit(self, covariates: Union[pd.DataFrame, np.array]) -> None:
if isinstance(covariates, pd.DataFrame):
@@ -291,33 +341,37 @@ def _transform_pandas(self, covariates: pd.DataFrame) -> np.array:
output_array = np.empty((covariates.shape[0], len(self._processed_feature_types)), dtype=np.float64)
output_iter = 0
- self._original_feature_indices = []
+ original_feature_indices = []
for i in range(covariates.shape[1]):
covariate = covariates.iloc[:,i]
if self._original_feature_types[i] == "category" or self._original_feature_types[i] == "string":
if self._ordinal_feature_index[i] != -1:
ord_ind = self._ordinal_feature_index[i]
- covariate_transformed = self._ordinal_encoders[ord_ind].transform(pd.DataFrame(covariate))
+ covariate_categories = self._ordinal_categories_list[ord_ind]
+ covariate_transformed = self._transform_ordered_categorical(covariate, covariate_categories)
output_array[:,output_iter] = np.squeeze(covariate_transformed)
output_iter += 1
- self._original_feature_indices.append(i)
+ original_feature_indices.append(i)
else:
onehot_ind = self._onehot_feature_index[i]
- covariate_transformed = self._onehot_encoders[onehot_ind].transform(pd.DataFrame(covariate))
+ covariate_categories = self._onehot_categories_list[onehot_ind]
+ covariate_transformed = self._transform_unordered_categorical(covariate, covariate_categories)
output_dim = covariate_transformed.shape[1]
output_array[:,np.arange(output_iter, output_iter + output_dim)] = np.squeeze(covariate_transformed)
output_iter += output_dim
- self._original_feature_indices.extend([i for _ in range(output_dim)])
+ original_feature_indices.extend([i for _ in range(output_dim)])
elif self._original_feature_types[i] == "boolean":
output_array[:,output_iter] = (covariate*1.0).to_numpy()
output_iter += 1
- self._original_feature_indices.append(i)
+ original_feature_indices.append(i)
elif self._original_feature_types[i] == "integer" or self._original_feature_types[i] == "float":
output_array[:,output_iter] = (covariate).to_numpy()
output_iter += 1
- self._original_feature_indices.append(i)
+ original_feature_indices.append(i)
+
+ self._original_feature_indices = np.array(original_feature_indices, dtype=int)
return output_array
@@ -328,7 +383,7 @@ def _transform_numpy(self, covariates: np.array) -> np.array:
raise ValueError("Covariates passed as a numpy array must be 1d or 2d")
if self._num_original_features != covariates.shape[1]:
raise ValueError("Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality")
- self._original_feature_indices = [i for i in range(covariates.shape[1])]
+ self._original_feature_indices = np.array([i for i in range(covariates.shape[1])])
return covariates
def _transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array:
@@ -346,7 +401,7 @@ def _check_is_fitted(self) -> bool:
return self._is_fitted
def fit(self, covariates: Union[pd.DataFrame, np.array]) -> None:
- r"""Fits a `CovariateTransformer` by unpacking (and storing) data type information on the input (raw) covariates
+ r"""Fits a `CovariatePreprocessor` by unpacking (and storing) data type information on the input (raw) covariates
and then converting to a numpy array which can be passed to a tree ensemble sampler.
If `covariates` is a `pd.DataFrame`, [column dtypes](https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes)
@@ -430,4 +485,106 @@ def fetch_original_feature_indices(self) -> list:
this method would return a list `[0,0,0,0,0]`. If the transformer merely passes
through `k` numeric features, this method would return a list `[0,...,k-1]`.
"""
- return self._original_feature_indices
+ return self._original_feature_indices.tolist()
+
+    def to_json(self) -> str:
+        """
+        Serializes the internal state of a covariate preprocessor to a JSON string (which can then be
+        saved to a file or processed using the `json` library)
+
+        Returns
+        -------
+        str
+            JSON string holding the preprocessor state: the fitted flag, feature counts, category arrays
+            for ordinal and one-hot features, and the feature index / type vectors
+        """
+        serializer = JSONSerializer()
+
+        # Scalar state
+        serializer.add_boolean("is_fitted", self._is_fitted)
+        serializer.add_integer("num_ordinal_features", self._num_ordinal_features)
+        serializer.add_integer("num_onehot_features", self._num_onehot_features)
+        serializer.add_integer("num_original_features", self._num_original_features)
+
+        # Category arrays: each one is written under its categories subfolder together with a
+        # dtype tag ("int", "float" or "str") under the matching dtype subfolder
+        category_groups = (
+            (self._num_ordinal_features, self._ordinal_categories_list, "ordinal_categories_list", "ordinal_dtype_list"),
+            (self._num_onehot_features, self._onehot_categories_list, "onehot_categories_list", "onehot_dtype_list"),
+        )
+        for num_features, categories_list, cats_folder, dtype_folder in category_groups:
+            for j in range(num_features):
+                categories = categories_list[j]
+                cats_key = "cats_{:d}".format(j)
+                if np.issubdtype(categories.dtype, np.integer):
+                    dtype_tag = "int"
+                    serializer.add_integer_vector(cats_key, categories, cats_folder)
+                elif np.issubdtype(categories.dtype, np.floating):
+                    dtype_tag = "float"
+                    serializer.add_numeric_vector(cats_key, categories, cats_folder)
+                else:
+                    dtype_tag = "str"
+                    serializer.add_string_vector(cats_key, categories, cats_folder)
+                serializer.add_string("dtype_{:d}".format(j), dtype_tag, dtype_folder)
+
+        # Per-feature index and type vectors
+        serializer.add_integer_vector("ordinal_feature_index", self._ordinal_feature_index)
+        serializer.add_integer_vector("onehot_feature_index", self._onehot_feature_index)
+        serializer.add_integer_vector("processed_feature_types", self._processed_feature_types)
+        serializer.add_string_vector("original_feature_types", self._original_feature_types)
+        serializer.add_integer_vector("original_feature_indices", self._original_feature_indices)
+
+        return serializer.return_json_string()
+
+    def from_json(self, json_string: str) -> None:
+        """
+        Restores the state of a covariate preprocessor from a JSON string, overwriting the state of this object.
+
+        Parameters
+        ----------
+        json_string : str
+            JSON string holding a serialized covariate preprocessor, in the layout written by `to_json`
+        """
+        # Parse the string into a serializer object
+        preprocessor_json = JSONSerializer()
+        preprocessor_json.load_from_json_string(json_string)
+
+        def _read_categories(num_features: int, cats_folder: str, dtype_folder: str) -> list:
+            # Reads `num_features` category arrays, using each one's stored dtype tag to pick the getter
+            categories_list = []
+            for i in range(num_features):
+                list_name = "cats_{:d}".format(i)
+                array_type = preprocessor_json.get_string("dtype_{:d}".format(i), dtype_folder)
+                if array_type == "int":
+                    categories = preprocessor_json.get_integer_vector(list_name, cats_folder)
+                elif array_type == "float":
+                    categories = preprocessor_json.get_numeric_vector(list_name, cats_folder)
+                else:
+                    # get_string_vector is annotated as returning a list; wrap it in an array for both
+                    # ordinal and one-hot categories, since to_json reads `.dtype` on every entry
+                    categories = np.array(preprocessor_json.get_string_vector(list_name, cats_folder))
+                categories_list.append(categories)
+            return categories_list
+
+        # Unpack internal scalars
+        self._is_fitted = preprocessor_json.get_boolean("is_fitted")
+        self._num_ordinal_features = preprocessor_json.get_integer("num_ordinal_features")
+        self._num_onehot_features = preprocessor_json.get_integer("num_onehot_features")
+        self._num_original_features = preprocessor_json.get_integer("num_original_features")
+
+        # Unpack category arrays
+        self._ordinal_categories_list = _read_categories(
+            self._num_ordinal_features, "ordinal_categories_list", "ordinal_dtype_list"
+        )
+        self._onehot_categories_list = _read_categories(
+            self._num_onehot_features, "onehot_categories_list", "onehot_dtype_list"
+        )
+
+        # Unpack per-feature index and type vectors
+        self._ordinal_feature_index = preprocessor_json.get_integer_vector("ordinal_feature_index")
+        self._onehot_feature_index = preprocessor_json.get_integer_vector("onehot_feature_index")
+        self._processed_feature_types = preprocessor_json.get_integer_vector("processed_feature_types")
+        self._original_feature_types = preprocessor_json.get_string_vector("original_feature_types")
+        self._original_feature_indices = preprocessor_json.get_integer_vector("original_feature_indices")
diff --git a/stochtree/serialization.py b/stochtree/serialization.py
index acbb9e85..b6d3a93b 100644
--- a/stochtree/serialization.py
+++ b/stochtree/serialization.py
@@ -1,6 +1,7 @@
import warnings
import numpy as np
import pandas as pd
+from typing import Union
from scipy.linalg import lstsq
from scipy.stats import gamma
from .forest import ForestContainer
@@ -66,6 +67,23 @@ def add_scalar(self, field_name: str, field_value: float, subfolder_name: str =
else:
self.json_cpp.AddDoubleSubfolder(subfolder_name, field_name, field_value)
+    def add_integer(self, field_name: str, field_value: int, subfolder_name: str = None) -> None:
+        """Stores an integer value in a json object
+
+        Parameters
+        ----------
+        field_name : str
+            Name of the json field / label under which the integer value will be stored
+        field_value : int
+            Integer value to be stored
+        subfolder_name : str, optional
+            Name of "subfolder" under which `field_name` is to be stored in the json hierarchy.
+            When omitted, the field is written without a subfolder.
+        """
+        if subfolder_name is not None:
+            self.json_cpp.AddIntegerSubfolder(subfolder_name, field_name, field_value)
+            return
+        self.json_cpp.AddInteger(field_name, field_value)
+
def add_boolean(self, field_name: str, field_value: bool, subfolder_name: str = None) -> None:
"""Adds a scalar (boolean) value to a json object
@@ -125,6 +143,33 @@ def add_numeric_vector(self, field_name: str, field_vector: np.array, subfolder_
else:
self.json_cpp.AddDoubleVectorSubfolder(subfolder_name, field_name, field_vector)
+    def add_integer_vector(self, field_name: str, field_vector: np.array, subfolder_name: str = None) -> None:
+        """Adds an integer vector (stored as a numpy array) to a json object
+
+        Parameters
+        ----------
+        field_name : str
+            Name of the json field / label under which the integer vector will be stored
+        field_vector : np.array
+            Numpy array containing the vector to be stored in json. Should be one-dimensional.
+        subfolder_name : str, optional
+            Name of "subfolder" under which `field_name` is to be stored in the json hierarchy
+
+        Raises
+        ------
+        ValueError
+            If `field_vector` is not a numpy array, or if its dtype is not an integer type
+        """
+        # Runtime checks
+        if not isinstance(field_vector, np.ndarray):
+            raise ValueError("field_vector must be a numpy array")
+        if not np.issubdtype(field_vector.dtype, np.integer):
+            raise ValueError("field_vector must be a numpy array with integer data types")
+        # Drop singleton axes but keep at least one dimension: squeezing a single-element
+        # vector on its own would leave a 0-d array rather than a vector
+        field_vector = np.atleast_1d(np.squeeze(field_vector))
+        if field_vector.ndim > 1:
+            warnings.warn("field_vector has more than 1 dimension. It will be flattened in row-major order using np.ravel()")
+            field_vector = np.ravel(field_vector, order = "C")
+
+        if subfolder_name is None:
+            self.json_cpp.AddIntegerVector(field_name, field_vector)
+        else:
+            self.json_cpp.AddIntegerVectorSubfolder(subfolder_name, field_name, field_vector)
+
def add_string_vector(self, field_name: str, field_vector: list, subfolder_name: str = None) -> None:
"""Adds a list of strings to a json object as an array
@@ -138,9 +183,11 @@ def add_string_vector(self, field_name: str, field_vector: list, subfolder_name:
Name of "subfolder" under which `field_name` to be stored in the json hierarchy
"""
# Runtime checks
- if not isinstance(field_vector, list):
- raise ValueError("field_vector must be a list")
+ if not isinstance(field_vector, list) and not isinstance(field_vector, np.ndarray):
+ raise ValueError("field_vector must be a list or numpy object array")
+ if isinstance(field_vector, np.ndarray):
+ field_vector = field_vector.tolist()
if subfolder_name is None:
self.json_cpp.AddStringVector(field_name, field_vector)
else:
@@ -161,6 +208,21 @@ def get_scalar(self, field_name: str, subfolder_name: str = None) -> float:
else:
return self.json_cpp.ExtractDoubleSubfolder(subfolder_name, field_name)
+    def get_integer(self, field_name: str, subfolder_name: str = None) -> int:
+        """Looks up an integer value stored in a json object
+
+        Parameters
+        ----------
+        field_name : str
+            Name of the json field / label under which the integer value is stored
+        subfolder_name : str, optional
+            Name of "subfolder" under which `field_name` is stored in the json hierarchy.
+            When omitted, the field is read without a subfolder.
+        """
+        if subfolder_name is not None:
+            return self.json_cpp.ExtractIntegerSubfolder(subfolder_name, field_name)
+        return self.json_cpp.ExtractInteger(field_name)
+
def get_boolean(self, field_name: str, subfolder_name: str = None) -> bool:
"""Retrieves a scalar (boolean) value from a json object
@@ -177,12 +239,12 @@ def get_boolean(self, field_name: str, subfolder_name: str = None) -> bool:
return self.json_cpp.ExtractBoolSubfolder(subfolder_name, field_name)
def get_string(self, field_name: str, subfolder_name: str = None) -> str:
- """Retrieve a string to a json object
+ """Retrieve a string from a json object
Parameters
----------
field_name : str
- Name of the json field / label under which the numeric value is stored
+ Name of the json field / label under which the string is stored
subfolder_name : str, optional
Name of "subfolder" under which `field_name` is stored in the json hierarchy
"""
@@ -192,7 +254,7 @@ def get_string(self, field_name: str, subfolder_name: str = None) -> str:
return self.json_cpp.ExtractStringSubfolder(subfolder_name, field_name)
def get_numeric_vector(self, field_name: str, subfolder_name: str = None) -> np.array:
- """Adds a string to a json object
+ """Retrieve numeric vector from a json object
Parameters
----------
@@ -206,6 +268,21 @@ def get_numeric_vector(self, field_name: str, subfolder_name: str = None) -> np.
else:
return self.json_cpp.ExtractDoubleVectorSubfolder(subfolder_name, field_name)
+    def get_integer_vector(self, field_name: str, subfolder_name: str = None) -> np.array:
+        """Looks up an integer vector stored in a json object
+
+        Parameters
+        ----------
+        field_name : str
+            Name of the json field / label under which the integer vector is stored
+        subfolder_name : str, optional
+            Name of "subfolder" under which `field_name` is stored in the json hierarchy.
+            When omitted, the field is read without a subfolder.
+        """
+        if subfolder_name is not None:
+            return self.json_cpp.ExtractIntegerVectorSubfolder(subfolder_name, field_name)
+        return self.json_cpp.ExtractIntegerVector(field_name)
+
def get_string_vector(self, field_name: str, subfolder_name: str = None) -> list:
"""Adds a string to a json object
diff --git a/test/python/test_calibration.py b/test/python/test_calibration.py
index 312b9632..0cc437a8 100644
--- a/test/python/test_calibration.py
+++ b/test/python/test_calibration.py
@@ -3,7 +3,6 @@
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from scipy.stats import gamma
-from stochtree import CovariateTransformer
from stochtree import calibrate_global_error_variance
import pytest
diff --git a/test/python/test_json.py b/test/python/test_json.py
index 2bd71cd8..4d8d903c 100644
--- a/test/python/test_json.py
+++ b/test/python/test_json.py
@@ -1,7 +1,8 @@
import numpy as np
+import pandas as pd
from stochtree import (
BARTModel, BCFModel, JSONSerializer, ForestContainer, Forest, Dataset, Residual,
- RNG, ForestSampler, ForestContainer, GlobalVarianceModel
+ RNG, ForestSampler, ForestContainer, GlobalVarianceModel, CovariatePreprocessor
)
class TestJson:
@@ -26,6 +27,46 @@ def test_array(self):
np.testing.assert_array_equal(a, json_test.get_numeric_vector("a"))
assert b == json_test.get_string_vector("b")
+    def test_preprocessor(self):
+        def roundtrip(covariates):
+            # Fit and transform, then serialize, reload into a fresh preprocessor and transform again
+            fitted = CovariatePreprocessor()
+            expected = fitted.fit_transform(covariates)
+            serialized = fitted.to_json()
+            reloaded = CovariatePreprocessor()
+            reloaded.from_json(serialized)
+            return expected, reloaded.transform(covariates)
+
+        # Numeric columns plus a single unordered categorical
+        single_categorical = pd.DataFrame(
+            {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1],
+             "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=False, categories=['c', 'b', 'a']),
+             "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]}
+        )
+        expected_1, actual_1 = roundtrip(single_categorical)
+        np.testing.assert_array_equal(expected_1, actual_1)
+
+        # Numeric columns plus two unordered and one ordered categorical
+        mixed_categoricals = pd.DataFrame(
+            {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1],
+             "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=False, categories=['c', 'b', 'a']),
+             "x3": pd.Categorical(['a', 'c', 'd', 'b', 'd', 'b'], ordered=False, categories=['c', 'b', 'a', 'd']),
+             "x4": pd.Categorical(['a', 'b', 'f', 'f', 'c', 'a'], ordered=True, categories=['c', 'b', 'a', 'f']),
+             "x5": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]}
+        )
+        expected_2, actual_2 = roundtrip(mixed_categoricals)
+        np.testing.assert_array_equal(expected_2, actual_2)
+
+        # Purely numeric numpy input
+        numeric_array = np.array(
+            [[1.5, 1.2], [2.7, 5.4], [3.6, 9.3], [4.4, 10.4], [5.3, 3.6], [6.1, 4.4]]
+        )
+        expected_3, actual_3 = roundtrip(numeric_array)
+        np.testing.assert_array_equal(expected_3, actual_3)
+
def test_forest(self):
# Generate sample data
random_seed = 1234
diff --git a/test/python/test_preprocessor.py b/test/python/test_preprocessor.py
index acc41593..87e338e7 100644
--- a/test/python/test_preprocessor.py
+++ b/test/python/test_preprocessor.py
@@ -1,10 +1,10 @@
import numpy as np
import pandas as pd
-from stochtree import CovariateTransformer
+from stochtree import CovariatePreprocessor
class TestPreprocessor:
def test_numpy(self):
- cov_transformer = CovariateTransformer()
+ cov_transformer = CovariatePreprocessor()
np_1 = np.array(
[[1.5, 8.7, 1.2],
[2.7, 3.4, 5.4],
@@ -15,7 +15,7 @@ def test_numpy(self):
)
np_1_transformed = cov_transformer.fit_transform(np_1)
np.testing.assert_array_equal(np_1, np_1_transformed)
- assert cov_transformer._processed_feature_types == [0,0,0]
+ np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,0,0]))
def test_pandas(self):
df_1 = pd.DataFrame(
@@ -31,10 +31,10 @@ def test_pandas(self):
[5.3, 9.3, 3.6],
[6.1, 10.4, 4.4]]
)
- cov_transformer = CovariateTransformer()
+ cov_transformer = CovariatePreprocessor()
df_1_transformed = cov_transformer.fit_transform(df_1)
np.testing.assert_array_equal(np_1, df_1_transformed)
- assert cov_transformer._processed_feature_types == [0,0,0]
+ np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,0,0]))
df_2 = pd.DataFrame(
{"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1],
@@ -49,10 +49,10 @@ def test_pandas(self):
[5.3, 1, 3.6],
[6.1, 0, 4.4]]
)
- cov_transformer = CovariateTransformer()
+ cov_transformer = CovariatePreprocessor()
df_2_transformed = cov_transformer.fit_transform(df_2)
np.testing.assert_array_equal(np_2, df_2_transformed)
- assert cov_transformer._processed_feature_types == [0,1,0]
+ np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,1,0]))
df_3 = pd.DataFrame(
{"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1],
@@ -67,7 +67,27 @@ def test_pandas(self):
[5.3, 0, 1, 0, 3.6],
[6.1, 1, 0, 0, 4.4]]
)
- cov_transformer = CovariateTransformer()
+ cov_transformer = CovariatePreprocessor()
df_3_transformed = cov_transformer.fit_transform(df_3)
np.testing.assert_array_equal(np_3, df_3_transformed)
- assert cov_transformer._processed_feature_types == [0,1,1,1,0]
+ np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,1,1,1,0]))
+
+ df_4 = pd.DataFrame(
+ {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1, 7.6],
+ "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c', 'c'], ordered=False, categories=['c', 'b', 'a', 'd']),
+ "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4, 3.4]}
+ )
+ np_4 = np.array(
+ [[1.5, 0, 0, 1, 0, 1.2],
+ [2.7, 0, 1, 0, 0, 5.4],
+ [3.6, 1, 0, 0, 0, 9.3],
+ [4.4, 0, 0, 1, 0, 10.4],
+ [5.3, 0, 1, 0, 0, 3.6],
+ [6.1, 1, 0, 0, 0, 4.4],
+ [7.6, 1, 0, 0, 0, 3.4]]
+ )
+ cov_transformer = CovariatePreprocessor()
+ df_4_transformed = cov_transformer.fit_transform(df_4)
+ np.testing.assert_array_equal(np_4, df_4_transformed)
+ np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,1,1,1,1,0]))
+
\ No newline at end of file