From 53dab88d9c7218d05651f9f073310f32001dad76 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Mon, 20 Jan 2025 14:08:57 -0600 Subject: [PATCH 1/5] Added test for python edge case (unseen categories in unordered categorical) --- test/python/test_preprocessor.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/python/test_preprocessor.py b/test/python/test_preprocessor.py index acc41593..4282a20e 100644 --- a/test/python/test_preprocessor.py +++ b/test/python/test_preprocessor.py @@ -71,3 +71,22 @@ def test_pandas(self): df_3_transformed = cov_transformer.fit_transform(df_3) np.testing.assert_array_equal(np_3, df_3_transformed) assert cov_transformer._processed_feature_types == [0,1,1,1,0] + + df_4 = pd.DataFrame( + {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1, 7.6], + "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c', 'd'], ordered=False, categories=['c', 'b', 'a']), + "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4, 3.4]} + ) + np_4 = np.array( + [[1.5, 0, 0, 1, 0, 1.2], + [2.7, 0, 1, 0, 0, 5.4], + [3.6, 1, 0, 0, 0, 9.3], + [4.4, 0, 0, 1, 0, 10.4], + [5.3, 0, 1, 0, 0, 3.6], + [6.1, 1, 0, 0, 1, 4.4]] + ) + cov_transformer = CovariateTransformer() + with np.testing.assert_raises(ValueError): + df_4_transformed = cov_transformer.fit_transform(df_4) + # np.testing.assert_array_equal(np_4, df_4_transformed) + # assert cov_transformer._processed_feature_types == [0,1,1,1,1,0] From 3c3570cf16e6b11fd36f9a71ff08cafb6faa398b Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Wed, 22 Jan 2025 03:30:56 -0600 Subject: [PATCH 2/5] Refactored and added serialization for python covariate preprocessor --- src/py_stochtree.cpp | 99 ++++++++++++ stochtree/__init__.py | 4 +- stochtree/bart.py | 4 +- stochtree/bcf.py | 6 +- stochtree/preprocessing.py | 267 ++++++++++++++++++++++++------- stochtree/serialization.py | 87 +++++++++- test/python/test_calibration.py | 1 - test/python/test_json.py | 25 ++- test/python/test_preprocessor.py | 33 ++-- 9 files changed, 441 insertions(+), 
85 deletions(-) diff --git a/src/py_stochtree.cpp b/src/py_stochtree.cpp index f8cf32b7..ee25e586 100644 --- a/src/py_stochtree.cpp +++ b/src/py_stochtree.cpp @@ -1243,6 +1243,27 @@ class JsonCpp { } } + void AddInteger(std::string field_name, int field_value) { + if (json_->contains(field_name)) { + json_->at(field_name) = field_value; + } else { + json_->emplace(std::pair(field_name, field_value)); + } + } + + void AddIntegerSubfolder(std::string subfolder_name, std::string field_name, int field_value) { + if (json_->contains(subfolder_name)) { + if (json_->at(subfolder_name).contains(field_name)) { + json_->at(subfolder_name).at(field_name) = field_value; + } else { + json_->at(subfolder_name).emplace(std::pair(field_name, field_value)); + } + } else { + json_->emplace(std::pair(subfolder_name, nlohmann::json::object())); + json_->at(subfolder_name).emplace(std::pair(field_name, field_value)); + } + } + void AddBool(std::string field_name, bool field_value) { if (json_->contains(field_name)) { json_->at(field_name) = field_value; @@ -1325,6 +1346,46 @@ class JsonCpp { } } + void AddIntegerVector(std::string field_name, py::array_t field_vector) { + int vec_length = field_vector.size(); + auto accessor = field_vector.mutable_unchecked<1>(); + if (json_->contains(field_name)) { + json_->at(field_name).clear(); + for (int i = 0; i < vec_length; i++) { + json_->at(field_name).emplace_back(accessor(i)); + } + } else { + json_->emplace(std::pair(field_name, nlohmann::json::array())); + for (int i = 0; i < vec_length; i++) { + json_->at(field_name).emplace_back(accessor(i)); + } + } + } + + void AddIntegerVectorSubfolder(std::string subfolder_name, std::string field_name, py::array_t field_vector) { + int vec_length = field_vector.size(); + auto accessor = field_vector.mutable_unchecked<1>(); + if (json_->contains(subfolder_name)) { + if (json_->at(subfolder_name).contains(field_name)) { + json_->at(subfolder_name).at(field_name).clear(); + for (int i = 0; i < 
vec_length; i++) { + json_->at(subfolder_name).at(field_name).emplace_back(accessor(i)); + } + } else { + json_->at(subfolder_name).emplace(std::pair(field_name, nlohmann::json::array())); + for (int i = 0; i < vec_length; i++) { + json_->at(subfolder_name).at(field_name).emplace_back(accessor(i)); + } + } + } else { + json_->emplace(std::pair(subfolder_name, nlohmann::json::object())); + json_->at(subfolder_name).emplace(std::pair(field_name, nlohmann::json::array())); + for (int i = 0; i < vec_length; i++) { + json_->at(subfolder_name).at(field_name).emplace_back(accessor(i)); + } + } + } + void AddStringVector(std::string field_name, std::vector& field_vector) { int vec_length = field_vector.size(); if (json_->contains(field_name)) { @@ -1391,6 +1452,14 @@ class JsonCpp { return json_->at(subfolder_name).at(field_name); } + int ExtractInteger(std::string field_name) { + return json_->at(field_name); + } + + int ExtractIntegerSubfolder(std::string subfolder_name, std::string field_name) { + return json_->at(subfolder_name).at(field_name); + } + bool ExtractBool(std::string field_name) { return json_->at(field_name); } @@ -1429,6 +1498,28 @@ class JsonCpp { return result; } + py::array_t ExtractIntegerVector(std::string field_name) { + auto json_vec = json_->at(field_name); + py::ssize_t json_vec_length = json_->at(field_name).size(); + auto result = py::array_t(py::detail::any_container({json_vec_length})); + auto accessor = result.mutable_unchecked<1>(); + for (size_t i = 0; i < json_vec_length; i++) { + accessor(i) = json_vec.at(i); + } + return result; + } + + py::array_t ExtractIntegerVectorSubfolder(std::string subfolder_name, std::string field_name) { + auto json_vec = json_->at(subfolder_name).at(field_name); + py::ssize_t json_vec_length = json_->at(subfolder_name).at(field_name).size(); + auto result = py::array_t(py::detail::any_container({json_vec_length})); + auto accessor = result.mutable_unchecked<1>(); + for (size_t i = 0; i < json_vec_length; i++) 
{ + accessor(i) = json_vec.at(i); + } + return result; + } + std::vector ExtractStringVector(std::string field_name) { auto json_vec = json_->at(field_name); py::ssize_t json_vec_length = json_->at(field_name).size(); @@ -1472,12 +1563,16 @@ PYBIND11_MODULE(stochtree_cpp, m) { .def("DumpJson", &JsonCpp::DumpJson) .def("AddDouble", &JsonCpp::AddDouble) .def("AddDoubleSubfolder", &JsonCpp::AddDoubleSubfolder) + .def("AddInteger", &JsonCpp::AddInteger) + .def("AddIntegerSubfolder", &JsonCpp::AddIntegerSubfolder) .def("AddBool", &JsonCpp::AddBool) .def("AddBoolSubfolder", &JsonCpp::AddBoolSubfolder) .def("AddString", &JsonCpp::AddString) .def("AddStringSubfolder", &JsonCpp::AddStringSubfolder) .def("AddDoubleVector", &JsonCpp::AddDoubleVector) .def("AddDoubleVectorSubfolder", &JsonCpp::AddDoubleVectorSubfolder) + .def("AddIntegerVector", &JsonCpp::AddIntegerVector) + .def("AddIntegerVectorSubfolder", &JsonCpp::AddIntegerVectorSubfolder) .def("AddStringVector", &JsonCpp::AddStringVector) .def("AddStringVectorSubfolder", &JsonCpp::AddStringVectorSubfolder) .def("AddForest", &JsonCpp::AddForest) @@ -1485,12 +1580,16 @@ PYBIND11_MODULE(stochtree_cpp, m) { .def("ContainsFieldSubfolder", &JsonCpp::ContainsFieldSubfolder) .def("ExtractDouble", &JsonCpp::ExtractDouble) .def("ExtractDoubleSubfolder", &JsonCpp::ExtractDoubleSubfolder) + .def("ExtractInteger", &JsonCpp::ExtractInteger) + .def("ExtractIntegerSubfolder", &JsonCpp::ExtractIntegerSubfolder) .def("ExtractBool", &JsonCpp::ExtractBool) .def("ExtractBoolSubfolder", &JsonCpp::ExtractBoolSubfolder) .def("ExtractString", &JsonCpp::ExtractString) .def("ExtractStringSubfolder", &JsonCpp::ExtractStringSubfolder) .def("ExtractDoubleVector", &JsonCpp::ExtractDoubleVector) .def("ExtractDoubleVectorSubfolder", &JsonCpp::ExtractDoubleVectorSubfolder) + .def("ExtractIntegerVector", &JsonCpp::ExtractIntegerVector) + .def("ExtractIntegerVectorSubfolder", &JsonCpp::ExtractIntegerVectorSubfolder) .def("ExtractStringVector", 
&JsonCpp::ExtractStringVector) .def("ExtractStringVectorSubfolder", &JsonCpp::ExtractStringVectorSubfolder) .def("SubsetJsonForest", &JsonCpp::SubsetJsonForest); diff --git a/stochtree/__init__.py b/stochtree/__init__.py index 95b49ae3..8e3cc643 100644 --- a/stochtree/__init__.py +++ b/stochtree/__init__.py @@ -3,7 +3,7 @@ from .calibration import calibrate_global_error_variance from .data import Dataset, Residual from .forest import ForestContainer, Forest -from .preprocessing import CovariateTransformer +from .preprocessing import CovariatePreprocessor from .sampler import RNG, ForestSampler, GlobalVarianceModel, LeafVarianceModel from .serialization import JSONSerializer from .utils import NotSampledError @@ -15,7 +15,7 @@ 'Residual', 'ForestContainer', 'Forest', - 'CovariateTransformer', + 'CovariatePreprocessor', 'RNG', 'ForestSampler', 'GlobalVarianceModel', diff --git a/stochtree/bart.py b/stochtree/bart.py index 01733c0a..1e4ee6b0 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -8,7 +8,7 @@ from typing import Optional, Dict, Any, Union from .data import Dataset, Residual from .forest import ForestContainer, Forest -from .preprocessing import CovariateTransformer, _preprocess_params +from .preprocessing import CovariatePreprocessor, _preprocess_params from .sampler import ForestSampler, RNG, GlobalVarianceModel, LeafVarianceModel from .serialization import JSONSerializer from .utils import NotSampledError @@ -301,7 +301,7 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N variable_weights_variance = variable_weights # Covariate preprocessing - self._covariate_transformer = CovariateTransformer() + self._covariate_transformer = CovariatePreprocessor() self._covariate_transformer.fit(X_train) X_train_processed = self._covariate_transformer.transform(X_train) if X_test is not None: diff --git a/stochtree/bcf.py b/stochtree/bcf.py index 8c7ca21c..492cd4dd 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -8,7 +8,7 
@@ from .bart import BARTModel from .data import Dataset, Residual from .forest import ForestContainer, Forest -from .preprocessing import CovariateTransformer, _preprocess_params +from .preprocessing import CovariatePreprocessor, _preprocess_params from .sampler import ForestSampler, RNG, GlobalVarianceModel, LeafVarianceModel from .serialization import JSONSerializer from .utils import NotSampledError @@ -38,7 +38,7 @@ class BCFModel: \begin{aligned} y &= a(X) + b_z(X) + \epsilon\\ b_z(X) &= (b_1 Z + b_0 (1-Z)) t(X)\\ - b_0, b_1 &\sim N(0, \frac{1}{2})\\\\ + b_0, b_1 &\sim N\left(0, \frac{1}{2}\right)\\\\ a(X) &\sim \text{BART}()\\ t(X) &\sim \text{BART}()\\ \epsilon &\sim N(0, \sigma^2)\\ @@ -663,7 +663,7 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr variable_subset_variance = [i for i in range(X_train.shape[1])] # Covariate preprocessing - self._covariate_transformer = CovariateTransformer() + self._covariate_transformer = CovariatePreprocessor() self._covariate_transformer.fit(X_train) X_train_processed = self._covariate_transformer.transform(X_train) if X_test is not None: diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py index a586afd8..40019cc8 100644 --- a/stochtree/preprocessing.py +++ b/stochtree/preprocessing.py @@ -7,7 +7,9 @@ from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder import numpy as np import pandas as pd +from scipy import sparse import warnings +from .serialization import JSONSerializer def _preprocess_params(default_params: Dict[str, Any], user_params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: if user_params: @@ -129,20 +131,44 @@ def _preprocess_bcf_params(params: Optional[Dict[str, Any]] = None) -> Dict[str, return processed_params -class CovariateTransformer: +def _map_to_integer(values: Union[np.array, list], uniques: Union[np.array, list]) -> np.array: + r""" + Slightly modified version of a [scikit-learn 
function](https://github.com/scikit-learn/scikit-learn/blob/43d440f1f874ac2117ed848b10a6f07d9083488d/sklearn/utils/_encode.py#L170) by the same name. + Converts dataframe column values (which might be string, categorical, etc...) to numpy integer indices. + + Parameters + ---------- + values : np.array or list + Array of series values. + uniques : np.array or list + Sorted array / list of unique values in the series. """ - Class that transforms covariates to a format that can be used to define tree splits. - Modeled after the [scikit-learn preprocessing classes](https://scikit-learn.org/1.5/modules/preprocessing.html). + table = dict({val: i for i, val in enumerate(uniques)}) + return np.asarray([table[v] for v in values]) + + +class CovariatePreprocessor: + r""" + Preprocessing engine for covariates provided as either `np.array` or `pd.DataFrame`, which standardizes inputs as a `np.array`. + + `CovariatePreprocessor` uses [column dtypes](https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes) in provided + dataframes to convert string / categorical variables to numeric variables, either by mapping ordinal variables to integers + or by one-hot encoding unordered categorical variables. + + This class is modeled after the [scikit-learn preprocessing classes](https://scikit-learn.org/1.5/modules/preprocessing.html). 
""" def __init__(self) -> None: self._is_fitted = False - self._ordinal_encoders = [] - self._onehot_encoders = [] - self._ordinal_feature_index = [] - self._onehot_feature_index = [] - self._processed_feature_types = [] - self._original_feature_types = [] - self._original_feature_indices = [] + self._num_ordinal_features = 0 + self._num_onehot_features = 0 + self._num_original_features = 0 + self._ordinal_categories_list = [] + self._onehot_categories_list = [] + self._ordinal_feature_index = None + self._onehot_feature_index = None + self._processed_feature_types = None + self._original_feature_types = None + self._original_feature_indices = None def _check_is_numeric_dtype(self, dtype: np.dtype) -> bool: if dtype.kind == "b" or dtype.kind == "i" or dtype.kind == "u" or dtype.kind == "f": @@ -150,27 +176,41 @@ def _check_is_numeric_dtype(self, dtype: np.dtype) -> bool: else: return False - def _process_unordered_categorical(self, covariate: pd.Series) -> int: - num_onehot = len(self._onehot_encoders) - category_list = covariate.array.categories.to_list() - enc = OneHotEncoder(categories=[category_list], sparse_output=False) - enc.fit(pd.DataFrame(covariate)) - self._onehot_encoders.append(enc) - return num_onehot + def _extract_categories_unordered_categorical(self, covariate: pd.Series) -> int: + covariate_categories = covariate.array.categories.to_numpy() + self._onehot_categories_list.append(covariate_categories) + return self._num_onehot_features + + def _extract_categories_ordered_categorical(self, covariate: pd.Series) -> int: + covariate_categories = covariate.array.categories.to_numpy() + self._ordinal_categories_list.append(covariate_categories) + return self._num_ordinal_features + + def _transform_unordered_categorical(self, covariate: pd.Series, covariate_categories: np.array) -> np.array: + """ + Adapted from https://github.com/scikit-learn/scikit-learn/blob/8f2c1cab50262bcf4a1ade070446c40028ee27f4/sklearn/preprocessing/_encoders.py#L1000 + """ + 
covariate_data = covariate.array.to_numpy() + n = len(covariate_data) + integer_indices = _map_to_integer(covariate_data, covariate_categories) + row_offsets = np.arange(n + 1, dtype=int) + onehot_data = np.ones(row_offsets[-1]) + out = sparse.csr_matrix( + (onehot_data, integer_indices, row_offsets), + shape=(n, len(covariate_categories)), + dtype=np.float64, + ) + return out.toarray() - def _process_ordered_categorical(self, covariate: pd.Series) -> int: - num_ord = len(self._ordinal_encoders) - category_list = covariate.array.categories.to_list() - enc = OrdinalEncoder(categories=[category_list]) - enc.fit(pd.DataFrame(covariate)) - self._ordinal_encoders.append(enc) - return num_ord + def _transform_ordered_categorical(self, covariate: pd.Series, covariate_categories: np.array) -> np.array: + covariate_data = covariate.array.to_numpy() + return _map_to_integer(covariate_data, covariate_categories) def _fit_pandas(self, covariates: pd.DataFrame) -> None: self._num_original_features = covariates.shape[1] - self._ordinal_feature_index = [-1 for i in range(self._num_original_features)] - self._onehot_feature_index = [-1 for i in range(self._num_original_features)] - self._original_feature_types = [-1 for i in range(self._num_original_features)] + self._ordinal_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int) + self._onehot_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int) + original_feature_types = [-1 for i in range(self._num_original_features)] datetime_types = covariates.apply(lambda x: pd.api.types.is_datetime64_any_dtype(x)) object_types = covariates.apply(lambda x: pd.api.types.is_object_dtype(x)) interval_types = covariates.apply(lambda x: isinstance(x.dtype, pd.IntervalDtype)) @@ -214,36 +254,42 @@ def _fit_pandas(self, covariates: pd.DataFrame) -> None: warn_msg = "The following columns are a type unsupported by stochtree (object) and will be ignored: {}" 
warnings.warn(warn_msg.format(object_cols)) + processed_feature_types = [] for i in range(covariates.shape[1]): covariate = covariates.iloc[:,i] if categorical_types.iloc[i]: - self._original_feature_types[i] = "category" + original_feature_types[i] = "category" if covariate.array.ordered: - ord_index = self._process_ordered_categorical(covariate) + ord_index = self._extract_categories_ordered_categorical(covariate) self._ordinal_feature_index[i] = ord_index - self._processed_feature_types.append(1) + processed_feature_types.append(1) + self._num_ordinal_features += 1 else: - onehot_index = self._process_unordered_categorical(covariate) + onehot_index = self._extract_categories_unordered_categorical(covariate) self._onehot_feature_index[i] = onehot_index feature_ones = np.repeat(1, len(covariate.array.categories)).tolist() - self._processed_feature_types.extend(feature_ones) + processed_feature_types.extend(feature_ones) + self._num_onehot_features += 1 elif string_types.iloc[i]: - self._original_feature_types[i] = "string" - onehot_index = self._process_unordered_categorical(covariate) + original_feature_types[i] = "string" + onehot_index = self._extract_categories_unordered_categorical(covariate) self._onehot_feature_index[i] = onehot_index feature_ones = np.repeat(1, len(self._onehot_encoders[onehot_index].categories_[0])).tolist() - self._processed_feature_types.extend(feature_ones) + processed_feature_types.extend(feature_ones) elif bool_types.iloc[i]: - self._original_feature_types[i] = "boolean" - self._processed_feature_types.append(1) + original_feature_types[i] = "boolean" + processed_feature_types.append(1) elif integer_types.iloc[i]: - self._original_feature_types[i] = "integer" - self._processed_feature_types.append(0) + original_feature_types[i] = "integer" + processed_feature_types.append(0) elif float_types.iloc[i]: - self._original_feature_types[i] = "float" - self._processed_feature_types.append(0) + original_feature_types[i] = "float" + 
processed_feature_types.append(0) else: - self._original_feature_types[i] = "unsupported" + original_feature_types[i] = "unsupported" + + self._processed_feature_types = np.array(processed_feature_types, dtype=int) + self._original_feature_types = np.array(original_feature_types) def _fit_numpy(self, covariates: np.array) -> None: if covariates.ndim == 1: @@ -252,9 +298,9 @@ def _fit_numpy(self, covariates: np.array) -> None: raise ValueError("Covariates passed as a numpy array must be 1d or 2d") self._num_original_features = covariates.shape[1] - self._ordinal_feature_index = [-1 for i in range(self._num_original_features)] - self._onehot_feature_index = [-1 for i in range(self._num_original_features)] - self._original_feature_types = ["float" for i in range(self._num_original_features)] + self._ordinal_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int) + self._onehot_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int) + self._original_feature_types = np.array(["float" for i in range(self._num_original_features)]) # Check whether the array is numeric cov_dtype = covariates.dtype @@ -269,12 +315,16 @@ def _fit_numpy(self, covariates: np.array) -> None: raise ValueError("Covariates passed as np.array must all be simple numeric types (bool, integer, unsigned integer, floating point)") # Scan for binary columns + processed_feature_types = [] for i in range(self._num_original_features): num_unique = np.unique(covariates[:,i]).size if num_unique == 2: - self._processed_feature_types.append(1) + processed_feature_types.append(1) else: - self._processed_feature_types.append(0) + processed_feature_types.append(0) + # TODO: Convert to integer if not passed as integer + + self._processed_feature_types = np.array(processed_feature_types, dtype=int) def _fit(self, covariates: Union[pd.DataFrame, np.array]) -> None: if isinstance(covariates, pd.DataFrame): @@ -291,33 +341,38 @@ def _transform_pandas(self, 
covariates: pd.DataFrame) -> np.array: output_array = np.empty((covariates.shape[0], len(self._processed_feature_types)), dtype=np.float64) output_iter = 0 - self._original_feature_indices = [] + original_feature_indices = [] + print(self._original_feature_types) for i in range(covariates.shape[1]): covariate = covariates.iloc[:,i] if self._original_feature_types[i] == "category" or self._original_feature_types[i] == "string": if self._ordinal_feature_index[i] != -1: ord_ind = self._ordinal_feature_index[i] - covariate_transformed = self._ordinal_encoders[ord_ind].transform(pd.DataFrame(covariate)) + covariate_categories = self._ordinal_categories_list[ord_ind] + covariate_transformed = self._transform_ordered_categorical(covariate, covariate_categories) output_array[:,output_iter] = np.squeeze(covariate_transformed) output_iter += 1 - self._original_feature_indices.append(i) + original_feature_indices.append(i) else: onehot_ind = self._onehot_feature_index[i] - covariate_transformed = self._onehot_encoders[onehot_ind].transform(pd.DataFrame(covariate)) + covariate_categories = self._onehot_categories_list[onehot_ind] + covariate_transformed = self._transform_unordered_categorical(covariate, covariate_categories) output_dim = covariate_transformed.shape[1] output_array[:,np.arange(output_iter, output_iter + output_dim)] = np.squeeze(covariate_transformed) output_iter += output_dim - self._original_feature_indices.extend([i for _ in range(output_dim)]) + original_feature_indices.extend([i for _ in range(output_dim)]) elif self._original_feature_types[i] == "boolean": output_array[:,output_iter] = (covariate*1.0).to_numpy() output_iter += 1 - self._original_feature_indices.append(i) + original_feature_indices.append(i) elif self._original_feature_types[i] == "integer" or self._original_feature_types[i] == "float": output_array[:,output_iter] = (covariate).to_numpy() output_iter += 1 - self._original_feature_indices.append(i) + original_feature_indices.append(i) + + 
self._original_feature_indices = np.array(original_feature_indices, dtype=int) return output_array @@ -346,7 +401,7 @@ def _check_is_fitted(self) -> bool: return self._is_fitted def fit(self, covariates: Union[pd.DataFrame, np.array]) -> None: - r"""Fits a `CovariateTransformer` by unpacking (and storing) data type information on the input (raw) covariates + r"""Fits a `CovariatePreprocessor` by unpacking (and storing) data type information on the input (raw) covariates and then converting to a numpy array which can be passed to a tree ensemble sampler. If `covariates` is a `pd.DataFrame`, [column dtypes](https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes) @@ -431,3 +486,105 @@ def fetch_original_feature_indices(self) -> list: through `k` numeric features, this method would return a list `[0,...,k-1]`. """ return self._original_feature_indices + + def to_json(self) -> str: + """ + Converts a covariate preprocessor to JSON string representation (which can then be saved to a file or + processed using the `json` library) + + Returns + ------- + str + JSON string representing model metadata (hyperparameters), sampled parameters, and sampled forests + """ + # Initialize JSONSerializer object + preprocessor_json = JSONSerializer() + + # Add internal scalars + preprocessor_json.add_boolean("is_fitted", self._is_fitted) + preprocessor_json.add_integer("num_ordinal_features", self._num_ordinal_features) + preprocessor_json.add_integer("num_onehot_features", self._num_onehot_features) + preprocessor_json.add_integer("num_original_features", self._num_original_features) + + # Add internal lists + for i in range(self._num_ordinal_features): + dtype_name = "dtype_{:d}".format(i) + list_name = "cats_{:d}".format(i) + if np.issubdtype(self._ordinal_categories_list[i].dtype, np.integer): + array_type = "int" + preprocessor_json.add_integer_vector(list_name, self._ordinal_categories_list[i], "ordinal_categories_list") + elif 
np.issubdtype(self._ordinal_categories_list[i].dtype, np.floating): + array_type = "float" + preprocessor_json.add_numeric_vector(list_name, self._ordinal_categories_list[i], "ordinal_categories_list") + else: + array_type = "str" + preprocessor_json.add_string_vector(list_name, self._ordinal_categories_list[i], "ordinal_categories_list") + preprocessor_json.add_string(dtype_name, array_type, "ordinal_dtype_list") + for i in range(self._num_onehot_features): + dtype_name = "dtype_{:d}".format(i) + list_name = "cats_{:d}".format(i) + if np.issubdtype(self._onehot_categories_list[i].dtype, np.integer): + array_type = "int" + preprocessor_json.add_integer_vector(list_name, self._onehot_categories_list[i], "onehot_categories_list") + elif np.issubdtype(self._onehot_categories_list[i].dtype, np.floating): + array_type = "float" + preprocessor_json.add_numeric_vector(list_name, self._onehot_categories_list[i], "onehot_categories_list") + else: + array_type = "str" + preprocessor_json.add_string_vector(list_name, self._onehot_categories_list[i], "onehot_categories_list") + preprocessor_json.add_string(dtype_name, array_type, "onehot_dtype_list") + preprocessor_json.add_integer_vector("ordinal_feature_index", self._ordinal_feature_index) + preprocessor_json.add_integer_vector("onehot_feature_index", self._onehot_feature_index) + preprocessor_json.add_integer_vector("processed_feature_types", self._processed_feature_types) + preprocessor_json.add_string_vector("original_feature_types", self._original_feature_types) + preprocessor_json.add_integer_vector("original_feature_indices", self._original_feature_indices) + + return preprocessor_json.return_json_string() + + def from_json(self, json_string: str) -> None: + """ + Converts a JSON string to an in-memory BART model. 
+ + Parameters + ---------- + json_string : str + JSON string representing model metadata (hyperparameters), sampled parameters, and sampled forests + """ + # Parse string to a JSON object in C++ + preprocessor_json = JSONSerializer() + preprocessor_json.load_from_json_string(json_string) + + # Unpack internal scalars + self._is_fitted = preprocessor_json.get_boolean("is_fitted") + self._num_ordinal_features = preprocessor_json.get_integer("num_ordinal_features") + self._num_onehot_features = preprocessor_json.get_integer("num_onehot_features") + self._num_original_features = preprocessor_json.get_integer("num_original_features") + + # Unpack internal lists + self._ordinal_categories_list = [] + for i in range(self._num_ordinal_features): + dtype_name = "dtype_{:d}".format(i) + list_name = "cats_{:d}".format(i) + array_type = preprocessor_json.get_string(dtype_name, "ordinal_dtype_list") + if array_type == "int": + self._ordinal_categories_list.append(preprocessor_json.get_integer_vector(list_name, "ordinal_categories_list")) + elif array_type == "float": + self._ordinal_categories_list.append(preprocessor_json.get_numeric_vector(list_name, "ordinal_categories_list")) + else: + self._ordinal_categories_list.append(preprocessor_json.get_string_vector(list_name, "ordinal_categories_list")) + self._onehot_categories_list = [] + for i in range(self._num_onehot_features): + dtype_name = "dtype_{:d}".format(i) + list_name = "cats_{:d}".format(i) + array_type = preprocessor_json.get_string(dtype_name, "onehot_dtype_list") + if array_type == "int": + self._onehot_categories_list.append(preprocessor_json.get_integer_vector(list_name, "onehot_categories_list")) + elif array_type == "float": + self._onehot_categories_list.append(preprocessor_json.get_numeric_vector(list_name, "onehot_categories_list")) + else: + self._onehot_categories_list.append(np.array(preprocessor_json.get_string_vector(list_name, "onehot_categories_list"))) + self._ordinal_feature_index = 
preprocessor_json.get_integer_vector("ordinal_feature_index") + self._onehot_feature_index = preprocessor_json.get_integer_vector("onehot_feature_index") + self._processed_feature_types = preprocessor_json.get_integer_vector("processed_feature_types") + self._original_feature_types = preprocessor_json.get_string_vector("original_feature_types") + self._original_feature_indices = preprocessor_json.get_integer_vector("original_feature_indices") diff --git a/stochtree/serialization.py b/stochtree/serialization.py index acbb9e85..b6d3a93b 100644 --- a/stochtree/serialization.py +++ b/stochtree/serialization.py @@ -1,6 +1,7 @@ import warnings import numpy as np import pandas as pd +from typing import Union from scipy.linalg import lstsq from scipy.stats import gamma from .forest import ForestContainer @@ -66,6 +67,23 @@ def add_scalar(self, field_name: str, field_value: float, subfolder_name: str = else: self.json_cpp.AddDoubleSubfolder(subfolder_name, field_name, field_value) + def add_integer(self, field_name: str, field_value: int, subfolder_name: str = None) -> None: + """Adds an integer value to a json object + + Parameters + ---------- + field_name : str + Name of the json field / label under which the numeric value will be stored + field_value : int + Integer value to be stored + subfolder_name : str, optional + Name of "subfolder" under which `field_name` to be stored in the json hierarchy + """ + if subfolder_name is None: + self.json_cpp.AddInteger(field_name, field_value) + else: + self.json_cpp.AddIntegerSubfolder(subfolder_name, field_name, field_value) + def add_boolean(self, field_name: str, field_value: bool, subfolder_name: str = None) -> None: """Adds a scalar (boolean) value to a json object @@ -125,6 +143,33 @@ def add_numeric_vector(self, field_name: str, field_vector: np.array, subfolder_ else: self.json_cpp.AddDoubleVectorSubfolder(subfolder_name, field_name, field_vector) + def add_integer_vector(self, field_name: str, field_vector: np.array, 
subfolder_name: str = None) -> None: + """Adds a integer vector (stored as a numpy array) to a json object + + Parameters + ---------- + field_name : str + Name of the json field / label under which the integer vector will be stored + field_vector : np.array + Numpy array containing the vector to be stored in json. Should be one-dimensional. + subfolder_name : str, optional + Name of "subfolder" under which `field_name` to be stored in the json hierarchy + """ + # Runtime checks + if not isinstance(field_vector, np.ndarray): + raise ValueError("field_vector must be a numpy array") + if not np.issubdtype(field_vector.dtype, np.integer): + raise ValueError("field_vector must be a numpy array with integer data types") + field_vector = np.squeeze(field_vector) + if field_vector.ndim > 1: + warnings.warn("field_vector has more than 1 dimension. It will be flattened in row-major order using np.ravel()") + field_vector = np.ravel(field_vector, order = "C") + + if subfolder_name is None: + self.json_cpp.AddIntegerVector(field_name, field_vector) + else: + self.json_cpp.AddIntegerVectorSubfolder(subfolder_name, field_name, field_vector) + def add_string_vector(self, field_name: str, field_vector: list, subfolder_name: str = None) -> None: """Adds a list of strings to a json object as an array @@ -138,9 +183,11 @@ def add_string_vector(self, field_name: str, field_vector: list, subfolder_name: Name of "subfolder" under which `field_name` to be stored in the json hierarchy """ # Runtime checks - if not isinstance(field_vector, list): - raise ValueError("field_vector must be a list") + if not isinstance(field_vector, list) and not isinstance(field_vector, np.ndarray): + raise ValueError("field_vector must be a list or numpy object array") + if isinstance(field_vector, np.ndarray): + field_vector = field_vector.tolist() if subfolder_name is None: self.json_cpp.AddStringVector(field_name, field_vector) else: @@ -161,6 +208,21 @@ def get_scalar(self, field_name: str, 
subfolder_name: str = None) -> float: else: return self.json_cpp.ExtractDoubleSubfolder(subfolder_name, field_name) + def get_integer(self, field_name: str, subfolder_name: str = None) -> int: + """Retrieves an integer value from a json object + + Parameters + ---------- + field_name : str + Name of the json field / label under which the numeric value is stored + subfolder_name : str, optional + Name of "subfolder" under which `field_name` is stored in the json hierarchy + """ + if subfolder_name is None: + return self.json_cpp.ExtractInteger(field_name) + else: + return self.json_cpp.ExtractIntegerSubfolder(subfolder_name, field_name) + def get_boolean(self, field_name: str, subfolder_name: str = None) -> bool: """Retrieves a scalar (boolean) value from a json object @@ -177,12 +239,12 @@ def get_boolean(self, field_name: str, subfolder_name: str = None) -> bool: return self.json_cpp.ExtractBoolSubfolder(subfolder_name, field_name) def get_string(self, field_name: str, subfolder_name: str = None) -> str: - """Retrieve a string to a json object + """Retrieve a string from a json object Parameters ---------- field_name : str - Name of the json field / label under which the numeric value is stored + Name of the json field / label under which the string is stored subfolder_name : str, optional Name of "subfolder" under which `field_name` is stored in the json hierarchy """ @@ -192,7 +254,7 @@ def get_string(self, field_name: str, subfolder_name: str = None) -> str: return self.json_cpp.ExtractStringSubfolder(subfolder_name, field_name) def get_numeric_vector(self, field_name: str, subfolder_name: str = None) -> np.array: - """Adds a string to a json object + """Retrieve numeric vector from a json object Parameters ---------- @@ -206,6 +268,21 @@ def get_numeric_vector(self, field_name: str, subfolder_name: str = None) -> np. 
else: return self.json_cpp.ExtractDoubleVectorSubfolder(subfolder_name, field_name) + def get_integer_vector(self, field_name: str, subfolder_name: str = None) -> np.array: + """Retrieve integer vector from a json object + + Parameters + ---------- + field_name : str + Name of the json field / label under which the integer vector is stored + subfolder_name : str, optional + Name of "subfolder" under which `field_name` to be stored in the json hierarchy + """ + if subfolder_name is None: + return self.json_cpp.ExtractIntegerVector(field_name) + else: + return self.json_cpp.ExtractIntegerVectorSubfolder(subfolder_name, field_name) + def get_string_vector(self, field_name: str, subfolder_name: str = None) -> list: """Adds a string to a json object diff --git a/test/python/test_calibration.py b/test/python/test_calibration.py index 312b9632..0cc437a8 100644 --- a/test/python/test_calibration.py +++ b/test/python/test_calibration.py @@ -3,7 +3,6 @@ from sklearn import linear_model from sklearn.metrics import mean_squared_error from scipy.stats import gamma -from stochtree import CovariateTransformer from stochtree import calibrate_global_error_variance import pytest diff --git a/test/python/test_json.py b/test/python/test_json.py index 2bd71cd8..d1291254 100644 --- a/test/python/test_json.py +++ b/test/python/test_json.py @@ -1,7 +1,8 @@ import numpy as np +import pandas as pd from stochtree import ( BARTModel, BCFModel, JSONSerializer, ForestContainer, Forest, Dataset, Residual, - RNG, ForestSampler, ForestContainer, GlobalVarianceModel + RNG, ForestSampler, ForestContainer, GlobalVarianceModel, CovariatePreprocessor ) class TestJson: @@ -26,6 +27,28 @@ def test_array(self): np.testing.assert_array_equal(a, json_test.get_numeric_vector("a")) assert b == json_test.get_string_vector("b") + def test_preprocessor(self): + df = pd.DataFrame( + {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], + "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=False, categories=['c', 'b', 
'a']), + "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]} + ) + # arr = np.array( + # [[1.5, 0, 0, 1, 1.2], + # [2.7, 0, 1, 0, 5.4], + # [3.6, 1, 0, 0, 9.3], + # [4.4, 0, 0, 1, 10.4], + # [5.3, 0, 1, 0, 3.6], + # [6.1, 1, 0, 0, 4.4]] + # ) + cov_transformer = CovariatePreprocessor() + df_transformed_orig = cov_transformer.fit_transform(df) + cov_transformer_json = cov_transformer.to_json() + cov_transformer_reloaded = CovariatePreprocessor() + cov_transformer_reloaded.from_json(cov_transformer_json) + df_transformed_reloaded = cov_transformer_reloaded.transform(df) + np.testing.assert_array_equal(df_transformed_orig, df_transformed_reloaded) + def test_forest(self): # Generate sample data random_seed = 1234 diff --git a/test/python/test_preprocessor.py b/test/python/test_preprocessor.py index 4282a20e..87e338e7 100644 --- a/test/python/test_preprocessor.py +++ b/test/python/test_preprocessor.py @@ -1,10 +1,10 @@ import numpy as np import pandas as pd -from stochtree import CovariateTransformer +from stochtree import CovariatePreprocessor class TestPreprocessor: def test_numpy(self): - cov_transformer = CovariateTransformer() + cov_transformer = CovariatePreprocessor() np_1 = np.array( [[1.5, 8.7, 1.2], [2.7, 3.4, 5.4], @@ -15,7 +15,7 @@ def test_numpy(self): ) np_1_transformed = cov_transformer.fit_transform(np_1) np.testing.assert_array_equal(np_1, np_1_transformed) - assert cov_transformer._processed_feature_types == [0,0,0] + np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,0,0])) def test_pandas(self): df_1 = pd.DataFrame( @@ -31,10 +31,10 @@ def test_pandas(self): [5.3, 9.3, 3.6], [6.1, 10.4, 4.4]] ) - cov_transformer = CovariateTransformer() + cov_transformer = CovariatePreprocessor() df_1_transformed = cov_transformer.fit_transform(df_1) np.testing.assert_array_equal(np_1, df_1_transformed) - assert cov_transformer._processed_feature_types == [0,0,0] + np.testing.assert_array_equal(cov_transformer._processed_feature_types, 
np.array([0,0,0])) df_2 = pd.DataFrame( {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], @@ -49,10 +49,10 @@ def test_pandas(self): [5.3, 1, 3.6], [6.1, 0, 4.4]] ) - cov_transformer = CovariateTransformer() + cov_transformer = CovariatePreprocessor() df_2_transformed = cov_transformer.fit_transform(df_2) np.testing.assert_array_equal(np_2, df_2_transformed) - assert cov_transformer._processed_feature_types == [0,1,0] + np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,1,0])) df_3 = pd.DataFrame( {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], @@ -67,14 +67,14 @@ def test_pandas(self): [5.3, 0, 1, 0, 3.6], [6.1, 1, 0, 0, 4.4]] ) - cov_transformer = CovariateTransformer() + cov_transformer = CovariatePreprocessor() df_3_transformed = cov_transformer.fit_transform(df_3) np.testing.assert_array_equal(np_3, df_3_transformed) - assert cov_transformer._processed_feature_types == [0,1,1,1,0] + np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,1,1,1,0])) df_4 = pd.DataFrame( {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1, 7.6], - "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c', 'd'], ordered=False, categories=['c', 'b', 'a']), + "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c', 'c'], ordered=False, categories=['c', 'b', 'a', 'd']), "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4, 3.4]} ) np_4 = np.array( @@ -83,10 +83,11 @@ def test_pandas(self): [3.6, 1, 0, 0, 0, 9.3], [4.4, 0, 0, 1, 0, 10.4], [5.3, 0, 1, 0, 0, 3.6], - [6.1, 1, 0, 0, 1, 4.4]] + [6.1, 1, 0, 0, 0, 4.4], + [7.6, 1, 0, 0, 0, 3.4]] ) - cov_transformer = CovariateTransformer() - with np.testing.assert_raises(ValueError): - df_4_transformed = cov_transformer.fit_transform(df_4) - # np.testing.assert_array_equal(np_4, df_4_transformed) - # assert cov_transformer._processed_feature_types == [0,1,1,1,1,0] + cov_transformer = CovariatePreprocessor() + df_4_transformed = cov_transformer.fit_transform(df_4) + np.testing.assert_array_equal(np_4, df_4_transformed) + 
np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,1,1,1,1,0])) + \ No newline at end of file From 8aacffea2f59c9c86dddfb5482b29e18d168e3d3 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Wed, 22 Jan 2025 13:41:37 -0600 Subject: [PATCH 3/5] Updated python serialization test suite --- test/python/test_json.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/test/python/test_json.py b/test/python/test_json.py index d1291254..ed002626 100644 --- a/test/python/test_json.py +++ b/test/python/test_json.py @@ -33,14 +33,6 @@ def test_preprocessor(self): "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=False, categories=['c', 'b', 'a']), "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]} ) - # arr = np.array( - # [[1.5, 0, 0, 1, 1.2], - # [2.7, 0, 1, 0, 5.4], - # [3.6, 1, 0, 0, 9.3], - # [4.4, 0, 0, 1, 10.4], - # [5.3, 0, 1, 0, 3.6], - # [6.1, 1, 0, 0, 4.4]] - # ) cov_transformer = CovariatePreprocessor() df_transformed_orig = cov_transformer.fit_transform(df) cov_transformer_json = cov_transformer.to_json() @@ -49,6 +41,21 @@ def test_preprocessor(self): df_transformed_reloaded = cov_transformer_reloaded.transform(df) np.testing.assert_array_equal(df_transformed_orig, df_transformed_reloaded) + df_2 = pd.DataFrame( + {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1], + "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=False, categories=['c', 'b', 'a']), + "x3": pd.Categorical(['a', 'c', 'd', 'b', 'd', 'b'], ordered=False, categories=['c', 'b', 'a', 'd']), + "x4": pd.Categorical(['a', 'b', 'f', 'f', 'c', 'a'], ordered=True, categories=['c', 'b', 'a', 'f']), + "x5": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]} + ) + cov_transformer_2 = CovariatePreprocessor() + df_transformed_orig_2 = cov_transformer_2.fit_transform(df_2) + cov_transformer_json_2 = cov_transformer_2.to_json() + cov_transformer_reloaded_2 = CovariatePreprocessor() + cov_transformer_reloaded_2.from_json(cov_transformer_json_2) + 
df_transformed_reloaded_2 = cov_transformer_reloaded_2.transform(df_2) + np.testing.assert_array_equal(df_transformed_orig_2, df_transformed_reloaded_2) + def test_forest(self): # Generate sample data random_seed = 1234 From b288d9b292e2d3cc55cdb5d09a30bbfc1fbf49d4 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Wed, 22 Jan 2025 15:48:06 -0600 Subject: [PATCH 4/5] Fixed bugs and added covariate preprocessor serialization to BART / BCF --- stochtree/bart.py | 135 ++++++++++++++++++++++++++++++------- stochtree/bcf.py | 21 ++++-- stochtree/preprocessing.py | 4 +- test/python/test_json.py | 11 +++ 4 files changed, 138 insertions(+), 33 deletions(-) diff --git a/stochtree/bart.py b/stochtree/bart.py index 1e4ee6b0..0159fc92 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -1,6 +1,7 @@ """ Bayesian Additive Regression Trees (BART) module """ +import warnings from numbers import Number, Integral from math import log import numpy as np @@ -52,7 +53,8 @@ def __init__(self) -> None: self.sampled = False self.rng = np.random.default_rng() - def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = None, X_test: np.array = None, basis_test: np.array = None, + def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basis_train: np.array = None, + X_test: Union[np.array, pd.DataFrame] = None, basis_test: np.array = None, num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, general_params: Optional[Dict[str, Any]] = None, mean_forest_params: Optional[Dict[str, Any]] = None, variance_forest_params: Optional[Dict[str, Any]] = None) -> None: """Runs a BART sampler on provided training set. Predictions will be cached for the training set and (if provided) the test set. 
@@ -301,13 +303,13 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N variable_weights_variance = variable_weights # Covariate preprocessing - self._covariate_transformer = CovariatePreprocessor() - self._covariate_transformer.fit(X_train) - X_train_processed = self._covariate_transformer.transform(X_train) + self._covariate_preprocessor = CovariatePreprocessor() + self._covariate_preprocessor.fit(X_train) + X_train_processed = self._covariate_preprocessor.transform(X_train) if X_test is not None: - X_test_processed = self._covariate_transformer.transform(X_test) - feature_types = np.asarray(self._covariate_transformer._processed_feature_types) - original_var_indices = self._covariate_transformer.fetch_original_feature_indices() + X_test_processed = self._covariate_preprocessor.transform(X_test) + feature_types = np.asarray(self._covariate_preprocessor._processed_feature_types) + original_var_indices = self._covariate_preprocessor.fetch_original_feature_indices() # Determine whether a test set is provided self.has_test = X_test is not None @@ -718,7 +720,7 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N else: self.sigma2_x_test = sigma_x_test_raw*self.sigma2_init*self.y_std*self.y_std - def predict(self, covariates: np.array, basis: np.array = None) -> Union[np.array, tuple]: + def predict(self, covariates: Union[np.array, pd.DataFrame], basis: np.array = None) -> Union[np.array, tuple]: """Return predictions from every forest sampled (either / both of mean and variance). Return type is either a single array of predictions, if a BART model only includes a mean or variance term, or a tuple of prediction arrays, if a BART model includes both. 
@@ -744,22 +746,44 @@ def predict(self, covariates: np.array, basis: np.array = None) -> Union[np.arra ) raise NotSampledError(msg) + # Data checks + if not isinstance(covariates, pd.DataFrame) and not isinstance(covariates, np.ndarray): + raise ValueError("covariates must be a pandas dataframe or numpy array") + if basis is not None: + if not isinstance(basis, np.ndarray): + raise ValueError("basis must be a numpy array") + if basis.shape[0] != covariates.shape[0]: + raise ValueError("covariates and basis must have the same number of rows") + # Convert everything to standard shape (2-dimensional) - if covariates.ndim == 1: - covariates = np.expand_dims(covariates, 1) + if isinstance(covariates, np.ndarray): + if covariates.ndim == 1: + covariates = np.expand_dims(covariates, 1) if basis is not None: if basis.ndim == 1: basis = np.expand_dims(basis, 1) - # Data checks - if basis is not None: - if basis.shape[0] != covariates.shape[0]: - raise ValueError("covariates and basis must have the same number of rows") + # Covariate preprocessing + if not self._covariate_preprocessor._check_is_fitted(): + if not isinstance(covariates, np.ndarray): + raise ValueError("Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe.") + else: + warnings.warn("This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data a a Pandas dataframe.", RuntimeWarning) + if not np.issubdtype(covariates.dtype, np.floating) and not np.issubdtype(covariates.dtype, np.integer): + raise ValueError("Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. 
Please refit your model by passing non-numeric covariate data as a Pandas dataframe.") + covariates_processed = covariates + else: + self._covariate_preprocessor = CovariatePreprocessor() + self._covariate_preprocessor.fit(covariates) + covariates_processed = self._covariate_preprocessor.transform(covariates) + # Dataset construction pred_dataset = Dataset() - pred_dataset.add_covariates(covariates) + pred_dataset.add_covariates(covariates_processed) if basis is not None: pred_dataset.add_basis(basis) + + # Forest predictions if self.include_mean_forest: mean_pred_raw = self.forest_container_mean.forest_container_cpp.Predict(pred_dataset.dataset_cpp) mean_pred = mean_pred_raw*self.y_std + self.y_bar @@ -808,22 +832,44 @@ def predict_mean(self, covariates: np.array, basis: np.array = None) -> np.array ) raise NotSampledError(msg) + # Data checks + if not isinstance(covariates, pd.DataFrame) and not isinstance(covariates, np.ndarray): + raise ValueError("covariates must be a pandas dataframe or numpy array") + if basis is not None: + if not isinstance(basis, np.ndarray): + raise ValueError("basis must be a numpy array") + if basis.shape[0] != covariates.shape[0]: + raise ValueError("covariates and basis must have the same number of rows") + # Convert everything to standard shape (2-dimensional) - if covariates.ndim == 1: - covariates = np.expand_dims(covariates, 1) + if isinstance(covariates, np.ndarray): + if covariates.ndim == 1: + covariates = np.expand_dims(covariates, 1) if basis is not None: if basis.ndim == 1: basis = np.expand_dims(basis, 1) - # Data checks - if basis is not None: - if basis.shape[0] != covariates.shape[0]: - raise ValueError("covariates and basis must have the same number of rows") + # Covariate preprocessing + if not self._covariate_preprocessor._check_is_fitted(): + if not isinstance(covariates, np.ndarray): + raise ValueError("Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. 
Please refit your model by passing covariate data as a Pandas dataframe.") + else: + warnings.warn("This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data a a Pandas dataframe.", RuntimeWarning) + if not np.issubdtype(covariates.dtype, np.floating) and not np.issubdtype(covariates.dtype, np.integer): + raise ValueError("Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.") + covariates_processed = covariates + else: + self._covariate_preprocessor = CovariatePreprocessor() + self._covariate_preprocessor.fit(covariates) + covariates_processed = self._covariate_preprocessor.transform(covariates) + # Dataset construction pred_dataset = Dataset() - pred_dataset.add_covariates(covariates) + pred_dataset.add_covariates(covariates_processed) if basis is not None: pred_dataset.add_basis(basis) + + # Mean forest predictions mean_pred_raw = self.forest_container_mean.forest_container_cpp.Predict(pred_dataset.dataset_cpp) mean_pred = mean_pred_raw*self.y_std + self.y_bar @@ -856,12 +902,42 @@ def predict_variance(self, covariates: np.array) -> np.array: ) raise NotSampledError(msg) + # Data checks + if not isinstance(covariates, pd.DataFrame) and not isinstance(covariates, np.ndarray): + raise ValueError("covariates must be a pandas dataframe or numpy array") + if basis is not None: + if not isinstance(basis, np.ndarray): + raise ValueError("basis must be a numpy array") + if basis.shape[0] != covariates.shape[0]: + raise ValueError("covariates and basis must have the same number of rows") + # Convert everything to standard shape (2-dimensional) - if covariates.ndim == 1: - covariates = np.expand_dims(covariates, 1) + if isinstance(covariates, 
np.ndarray): + if covariates.ndim == 1: + covariates = np.expand_dims(covariates, 1) + if basis is not None: + if basis.ndim == 1: + basis = np.expand_dims(basis, 1) + + # Covariate preprocessing + if not self._covariate_preprocessor._check_is_fitted(): + if not isinstance(covariates, np.ndarray): + raise ValueError("Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe.") + else: + warnings.warn("This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data a a Pandas dataframe.", RuntimeWarning) + if not np.issubdtype(covariates.dtype, np.floating) and not np.issubdtype(covariates.dtype, np.integer): + raise ValueError("Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. 
Please refit your model by passing non-numeric covariate data as a Pandas dataframe.") + covariates_processed = covariates + else: + self._covariate_preprocessor = CovariatePreprocessor() + self._covariate_preprocessor.fit(covariates) + covariates_processed = self._covariate_preprocessor.transform(covariates) + # Dataset construction pred_dataset = Dataset() - pred_dataset.add_covariates(covariates) + pred_dataset.add_covariates(covariates_processed) + + # Variance forest predictions variance_pred_raw = self.forest_container_variance.forest_container_cpp.Predict(pred_dataset.dataset_cpp) if self.sample_sigma_global: variance_pred = variance_pred_raw @@ -920,6 +996,10 @@ def to_json(self) -> str: if self.sample_sigma_leaf: bart_json.add_numeric_vector("sigma2_leaf_samples", self.leaf_scale_samples, "parameters") + # Add covariate preprocessor + covariate_preprocessor_string = self._covariate_preprocessor.to_json() + bart_json.add_string("covariate_preprocessor", covariate_preprocessor_string) + return bart_json.return_json_string() def from_json(self, json_string: str) -> None: @@ -971,6 +1051,11 @@ def from_json(self, json_string: str) -> None: if self.sample_sigma_leaf: self.leaf_scale_samples = bart_json.get_numeric_vector("sigma2_leaf_samples", "parameters") + # Unpack covariate preprocessor + covariate_preprocessor_string = bart_json.get_string("covariate_preprocessor") + self._covariate_preprocessor = CovariatePreprocessor() + self._covariate_preprocessor.from_json(covariate_preprocessor_string) + # Mark the deserialized model as "sampled" self.sampled = True diff --git a/stochtree/bcf.py b/stochtree/bcf.py index 492cd4dd..4f24234b 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -663,13 +663,13 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr variable_subset_variance = [i for i in range(X_train.shape[1])] # Covariate preprocessing - self._covariate_transformer = CovariatePreprocessor() - 
self._covariate_transformer.fit(X_train) - X_train_processed = self._covariate_transformer.transform(X_train) + self._covariate_preprocessor = CovariatePreprocessor() + self._covariate_preprocessor.fit(X_train) + X_train_processed = self._covariate_preprocessor.transform(X_train) if X_test is not None: - X_test_processed = self._covariate_transformer.transform(X_test) - feature_types = np.asarray(self._covariate_transformer._processed_feature_types) - original_var_indices = self._covariate_transformer.fetch_original_feature_indices() + X_test_processed = self._covariate_preprocessor.transform(X_test) + feature_types = np.asarray(self._covariate_preprocessor._processed_feature_types) + original_var_indices = self._covariate_preprocessor.fetch_original_feature_indices() # Determine whether a test set is provided self.has_test = X_test is not None @@ -1420,6 +1420,10 @@ def to_json(self) -> str: bart_propensity_string = self.bart_propensity_model.to_json() bcf_json.add_string("bart_propensity_model", bart_propensity_string) + # Add covariate preprocessor + covariate_preprocessor_string = self._covariate_preprocessor.to_json() + bcf_json.add_string("covariate_preprocessor", covariate_preprocessor_string) + return bcf_json.return_json_string() def from_json(self, json_string: str) -> None: @@ -1482,6 +1486,11 @@ def from_json(self, json_string: str) -> None: self.bart_propensity_model = BARTModel() self.bart_propensity_model.from_json(bart_propensity_string) + # Unpack covariate preprocessor + covariate_preprocessor_string = bcf_json.get_string("covariate_preprocessor") + self._covariate_preprocessor = CovariatePreprocessor() + self._covariate_preprocessor.from_json(covariate_preprocessor_string) + # Mark the deserialized model as "sampled" self.sampled = True diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py index 40019cc8..35633264 100644 --- a/stochtree/preprocessing.py +++ b/stochtree/preprocessing.py @@ -383,7 +383,7 @@ def _transform_numpy(self, 
covariates: np.array) -> np.array: raise ValueError("Covariates passed as a numpy array must be 1d or 2d") if self._num_original_features != covariates.shape[1]: raise ValueError("Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality") - self._original_feature_indices = [i for i in range(covariates.shape[1])] + self._original_feature_indices = np.array([i for i in range(covariates.shape[1])]) return covariates def _transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array: @@ -485,7 +485,7 @@ def fetch_original_feature_indices(self) -> list: this method would return a list `[0,0,0,0,0]`. If the transformer merely passes through `k` numeric features, this method would return a list `[0,...,k-1]`. """ - return self._original_feature_indices + return self._original_feature_indices.tolist() def to_json(self) -> str: """ diff --git a/test/python/test_json.py b/test/python/test_json.py index ed002626..4d8d903c 100644 --- a/test/python/test_json.py +++ b/test/python/test_json.py @@ -56,6 +56,17 @@ def test_preprocessor(self): df_transformed_reloaded_2 = cov_transformer_reloaded_2.transform(df_2) np.testing.assert_array_equal(df_transformed_orig_2, df_transformed_reloaded_2) + np_3 = np.array( + [[1.5, 1.2], [2.7, 5.4], [3.6, 9.3], [4.4, 10.4], [5.3, 3.6], [6.1, 4.4]] + ) + cov_transformer_3 = CovariatePreprocessor() + df_transformed_orig_3 = cov_transformer_3.fit_transform(np_3) + cov_transformer_json_3 = cov_transformer_3.to_json() + cov_transformer_reloaded_3 = CovariatePreprocessor() + cov_transformer_reloaded_3.from_json(cov_transformer_json_3) + df_transformed_reloaded_3 = cov_transformer_reloaded_3.transform(np_3) + np.testing.assert_array_equal(df_transformed_orig_3, df_transformed_reloaded_3) + def test_forest(self): # Generate sample data random_seed = 1234 From 0ef2cdde4ce9eaf32e5841fec2768ccdcc2c4c38 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Wed, 22 Jan 2025 17:36:23 -0600 Subject: 
[PATCH 5/5] Added preprocessor serialization to the R frontend as well --- NAMESPACE | 4 + R/bart.R | 26 +++++- R/bcf.R | 20 ++++- R/cpp11.R | 32 +++++++ R/serialization.R | 63 +++++++++++++- R/utils.R | 110 ++++++++++++++++++++++++ man/CppJson.Rd | 98 ++++++++++++++++++++- man/convertPreprocessorToJson.Rd | 23 +++++ man/createPreprocessorFromJson.Rd | 17 ++++ man/createPreprocessorFromJsonString.Rd | 17 ++++ man/saveBCFModelToJsonFile.Rd | 3 + man/savePreprocessorToJsonString.Rd | 23 +++++ src/cpp11.cpp | 68 +++++++++++++++ src/serialization.cpp | 89 +++++++++++++++++++ 14 files changed, 589 insertions(+), 4 deletions(-) create mode 100644 man/convertPreprocessorToJson.Rd create mode 100644 man/createPreprocessorFromJson.Rd create mode 100644 man/createPreprocessorFromJsonString.Rd create mode 100644 man/savePreprocessorToJsonString.Rd diff --git a/NAMESPACE b/NAMESPACE index 7c746a36..47ba8bcc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(computeForestLeafVariances) export(computeMaxLeafIndex) export(convertBARTModelToJson) export(convertBCFModelToJson) +export(convertPreprocessorToJson) export(createBARTModelFromCombinedJson) export(createBARTModelFromCombinedJsonString) export(createBARTModelFromJson) @@ -31,6 +32,8 @@ export(createForestCovariatesFromMetadata) export(createForestDataset) export(createForestModel) export(createOutcome) +export(createPreprocessorFromJson) +export(createPreprocessorFromJsonString) export(createRNG) export(createRandomEffectSamples) export(createRandomEffectsDataset) @@ -69,6 +72,7 @@ export(saveBARTModelToJsonFile) export(saveBARTModelToJsonString) export(saveBCFModelToJsonFile) export(saveBCFModelToJsonString) +export(savePreprocessorToJsonString) importFrom(R6,R6Class) importFrom(stats,coef) importFrom(stats,lm) diff --git a/R/bart.R b/R/bart.R index 841d7ee6..25699152 100644 --- a/R/bart.R +++ b/R/bart.R @@ -1215,6 +1215,12 @@ convertBARTModelToJson <- function(object){ 
jsonobj$add_string_vector("rfx_unique_group_ids", object$rfx_unique_group_ids) } + # Add covariate preprocessor metadata + preprocessor_metadata_string <- savePreprocessorToJsonString( + object$train_set_metadata + ) + jsonobj$add_string("preprocessor_metadata", preprocessor_metadata_string) + return(jsonobj) } @@ -1322,7 +1328,7 @@ saveBARTModelToJsonFile <- function(object, filename){ #' Convert the persistent aspects of a BART model to (in-memory) JSON string #' #' @param object Object of type `bartmodel` containing draws of a BART model and associated sampling outputs. -#' @return JSON string +#' @return in-memory JSON string #' @export #' #' @examples @@ -1460,6 +1466,12 @@ createBARTModelFromJson <- function(json_object){ output[["rfx_samples"]] <- loadRandomEffectSamplesJson(json_object, 0) } + # Unpack covariate preprocessor + preprocessor_metadata_string <- json_object$get_string("preprocessor_metadata") + output[["train_set_metadata"]] <- createPreprocessorFromJsonString( + preprocessor_metadata_string + ) + class(output) <- "bartmodel" return(output) } @@ -1686,6 +1698,12 @@ createBARTModelFromCombinedJson <- function(json_object_list){ output[["rfx_samples"]] <- loadRandomEffectSamplesCombinedJson(json_object_list, 0) } + # Unpack covariate preprocessor + preprocessor_metadata_string <- json_object$get_string("preprocessor_metadata") + output[["train_set_metadata"]] <- createPreprocessorFromJsonString( + preprocessor_metadata_string + ) + class(output) <- "bartmodel" return(output) } @@ -1832,6 +1850,12 @@ createBARTModelFromCombinedJsonString <- function(json_string_list){ output[["rfx_samples"]] <- loadRandomEffectSamplesCombinedJson(json_object_list, 0) } + # Unpack covariate preprocessor + preprocessor_metadata_string <- json_object$get_string("preprocessor_metadata") + output[["train_set_metadata"]] <- createPreprocessorFromJsonString( + preprocessor_metadata_string + ) + class(output) <- "bartmodel" return(output) } diff --git a/R/bcf.R b/R/bcf.R 
index bc5b9d5f..ed00d25e 100644 --- a/R/bcf.R +++ b/R/bcf.R @@ -1708,6 +1708,12 @@ convertBCFModelToJson <- function(object){ jsonobj$add_string("bart_propensity_model", bart_propensity_string) } + # Add covariate preprocessor metadata + preprocessor_metadata_string <- savePreprocessorToJsonString( + object$train_set_metadata + ) + jsonobj$add_string("preprocessor_metadata", preprocessor_metadata_string) + return(jsonobj) } @@ -1716,7 +1722,7 @@ convertBCFModelToJson <- function(object){ #' @param object Object of type `bcf` containing draws of a Bayesian causal forest model and associated sampling outputs. #' @param filename String of filepath, must end in ".json" #' -#' @return NULL +#' @return in-memory JSON string #' @export #' #' @examples @@ -2018,6 +2024,12 @@ createBCFModelFromJson <- function(json_object){ ) } + # Unpack covariate preprocessor + preprocessor_metadata_string <- json_object$get_string("preprocessor_metadata") + output[["train_set_metadata"]] <- createPreprocessorFromJsonString( + preprocessor_metadata_string + ) + class(output) <- "bcf" return(output) } @@ -2393,6 +2405,12 @@ createBCFModelFromCombinedJsonString <- function(json_string_list){ output[["rfx_samples"]] <- loadRandomEffectSamplesCombinedJson(json_object_list, 0) } + # Unpack covariate preprocessor + preprocessor_metadata_string <- json_object_default$get_string("preprocessor_metadata") + output[["train_set_metadata"]] <- createPreprocessorFromJsonString( + preprocessor_metadata_string + ) + class(output) <- "bcf" return(output) } diff --git a/R/cpp11.R b/R/cpp11.R index bf6345b9..bc411e89 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -612,6 +612,14 @@ json_add_double_cpp <- function(json_ptr, field_name, field_value) { invisible(.Call(`_stochtree_json_add_double_cpp`, json_ptr, field_name, field_value)) } +json_add_integer_subfolder_cpp <- function(json_ptr, subfolder_name, field_name, field_value) { + invisible(.Call(`_stochtree_json_add_integer_subfolder_cpp`, json_ptr, 
subfolder_name, field_name, field_value)) +} + +json_add_integer_cpp <- function(json_ptr, field_name, field_value) { + invisible(.Call(`_stochtree_json_add_integer_cpp`, json_ptr, field_name, field_value)) +} + json_add_bool_subfolder_cpp <- function(json_ptr, subfolder_name, field_name, field_value) { invisible(.Call(`_stochtree_json_add_bool_subfolder_cpp`, json_ptr, subfolder_name, field_name, field_value)) } @@ -628,6 +636,14 @@ json_add_vector_cpp <- function(json_ptr, field_name, field_vector) { invisible(.Call(`_stochtree_json_add_vector_cpp`, json_ptr, field_name, field_vector)) } +json_add_integer_vector_subfolder_cpp <- function(json_ptr, subfolder_name, field_name, field_vector) { + invisible(.Call(`_stochtree_json_add_integer_vector_subfolder_cpp`, json_ptr, subfolder_name, field_name, field_vector)) +} + +json_add_integer_vector_cpp <- function(json_ptr, field_name, field_vector) { + invisible(.Call(`_stochtree_json_add_integer_vector_cpp`, json_ptr, field_name, field_vector)) +} + json_add_string_vector_subfolder_cpp <- function(json_ptr, subfolder_name, field_name, field_vector) { invisible(.Call(`_stochtree_json_add_string_vector_subfolder_cpp`, json_ptr, subfolder_name, field_name, field_vector)) } @@ -660,6 +676,14 @@ json_extract_double_cpp <- function(json_ptr, field_name) { .Call(`_stochtree_json_extract_double_cpp`, json_ptr, field_name) } +json_extract_integer_subfolder_cpp <- function(json_ptr, subfolder_name, field_name) { + .Call(`_stochtree_json_extract_integer_subfolder_cpp`, json_ptr, subfolder_name, field_name) +} + +json_extract_integer_cpp <- function(json_ptr, field_name) { + .Call(`_stochtree_json_extract_integer_cpp`, json_ptr, field_name) +} + json_extract_bool_subfolder_cpp <- function(json_ptr, subfolder_name, field_name) { .Call(`_stochtree_json_extract_bool_subfolder_cpp`, json_ptr, subfolder_name, field_name) } @@ -684,6 +708,14 @@ json_extract_vector_cpp <- function(json_ptr, field_name) { 
.Call(`_stochtree_json_extract_vector_cpp`, json_ptr, field_name) } +json_extract_integer_vector_subfolder_cpp <- function(json_ptr, subfolder_name, field_name) { + .Call(`_stochtree_json_extract_integer_vector_subfolder_cpp`, json_ptr, subfolder_name, field_name) +} + +json_extract_integer_vector_cpp <- function(json_ptr, field_name) { + .Call(`_stochtree_json_extract_integer_vector_cpp`, json_ptr, field_name) +} + json_extract_string_vector_subfolder_cpp <- function(json_ptr, subfolder_name, field_name) { .Call(`_stochtree_json_extract_string_vector_subfolder_cpp`, json_ptr, subfolder_name, field_name) } diff --git a/R/serialization.R b/R/serialization.R index 24205f9e..bca7f23f 100644 --- a/R/serialization.R +++ b/R/serialization.R @@ -81,6 +81,20 @@ CppJson <- R6::R6Class( } }, + #' @description + #' Add a scalar to the json object under the name "field_name" (with optional subfolder "subfolder_name") + #' @param field_name The name of the field to be added to json + #' @param field_value Integer value of the field to be added to json + #' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which to place the value + #' @return NULL + add_integer = function(field_name, field_value, subfolder_name = NULL) { + if (is.null(subfolder_name)) { + json_add_integer_cpp(self$json_ptr, field_name, field_value) + } else { + json_add_integer_subfolder_cpp(self$json_ptr, subfolder_name, field_name, field_value) + } + }, + #' @description #' Add a boolean value to the json object under the name "field_name" (with optional subfolder "subfolder_name") #' @param field_name The name of the field to be added to json @@ -110,7 +124,7 @@ CppJson <- R6::R6Class( }, #' @description - #' Add an array to the json object under the name "field_name" (with optional subfolder "subfolder_name") + #' Add a vector to the json object under the name "field_name" (with optional subfolder "subfolder_name") #' @param field_name The name of the field to be added to json #' 
@param field_vector Vector to be stored in json #' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which to place the value @@ -124,6 +138,21 @@ CppJson <- R6::R6Class( } }, + #' @description + #' Add an integer vector to the json object under the name "field_name" (with optional subfolder "subfolder_name") + #' @param field_name The name of the field to be added to json + #' @param field_vector Vector to be stored in json + #' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which to place the value + #' @return NULL + add_integer_vector = function(field_name, field_vector, subfolder_name = NULL) { + field_vector <- as.integer(field_vector) + if (is.null(subfolder_name)) { + json_add_integer_vector_cpp(self$json_ptr, field_name, field_vector) + } else { + json_add_integer_vector_subfolder_cpp(self$json_ptr, subfolder_name, field_name, field_vector) + } + }, + #' @description #' Add an array to the json object under the name "field_name" (with optional subfolder "subfolder_name") #' @param field_name The name of the field to be added to json @@ -184,6 +213,22 @@ CppJson <- R6::R6Class( return(result) }, + #' @description + #' Retrieve an integer value from the json object under the name "field_name" (with optional subfolder "subfolder_name") + #' @param field_name The name of the field to be accessed from json + #' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which the field is stored + #' @return NULL + get_integer = function(field_name, subfolder_name = NULL) { + if (is.null(subfolder_name)) { + stopifnot(json_contains_field_cpp(self$json_ptr, field_name)) + result <- json_extract_integer_cpp(self$json_ptr, field_name) + } else { + stopifnot(json_contains_field_subfolder_cpp(self$json_ptr, subfolder_name, field_name)) + result <- json_extract_integer_subfolder_cpp(self$json_ptr, subfolder_name, field_name) + } + return(result) + }, + #' @description #' Retrieve a boolean value from the 
json object under the name "field_name" (with optional subfolder "subfolder_name") #' @param field_name The name of the field to be accessed from json @@ -232,6 +277,22 @@ CppJson <- R6::R6Class( return(result) }, + #' @description + #' Retrieve an integer vector from the json object under the name "field_name" (with optional subfolder "subfolder_name") + #' @param field_name The name of the field to be accessed from json + #' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which the field is stored + #' @return NULL + get_integer_vector = function(field_name, subfolder_name = NULL) { + if (is.null(subfolder_name)) { + stopifnot(json_contains_field_cpp(self$json_ptr, field_name)) + result <- json_extract_integer_vector_cpp(self$json_ptr, field_name) + } else { + stopifnot(json_contains_field_subfolder_cpp(self$json_ptr, subfolder_name, field_name)) + result <- json_extract_integer_vector_subfolder_cpp(self$json_ptr, subfolder_name, field_name) + } + return(result) + }, + #' @description #' Retrieve a character vector from the json object under the name "field_name" (with optional subfolder "subfolder_name") #' @param field_name The name of the field to be accessed from json diff --git a/R/utils.R b/R/utils.R index a1fc12a8..ea96794b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -359,6 +359,116 @@ preprocessPredictionDataFrame <- function(input_df, metadata) { return(X) } +#' Convert the persistent aspects of a covariate preprocessor to (in-memory) C++ JSON object +#' +#' @param object List containing information on variables, including train set +#' categories for categorical variables +#' +#' @return wrapper around in-memory C++ JSON object +#' @export +#' +#' @examples +#' cov_mat <- matrix(1:12, ncol = 3) +#' preprocess_list <- preprocessTrainData(cov_mat) +#' preprocessor_json <- convertPreprocessorToJson(preprocess_list$metadata) +convertPreprocessorToJson <- function(object) { + jsonobj <- createCppJson() + if 
(is.null(object$feature_types)) { + stop("This covariate preprocessor has not yet been fit") + } + + # Add internal scalars + jsonobj$add_integer("num_numeric_vars", object$num_numeric_vars) + jsonobj$add_integer("num_ordered_cat_vars", object$num_ordered_cat_vars) + jsonobj$add_integer("num_unordered_cat_vars", object$num_unordered_cat_vars) + + # Add internal vectors + jsonobj$add_vector("feature_types", object$feature_types) + jsonobj$add_vector("original_var_indices", object$original_var_indices) + if (object$num_numeric_vars > 0) { + jsonobj$add_string_vector("numeric_vars", object$numeric_vars) + } + if (object$num_ordered_cat_vars > 0) { + jsonobj$add_string_vector("ordered_cat_vars", object$ordered_cat_vars) + jsonobj$add_string_vector("ordered_unique_levels", object$ordered_unique_levels) + } + if (object$num_unordered_cat_vars > 0) { + jsonobj$add_string_vector("unordered_cat_vars", object$unordered_cat_vars) + jsonobj$add_string_vector("unordered_unique_levels", object$unordered_unique_levels) + } + + return(jsonobj) +} + +#' Convert the persistent aspects of a covariate preprocessor to (in-memory) JSON string +#' +#' @param object List containing information on variables, including train set +#' categories for categorical variables +#' +#' @return in-memory JSON string +#' @export +#' +#' @examples +#' cov_mat <- matrix(1:12, ncol = 3) +#' preprocess_list <- preprocessTrainData(cov_mat) +#' preprocessor_json_string <- savePreprocessorToJsonString(preprocess_list$metadata) +savePreprocessorToJsonString <- function(object){ + # Convert to Json + jsonobj <- convertPreprocessorToJson(object) + + # Dump to string + return(jsonobj$return_json_string()) +} + +#' Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor +#' +#' @param json_object in-memory wrapper around JSON C++ object containing covariate preprocessor metadata +#' +#' @returns Preprocessor object that can be used with the `preprocessPredictionData` 
function +#' @export +createPreprocessorFromJson <- function(json_object){ + # Initialize the metadata list + metadata <- list() + + # Unpack internal scalars + metadata[["num_numeric_vars"]] <- json_object$get_integer("num_numeric_vars") + metadata[["num_ordered_cat_vars"]] <- json_object$get_integer("num_ordered_cat_vars") + metadata[["num_unordered_cat_vars"]] <- json_object$get_integer("num_unordered_cat_vars") + + # Unpack internal vectors + metadata[["feature_types"]] <- json_object$get_vector("feature_types") + metadata[["original_var_indices"]] <- json_object$get_vector("original_var_indices") + if (metadata$num_numeric_vars > 0) { + metadata[["numeric_vars"]] <- json_object$get_string_vector("numeric_vars") + } + if (metadata$num_ordered_cat_vars > 0) { + metadata[["ordered_cat_vars"]] <- json_object$get_string_vector("ordered_cat_vars") + metadata[["ordered_unique_levels"]] <- json_object$get_string_vector("ordered_unique_levels") + } + if (metadata$num_unordered_cat_vars > 0) { + metadata[["unordered_cat_vars"]] <- json_object$get_string_vector("unordered_cat_vars") + metadata[["unordered_unique_levels"]] <- json_object$get_string_vector("unordered_unique_levels") + } + + return(metadata) +} + +#' Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor +#' +#' @param json_string in-memory JSON string containing covariate preprocessor metadata +#' +#' @return Preprocessor object that can be used with the `preprocessPredictionData` function +#' @export +createPreprocessorFromJsonString <- function(json_string){ + # Load a `CppJson` object from string + preprocessor_json <- createCppJsonString(json_string) + + # Create and return the preprocessor object + preprocessor_object <- createPreprocessorFromJson(preprocessor_json) + + return(preprocessor_object) +} + #' Preprocess a dataframe of covariate values, converting categorical variables #' to integers and one-hot encoding if need be. 
Returns a list including a #' matrix of preprocessed covariate values and associated tracking. diff --git a/man/CppJson.Rd b/man/CppJson.Rd index a7f7e448..65f1edec 100644 --- a/man/CppJson.Rd +++ b/man/CppJson.Rd @@ -32,16 +32,20 @@ Wrapper around a C++ container of tree ensembles \item \href{#method-CppJson-add_forest}{\code{CppJson$add_forest()}} \item \href{#method-CppJson-add_random_effects}{\code{CppJson$add_random_effects()}} \item \href{#method-CppJson-add_scalar}{\code{CppJson$add_scalar()}} +\item \href{#method-CppJson-add_integer}{\code{CppJson$add_integer()}} \item \href{#method-CppJson-add_boolean}{\code{CppJson$add_boolean()}} \item \href{#method-CppJson-add_string}{\code{CppJson$add_string()}} \item \href{#method-CppJson-add_vector}{\code{CppJson$add_vector()}} +\item \href{#method-CppJson-add_integer_vector}{\code{CppJson$add_integer_vector()}} \item \href{#method-CppJson-add_string_vector}{\code{CppJson$add_string_vector()}} \item \href{#method-CppJson-add_list}{\code{CppJson$add_list()}} \item \href{#method-CppJson-add_string_list}{\code{CppJson$add_string_list()}} \item \href{#method-CppJson-get_scalar}{\code{CppJson$get_scalar()}} +\item \href{#method-CppJson-get_integer}{\code{CppJson$get_integer()}} \item \href{#method-CppJson-get_boolean}{\code{CppJson$get_boolean()}} \item \href{#method-CppJson-get_string}{\code{CppJson$get_string()}} \item \href{#method-CppJson-get_vector}{\code{CppJson$get_vector()}} +\item \href{#method-CppJson-get_integer_vector}{\code{CppJson$get_integer_vector()}} \item \href{#method-CppJson-get_string_vector}{\code{CppJson$get_string_vector()}} \item \href{#method-CppJson-get_numeric_list}{\code{CppJson$get_numeric_list()}} \item \href{#method-CppJson-get_string_list}{\code{CppJson$get_string_list()}} @@ -120,6 +124,30 @@ Add a scalar to the json object under the name "field_name" (with optional subfo \item{\code{field_value}}{Numeric value of the field to be added to json} +\item{\code{subfolder_name}}{(Optional) 
Name of the subfolder / hierarchy under which to place the value} +} +\if{html}{\out{</div>}} +} +\subsection{Returns}{ +NULL +} +} +\if{html}{\out{<hr>
}} +\if{html}{\out{<a id="method-CppJson-add_integer"></a>}} +\if{latex}{\out{\hypertarget{method-CppJson-add_integer}{}}} +\subsection{Method \code{add_integer()}}{ +Add a scalar to the json object under the name "field_name" (with optional subfolder "subfolder_name") +\subsection{Usage}{ +\if{html}{\out{<div class="r">
}}\preformatted{CppJson$add_integer(field_name, field_value, subfolder_name = NULL)}\if{html}{\out{</div>
}} +} + +\subsection{Arguments}{ +\if{html}{\out{<div class="arguments">
}} +\describe{ +\item{\code{field_name}}{The name of the field to be added to json} + +\item{\code{field_value}}{Integer value of the field to be added to json} + \item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which to place the value} } \if{html}{\out{</div>
}} @@ -180,7 +208,7 @@ NULL \if{html}{\out{<a id="method-CppJson-add_vector"></a>}} \if{latex}{\out{\hypertarget{method-CppJson-add_vector}{}}} \subsection{Method \code{add_vector()}}{ -Add an array to the json object under the name "field_name" (with optional subfolder "subfolder_name") +Add a vector to the json object under the name "field_name" (with optional subfolder "subfolder_name") \subsection{Usage}{ \if{html}{\out{<div class="r">
}}\preformatted{CppJson$add_vector(field_name, field_vector, subfolder_name = NULL)}\if{html}{\out{</div>
}} } @@ -192,6 +220,30 @@ Add an array to the json object under the name "field_name" (with optional subfo \item{\code{field_vector}}{Vector to be stored in json} +\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which to place the value} +} +\if{html}{\out{</div>}} +} +\subsection{Returns}{ +NULL +} +} +\if{html}{\out{<hr>
}} +\if{html}{\out{<a id="method-CppJson-add_integer_vector"></a>}} +\if{latex}{\out{\hypertarget{method-CppJson-add_integer_vector}{}}} +\subsection{Method \code{add_integer_vector()}}{ +Add an integer vector to the json object under the name "field_name" (with optional subfolder "subfolder_name") +\subsection{Usage}{ +\if{html}{\out{<div class="r">
}}\preformatted{CppJson$add_integer_vector(field_name, field_vector, subfolder_name = NULL)}\if{html}{\out{</div>
}} +} + +\subsection{Arguments}{ +\if{html}{\out{<div class="arguments">
}} +\describe{ +\item{\code{field_name}}{The name of the field to be added to json} + +\item{\code{field_vector}}{Vector to be stored in json} + \item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which to place the value} } \if{html}{\out{</div>
}} @@ -282,6 +334,28 @@ Retrieve a scalar value from the json object under the name "field_name" (with o \describe{ \item{\code{field_name}}{The name of the field to be accessed from json} +\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which the field is stored} +} +\if{html}{\out{</div>}} +} +\subsection{Returns}{ +NULL +} +} +\if{html}{\out{<hr>
}} +\if{html}{\out{<a id="method-CppJson-get_integer"></a>}} +\if{latex}{\out{\hypertarget{method-CppJson-get_integer}{}}} +\subsection{Method \code{get_integer()}}{ +Retrieve an integer value from the json object under the name "field_name" (with optional subfolder "subfolder_name") +\subsection{Usage}{ +\if{html}{\out{<div class="r">
}}\preformatted{CppJson$get_integer(field_name, subfolder_name = NULL)}\if{html}{\out{</div>
}} +} + +\subsection{Arguments}{ +\if{html}{\out{<div class="arguments">
}} +\describe{ +\item{\code{field_name}}{The name of the field to be accessed from json} + \item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which the field is stored} } \if{html}{\out{</div>
}} @@ -348,6 +422,28 @@ Retrieve a vector from the json object under the name "field_name" (with optiona \describe{ \item{\code{field_name}}{The name of the field to be accessed from json} +\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which the field is stored} +} +\if{html}{\out{</div>}} +} +\subsection{Returns}{ +NULL +} +} +\if{html}{\out{<hr>
}} +\if{html}{\out{<a id="method-CppJson-get_integer_vector"></a>}} +\if{latex}{\out{\hypertarget{method-CppJson-get_integer_vector}{}}} +\subsection{Method \code{get_integer_vector()}}{ +Retrieve an integer vector from the json object under the name "field_name" (with optional subfolder "subfolder_name") +\subsection{Usage}{ +\if{html}{\out{<div class="r">
}}\preformatted{CppJson$get_integer_vector(field_name, subfolder_name = NULL)}\if{html}{\out{</div>
}} +} + +\subsection{Arguments}{ +\if{html}{\out{<div class="arguments">
}} +\describe{ +\item{\code{field_name}}{The name of the field to be accessed from json} + \item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which the field is stored} } \if{html}{\out{</div>
}} diff --git a/man/convertPreprocessorToJson.Rd b/man/convertPreprocessorToJson.Rd new file mode 100644 index 00000000..49716050 --- /dev/null +++ b/man/convertPreprocessorToJson.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{convertPreprocessorToJson} +\alias{convertPreprocessorToJson} +\title{Convert the persistent aspects of a covariate preprocessor to (in-memory) C++ JSON object} +\usage{ +convertPreprocessorToJson(object) +} +\arguments{ +\item{object}{List containing information on variables, including train set +categories for categorical variables} +} +\value{ +wrapper around in-memory C++ JSON object +} +\description{ +Convert the persistent aspects of a covariate preprocessor to (in-memory) C++ JSON object +} +\examples{ +cov_mat <- matrix(1:12, ncol = 3) +preprocess_list <- preprocessTrainData(cov_mat) +preprocessor_json <- convertPreprocessorToJson(preprocess_list$metadata) +} diff --git a/man/createPreprocessorFromJson.Rd b/man/createPreprocessorFromJson.Rd new file mode 100644 index 00000000..3edca354 --- /dev/null +++ b/man/createPreprocessorFromJson.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{createPreprocessorFromJson} +\alias{createPreprocessorFromJson} +\title{Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor} +\usage{ +createPreprocessorFromJson(json_object) +} +\arguments{ +\item{json_object}{in-memory wrapper around JSON C++ object containing covariate preprocessor metadata} +} +\value{ +Preprocessor object that can be used with the \code{preprocessPredictionData} function +} +\description{ +Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor +} diff --git a/man/createPreprocessorFromJsonString.Rd b/man/createPreprocessorFromJsonString.Rd new file mode 100644 index 00000000..00974b83 --- /dev/null +++ 
b/man/createPreprocessorFromJsonString.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{createPreprocessorFromJsonString} +\alias{createPreprocessorFromJsonString} +\title{Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor} +\usage{ +createPreprocessorFromJsonString(json_string) +} +\arguments{ +\item{json_string}{in-memory JSON string containing covariate preprocessor metadata} +} +\value{ +Preprocessor object that can be used with the \code{preprocessPredictionData} function +} +\description{ +Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor +} diff --git a/man/saveBCFModelToJsonFile.Rd b/man/saveBCFModelToJsonFile.Rd index f7685c48..5a0f1512 100644 --- a/man/saveBCFModelToJsonFile.Rd +++ b/man/saveBCFModelToJsonFile.Rd @@ -11,6 +11,9 @@ saveBCFModelToJsonFile(object, filename) \item{filename}{String of filepath, must end in ".json"} } +\value{ +in-memory JSON string +} \description{ Convert the persistent aspects of a BCF model to (in-memory) JSON and save to a file } diff --git a/man/savePreprocessorToJsonString.Rd b/man/savePreprocessorToJsonString.Rd new file mode 100644 index 00000000..83c54d72 --- /dev/null +++ b/man/savePreprocessorToJsonString.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{savePreprocessorToJsonString} +\alias{savePreprocessorToJsonString} +\title{Convert the persistent aspects of a covariate preprocessor to (in-memory) JSON string} +\usage{ +savePreprocessorToJsonString(object) +} +\arguments{ +\item{object}{List containing information on variables, including train set +categories for categorical variables} +} +\value{ +in-memory JSON string +} +\description{ +Convert the persistent aspects of a covariate preprocessor to (in-memory) JSON string +} +\examples{ +cov_mat <- matrix(1:12, ncol = 3) +preprocess_list <- 
preprocessTrainData(cov_mat) +preprocessor_json_string <- savePreprocessorToJsonString(preprocess_list$metadata) +} diff --git a/src/cpp11.cpp b/src/cpp11.cpp index 0091dffd..d9c352c3 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -1134,6 +1134,22 @@ extern "C" SEXP _stochtree_json_add_double_cpp(SEXP json_ptr, SEXP field_name, S END_CPP11 } // serialization.cpp +void json_add_integer_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name, int field_value); +extern "C" SEXP _stochtree_json_add_integer_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name, SEXP field_value) { + BEGIN_CPP11 + json_add_integer_subfolder_cpp(cpp11::as_cpp>>(json_ptr), cpp11::as_cpp>(subfolder_name), cpp11::as_cpp>(field_name), cpp11::as_cpp>(field_value)); + return R_NilValue; + END_CPP11 +} +// serialization.cpp +void json_add_integer_cpp(cpp11::external_pointer json_ptr, std::string field_name, int field_value); +extern "C" SEXP _stochtree_json_add_integer_cpp(SEXP json_ptr, SEXP field_name, SEXP field_value) { + BEGIN_CPP11 + json_add_integer_cpp(cpp11::as_cpp>>(json_ptr), cpp11::as_cpp>(field_name), cpp11::as_cpp>(field_value)); + return R_NilValue; + END_CPP11 +} +// serialization.cpp void json_add_bool_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name, bool field_value); extern "C" SEXP _stochtree_json_add_bool_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name, SEXP field_value) { BEGIN_CPP11 @@ -1166,6 +1182,22 @@ extern "C" SEXP _stochtree_json_add_vector_cpp(SEXP json_ptr, SEXP field_name, S END_CPP11 } // serialization.cpp +void json_add_integer_vector_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name, cpp11::integers field_vector); +extern "C" SEXP _stochtree_json_add_integer_vector_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name, SEXP field_vector) { + BEGIN_CPP11 + 
json_add_integer_vector_subfolder_cpp(cpp11::as_cpp>>(json_ptr), cpp11::as_cpp>(subfolder_name), cpp11::as_cpp>(field_name), cpp11::as_cpp>(field_vector)); + return R_NilValue; + END_CPP11 +} +// serialization.cpp +void json_add_integer_vector_cpp(cpp11::external_pointer json_ptr, std::string field_name, cpp11::integers field_vector); +extern "C" SEXP _stochtree_json_add_integer_vector_cpp(SEXP json_ptr, SEXP field_name, SEXP field_vector) { + BEGIN_CPP11 + json_add_integer_vector_cpp(cpp11::as_cpp>>(json_ptr), cpp11::as_cpp>(field_name), cpp11::as_cpp>(field_vector)); + return R_NilValue; + END_CPP11 +} +// serialization.cpp void json_add_string_vector_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name, cpp11::strings field_vector); extern "C" SEXP _stochtree_json_add_string_vector_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name, SEXP field_vector) { BEGIN_CPP11 @@ -1226,6 +1258,20 @@ extern "C" SEXP _stochtree_json_extract_double_cpp(SEXP json_ptr, SEXP field_nam END_CPP11 } // serialization.cpp +int json_extract_integer_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name); +extern "C" SEXP _stochtree_json_extract_integer_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name) { + BEGIN_CPP11 + return cpp11::as_sexp(json_extract_integer_subfolder_cpp(cpp11::as_cpp>>(json_ptr), cpp11::as_cpp>(subfolder_name), cpp11::as_cpp>(field_name))); + END_CPP11 +} +// serialization.cpp +int json_extract_integer_cpp(cpp11::external_pointer json_ptr, std::string field_name); +extern "C" SEXP _stochtree_json_extract_integer_cpp(SEXP json_ptr, SEXP field_name) { + BEGIN_CPP11 + return cpp11::as_sexp(json_extract_integer_cpp(cpp11::as_cpp>>(json_ptr), cpp11::as_cpp>(field_name))); + END_CPP11 +} +// serialization.cpp bool json_extract_bool_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name); extern "C" SEXP 
_stochtree_json_extract_bool_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name) { BEGIN_CPP11 @@ -1268,6 +1314,20 @@ extern "C" SEXP _stochtree_json_extract_vector_cpp(SEXP json_ptr, SEXP field_nam END_CPP11 } // serialization.cpp +cpp11::writable::integers json_extract_integer_vector_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name); +extern "C" SEXP _stochtree_json_extract_integer_vector_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name) { + BEGIN_CPP11 + return cpp11::as_sexp(json_extract_integer_vector_subfolder_cpp(cpp11::as_cpp>>(json_ptr), cpp11::as_cpp>(subfolder_name), cpp11::as_cpp>(field_name))); + END_CPP11 +} +// serialization.cpp +cpp11::writable::integers json_extract_integer_vector_cpp(cpp11::external_pointer json_ptr, std::string field_name); +extern "C" SEXP _stochtree_json_extract_integer_vector_cpp(SEXP json_ptr, SEXP field_name) { + BEGIN_CPP11 + return cpp11::as_sexp(json_extract_integer_vector_cpp(cpp11::as_cpp>>(json_ptr), cpp11::as_cpp>(field_name))); + END_CPP11 +} +// serialization.cpp cpp11::writable::strings json_extract_string_vector_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name); extern "C" SEXP _stochtree_json_extract_string_vector_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name) { BEGIN_CPP11 @@ -1415,6 +1475,10 @@ static const R_CallMethodDef CallEntries[] = { {"_stochtree_json_add_double_cpp", (DL_FUNC) &_stochtree_json_add_double_cpp, 3}, {"_stochtree_json_add_double_subfolder_cpp", (DL_FUNC) &_stochtree_json_add_double_subfolder_cpp, 4}, {"_stochtree_json_add_forest_cpp", (DL_FUNC) &_stochtree_json_add_forest_cpp, 2}, + {"_stochtree_json_add_integer_cpp", (DL_FUNC) &_stochtree_json_add_integer_cpp, 3}, + {"_stochtree_json_add_integer_subfolder_cpp", (DL_FUNC) &_stochtree_json_add_integer_subfolder_cpp, 4}, + {"_stochtree_json_add_integer_vector_cpp", (DL_FUNC) 
&_stochtree_json_add_integer_vector_cpp, 3}, + {"_stochtree_json_add_integer_vector_subfolder_cpp", (DL_FUNC) &_stochtree_json_add_integer_vector_subfolder_cpp, 4}, {"_stochtree_json_add_rfx_container_cpp", (DL_FUNC) &_stochtree_json_add_rfx_container_cpp, 2}, {"_stochtree_json_add_rfx_groupids_cpp", (DL_FUNC) &_stochtree_json_add_rfx_groupids_cpp, 2}, {"_stochtree_json_add_rfx_label_mapper_cpp", (DL_FUNC) &_stochtree_json_add_rfx_label_mapper_cpp, 2}, @@ -1430,6 +1494,10 @@ static const R_CallMethodDef CallEntries[] = { {"_stochtree_json_extract_bool_subfolder_cpp", (DL_FUNC) &_stochtree_json_extract_bool_subfolder_cpp, 3}, {"_stochtree_json_extract_double_cpp", (DL_FUNC) &_stochtree_json_extract_double_cpp, 2}, {"_stochtree_json_extract_double_subfolder_cpp", (DL_FUNC) &_stochtree_json_extract_double_subfolder_cpp, 3}, + {"_stochtree_json_extract_integer_cpp", (DL_FUNC) &_stochtree_json_extract_integer_cpp, 2}, + {"_stochtree_json_extract_integer_subfolder_cpp", (DL_FUNC) &_stochtree_json_extract_integer_subfolder_cpp, 3}, + {"_stochtree_json_extract_integer_vector_cpp", (DL_FUNC) &_stochtree_json_extract_integer_vector_cpp, 2}, + {"_stochtree_json_extract_integer_vector_subfolder_cpp", (DL_FUNC) &_stochtree_json_extract_integer_vector_subfolder_cpp, 3}, {"_stochtree_json_extract_string_cpp", (DL_FUNC) &_stochtree_json_extract_string_cpp, 2}, {"_stochtree_json_extract_string_subfolder_cpp", (DL_FUNC) &_stochtree_json_extract_string_subfolder_cpp, 3}, {"_stochtree_json_extract_string_vector_cpp", (DL_FUNC) &_stochtree_json_extract_string_vector_cpp, 2}, diff --git a/src/serialization.cpp b/src/serialization.cpp index 3593f1a5..749395e8 100644 --- a/src/serialization.cpp +++ b/src/serialization.cpp @@ -48,6 +48,29 @@ void json_add_double_cpp(cpp11::external_pointer json_ptr, std:: } } +[[cpp11::register]] +void json_add_integer_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name, int field_value) { + if 
(json_ptr->contains(subfolder_name)) { + if (json_ptr->at(subfolder_name).contains(field_name)) { + json_ptr->at(subfolder_name).at(field_name) = field_value; + } else { + json_ptr->at(subfolder_name).emplace(std::pair(field_name, field_value)); + } + } else { + json_ptr->emplace(std::pair(subfolder_name, nlohmann::json::object())); + json_ptr->at(subfolder_name).emplace(std::pair(field_name, field_value)); + } +} + +[[cpp11::register]] +void json_add_integer_cpp(cpp11::external_pointer json_ptr, std::string field_name, int field_value) { + if (json_ptr->contains(field_name)) { + json_ptr->at(field_name) = field_value; + } else { + json_ptr->emplace(std::pair(field_name, field_value)); + } +} + [[cpp11::register]] void json_add_bool_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name, bool field_value) { if (json_ptr->contains(subfolder_name)) { @@ -111,6 +134,46 @@ void json_add_vector_cpp(cpp11::external_pointer json_ptr, std:: } } +[[cpp11::register]] +void json_add_integer_vector_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name, cpp11::integers field_vector) { + int vec_length = field_vector.size(); + if (json_ptr->contains(subfolder_name)) { + if (json_ptr->at(subfolder_name).contains(field_name)) { + json_ptr->at(subfolder_name).at(field_name).clear(); + for (int i = 0; i < vec_length; i++) { + json_ptr->at(subfolder_name).at(field_name).emplace_back(field_vector.at(i)); + } + } else { + json_ptr->at(subfolder_name).emplace(std::pair(field_name, nlohmann::json::array())); + for (int i = 0; i < vec_length; i++) { + json_ptr->at(subfolder_name).at(field_name).emplace_back(field_vector.at(i)); + } + } + } else { + json_ptr->emplace(std::pair(subfolder_name, nlohmann::json::object())); + json_ptr->at(subfolder_name).emplace(std::pair(field_name, nlohmann::json::array())); + for (int i = 0; i < vec_length; i++) { + 
json_ptr->at(subfolder_name).at(field_name).emplace_back(field_vector.at(i)); + } + } +} + +[[cpp11::register]] +void json_add_integer_vector_cpp(cpp11::external_pointer json_ptr, std::string field_name, cpp11::integers field_vector) { + int vec_length = field_vector.size(); + if (json_ptr->contains(field_name)) { + json_ptr->at(field_name).clear(); + for (int i = 0; i < vec_length; i++) { + json_ptr->at(field_name).emplace_back(field_vector.at(i)); + } + } else { + json_ptr->emplace(std::pair(field_name, nlohmann::json::array())); + for (int i = 0; i < vec_length; i++) { + json_ptr->at(field_name).emplace_back(field_vector.at(i)); + } + } +} + [[cpp11::register]] void json_add_string_vector_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name, cpp11::strings field_vector) { int vec_length = field_vector.size(); @@ -206,6 +269,16 @@ double json_extract_double_cpp(cpp11::external_pointer json_ptr, return json_ptr->at(field_name); } +[[cpp11::register]] +int json_extract_integer_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name) { + return json_ptr->at(subfolder_name).at(field_name); +} + +[[cpp11::register]] +int json_extract_integer_cpp(cpp11::external_pointer json_ptr, std::string field_name) { + return json_ptr->at(field_name); +} + [[cpp11::register]] bool json_extract_bool_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name) { return json_ptr->at(subfolder_name).at(field_name); @@ -242,6 +315,22 @@ cpp11::writable::doubles json_extract_vector_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name) { + cpp11::writable::integers output; + int vec_length = json_ptr->at(subfolder_name).at(field_name).size(); + for (int i = 0; i < vec_length; i++) output.push_back((json_ptr->at(subfolder_name).at(field_name).at(i))); + return output; +} + +[[cpp11::register]] +cpp11::writable::integers 
json_extract_integer_vector_cpp(cpp11::external_pointer json_ptr, std::string field_name) { + cpp11::writable::integers output; + int vec_length = json_ptr->at(field_name).size(); + for (int i = 0; i < vec_length; i++) output.push_back((json_ptr->at(field_name).at(i))); + return output; +} + [[cpp11::register]] cpp11::writable::strings json_extract_string_vector_subfolder_cpp(cpp11::external_pointer json_ptr, std::string subfolder_name, std::string field_name) { int vec_length = json_ptr->at(subfolder_name).at(field_name).size();