From 53dab88d9c7218d05651f9f073310f32001dad76 Mon Sep 17 00:00:00 2001
From: Drew Herren <drewherrenopensource@gmail.com>
Date: Mon, 20 Jan 2025 14:08:57 -0600
Subject: [PATCH 1/5] Added test for python edge case (unseen categories in
 unordered categorical)

---
 test/python/test_preprocessor.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
diff --git a/test/python/test_preprocessor.py b/test/python/test_preprocessor.py
index acc41593..4282a20e 100644
--- a/test/python/test_preprocessor.py
+++ b/test/python/test_preprocessor.py
@@ -71,3 +71,22 @@ def test_pandas(self):
         df_3_transformed = cov_transformer.fit_transform(df_3)
         np.testing.assert_array_equal(np_3, df_3_transformed)
         assert cov_transformer._processed_feature_types == [0,1,1,1,0]
+
+        df_4 = pd.DataFrame(
+            {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1, 7.6],
+             "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c', 'd'], ordered=False, categories=['c', 'b', 'a']),
+             "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4, 3.4]}
+        )
+        np_4 = np.array(
+            [[1.5, 0, 0, 1, 0, 1.2],
+             [2.7, 0, 1, 0, 0, 5.4],
+             [3.6, 1, 0, 0, 0, 9.3],
+             [4.4, 0, 0, 1, 0, 10.4],
+             [5.3, 0, 1, 0, 0, 3.6],
+             [6.1, 1, 0, 0, 1, 4.4]]
+        )
+        cov_transformer = CovariateTransformer()
+        with np.testing.assert_raises(ValueError):
+            df_4_transformed = cov_transformer.fit_transform(df_4)
+            # np.testing.assert_array_equal(np_4, df_4_transformed)
+            # assert cov_transformer._processed_feature_types == [0,1,1,1,1,0]

From 3c3570cf16e6b11fd36f9a71ff08cafb6faa398b Mon Sep 17 00:00:00 2001
From: Drew Herren <drewherrenopensource@gmail.com>
Date: Wed, 22 Jan 2025 03:30:56 -0600
Subject: [PATCH 2/5] Refactored and added serialization for python covariate
 preprocessor

---
 src/py_stochtree.cpp             |  99 ++++++++++++
 stochtree/__init__.py            |   4 +-
 stochtree/bart.py                |   4 +-
 stochtree/bcf.py                 |   6 +-
 stochtree/preprocessing.py       | 267 ++++++++++++++++++++++++-------
 stochtree/serialization.py       |  87 +++++++++-
 test/python/test_calibration.py  |   1 -
 test/python/test_json.py         |  25 ++-
 test/python/test_preprocessor.py |  33 ++--
 9 files changed, 441 insertions(+), 85 deletions(-)

diff --git a/src/py_stochtree.cpp b/src/py_stochtree.cpp
index f8cf32b7..ee25e586 100644
--- a/src/py_stochtree.cpp
+++ b/src/py_stochtree.cpp
@@ -1243,6 +1243,27 @@ class JsonCpp {
     }
   }
 
+  void AddInteger(std::string field_name, int field_value) {
+    if (json_->contains(field_name)) {
+      json_->at(field_name) = field_value;
+    } else {
+      json_->emplace(std::pair(field_name, field_value));
+    }
+  }
+
+  void AddIntegerSubfolder(std::string subfolder_name, std::string field_name, int field_value) {
+    if (json_->contains(subfolder_name)) {
+      if (json_->at(subfolder_name).contains(field_name)) {
+        json_->at(subfolder_name).at(field_name) = field_value;
+      } else {
+        json_->at(subfolder_name).emplace(std::pair(field_name, field_value));
+      }
+    } else {
+      json_->emplace(std::pair(subfolder_name, nlohmann::json::object()));
+      json_->at(subfolder_name).emplace(std::pair(field_name, field_value));
+    }
+  }
+
   void AddBool(std::string field_name, bool field_value) {
     if (json_->contains(field_name)) {
       json_->at(field_name) = field_value;
@@ -1325,6 +1346,46 @@ class JsonCpp {
     }
   }
 
+  void AddIntegerVector(std::string field_name, py::array_t<int> field_vector) {
+    int vec_length = field_vector.size();
+    auto accessor = field_vector.mutable_unchecked<1>();
+    if (json_->contains(field_name)) {
+      json_->at(field_name).clear();
+      for (int i = 0; i < vec_length; i++) {
+        json_->at(field_name).emplace_back(accessor(i));
+      }
+    } else {
+      json_->emplace(std::pair(field_name, nlohmann::json::array()));
+      for (int i = 0; i < vec_length; i++) {
+        json_->at(field_name).emplace_back(accessor(i));
+      }
+    }
+  }
+
+  void AddIntegerVectorSubfolder(std::string subfolder_name, std::string field_name, py::array_t<int> field_vector) {
+    int vec_length = field_vector.size();
+    auto accessor = field_vector.mutable_unchecked<1>();
+    if (json_->contains(subfolder_name)) {
+      if (json_->at(subfolder_name).contains(field_name)) {
+        json_->at(subfolder_name).at(field_name).clear();
+        for (int i = 0; i < vec_length; i++) {
+          json_->at(subfolder_name).at(field_name).emplace_back(accessor(i));
+        }
+      } else {
+        json_->at(subfolder_name).emplace(std::pair(field_name, nlohmann::json::array()));
+        for (int i = 0; i < vec_length; i++) {
+          json_->at(subfolder_name).at(field_name).emplace_back(accessor(i));
+        }
+      }
+    } else {
+      json_->emplace(std::pair(subfolder_name, nlohmann::json::object()));
+      json_->at(subfolder_name).emplace(std::pair(field_name, nlohmann::json::array()));
+      for (int i = 0; i < vec_length; i++) {
+        json_->at(subfolder_name).at(field_name).emplace_back(accessor(i));
+      }
+    }
+  }
+
   void AddStringVector(std::string field_name, std::vector<std::string>& field_vector) {
     int vec_length = field_vector.size();
     if (json_->contains(field_name)) {
@@ -1391,6 +1452,14 @@ class JsonCpp {
     return json_->at(subfolder_name).at(field_name);
   }
 
+  int ExtractInteger(std::string field_name) {
+    return json_->at(field_name);
+  }
+
+  int ExtractIntegerSubfolder(std::string subfolder_name, std::string field_name) {
+    return json_->at(subfolder_name).at(field_name);
+  }
+
   bool ExtractBool(std::string field_name) {
     return json_->at(field_name);
   }
@@ -1429,6 +1498,28 @@ class JsonCpp {
     return result;
   }
 
+  py::array_t<int> ExtractIntegerVector(std::string field_name) {
+    auto json_vec = json_->at(field_name);
+    py::ssize_t json_vec_length = json_->at(field_name).size();
+    auto result = py::array_t<int>(py::detail::any_container<py::ssize_t>({json_vec_length}));
+    auto accessor = result.mutable_unchecked<1>();
+    for (size_t i = 0; i < json_vec_length; i++) {
+      accessor(i) = json_vec.at(i);
+    }
+    return result;
+  }
+
+  py::array_t<int> ExtractIntegerVectorSubfolder(std::string subfolder_name, std::string field_name) {
+    auto json_vec = json_->at(subfolder_name).at(field_name);
+    py::ssize_t json_vec_length = json_->at(subfolder_name).at(field_name).size();
+    auto result = py::array_t<int>(py::detail::any_container<py::ssize_t>({json_vec_length}));
+    auto accessor = result.mutable_unchecked<1>();
+    for (size_t i = 0; i < json_vec_length; i++) {
+      accessor(i) = json_vec.at(i);
+    }
+    return result;
+  }
+
   std::vector<std::string> ExtractStringVector(std::string field_name) {
     auto json_vec = json_->at(field_name);
     py::ssize_t json_vec_length = json_->at(field_name).size();
@@ -1472,12 +1563,16 @@ PYBIND11_MODULE(stochtree_cpp, m) {
     .def("DumpJson", &JsonCpp::DumpJson)
     .def("AddDouble", &JsonCpp::AddDouble)
     .def("AddDoubleSubfolder", &JsonCpp::AddDoubleSubfolder)
+    .def("AddInteger", &JsonCpp::AddInteger)
+    .def("AddIntegerSubfolder", &JsonCpp::AddIntegerSubfolder)
     .def("AddBool", &JsonCpp::AddBool)
     .def("AddBoolSubfolder", &JsonCpp::AddBoolSubfolder)
     .def("AddString", &JsonCpp::AddString)
     .def("AddStringSubfolder", &JsonCpp::AddStringSubfolder)
     .def("AddDoubleVector", &JsonCpp::AddDoubleVector)
     .def("AddDoubleVectorSubfolder", &JsonCpp::AddDoubleVectorSubfolder)
+    .def("AddIntegerVector", &JsonCpp::AddIntegerVector)
+    .def("AddIntegerVectorSubfolder", &JsonCpp::AddIntegerVectorSubfolder)
     .def("AddStringVector", &JsonCpp::AddStringVector)
     .def("AddStringVectorSubfolder", &JsonCpp::AddStringVectorSubfolder)
     .def("AddForest", &JsonCpp::AddForest)
@@ -1485,12 +1580,16 @@ PYBIND11_MODULE(stochtree_cpp, m) {
     .def("ContainsFieldSubfolder", &JsonCpp::ContainsFieldSubfolder)
     .def("ExtractDouble", &JsonCpp::ExtractDouble)
     .def("ExtractDoubleSubfolder", &JsonCpp::ExtractDoubleSubfolder)
+    .def("ExtractInteger", &JsonCpp::ExtractInteger)
+    .def("ExtractIntegerSubfolder", &JsonCpp::ExtractIntegerSubfolder)
     .def("ExtractBool", &JsonCpp::ExtractBool)
     .def("ExtractBoolSubfolder", &JsonCpp::ExtractBoolSubfolder)
     .def("ExtractString", &JsonCpp::ExtractString)
     .def("ExtractStringSubfolder", &JsonCpp::ExtractStringSubfolder)
     .def("ExtractDoubleVector", &JsonCpp::ExtractDoubleVector)
     .def("ExtractDoubleVectorSubfolder", &JsonCpp::ExtractDoubleVectorSubfolder)
+    .def("ExtractIntegerVector", &JsonCpp::ExtractIntegerVector)
+    .def("ExtractIntegerVectorSubfolder", &JsonCpp::ExtractIntegerVectorSubfolder)
     .def("ExtractStringVector", &JsonCpp::ExtractStringVector)
     .def("ExtractStringVectorSubfolder", &JsonCpp::ExtractStringVectorSubfolder)
     .def("SubsetJsonForest", &JsonCpp::SubsetJsonForest);
diff --git a/stochtree/__init__.py b/stochtree/__init__.py
index 95b49ae3..8e3cc643 100644
--- a/stochtree/__init__.py
+++ b/stochtree/__init__.py
@@ -3,7 +3,7 @@
 from .calibration import calibrate_global_error_variance
 from .data import Dataset, Residual
 from .forest import ForestContainer, Forest
-from .preprocessing import CovariateTransformer
+from .preprocessing import CovariatePreprocessor
 from .sampler import RNG, ForestSampler, GlobalVarianceModel, LeafVarianceModel
 from .serialization import JSONSerializer
 from .utils import NotSampledError
@@ -15,7 +15,7 @@
   'Residual', 
   'ForestContainer', 
   'Forest', 
-  'CovariateTransformer', 
+  'CovariatePreprocessor', 
   'RNG', 
   'ForestSampler', 
   'GlobalVarianceModel', 
diff --git a/stochtree/bart.py b/stochtree/bart.py
index 01733c0a..1e4ee6b0 100644
--- a/stochtree/bart.py
+++ b/stochtree/bart.py
@@ -8,7 +8,7 @@
 from typing import Optional, Dict, Any, Union
 from .data import Dataset, Residual
 from .forest import ForestContainer, Forest
-from .preprocessing import CovariateTransformer, _preprocess_params
+from .preprocessing import CovariatePreprocessor, _preprocess_params
 from .sampler import ForestSampler, RNG, GlobalVarianceModel, LeafVarianceModel
 from .serialization import JSONSerializer
 from .utils import NotSampledError
@@ -301,7 +301,7 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N
         variable_weights_variance = variable_weights
         
         # Covariate preprocessing
-        self._covariate_transformer = CovariateTransformer()
+        self._covariate_transformer = CovariatePreprocessor()
         self._covariate_transformer.fit(X_train)
         X_train_processed = self._covariate_transformer.transform(X_train)
         if X_test is not None:
diff --git a/stochtree/bcf.py b/stochtree/bcf.py
index 8c7ca21c..492cd4dd 100644
--- a/stochtree/bcf.py
+++ b/stochtree/bcf.py
@@ -8,7 +8,7 @@
 from .bart import BARTModel
 from .data import Dataset, Residual
 from .forest import ForestContainer, Forest
-from .preprocessing import CovariateTransformer, _preprocess_params
+from .preprocessing import CovariatePreprocessor, _preprocess_params
 from .sampler import ForestSampler, RNG, GlobalVarianceModel, LeafVarianceModel
 from .serialization import JSONSerializer
 from .utils import NotSampledError
@@ -38,7 +38,7 @@ class BCFModel:
     \begin{aligned}
     y &= a(X) + b_z(X) + \epsilon\\
     b_z(X) &= (b_1 Z + b_0 (1-Z)) t(X)\\
-    b_0, b_1 &\sim N(0, \frac{1}{2})\\\\
+    b_0, b_1 &\sim N\left(0, \frac{1}{2}\right)\\\\
     a(X) &\sim \text{BART}()\\
     t(X) &\sim \text{BART}()\\
     \epsilon &\sim N(0, \sigma^2)\\
@@ -663,7 +663,7 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr
             variable_subset_variance = [i for i in range(X_train.shape[1])]
         
         # Covariate preprocessing
-        self._covariate_transformer = CovariateTransformer()
+        self._covariate_transformer = CovariatePreprocessor()
         self._covariate_transformer.fit(X_train)
         X_train_processed = self._covariate_transformer.transform(X_train)
         if X_test is not None:
diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py
index a586afd8..40019cc8 100644
--- a/stochtree/preprocessing.py
+++ b/stochtree/preprocessing.py
@@ -7,7 +7,9 @@
 from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
 import numpy as np
 import pandas as pd
+from scipy import sparse
 import warnings
+from .serialization import JSONSerializer
 
 def _preprocess_params(default_params: Dict[str, Any], user_params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
     if user_params:
@@ -129,20 +131,44 @@ def _preprocess_bcf_params(params: Optional[Dict[str, Any]] = None) -> Dict[str,
     return processed_params
 
 
-class CovariateTransformer:
+def _map_to_integer(values: Union[np.array, list], uniques: Union[np.array, list]) -> np.array:
+    r"""
+    Slightly modified version of a [scikit-learn function](https://github.com/scikit-learn/scikit-learn/blob/43d440f1f874ac2117ed848b10a6f07d9083488d/sklearn/utils/_encode.py#L170) by the same name.
+    Converts dataframe column values (which might be string, categorical, etc...) to numpy integer indices.
+
+    Parameters
+    ----------
+    values : np.array or list
+        Array of series values.
+    uniques : np.array or list
+        Array / list of unique values in the series; each value's position in `uniques` is the integer it is mapped to.
     """
-    Class that transforms covariates to a format that can be used to define tree splits. 
-    Modeled after the [scikit-learn preprocessing classes](https://scikit-learn.org/1.5/modules/preprocessing.html).
+    table = dict({val: i for i, val in enumerate(uniques)})
+    return np.asarray([table[v] for v in values])
+
+
+class CovariatePreprocessor:
+    r"""
+    Preprocessing engine for covariates provided as either `np.array` or `pd.DataFrame`, which standardizes inputs as a `np.array`.
+
+    `CovariatePreprocessor` uses [column dtypes](https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes) in provided 
+    dataframes to convert string / categorical variables to numeric variables, either by mapping ordinal variables to integers 
+    or by one-hot encoding unordered categorical variables.
+    
+    This class is modeled after the [scikit-learn preprocessing classes](https://scikit-learn.org/1.5/modules/preprocessing.html).
     """
     def __init__(self) -> None:
         self._is_fitted = False
-        self._ordinal_encoders = []
-        self._onehot_encoders = []
-        self._ordinal_feature_index = []
-        self._onehot_feature_index = []
-        self._processed_feature_types = []
-        self._original_feature_types = []
-        self._original_feature_indices = []
+        self._num_ordinal_features = 0
+        self._num_onehot_features = 0
+        self._num_original_features = 0
+        self._ordinal_categories_list = []
+        self._onehot_categories_list = []
+        self._ordinal_feature_index = None
+        self._onehot_feature_index = None
+        self._processed_feature_types = None
+        self._original_feature_types = None
+        self._original_feature_indices = None
     
     def _check_is_numeric_dtype(self, dtype: np.dtype) -> bool:
         if dtype.kind == "b" or dtype.kind == "i" or dtype.kind == "u" or dtype.kind == "f":
@@ -150,27 +176,41 @@ def _check_is_numeric_dtype(self, dtype: np.dtype) -> bool:
         else:
             return False
     
-    def _process_unordered_categorical(self, covariate: pd.Series) -> int:
-        num_onehot = len(self._onehot_encoders)
-        category_list = covariate.array.categories.to_list()
-        enc = OneHotEncoder(categories=[category_list], sparse_output=False)
-        enc.fit(pd.DataFrame(covariate))
-        self._onehot_encoders.append(enc)
-        return num_onehot
+    def _extract_categories_unordered_categorical(self, covariate: pd.Series) -> int:
+        covariate_categories = covariate.array.categories.to_numpy()
+        self._onehot_categories_list.append(covariate_categories)
+        return self._num_onehot_features
+    
+    def _extract_categories_ordered_categorical(self, covariate: pd.Series) -> int:
+        covariate_categories = covariate.array.categories.to_numpy()
+        self._ordinal_categories_list.append(covariate_categories)
+        return self._num_ordinal_features
+    
+    def _transform_unordered_categorical(self, covariate: pd.Series, covariate_categories: np.array) -> np.array:
+        """
+        Adapted from https://github.com/scikit-learn/scikit-learn/blob/8f2c1cab50262bcf4a1ade070446c40028ee27f4/sklearn/preprocessing/_encoders.py#L1000
+        """
+        covariate_data = covariate.array.to_numpy()
+        n = len(covariate_data)
+        integer_indices = _map_to_integer(covariate_data, covariate_categories)
+        row_offsets = np.arange(n + 1, dtype=int)
+        onehot_data = np.ones(row_offsets[-1])
+        out = sparse.csr_matrix(
+            (onehot_data, integer_indices, row_offsets),
+            shape=(n, len(covariate_categories)),
+            dtype=np.float64,
+        )
+        return out.toarray()
     
-    def _process_ordered_categorical(self, covariate: pd.Series) -> int:
-        num_ord = len(self._ordinal_encoders)
-        category_list = covariate.array.categories.to_list()
-        enc = OrdinalEncoder(categories=[category_list])
-        enc.fit(pd.DataFrame(covariate))
-        self._ordinal_encoders.append(enc)
-        return num_ord
+    def _transform_ordered_categorical(self, covariate: pd.Series, covariate_categories: np.array) -> np.array:
+        covariate_data = covariate.array.to_numpy()
+        return _map_to_integer(covariate_data, covariate_categories)
 
     def _fit_pandas(self, covariates: pd.DataFrame) -> None:
         self._num_original_features = covariates.shape[1]
-        self._ordinal_feature_index = [-1 for i in range(self._num_original_features)]
-        self._onehot_feature_index = [-1 for i in range(self._num_original_features)]
-        self._original_feature_types = [-1 for i in range(self._num_original_features)]
+        self._ordinal_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int)
+        self._onehot_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int)
+        original_feature_types = [-1 for i in range(self._num_original_features)]
         datetime_types = covariates.apply(lambda x: pd.api.types.is_datetime64_any_dtype(x))
         object_types = covariates.apply(lambda x: pd.api.types.is_object_dtype(x))
         interval_types = covariates.apply(lambda x: isinstance(x.dtype, pd.IntervalDtype))
@@ -214,36 +254,42 @@ def _fit_pandas(self, covariates: pd.DataFrame) -> None:
             warn_msg = "The following columns are a type unsupported by stochtree (object) and will be ignored: {}"
             warnings.warn(warn_msg.format(object_cols))
         
+        processed_feature_types = []
         for i in range(covariates.shape[1]):
             covariate = covariates.iloc[:,i]
             if categorical_types.iloc[i]:
-                self._original_feature_types[i] = "category"
+                original_feature_types[i] = "category"
                 if covariate.array.ordered:
-                    ord_index = self._process_ordered_categorical(covariate)
+                    ord_index = self._extract_categories_ordered_categorical(covariate)
                     self._ordinal_feature_index[i] = ord_index
-                    self._processed_feature_types.append(1)
+                    processed_feature_types.append(1)
+                    self._num_ordinal_features += 1
                 else:
-                    onehot_index = self._process_unordered_categorical(covariate)
+                    onehot_index = self._extract_categories_unordered_categorical(covariate)
                     self._onehot_feature_index[i] = onehot_index
                     feature_ones = np.repeat(1, len(covariate.array.categories)).tolist()
-                    self._processed_feature_types.extend(feature_ones)
+                    processed_feature_types.extend(feature_ones)
+                    self._num_onehot_features += 1
             elif string_types.iloc[i]:
-                self._original_feature_types[i] = "string"
-                onehot_index = self._process_unordered_categorical(covariate)
+                original_feature_types[i] = "string"
+                onehot_index = self._extract_categories_unordered_categorical(covariate)
                 self._onehot_feature_index[i] = onehot_index
                 feature_ones = np.repeat(1, len(self._onehot_encoders[onehot_index].categories_[0])).tolist()
-                self._processed_feature_types.extend(feature_ones)
+                processed_feature_types.extend(feature_ones)
             elif bool_types.iloc[i]:
-                self._original_feature_types[i] = "boolean"
-                self._processed_feature_types.append(1)
+                original_feature_types[i] = "boolean"
+                processed_feature_types.append(1)
             elif integer_types.iloc[i]:
-                self._original_feature_types[i] = "integer"
-                self._processed_feature_types.append(0)
+                original_feature_types[i] = "integer"
+                processed_feature_types.append(0)
             elif float_types.iloc[i]:
-                self._original_feature_types[i] = "float"
-                self._processed_feature_types.append(0)
+                original_feature_types[i] = "float"
+                processed_feature_types.append(0)
             else:
-                self._original_feature_types[i] = "unsupported"
+                original_feature_types[i] = "unsupported"
+        
+        self._processed_feature_types = np.array(processed_feature_types, dtype=int)
+        self._original_feature_types = np.array(original_feature_types)
     
     def _fit_numpy(self, covariates: np.array) -> None:
         if covariates.ndim == 1:
@@ -252,9 +298,9 @@ def _fit_numpy(self, covariates: np.array) -> None:
             raise ValueError("Covariates passed as a numpy array must be 1d or 2d")
         
         self._num_original_features = covariates.shape[1]
-        self._ordinal_feature_index = [-1 for i in range(self._num_original_features)]
-        self._onehot_feature_index = [-1 for i in range(self._num_original_features)]
-        self._original_feature_types = ["float" for i in range(self._num_original_features)]
+        self._ordinal_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int)
+        self._onehot_feature_index = np.array([-1 for i in range(self._num_original_features)], dtype=int)
+        self._original_feature_types = np.array(["float" for i in range(self._num_original_features)])
 
         # Check whether the array is numeric
         cov_dtype = covariates.dtype
@@ -269,12 +315,16 @@ def _fit_numpy(self, covariates: np.array) -> None:
             raise ValueError("Covariates passed as np.array must all be simple numeric types (bool, integer, unsigned integer, floating point)")
         
         # Scan for binary columns
+        processed_feature_types = []
         for i in range(self._num_original_features):
             num_unique = np.unique(covariates[:,i]).size
             if num_unique == 2:
-                self._processed_feature_types.append(1)
+                processed_feature_types.append(1)
             else:
-                self._processed_feature_types.append(0)
+                processed_feature_types.append(0)
+            # TODO: Convert to integer if not passed as integer
+        
+        self._processed_feature_types = np.array(processed_feature_types, dtype=int)
 
     def _fit(self, covariates: Union[pd.DataFrame, np.array]) -> None:
         if isinstance(covariates, pd.DataFrame):
@@ -291,33 +341,38 @@ def _transform_pandas(self, covariates: pd.DataFrame) -> np.array:
         
         output_array = np.empty((covariates.shape[0], len(self._processed_feature_types)), dtype=np.float64)
         output_iter = 0
-        self._original_feature_indices = []
+        original_feature_indices = []
+        print(self._original_feature_types)
         for i in range(covariates.shape[1]):
             covariate = covariates.iloc[:,i]
             if self._original_feature_types[i] == "category" or self._original_feature_types[i] == "string":
                 if self._ordinal_feature_index[i] != -1:
                     ord_ind = self._ordinal_feature_index[i]
-                    covariate_transformed = self._ordinal_encoders[ord_ind].transform(pd.DataFrame(covariate))
+                    covariate_categories = self._ordinal_categories_list[ord_ind]
+                    covariate_transformed = self._transform_ordered_categorical(covariate, covariate_categories)
                     output_array[:,output_iter] = np.squeeze(covariate_transformed)
                     output_iter += 1
-                    self._original_feature_indices.append(i)
+                    original_feature_indices.append(i)
                 else:
                     onehot_ind = self._onehot_feature_index[i]
-                    covariate_transformed = self._onehot_encoders[onehot_ind].transform(pd.DataFrame(covariate))
+                    covariate_categories = self._onehot_categories_list[onehot_ind]
+                    covariate_transformed = self._transform_unordered_categorical(covariate, covariate_categories)
                     output_dim = covariate_transformed.shape[1]
                     output_array[:,np.arange(output_iter, output_iter + output_dim)] = np.squeeze(covariate_transformed)
                     output_iter += output_dim
-                    self._original_feature_indices.extend([i for _ in range(output_dim)])
+                    original_feature_indices.extend([i for _ in range(output_dim)])
             
             elif self._original_feature_types[i] == "boolean":
                 output_array[:,output_iter] = (covariate*1.0).to_numpy()
                 output_iter += 1
-                self._original_feature_indices.append(i)
+                original_feature_indices.append(i)
             
             elif self._original_feature_types[i] == "integer" or self._original_feature_types[i] == "float":
                 output_array[:,output_iter] = (covariate).to_numpy()
                 output_iter += 1
-                self._original_feature_indices.append(i)
+                original_feature_indices.append(i)
+        
+        self._original_feature_indices = np.array(original_feature_indices, dtype=int)
         
         return output_array
 
@@ -346,7 +401,7 @@ def _check_is_fitted(self) -> bool:
         return self._is_fitted
 
     def fit(self, covariates: Union[pd.DataFrame, np.array]) -> None:
-        r"""Fits a `CovariateTransformer` by unpacking (and storing) data type information on the input (raw) covariates
+        r"""Fits a `CovariatePreprocessor` by unpacking (and storing) data type information on the input (raw) covariates
         and then converting to a numpy array which can be passed to a tree ensemble sampler.
 
         If `covariates` is a `pd.DataFrame`, [column dtypes](https://pandas.pydata.org/docs/user_guide/basics.html#basics-dtypes) 
@@ -431,3 +486,105 @@ def fetch_original_feature_indices(self) -> list:
             through `k` numeric features, this method would return a list `[0,...,k-1]`.
         """
         return self._original_feature_indices
+    
+    def to_json(self) -> str:
+        """
+        Converts a covariate preprocessor to JSON string representation (which can then be saved to a file or 
+        processed using the `json` library)
+
+        Returns
+        -------
+        str
+            JSON string representing the preprocessor's fitted state (feature counts, category lists, feature indices, and feature types)
+        """
+        # Initialize JSONSerializer object
+        preprocessor_json = JSONSerializer()
+        
+        # Add internal scalars
+        preprocessor_json.add_boolean("is_fitted", self._is_fitted)
+        preprocessor_json.add_integer("num_ordinal_features", self._num_ordinal_features)
+        preprocessor_json.add_integer("num_onehot_features", self._num_onehot_features)
+        preprocessor_json.add_integer("num_original_features", self._num_original_features)
+
+        # Add internal lists
+        for i in range(self._num_ordinal_features):
+            dtype_name = "dtype_{:d}".format(i)
+            list_name = "cats_{:d}".format(i)
+            if np.issubdtype(self._ordinal_categories_list[i].dtype, np.integer):
+                array_type = "int"
+                preprocessor_json.add_integer_vector(list_name, self._ordinal_categories_list[i], "ordinal_categories_list")
+            elif np.issubdtype(self._ordinal_categories_list[i].dtype, np.floating):
+                array_type = "float"
+                preprocessor_json.add_numeric_vector(list_name, self._ordinal_categories_list[i], "ordinal_categories_list")
+            else:
+                array_type = "str"
+                preprocessor_json.add_string_vector(list_name, self._ordinal_categories_list[i], "ordinal_categories_list")
+            preprocessor_json.add_string(dtype_name, array_type, "ordinal_dtype_list")
+        for i in range(self._num_onehot_features):
+            dtype_name = "dtype_{:d}".format(i)
+            list_name = "cats_{:d}".format(i)
+            if np.issubdtype(self._onehot_categories_list[i].dtype, np.integer):
+                array_type = "int"
+                preprocessor_json.add_integer_vector(list_name, self._onehot_categories_list[i], "onehot_categories_list")
+            elif np.issubdtype(self._onehot_categories_list[i].dtype, np.floating):
+                array_type = "float"
+                preprocessor_json.add_numeric_vector(list_name, self._onehot_categories_list[i], "onehot_categories_list")
+            else:
+                array_type = "str"
+                preprocessor_json.add_string_vector(list_name, self._onehot_categories_list[i], "onehot_categories_list")
+            preprocessor_json.add_string(dtype_name, array_type, "onehot_dtype_list")
+        preprocessor_json.add_integer_vector("ordinal_feature_index", self._ordinal_feature_index)
+        preprocessor_json.add_integer_vector("onehot_feature_index", self._onehot_feature_index)
+        preprocessor_json.add_integer_vector("processed_feature_types", self._processed_feature_types)
+        preprocessor_json.add_string_vector("original_feature_types", self._original_feature_types)
+        preprocessor_json.add_integer_vector("original_feature_indices", self._original_feature_indices)
+        
+        return preprocessor_json.return_json_string()
+
+    def from_json(self, json_string: str) -> None:
+        """
+        Populates this covariate preprocessor in place from its JSON string representation.
+
+        Parameters
+        ----------
+        json_string : str
+            JSON string representing a covariate preprocessor's fitted state (feature counts, category lists, feature indices, and feature types)
+        """
+        # Parse string to a JSON object in C++
+        preprocessor_json = JSONSerializer()
+        preprocessor_json.load_from_json_string(json_string)
+        
+        # Unpack internal scalars
+        self._is_fitted = preprocessor_json.get_boolean("is_fitted")
+        self._num_ordinal_features = preprocessor_json.get_integer("num_ordinal_features")
+        self._num_onehot_features = preprocessor_json.get_integer("num_onehot_features")
+        self._num_original_features = preprocessor_json.get_integer("num_original_features")
+
+        # Unpack internal lists
+        self._ordinal_categories_list = []
+        for i in range(self._num_ordinal_features):
+            dtype_name = "dtype_{:d}".format(i)
+            list_name = "cats_{:d}".format(i)
+            array_type = preprocessor_json.get_string(dtype_name, "ordinal_dtype_list")
+            if array_type == "int":
+                self._ordinal_categories_list.append(preprocessor_json.get_integer_vector(list_name, "ordinal_categories_list"))
+            elif array_type == "float":
+                self._ordinal_categories_list.append(preprocessor_json.get_numeric_vector(list_name, "ordinal_categories_list"))
+            else:
+                self._ordinal_categories_list.append(preprocessor_json.get_string_vector(list_name, "ordinal_categories_list"))
+        self._onehot_categories_list = []
+        for i in range(self._num_onehot_features):
+            dtype_name = "dtype_{:d}".format(i)
+            list_name = "cats_{:d}".format(i)
+            array_type = preprocessor_json.get_string(dtype_name, "onehot_dtype_list")
+            if array_type == "int":
+                self._onehot_categories_list.append(preprocessor_json.get_integer_vector(list_name, "onehot_categories_list"))
+            elif array_type == "float":
+                self._onehot_categories_list.append(preprocessor_json.get_numeric_vector(list_name, "onehot_categories_list"))
+            else:
+                self._onehot_categories_list.append(np.array(preprocessor_json.get_string_vector(list_name, "onehot_categories_list")))
+        self._ordinal_feature_index = preprocessor_json.get_integer_vector("ordinal_feature_index")
+        self._onehot_feature_index = preprocessor_json.get_integer_vector("onehot_feature_index")
+        self._processed_feature_types = preprocessor_json.get_integer_vector("processed_feature_types")
+        self._original_feature_types = preprocessor_json.get_string_vector("original_feature_types")
+        self._original_feature_indices = preprocessor_json.get_integer_vector("original_feature_indices")
diff --git a/stochtree/serialization.py b/stochtree/serialization.py
index acbb9e85..b6d3a93b 100644
--- a/stochtree/serialization.py
+++ b/stochtree/serialization.py
@@ -1,6 +1,7 @@
 import warnings
 import numpy as np
 import pandas as pd
+from typing import Union
 from scipy.linalg import lstsq
 from scipy.stats import gamma
 from .forest import ForestContainer
@@ -66,6 +67,23 @@ def add_scalar(self, field_name: str, field_value: float, subfolder_name: str =
         else:
             self.json_cpp.AddDoubleSubfolder(subfolder_name, field_name, field_value)
     
+    def add_integer(self, field_name: str, field_value: int, subfolder_name: str = None) -> None:
+        """Adds an integer value to a json object
+
+        Parameters
+        ----------
+        field_name : str
+            Name of the json field / label under which the integer value will be stored
+        field_value : int
+            Integer value to be stored
+        subfolder_name : str, optional
+            Name of "subfolder" under which `field_name` will be stored in the json hierarchy
+        """
+        if subfolder_name is None:
+            self.json_cpp.AddInteger(field_name, field_value)
+        else:
+            self.json_cpp.AddIntegerSubfolder(subfolder_name, field_name, field_value)
+    
     def add_boolean(self, field_name: str, field_value: bool, subfolder_name: str = None) -> None:
         """Adds a scalar (boolean) value to a json object
 
@@ -125,6 +143,33 @@ def add_numeric_vector(self, field_name: str, field_vector: np.array, subfolder_
         else:
             self.json_cpp.AddDoubleVectorSubfolder(subfolder_name, field_name, field_vector)
     
+    def add_integer_vector(self, field_name: str, field_vector: np.array, subfolder_name: str = None) -> None:
+        """Adds an integer vector (stored as a numpy array) to a json object
+
+        Parameters
+        ----------
+        field_name : str
+            Name of the json field / label under which the integer vector will be stored
+        field_vector : np.array
+            Numpy array containing the vector to be stored in json. Should be one-dimensional.
+        subfolder_name : str, optional
+            Name of "subfolder" under which `field_name` will be stored in the json hierarchy
+        """
+        # Runtime checks
+        if not isinstance(field_vector, np.ndarray):
+            raise ValueError("field_vector must be a numpy array")
+        if not np.issubdtype(field_vector.dtype, np.integer):
+            raise ValueError("field_vector must be a numpy array with integer data types")
+        field_vector = np.atleast_1d(np.squeeze(field_vector))  # squeeze() alone collapses a length-1 vector to a 0-d array
+        if field_vector.ndim > 1:
+            warnings.warn("field_vector has more than 1 dimension. It will be flattened in row-major order using np.ravel()")
+            field_vector = np.ravel(field_vector, order = "C")
+        
+        if subfolder_name is None:
+            self.json_cpp.AddIntegerVector(field_name, field_vector)
+        else:
+            self.json_cpp.AddIntegerVectorSubfolder(subfolder_name, field_name, field_vector)
+    
     def add_string_vector(self, field_name: str, field_vector: list, subfolder_name: str = None) -> None:
         """Adds a list of strings to a json object as an array
 
@@ -138,9 +183,11 @@ def add_string_vector(self, field_name: str, field_vector: list, subfolder_name:
             Name of "subfolder" under which `field_name` to be stored in the json hierarchy
         """
         # Runtime checks
-        if not isinstance(field_vector, list):
-            raise ValueError("field_vector must be a list")
+        if not isinstance(field_vector, list) and not isinstance(field_vector, np.ndarray):
+            raise ValueError("field_vector must be a list or numpy object array")
         
+        if isinstance(field_vector, np.ndarray):
+            field_vector = field_vector.tolist()
         if subfolder_name is None:
             self.json_cpp.AddStringVector(field_name, field_vector)
         else:
@@ -161,6 +208,21 @@ def get_scalar(self, field_name: str, subfolder_name: str = None) -> float:
         else:
             return self.json_cpp.ExtractDoubleSubfolder(subfolder_name, field_name)
     
+    def get_integer(self, field_name: str, subfolder_name: str = None) -> int:
+        """Retrieves an integer value from a json object
+
+        Parameters
+        ----------
+        field_name : str
+            Name of the json field / label under which the integer value is stored
+        subfolder_name : str, optional
+            Name of "subfolder" under which `field_name` is stored in the json hierarchy
+        """
+        if subfolder_name is None:
+            return self.json_cpp.ExtractInteger(field_name)
+        else:
+            return self.json_cpp.ExtractIntegerSubfolder(subfolder_name, field_name)
+    
     def get_boolean(self, field_name: str, subfolder_name: str = None) -> bool:
         """Retrieves a scalar (boolean) value from a json object
 
@@ -177,12 +239,12 @@ def get_boolean(self, field_name: str, subfolder_name: str = None) -> bool:
             return self.json_cpp.ExtractBoolSubfolder(subfolder_name, field_name)
     
     def get_string(self, field_name: str, subfolder_name: str = None) -> str:
-        """Retrieve a string to a json object
+        """Retrieve a string from a json object
 
         Parameters
         ----------
         field_name : str
-            Name of the json field / label under which the numeric value is stored
+            Name of the json field / label under which the string is stored
         subfolder_name : str, optional
             Name of "subfolder" under which `field_name` is stored in the json hierarchy
         """
@@ -192,7 +254,7 @@ def get_string(self, field_name: str, subfolder_name: str = None) -> str:
             return self.json_cpp.ExtractStringSubfolder(subfolder_name, field_name)
     
     def get_numeric_vector(self, field_name: str, subfolder_name: str = None) -> np.array:
-        """Adds a string to a json object
+        """Retrieve numeric vector from a json object
 
         Parameters
         ----------
@@ -206,6 +268,21 @@ def get_numeric_vector(self, field_name: str, subfolder_name: str = None) -> np.
         else:
             return self.json_cpp.ExtractDoubleVectorSubfolder(subfolder_name, field_name)
     
+    def get_integer_vector(self, field_name: str, subfolder_name: str = None) -> np.array:
+        """Retrieve an integer vector from a json object
+
+        Parameters
+        ----------
+        field_name : str
+            Name of the json field / label under which the integer vector is stored
+        subfolder_name : str, optional
+            Name of "subfolder" under which `field_name` is stored in the json hierarchy
+        """
+        if subfolder_name is None:
+            return self.json_cpp.ExtractIntegerVector(field_name)
+        else:
+            return self.json_cpp.ExtractIntegerVectorSubfolder(subfolder_name, field_name)
+    
     def get_string_vector(self, field_name: str, subfolder_name: str = None) -> list:
         """Adds a string to a json object
 
diff --git a/test/python/test_calibration.py b/test/python/test_calibration.py
index 312b9632..0cc437a8 100644
--- a/test/python/test_calibration.py
+++ b/test/python/test_calibration.py
@@ -3,7 +3,6 @@
 from sklearn import linear_model
 from sklearn.metrics import mean_squared_error
 from scipy.stats import gamma
-from stochtree import CovariateTransformer
 from stochtree import calibrate_global_error_variance
 import pytest
 
diff --git a/test/python/test_json.py b/test/python/test_json.py
index 2bd71cd8..d1291254 100644
--- a/test/python/test_json.py
+++ b/test/python/test_json.py
@@ -1,7 +1,8 @@
 import numpy as np
+import pandas as pd
 from stochtree import (
     BARTModel, BCFModel, JSONSerializer, ForestContainer, Forest, Dataset, Residual, 
-    RNG, ForestSampler, ForestContainer, GlobalVarianceModel
+    RNG, ForestSampler, ForestContainer, GlobalVarianceModel, CovariatePreprocessor
 )
 
 class TestJson:
@@ -26,6 +27,28 @@ def test_array(self):
         np.testing.assert_array_equal(a, json_test.get_numeric_vector("a"))
         assert b == json_test.get_string_vector("b")
 
+    def test_preprocessor(self):
+        df = pd.DataFrame(
+            {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1],
+             "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=False, categories=['c', 'b', 'a']),
+             "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]}
+        )
+        # arr = np.array(
+        #     [[1.5, 0, 0, 1, 1.2],
+        #      [2.7, 0, 1, 0, 5.4],
+        #      [3.6, 1, 0, 0, 9.3],
+        #      [4.4, 0, 0, 1, 10.4],
+        #      [5.3, 0, 1, 0, 3.6],
+        #      [6.1, 1, 0, 0, 4.4]]
+        # )
+        cov_transformer = CovariatePreprocessor()
+        df_transformed_orig = cov_transformer.fit_transform(df)
+        cov_transformer_json = cov_transformer.to_json()
+        cov_transformer_reloaded = CovariatePreprocessor()
+        cov_transformer_reloaded.from_json(cov_transformer_json)
+        df_transformed_reloaded = cov_transformer_reloaded.transform(df)
+        np.testing.assert_array_equal(df_transformed_orig, df_transformed_reloaded)
+
     def test_forest(self):
         # Generate sample data
         random_seed = 1234
diff --git a/test/python/test_preprocessor.py b/test/python/test_preprocessor.py
index 4282a20e..87e338e7 100644
--- a/test/python/test_preprocessor.py
+++ b/test/python/test_preprocessor.py
@@ -1,10 +1,10 @@
 import numpy as np
 import pandas as pd
-from stochtree import CovariateTransformer
+from stochtree import CovariatePreprocessor
 
 class TestPreprocessor:
     def test_numpy(self):
-        cov_transformer = CovariateTransformer()
+        cov_transformer = CovariatePreprocessor()
         np_1 = np.array(
             [[1.5, 8.7, 1.2],
              [2.7, 3.4, 5.4],
@@ -15,7 +15,7 @@ def test_numpy(self):
         )
         np_1_transformed = cov_transformer.fit_transform(np_1)
         np.testing.assert_array_equal(np_1, np_1_transformed)
-        assert cov_transformer._processed_feature_types == [0,0,0]
+        np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,0,0]))
 
     def test_pandas(self):
         df_1 = pd.DataFrame(
@@ -31,10 +31,10 @@ def test_pandas(self):
              [5.3, 9.3, 3.6],
              [6.1, 10.4, 4.4]]
         )
-        cov_transformer = CovariateTransformer()
+        cov_transformer = CovariatePreprocessor()
         df_1_transformed = cov_transformer.fit_transform(df_1)
         np.testing.assert_array_equal(np_1, df_1_transformed)
-        assert cov_transformer._processed_feature_types == [0,0,0]
+        np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,0,0]))
 
         df_2 = pd.DataFrame(
             {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1],
@@ -49,10 +49,10 @@ def test_pandas(self):
              [5.3, 1, 3.6],
              [6.1, 0, 4.4]]
         )
-        cov_transformer = CovariateTransformer()
+        cov_transformer = CovariatePreprocessor()
         df_2_transformed = cov_transformer.fit_transform(df_2)
         np.testing.assert_array_equal(np_2, df_2_transformed)
-        assert cov_transformer._processed_feature_types == [0,1,0]
+        np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,1,0]))
 
         df_3 = pd.DataFrame(
             {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1],
@@ -67,14 +67,14 @@ def test_pandas(self):
              [5.3, 0, 1, 0, 3.6],
              [6.1, 1, 0, 0, 4.4]]
         )
-        cov_transformer = CovariateTransformer()
+        cov_transformer = CovariatePreprocessor()
         df_3_transformed = cov_transformer.fit_transform(df_3)
         np.testing.assert_array_equal(np_3, df_3_transformed)
-        assert cov_transformer._processed_feature_types == [0,1,1,1,0]
+        np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,1,1,1,0]))
 
         df_4 = pd.DataFrame(
             {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1, 7.6],
-             "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c', 'd'], ordered=False, categories=['c', 'b', 'a']),
+             "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c', 'c'], ordered=False, categories=['c', 'b', 'a', 'd']),
              "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4, 3.4]}
         )
         np_4 = np.array(
@@ -83,10 +83,11 @@ def test_pandas(self):
              [3.6, 1, 0, 0, 0, 9.3],
              [4.4, 0, 0, 1, 0, 10.4],
              [5.3, 0, 1, 0, 0, 3.6],
-             [6.1, 1, 0, 0, 1, 4.4]]
+             [6.1, 1, 0, 0, 0, 4.4],
+             [7.6, 1, 0, 0, 0, 3.4]]
         )
-        cov_transformer = CovariateTransformer()
-        with np.testing.assert_raises(ValueError):
-            df_4_transformed = cov_transformer.fit_transform(df_4)
-            # np.testing.assert_array_equal(np_4, df_4_transformed)
-            # assert cov_transformer._processed_feature_types == [0,1,1,1,1,0]
+        cov_transformer = CovariatePreprocessor()
+        df_4_transformed = cov_transformer.fit_transform(df_4)
+        np.testing.assert_array_equal(np_4, df_4_transformed)
+        np.testing.assert_array_equal(cov_transformer._processed_feature_types, np.array([0,1,1,1,1,0]))
+        
\ No newline at end of file

From 8aacffea2f59c9c86dddfb5482b29e18d168e3d3 Mon Sep 17 00:00:00 2001
From: Drew Herren <drewherrenopensource@gmail.com>
Date: Wed, 22 Jan 2025 13:41:37 -0600
Subject: [PATCH 3/5] Updated python serialization test suite

---
 test/python/test_json.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/test/python/test_json.py b/test/python/test_json.py
index d1291254..ed002626 100644
--- a/test/python/test_json.py
+++ b/test/python/test_json.py
@@ -33,14 +33,6 @@ def test_preprocessor(self):
              "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=False, categories=['c', 'b', 'a']),
              "x3": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]}
         )
-        # arr = np.array(
-        #     [[1.5, 0, 0, 1, 1.2],
-        #      [2.7, 0, 1, 0, 5.4],
-        #      [3.6, 1, 0, 0, 9.3],
-        #      [4.4, 0, 0, 1, 10.4],
-        #      [5.3, 0, 1, 0, 3.6],
-        #      [6.1, 1, 0, 0, 4.4]]
-        # )
         cov_transformer = CovariatePreprocessor()
         df_transformed_orig = cov_transformer.fit_transform(df)
         cov_transformer_json = cov_transformer.to_json()
@@ -49,6 +41,21 @@ def test_preprocessor(self):
         df_transformed_reloaded = cov_transformer_reloaded.transform(df)
         np.testing.assert_array_equal(df_transformed_orig, df_transformed_reloaded)
 
+        df_2 = pd.DataFrame(
+            {"x1": [1.5, 2.7, 3.6, 4.4, 5.3, 6.1],
+             "x2": pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=False, categories=['c', 'b', 'a']),
+             "x3": pd.Categorical(['a', 'c', 'd', 'b', 'd', 'b'], ordered=False, categories=['c', 'b', 'a', 'd']),
+             "x4": pd.Categorical(['a', 'b', 'f', 'f', 'c', 'a'], ordered=True, categories=['c', 'b', 'a', 'f']),
+             "x5": [1.2, 5.4, 9.3, 10.4, 3.6, 4.4]}
+        )
+        cov_transformer_2 = CovariatePreprocessor()
+        df_transformed_orig_2 = cov_transformer_2.fit_transform(df_2)
+        cov_transformer_json_2 = cov_transformer_2.to_json()
+        cov_transformer_reloaded_2 = CovariatePreprocessor()
+        cov_transformer_reloaded_2.from_json(cov_transformer_json_2)
+        df_transformed_reloaded_2 = cov_transformer_reloaded_2.transform(df_2)
+        np.testing.assert_array_equal(df_transformed_orig_2, df_transformed_reloaded_2)
+
     def test_forest(self):
         # Generate sample data
         random_seed = 1234

From b288d9b292e2d3cc55cdb5d09a30bbfc1fbf49d4 Mon Sep 17 00:00:00 2001
From: Drew Herren <drewherrenopensource@gmail.com>
Date: Wed, 22 Jan 2025 15:48:06 -0600
Subject: [PATCH 4/5] Fixed bugs and added covariate preprocessor serialization
 to BART / BCF

---
 stochtree/bart.py          | 135 ++++++++++++++++++++++++++++++-------
 stochtree/bcf.py           |  21 ++++--
 stochtree/preprocessing.py |   4 +-
 test/python/test_json.py   |  11 +++
 4 files changed, 138 insertions(+), 33 deletions(-)

diff --git a/stochtree/bart.py b/stochtree/bart.py
index 1e4ee6b0..0159fc92 100644
--- a/stochtree/bart.py
+++ b/stochtree/bart.py
@@ -1,6 +1,7 @@
 """
 Bayesian Additive Regression Trees (BART) module
 """
+import warnings
 from numbers import Number, Integral
 from math import log
 import numpy as np
@@ -52,7 +53,8 @@ def __init__(self) -> None:
         self.sampled = False
         self.rng = np.random.default_rng()
     
-    def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = None, X_test: np.array = None, basis_test: np.array = None, 
+    def sample(self, X_train: Union[np.array, pd.DataFrame], y_train: np.array, basis_train: np.array = None, 
+               X_test: Union[np.array, pd.DataFrame] = None, basis_test: np.array = None, 
                num_gfr: int = 5, num_burnin: int = 0, num_mcmc: int = 100, general_params: Optional[Dict[str, Any]] = None, 
                mean_forest_params: Optional[Dict[str, Any]] = None, variance_forest_params: Optional[Dict[str, Any]] = None) -> None:
         """Runs a BART sampler on provided training set. Predictions will be cached for the training set and (if provided) the test set. 
@@ -301,13 +303,13 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N
         variable_weights_variance = variable_weights
         
         # Covariate preprocessing
-        self._covariate_transformer = CovariatePreprocessor()
-        self._covariate_transformer.fit(X_train)
-        X_train_processed = self._covariate_transformer.transform(X_train)
+        self._covariate_preprocessor = CovariatePreprocessor()
+        self._covariate_preprocessor.fit(X_train)
+        X_train_processed = self._covariate_preprocessor.transform(X_train)
         if X_test is not None:
-            X_test_processed = self._covariate_transformer.transform(X_test)
-        feature_types = np.asarray(self._covariate_transformer._processed_feature_types)
-        original_var_indices = self._covariate_transformer.fetch_original_feature_indices()
+            X_test_processed = self._covariate_preprocessor.transform(X_test)
+        feature_types = np.asarray(self._covariate_preprocessor._processed_feature_types)
+        original_var_indices = self._covariate_preprocessor.fetch_original_feature_indices()
 
         # Determine whether a test set is provided
         self.has_test = X_test is not None
@@ -718,7 +720,7 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N
                 else:
                     self.sigma2_x_test = sigma_x_test_raw*self.sigma2_init*self.y_std*self.y_std
 
-    def predict(self, covariates: np.array, basis: np.array = None) -> Union[np.array, tuple]:
+    def predict(self, covariates: Union[np.array, pd.DataFrame], basis: np.array = None) -> Union[np.array, tuple]:
         """Return predictions from every forest sampled (either / both of mean and variance). 
         Return type is either a single array of predictions, if a BART model only includes a 
         mean or variance term, or a tuple of prediction arrays, if a BART model includes both.
@@ -744,22 +746,44 @@ def predict(self, covariates: np.array, basis: np.array = None) -> Union[np.arra
             )
             raise NotSampledError(msg)
         
+        # Data checks
+        if not isinstance(covariates, pd.DataFrame) and not isinstance(covariates, np.ndarray):
+            raise ValueError("covariates must be a pandas dataframe or numpy array")
+        if basis is not None:
+            if not isinstance(basis, np.ndarray):
+                raise ValueError("basis must be a numpy array")
+            if basis.shape[0] != covariates.shape[0]:
+                raise ValueError("covariates and basis must have the same number of rows")
+        
         # Convert everything to standard shape (2-dimensional)
-        if covariates.ndim == 1:
-            covariates = np.expand_dims(covariates, 1)
+        if isinstance(covariates, np.ndarray):
+            if covariates.ndim == 1:
+                covariates = np.expand_dims(covariates, 1)
         if basis is not None:
             if basis.ndim == 1:
                 basis = np.expand_dims(basis, 1)
         
-        # Data checks
-        if basis is not None:
-            if basis.shape[0] != covariates.shape[0]:
-                raise ValueError("covariates and basis must have the same number of rows")
+        # Covariate preprocessing
+        if not self._covariate_preprocessor._check_is_fitted():
+            if not isinstance(covariates, np.ndarray):
+                raise ValueError("Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe.")
+            else:
+                warnings.warn("This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data a a Pandas dataframe.", RuntimeWarning)
+                if not np.issubdtype(covariates.dtype, np.floating) and not np.issubdtype(covariates.dtype, np.integer):
+                    raise ValueError("Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.")
+                covariates_processed = covariates
+        else:
+            # Reuse the preprocessor fitted in sample() or restored by from_json();
+            # refitting on the prediction data would overwrite the training-time encoding.
+            covariates_processed = self._covariate_preprocessor.transform(covariates)
 
+        # Dataset construction
         pred_dataset = Dataset()
-        pred_dataset.add_covariates(covariates)
+        pred_dataset.add_covariates(covariates_processed)
         if basis is not None:
             pred_dataset.add_basis(basis)
+        
+        # Forest predictions
         if self.include_mean_forest:
             mean_pred_raw = self.forest_container_mean.forest_container_cpp.Predict(pred_dataset.dataset_cpp)
             mean_pred = mean_pred_raw*self.y_std + self.y_bar
@@ -808,22 +832,44 @@ def predict_mean(self, covariates: np.array, basis: np.array = None) -> np.array
             )
             raise NotSampledError(msg)
         
+        # Data checks
+        if not isinstance(covariates, pd.DataFrame) and not isinstance(covariates, np.ndarray):
+            raise ValueError("covariates must be a pandas dataframe or numpy array")
+        if basis is not None:
+            if not isinstance(basis, np.ndarray):
+                raise ValueError("basis must be a numpy array")
+            if basis.shape[0] != covariates.shape[0]:
+                raise ValueError("covariates and basis must have the same number of rows")
+        
         # Convert everything to standard shape (2-dimensional)
-        if covariates.ndim == 1:
-            covariates = np.expand_dims(covariates, 1)
+        if isinstance(covariates, np.ndarray):
+            if covariates.ndim == 1:
+                covariates = np.expand_dims(covariates, 1)
         if basis is not None:
             if basis.ndim == 1:
                 basis = np.expand_dims(basis, 1)
         
-        # Data checks
-        if basis is not None:
-            if basis.shape[0] != covariates.shape[0]:
-                raise ValueError("covariates and basis must have the same number of rows")
+        # Covariate preprocessing
+        if not self._covariate_preprocessor._check_is_fitted():
+            if not isinstance(covariates, np.ndarray):
+                raise ValueError("Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe.")
+            else:
+                warnings.warn("This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data a a Pandas dataframe.", RuntimeWarning)
+                if not np.issubdtype(covariates.dtype, np.floating) and not np.issubdtype(covariates.dtype, np.integer):
+                    raise ValueError("Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.")
+                covariates_processed = covariates
+        else:
+            # Reuse the preprocessor fitted in sample() or restored by from_json();
+            # refitting on the prediction data would overwrite the training-time encoding.
+            covariates_processed = self._covariate_preprocessor.transform(covariates)
 
+        # Dataset construction
         pred_dataset = Dataset()
-        pred_dataset.add_covariates(covariates)
+        pred_dataset.add_covariates(covariates_processed)
         if basis is not None:
             pred_dataset.add_basis(basis)
+        
+        # Mean forest predictions
         mean_pred_raw = self.forest_container_mean.forest_container_cpp.Predict(pred_dataset.dataset_cpp)
         mean_pred = mean_pred_raw*self.y_std + self.y_bar
 
@@ -856,12 +902,34 @@ def predict_variance(self, covariates: np.array) -> np.array:
             )
             raise NotSampledError(msg)
         
+        # Data checks
+        if not isinstance(covariates, pd.DataFrame) and not isinstance(covariates, np.ndarray):
+            raise ValueError("covariates must be a pandas dataframe or numpy array")
+        
         # Convert everything to standard shape (2-dimensional)
-        if covariates.ndim == 1:
-            covariates = np.expand_dims(covariates, 1)
+        if isinstance(covariates, np.ndarray):
+            if covariates.ndim == 1:
+                covariates = np.expand_dims(covariates, 1)
+        
+        # Covariate preprocessing
+        if not self._covariate_preprocessor._check_is_fitted():
+            if not isinstance(covariates, np.ndarray):
+                raise ValueError("Prediction cannot proceed on a pandas dataframe, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing covariate data as a Pandas dataframe.")
+            else:
+                warnings.warn("This BART model has not run any covariate preprocessing routines. We will attempt to predict on the raw covariate values, but this will trigger an error with non-numeric columns. Please refit your model by passing non-numeric covariate data a a Pandas dataframe.", RuntimeWarning)
+                if not np.issubdtype(covariates.dtype, np.floating) and not np.issubdtype(covariates.dtype, np.integer):
+                    raise ValueError("Prediction cannot proceed on a non-numeric numpy array, since the BART model was not fit with a covariate preprocessor. Please refit your model by passing non-numeric covariate data as a Pandas dataframe.")
+                covariates_processed = covariates
+        else:
+            # Reuse the preprocessor fitted in sample() or restored by from_json();
+            # refitting on the prediction data would overwrite the training-time encoding.
+            covariates_processed = self._covariate_preprocessor.transform(covariates)
         
+        # Dataset construction
         pred_dataset = Dataset()
-        pred_dataset.add_covariates(covariates)
+        pred_dataset.add_covariates(covariates_processed)
+        
+        # Variance forest predictions
         variance_pred_raw = self.forest_container_variance.forest_container_cpp.Predict(pred_dataset.dataset_cpp)
         if self.sample_sigma_global:
             variance_pred = variance_pred_raw
@@ -920,6 +996,10 @@ def to_json(self) -> str:
         if self.sample_sigma_leaf:
             bart_json.add_numeric_vector("sigma2_leaf_samples", self.leaf_scale_samples, "parameters")
         
+        # Add covariate preprocessor
+        covariate_preprocessor_string = self._covariate_preprocessor.to_json()
+        bart_json.add_string("covariate_preprocessor", covariate_preprocessor_string)
+        
         return bart_json.return_json_string()
 
     def from_json(self, json_string: str) -> None:
@@ -971,6 +1051,11 @@ def from_json(self, json_string: str) -> None:
         if self.sample_sigma_leaf:
             self.leaf_scale_samples = bart_json.get_numeric_vector("sigma2_leaf_samples", "parameters")
         
+        # Unpack covariate preprocessor
+        covariate_preprocessor_string = bart_json.get_string("covariate_preprocessor")
+        self._covariate_preprocessor = CovariatePreprocessor()
+        self._covariate_preprocessor.from_json(covariate_preprocessor_string)
+        
         # Mark the deserialized model as "sampled"
         self.sampled = True
     
diff --git a/stochtree/bcf.py b/stochtree/bcf.py
index 492cd4dd..4f24234b 100644
--- a/stochtree/bcf.py
+++ b/stochtree/bcf.py
@@ -663,13 +663,13 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr
             variable_subset_variance = [i for i in range(X_train.shape[1])]
         
         # Covariate preprocessing
-        self._covariate_transformer = CovariatePreprocessor()
-        self._covariate_transformer.fit(X_train)
-        X_train_processed = self._covariate_transformer.transform(X_train)
+        self._covariate_preprocessor = CovariatePreprocessor()
+        self._covariate_preprocessor.fit(X_train)
+        X_train_processed = self._covariate_preprocessor.transform(X_train)
         if X_test is not None:
-            X_test_processed = self._covariate_transformer.transform(X_test)
-        feature_types = np.asarray(self._covariate_transformer._processed_feature_types)
-        original_var_indices = self._covariate_transformer.fetch_original_feature_indices()
+            X_test_processed = self._covariate_preprocessor.transform(X_test)
+        feature_types = np.asarray(self._covariate_preprocessor._processed_feature_types)
+        original_var_indices = self._covariate_preprocessor.fetch_original_feature_indices()
 
         # Determine whether a test set is provided
         self.has_test = X_test is not None
@@ -1420,6 +1420,10 @@ def to_json(self) -> str:
             bart_propensity_string = self.bart_propensity_model.to_json()
             bcf_json.add_string("bart_propensity_model", bart_propensity_string)
         
+        # Add covariate preprocessor
+        covariate_preprocessor_string = self._covariate_preprocessor.to_json()
+        bcf_json.add_string("covariate_preprocessor", covariate_preprocessor_string)
+        
         return bcf_json.return_json_string()
 
     def from_json(self, json_string: str) -> None:
@@ -1482,6 +1486,11 @@ def from_json(self, json_string: str) -> None:
             self.bart_propensity_model = BARTModel()
             self.bart_propensity_model.from_json(bart_propensity_string)
         
+        # Unpack covariate preprocessor
+        covariate_preprocessor_string = bcf_json.get_string("covariate_preprocessor")
+        self._covariate_preprocessor = CovariatePreprocessor()
+        self._covariate_preprocessor.from_json(covariate_preprocessor_string)
+        
         # Mark the deserialized model as "sampled"
         self.sampled = True
     
diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py
index 40019cc8..35633264 100644
--- a/stochtree/preprocessing.py
+++ b/stochtree/preprocessing.py
@@ -383,7 +383,7 @@ def _transform_numpy(self, covariates: np.array) -> np.array:
             raise ValueError("Covariates passed as a numpy array must be 1d or 2d")
         if self._num_original_features != covariates.shape[1]:
             raise ValueError("Attempting to call transform from a CovariateTransformer that was fit on a dataset with different dimensionality")
-        self._original_feature_indices = [i for i in range(covariates.shape[1])]
+        self._original_feature_indices = np.array([i for i in range(covariates.shape[1])])
         return covariates
 
     def _transform(self, covariates: Union[pd.DataFrame, np.array]) -> np.array:
@@ -485,7 +485,7 @@ def fetch_original_feature_indices(self) -> list:
             this method would return a list `[0,0,0,0,0]`. If the transformer merely passes
             through `k` numeric features, this method would return a list `[0,...,k-1]`.
         """
-        return self._original_feature_indices
+        return self._original_feature_indices.tolist()
     
     def to_json(self) -> str:
         """
diff --git a/test/python/test_json.py b/test/python/test_json.py
index ed002626..4d8d903c 100644
--- a/test/python/test_json.py
+++ b/test/python/test_json.py
@@ -56,6 +56,17 @@ def test_preprocessor(self):
         df_transformed_reloaded_2 = cov_transformer_reloaded_2.transform(df_2)
         np.testing.assert_array_equal(df_transformed_orig_2, df_transformed_reloaded_2)
 
+        np_3 = np.array(
+            [[1.5, 1.2], [2.7, 5.4], [3.6, 9.3], [4.4, 10.4], [5.3, 3.6], [6.1, 4.4]]
+        )
+        cov_transformer_3 = CovariatePreprocessor()
+        df_transformed_orig_3 = cov_transformer_3.fit_transform(np_3)
+        cov_transformer_json_3 = cov_transformer_3.to_json()
+        cov_transformer_reloaded_3 = CovariatePreprocessor()
+        cov_transformer_reloaded_3.from_json(cov_transformer_json_3)
+        df_transformed_reloaded_3 = cov_transformer_reloaded_3.transform(np_3)
+        np.testing.assert_array_equal(df_transformed_orig_3, df_transformed_reloaded_3)
+
     def test_forest(self):
         # Generate sample data
         random_seed = 1234

From 0ef2cdde4ce9eaf32e5841fec2768ccdcc2c4c38 Mon Sep 17 00:00:00 2001
From: Drew Herren <drewherrenopensource@gmail.com>
Date: Wed, 22 Jan 2025 17:36:23 -0600
Subject: [PATCH 5/5] Added preprocessor serialization to the R frontend as
 well

---
 NAMESPACE                               |   4 +
 R/bart.R                                |  26 +++++-
 R/bcf.R                                 |  20 ++++-
 R/cpp11.R                               |  32 +++++++
 R/serialization.R                       |  63 +++++++++++++-
 R/utils.R                               | 110 ++++++++++++++++++++++++
 man/CppJson.Rd                          |  98 ++++++++++++++++++++-
 man/convertPreprocessorToJson.Rd        |  23 +++++
 man/createPreprocessorFromJson.Rd       |  17 ++++
 man/createPreprocessorFromJsonString.Rd |  17 ++++
 man/saveBCFModelToJsonFile.Rd           |   3 +
 man/savePreprocessorToJsonString.Rd     |  23 +++++
 src/cpp11.cpp                           |  68 +++++++++++++++
 src/serialization.cpp                   |  89 +++++++++++++++++++
 14 files changed, 589 insertions(+), 4 deletions(-)
 create mode 100644 man/convertPreprocessorToJson.Rd
 create mode 100644 man/createPreprocessorFromJson.Rd
 create mode 100644 man/createPreprocessorFromJsonString.Rd
 create mode 100644 man/savePreprocessorToJsonString.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 7c746a36..47ba8bcc 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -12,6 +12,7 @@ export(computeForestLeafVariances)
 export(computeMaxLeafIndex)
 export(convertBARTModelToJson)
 export(convertBCFModelToJson)
+export(convertPreprocessorToJson)
 export(createBARTModelFromCombinedJson)
 export(createBARTModelFromCombinedJsonString)
 export(createBARTModelFromJson)
@@ -31,6 +32,8 @@ export(createForestCovariatesFromMetadata)
 export(createForestDataset)
 export(createForestModel)
 export(createOutcome)
+export(createPreprocessorFromJson)
+export(createPreprocessorFromJsonString)
 export(createRNG)
 export(createRandomEffectSamples)
 export(createRandomEffectsDataset)
@@ -69,6 +72,7 @@ export(saveBARTModelToJsonFile)
 export(saveBARTModelToJsonString)
 export(saveBCFModelToJsonFile)
 export(saveBCFModelToJsonString)
+export(savePreprocessorToJsonString)
 importFrom(R6,R6Class)
 importFrom(stats,coef)
 importFrom(stats,lm)
diff --git a/R/bart.R b/R/bart.R
index 841d7ee6..25699152 100644
--- a/R/bart.R
+++ b/R/bart.R
@@ -1215,6 +1215,12 @@ convertBARTModelToJson <- function(object){
         jsonobj$add_string_vector("rfx_unique_group_ids", object$rfx_unique_group_ids)
     }
     
+    # Add covariate preprocessor metadata
+    preprocessor_metadata_string <- savePreprocessorToJsonString(
+        object$train_set_metadata
+    )
+    jsonobj$add_string("preprocessor_metadata", preprocessor_metadata_string)
+    
     return(jsonobj)
 }
 
@@ -1322,7 +1328,7 @@ saveBARTModelToJsonFile <- function(object, filename){
 #' Convert the persistent aspects of a BART model to (in-memory) JSON string
 #'
 #' @param object Object of type `bartmodel` containing draws of a BART model and associated sampling outputs.
-#' @return JSON string
+#' @return in-memory JSON string
 #' @export
 #'
 #' @examples
@@ -1460,6 +1466,12 @@ createBARTModelFromJson <- function(json_object){
         output[["rfx_samples"]] <- loadRandomEffectSamplesJson(json_object, 0)
     }
     
+    # Unpack covariate preprocessor
+    preprocessor_metadata_string <- json_object$get_string("preprocessor_metadata")
+    output[["train_set_metadata"]] <- createPreprocessorFromJsonString(
+        preprocessor_metadata_string
+    )
+    
     class(output) <- "bartmodel"
     return(output)
 }
@@ -1686,6 +1698,12 @@ createBARTModelFromCombinedJson <- function(json_object_list){
         output[["rfx_samples"]] <- loadRandomEffectSamplesCombinedJson(json_object_list, 0)
     }
     
+    # Unpack covariate preprocessor
+    preprocessor_metadata_string <- json_object$get_string("preprocessor_metadata")
+    output[["train_set_metadata"]] <- createPreprocessorFromJsonString(
+        preprocessor_metadata_string
+    )
+    
     class(output) <- "bartmodel"
     return(output)
 }
@@ -1832,6 +1850,12 @@ createBARTModelFromCombinedJsonString <- function(json_string_list){
         output[["rfx_samples"]] <- loadRandomEffectSamplesCombinedJson(json_object_list, 0)
     }
     
+    # Unpack covariate preprocessor
+    preprocessor_metadata_string <- json_object$get_string("preprocessor_metadata")
+    output[["train_set_metadata"]] <- createPreprocessorFromJsonString(
+        preprocessor_metadata_string
+    )
+    
     class(output) <- "bartmodel"
     return(output)
 }
diff --git a/R/bcf.R b/R/bcf.R
index bc5b9d5f..ed00d25e 100644
--- a/R/bcf.R
+++ b/R/bcf.R
@@ -1708,6 +1708,12 @@ convertBCFModelToJson <- function(object){
         jsonobj$add_string("bart_propensity_model", bart_propensity_string)
     }
     
+    # Add covariate preprocessor metadata
+    preprocessor_metadata_string <- savePreprocessorToJsonString(
+        object$train_set_metadata
+    )
+    jsonobj$add_string("preprocessor_metadata", preprocessor_metadata_string)
+
     return(jsonobj)
 }
 
@@ -1716,7 +1722,7 @@ convertBCFModelToJson <- function(object){
 #' @param object Object of type `bcf` containing draws of a Bayesian causal forest model and associated sampling outputs.
 #' @param filename String of filepath, must end in ".json"
 #'
-#' @return NULL
+#' @return in-memory JSON string
 #' @export
 #'
 #' @examples
@@ -2018,6 +2024,12 @@ createBCFModelFromJson <- function(json_object){
         )
     }
     
+    # Unpack covariate preprocessor
+    preprocessor_metadata_string <- json_object$get_string("preprocessor_metadata")
+    output[["train_set_metadata"]] <- createPreprocessorFromJsonString(
+        preprocessor_metadata_string
+    )
+
     class(output) <- "bcf"
     return(output)
 }
@@ -2393,6 +2405,12 @@ createBCFModelFromCombinedJsonString <- function(json_string_list){
         output[["rfx_samples"]] <- loadRandomEffectSamplesCombinedJson(json_object_list, 0)
     }
     
+    # Unpack covariate preprocessor
+    preprocessor_metadata_string <- json_object_default$get_string("preprocessor_metadata")
+    output[["train_set_metadata"]] <- createPreprocessorFromJsonString(
+        preprocessor_metadata_string
+    )
+    
     class(output) <- "bcf"
     return(output)
 }
diff --git a/R/cpp11.R b/R/cpp11.R
index bf6345b9..bc411e89 100644
--- a/R/cpp11.R
+++ b/R/cpp11.R
@@ -612,6 +612,14 @@ json_add_double_cpp <- function(json_ptr, field_name, field_value) {
   invisible(.Call(`_stochtree_json_add_double_cpp`, json_ptr, field_name, field_value))
 }
 
+json_add_integer_subfolder_cpp <- function(json_ptr, subfolder_name, field_name, field_value) {
+  invisible(.Call(`_stochtree_json_add_integer_subfolder_cpp`, json_ptr, subfolder_name, field_name, field_value))
+}
+
+json_add_integer_cpp <- function(json_ptr, field_name, field_value) {
+  invisible(.Call(`_stochtree_json_add_integer_cpp`, json_ptr, field_name, field_value))
+}
+
 json_add_bool_subfolder_cpp <- function(json_ptr, subfolder_name, field_name, field_value) {
   invisible(.Call(`_stochtree_json_add_bool_subfolder_cpp`, json_ptr, subfolder_name, field_name, field_value))
 }
@@ -628,6 +636,14 @@ json_add_vector_cpp <- function(json_ptr, field_name, field_vector) {
   invisible(.Call(`_stochtree_json_add_vector_cpp`, json_ptr, field_name, field_vector))
 }
 
+json_add_integer_vector_subfolder_cpp <- function(json_ptr, subfolder_name, field_name, field_vector) {
+  invisible(.Call(`_stochtree_json_add_integer_vector_subfolder_cpp`, json_ptr, subfolder_name, field_name, field_vector))
+}
+
+json_add_integer_vector_cpp <- function(json_ptr, field_name, field_vector) {
+  invisible(.Call(`_stochtree_json_add_integer_vector_cpp`, json_ptr, field_name, field_vector))
+}
+
 json_add_string_vector_subfolder_cpp <- function(json_ptr, subfolder_name, field_name, field_vector) {
   invisible(.Call(`_stochtree_json_add_string_vector_subfolder_cpp`, json_ptr, subfolder_name, field_name, field_vector))
 }
@@ -660,6 +676,14 @@ json_extract_double_cpp <- function(json_ptr, field_name) {
   .Call(`_stochtree_json_extract_double_cpp`, json_ptr, field_name)
 }
 
+json_extract_integer_subfolder_cpp <- function(json_ptr, subfolder_name, field_name) {
+  .Call(`_stochtree_json_extract_integer_subfolder_cpp`, json_ptr, subfolder_name, field_name)
+}
+
+json_extract_integer_cpp <- function(json_ptr, field_name) {
+  .Call(`_stochtree_json_extract_integer_cpp`, json_ptr, field_name)
+}
+
 json_extract_bool_subfolder_cpp <- function(json_ptr, subfolder_name, field_name) {
   .Call(`_stochtree_json_extract_bool_subfolder_cpp`, json_ptr, subfolder_name, field_name)
 }
@@ -684,6 +708,14 @@ json_extract_vector_cpp <- function(json_ptr, field_name) {
   .Call(`_stochtree_json_extract_vector_cpp`, json_ptr, field_name)
 }
 
+json_extract_integer_vector_subfolder_cpp <- function(json_ptr, subfolder_name, field_name) {
+  .Call(`_stochtree_json_extract_integer_vector_subfolder_cpp`, json_ptr, subfolder_name, field_name)
+}
+
+json_extract_integer_vector_cpp <- function(json_ptr, field_name) {
+  .Call(`_stochtree_json_extract_integer_vector_cpp`, json_ptr, field_name)
+}
+
 json_extract_string_vector_subfolder_cpp <- function(json_ptr, subfolder_name, field_name) {
   .Call(`_stochtree_json_extract_string_vector_subfolder_cpp`, json_ptr, subfolder_name, field_name)
 }
diff --git a/R/serialization.R b/R/serialization.R
index 24205f9e..bca7f23f 100644
--- a/R/serialization.R
+++ b/R/serialization.R
@@ -81,6 +81,20 @@ CppJson <- R6::R6Class(
             }
         }, 
         
+        #' @description
+        #' Add an integer to the json object under the name "field_name" (with optional subfolder "subfolder_name")
+        #' @param field_name The name of the field to be added to json
+        #' @param field_value Integer value of the field to be added to json
+        #' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which to place the value
+        #' @return NULL
+        add_integer = function(field_name, field_value, subfolder_name = NULL) {
+            if (is.null(subfolder_name)) {
+                json_add_integer_cpp(self$json_ptr, field_name, field_value)
+            } else {
+                json_add_integer_subfolder_cpp(self$json_ptr, subfolder_name, field_name, field_value)
+            }
+        }, 
+        
         #' @description
         #' Add a boolean value to the json object under the name "field_name" (with optional subfolder "subfolder_name")
         #' @param field_name The name of the field to be added to json
@@ -110,7 +124,7 @@ CppJson <- R6::R6Class(
         }, 
         
         #' @description
-        #' Add an array to the json object under the name "field_name" (with optional subfolder "subfolder_name")
+        #' Add a vector to the json object under the name "field_name" (with optional subfolder "subfolder_name")
         #' @param field_name The name of the field to be added to json
         #' @param field_vector Vector to be stored in json
         #' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which to place the value
@@ -124,6 +138,21 @@ CppJson <- R6::R6Class(
             }
         }, 
         
+        #' @description
+        #' Add an integer vector to the json object under the name "field_name" (with optional subfolder "subfolder_name")
+        #' @param field_name The name of the field to be added to json
+        #' @param field_vector Vector to be stored in json
+        #' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which to place the value
+        #' @return NULL
+        add_integer_vector = function(field_name, field_vector, subfolder_name = NULL) {
+            field_vector <- as.numeric(field_vector)
+            if (is.null(subfolder_name)) {
+                json_add_integer_vector_cpp(self$json_ptr, field_name, field_vector)
+            } else {
+                json_add_integer_vector_subfolder_cpp(self$json_ptr, subfolder_name, field_name, field_vector)
+            }
+        }, 
+        
         #' @description
         #' Add an array to the json object under the name "field_name" (with optional subfolder "subfolder_name")
         #' @param field_name The name of the field to be added to json
@@ -184,6 +213,22 @@ CppJson <- R6::R6Class(
             return(result)
         }, 
         
+        #' @description
+        #' Retrieve an integer value from the json object under the name "field_name" (with optional subfolder "subfolder_name")
+        #' @param field_name The name of the field to be accessed from json
+        #' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which the field is stored
+        #' @return Integer value stored under "field_name"
+        get_integer = function(field_name, subfolder_name = NULL) {
+            if (is.null(subfolder_name)) {
+                stopifnot(json_contains_field_cpp(self$json_ptr, field_name))
+                result <- json_extract_integer_cpp(self$json_ptr, field_name)
+            } else {
+                stopifnot(json_contains_field_subfolder_cpp(self$json_ptr, subfolder_name, field_name))
+                result <- json_extract_integer_subfolder_cpp(self$json_ptr, subfolder_name, field_name)
+            }
+            return(result)
+        }, 
+        
         #' @description
         #' Retrieve a boolean value from the json object under the name "field_name" (with optional subfolder "subfolder_name")
         #' @param field_name The name of the field to be accessed from json
@@ -232,6 +277,22 @@ CppJson <- R6::R6Class(
             return(result)
         }, 
         
+        #' @description
+        #' Retrieve an integer vector from the json object under the name "field_name" (with optional subfolder "subfolder_name")
+        #' @param field_name The name of the field to be accessed from json
+        #' @param subfolder_name (Optional) Name of the subfolder / hierarchy under which the field is stored
+        #' @return Integer vector stored under "field_name"
+        get_integer_vector = function(field_name, subfolder_name = NULL) {
+            if (is.null(subfolder_name)) {
+                stopifnot(json_contains_field_cpp(self$json_ptr, field_name))
+                result <- json_extract_integer_vector_cpp(self$json_ptr, field_name)
+            } else {
+                stopifnot(json_contains_field_subfolder_cpp(self$json_ptr, subfolder_name, field_name))
+                result <- json_extract_integer_vector_subfolder_cpp(self$json_ptr, subfolder_name, field_name)
+            }
+            return(result)
+        }, 
+        
         #' @description
         #' Retrieve a character vector from the json object under the name "field_name" (with optional subfolder "subfolder_name")
         #' @param field_name The name of the field to be accessed from json
diff --git a/R/utils.R b/R/utils.R
index a1fc12a8..ea96794b 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -359,6 +359,116 @@ preprocessPredictionDataFrame <- function(input_df, metadata) {
     return(X)
 }
 
+#' Convert the persistent aspects of a covariate preprocessor to (in-memory) C++ JSON object
+#'
+#' @param object List containing information on variables, including train set 
+#' categories for categorical variables 
+#'
+#' @return wrapper around in-memory C++ JSON object
+#' @export
+#'
+#' @examples
+#' cov_mat <- matrix(1:12, ncol = 3)
+#' preprocess_list <- preprocessTrainData(cov_mat)
+#' preprocessor_json <- convertPreprocessorToJson(preprocess_list$metadata)
+convertPreprocessorToJson <- function(object) {
+    jsonobj <- createCppJson()
+    if (is.null(object$feature_types)) {
+        stop("This covariate preprocessor has not yet been fit")
+    }
+    
+    # Add internal scalars
+    jsonobj$add_integer("num_numeric_vars", object$num_numeric_vars)
+    jsonobj$add_integer("num_ordered_cat_vars", object$num_ordered_cat_vars)
+    jsonobj$add_integer("num_unordered_cat_vars", object$num_unordered_cat_vars)
+    
+    # Add internal vectors
+    jsonobj$add_vector("feature_types", object$feature_types)
+    jsonobj$add_vector("original_var_indices", object$original_var_indices)
+    if (object$num_numeric_vars > 0) {
+        jsonobj$add_string_vector("numeric_vars", object$numeric_vars)
+    }
+    if (object$num_ordered_cat_vars > 0) {
+        jsonobj$add_string_vector("ordered_cat_vars", object$ordered_cat_vars)
+        jsonobj$add_string_vector("ordered_unique_levels", object$ordered_unique_levels)
+    }
+    if (object$num_unordered_cat_vars > 0) {
+        jsonobj$add_string_vector("unordered_cat_vars", object$unordered_cat_vars)
+        jsonobj$add_string_vector("unordered_unique_levels", object$unordered_unique_levels)
+    }
+    
+    return(jsonobj)
+}
+
+#' Convert the persistent aspects of a covariate preprocessor to (in-memory) JSON string
+#'
+#' @param object List containing information on variables, including train set 
+#' categories for categorical variables  
+#'
+#' @return in-memory JSON string
+#' @export
+#'
+#' @examples
+#' cov_mat <- matrix(1:12, ncol = 3)
+#' preprocess_list <- preprocessTrainData(cov_mat)
+#' preprocessor_json_string <- savePreprocessorToJsonString(preprocess_list$metadata)
+savePreprocessorToJsonString <- function(object){
+    # Convert to Json
+    jsonobj <- convertPreprocessorToJson(object)
+    
+    # Dump to string
+    return(jsonobj$return_json_string())
+}
+
+#' Reload a covariate preprocessor object from an in-memory JSON object containing a serialized preprocessor
+#'
+#' @param json_object in-memory wrapper around JSON C++ object containing covariate preprocessor metadata
+#'
+#' @returns Preprocessor object that can be used with the `preprocessPredictionData` function
+#' @export
+createPreprocessorFromJson <- function(json_object){
+    # Initialize the metadata list
+    metadata <- list()
+    
+    # Unpack internal scalars
+    metadata[["num_numeric_vars"]] <- json_object$get_integer("num_numeric_vars")
+    metadata[["num_ordered_cat_vars"]] <- json_object$get_integer("num_ordered_cat_vars")
+    metadata[["num_unordered_cat_vars"]] <- json_object$get_integer("num_unordered_cat_vars")
+    
+    # Unpack internal vectors
+    metadata[["feature_types"]] <- json_object$get_vector("feature_types")
+    metadata[["original_var_indices"]] <- json_object$get_vector("original_var_indices")
+    if (metadata$num_numeric_vars > 0) {
+        metadata[["numeric_vars"]] <- json_object$get_string_vector("numeric_vars")
+    }
+    if (metadata$num_ordered_cat_vars > 0) {
+        metadata[["ordered_cat_vars"]] <- json_object$get_string_vector("ordered_cat_vars")
+        metadata[["ordered_unique_levels"]] <- json_object$get_string_vector("ordered_unique_levels")
+    }
+    if (metadata$num_unordered_cat_vars > 0) {
+        metadata[["unordered_cat_vars"]] <- json_object$get_string_vector("unordered_cat_vars")
+        metadata[["unordered_unique_levels"]] <- json_object$get_string_vector("unordered_unique_levels")
+    }
+    
+    return(metadata)
+}
+
+#' Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor
+#'
+#' @param json_string in-memory JSON string containing covariate preprocessor metadata
+#'
+#' @return Preprocessor object that can be used with the `preprocessPredictionData` function
+#' @export
+createPreprocessorFromJsonString <- function(json_string){
+    # Load a `CppJson` object from string
+    preprocessor_json <- createCppJsonString(json_string)
+    
+    # Create and return the preprocessor object
+    preprocessor_object <- createPreprocessorFromJson(preprocessor_json)
+    
+    return(preprocessor_object)
+}
+
 #' Preprocess a dataframe of covariate values, converting categorical variables 
 #' to integers and one-hot encoding if need be. Returns a list including a 
 #' matrix of preprocessed covariate values and associated tracking.
diff --git a/man/CppJson.Rd b/man/CppJson.Rd
index a7f7e448..65f1edec 100644
--- a/man/CppJson.Rd
+++ b/man/CppJson.Rd
@@ -32,16 +32,20 @@ Wrapper around a C++ container of tree ensembles
 \item \href{#method-CppJson-add_forest}{\code{CppJson$add_forest()}}
 \item \href{#method-CppJson-add_random_effects}{\code{CppJson$add_random_effects()}}
 \item \href{#method-CppJson-add_scalar}{\code{CppJson$add_scalar()}}
+\item \href{#method-CppJson-add_integer}{\code{CppJson$add_integer()}}
 \item \href{#method-CppJson-add_boolean}{\code{CppJson$add_boolean()}}
 \item \href{#method-CppJson-add_string}{\code{CppJson$add_string()}}
 \item \href{#method-CppJson-add_vector}{\code{CppJson$add_vector()}}
+\item \href{#method-CppJson-add_integer_vector}{\code{CppJson$add_integer_vector()}}
 \item \href{#method-CppJson-add_string_vector}{\code{CppJson$add_string_vector()}}
 \item \href{#method-CppJson-add_list}{\code{CppJson$add_list()}}
 \item \href{#method-CppJson-add_string_list}{\code{CppJson$add_string_list()}}
 \item \href{#method-CppJson-get_scalar}{\code{CppJson$get_scalar()}}
+\item \href{#method-CppJson-get_integer}{\code{CppJson$get_integer()}}
 \item \href{#method-CppJson-get_boolean}{\code{CppJson$get_boolean()}}
 \item \href{#method-CppJson-get_string}{\code{CppJson$get_string()}}
 \item \href{#method-CppJson-get_vector}{\code{CppJson$get_vector()}}
+\item \href{#method-CppJson-get_integer_vector}{\code{CppJson$get_integer_vector()}}
 \item \href{#method-CppJson-get_string_vector}{\code{CppJson$get_string_vector()}}
 \item \href{#method-CppJson-get_numeric_list}{\code{CppJson$get_numeric_list()}}
 \item \href{#method-CppJson-get_string_list}{\code{CppJson$get_string_list()}}
@@ -120,6 +124,30 @@ Add a scalar to the json object under the name "field_name" (with optional subfo
 
 \item{\code{field_value}}{Numeric value of the field to be added to json}
 
+\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which to place the value}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+NULL
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-CppJson-add_integer"></a>}}
+\if{latex}{\out{\hypertarget{method-CppJson-add_integer}{}}}
+\subsection{Method \code{add_integer()}}{
+Add an integer to the json object under the name "field_name" (with optional subfolder "subfolder_name")
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{CppJson$add_integer(field_name, field_value, subfolder_name = NULL)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{field_name}}{The name of the field to be added to json}
+
+\item{\code{field_value}}{Integer value of the field to be added to json}
+
 \item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which to place the value}
 }
 \if{html}{\out{</div>}}
@@ -180,7 +208,7 @@ NULL
 \if{html}{\out{<a id="method-CppJson-add_vector"></a>}}
 \if{latex}{\out{\hypertarget{method-CppJson-add_vector}{}}}
 \subsection{Method \code{add_vector()}}{
-Add an array to the json object under the name "field_name" (with optional subfolder "subfolder_name")
+Add a vector to the json object under the name "field_name" (with optional subfolder "subfolder_name")
 \subsection{Usage}{
 \if{html}{\out{<div class="r">}}\preformatted{CppJson$add_vector(field_name, field_vector, subfolder_name = NULL)}\if{html}{\out{</div>}}
 }
@@ -192,6 +220,30 @@ Add an array to the json object under the name "field_name" (with optional subfo
 
 \item{\code{field_vector}}{Vector to be stored in json}
 
+\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which to place the value}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+NULL
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-CppJson-add_integer_vector"></a>}}
+\if{latex}{\out{\hypertarget{method-CppJson-add_integer_vector}{}}}
+\subsection{Method \code{add_integer_vector()}}{
+Add an integer vector to the json object under the name "field_name" (with optional subfolder "subfolder_name")
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{CppJson$add_integer_vector(field_name, field_vector, subfolder_name = NULL)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{field_name}}{The name of the field to be added to json}
+
+\item{\code{field_vector}}{Vector to be stored in json}
+
 \item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which to place the value}
 }
 \if{html}{\out{</div>}}
@@ -282,6 +334,28 @@ Retrieve a scalar value from the json object under the name "field_name" (with o
 \describe{
 \item{\code{field_name}}{The name of the field to be accessed from json}
 
+\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which the field is stored}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+NULL
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-CppJson-get_integer"></a>}}
+\if{latex}{\out{\hypertarget{method-CppJson-get_integer}{}}}
+\subsection{Method \code{get_integer()}}{
+Retrieve an integer value from the json object under the name "field_name" (with optional subfolder "subfolder_name")
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{CppJson$get_integer(field_name, subfolder_name = NULL)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{field_name}}{The name of the field to be accessed from json}
+
 \item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which the field is stored}
 }
 \if{html}{\out{</div>}}
@@ -348,6 +422,28 @@ Retrieve a vector from the json object under the name "field_name" (with optiona
 \describe{
 \item{\code{field_name}}{The name of the field to be accessed from json}
 
+\item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which the field is stored}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+NULL
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-CppJson-get_integer_vector"></a>}}
+\if{latex}{\out{\hypertarget{method-CppJson-get_integer_vector}{}}}
+\subsection{Method \code{get_integer_vector()}}{
+Retrieve an integer vector from the json object under the name "field_name" (with optional subfolder "subfolder_name")
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{CppJson$get_integer_vector(field_name, subfolder_name = NULL)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{field_name}}{The name of the field to be accessed from json}
+
 \item{\code{subfolder_name}}{(Optional) Name of the subfolder / hierarchy under which the field is stored}
 }
 \if{html}{\out{</div>}}
diff --git a/man/convertPreprocessorToJson.Rd b/man/convertPreprocessorToJson.Rd
new file mode 100644
index 00000000..49716050
--- /dev/null
+++ b/man/convertPreprocessorToJson.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{convertPreprocessorToJson}
+\alias{convertPreprocessorToJson}
+\title{Convert the persistent aspects of a covariate preprocessor to (in-memory) C++ JSON object}
+\usage{
+convertPreprocessorToJson(object)
+}
+\arguments{
+\item{object}{List containing information on variables, including train set
+categories for categorical variables}
+}
+\value{
+wrapper around in-memory C++ JSON object
+}
+\description{
+Convert the persistent aspects of a covariate preprocessor to (in-memory) C++ JSON object
+}
+\examples{
+cov_mat <- matrix(1:12, ncol = 3)
+preprocess_list <- preprocessTrainData(cov_mat)
+preprocessor_json <- convertPreprocessorToJson(preprocess_list$metadata)
+}
diff --git a/man/createPreprocessorFromJson.Rd b/man/createPreprocessorFromJson.Rd
new file mode 100644
index 00000000..3edca354
--- /dev/null
+++ b/man/createPreprocessorFromJson.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{createPreprocessorFromJson}
+\alias{createPreprocessorFromJson}
+\title{Reload a covariate preprocessor object from an in-memory JSON object containing a serialized preprocessor}
+\usage{
+createPreprocessorFromJson(json_object)
+}
+\arguments{
+\item{json_object}{in-memory wrapper around JSON C++ object containing covariate preprocessor metadata}
+}
+\value{
+Preprocessor object that can be used with the \code{preprocessPredictionData} function
+}
+\description{
+Reload a covariate preprocessor object from an in-memory JSON object containing a serialized preprocessor
+}
diff --git a/man/createPreprocessorFromJsonString.Rd b/man/createPreprocessorFromJsonString.Rd
new file mode 100644
index 00000000..00974b83
--- /dev/null
+++ b/man/createPreprocessorFromJsonString.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{createPreprocessorFromJsonString}
+\alias{createPreprocessorFromJsonString}
+\title{Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor}
+\usage{
+createPreprocessorFromJsonString(json_string)
+}
+\arguments{
+\item{json_string}{in-memory JSON string containing covariate preprocessor metadata}
+}
+\value{
+Preprocessor object that can be used with the \code{preprocessPredictionData} function
+}
+\description{
+Reload a covariate preprocessor object from a JSON string containing a serialized preprocessor
+}
diff --git a/man/saveBCFModelToJsonFile.Rd b/man/saveBCFModelToJsonFile.Rd
index f7685c48..5a0f1512 100644
--- a/man/saveBCFModelToJsonFile.Rd
+++ b/man/saveBCFModelToJsonFile.Rd
@@ -11,6 +11,9 @@ saveBCFModelToJsonFile(object, filename)
 
 \item{filename}{String of filepath, must end in ".json"}
 }
+\value{
+in-memory JSON string
+}
 \description{
 Convert the persistent aspects of a BCF model to (in-memory) JSON and save to a file
 }
diff --git a/man/savePreprocessorToJsonString.Rd b/man/savePreprocessorToJsonString.Rd
new file mode 100644
index 00000000..83c54d72
--- /dev/null
+++ b/man/savePreprocessorToJsonString.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/utils.R
+\name{savePreprocessorToJsonString}
+\alias{savePreprocessorToJsonString}
+\title{Convert the persistent aspects of a covariate preprocessor to (in-memory) JSON string}
+\usage{
+savePreprocessorToJsonString(object)
+}
+\arguments{
+\item{object}{List containing information on variables, including train set
+categories for categorical variables}
+}
+\value{
+in-memory JSON string
+}
+\description{
+Convert the persistent aspects of a covariate preprocessor to (in-memory) JSON string
+}
+\examples{
+cov_mat <- matrix(1:12, ncol = 3)
+preprocess_list <- preprocessTrainData(cov_mat)
+preprocessor_json_string <- savePreprocessorToJsonString(preprocess_list$metadata)
+}
diff --git a/src/cpp11.cpp b/src/cpp11.cpp
index 0091dffd..d9c352c3 100644
--- a/src/cpp11.cpp
+++ b/src/cpp11.cpp
@@ -1134,6 +1134,22 @@ extern "C" SEXP _stochtree_json_add_double_cpp(SEXP json_ptr, SEXP field_name, S
   END_CPP11
 }
 // serialization.cpp
+void json_add_integer_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name, int field_value);
+extern "C" SEXP _stochtree_json_add_integer_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name, SEXP field_value) {
+  BEGIN_CPP11
+    json_add_integer_subfolder_cpp(cpp11::as_cpp<cpp11::decay_t<cpp11::external_pointer<nlohmann::json>>>(json_ptr), cpp11::as_cpp<cpp11::decay_t<std::string>>(subfolder_name), cpp11::as_cpp<cpp11::decay_t<std::string>>(field_name), cpp11::as_cpp<cpp11::decay_t<int>>(field_value));
+    return R_NilValue;
+  END_CPP11
+}
+// serialization.cpp
+void json_add_integer_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string field_name, int field_value);
+extern "C" SEXP _stochtree_json_add_integer_cpp(SEXP json_ptr, SEXP field_name, SEXP field_value) {
+  BEGIN_CPP11
+    json_add_integer_cpp(cpp11::as_cpp<cpp11::decay_t<cpp11::external_pointer<nlohmann::json>>>(json_ptr), cpp11::as_cpp<cpp11::decay_t<std::string>>(field_name), cpp11::as_cpp<cpp11::decay_t<int>>(field_value));
+    return R_NilValue;
+  END_CPP11
+}
+// serialization.cpp
 void json_add_bool_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name, bool field_value);
 extern "C" SEXP _stochtree_json_add_bool_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name, SEXP field_value) {
   BEGIN_CPP11
@@ -1166,6 +1182,22 @@ extern "C" SEXP _stochtree_json_add_vector_cpp(SEXP json_ptr, SEXP field_name, S
   END_CPP11
 }
 // serialization.cpp
+void json_add_integer_vector_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name, cpp11::integers field_vector);
+extern "C" SEXP _stochtree_json_add_integer_vector_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name, SEXP field_vector) {
+  BEGIN_CPP11
+    json_add_integer_vector_subfolder_cpp(cpp11::as_cpp<cpp11::decay_t<cpp11::external_pointer<nlohmann::json>>>(json_ptr), cpp11::as_cpp<cpp11::decay_t<std::string>>(subfolder_name), cpp11::as_cpp<cpp11::decay_t<std::string>>(field_name), cpp11::as_cpp<cpp11::decay_t<cpp11::integers>>(field_vector));
+    return R_NilValue;
+  END_CPP11
+}
+// serialization.cpp
+void json_add_integer_vector_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string field_name, cpp11::integers field_vector);
+extern "C" SEXP _stochtree_json_add_integer_vector_cpp(SEXP json_ptr, SEXP field_name, SEXP field_vector) {
+  BEGIN_CPP11
+    json_add_integer_vector_cpp(cpp11::as_cpp<cpp11::decay_t<cpp11::external_pointer<nlohmann::json>>>(json_ptr), cpp11::as_cpp<cpp11::decay_t<std::string>>(field_name), cpp11::as_cpp<cpp11::decay_t<cpp11::integers>>(field_vector));
+    return R_NilValue;
+  END_CPP11
+}
+// serialization.cpp
 void json_add_string_vector_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name, cpp11::strings field_vector);
 extern "C" SEXP _stochtree_json_add_string_vector_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name, SEXP field_vector) {
   BEGIN_CPP11
@@ -1226,6 +1258,20 @@ extern "C" SEXP _stochtree_json_extract_double_cpp(SEXP json_ptr, SEXP field_nam
   END_CPP11
 }
 // serialization.cpp
+int json_extract_integer_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name);
+extern "C" SEXP _stochtree_json_extract_integer_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name) {
+  BEGIN_CPP11
+    return cpp11::as_sexp(json_extract_integer_subfolder_cpp(cpp11::as_cpp<cpp11::decay_t<cpp11::external_pointer<nlohmann::json>>>(json_ptr), cpp11::as_cpp<cpp11::decay_t<std::string>>(subfolder_name), cpp11::as_cpp<cpp11::decay_t<std::string>>(field_name)));
+  END_CPP11
+}
+// serialization.cpp
+int json_extract_integer_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string field_name);
+extern "C" SEXP _stochtree_json_extract_integer_cpp(SEXP json_ptr, SEXP field_name) {
+  BEGIN_CPP11
+    return cpp11::as_sexp(json_extract_integer_cpp(cpp11::as_cpp<cpp11::decay_t<cpp11::external_pointer<nlohmann::json>>>(json_ptr), cpp11::as_cpp<cpp11::decay_t<std::string>>(field_name)));
+  END_CPP11
+}
+// serialization.cpp
 bool json_extract_bool_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name);
 extern "C" SEXP _stochtree_json_extract_bool_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name) {
   BEGIN_CPP11
@@ -1268,6 +1314,20 @@ extern "C" SEXP _stochtree_json_extract_vector_cpp(SEXP json_ptr, SEXP field_nam
   END_CPP11
 }
 // serialization.cpp
+cpp11::writable::integers json_extract_integer_vector_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name);
+extern "C" SEXP _stochtree_json_extract_integer_vector_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name) {
+  BEGIN_CPP11
+    return cpp11::as_sexp(json_extract_integer_vector_subfolder_cpp(cpp11::as_cpp<cpp11::decay_t<cpp11::external_pointer<nlohmann::json>>>(json_ptr), cpp11::as_cpp<cpp11::decay_t<std::string>>(subfolder_name), cpp11::as_cpp<cpp11::decay_t<std::string>>(field_name)));
+  END_CPP11
+}
+// serialization.cpp
+cpp11::writable::integers json_extract_integer_vector_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string field_name);
+extern "C" SEXP _stochtree_json_extract_integer_vector_cpp(SEXP json_ptr, SEXP field_name) {
+  BEGIN_CPP11
+    return cpp11::as_sexp(json_extract_integer_vector_cpp(cpp11::as_cpp<cpp11::decay_t<cpp11::external_pointer<nlohmann::json>>>(json_ptr), cpp11::as_cpp<cpp11::decay_t<std::string>>(field_name)));
+  END_CPP11
+}
+// serialization.cpp
 cpp11::writable::strings json_extract_string_vector_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name);
 extern "C" SEXP _stochtree_json_extract_string_vector_subfolder_cpp(SEXP json_ptr, SEXP subfolder_name, SEXP field_name) {
   BEGIN_CPP11
@@ -1415,6 +1475,10 @@ static const R_CallMethodDef CallEntries[] = {
     {"_stochtree_json_add_double_cpp",                                 (DL_FUNC) &_stochtree_json_add_double_cpp,                                  3},
     {"_stochtree_json_add_double_subfolder_cpp",                       (DL_FUNC) &_stochtree_json_add_double_subfolder_cpp,                        4},
     {"_stochtree_json_add_forest_cpp",                                 (DL_FUNC) &_stochtree_json_add_forest_cpp,                                  2},
+    {"_stochtree_json_add_integer_cpp",                                (DL_FUNC) &_stochtree_json_add_integer_cpp,                                 3},
+    {"_stochtree_json_add_integer_subfolder_cpp",                      (DL_FUNC) &_stochtree_json_add_integer_subfolder_cpp,                       4},
+    {"_stochtree_json_add_integer_vector_cpp",                         (DL_FUNC) &_stochtree_json_add_integer_vector_cpp,                          3},
+    {"_stochtree_json_add_integer_vector_subfolder_cpp",               (DL_FUNC) &_stochtree_json_add_integer_vector_subfolder_cpp,                4},
     {"_stochtree_json_add_rfx_container_cpp",                          (DL_FUNC) &_stochtree_json_add_rfx_container_cpp,                           2},
     {"_stochtree_json_add_rfx_groupids_cpp",                           (DL_FUNC) &_stochtree_json_add_rfx_groupids_cpp,                            2},
     {"_stochtree_json_add_rfx_label_mapper_cpp",                       (DL_FUNC) &_stochtree_json_add_rfx_label_mapper_cpp,                        2},
@@ -1430,6 +1494,10 @@ static const R_CallMethodDef CallEntries[] = {
     {"_stochtree_json_extract_bool_subfolder_cpp",                     (DL_FUNC) &_stochtree_json_extract_bool_subfolder_cpp,                      3},
     {"_stochtree_json_extract_double_cpp",                             (DL_FUNC) &_stochtree_json_extract_double_cpp,                              2},
     {"_stochtree_json_extract_double_subfolder_cpp",                   (DL_FUNC) &_stochtree_json_extract_double_subfolder_cpp,                    3},
+    {"_stochtree_json_extract_integer_cpp",                            (DL_FUNC) &_stochtree_json_extract_integer_cpp,                             2},
+    {"_stochtree_json_extract_integer_subfolder_cpp",                  (DL_FUNC) &_stochtree_json_extract_integer_subfolder_cpp,                   3},
+    {"_stochtree_json_extract_integer_vector_cpp",                     (DL_FUNC) &_stochtree_json_extract_integer_vector_cpp,                      2},
+    {"_stochtree_json_extract_integer_vector_subfolder_cpp",           (DL_FUNC) &_stochtree_json_extract_integer_vector_subfolder_cpp,            3},
     {"_stochtree_json_extract_string_cpp",                             (DL_FUNC) &_stochtree_json_extract_string_cpp,                              2},
     {"_stochtree_json_extract_string_subfolder_cpp",                   (DL_FUNC) &_stochtree_json_extract_string_subfolder_cpp,                    3},
     {"_stochtree_json_extract_string_vector_cpp",                      (DL_FUNC) &_stochtree_json_extract_string_vector_cpp,                       2},
diff --git a/src/serialization.cpp b/src/serialization.cpp
index 3593f1a5..749395e8 100644
--- a/src/serialization.cpp
+++ b/src/serialization.cpp
@@ -48,6 +48,29 @@ void json_add_double_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::
     }
 }
 
+[[cpp11::register]]
+void json_add_integer_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name, int field_value) {
+    if (json_ptr->contains(subfolder_name)) {  // Overall effect: json[subfolder_name][field_name] = field_value (insert or overwrite)
+        if (json_ptr->at(subfolder_name).contains(field_name)) {
+            json_ptr->at(subfolder_name).at(field_name) = field_value;  // Field already present: overwrite it in place
+        } else {
+            json_ptr->at(subfolder_name).emplace(std::pair(field_name, field_value));  // Subfolder present, field new: insert it
+        }
+    } else {
+        json_ptr->emplace(std::pair(subfolder_name, nlohmann::json::object()));  // Subfolder missing: create it as an empty object first
+        json_ptr->at(subfolder_name).emplace(std::pair(field_name, field_value));
+    }
+}
+
+[[cpp11::register]]
+void json_add_integer_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string field_name, int field_value) {
+    if (json_ptr->contains(field_name)) {  // Overall effect: json[field_name] = field_value at the top level (insert or overwrite)
+        json_ptr->at(field_name) = field_value;  // Key already present: overwrite it in place
+    } else {
+        json_ptr->emplace(std::pair(field_name, field_value));  // Key absent: insert a new top-level entry
+    }
+}
+
 [[cpp11::register]]
 void json_add_bool_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name, bool field_value) {
     if (json_ptr->contains(subfolder_name)) {
@@ -111,6 +134,46 @@ void json_add_vector_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::
     }
 }
 
+[[cpp11::register]]
+void json_add_integer_vector_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name, cpp11::integers field_vector) {
+    int vec_length = field_vector.size();  // Number of integers to copy into json[subfolder_name][field_name]
+    if (json_ptr->contains(subfolder_name)) {
+        if (json_ptr->at(subfolder_name).contains(field_name)) {
+            json_ptr->at(subfolder_name).at(field_name).clear();  // Reset the existing entry so new values replace, not extend, the old ones
+            for (int i = 0; i < vec_length; i++) {
+                json_ptr->at(subfolder_name).at(field_name).emplace_back(field_vector.at(i));
+            }
+        } else {
+            json_ptr->at(subfolder_name).emplace(std::pair(field_name, nlohmann::json::array()));  // Subfolder present, field new: start from an empty array
+            for (int i = 0; i < vec_length; i++) {
+                json_ptr->at(subfolder_name).at(field_name).emplace_back(field_vector.at(i));
+            }
+        }
+    } else {
+        json_ptr->emplace(std::pair(subfolder_name, nlohmann::json::object()));  // Subfolder missing: create it as an empty object first
+        json_ptr->at(subfolder_name).emplace(std::pair(field_name, nlohmann::json::array()));
+        for (int i = 0; i < vec_length; i++) {
+            json_ptr->at(subfolder_name).at(field_name).emplace_back(field_vector.at(i));  // Append elements in their original order
+        }
+    }
+}
+
+[[cpp11::register]]
+void json_add_integer_vector_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string field_name, cpp11::integers field_vector) {
+    int vec_length = field_vector.size();  // Number of integers to copy into json[field_name]
+    if (json_ptr->contains(field_name)) {
+        json_ptr->at(field_name).clear();  // Reset the existing entry so new values replace, not extend, the old ones
+        for (int i = 0; i < vec_length; i++) {
+            json_ptr->at(field_name).emplace_back(field_vector.at(i));
+        }
+    } else {
+        json_ptr->emplace(std::pair(field_name, nlohmann::json::array()));  // Key absent: start from an empty top-level array
+        for (int i = 0; i < vec_length; i++) {
+            json_ptr->at(field_name).emplace_back(field_vector.at(i));  // Append elements in their original order
+        }
+    }
+}
+
 [[cpp11::register]]
 void json_add_string_vector_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name, cpp11::strings field_vector) {
     int vec_length = field_vector.size();
@@ -206,6 +269,16 @@ double json_extract_double_cpp(cpp11::external_pointer<nlohmann::json> json_ptr,
     return json_ptr->at(field_name);
 }
 
+[[cpp11::register]]
+int json_extract_integer_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name) {
+    return json_ptr->at(subfolder_name).at(field_name);  // Reads json[subfolder_name][field_name], implicitly converted to int; no existence check is done here, so a missing key surfaces from at()
+}
+
+[[cpp11::register]]
+int json_extract_integer_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string field_name) {
+    return json_ptr->at(field_name);  // Reads top-level json[field_name], implicitly converted to int; no existence check is done here, so a missing key surfaces from at()
+}
+
 [[cpp11::register]]
 bool json_extract_bool_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name) {
     return json_ptr->at(subfolder_name).at(field_name);
@@ -242,6 +315,22 @@ cpp11::writable::doubles json_extract_vector_cpp(cpp11::external_pointer<nlohman
     return output;
 }
 
+[[cpp11::register]]
+cpp11::writable::integers json_extract_integer_vector_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name) {
+    cpp11::writable::integers output;  // Result vector, filled element by element below
+    int vec_length = json_ptr->at(subfolder_name).at(field_name).size();  // Element count of json[subfolder_name][field_name]
+    for (int i = 0; i < vec_length; i++) output.push_back((json_ptr->at(subfolder_name).at(field_name).at(i)));  // Copy in stored order; each JSON element is converted on push_back
+    return output;
+}
+
+[[cpp11::register]]
+cpp11::writable::integers json_extract_integer_vector_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string field_name) {
+    cpp11::writable::integers output;  // Result vector, filled element by element below
+    int vec_length = json_ptr->at(field_name).size();  // Element count of top-level json[field_name]
+    for (int i = 0; i < vec_length; i++) output.push_back((json_ptr->at(field_name).at(i)));  // Copy in stored order; each JSON element is converted on push_back
+    return output;
+}
+
 [[cpp11::register]]
 cpp11::writable::strings json_extract_string_vector_subfolder_cpp(cpp11::external_pointer<nlohmann::json> json_ptr, std::string subfolder_name, std::string field_name) {
     int vec_length = json_ptr->at(subfolder_name).at(field_name).size();