From d8549551761a1183c02587f5620079c736df22c1 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Sun, 8 Dec 2024 18:54:47 -0600 Subject: [PATCH 1/4] Initial commit of BCF serialization --- stochtree/bcf.py | 109 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/stochtree/bcf.py b/stochtree/bcf.py index 052dbaed..5f75625e 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -10,6 +10,7 @@ from .forest import ForestContainer, Forest from .preprocessing import CovariateTransformer, _preprocess_bcf_params from .sampler import ForestSampler, RNG, GlobalVarianceModel, LeafVarianceModel +from .serialization import JSONSerializer from .utils import NotSampledError class BCFModel: @@ -1012,3 +1013,111 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> np.a # Return result matrices as a tuple return (tau_x, mu_x, yhat_x) + + def to_json(self) -> str: + """ + Converts a sampled BART model to JSON string representation (which can then be saved to a file or + processed using the ``json`` library) + + Returns + ------- + :obj:`str` + JSON string representing model metadata (hyperparameters), sampled parameters, and sampled forests + """ + if not self.is_sampled: + msg = ( + "This BARTModel instance has not yet been sampled. " + "Call 'fit' with appropriate arguments before using this model." + ) + raise NotSampledError(msg) + + # Initialize JSONSerializer object + bcf_json = JSONSerializer() + + # Add the forests + bcf_json.add_forest(self.forest_container_mu) + bcf_json.add_forest(self.forest_container_tau) + if self.include_variance_forest: + bcf_json.add_forest(self.forest_container_variance) + + # Add global parameters + bcf_json.add_scalar("variance_scale", self.variance_scale) + bcf_json.add_scalar("outcome_scale", self.y_std) + bcf_json.add_scalar("outcome_mean", self.y_bar) + bcf_json.add_boolean("standardize", self.standardize) + bcf_json.add_scalar("sigma2_init", self.sigma2_init) + bcf_json.add_boolean("sample_sigma_global", self.sample_sigma_global) + bcf_json.add_boolean("sample_sigma_leaf_mu", self.sample_sigma_leaf_mu) + bcf_json.add_boolean("sample_sigma_leaf_tau", self.sample_sigma_leaf_tau) + bcf_json.add_boolean("include_variance_forest", self.include_variance_forest) + bcf_json.add_scalar("num_gfr", self.num_gfr) + bcf_json.add_scalar("num_burnin", self.num_burnin) + bcf_json.add_scalar("num_mcmc", self.num_mcmc) + bcf_json.add_scalar("num_samples", self.num_samples) + bcf_json.add_scalar("adaptive_coding", self.adaptive_coding) + + # Add parameter samples + if self.sample_sigma_global: + bcf_json.add_numeric_vector("sigma2_global_samples", self.global_var_samples, "parameters") + if self.sample_sigma_leaf_mu: + bcf_json.add_numeric_vector("sigma2_leaf_mu_samples", self.leaf_scale_mu_samples, "parameters") + if self.sample_sigma_leaf_tau: + bcf_json.add_numeric_vector("sigma2_leaf_tau_samples", self.leaf_scale_tau_samples, "parameters") + if self.adaptive_coding: + bcf_json.add_numeric_vector("b0_samples", self.b0_samples, "parameters") + bcf_json.add_numeric_vector("b1_samples", self.b1_samples, "parameters") + + return bcf_json.return_json_string() + + def from_json(self, json_string: str) -> None: + """ + Converts a JSON string to an in-memory BART model. + + Parameters + ---------- + json_string : :obj:`str` + JSON string representing model metadata (hyperparameters), sampled parameters, and sampled forests + """ + # Parse string to a JSON object in C++ + bart_json = JSONSerializer() + bart_json.load_from_json_string(json_string) + + # Unpack forests + self.include_variance_forest = bart_json.get_boolean("include_variance_forest") + # TODO: don't just make this a placeholder that we overwrite + self.forest_container_mu = ForestContainer(0, 0, False, False) + self.forest_container_mu.forest_container_cpp.LoadFromJson(bart_json.json_cpp, "forest_0") + # TODO: don't just make this a placeholder that we overwrite + self.forest_container_tau = ForestContainer(0, 0, False, False) + self.forest_container_tau.forest_container_cpp.LoadFromJson(bart_json.json_cpp, "forest_1") + if self.include_variance_forest: + # TODO: don't just make this a placeholder that we overwrite + self.forest_container_variance = ForestContainer(0, 0, False, False) + self.forest_container_variance.forest_container_cpp.LoadFromJson(bart_json.json_cpp, "forest_2") + + # Unpack global parameters + self.variance_scale = bart_json.get_scalar("variance_scale") + self.y_std = bart_json.get_scalar("outcome_scale") + self.y_bar = bart_json.get_scalar("outcome_mean") + self.standardize = bart_json.get_boolean("standardize") + self.sigma2_init = bart_json.get_scalar("sigma2_init") + self.sample_sigma_global = bart_json.get_boolean("sample_sigma_global") + self.sample_sigma_leaf_mu = bart_json.get_boolean("sample_sigma_leaf_mu") + self.sample_sigma_leaf_tau = bart_json.get_boolean("sample_sigma_leaf_tau") + self.num_gfr = bart_json.get_scalar("num_gfr") + self.num_burnin = bart_json.get_scalar("num_burnin") + self.num_mcmc = bart_json.get_scalar("num_mcmc") + self.num_samples = bart_json.get_scalar("num_samples") + self.adaptive_coding = bart_json.get_scalar("adaptive_coding") + + # Unpack parameter samples + if self.sample_sigma_global: + self.global_var_samples = bart_json.get_numeric_vector("sigma2_global_samples", "parameters") + if self.sample_sigma_leaf_mu: + self.leaf_scale_mu_samples = bart_json.get_numeric_vector("sigma2_leaf_mu_samples", "parameters") + if self.sample_sigma_leaf_tau: + self.leaf_scale_tau_samples = bart_json.get_numeric_vector("sigma2_leaf_tau_samples", "parameters") + + # Mark the deserialized model as "sampled" + self.sampled = True + From 9e1898407e201432a7c230985df0b2eef02b35d9 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Mon, 9 Dec 2024 00:54:12 -0600 Subject: [PATCH 2/4] Added WIP BCF JSON unit test --- test/python/test_json.py | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/test/python/test_json.py b/test/python/test_json.py index ee65e80c..55d30286 100644 --- a/test/python/test_json.py +++ b/test/python/test_json.py @@ -1,6 +1,6 @@ import numpy as np from stochtree import ( - BARTModel, JSONSerializer, ForestContainer, Forest, Dataset, Residual, + BARTModel, BCFModel, JSONSerializer, ForestContainer, Forest, Dataset, Residual, RNG, ForestSampler, ForestContainer, GlobalVarianceModel ) @@ -206,3 +206,39 @@ def outcome_mean(X, W): bart_reloaded.from_json(bart_json_string) y_hat_reloaded = bart_reloaded.predict(X, W) np.testing.assert_almost_equal(y_hat_orig, y_hat_reloaded) + + def test_bcf_string(self): + # RNG + random_seed = 1234 + rng = np.random.default_rng(random_seed) + + # Generate covariates and basis + n = 100 + p_X = 5 + X = rng.uniform(0, 1, (n, p_X)) + pi_X = 0.25 + 0.5*X[:,0] + Z = rng.binomial(1, pi_X, n).astype(float) + + # Define the outcome mean functions (prognostic and treatment effects) + mu_X = pi_X*5 + tau_X = X[:,1]*2 + + # Generate outcome + epsilon = rng.normal(0, 1, n) + y = mu_X + tau_X*Z + epsilon + + # Run BCF + bcf_orig = BCFModel() + bcf_orig.sample(X_train=X, Z_train=Z, y_train=y, pi_train=pi_X, num_gfr=10, num_mcmc=10) + + # Extract predictions from the sampler + mu_hat_orig, tau_hat_orig, y_hat_orig = bcf_orig.predict(X, Z, pi_X) + + # "Round-trip" the model to JSON string and back and check that the predictions agree + bcf_json_string = bcf_orig.to_json() + bcf_reloaded = BCFModel() + bcf_reloaded.from_json(bcf_json_string) + mu_hat_reloaded, tau_hat_reloaded, y_hat_reloaded = bcf_reloaded.predict(X, Z, pi_X) + np.testing.assert_almost_equal(y_hat_orig, y_hat_reloaded) + np.testing.assert_almost_equal(tau_hat_orig, tau_hat_reloaded) + np.testing.assert_almost_equal(mu_hat_orig, mu_hat_reloaded) From c09649c2c33885593b13a12be3cdc6a1c07208af Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Mon, 9 Dec 2024 23:38:14 -0600 Subject: [PATCH 3/4] Added BCF serialization routines to python interface --- stochtree/bart.py | 34 +-- stochtree/bcf.py | 468 ++++++++++++++++++++++++++++--------- stochtree/preprocessing.py | 21 +- test/python/test_bcf.py | 46 ++-- test/python/test_json.py | 1 + 5 files changed, 411 insertions(+), 159 deletions(-) diff --git a/stochtree/bart.py b/stochtree/bart.py index 4ca409f8..f4796f79 100644 --- a/stochtree/bart.py +++ b/stochtree/bart.py @@ -74,7 +74,7 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N * ``pct_var_variance_forest_init`` (``float``): Percentage of standardized outcome variance used to initialize global error variance parameter. Default: ``1``. Superseded by ``variance_forest_init``. * ``variance_scale`` (``float``): Variance after the data have been scaled. Default: ``1``. * ``variable_weights_mean`` (``np.array``): Numeric weights reflecting the relative probability of splitting on each variable in the mean forest. Does not need to sum to 1 but cannot be negative. Defaults to uniform over the columns of ``X_train`` if not provided. - * ``variable_weights_forest`` (``np.array``): Numeric weights reflecting the relative probability of splitting on each variable in the variance forest. Does not need to sum to 1 but cannot be negative. Defaults to uniform over the columns of ``X_train`` if not provided. + * ``variable_weights_variance`` (``np.array``): Numeric weights reflecting the relative probability of splitting on each variable in the variance forest. Does not need to sum to 1 but cannot be negative. Defaults to uniform over the columns of ``X_train`` if not provided. * ``num_trees_mean`` (``int``): Number of trees in the ensemble for the conditional mean model. Defaults to ``200``. If ``num_trees_mean = 0``, the conditional mean will not be modeled using a forest and the function will only proceed if ``num_trees_variance > 0``. * ``num_trees_variance`` (``int``): Number of trees in the ensemble for the conditional variance model. Defaults to ``0``. Variance is only modeled using a tree / forest if ``num_trees_variance > 0``. * ``sample_sigma_global`` (``bool``): Whether or not to update the ``sigma^2`` global error variance parameter based on ``IG(a_global, b_global)``. Defaults to ``True``. @@ -484,19 +484,19 @@ def sample(self, X_train: np.array, y_train: np.array, basis_train: np.array = N if self.include_variance_forest: sigma_x_train_raw = self.forest_container_variance.forest_container_cpp.Predict(forest_dataset_train.dataset_cpp) if self.sample_sigma_global: - self.sigma_x_train = sigma_x_train_raw + self.sigma2_x_train = sigma_x_train_raw for i in range(self.num_samples): - self.sigma_x_train[:,i] = np.sqrt(sigma_x_train_raw[:,i]*self.global_var_samples[i]) + self.sigma2_x_train[:,i] = sigma_x_train_raw[:,i]*self.global_var_samples[i] else: - self.sigma_x_train = np.sqrt(sigma_x_train_raw*self.sigma2_init)*self.y_std/np.sqrt(self.variance_scale) + self.sigma2_x_train = sigma_x_train_raw*self.sigma2_init*self.y_std*self.y_std/self.variance_scale if self.has_test: sigma_x_test_raw = self.forest_container_variance.forest_container_cpp.Predict(forest_dataset_test.dataset_cpp) if self.sample_sigma_global: - self.sigma_x_test = sigma_x_test_raw + self.sigma2_x_test = sigma_x_test_raw for i in range(self.num_samples): - self.sigma_x_test[:,i] = np.sqrt(sigma_x_test_raw[:,i]*self.global_var_samples[i]) + self.sigma2_x_test[:,i] = sigma_x_test_raw[:,i]*self.global_var_samples[i] else: - self.sigma_x_test = np.sqrt(sigma_x_test_raw*self.sigma2_init)*self.y_std/np.sqrt(self.variance_scale) + self.sigma2_x_test = sigma_x_test_raw*self.sigma2_init*self.y_std*self.y_std/self.variance_scale def predict(self, covariates: np.array, basis: np.array = None) -> np.array: """Return predictions from every forest sampled (either / both of mean and variance) @@ -605,20 +605,18 @@ def predict_mean(self, covariates: np.array, basis: np.array = None) -> np.array return mean_pred - def predict_variance(self, covariates: np.array, basis: np.array = None) -> np.array: + def predict_variance(self, covariates: np.array) -> np.array: """Predict expected conditional variance from a BART model. Parameters ---------- covariates : np.array Test set covariates. - basis_train : :obj:`np.array`, optional - Optional test set basis vector, must be provided if the model was trained with a leaf regression basis. Returns ------- tuple of :obj:`np.array` - Tuple of arrays of predictions corresponding to each forest (mean and variance, depending on whether either / both was included). Each array will contain as many rows as in ``covariates`` and as many columns as retained samples of the algorithm. + Tuple of arrays of predictions corresponding to the variance forest. Each array will contain as many rows as in ``covariates`` and as many columns as retained samples of the algorithm. """ if not self.is_sampled(): msg = ( @@ -637,26 +635,16 @@ def predict_variance(self, covariates: np.array, basis: np.array = None) -> np.a # Convert everything to standard shape (2-dimensional) if covariates.ndim == 1: covariates = np.expand_dims(covariates, 1) - if basis is not None: - if basis.ndim == 1: - basis = np.expand_dims(basis, 1) - # Data checks - if basis is not None: - if basis.shape[0] != covariates.shape[0]: - raise ValueError("covariates and basis must have the same number of rows") - pred_dataset = Dataset() pred_dataset.add_covariates(covariates) - # if basis is not None: - # pred_dataset.add_basis(basis) variance_pred_raw = self.forest_container_variance.forest_container_cpp.Predict(pred_dataset.dataset_cpp) if self.sample_sigma_global: variance_pred = variance_pred_raw for i in range(self.num_samples): - variance_pred[:,i] = np.sqrt(variance_pred_raw[:,i]*self.global_var_samples[i]) + variance_pred[:,i] = variance_pred_raw[:,i]*self.global_var_samples[i] else: - variance_pred = np.sqrt(variance_pred_raw*self.sigma2_init)*self.y_std/np.sqrt(self.variance_scale) + variance_pred = variance_pred_raw*self.sigma2_init*self.y_std*self.y_std/self.variance_scale return variance_pred diff --git a/stochtree/bcf.py b/stochtree/bcf.py index 5f75625e..9e045e1d 100644 --- a/stochtree/bcf.py +++ b/stochtree/bcf.py @@ -69,29 +69,40 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. * ``alpha_tau`` (``float``): Prior probability of splitting for a tree of depth 0 for the treatment effect forest. Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. + * ``alpha_variance`` (``float``): Prior probability of splitting for a tree of depth 0 in the conditional variance model. Tree split prior combines ``alpha_variance`` and ``beta_variance`` via ``alpha_variance*(1+node_depth)^-beta_variance``. * ``beta_mu`` (``float``): Exponent that decreases split probabilities for nodes of depth > 0 for the prognostic forest. Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. * ``beta_tau`` (``float``): Exponent that decreases split probabilities for nodes of depth > 0 for the treatment effect forest. Tree split prior combines ``alpha`` and ``beta`` via ``alpha*(1+node_depth)^-beta``. + * ``beta_variance`` (``float``): Exponent that decreases split probabilities for nodes of depth > 0 in the conditional variance model. Tree split prior combines ``alpha_variance`` and ``beta_variance`` via ``alpha_variance*(1+node_depth)^-beta_variance``. * ``min_samples_leaf_mu`` (``int``): Minimum allowable size of a leaf, in terms of training samples, for the prognostic forest. Defaults to ``5``. * ``min_samples_leaf_tau`` (``int``): Minimum allowable size of a leaf, in terms of training samples, for the treatment effect forest. Defaults to ``5``. + * ``min_samples_leaf_variance`` (``int``): Minimum allowable size of a leaf, in terms of training samples in the conditional variance model. Defaults to ``5``. * ``max_depth_mu`` (``int``): Maximum depth of any tree in the mu ensemble. Defaults to ``10``. Can be overriden with ``-1`` which does not enforce any depth limits on trees. * ``max_depth_tau`` (``int``): Maximum depth of any tree in the tau ensemble. Defaults to ``5``. Can be overriden with ``-1`` which does not enforce any depth limits on trees. + * ``max_depth_variance`` (``int``): Maximum depth of any tree in the ensemble in the conditional variance model. Defaults to ``10``. Can be overriden with ``-1`` which does not enforce any depth limits on trees. * ``a_global`` (``float``): Shape parameter in the ``IG(a_global, b_global)`` global error variance model. Defaults to ``0``. * ``b_global`` (``float``): Component of the scale parameter in the ``IG(a_global, b_global)`` global error variance prior. Defaults to ``0``. * ``a_leaf_mu`` (``float``): Shape parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model for the prognostic forest. Defaults to ``3``. * ``a_leaf_tau`` (``float``): Shape parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model for the treatment effect forest. Defaults to ``3``. * ``b_leaf_mu`` (``float``): Scale parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model for the prognostic forest. Calibrated internally as ``0.5/num_trees`` if not set here. * ``b_leaf_tau`` (``float``): Scale parameter in the ``IG(a_leaf, b_leaf)`` leaf node parameter variance model for the treatment effect forest. Calibrated internally as ``0.5/num_trees`` if not set here. - * ``sigma2`` (``float``): Starting value of global variance parameter. Calibrated internally as in Sparapani et al (2021) if not set here. + * ``sigma2_init`` (``float``): Starting value of global variance parameter. Calibrated internally as in Sparapani et al (2021) if not set here. + * ``variance_forest_leaf_init`` (``float``): Starting value of root forest prediction in conditional (heteroskedastic) error variance model. Calibrated internally as ``np.log(pct_var_variance_forest_init*np.var((y-np.mean(y))/np.std(y)))/num_trees_variance`` if not set. * ``pct_var_sigma2_init`` (``float``): Percentage of standardized outcome variance used to initialize global error variance parameter. Superseded by ``sigma2``. Defaults to ``0.25``. - * ``variable_weights`` (`np.`array``): Numeric weights reflecting the relative probability of splitting on each variable. Does not need to sum to 1 but cannot be negative. Defaults to ``np.repeat(1/X_train.shape[1], X_train.shape[1])`` if not set here. Note that if the propensity score is included as a covariate in either forest, its weight will default to ``1/X_train.shape[1]``. A workaround if you wish to provide a custom weight for the propensity score is to include it as a column in ``X_train`` and then set ``propensity_covariate`` to ``'none'`` and adjust ``keep_vars_mu`` and ``keep_vars_tau`` accordingly. + * ``pct_var_variance_forest_init`` (``float``): Percentage of standardized outcome variance used to initialize global error variance parameter. Default: ``1``. Superseded by ``variance_forest_init``. + * ``variable_weights_mean`` (`np.`array``): Numeric weights reflecting the relative probability of splitting on each variable in the prognostic and treatment effect forests. Does not need to sum to 1 but cannot be negative. Defaults to ``np.repeat(1/X_train.shape[1], X_train.shape[1])`` if not set here. Note that if the propensity score is included as a covariate in either forest, its weight will default to ``1/X_train.shape[1]``. A workaround if you wish to provide a custom weight for the propensity score is to include it as a column in ``X_train`` and then set ``propensity_covariate`` to ``'none'`` and adjust ``keep_vars_mu`` and ``keep_vars_tau`` accordingly. + * ``variable_weights_variance`` (``np.array``): Numeric weights reflecting the relative probability of splitting on each variable in the variance forest. Does not need to sum to 1 but cannot be negative. Defaults to uniform over the columns of ``X_train`` if not provided. * ``keep_vars_mu`` (``list`` or ``np.array``): Vector of variable names or column indices denoting variables that should be included in the prognostic (``mu(X)``) forest. Defaults to ``None``. * ``drop_vars_mu`` (``list`` or ``np.array``): Vector of variable names or column indices denoting variables that should be excluded from the prognostic (``mu(X)``) forest. Defaults to ``None``. If both ``drop_vars_mu`` and ``keep_vars_mu`` are set, ``drop_vars_mu`` will be ignored. + * ``drop_vars_variance`` (``list`` or ``np.array``): Vector of variable names or column indices denoting variables that should be excluded from the variance (``sigma^2(X)``) forest. Defaults to ``None``. If both ``drop_vars_variance`` and ``keep_vars_variance`` are set, ``drop_vars_variance`` will be ignored. * ``keep_vars_tau`` (``list`` or ``np.array``): Vector of variable names or column indices denoting variables that should be included in the treatment effect (``tau(X)``) forest. Defaults to ``None``. * ``drop_vars_tau`` (``list`` or ``np.array``): Vector of variable names or column indices denoting variables that should be excluded from the treatment effect (``tau(X)``) forest. Defaults to ``None``. If both ``drop_vars_tau`` and ``keep_vars_tau`` are set, ``drop_vars_tau`` will be ignored. + * ``drop_vars_variance`` (``list`` or ``np.array``): Vector of variable names or column indices denoting variables that should be excluded from the variance (``sigma^2(X)``) forest. Defaults to ``None``. If both ``drop_vars_variance`` and ``keep_vars_variance`` are set, ``drop_vars_variance`` will be ignored. + * ``keep_vars_variance`` (``list`` or ``np.array``): Vector of variable names or column indices denoting variables that should be included in the variance (``sigma^2(X)``) forest. Defaults to ``None``. * ``num_trees_mu`` (``int``): Number of trees in the prognostic forest. Defaults to ``200``. * ``num_trees_tau`` (``int``): Number of trees in the treatment effect forest. Defaults to ``50``. + * ``num_trees_variance`` (``int``): Number of trees in the ensemble for the conditional variance model. Defaults to ``0``. Variance is only modeled using a tree / forest if ``num_trees_variance > 0``. * ``sample_sigma_global`` (``bool``): Whether or not to update the ``sigma^2`` global error variance parameter based on ``IG(a_global, b_global)``. Defaults to ``True``. * ``sample_sigma_leaf_mu`` (``bool``): Whether or not to update the ``tau`` leaf scale variance parameter based on ``IG(a_leaf, b_leaf)`` for the prognostic forest. Cannot (currently) be set to true if ``basis_train`` has more than one column. Defaults to ``True``. @@ -120,27 +131,40 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr sigma_leaf_tau = bcf_params['sigma_leaf_tau'] alpha_mu = bcf_params['alpha_mu'] alpha_tau = bcf_params['alpha_tau'] + alpha_variance = bcf_params['alpha_variance'] beta_mu = bcf_params['beta_mu'] beta_tau = bcf_params['beta_tau'] + beta_variance = bcf_params['beta_variance'] min_samples_leaf_mu = bcf_params['min_samples_leaf_mu'] min_samples_leaf_tau = bcf_params['min_samples_leaf_tau'] + min_samples_leaf_variance = bcf_params['min_samples_leaf_variance'] max_depth_mu = bcf_params['max_depth_mu'] max_depth_tau = bcf_params['max_depth_tau'] + max_depth_variance = bcf_params['max_depth_variance'] a_global = bcf_params['a_global'] b_global = bcf_params['b_global'] + a_forest = bcf_params['a_forest'] + b_forest = bcf_params['b_forest'] a_leaf_mu = bcf_params['a_leaf_mu'] a_leaf_tau = bcf_params['a_leaf_tau'] b_leaf_mu = bcf_params['b_leaf_mu'] b_leaf_tau = bcf_params['b_leaf_tau'] - sigma2 = bcf_params['sigma2'] + sigma2_init = bcf_params['sigma2_init'] + variance_forest_leaf_init = bcf_params['variance_forest_leaf_init'] pct_var_sigma2_init = bcf_params['pct_var_sigma2_init'] - variable_weights = bcf_params['variable_weights'] + pct_var_variance_forest_init = bcf_params['pct_var_variance_forest_init'] + variable_weights_mu = bcf_params['variable_weights_mu'] + variable_weights_tau = bcf_params['variable_weights_tau'] + variable_weights_variance = bcf_params['variable_weights_variance'] keep_vars_mu = bcf_params['keep_vars_mu'] drop_vars_mu = bcf_params['drop_vars_mu'] keep_vars_tau = bcf_params['keep_vars_tau'] drop_vars_tau = bcf_params['drop_vars_tau'] + keep_vars_variance = bcf_params['keep_vars_variance'] + drop_vars_variance = bcf_params['drop_vars_variance'] num_trees_mu = bcf_params['num_trees_mu'] num_trees_tau = bcf_params['num_trees_tau'] + num_trees_variance = bcf_params['num_trees_variance'] sample_sigma_global = bcf_params['sample_sigma_global'] sample_sigma_leaf_mu = bcf_params['sample_sigma_leaf_mu'] sample_sigma_leaf_tau = bcf_params['sample_sigma_leaf_tau'] @@ -155,13 +179,30 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr keep_every = bcf_params['keep_every'] # Variable weight preprocessing (and initialization if necessary) - if variable_weights is None: + if variable_weights_mu is None: if X_train.ndim > 1: - variable_weights = np.repeat(1/X_train.shape[1], X_train.shape[1]) + variable_weights_mu = np.repeat(1/X_train.shape[1], X_train.shape[1]) else: - variable_weights = np.repeat(1., 1) - if np.any(variable_weights < 0): - raise ValueError("variable_weights cannot have any negative weights") + variable_weights_mu = np.repeat(1., 1) + if np.any(variable_weights_mu < 0): + raise ValueError("variable_weights_mu cannot have any negative weights") + if variable_weights_tau is None: + if X_train.ndim > 1: + variable_weights_tau = np.repeat(1/X_train.shape[1], X_train.shape[1]) + else: + variable_weights_tau = np.repeat(1., 1) + if np.any(variable_weights_tau < 0): + raise ValueError("variable_weights_tau cannot have any negative weights") + if variable_weights_variance is None: + if X_train.ndim > 1: + variable_weights_variance = np.repeat(1/X_train.shape[1], X_train.shape[1]) + else: + variable_weights_variance = np.repeat(1., 1) + if np.any(variable_weights_variance < 0): + raise ValueError("variable_weights_variance cannot have any negative weights") + + # Determine whether conditional variance model will be fit + self.include_variance_forest = True if num_trees_variance > 0 else False # Check data inputs if not isinstance(X_train, pd.DataFrame) and not isinstance(X_train, np.ndarray): @@ -182,7 +223,6 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr if pi_test is not None: if not isinstance(pi_test, np.ndarray): raise ValueError("pi_test must be a numpy array") - # Convert everything to standard shape (2-dimensional) if isinstance(X_train, np.ndarray): @@ -236,6 +276,10 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr self.multivariate_treatment = True if self.treatment_dim > 1 else False treatment_leaf_model = 2 if self.multivariate_treatment else 1 + # Set variance leaf model type (currently only one option) + leaf_model_variance_forest = 3 + self.variance_scale = 1 + # Check parameters if sigma_leaf_tau is not None: if not isinstance(sigma_leaf_tau, float) and not isinstance(sigma_leaf_tau, np.ndarray): @@ -249,10 +293,10 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr raise ValueError("sigma_leaf_tau must have the same number of rows and columns, which must match Z_train.shape[1]") if sigma_leaf_mu is not None: sigma_leaf_mu = check_scalar(x=sigma_leaf_mu, name="sigma_leaf_mu", target_type=float, - min_val=0., max_val=None, include_boundaries="neither") + min_val=0., max_val=None, include_boundaries="neither") if cutpoint_grid_size is not None: cutpoint_grid_size = check_scalar(x=cutpoint_grid_size, name="cutpoint_grid_size", target_type=int, - min_val=1, max_val=None, include_boundaries="left") + min_val=1, max_val=None, include_boundaries="left") if min_samples_leaf_mu is not None: min_samples_leaf_mu = check_scalar(x=min_samples_leaf_mu, name="min_samples_leaf_mu", target_type=int, min_val=1, max_val=None, include_boundaries="left") @@ -282,34 +326,34 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr min_val=0, max_val=1, include_boundaries="neither") if alpha_tau is not None: alpha_tau = check_scalar(x=alpha_tau, name="alpha_tau", target_type=(float,int), - min_val=0, max_val=1, include_boundaries="neither") + min_val=0, max_val=1, include_boundaries="neither") if beta_mu is not None: beta_mu = check_scalar(x=beta_mu, name="beta_mu", target_type=(float,int), - min_val=1, max_val=None, include_boundaries="left") + min_val=1, max_val=None, include_boundaries="left") if beta_tau is not None: beta_tau = check_scalar(x=beta_tau, name="beta_tau", target_type=(float,int), min_val=1, max_val=None, include_boundaries="left") if a_global is not None: a_global = check_scalar(x=a_global, name="a_global", target_type=(float,int), - min_val=0, max_val=None, include_boundaries="left") + min_val=0, max_val=None, include_boundaries="left") if b_global is not None: b_global = check_scalar(x=b_global, name="b_global", target_type=(float,int), - min_val=0, max_val=None, include_boundaries="left") + min_val=0, max_val=None, include_boundaries="left") if a_leaf_mu is not None: a_leaf_mu = check_scalar(x=a_leaf_mu, name="a_leaf_mu", target_type=(float,int), - min_val=0, max_val=None, include_boundaries="left") + min_val=0, max_val=None, include_boundaries="left") if a_leaf_tau is not None: a_leaf_tau = check_scalar(x=a_leaf_tau, name="a_leaf_tau", target_type=(float,int), - min_val=0, max_val=None, include_boundaries="left") + min_val=0, max_val=None, include_boundaries="left") if b_leaf_mu is not None: b_leaf_mu = check_scalar(x=b_leaf_mu, name="b_leaf_mu", target_type=(float,int), - min_val=0, max_val=None, include_boundaries="left") + min_val=0, max_val=None, include_boundaries="left") if b_leaf_tau is not None: b_leaf_tau = check_scalar(x=b_leaf_tau, name="b_leaf_tau", target_type=(float,int), - min_val=0, max_val=None, include_boundaries="left") - if sigma2 is not None: - sigma2 = check_scalar(x=sigma2, name="sigma2", target_type=(float,int), - min_val=0, max_val=None, include_boundaries="neither") + min_val=0, max_val=None, include_boundaries="left") + if sigma2_init is not None: + sigma2_init = check_scalar(x=sigma2_init, name="sigma2_init", target_type=(float,int), + min_val=0, max_val=None, include_boundaries="neither") if sample_sigma_leaf_mu is not None: if not isinstance(sample_sigma_leaf_mu, bool): raise ValueError("sample_sigma_leaf_mu must be a bool") @@ -449,6 +493,64 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr raise ValueError("drop_vars_tau must be a list or np.array") else: variable_subset_tau = [i for i in range(X_train.shape[1])] + if keep_vars_variance is not None: + if isinstance(keep_vars_variance, list): + if all(isinstance(i, str) for i in keep_vars_variance): + if not np.all(np.isin(keep_vars_variance, X_train.columns)): + raise ValueError("keep_vars_variance includes some variable names that are not in X_train") + variable_subset_variance = [i for i in range(X_train.shape[1]) if keep_vars_variance.count(X_train.columns.array[i]) > 0] + elif all(isinstance(i, int) for i in keep_vars_variance): + if any(i >= X_train.shape[1] for i in keep_vars_variance): + raise ValueError("keep_vars_variance includes some variable indices that exceed the number of columns in X_train") + if any(i < 0 for i in keep_vars_variance): + raise ValueError("keep_vars_variance includes some negative variable indices") + variable_subset_variance = keep_vars_variance + else: + raise ValueError("keep_vars_variance must be a list of variable names (str) or column indices (int)") + elif isinstance(keep_vars_variance, np.ndarray): + if keep_vars_variance.dtype == np.str_: + if not np.all(np.isin(keep_vars_variance, X_train.columns)): + raise ValueError("keep_vars_variance includes some variable names that are not in X_train") + variable_subset_variance = [i for i in range(X_train.shape[1]) if keep_vars_variance.count(X_train.columns.array[i]) > 0] + else: + if np.any(keep_vars_variance >= X_train.shape[1]): + raise ValueError("keep_vars_variance includes some variable indices that exceed the number of columns in X_train") + if np.any(keep_vars_variance < 0): + raise ValueError("keep_vars_variance includes some negative variable indices") + variable_subset_variance = [i for i in keep_vars_variance] + else: + raise ValueError("keep_vars_variance must be a list or np.array") + elif keep_vars_variance is None and drop_vars_variance is not None: + if isinstance(drop_vars_variance, list): + if all(isinstance(i, str) for i in drop_vars_variance): + if not np.all(np.isin(drop_vars_variance, X_train.columns)): + raise ValueError("drop_vars_variance includes some variable names that are not in X_train") + variable_subset_variance = [i for i in range(X_train.shape[1]) if drop_vars_variance.count(X_train.columns.array[i]) == 0] + elif all(isinstance(i, int) for i in drop_vars_variance): + if any(i >= X_train.shape[1] for i in drop_vars_variance): + raise ValueError("drop_vars_variance includes some variable indices that exceed the number of columns in X_train") + if any(i < 0 for i in drop_vars_variance): + raise ValueError("drop_vars_variance includes some negative variable indices") + variable_subset_variance = [i for i in range(X_train.shape[1]) if drop_vars_variance.count(i) == 0] + else: + raise ValueError("drop_vars_variance must be a list of variable names (str) or column indices (int)") + elif isinstance(drop_vars_variance, np.ndarray): + if drop_vars_variance.dtype == np.str_: + if not np.all(np.isin(drop_vars_variance, X_train.columns)): + raise ValueError("drop_vars_variance includes some variable names that are not in X_train") + keep_inds = ~np.isin(X_train.columns.array, drop_vars_variance) + variable_subset_variance = [i for i in keep_inds] + else: + if np.any(drop_vars_variance >= X_train.shape[1]): + raise ValueError("drop_vars_variance includes some variable indices that exceed the number of columns in X_train") + if np.any(drop_vars_variance < 0): + raise ValueError("drop_vars_variance includes some negative variable indices") + keep_inds = ~np.isin(np.arange(X_train.shape[1]), drop_vars_variance) + variable_subset_variance = [i for i in keep_inds] + else: + raise ValueError("drop_vars_variance must be a list or np.array") + else: + variable_subset_variance = [i for i in range(X_train.shape[1])] # Covariate preprocessing self._covariate_transformer = CovariateTransformer() @@ -508,8 +610,10 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr resid_train = (y_train-self.y_bar)/self.y_std # Calibrate priors for global sigma^2 and sigma_leaf_mu / sigma_leaf_tau (don't use regression initializer for warm-start or XBART) - if not sigma2: - sigma2 = pct_var_sigma2_init*np.var(resid_train) + if not sigma2_init: + sigma2_init = pct_var_sigma2_init*np.var(resid_train) + if not variance_forest_leaf_init: + variance_forest_leaf_init = pct_var_variance_forest_init*np.var(resid_train) b_leaf_mu = np.squeeze(np.var(resid_train)) / num_trees_mu if b_leaf_mu is None else b_leaf_mu b_leaf_tau = np.squeeze(np.var(resid_train)) / (2*num_trees_tau) if b_leaf_tau is None else b_leaf_tau sigma_leaf_mu = np.squeeze(np.var(resid_train)) / num_trees_mu if sigma_leaf_mu is None else sigma_leaf_mu @@ -517,23 +621,38 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr if self.multivariate_treatment: if not isinstance(sigma_leaf_tau, np.ndarray): sigma_leaf_tau = np.diagflat(np.repeat(sigma_leaf_tau, self.treatment_dim)) - current_sigma2 = sigma2 + current_sigma2 = sigma2_init + self.sigma2_init = sigma2_init current_leaf_scale_mu = np.array([[sigma_leaf_mu]]) if not isinstance(sigma_leaf_tau, np.ndarray): current_leaf_scale_tau = np.array([[sigma_leaf_tau]]) else: current_leaf_scale_tau = sigma_leaf_tau + if self.include_variance_forest: + if not a_forest: + a_forest = num_trees_variance / 1.5**2 + 0.5 + if not b_forest: + b_forest = num_trees_variance / 1.5**2 + else: + if not a_forest: + a_forest = 1. + if not b_forest: + b_forest = 1. + # Update variable weights variable_counts = [original_var_indices.count(i) for i in original_var_indices] - variable_weights_adj = [1/i for i in variable_counts] - variable_weights = variable_weights[original_var_indices]*variable_weights_adj + variable_weights_mu_adj = [1/i for i in variable_counts] + variable_weights_tau_adj = [1/i for i in variable_counts] + variable_weights_variance_adj = [1/i for i in variable_counts] + variable_weights_mu = variable_weights_mu[original_var_indices]*variable_weights_mu_adj + variable_weights_tau = variable_weights_tau[original_var_indices]*variable_weights_tau_adj + variable_weights_variance = variable_weights_variance[original_var_indices]*variable_weights_variance_adj - # Create mu and tau specific variable weights with weights zeroed out for excluded variables - variable_weights_tau = variable_weights - variable_weights_mu = variable_weights + # Create mu, tau, and variance specific variable weights with weights zeroed out for excluded variables variable_weights_mu[[variable_subset_mu.count(i) == 0 for i in original_var_indices]] = 0 - variable_weights_tau[[variable_subset_tau.count(i) == 0 for i in original_var_indices]] <- 0 + variable_weights_tau[[variable_subset_tau.count(i) == 0 for i in original_var_indices]] = 0 + variable_weights_variance[[variable_subset_variance.count(i) == 0 for i in original_var_indices]] = 0 # Update covariates to include propensities if requested if propensity_covariate not in ["none", "mu", "tau", "both"]: @@ -552,15 +671,20 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr elif propensity_covariate == "both": variable_weights_mu = np.append(variable_weights_mu, np.repeat(1/num_cov_orig, pi_train.shape[1])) variable_weights_tau = np.append(variable_weights_tau, np.repeat(1/num_cov_orig, pi_train.shape[1])) + variable_weights_variance = np.append(variable_weights_variance, np.repeat(0., pi_train.shape[1])) # Renormalize variable weights variable_weights_mu = variable_weights_mu / np.sum(variable_weights_mu) variable_weights_tau = variable_weights_tau / np.sum(variable_weights_tau) + variable_weights_variance = variable_weights_variance / np.sum(variable_weights_variance) # Store propensity score requirements of the BCF forests self.propensity_covariate = propensity_covariate # Container of variance parameter samples + self.num_gfr = num_gfr + self.num_burnin = num_burnin + self.num_mcmc = num_mcmc num_actual_mcmc_iter = num_mcmc * keep_every num_temp_samples = num_gfr + num_burnin + num_actual_mcmc_iter num_retained_samples = num_mcmc @@ -618,19 +742,20 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr else: cpp_rng = RNG(random_seed) - # TODO Placeholder: expose the heteroskedasticity interface through the function signature, as in R - a_forest = 1 - b_forest = 1 - # Sampling data structures forest_sampler_mu = ForestSampler(forest_dataset_train, feature_types, num_trees_mu, self.n_train, alpha_mu, beta_mu, min_samples_leaf_mu, max_depth_mu) forest_sampler_tau = ForestSampler(forest_dataset_train, feature_types, num_trees_tau, self.n_train, alpha_tau, beta_tau, min_samples_leaf_tau, max_depth_tau) + if self.include_variance_forest: + forest_sampler_variance = ForestSampler(forest_dataset_train, feature_types, num_trees_variance, self.n_train, alpha_variance, beta_variance, min_samples_leaf_variance, max_depth_variance) # Container of forest samples self.forest_container_mu = ForestContainer(num_trees_mu, 1, True, False) self.forest_container_tau = ForestContainer(num_trees_tau, Z_train.shape[1], False, False) active_forest_mu = Forest(num_trees_mu, 1, True, False) active_forest_tau = Forest(num_trees_tau, Z_train.shape[1], False, False) + if self.include_variance_forest: + self.forest_container_variance = ForestContainer(num_trees_variance, 1, True, True) + active_forest_variance = Forest(num_trees_variance, 1, True, True) # Variance samplers if self.sample_sigma_global: @@ -651,6 +776,11 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr init_tau = np.array([0.]) forest_sampler_tau.prepare_for_sampler(forest_dataset_train, residual_train, active_forest_tau, treatment_leaf_model, init_tau) + # Initialize the leaves of each tree in the variance forest + if self.include_variance_forest: + init_val_variance = np.array([variance_forest_leaf_init]) + forest_sampler_variance.prepare_for_sampler(forest_dataset_train, residual_train, active_forest_variance, leaf_model_variance_forest, init_val_variance) + # Run GFR (warm start) if specified if num_gfr > 0: for i in range(num_gfr): @@ -681,16 +811,6 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr current_sigma2, treatment_leaf_model, keep_sample, True, True ) - # Sample variance parameters (if requested) - if self.sample_sigma_global: - current_sigma2 = global_var_model.sample_one_iteration(residual_train, cpp_rng, a_global, b_global) - if keep_sample: - self.global_var_samples[sample_counter] = current_sigma2 - if self.sample_sigma_leaf_tau: - current_leaf_scale_tau[0,0] = leaf_var_model_tau.sample_one_iteration(active_forest_tau, cpp_rng, a_leaf_tau, b_leaf_tau) - if keep_sample: - self.leaf_scale_tau_samples[sample_counter] = current_leaf_scale_tau[0,0] - # Sample coding parameters (if requested) if self.adaptive_coding: mu_x = active_forest_mu.predict_raw(forest_dataset_train) @@ -715,6 +835,24 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr # Update residual to reflect adjusted basis forest_sampler_tau.propagate_basis_update(forest_dataset_train, residual_train, active_forest_tau) + + # Sample the variance forest + if self.include_variance_forest: + forest_sampler_variance.sample_one_iteration( + self.forest_container_variance, active_forest_variance, forest_dataset_train, residual_train, + cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale_mu, variable_weights_variance, a_forest, b_forest, + current_sigma2, leaf_model_variance_forest, keep_sample, True, True + ) + + # Sample variance parameters (if requested) + if self.sample_sigma_global: + current_sigma2 = global_var_model.sample_one_iteration(residual_train, cpp_rng, a_global, b_global) + if keep_sample: + self.global_var_samples[sample_counter] = current_sigma2 + if self.sample_sigma_leaf_tau: + current_leaf_scale_tau[0,0] = leaf_var_model_tau.sample_one_iteration(active_forest_tau, cpp_rng, a_leaf_tau, b_leaf_tau) + if keep_sample: + self.leaf_scale_tau_samples[sample_counter] = current_leaf_scale_tau[0,0] # Run MCMC if num_burnin + num_mcmc > 0: @@ -753,17 +891,7 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr self.forest_container_tau, active_forest_tau, forest_dataset_train, residual_train, cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale_tau, variable_weights_tau, a_forest, b_forest, current_sigma2, treatment_leaf_model, keep_sample, False, True - ) - - # Sample variance parameters (if requested) - if self.sample_sigma_global: - current_sigma2 = global_var_model.sample_one_iteration(residual_train, cpp_rng, a_global, b_global) - if keep_sample: - self.global_var_samples[sample_counter] = current_sigma2 - if self.sample_sigma_leaf_tau: - current_leaf_scale_tau[0,0] = leaf_var_model_tau.sample_one_iteration(active_forest_tau, cpp_rng, a_leaf_tau, b_leaf_tau) - if keep_sample: - self.leaf_scale_tau_samples[sample_counter] = current_leaf_scale_tau[0,0] + ) # Sample coding parameters (if requested) if self.adaptive_coding: @@ -789,6 +917,24 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr # Update residual to reflect adjusted basis forest_sampler_tau.propagate_basis_update(forest_dataset_train, residual_train, active_forest_tau) + + # Sample the variance forest + if self.include_variance_forest: + forest_sampler_variance.sample_one_iteration( + self.forest_container_variance, active_forest_variance, forest_dataset_train, residual_train, + cpp_rng, feature_types, cutpoint_grid_size, current_leaf_scale_mu, variable_weights_variance, a_forest, b_forest, + current_sigma2, leaf_model_variance_forest, keep_sample, False, True + ) + + # Sample variance parameters (if requested) + if self.sample_sigma_global: + current_sigma2 = global_var_model.sample_one_iteration(residual_train, cpp_rng, a_global, b_global) + if keep_sample: + self.global_var_samples[sample_counter] = current_sigma2 + if self.sample_sigma_leaf_tau: + current_leaf_scale_tau[0,0] = leaf_var_model_tau.sample_one_iteration(active_forest_tau, cpp_rng, a_leaf_tau, b_leaf_tau) + if keep_sample: + self.leaf_scale_tau_samples[sample_counter] = current_leaf_scale_tau[0,0] # Mark the model as sampled self.sampled = True @@ -798,6 +944,8 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr for i in range(num_gfr): self.forest_container_mu.delete_sample(i) self.forest_container_tau.delete_sample(i) + if self.include_variance_forest: + self.forest_container_variance.delete_sample(i) if self.adaptive_coding: self.b1_samples = self.b1_samples[num_gfr:] self.b0_samples = self.b0_samples[num_gfr:] @@ -807,6 +955,7 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr self.leaf_scale_mu_samples = self.leaf_scale_mu_samples[num_gfr:] if self.sample_sigma_leaf_tau: self.leaf_scale_tau_samples = self.leaf_scale_tau_samples[num_gfr:] + self.num_samples -= num_gfr # Store predictions mu_raw = self.forest_container_mu.forest_container_cpp.Predict(forest_dataset_train.dataset_cpp) @@ -836,6 +985,23 @@ def sample(self, X_train: Union[pd.DataFrame, np.array], Z_train: np.array, y_tr else: treatment_term_test = Z_test*np.squeeze(self.tau_hat_test) self.y_hat_test = self.mu_hat_test + treatment_term_test + + if self.include_variance_forest: + sigma2_x_train_raw = self.forest_container_variance.forest_container_cpp.Predict(forest_dataset_train.dataset_cpp) + if self.sample_sigma_global: + self.sigma2_x_train = sigma2_x_train_raw + for i in range(self.num_samples): + self.sigma2_x_train[:,i] = sigma2_x_train_raw[:,i]*self.global_var_samples[i] + else: + self.sigma2_x_train = sigma2_x_train_raw*self.sigma2_init*self.y_std*self.y_std/self.variance_scale + if self.has_test: + sigma2_x_test_raw = self.forest_container_variance.forest_container_cpp.Predict(forest_dataset_test.dataset_cpp) + if self.sample_sigma_global: + self.sigma2_x_test = sigma2_x_test_raw + for i in range(self.num_samples): + self.sigma2_x_test[:,i] = sigma2_x_test_raw[:,i]*self.global_var_samples[i] + else: + self.sigma2_x_test = sigma2_x_test_raw*self.sigma2_init*self.y_std*self.y_std/self.variance_scale if self.sample_sigma_global: self.global_var_samples = self.global_var_samples*self.y_std*self.y_std @@ -898,20 +1064,24 @@ def predict_tau(self, X: np.array, Z: np.array, propensity: np.array = None) -> raise ValueError("Propensity scores not provided, but no propensity model was trained during sampling") else: propensity = np.mean(self.bart_propensity_model.predict(X), axis=1, keepdims=True) + else: + # Dummy propensities if not provided but also not needed + propensity = np.ones(X.shape[0]) + propensity = np.expand_dims(propensity, 1) # Update covariates to include propensities if requested - if self.propensity_covariate == "tau": - X_tau = np.c_[X, propensity] + if self.propensity_covariate == "none": + X_combined = X else: - X_tau = X + X_combined = np.c_[X, propensity] - # Treatment Forest Dataset (covariates and treatment variable) - forest_dataset_tau = Dataset() - forest_dataset_tau.add_covariates(X_tau) - forest_dataset_tau.add_basis(Z) + # Forest dataset + forest_dataset_test = Dataset() + forest_dataset_test.add_covariates(X_combined) + forest_dataset_test.add_basis(Z) # Estimate treatment effect - tau_raw = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_tau.dataset_cpp) + tau_raw = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_test.dataset_cpp) tau_raw = tau_raw if self.adaptive_coding: adaptive_coding_weights = np.expand_dims(self.b1_samples - self.b0_samples, axis=(0,2)) @@ -920,6 +1090,68 @@ def predict_tau(self, X: np.array, Z: np.array, propensity: np.array = None) -> # Return result matrix return tau_x + + def predict_variance(self, covariates: np.array, propensity: np.array = None) -> np.array: + """Predict expected conditional variance from a BART model. + + Parameters + ---------- + covariates : np.array + Test set covariates. + covariates : np.array + Test set propensity scores. Optional (not currently used in variance forests). + + Returns + ------- + tuple of :obj:`np.array` + Tuple of arrays of predictions corresponding to the variance forest. Each array will contain as many rows as in ``covariates`` and as many columns as retained samples of the algorithm. + """ + if not self.is_sampled(): + msg = ( + "This BARTModel instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this model." + ) + raise NotSampledError(msg) + + if not self.include_variance_forest: + msg = ( + "This BARTModel instance was not sampled with a variance forest. " + "Call 'fit' with appropriate arguments before using this model." + ) + raise NotSampledError(msg) + + # Convert everything to standard shape (2-dimensional) + if covariates.ndim == 1: + covariates = np.expand_dims(covariates, 1) + if propensity is not None: + if propensity.ndim == 1: + propensity = np.expand_dims(propensity, 1) + + # Update covariates to include propensities if requested + if self.propensity_covariate == "none": + X_combined = covariates + else: + if propensity is not None: + X_combined = np.c_[covariates, propensity] + else: + # Dummy propensities if not provided but also not needed + propensity = np.ones(covariates.shape[0]) + propensity = np.expand_dims(propensity, 1) + X_combined = np.c_[covariates, propensity] + + # Forest dataset + pred_dataset = Dataset() + pred_dataset.add_covariates(X_combined) + + variance_pred_raw = self.forest_container_variance.forest_container_cpp.Predict(pred_dataset.dataset_cpp) + if self.sample_sigma_global: + variance_pred = variance_pred_raw + for i in range(self.num_samples): + variance_pred[:,i] = variance_pred_raw[:,i]*self.global_var_samples[i] + else: + variance_pred = variance_pred_raw*self.sigma2_init*self.y_std*self.y_std/self.variance_scale + + return variance_pred def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> np.array: """Predict outcome model components (CATE function and prognostic function) as well as overall outcome for every provided observation. @@ -939,7 +1171,8 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> np.a tuple of np.array Tuple of arrays with as many rows as in ``X`` and as many columns as retained samples of the algorithm. The first entry of the tuple contains conditional average treatment effect (CATE) samples, - the second entry contains prognostic effect samples, and the third entry contains outcome prediction samples + the second entry contains prognostic effect samples, and the third entry contains outcome prediction samples. + The optional fourth array contains variance forest samples. """ if not self.is_sampled(): msg = ( @@ -974,45 +1207,45 @@ def predict(self, X: np.array, Z: np.array, propensity: np.array = None) -> np.a propensity = np.mean(self.bart_propensity_model.predict(X), axis=1, keepdims=True) # Update covariates to include propensities if requested - if self.propensity_covariate == "mu": - X_mu = np.c_[X, propensity] - X_tau = X - elif self.propensity_covariate == "tau": - X_mu = X - X_tau = np.c_[X, propensity] - elif self.propensity_covariate == "both": - X_mu = np.c_[X, propensity] - X_tau = np.c_[X, propensity] - elif self.propensity_covariate == "none": - X_mu = X - X_tau = X + if self.propensity_covariate == "none": + X_combined = X + else: + X_combined = np.c_[X, propensity] - # Prognostic Forest Dataset (covariates) - forest_dataset_mu = Dataset() - forest_dataset_mu.add_covariates(X_mu) - - # Treatment Forest Dataset (covariates and treatment variable) - forest_dataset_tau = Dataset() - forest_dataset_tau.add_covariates(X_tau) - forest_dataset_tau.add_basis(Z) + # Forest dataset + forest_dataset_test = Dataset() + forest_dataset_test.add_covariates(X_combined) + forest_dataset_test.add_basis(Z) # Compute predicted outcome and decomposed outcome model terms - mu_raw = self.forest_container_mu.forest_container_cpp.Predict(forest_dataset_tau.dataset_cpp) - mu_x = mu_raw*self.y_std + self.y_bar - tau_raw = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_tau.dataset_cpp) - tau_raw = tau_raw + mu_raw = self.forest_container_mu.forest_container_cpp.Predict(forest_dataset_test.dataset_cpp) + mu_x = mu_raw*self.y_std/np.sqrt(self.variance_scale) + self.y_bar + tau_raw = self.forest_container_tau.forest_container_cpp.PredictRaw(forest_dataset_test.dataset_cpp) if self.adaptive_coding: adaptive_coding_weights = np.expand_dims(self.b1_samples - self.b0_samples, axis=(0,2)) tau_raw = tau_raw*adaptive_coding_weights - tau_x = np.squeeze(tau_raw*self.y_std) + tau_x = np.squeeze(tau_raw*self.y_std/np.sqrt(self.variance_scale)) if Z.shape[1] > 1: treatment_term = np.multiply(np.atleast_3d(Z).swapaxes(1,2),tau_x).sum(axis=2) else: treatment_term = Z*np.squeeze(tau_x) yhat_x = mu_x + treatment_term + + # Compute predictions from the variance forest (if included) + if self.include_variance_forest: + sigma2_x_raw = self.forest_container_variance.forest_container_cpp.Predict(forest_dataset_test.dataset_cpp) + if self.sample_sigma_global: + sigma2_x = sigma2_x_raw + for i in range(self.num_samples): + sigma2_x[:,i] = sigma2_x_raw[:,i]*self.global_var_samples[i] + else: + sigma2_x = sigma2_x_raw*self.sigma2_init*self.y_std*self.y_std/self.variance_scale # Return result matrices as a tuple - return (tau_x, mu_x, yhat_x) + if self.include_variance_forest: + return (tau_x, mu_x, yhat_x, sigma2_x) + else: + return (tau_x, mu_x, yhat_x) def to_json(self) -> str: """ @@ -1054,7 +1287,8 @@ def to_json(self) -> str: bcf_json.add_scalar("num_burnin", self.num_burnin) bcf_json.add_scalar("num_mcmc", self.num_mcmc) bcf_json.add_scalar("num_samples", self.num_samples) - bcf_json.add_scalar("adaptive_coding", self.adaptive_coding) + bcf_json.add_boolean("adaptive_coding", self.adaptive_coding) + bcf_json.add_string("propensity_covariate", self.propensity_covariate) # Add parameter samples if self.sample_sigma_global: @@ -1079,44 +1313,48 @@ def from_json(self, json_string: str) -> None: JSON string representing model metadata (hyperparameters), sampled parameters, and sampled forests """ # Parse string to a JSON object in C++ - bart_json = JSONSerializer() - bart_json.load_from_json_string(json_string) + bcf_json = JSONSerializer() + bcf_json.load_from_json_string(json_string) # Unpack forests - self.include_variance_forest = bart_json.get_boolean("include_variance_forest") + self.include_variance_forest = bcf_json.get_boolean("include_variance_forest") # TODO: don't just make this a placeholder that we overwrite self.forest_container_mu = ForestContainer(0, 0, False, False) - self.forest_container_mu.forest_container_cpp.LoadFromJson(bart_json.json_cpp, "forest_0") + self.forest_container_mu.forest_container_cpp.LoadFromJson(bcf_json.json_cpp, "forest_0") # TODO: don't just make this a placeholder that we overwrite self.forest_container_tau = ForestContainer(0, 0, False, False) - self.forest_container_tau.forest_container_cpp.LoadFromJson(bart_json.json_cpp, "forest_1") + self.forest_container_tau.forest_container_cpp.LoadFromJson(bcf_json.json_cpp, "forest_1") if self.include_variance_forest: # TODO: don't just make this a placeholder that we overwrite self.forest_container_variance = ForestContainer(0, 0, False, False) - self.forest_container_variance.forest_container_cpp.LoadFromJson(bart_json.json_cpp, "forest_2") + self.forest_container_variance.forest_container_cpp.LoadFromJson(bcf_json.json_cpp, "forest_2") # Unpack global parameters - self.variance_scale = bart_json.get_scalar("variance_scale") - self.y_std = bart_json.get_scalar("outcome_scale") - self.y_bar = bart_json.get_scalar("outcome_mean") - self.standardize = bart_json.get_boolean("standardize") - self.sigma2_init = bart_json.get_scalar("sigma2_init") - self.sample_sigma_global = bart_json.get_boolean("sample_sigma_global") - self.sample_sigma_leaf_mu = bart_json.get_boolean("sample_sigma_leaf_mu") - self.sample_sigma_leaf_tau = bart_json.get_boolean("sample_sigma_leaf_tau") - self.num_gfr = bart_json.get_scalar("num_gfr") - self.num_burnin = bart_json.get_scalar("num_burnin") - self.num_mcmc = bart_json.get_scalar("num_mcmc") - self.num_samples = bart_json.get_scalar("num_samples") - self.adaptive_coding = bart_json.get_scalar("adaptive_coding") + self.variance_scale = bcf_json.get_scalar("variance_scale") + self.y_std = bcf_json.get_scalar("outcome_scale") + self.y_bar = bcf_json.get_scalar("outcome_mean") + self.standardize = bcf_json.get_boolean("standardize") + self.sigma2_init = bcf_json.get_scalar("sigma2_init") + self.sample_sigma_global = bcf_json.get_boolean("sample_sigma_global") + self.sample_sigma_leaf_mu = bcf_json.get_boolean("sample_sigma_leaf_mu") + self.sample_sigma_leaf_tau = bcf_json.get_boolean("sample_sigma_leaf_tau") + self.num_gfr = int(bcf_json.get_scalar("num_gfr")) + self.num_burnin = int(bcf_json.get_scalar("num_burnin")) + self.num_mcmc = int(bcf_json.get_scalar("num_mcmc")) + self.num_samples = int(bcf_json.get_scalar("num_samples")) + self.adaptive_coding = bcf_json.get_boolean("adaptive_coding") + self.propensity_covariate = bcf_json.get_string("propensity_covariate") # Unpack parameter samples if self.sample_sigma_global: - self.global_var_samples = bart_json.get_numeric_vector("sigma2_global_samples", "parameters") + self.global_var_samples = bcf_json.get_numeric_vector("sigma2_global_samples", "parameters") if self.sample_sigma_leaf_mu: - self.leaf_scale_mu_samples = bart_json.get_numeric_vector("sigma2_leaf_mu_samples", "parameters") + self.leaf_scale_mu_samples = bcf_json.get_numeric_vector("sigma2_leaf_mu_samples", "parameters") if self.sample_sigma_leaf_tau: - self.leaf_scale_tau_samples = bart_json.get_numeric_vector("sigma2_leaf_tau_samples", "parameters") + self.leaf_scale_tau_samples = bcf_json.get_numeric_vector("sigma2_leaf_tau_samples", "parameters") + if self.adaptive_coding: + self.b1_samples = bcf_json.get_numeric_vector("b1_samples", "parameters") + self.b0_samples = bcf_json.get_numeric_vector("b0_samples", "parameters") # Mark the deserialized model as "sampled" self.sampled = True diff --git a/stochtree/preprocessing.py b/stochtree/preprocessing.py index d0f3f314..c913cab7 100644 --- a/stochtree/preprocessing.py +++ b/stochtree/preprocessing.py @@ -63,27 +63,40 @@ def _preprocess_bcf_params(params: Optional[Dict[str, Any]] = None) -> Dict[str, 'sigma_leaf_tau': None, 'alpha_mu': 0.95, 'alpha_tau': 0.25, + 'alpha_variance': 0.95, 'beta_mu': 2.0, 'beta_tau': 3.0, + 'beta_variance': 2.0, 'min_samples_leaf_mu': 5, 'min_samples_leaf_tau': 5, + 'min_samples_leaf_variance': 5, 'max_depth_mu': 10, 'max_depth_tau': 5, + 'max_depth_variance': 10, 'a_global': 0, 'b_global': 0, 'a_leaf_mu': 3, 'a_leaf_tau': 3, 'b_leaf_mu': None, - 'b_leaf_tau': None, - 'sigma2': None, - 'pct_var_sigma2_init': 0.25, - 'variable_weights': None, + 'b_leaf_tau': None, + 'a_forest' : None, + 'b_forest' : None, + 'sigma2_init': None, + 'variance_forest_leaf_init' : None, + 'pct_var_sigma2_init': 1, + 'pct_var_variance_forest_init' : 1, + 'variable_weights_mu': None, + 'variable_weights_tau': None, + 'variable_weights_variance': None, 'keep_vars_mu': None, 'drop_vars_mu': None, 'keep_vars_tau': None, 'drop_vars_tau': None, + 'keep_vars_variance': None, + 'drop_vars_variance': None, 'num_trees_mu': 200, 'num_trees_tau': 50, + 'num_trees_variance': 0, 'sample_sigma_global': True, 'sample_sigma_leaf_mu': True, 'sample_sigma_leaf_tau': False, diff --git a/test/python/test_bcf.py b/test/python/test_bcf.py index 8d0136b3..01b733da 100644 --- a/test/python/test_bcf.py +++ b/test/python/test_bcf.py @@ -49,9 +49,10 @@ def test_binary_bcf(self): # Run BCF with test set and propensity score bcf_model = BCFModel() + bcf_params = {'num_trees_variance': 0} bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, pi_train=pi_train, X_test=X_test, Z_test=Z_test, pi_test=pi_test, num_gfr=num_gfr, - num_burnin=num_burnin, num_mcmc=num_mcmc) + num_burnin=num_burnin, num_mcmc=num_mcmc, params=bcf_params) # Assertions assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) @@ -73,8 +74,9 @@ def test_binary_bcf(self): # Run BCF without test set and with propensity score bcf_model = BCFModel() + bcf_params = {'num_trees_variance': 0} bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, pi_train=pi_train, - num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc) + num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, params=bcf_params) # Assertions assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) @@ -93,9 +95,10 @@ def test_binary_bcf(self): # Run BCF with test set and without propensity score bcf_model = BCFModel() + bcf_params = {'num_trees_variance': 0} bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, X_test=X_test, Z_test=Z_test, num_gfr=num_gfr, - num_burnin=num_burnin, num_mcmc=num_mcmc) + num_burnin=num_burnin, num_mcmc=num_mcmc, params=bcf_params) # Assertions assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) @@ -119,8 +122,9 @@ def test_binary_bcf(self): # Run BCF without test set and without propensity score bcf_model = BCFModel() + bcf_params = {'num_trees_variance': 0} bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, - num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc) + num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, params=bcf_params) # Assertions assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) @@ -182,9 +186,10 @@ def test_continuous_univariate_bcf(self): # Run BCF with test set and propensity score bcf_model = BCFModel() + bcf_params = {'num_trees_variance': 0} bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, pi_train=pi_train, X_test=X_test, Z_test=Z_test, pi_test=pi_test, num_gfr=num_gfr, - num_burnin=num_burnin, num_mcmc=num_mcmc) + num_burnin=num_burnin, num_mcmc=num_mcmc, params=bcf_params) # Assertions assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) @@ -206,8 +211,9 @@ def test_continuous_univariate_bcf(self): # Run BCF without test set and with propensity score bcf_model = BCFModel() + bcf_params = {'num_trees_variance': 0} bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, pi_train=pi_train, - num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc) + num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, params=bcf_params) # Assertions assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) @@ -226,9 +232,10 @@ def test_continuous_univariate_bcf(self): # Run BCF with test set and without propensity score bcf_model = BCFModel() + bcf_params = {'num_trees_variance': 0} bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, X_test=X_test, Z_test=Z_test, num_gfr=num_gfr, - num_burnin=num_burnin, num_mcmc=num_mcmc) + num_burnin=num_burnin, num_mcmc=num_mcmc, params=bcf_params) # Assertions assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) @@ -252,8 +259,9 @@ def test_continuous_univariate_bcf(self): # Run BCF without test set and without propensity score bcf_model = BCFModel() + bcf_params = {'num_trees_variance': 0} bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, - num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc) + num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, params=bcf_params) # Assertions assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) @@ -317,9 +325,10 @@ def test_multivariate_bcf(self): # Run BCF with test set and propensity score bcf_model = BCFModel() + bcf_params = {'num_trees_variance': 0} bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, pi_train=pi_train, X_test=X_test, Z_test=Z_test, pi_test=pi_test, num_gfr=num_gfr, - num_burnin=num_burnin, num_mcmc=num_mcmc) + num_burnin=num_burnin, num_mcmc=num_mcmc, params=bcf_params) # Assertions assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) @@ -341,8 +350,9 @@ def test_multivariate_bcf(self): # Run BCF without test set and with propensity score bcf_model = BCFModel() + bcf_params = {'num_trees_variance': 0} bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, pi_train=pi_train, - num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc) + num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, params=bcf_params) # Assertions assert (bcf_model.y_hat_train.shape == (n_train, num_mcmc)) @@ -361,13 +371,15 @@ def test_multivariate_bcf(self): # Run BCF with test set and without propensity score with pytest.raises(ValueError): - bcf_model = BCFModel() - bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, - X_test=X_test, Z_test=Z_test, num_gfr=num_gfr, - num_burnin=num_burnin, num_mcmc=num_mcmc) + bcf_model = BCFModel() + bcf_params = {'num_trees_variance': 0} + bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, + X_test=X_test, Z_test=Z_test, num_gfr=num_gfr, + num_burnin=num_burnin, num_mcmc=num_mcmc, params=bcf_params) # Run BCF without test set and without propensity score with pytest.raises(ValueError): - bcf_model = BCFModel() - bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, - num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc) + bcf_model = BCFModel() + bcf_params = {'num_trees_variance': 0} + bcf_model.sample(X_train=X_train, Z_train=Z_train, y_train=y_train, + num_gfr=num_gfr, num_burnin=num_burnin, num_mcmc=num_mcmc, params=bcf_params) diff --git a/test/python/test_json.py b/test/python/test_json.py index 55d30286..66fccfb8 100644 --- a/test/python/test_json.py +++ b/test/python/test_json.py @@ -239,6 +239,7 @@ def test_bcf_string(self): bcf_reloaded = BCFModel() bcf_reloaded.from_json(bcf_json_string) mu_hat_reloaded, tau_hat_reloaded, y_hat_reloaded = bcf_reloaded.predict(X, Z, pi_X) + # print(y_hat_reloaded) np.testing.assert_almost_equal(y_hat_orig, y_hat_reloaded) np.testing.assert_almost_equal(tau_hat_orig, tau_hat_reloaded) np.testing.assert_almost_equal(mu_hat_orig, mu_hat_reloaded) From ea6dce88e3238110d28a8b4c9d8407b4c8e2d445 Mon Sep 17 00:00:00 2001 From: Drew Herren Date: Mon, 9 Dec 2024 23:38:48 -0600 Subject: [PATCH 4/4] Removed commented print statement --- test/python/test_json.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/python/test_json.py b/test/python/test_json.py index 66fccfb8..55d30286 100644 --- a/test/python/test_json.py +++ b/test/python/test_json.py @@ -239,7 +239,6 @@ def test_bcf_string(self): bcf_reloaded = BCFModel() bcf_reloaded.from_json(bcf_json_string) mu_hat_reloaded, tau_hat_reloaded, y_hat_reloaded = bcf_reloaded.predict(X, Z, pi_X) - # print(y_hat_reloaded) np.testing.assert_almost_equal(y_hat_orig, y_hat_reloaded) np.testing.assert_almost_equal(tau_hat_orig, tau_hat_reloaded) np.testing.assert_almost_equal(mu_hat_orig, mu_hat_reloaded)