Skip to content
2 changes: 2 additions & 0 deletions econml/score/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
"""

from .rscorer import RScorer
from .drscorer import DRScorer
from .ensemble_cate import EnsembleCateEstimator

__all__ = ['RScorer',
'DRScorer',
'EnsembleCateEstimator']
277 changes: 277 additions & 0 deletions econml/score/drscorer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
# Copyright (c) PyWhy contributors. All rights reserved.
# Licensed under the MIT License.

from econml.sklearn_extensions.linear_model import StatsModelsLinearRegression
from ..dr import DRLearner
from sklearn.base import clone
import numpy as np
from scipy.special import softmax
from .ensemble_cate import EnsembleCateEstimator


class DRScorer:
""" Scorer based on the DRLearner loss. Fits regression model g (using T-Learner) and propensity model p at fit
time and calculates the regression and propensity of the evaluation data::

g (model_regression) = E[Y | X, W, T]
p (model_propensity) = Pr[T | X, W]

Ydr(g,p) = g(X,W,T) + (Y - g(X,W,T)) / p_T(X,W)

Then for any given cate model calculates the loss::

loss(cate) = E_n[(Ydr(g, p) - cate(X))^2]

Also calculates a baseline loss based on a constant treatment effect model, i.e.::

base_loss = min_{theta} E_n[(Ydr(g, p) - theta)^2]

Returns an analogue of the R-square score for regression::

score = 1 - loss(cate) / base_loss

This corresponds to the extra variance of the outcome explained by introducing heterogeneity
in the effect as captured by the cate model, as opposed to always predicting a constant effect.
A negative score, means that the cate model performs even worse than a constant effect model
and hints at overfitting during training of the cate model.

This method was also advocated in recent work of [Schuleretal2018]_ when compared among several alternatives
for causal model selection and introduced in the work of [NieWager2017]_.

Parameters
----------
model_propensity : scikit-learn classifier or 'auto', default 'auto'
Estimator for Pr[T=t | X, W]. Trained by regressing treatments on (features, controls) concatenated.
Must implement `fit` and `predict_proba` methods. The `fit` method must be able to accept X and T,
where T is a shape (n, ) array.
If 'auto', :class:`~sklearn.linear_model.LogisticRegressionCV` will be chosen.

model_regression : scikit-learn regressor or 'auto', default 'auto'
Estimator for E[Y | X, W, T]. Trained by regressing Y on (features, controls, one-hot-encoded treatments)
concatenated. The one-hot-encoding excludes the baseline treatment. Must implement `fit` and
`predict` methods. If different models per treatment arm are desired, see the
:class:`.MultiModelWrapper` helper class.
If 'auto' :class:`.WeightedLassoCV`/:class:`.WeightedMultiTaskLassoCV` will be chosen.

model_final :
estimator for the final cate model. Trained on regressing the doubly robust potential outcomes
on (features X).

- If X is None, then the fit method of model_final should be able to handle X=None.
- If featurizer is not None and X is not None, then it is trained on the outcome of
featurizer.fit_transform(X).
- If multitask_model_final is True, then this model must support multitasking
and it is trained by regressing all doubly robust target outcomes on (featurized) features simultanteously.
- The output of the predict(X) of the trained model will contain the CATEs for each treatment compared to
baseline treatment (lexicographically smallest). If multitask_model_final is False, it is assumed to be a
mono-task model and a separate clone of the model is trained for each outcome. Then predict(X) of the t-th
clone will be the CATE of the t-th lexicographically ordered treatment compared to the baseline.

multitask_model_final : bool, default False
Whether the model_final should be treated as a multi-task model. See description of model_final.

featurizer : :term:`transformer`, optional
Must support fit_transform and transform. Used to create composite features in the final CATE regression.
It is ignored if X is None. The final CATE will be trained on the outcome of featurizer.fit_transform(X).
If featurizer=None, then CATE is trained on X.

min_propensity : float, default ``1e-6``
The minimum propensity at which to clip propensity estimates to avoid dividing by zero.

categories: 'auto' or list, default 'auto'
The categories to use when encoding discrete treatments (or 'auto' to use the unique sorted values).
The first category will be treated as the control treatment.

cv: int, cross-validation generator or an iterable, default 2
Determines the cross-validation splitting strategy.
Possible inputs for cv are:

- None, to use the default 3-fold cross-validation,
- integer, to specify the number of folds.
- :term:`CV splitter`
- An iterable yielding (train, test) splits as arrays of indices.

For integer/None inputs, if the treatment is discrete
:class:`~sklearn.model_selection.StratifiedKFold` is used, else,
:class:`~sklearn.model_selection.KFold` is used
(with a random shuffle in either case).

Unless an iterable is used, we call `split(concat[W, X], T)` to generate the splits. If all
W, X are None, then we call `split(ones((T.shape[0], 1)), T)`.

mc_iters: int, optional
The number of times to rerun the first stage models to reduce the variance of the nuisances.

mc_agg: {'mean', 'median'}, default 'mean'
How to aggregate the nuisance value for each sample across the `mc_iters` monte carlo iterations of
cross-fitting.

random_state: int, :class:`~numpy.random.mtrand.RandomState` instance or None
If int, random_state is the seed used by the random number generator;
If :class:`~numpy.random.mtrand.RandomState` instance, random_state is the random number generator;
If None, the random number generator is the :class:`~numpy.random.mtrand.RandomState` instance used
by :mod:`np.random<numpy.random>`.

References
----------
.. [NieWager2017] X. Nie and S. Wager.
Quasi-Oracle Estimation of Heterogeneous Treatment Effects.
arXiv preprint arXiv:1712.04912, 2017.
`<https://arxiv.org/pdf/1712.04912.pdf>`_

.. [Schuleretal2018] Alejandro Schuler, Michael Baiocchi, Robert Tibshirani, Nigam Shah.
"A comparison of methods for model selection when estimating individual treatment effects."
Arxiv, 2018
`<https://arxiv.org/pdf/1804.05146.pdf>`_

"""

def __init__(self, *,
model_propensity='auto',
model_regression='auto',
model_final=StatsModelsLinearRegression(),
multitask_model_final=False,
featurizer=None,
min_propensity=1e-6,
categories='auto',
cv=2,
mc_iters=None,
mc_agg='mean',
random_state=None):
self.model_propensity = clone(model_propensity, safe=False)
self.model_regression = clone(model_regression, safe=False)
self.model_final = clone(model_final, safe=False)
self.multitask_model_final = multitask_model_final
self.featurizer = clone(featurizer, safe=False)
self.min_propensity = min_propensity
self.categories = categories
self.cv = cv
self.mc_iters = mc_iters
self.mc_agg = mc_agg
self.random_state = random_state

def fit(self, y, T, X=None, W=None, sample_weight=None, groups=None):
"""

Parameters
----------
Y: (n × d_y) matrix or vector of length n
Outcomes for each sample
T: (n × dₜ) matrix or vector of length n
Treatments for each sample
X: (n × dₓ) matrix, optional
Features for each sample
W: (n × d_w) matrix, optional
Controls for each sample
sample_weight: (n,) vector, optional
Weights for each row
groups: (n,) vector, optional
All rows corresponding to the same group will be kept together during splitting.
If groups is not None, the `cv` argument passed to this class's initializer
must support a 'groups' argument to its split method.

Returns
-------
self
"""
if X is None:
raise ValueError("X cannot be None for the DRScorer!")

self.drlearner_ = DRLearner(model_propensity=self.model_propensity,
model_regression=self.model_regression,
model_final=self.model_final,
multitask_model_final=self.multitask_model_final,
featurizer=self.featurizer,
min_propensity=self.min_propensity,
categories=self.categories,
cv=self.cv,
mc_iters=self.mc_iters,
mc_agg=self.mc_agg,
random_state=self.random_state)
self.drlearner_.fit(y, T, X=None, W=np.hstack([v for v in [X, W] if v is not None]),
sample_weight=sample_weight, groups=groups, cache_values=True)
self.base_score_ = self.drlearner_.score_
self.dx_ = X.shape[1]
return self

def score(self, cate_model):
"""
Parameters
----------
cate_model : instance of fitted BaseCateEstimator

Returns
-------
score : double
An analogue of the DR-square loss for the causal setting.
"""
Y = self.drlearner_._cached_values.Y
T = self.drlearner_._cached_values.T
Y_pred, _ = self.drlearner_._cached_values.nuisances
Ydr = Y_pred[..., 1:] - Y_pred[..., [0]]
X = self.drlearner_._cached_values.W[:, :self.dx_]
sample_weight = self.drlearner_._cached_values.sample_weight
if Ydr.ndim == 1:
Ydr = Ydr.reshape((-1, 1))

effects = cate_model.const_marginal_effect(X).reshape((-1, Ydr.shape[1]))
if sample_weight is not None:
return 1 - np.mean(np.average((Ydr - effects)**2, weights=sample_weight, axis=0)) / self.base_score_
else:
return 1 - np.mean((Ydr - effects) ** 2) / self.base_score_

def best_model(self, cate_models, return_scores=False):
""" Chooses the best among a list of models

Parameters
----------
cate_models : list of instance of fitted BaseCateEstimator
return_scores : bool, default False
Whether to return the list scores of each model
Returns
-------
best_model : instance of fitted BaseCateEstimator
The model that achieves the best score
best_score : double
The score of the best model
scores : list of double
The list of scores for each of the input models. Returned only if `return_scores=True`.
"""
rscores = [self.score(mdl) for mdl in cate_models]
best = np.nanargmax(rscores)
if return_scores:
return cate_models[best], rscores[best], rscores
else:
return cate_models[best], rscores[best]

def ensemble(self, cate_models, eta=1000.0, return_scores=False):
""" Ensembles a list of models based on their performance

Parameters
----------
cate_models : list of instance of fitted BaseCateEstimator
eta : double, default 1000
The soft-max parameter for the ensemble
return_scores : bool, default False
Whether to return the list scores of each model
Returns
-------
ensemble_model : instance of fitted EnsembleCateEstimator
A fitted ensemble cate model that calculates effects based on a weighted
version of the input cate models, weighted by a softmax of their score
performance
ensemble_score : double
The score of the ensemble model
scores : list of double
The list of scores for each of the input models. Returned only if `return_scores=True`.
"""
drscores = np.array([self.score(mdl) for mdl in cate_models])
goodinds = np.isfinite(drscores)
weights = softmax(eta * drscores[goodinds])
goodmodels = [mdl for mdl, good in zip(cate_models, goodinds) if good]
ensemble = EnsembleCateEstimator(cate_models=goodmodels, weights=weights)
ensemble_score = self.score(ensemble)
if return_scores:
return ensemble, ensemble_score, drscores
else:
return ensemble, ensemble_score
97 changes: 97 additions & 0 deletions econml/tests/test_drscorer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Copyright (c) PyWhy contributors. All rights reserved.
# Licensed under the MIT License.

import unittest
import numpy as np
import scipy.special
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils import check_random_state
from econml.score import DRScorer
from econml.dr import DRLearner


class TestDRLearner(unittest.TestCase):
def test_default_models(self):
np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
sigma = 0.001
y = (1 + 0.5 * X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,))
est = DRLearner()
est.fit(y, T, X=X, W=None)
assert est.const_marginal_effect(X[:2]).shape == (2, 2)
assert est.effect(X[:2], T0=0, T1=1).shape == (2,)
assert isinstance(est.score_, float)
assert isinstance(est.score(y, T, X=X), float)
assert len(est.model_cate(T=1).coef_.shape) == 1
assert len(est.model_cate(T=2).coef_.shape) == 1
assert isinstance(est.cate_feature_names(), list)
assert isinstance(est.models_regression[0][0].coef_, np.ndarray)
assert isinstance(est.models_propensity[0][0].coef_, np.ndarray)

def test_custom_models(self):
np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
sigma = 0.01
y = (1 + 0.5 * X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,))
est = DRLearner(
model_propensity=RandomForestClassifier(n_estimators=100, min_samples_leaf=10),
model_regression=RandomForestRegressor(n_estimators=100, min_samples_leaf=10),
model_final=LassoCV(cv=3),
featurizer=None
)
est.fit(y, T, X=X, W=None)
assert isinstance(est.score_, float)
assert est.const_marginal_effect(X[:3]).shape == (3, 2)
assert len(est.model_cate(T=2).coef_.shape) == 1
assert isinstance(est.model_cate(T=2).intercept_, float)
assert len(est.model_cate(T=1).coef_.shape) == 1
assert isinstance(est.model_cate(T=1).intercept_, float)

def test_cv_splitting_strategy(self):
np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
sigma = 0.001
y = (1 + 0.5 * X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,))
est = DRLearner(cv=2)
est.fit(y, T, X=X, W=None)
assert est.const_marginal_effect(X[:2]).shape == (2, 2)

def test_mc_iters(self):
np.random.seed(123)
X = np.random.normal(size=(1000, 3))
T = np.random.binomial(2, scipy.special.expit(X[:, 0]))
sigma = 0.001
y = (1 + 0.5 * X[:, 0]) * T + X[:, 0] + np.random.normal(0, sigma, size=(1000,))
est = DRLearner()
est.fit(y, T, X=X, W=None, inference='bootstrap', n_bootstrap_samples=50)

self.aseertAlmostEqual(est.effect(X[:2], T0=0, T1=1, inference='bootstrap',
n_bootstrap_samples=50).shape[0], 50)
self.assertAlmostEqual(est.effect_interval(X[:2], T0=0, T1=1, alpha=0.05, inference='bootstrap',
n_bootstrap_samples=50).shape, (2, 50, 2))
self.assertAlmostEqual(est.ortho_summary(X[:2], T0=0, T1=1, inference='bootstrap',
n_bootstrap_samples=50).shape, (2, 2, 5))
self.assertAlmostEqual(est.ortho_intervals(X[:2], T0=0, T1=1, inference='bootstrap',
n_bootstrap_samples=50, method='normal').shape, (2, 2, 2, 2))

def test_score(self):
np.random.seed(123)
y = np.random.normal(size=(1000,))
T = np.random.binomial(2, 0.5, size=(1000,))
X = np.random.normal(size=(1000, 3))
est = DRScorer()
est.fit(y, T, X=X, W=None)
score = est.score()
self.assertAlmostEqual(score, 0.05778546)
# Test using baseline method (e.g., RScorer)
# Replace the following lines with the appropriate baseline method and its parameters
baseline_est = BaselineScorer()
baseline_est.fit(y, T, X=X, W=None)
score_baseline = baseline_est.score()
# Perform the comparison test
self.assertAlmostEqual(score_dr, score_baseline)