Commit dc0c612

first version plr
1 parent 8b664ff commit dc0c612

4 files changed: +185 -0 lines changed

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
"""Monte Carlo coverage simulations for PLM."""

from montecover.plm.plr_ate import PLRATECoverageSimulation

__all__ = ["PLRATECoverageSimulation"]
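
With this re-export in place, downstream code can import the simulation class from the subpackage rather than from the submodule; the script added further down in this commit does exactly that:

from montecover.plm import PLRATECoverageSimulation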
Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
from typing import Any, Dict, Optional

import doubleml as dml
from doubleml.datasets import make_plr_CCDDHNR2018
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV

from montecover.base import BaseSimulation


class PLRATECoverageSimulation(BaseSimulation):
    """Simulation study for coverage properties of DoubleMLPLR for ATE estimation."""

    def __init__(
        self,
        config_file: str,
        suppress_warnings: bool = True,
        log_level: str = "INFO",
        log_file: Optional[str] = None,
    ):
        super().__init__(
            config_file=config_file,
            suppress_warnings=suppress_warnings,
            log_level=log_level,
            log_file=log_file,
        )

        # Additional results storage for aggregated results
        self.results_aggregated = []

        # Calculate oracle values
        self._calculate_oracle_values()

    def _process_config_parameters(self):
        """Process simulation-specific parameters from config"""
        # Process ML models in parameter grid
        assert "learners" in self.dml_parameters, "No learners specified in the config file"
        for learner in self.dml_parameters["learners"]:
            assert "ml_g" in learner, "No ml_g specified in the config file"
            assert "ml_m" in learner, "No ml_m specified in the config file"

            # Convert ml_g strings to actual objects
            learner["ml_g"] = self._convert_ml_string_to_object(learner["ml_g"][0])
            learner["ml_m"] = self._convert_ml_string_to_object(learner["ml_m"][0])

    def _convert_ml_string_to_object(self, ml_string):
        """Convert a string to a machine learning object."""
        if ml_string == "Lasso":
            learner = LassoCV()
        elif ml_string == "Random Forest":
            learner = RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)
        elif ml_string == "LGBM":
            learner = LGBMRegressor(n_estimators=100, learning_rate=0.05, verbose=-1)
        else:
            raise ValueError(f"Unknown learner type: {ml_string}")

        return (ml_string, learner)

    def _calculate_oracle_values(self):
        """Calculate oracle values for the simulation."""
        self.logger.info("Calculating oracle values")

        self.oracle_values = dict()
        self.oracle_values["theta"] = self.dgp_parameters["theta"]

    def run_single_rep(self, dml_data, dml_params) -> Dict[str, Any]:
        """Run a single repetition with the given parameters."""
        # Extract parameters
        learner_g_name, ml_g = dml_params["learners"]["ml_g"]
        learner_m_name, ml_m = dml_params["learners"]["ml_m"]
        score = dml_params["score"]

        # Model
        dml_model = dml.DoubleMLPLR(
            obj_dml_data=dml_data,
            ml_l=ml_g,
            ml_m=ml_m,
            ml_g=ml_g if score == "IV-type" else None,
            score=score,
        )
        dml_model.fit()

        result = {
            "coverage": [],
        }
        for level in self.confidence_parameters["level"]:
            level_result = dict()
            level_result["coverage"] = self._compute_coverage(
                thetas=dml_model.coef,
                oracle_thetas=self.oracle_values["theta"],
                confint=dml_model.confint(level=level),
                joint_confint=None,
            )

            # add parameters to the result
            for res in level_result.values():
                res.update(
                    {
                        "Learner g": learner_g_name,
                        "Learner m": learner_m_name,
                        "Score": score,
                        "level": level,
                    }
                )
            for key, res in level_result.items():
                result[key].append(res)

        return result

    def summarize_results(self):
        """Summarize the simulation results."""
        self.logger.info("Summarizing simulation results")

        # Group by parameter combinations
        groupby_cols = ["Learner g", "Learner m", "Score", "level"]
        aggregation_dict = {
            "Coverage": "mean",
            "CI Length": "mean",
            "Bias": "mean",
        }

        # Aggregate results (possibly multiple result dfs)
        result_summary = dict()
        for result_name, result_df in self.results.items():
            result_summary[result_name] = result_df.groupby(groupby_cols).agg(aggregation_dict).reset_index()
            self.logger.debug(f"Summarized {result_name} results")

        return result_summary

    def _generate_dml_data(self, dgp_params) -> dml.DoubleMLData:
        """Generate data for the simulation."""
        data = make_plr_CCDDHNR2018(
            alpha=dgp_params["theta"], n_obs=dgp_params["n_obs"], dim_x=dgp_params["dim_x"], return_type="DataFrame"
        )
        dml_data = dml.DoubleMLData(data, "y", "d")
        return dml_data
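
The per-repetition metrics come from _compute_coverage, which lives in montecover.base.BaseSimulation and is not part of this commit. Judging from the columns aggregated in summarize_results ("Coverage", "CI Length", "Bias"), a minimal sketch of what such a helper could compute is shown below; this is an assumption about the base class, not its actual implementation, and compute_coverage_sketch is a hypothetical name.

import numpy as np
import pandas as pd


def compute_coverage_sketch(thetas, oracle_thetas, confint: pd.DataFrame):
    # Illustrative stand-in for BaseSimulation._compute_coverage (assumed, not this commit's code):
    # check whether the oracle value lies inside the CI, and record CI length and absolute bias.
    lower = confint.iloc[:, 0].to_numpy()
    upper = confint.iloc[:, 1].to_numpy()
    thetas = np.asarray(thetas)
    return {
        "Coverage": np.mean((lower <= oracle_thetas) & (oracle_thetas <= upper)),
        "CI Length": np.mean(upper - lower),
        "Bias": np.mean(np.abs(thetas - oracle_thetas)),
    }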

scripts/plm/plr_ate.py

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
from montecover.plm import PLRATECoverageSimulation

# Create and run simulation with config file
sim = PLRATECoverageSimulation(
    config_file="scripts/plm/plr_ate_config.yml",
    log_level="INFO",
    log_file="logs/plm/plr_ate_sim.log"
)
sim.run_simulation()
sim.save_results(output_path="results/plm/", file_prefix="plr_ate")

# Save config file for reproducibility
sim.save_config("results/plm/plr_ate_config.yml")

scripts/plm/plr_ate_config.yml

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
# Simulation parameters for PLR ATE Coverage

simulation_parameters:
  repetitions: 10
  max_runtime: 19800 # 5.5 hours in seconds
  random_seed: 42
  n_jobs: -2

dgp_parameters:
  theta: [0.5] # Treatment effect
  n_obs: [500] # Sample size
  dim_x: [20] # Number of covariates

dml_parameters:
  # ML methods for ml_g and ml_m
  learners:
    - ml_g: ["Lasso"]
      ml_m: ["Lasso"]
    - ml_g: ["Random Forest"]
      ml_m: ["Random Forest"]
    - ml_g: ["Lasso"]
      ml_m: ["Random Forest"]
    - ml_g: ["Random Forest"]
      ml_m: ["Lasso"]

  score: ["partialling out", "IV-type"]

confidence_parameters:
  level: [0.95, 0.90] # Confidence levels
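
For reference, _process_config_parameters replaces each learner string in this grid with a (name, estimator) tuple, and run_single_rep receives one combination of learners and score at a time; forming the cross product is presumably handled by the base class, which is not part of this commit. In run_single_rep the ml_g learner is reused as ml_l and is only passed on as ml_g when the score is "IV-type". A single dml_params dictionary for one grid point would look roughly like this (a sketch under those assumptions):

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV

# Hypothetical single point of the (learners x score) grid, after the config
# strings have been converted to (name, estimator) tuples.
dml_params = {
    "learners": {
        "ml_g": ("Lasso", LassoCV()),
        "ml_m": ("Random Forest", RandomForestRegressor(n_estimators=100, max_features=20, max_depth=5, min_samples_leaf=2)),
    },
    "score": "partialling out",
}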
