Applying DoWhy's causal-inference tools to the IHDP (Infant Health and Development Program) data set.
import pandas as pd
from sklearn import ensemble, metrics
import dowhy as dw
from bcg.basics import *  # local helpers used below: show_correlations, get_Xy, get_model_feel, ...
urls = {
'data': 'https://raw.githubusercontent.com/AMLab-Amsterdam/CEVAE/master/datasets/IHDP/csv/ihdp_npci_1.csv',
}
cols = ['treatment', 'y_factual', 'y_cfactual', 'mu0', 'mu1'] + \
[f'x{i}' for i in range(1, 26)]
print(len(cols), cols)
# treatment, y_factual, y_cfactual, mu0, mu1, x1, …, x25
obs = pd.read_csv(urls['data'], header=None, names=cols)
with pd.option_context('display.max_rows', 40):
display(obs.head().T)
obs['treatment'] = obs.treatment.astype(bool)
with pd.option_context('display.max_rows', 40):
display(obs.describe().T)
show_correlations(obs)
target = 'y_factual'
in_cols = ['treatment'] + [v for v in cols if v.startswith('x')]
print(in_cols)
obs_sub = obs.loc[:, in_cols + [target]]
X, y, not_target = get_Xy(obs_sub, target=target)
model = ensemble.RandomForestRegressor(n_estimators=100, max_features='sqrt')
model.fit(X, y)
get_model_feel(model, obs_sub, target=target)
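As an independent check on the fit quality (a sketch; assumes X and y are the frames returned by get_Xy above):
from sklearn.model_selection import cross_val_score
# Cross-validated R^2 for the random forest; cross_val_score refits clones of the model.
cv_r2 = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f'cross-validated R^2: {cv_r2.mean():.3f} +/- {cv_r2.std():.3f}')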
fi_scores = get_feature_importance(model, obs_sub, target=target,
metric=metrics.mean_squared_error)
fi_scores.head()
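For comparison with the custom importance scores, scikit-learn's permutation importance on the same fitted model (a sketch; assumes X is a DataFrame):
from sklearn.inspection import permutation_importance
# Shuffle each column in turn and measure the drop in score; a larger drop means a more important feature.
perm = permutation_importance(model, X, y, n_repeats=5, random_state=0)
pd.Series(perm.importances_mean, index=X.columns).sort_values(ascending=False).head()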
%%time
part_deps = get_partial_dependencies(model, obs_sub, target=target,
max_num_obs=100,
max_num_ys=10)
%%time
plot_partial_dependencies(part_deps, target=target)
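scikit-learn's built-in partial-dependence plot gives a quick cross-check on a single column (a sketch; assumes X is a DataFrame holding in_cols):
from sklearn.inspection import PartialDependenceDisplay
# Partial dependence of the predicted outcome on one continuous covariate.
PartialDependenceDisplay.from_estimator(model, X, features=['x1'])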
causal_model = dw.CausalModel(
    data=obs_sub,
    treatment='treatment',
    outcome='y_factual',
    common_causes=[v for v in cols if v.startswith('x')]
)
causal_model.view_model()
identified_estimand = causal_model.identify_effect(proceed_when_unidentifiable=True)
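Printing the identified estimand shows which adjustment set and assumptions DoWhy relies on:
print(identified_estimand)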
estimate = causal_model.estimate_effect(identified_estimand,
method_name="backdoor.propensity_score_matching")
print(estimate)
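Because IHDP is semi-synthetic, mu0 and mu1 are usually taken to be the noiseless potential-outcome means, which gives a "true" ATE to compare against (a sketch under that assumption):
# Sample-average treatment effect implied by the simulated potential-outcome means.
true_ate = (obs.mu1 - obs.mu0).mean()
print(f'true ATE: {true_ate:.3f}  vs  estimated ATE: {estimate.value:.3f}')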
refute_res = causal_model.refute_estimate(identified_estimand, estimate,
method_name="placebo_treatment_refuter", placebo_type="permute")
print(refute_res)
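DoWhy ships further refuters; the random-common-cause refuter adds an independent synthetic confounder and re-estimates, so the estimate should barely move (a short sketch):
refute_rcc = causal_model.refute_estimate(identified_estimand, estimate,
                                          method_name="random_common_cause")
print(refute_rcc)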