Applying dowhy tools to the IHDP data set.
import pandas as pd
from sklearn import ensemble, metrics
import dowhy as dw
from bcg.basics import * 

Loading the data

IHDP data: hyperlink lost in export — the raw CSV URL is given in `urls` below (AMLab-Amsterdam/CEVAE repository).

Column descriptions: hyperlink lost in export — see the IHDP dataset documentation in the CEVAE repository.

# Raw IHDP semi-synthetic benchmark, replicate #1, hosted in the CEVAE repo.
urls = {
    'data': 'https://raw.githubusercontent.com/AMLab-Amsterdam/CEVAE/master/datasets/IHDP/csv/ihdp_npci_1.csv',
}

# Column layout of the headerless CSV:
# treatment flag, factual/counterfactual outcomes, the two noiseless
# potential-outcome means (mu0, mu1), then 25 covariates x1..x25.
cols = ['treatment', 'y_factual', 'y_cfactual', 'mu0', 'mu1',
        *[f'x{i}' for i in range(1, 26)]]
print(len(cols), cols)
30 ['treatment', 'y_factual', 'y_cfactual', 'mu0', 'mu1', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x25']
# The CSV ships without a header row, so supply our own column names.
obs = pd.read_csv(urls['data'], header=None, names=cols)

# Transpose the head so all 30 columns are visible at once.
with pd.option_context('display.max_rows', 40):
    display(obs.head().T)
0 1 2 3 4
treatment 1.000000 0.000000 0.000000 0.000000 0.000000
y_factual 5.599916 6.875856 2.996273 1.366206 1.963538
y_cfactual 4.318780 7.856495 6.633952 5.697239 6.202582
mu0 3.268256 6.636059 1.570536 1.244738 1.685048
mu1 6.854457 7.562718 6.121617 5.889125 6.191994
x1 -0.528603 -1.736945 -0.807451 0.390083 -1.045229
x2 -0.343455 -1.802002 -0.202946 0.596582 -0.602710
x3 1.128554 0.383828 -0.360898 -1.850350 0.011465
x4 0.161703 2.244320 -0.879606 -0.879606 0.161703
x5 -0.316603 -0.629189 0.808706 -0.004017 0.683672
x6 1.295216 1.295216 -0.526556 -0.857787 -0.360940
x7 1.000000 0.000000 0.000000 0.000000 1.000000
x8 0.000000 0.000000 0.000000 0.000000 0.000000
x9 1.000000 0.000000 0.000000 0.000000 0.000000
x10 0.000000 1.000000 1.000000 0.000000 0.000000
x11 0.000000 0.000000 0.000000 0.000000 0.000000
x12 0.000000 0.000000 0.000000 1.000000 1.000000
x13 0.000000 1.000000 0.000000 1.000000 1.000000
x14 1.000000 1.000000 2.000000 2.000000 1.000000
x15 0.000000 1.000000 0.000000 0.000000 0.000000
x16 1.000000 1.000000 1.000000 1.000000 1.000000
x17 1.000000 1.000000 0.000000 0.000000 1.000000
x18 1.000000 1.000000 1.000000 1.000000 1.000000
x19 1.000000 1.000000 1.000000 1.000000 1.000000
x20 0.000000 0.000000 0.000000 0.000000 0.000000
x21 0.000000 0.000000 0.000000 0.000000 0.000000
x22 0.000000 0.000000 0.000000 0.000000 0.000000
x23 0.000000 0.000000 0.000000 0.000000 0.000000
x24 0.000000 0.000000 0.000000 0.000000 0.000000
x25 0.000000 0.000000 0.000000 0.000000 0.000000
# Encode the 0/1 treatment indicator as a boolean flag.
obs['treatment'] = obs['treatment'].astype(bool)

Basic analysis

# Summary statistics, transposed so every column fits on screen.
summary = obs.describe().T
with pd.option_context('display.max_rows', 40):
    display(summary)
count mean std min 25% 50% 75% max
treatment 747.0 1.860776e-01 0.389430 0.000000 0.000000 0.000000 0.000000 1.000000
y_factual 747.0 3.159538e+00 2.179956 -1.543902 1.626779 2.577294 4.494637 11.268228
y_cfactual 747.0 5.696107e+00 1.980121 -1.037628 5.053598 6.209686 6.948922 10.171004
mu0 747.0 2.432513e+00 1.281515 0.924453 1.518409 2.114661 2.989305 9.821792
mu1 747.0 6.448580e+00 0.454766 5.591647 6.087863 6.419095 6.765241 7.954804
x1 747.0 4.518177e-17 1.000000 -2.731287 -0.666946 0.165275 0.813759 1.505476
x2 747.0 9.511951e-18 1.000000 -3.800823 -0.602710 0.196818 0.596582 2.595403
x3 747.0 -5.231573e-17 1.000000 -1.850350 -0.733261 -0.360898 0.756191 2.990369
x4 747.0 -6.135208e-16 1.000000 -0.879606 -0.879606 0.161703 0.161703 2.244320
x5 747.0 3.091384e-17 1.000000 -5.130428 -0.566672 0.121017 0.683672 2.371637
x6 747.0 -7.514441e-16 1.000000 -1.851480 -0.857787 -0.029709 0.632754 2.951372
x7 747.0 5.140562e-01 0.500137 0.000000 0.000000 1.000000 1.000000 1.000000
x8 747.0 9.370817e-02 0.291618 0.000000 0.000000 0.000000 0.000000 1.000000
x9 747.0 5.207497e-01 0.499904 0.000000 0.000000 1.000000 1.000000 1.000000
x10 747.0 3.641232e-01 0.481506 0.000000 0.000000 0.000000 1.000000 1.000000
x11 747.0 2.690763e-01 0.443777 0.000000 0.000000 0.000000 1.000000 1.000000
x12 747.0 2.195448e-01 0.414216 0.000000 0.000000 0.000000 0.000000 1.000000
x13 747.0 3.587684e-01 0.479960 0.000000 0.000000 0.000000 1.000000 1.000000
x14 747.0 1.463186e+00 0.498977 1.000000 1.000000 1.000000 2.000000 2.000000
x15 747.0 1.405622e-01 0.347802 0.000000 0.000000 0.000000 0.000000 1.000000
x16 747.0 9.598394e-01 0.196467 0.000000 1.000000 1.000000 1.000000 1.000000
x17 747.0 5.943775e-01 0.491341 0.000000 0.000000 1.000000 1.000000 1.000000
x18 747.0 9.638554e-01 0.186775 0.000000 1.000000 1.000000 1.000000 1.000000
x19 747.0 1.352075e-01 0.342174 0.000000 0.000000 0.000000 0.000000 1.000000
x20 747.0 1.352075e-01 0.342174 0.000000 0.000000 0.000000 0.000000 1.000000
x21 747.0 1.566265e-01 0.363692 0.000000 0.000000 0.000000 0.000000 1.000000
x22 747.0 8.165997e-02 0.274029 0.000000 0.000000 0.000000 0.000000 1.000000
x23 747.0 7.362784e-02 0.261339 0.000000 0.000000 0.000000 0.000000 1.000000
x24 747.0 1.285141e-01 0.334886 0.000000 0.000000 0.000000 0.000000 1.000000
x25 747.0 1.579652e-01 0.364953 0.000000 0.000000 0.000000 0.000000 1.000000
# Pairwise-correlation overview of the observational table (project helper).
show_correlations(obs)

# Model target and inputs: the treatment flag plus every x* covariate.
target = 'y_factual'
in_cols = ['treatment', *(c for c in cols if c.startswith('x'))]
print(in_cols)
['treatment', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x25']
# Restrict to model inputs + target, then fit a random-forest baseline.
obs_sub = obs.loc[:, [*in_cols, target]]
X, y, not_target = get_Xy(obs_sub, target=target)  # project helper split
model = ensemble.RandomForestRegressor(n_estimators=100, max_features='sqrt')
model.fit(X, y)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
# Eyeball the fit, then score feature importance against MSE
# (both are project helpers from bcg.basics).
get_model_feel(model, obs_sub, target=target)

fi_scores = get_feature_importance(
    model, obs_sub, target=target, metric=metrics.mean_squared_error)
fi_scores.head()
variable feature_importance
0 treatment 3.921270
6 x6 1.709902
15 x15 0.532079
5 x5 0.415899
1 x1 0.405943
%%time
part_deps = get_partial_dependencies(model, obs_sub, target=target,
                                     max_num_obs=100, 
                                     max_num_ys=10)
CPU times: user 6.33 s, sys: 93.8 ms, total: 6.42 s
Wall time: 6.52 s
%%time
# Render the partial-dependence curves computed in the previous cell.
plot_partial_dependencies(part_deps, target=target)
CPU times: user 29 s, sys: 11.4 s, total: 40.4 s
Wall time: 21.9 s

Causal analysis

# DoWhy causal model: treatment -> y_factual, with all x* covariates
# declared as common causes (no explicit graph; DoWhy builds one).
confounders = [c for c in cols if c.startswith('x')]
causal_model = dw.CausalModel(
    data=obs_sub,
    treatment='treatment',
    outcome='y_factual',
    common_causes=confounders,
)
causal_model.view_model()
WARNING:dowhy.causal_model:Causal Graph not provided. DoWhy will construct a graph based on data inputs.
INFO:dowhy.causal_graph:If this is observed data (not from a randomized experiment), there might always be missing confounders. Adding a node named "Unobserved Confounders" to reflect this.
INFO:dowhy.causal_model:Model to find the causal effect of treatment ['treatment'] on outcome ['y_factual']
WARNING:dowhy.causal_graph:Warning: Pygraphviz cannot be loaded. Check that graphviz and pygraphviz are installed.
INFO:dowhy.causal_graph:Using Matplotlib for plotting
# Identify the (backdoor) estimand, accepting possible unobserved
# confounding, then estimate the effect via propensity-score matching.
identified_estimand = causal_model.identify_effect(
    proceed_when_unidentifiable=True)
estimate = causal_model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_matching")
print(estimate)
INFO:dowhy.causal_identifier:Common causes of treatment and outcome:['x25', 'x1', 'x13', 'x17', 'x15', 'x22', 'U', 'x24', 'x2', 'x18', 'x23', 'x9', 'x7', 'x19', 'x3', 'x14', 'x21', 'x10', 'x16', 'x4', 'x5', 'x12', 'x11', 'x20', 'x6', 'x8']
WARNING:dowhy.causal_identifier:If this is observed data (not from a randomized experiment), there might always be missing confounders. Causal effect cannot be identified perfectly.
INFO:dowhy.causal_identifier:Continuing by ignoring these unobserved confounders because proceed_when_unidentifiable flag is True.
INFO:dowhy.causal_identifier:Instrumental variables for treatment and outcome:[]
INFO:dowhy.causal_estimator:INFO: Using Propensity Score Matching Estimator
INFO:dowhy.causal_estimator:b: y_factual~treatment+x25+x1+x13+x17+x15+x22+x24+x2+x18+x23+x9+x7+x19+x3+x14+x21+x10+x16+x4+x5+x12+x11+x20+x6+x8
/mnt/c/Programs_wsl/anaconda3/envs/py37_dowhy/lib/python3.7/site-packages/sklearn/utils/validation.py:724: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
*** Causal Estimate ***

## Target estimand
Estimand type: nonparametric-ate
### Estimand : 1
Estimand name: backdoor
Estimand expression:
     d                                                                        
────────────(Expectation(y_factual|x25,x1,x13,x17,x15,x22,x24,x2,x18,x23,x9,x7
d[treatment]                                                                  

                                                 
,x19,x3,x14,x21,x10,x16,x4,x5,x12,x11,x20,x6,x8))
                                                 
Estimand assumption 1, Unconfoundedness: If U→{treatment} and U→y_factual then P(y_factual|treatment,x25,x1,x13,x17,x15,x22,x24,x2,x18,x23,x9,x7,x19,x3,x14,x21,x10,x16,x4,x5,x12,x11,x20,x6,x8,U) = P(y_factual|treatment,x25,x1,x13,x17,x15,x22,x24,x2,x18,x23,x9,x7,x19,x3,x14,x21,x10,x16,x4,x5,x12,x11,x20,x6,x8)
### Estimand : 2
Estimand name: iv
No such variable found!

## Realized estimand
b: y_factual~treatment+x25+x1+x13+x17+x15+x22+x24+x2+x18+x23+x9+x7+x19+x3+x14+x21+x10+x16+x4+x5+x12+x11+x20+x6+x8
## Estimate
Value: 3.9791388232170393

# Sanity check: a permuted (placebo) treatment should drive the
# estimated effect toward zero if the original estimate is genuine.
refute_res = causal_model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="placebo_treatment_refuter",
    placebo_type="permute",
)
print(refute_res)
INFO:dowhy.causal_estimator:INFO: Using Propensity Score Matching Estimator
INFO:dowhy.causal_estimator:b: y_factual~placebo+x25+x1+x13+x17+x15+x22+x24+x2+x18+x23+x9+x7+x19+x3+x14+x21+x10+x16+x4+x5+x12+x11+x20+x6+x8
/mnt/c/Programs_wsl/anaconda3/envs/py37_dowhy/lib/python3.7/site-packages/sklearn/utils/validation.py:724: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Refute: Use a Placebo Treatment
Estimated effect:(3.9791388232170393,)
New effect:(-0.16334815281211698,)