Applying dowhy tools to the IHDP data set.
import pandas as pd
from sklearn import ensemble, metrics
import dowhy as dw
from bcg.basics import * 

Loading the data

IHDP data: hyperlink lost in export — the raw CSV URL is given in `urls` below (AMLab-Amsterdam/CEVAE repository).

Column descriptions: hyperlink lost in export — see the IHDP dataset documentation in the CEVAE repository.

# Raw IHDP semi-synthetic benchmark, replicate #1, hosted in the CEVAE repo.
urls = {
    'data': 'https://raw.githubusercontent.com/AMLab-Amsterdam/CEVAE/master/datasets/IHDP/csv/ihdp_npci_1.csv',
}

# Column layout of the headerless CSV:
# treatment flag, factual/counterfactual outcomes, the two noiseless
# potential-outcome means (mu0, mu1), then 25 covariates x1..x25.
cols = ['treatment', 'y_factual', 'y_cfactual', 'mu0', 'mu1',
        *[f'x{i}' for i in range(1, 26)]]
print(len(cols), cols)
30 ['treatment', 'y_factual', 'y_cfactual', 'mu0', 'mu1', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x25']
# The CSV ships without a header row, so supply our own column names.
obs = pd.read_csv(urls['data'], header=None, names=cols)

# Transpose the head so all 30 columns are visible at once.
with pd.option_context('display.max_rows', 40):
    display(obs.head().T)
0 1 2 3 4
treatment 1.000000 0.000000 0.000000 0.000000 0.000000
y_factual 5.599916 6.875856 2.996273 1.366206 1.963538
y_cfactual 4.318780 7.856495 6.633952 5.697239 6.202582
mu0 3.268256 6.636059 1.570536 1.244738 1.685048
mu1 6.854457 7.562718 6.121617 5.889125 6.191994
x1 -0.528603 -1.736945 -0.807451 0.390083 -1.045229
x2 -0.343455 -1.802002 -0.202946 0.596582 -0.602710
x3 1.128554 0.383828 -0.360898 -1.850350 0.011465
x4 0.161703 2.244320 -0.879606 -0.879606 0.161703
x5 -0.316603 -0.629189 0.808706 -0.004017 0.683672
x6 1.295216 1.295216 -0.526556 -0.857787 -0.360940
x7 1.000000 0.000000 0.000000 0.000000 1.000000
x8 0.000000 0.000000 0.000000 0.000000 0.000000
x9 1.000000 0.000000 0.000000 0.000000 0.000000
x10 0.000000 1.000000 1.000000 0.000000 0.000000
x11 0.000000 0.000000 0.000000 0.000000 0.000000
x12 0.000000 0.000000 0.000000 1.000000 1.000000
x13 0.000000 1.000000 0.000000 1.000000 1.000000
x14 1.000000 1.000000 2.000000 2.000000 1.000000
x15 0.000000 1.000000 0.000000 0.000000 0.000000
x16 1.000000 1.000000 1.000000 1.000000 1.000000
x17 1.000000 1.000000 0.000000 0.000000 1.000000
x18 1.000000 1.000000 1.000000 1.000000 1.000000
x19 1.000000 1.000000 1.000000 1.000000 1.000000
x20 0.000000 0.000000 0.000000 0.000000 0.000000
x21 0.000000 0.000000 0.000000 0.000000 0.000000
x22 0.000000 0.000000 0.000000 0.000000 0.000000
x23 0.000000 0.000000 0.000000 0.000000 0.000000
x24 0.000000 0.000000 0.000000 0.000000 0.000000
x25 0.000000 0.000000 0.000000 0.000000 0.000000
# Encode the 0/1 treatment indicator as a boolean flag.
obs['treatment'] = obs['treatment'].astype(bool)

Basic analysis

# Summary statistics, transposed so every column fits on screen.
summary = obs.describe().T
with pd.option_context('display.max_rows', 40):
    display(summary)
count mean std min 25% 50% 75% max
treatment 747.0 1.860776e-01 0.389430 0.000000 0.000000 0.000000 0.000000 1.000000
y_factual 747.0 3.159538e+00 2.179956 -1.543902 1.626779 2.577294 4.494637 11.268228
y_cfactual 747.0 5.696107e+00 1.980121 -1.037628 5.053598 6.209686 6.948922 10.171004
mu0 747.0 2.432513e+00 1.281515 0.924453 1.518409 2.114661 2.989305 9.821792
mu1 747.0 6.448580e+00 0.454766 5.591647 6.087863 6.419095 6.765241 7.954804
x1 747.0 4.518177e-17 1.000000 -2.731287 -0.666946 0.165275 0.813759 1.505476
x2 747.0 9.511951e-18 1.000000 -3.800823 -0.602710 0.196818 0.596582 2.595403
x3 747.0 -5.231573e-17 1.000000 -1.850350 -0.733261 -0.360898 0.756191 2.990369
x4 747.0 -6.135208e-16 1.000000 -0.879606 -0.879606 0.161703 0.161703 2.244320
x5 747.0 3.091384e-17 1.000000 -5.130428 -0.566672 0.121017 0.683672 2.371637
x6 747.0 -7.514441e-16 1.000000 -1.851480 -0.857787 -0.029709 0.632754 2.951372
x7 747.0 5.140562e-01 0.500137 0.000000 0.000000 1.000000 1.000000 1.000000
x8 747.0 9.370817e-02 0.291618 0.000000 0.000000 0.000000 0.000000 1.000000
x9 747.0 5.207497e-01 0.499904 0.000000 0.000000 1.000000 1.000000 1.000000
x10 747.0 3.641232e-01 0.481506 0.000000 0.000000 0.000000 1.000000 1.000000
x11 747.0 2.690763e-01 0.443777 0.000000 0.000000 0.000000 1.000000 1.000000
x12 747.0 2.195448e-01 0.414216 0.000000 0.000000 0.000000 0.000000 1.000000
x13 747.0 3.587684e-01 0.479960 0.000000 0.000000 0.000000 1.000000 1.000000
x14 747.0 1.463186e+00 0.498977 1.000000 1.000000 1.000000 2.000000 2.000000
x15 747.0 1.405622e-01 0.347802 0.000000 0.000000 0.000000 0.000000 1.000000
x16 747.0 9.598394e-01 0.196467 0.000000 1.000000 1.000000 1.000000 1.000000
x17 747.0 5.943775e-01 0.491341 0.000000 0.000000 1.000000 1.000000 1.000000
x18 747.0 9.638554e-01 0.186775 0.000000 1.000000 1.000000 1.000000 1.000000
x19 747.0 1.352075e-01 0.342174 0.000000 0.000000 0.000000 0.000000 1.000000
x20 747.0 1.352075e-01 0.342174 0.000000 0.000000 0.000000 0.000000 1.000000
x21 747.0 1.566265e-01 0.363692 0.000000 0.000000 0.000000 0.000000 1.000000
x22 747.0 8.165997e-02 0.274029 0.000000 0.000000 0.000000 0.000000 1.000000
x23 747.0 7.362784e-02 0.261339 0.000000 0.000000 0.000000 0.000000 1.000000
x24 747.0 1.285141e-01 0.334886 0.000000 0.000000 0.000000 0.000000 1.000000
x25 747.0 1.579652e-01 0.364953 0.000000 0.000000 0.000000 0.000000 1.000000
# Pairwise-correlation overview of the observational table (project helper).
show_correlations(obs)

# Model target and inputs: the treatment flag plus every x* covariate.
target = 'y_factual'
in_cols = ['treatment', *(c for c in cols if c.startswith('x'))]
print(in_cols)
['treatment', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x25']
# Restrict to model inputs + target, then fit a random-forest baseline.
obs_sub = obs.loc[:, [*in_cols, target]]
X, y, not_target = get_Xy(obs_sub, target=target)  # project helper split
model = ensemble.RandomForestRegressor(n_estimators=100, max_features='sqrt')
model.fit(X, y)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)
# Eyeball the fit, then score feature importance against MSE
# (both are project helpers from bcg.basics).
get_model_feel(model, obs_sub, target=target)

fi_scores = get_feature_importance(
    model, obs_sub, target=target, metric=metrics.mean_squared_error)
fi_scores.head()
variable feature_importance
0 treatment 3.921270
6 x6 1.709902
15 x15 0.532079
5 x5 0.415899
1 x1 0.405943
%%time
part_deps = get_partial_dependencies(model, obs_sub, target=target,
                                     max_num_obs=100, 
                                     max_num_ys=10)
CPU times: user 6.33 s, sys: 93.8 ms, total: 6.42 s
Wall time: 6.52 s
%%time
# Render the partial-dependence curves computed in the previous cell.
plot_partial_dependencies(part_deps, target=target)
CPU times: user 29 s, sys: 11.4 s, total: 40.4 s
Wall time: 21.9 s

Causal analysis

# DoWhy causal model: treatment -> y_factual, with all x* covariates
# declared as common causes (no explicit graph; DoWhy builds one).
confounders = [c for c in cols if c.startswith('x')]
causal_model = dw.CausalModel(
    data=obs_sub,
    treatment='treatment',
    outcome='y_factual',
    common_causes=confounders,
)
causal_model.view_model()
WARNING:dowhy.causal_model:Causal Graph not provided. DoWhy will construct a graph based on data inputs.
INFO:dowhy.causal_graph:If this is observed data (not from a randomized experiment), there might always be missing confounders. Adding a node named "Unobserved Confounders" to reflect this.
INFO:dowhy.causal_model:Model to find the causal effect of treatment ['treatment'] on outcome ['y_factual']
WARNING:dowhy.causal_graph:Warning: Pygraphviz cannot be loaded. Check that graphviz and pygraphviz are installed.
INFO:dowhy.causal_graph:Using Matplotlib for plotting
# Identify the (backdoor) estimand, accepting possible unobserved
# confounding, then estimate the effect via propensity-score matching.
identified_estimand = causal_model.identify_effect(
    proceed_when_unidentifiable=True)
estimate = causal_model.estimate_effect(
    identified_estimand,
    method_name="backdoor.propensity_score_matching")
print(estimate)
INFO:dowhy.causal_identifier:Common causes of treatment and outcome:['x25', 'x1', 'x13', 'x17', 'x15', 'x22', 'U', 'x24', 'x2', 'x18', 'x23', 'x9', 'x7', 'x19', 'x3', 'x14', 'x21', 'x10', 'x16', 'x4', 'x5', 'x12', 'x11', 'x20', 'x6', 'x8']
WARNING:dowhy.causal_identifier:If this is observed data (not from a randomized experiment), there might always be missing confounders. Causal effect cannot be identified perfectly.
INFO:dowhy.causal_identifier:Continuing by ignoring these unobserved confounders because proceed_when_unidentifiable flag is True.
INFO:dowhy.causal_identifier:Instrumental variables for treatment and outcome:[]
INFO:dowhy.causal_estimator:INFO: Using Propensity Score Matching Estimator
INFO:dowhy.causal_estimator:b: y_factual~treatment+x25+x1+x13+x17+x15+x22+x24+x2+x18+x23+x9+x7+x19+x3+x14+x21+x10+x16+x4+x5+x12+x11+x20+x6+x8
/mnt/c/Programs_wsl/anaconda3/envs/py37_dowhy/lib/python3.7/site-packages/sklearn/utils/validation.py:724: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
*** Causal Estimate ***

## Target estimand
Estimand type: nonparametric-ate
### Estimand : 1
Estimand name: backdoor
Estimand expression:
     d                                                                        
────────────(Expectation(y_factual|x25,x1,x13,x17,x15,x22,x24,x2,x18,x23,x9,x7
d[treatment]                                                                  

                                                 
,x19,x3,x14,x21,x10,x16,x4,x5,x12,x11,x20,x6,x8))
                                                 
Estimand assumption 1, Unconfoundedness: If U→{treatment} and U→y_factual then P(y_factual|treatment,x25,x1,x13,x17,x15,x22,x24,x2,x18,x23,x9,x7,x19,x3,x14,x21,x10,x16,x4,x5,x12,x11,x20,x6,x8,U) = P(y_factual|treatment,x25,x1,x13,x17,x15,x22,x24,x2,x18,x23,x9,x7,x19,x3,x14,x21,x10,x16,x4,x5,x12,x11,x20,x6,x8)
### Estimand : 2
Estimand name: iv
No such variable found!

## Realized estimand
b: y_factual~treatment+x25+x1+x13+x17+x15+x22+x24+x2+x18+x23+x9+x7+x19+x3+x14+x21+x10+x16+x4+x5+x12+x11+x20+x6+x8
## Estimate
Value: 3.9791388232170393

# Sanity check: a permuted (placebo) treatment should drive the
# estimated effect toward zero if the original estimate is genuine.
refute_res = causal_model.refute_estimate(
    identified_estimand,
    estimate,
    method_name="placebo_treatment_refuter",
    placebo_type="permute",
)
print(refute_res)
INFO:dowhy.causal_estimator:INFO: Using Propensity Score Matching Estimator
INFO:dowhy.causal_estimator:b: y_factual~placebo+x25+x1+x13+x17+x15+x22+x24+x2+x18+x23+x9+x7+x19+x3+x14+x21+x10+x16+x4+x5+x12+x11+x20+x6+x8
/mnt/c/Programs_wsl/anaconda3/envs/py37_dowhy/lib/python3.7/site-packages/sklearn/utils/validation.py:724: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
Refute: Use a Placebo Treatment
Estimated effect:(3.9791388232170393,)
New effect:(-0.16334815281211698,)