Ported Think Stat's `survey.py` script
dct = nsfg.get_dct_dataframe(nsfg.data_dir/nsfg.dct_file)
dct.head()
%%time
dat = nsfg.get_dat_dataframe(nsfg.data_dir/nsfg.dat_file, dct)
df = dat[nsfg.columns_of_interest].copy()
df = nsfg.cleanup(df)
Effect size¶
col = "prglngth"
col1 = "birthord"
length_lim = (27, 46)
length_mask = (df[col] >= length_lim[0]) & (df[col]<=length_lim[1])
first_mask = df[col1]==1
other_mask = df[col1]!=1
m, v = df.loc[(first_mask & length_mask) , col].mean(), df.loc[(first_mask & length_mask) , col].var()
mo, vo = df.loc[(other_mask & length_mask) , col].mean(), df.loc[(other_mask & length_mask) , col].var()
m, v, mo, vo
effect_size(m, mo, v, vo, (first_mask & length_mask).sum(), (other_mask & length_mask).sum(),)
Probability functions¶
x = np.random.choice([1,2,3], p=[.1, .6, .3], size=10)
pmf = get_discrete_pmf(x)
pmf.xk, pmf.pk
ax = pd.DataFrame({"x":pmf.xk, "p":pmf.pk}).plot(kind="bar", x="x", y="p", label="pmf")
ax.plot(pmf.cdf(pmf.xk), '-r', label="cdf")
ax.legend()
samples = np.random.normal(loc=1., scale=2., size=100)
kde = stats.gaussian_kde(samples)
fig, ax = plt.subplots()
x = np.linspace(-10,10,100)
ax.plot(x, kde.evaluate(x))
ax.hist(samples, bins=20, density=True)
plt.show()
k = 2
central_moment(x, k=k), moment(x, k=k), stats.norm.moment(k, scale=2, loc=1)
PearsonsMedianSkewness(x)
Sampling distributions¶
Computing and visualizing biased and unbiased statistics over synthesized data.
population_generator = stats.norm(loc=90., scale=5.) # rv of the population to draw from
sample_size = 9 # number of samples to draw
n_iter = 200 # number of trials drawing `sample_size` samples
Setting up the telemetry storage
process_telemetry = {
"mean": np.zeros(n_iter),
"std_biased": np.zeros(n_iter),
"std_unbiased": np.zeros(n_iter),
"sem": np.zeros(n_iter),
}
Executing n_iter
trials drawing sample_size
samples each
%%time
for i in range(n_iter):
population = population_generator.rvs(size=sample_size)
process_telemetry["mean"][i] = np.mean(population)
process_telemetry["std_biased"][i] = np.std(population)
process_telemetry["std_unbiased"][i] = np.std(population, ddof=1)
process_telemetry["sem"][i] = stats.sem(population)
Visualizing telemetry in serial form
fig, axs = plt.subplots(figsize=(10,7), ncols=1, nrows=4, constrained_layout=True,
sharex=True)
ax = axs[0]
ax.plot(process_telemetry["mean"], marker="o")
ax.set_title("Mean")
ax.set_ylim((population_generator.kwds["loc"]-10, population_generator.kwds["loc"]+10))
ax = axs[1]
ax.plot(process_telemetry["std_unbiased"], marker="o")
ax.set_title("std - unbiased")
ax = axs[2]
ax.plot(process_telemetry["std_biased"], marker="o")
ax.set_title("std - biased")
ax = axs[3]
ax.plot(process_telemetry["sem"], marker="o")
ax.set_title("standard error mean")
plt.show()
Visualizing telemetry using distributions
fig, axs = plt.subplots(figsize=(10,7), ncols=1, nrows=4, constrained_layout=True,)
bins = 50
plot_hist_with_kde(process_telemetry["mean"], left_ext=5, right_ext=5, plot_params={"title":"Mean"},
ax=axs[0])
plot_hist_with_kde(process_telemetry["std_unbiased"], plot_params={"title":"std - unbiased"},
ax=axs[1])
plot_hist_with_kde(process_telemetry["std_biased"], plot_params={"title":"std - biased"},
ax=axs[2])
plot_hist_with_kde(process_telemetry["sem"], plot_params={"title":"standard error mean"},
ax=axs[3])
plt.show()