Reading the NSFG (National Survey of Family Growth) [1] .dat and .dct files.
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Display the data directory. It is joined with file names via `/` below,
# so it is presumably a pathlib.Path -- TODO confirm where it is defined.
data_dir
# Display the joined paths of the data (.dat) and dictionary (.dct) files.
data_dir/dat_file, data_dir/dct_file
Parsing DCT files
# Parse the dictionary (.dct) file into `dct`.
dct = get_dct_dataframe(data_dir/dct_file)
# Preview the first rows of the parsed dictionary.
dct.head()
# (number of rows, number of columns) of the parsed dictionary.
dct.shape
Parsing DAT files
%%time
dat = get_dat_dataframe(data_dir/dat_file, dct)
dat.head()
len(dat), len(dat.columns)
Basic EDA
# Work on an independent copy restricted to the columns of interest, so that
# later modifications do not touch `dat`.
df = dat[columns_of_interest].copy()
# Per-column fraction of missing values (the mean of the boolean null mask).
display_all(df.isnull().mean(), sort_values=True)
# Replace `df` with its cleaned version (see `cleanup` for the rules applied).
df = cleanup(df)
%%time
ax = pd.plotting.scatter_matrix(df.sample(n=100), figsize=(12,12))
# Overlay normalised histograms of `col` for first births and for all other
# births, as identified by `col1`.
col = "prglngth"
col1 = "birthord"
xlim = [27, 46]  # value range shared by both histograms
is_first = df[col1] == 1
# A plain `!= 1` comparison is True for NaN, which would put rows with a
# missing birth order into the "other" group. Require a known value instead.
# NOTE(review): harmless if `cleanup` already drops such rows -- confirm.
is_other = df[col1].notna() & ~is_first
ax = df.loc[is_first, col].hist(alpha=.5, label="first", range=xlim, density=True)
ax = df.loc[is_other, col].hist(ax=ax, alpha=.5, label="other", range=xlim, density=True)
ax.legend()
ax.set_xlabel(col)
plt.show()
Export cleaned NSFG data
# Display the destination path for the cleaned data.
clean_file_path
# Write the cleaned DataFrame as a UTF-8 encoded CSV, without the index column.
df.to_csv(clean_file_path, index=False, encoding="utf8")