%matplotlib inline

# Show the configured data directory (defined earlier, outside this excerpt).
data_dir

WindowsPath('../data/nsfg')

# Show the two input paths built from data_dir: the .dat file and the .dct file.
data_dir/dat_file, data_dir/dct_file

(WindowsPath('../data/nsfg/2002FemPreg.dat'),
 WindowsPath('../data/nsfg/2002FemPreg.dct'))

## Parsing DCT files

# Build `dct` from the .dct file and preview its first rows.
dct = get_dct_dataframe(data_dir/dct_file)
dct.head()

# (rows, columns) of the parsed DCT table.
dct.shape

(243, 6)

## Parsing DAT files

%%time
# Build `dat` from the .dat file, passing the parsed `dct` table as the second argument.
dat = get_dat_dataframe(data_dir/dat_file, dct)

Wall time: 1.76 s

# Preview the first rows of the parsed DAT table.
dat.head()

# (rows, columns) of the parsed DAT table.
dat.shape

(13593, 243)

## Basic EDA

# Work on an independent copy of just the columns of interest so later
# changes to `df` never touch `dat`.
df = dat.loc[:, columns_of_interest].copy()

# Fraction of missing values per column: the mean of the boolean null mask
# is the null count divided by the row count.
display_all(df.isna().mean(), sort_values=True)

birthwgt_oz    0.331494
birthwgt_lb    0.327301
birthord       0.327007
agepreg        0.025896
finalwgt       0.000000
pregordr       0.000000
outcome        0.000000
prglngth       0.000000
caseid         0.000000
dtype: float64

# Rebind `df` to the result of cleanup(). NOTE: because the input and output
# share a name, re-running this cell feeds the already-cleaned frame back
# into cleanup() rather than the raw subset.
df = cleanup(df)

%%time
ax = pd.plotting.scatter_matrix(df.sample(n=100), figsize=(12,12))

Wall time: 2.56 s

col = "prglngth"
col1 = "birthord"

# Histogram window for `col`; presumably weeks of pregnancy -- TODO confirm units.
xlim = [27,46]

# Overlay the normalised distribution of `col` for first births against all
# later births on one explicit Axes (no reliance on implicit pyplot state).
fig, ax = plt.subplots()
birth_order = df[col1]
is_first = birth_order == 1
# `birth_order != 1` alone is also True for NaN, which would label rows that
# have no birth order at all as "other". Require a recorded birth order.
# (This is a no-op if cleanup() already dropped those rows.)
is_other = birth_order.notna() & ~is_first

hist_kwargs = dict(alpha=.5, range=xlim, density=True)
df.loc[is_first, col].hist(ax=ax, label="first", **hist_kwargs)
df.loc[is_other, col].hist(ax=ax, label="other", **hist_kwargs)

ax.legend()
ax.set_xlabel(col)
ax.set_ylabel("density")
ax.set_title(f"{col}: first vs. other ({col1})")
plt.show()

## Export cleaned NSFG data

clean_file_path

WindowsPath('../data/2002FemPreg.csv')

# Write the cleaned frame to CSV at clean_file_path, without the index
# column, encoded as UTF-8.
df.to_csv(clean_file_path, index=False, encoding="utf8")

	start	vtype	name	fstring	desc	end
0	1	str	caseid	%12s	RESPONDENT ID NUMBER	13
1	13	int	pregordr	%2f	PREGNANCY ORDER (NUMBER)	15
2	15	int	howpreg_n	%2f	BB-2 # OF WEEKS OR MONTHS CURRENTLY PREGNANT	17
3	17	int	howpreg_p	%1f	BB-2 CURRENT PREGNANCY LENGTH REPORTED IN MONT...	18
4	18	int	moscurrp	%1f	NUMBER OF MONTHS CURRENTLY PREGNANT	19

	caseid	pregordr	howpreg_n	howpreg_p	moscurrp	nowprgdk	pregend1	pregend2	nbrnaliv	multbrth	...	basewgt	adj_mod_basewgt	finalwgt	secu_p	sest	cmintvw
0	1	1	NaN	NaN	NaN	NaN	6.0	NaN	1.0	NaN	...	3410.389399	3869.349602	6448.271112	2	9	1231
1	1	2	NaN	NaN	NaN	NaN	6.0	NaN	1.0	NaN	...	3410.389399	3869.349602	6448.271112	2	9	1231
2	2	1	NaN	NaN	NaN	NaN	5.0	NaN	3.0	5.0	...	7226.301740	8567.549110	12999.542264	2	12	1231
3	2	2	NaN	NaN	NaN	NaN	6.0	NaN	1.0	NaN	...	7226.301740	8567.549110	12999.542264	2	12	1231
4	2	3	NaN	NaN	NaN	NaN	6.0	NaN	1.0	NaN	...	7226.301740	8567.549110	12999.542264	2	12	1231

NSFG

Parsing DCT files¶

`type_sub`[source]

`get_dct_dataframe`[source]

Parsing DAT files¶

`get_dat_dataframe`[source]

Basic EDA¶

`display_all`[source]

`cleanup`[source]

Export cleaned NSFG data¶

NSFG

Parsing DCT files¶

type_sub[source]

get_dct_dataframe[source]

Parsing DAT files¶

get_dat_dataframe[source]

Basic EDA¶

display_all[source]

cleanup[source]

Export cleaned NSFG data¶

`type_sub`[source]

`get_dct_dataframe`[source]

`get_dat_dataframe`[source]

`display_all`[source]

`cleanup`[source]