# Name of the dependent variable column used throughout the notebook.
dep_var = 'meter_reading'
# Override the loader's N_TRAIN setting before any data is loaded
# (presumably the number of training rows to use — confirm in `loading`).
loading.N_TRAIN = 100_000
# Base path under which the processed frames and variable names are stored below.
data_path = loading.DATA_PATH
%%time
# Load all tables; the result is indexed below with the keys 'meter_train',
# 'meter_test', 'building', 'weather_train' and 'weather_test'.
ashrae_data = loading.load_all()
%%time
# Split the training meter readings into train / validation index sets.
# Other split kinds that were tried here: 'random', 'time', 'fix_time'.
split_kind = 'time_split_day'
train_frac = .8
t_train = None
splits = split_dataset(
    ashrae_data['meter_train'],
    split_kind=split_kind,
    train_frac=train_frac,
    t_train=t_train,
)
# Report how many sets came back and the share of rows in the first two.
print(f'sets {len(splits)}, train {len(splits[0])} = {len(splits[0])/len(ashrae_data["meter_train"]):.4f}, valid {len(splits[1])} = {len(splits[1])/len(ashrae_data["meter_train"]):.4f}')
# Keep just the timestamps of the training rows; later cells hand this to
# `Processor(t_train=...)`.
t_train = ashrae_data['meter_train'].iloc[splits[0]][['timestamp']]
t_train.head()
%%time
#t_train.to_parquet(data_path/'t_train.parquet')
Visualizing the split
%%time
# Stack the train and validation rows into one frame, tagged by `label`,
# with the readings transformed as log(1 + x) for plotting.
tmp = pd.concat(
    [
        ashrae_data['meter_train'].iloc[idx].assign(
            label=label,
            # log1p is the numerically accurate form of log(x + 1).
            meter_reading=lambda x: np.log1p(x['meter_reading']),
        )
        for label, idx in (('train', splits[0]), ('valid', splits[1]))
    ],
    axis=0, ignore_index=True)
# Only the last expression of a cell is rendered, so show the per-label
# row counts explicitly instead of discarding them.
display(tmp.groupby('label').size())
# Cap the sample size at the number of available rows: `sample` raises when
# asked for more rows than exist (without replacement).
px.scatter(tmp.sample(min(10000, len(tmp))), x='timestamp', y='meter_reading', color='label')
potentially sensible cleaning to do (https://www.kaggle.com/purist1024/ashrae-simple-data-cleanup-lb-1-08-no-leaks):
- remove all 0s for meter 0
- remove all 0s for meter 2 and 3 if not summer
- potentially remove 0s for meter 1 during winter
- remove "known-bad" electrical readings from the first 141 days of the data for site 0 (i.e. UCF)
- remove the most absurdly high readings from building 1099. These are orders of magnitude higher than all other data, and have been empirically seen in LB probes to be harmful outliers.
- fix the time zone for weather data
- impute nas for weather data
- convert cyclic features, like hour, to 2d features (sin,cos)
Only applying the `dep_var` transform
%%time
# Run the processor on the training meter readings without any transform
# config and show the head of the result plus the returned variable names.
processor = Processor()
_df, _vars = processor(ashrae_data['meter_train'])
display(_df.head(), _vars)
# Histogram of `meter_reading_log1p`, one facet row per meter, drawn from
# 1000 sampled rows per meter.
px.histogram(
    _df.groupby('meter').sample(1000),
    facet_row='meter',
    x='meter_reading_log1p',
)
The meter readings for `building_id` 363 before July 30th are likely due to a construction phase, since the building's year value is 2017. This method removes the readings taken during the construction time.
%%time
# Apply only the `fix_bid_363` transform to the training meter readings.
tfms_config = dict(fix_bid_363={})
processor = Processor()
_df, _vars = processor(
    ashrae_data['meter_train'],
    tfms_configs=tfms_config,
)
display(_df.head(), _vars)
%%time
# Same `fix_bid_363` run, this time with the processor constructed from the
# training timestamps `t_train`.
tfms_config = dict(fix_bid_363={})
processor = Processor(t_train=t_train)
_df, _vars = processor(
    ashrae_data['meter_train'],
    tfms_configs=tfms_config,
)
display(_df.head(), _vars)
%%time
# Inspect building 363 / meter 0 in the processed frame `_df`.
it = inspection.InspectTimeseries(
    _df,
    dep_var='meter_reading_log1p',
    building=ashrae_data['building'],
)
it.inspect_boldly(meter=0, bid=363)
There seem to be quite a few imputed / filled values in the meter readings, being visible as constant meter readings for more than a week at a time. This method removes those values.
%%time
# Inspect building 363 / meter 0 in the unprocessed training readings.
it = inspection.InspectTimeseries(
    ashrae_data['meter_train'],
    building=ashrae_data['building'],
)
it.inspect_boldly(meter=0, bid=363)
%%time
# Apply only the `remove_0s_meter0` transform to the training meter readings.
tfms_config = dict(remove_0s_meter0={})
processor = Processor()
_df, _vars = processor(
    ashrae_data['meter_train'],
    tfms_configs=tfms_config,
)
display(_df.head(), _vars)
%%time
# Inspect a meter-0 series in the processed frame `_df`.
it = inspection.InspectTimeseries(
    _df,
    dep_var='meter_reading_log1p',
    building=ashrae_data['building'],
)
it.inspect_boldly(meter=0)
%%time
# Apply only the `remove_not_summer_0s_meter_2_and_3` transform to the
# training meter readings.
tfms_config = dict(remove_not_summer_0s_meter_2_and_3={})
processor = Processor()
_df, _vars = processor(
    ashrae_data['meter_train'],
    tfms_configs=tfms_config,
)
display(_df.head(), _vars)
%%time
# Inspect a meter-2 series in the processed frame `_df`.
it = inspection.InspectTimeseries(
    _df,
    dep_var='meter_reading_log1p',
    building=ashrae_data['building'],
)
it.inspect_boldly(meter=2)
%%time
# Apply only the `remove_bad_meter0_readings_of_first_141days` transform;
# this call also passes the building table to the processor.
tfms_config = dict(remove_bad_meter0_readings_of_first_141days={})
processor = Processor()
_df, _vars = processor(
    ashrae_data['meter_train'],
    tfms_configs=tfms_config,
    df_building=ashrae_data['building'],
)
display(_df.head(), _vars)
%%time
# Inspect a meter-0 series in the processed frame `_df`.
it = inspection.InspectTimeseries(
    _df,
    dep_var='meter_reading_log1p',
    building=ashrae_data['building'],
)
it.inspect_boldly(meter=0)
%%time
# Apply only the `fix_bid_1099` transform to the training meter readings.
tfms_config = dict(fix_bid_1099={})
processor = Processor()
_df, _vars = processor(
    ashrae_data['meter_train'],
    tfms_configs=tfms_config,
)
display(_df.head(), _vars)
%%time
# Same `fix_bid_1099` run, with the processor constructed from the training
# timestamps `t_train`.
tfms_config = dict(fix_bid_1099={})
processor = Processor(t_train=t_train)
_df, _vars = processor(
    ashrae_data['meter_train'],
    tfms_configs=tfms_config,
)
display(_df.head(), _vars)
%%time
# Inspect building 1099 / meter 2 in the processed frame `_df`.
it = inspection.InspectTimeseries(
    _df,
    dep_var='meter_reading_log1p',
    building=ashrae_data['building'],
)
it.inspect_boldly(meter=2, bid=1099)
%%time
# Try `remove_imputed_weeks` on a single series: building 0, meter 0.
tfms_config = dict(remove_imputed_weeks=dict(dep_var='meter_reading'))
processor = Processor()
tmp = ashrae_data['meter_train'].loc[
    lambda d: d['building_id'].eq(0) & d['meter'].eq(0)
]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)
%%time
# Same `remove_imputed_weeks` run on building 0 / meter 0, with the processor
# constructed from the training timestamps `t_train`.
tfms_config = dict(remove_imputed_weeks=dict(dep_var='meter_reading'))
processor = Processor(t_train=t_train)
tmp = ashrae_data['meter_train'].loc[
    lambda d: d['building_id'].eq(0) & d['meter'].eq(0)
]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)
%%time
# Inspect the processed frame `_df` with the default building / meter choice.
it = inspection.InspectTimeseries(
    _df,
    dep_var='meter_reading_log1p',
    building=ashrae_data['building'],
)
it.inspect_boldly()
There are outliers! 😨 Let's remove them as well. Example: `building_id` 60 and `meter` 1.
%%time
# Inspect building 60 / meter 1 in the unprocessed training readings.
it = inspection.InspectTimeseries(
    ashrae_data['meter_train'],
    building=ashrae_data['building'],
)
it.inspect_boldly(meter=1, bid=60)
%%time
# Try `remove_outliers` (f=10) on a single series: building 60, meter 1.
tfms_config = dict(remove_outliers=dict(f=10, dep_var='meter_reading'))
processor = Processor()
tmp = ashrae_data['meter_train'].loc[
    lambda d: d['building_id'].eq(60) & d['meter'].eq(1)
]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)
%%time
# Same `remove_outliers` run on building 60 / meter 1, with the processor
# constructed from the training timestamps `t_train`.
tfms_config = dict(remove_outliers=dict(f=10, dep_var='meter_reading'))
processor = Processor(t_train=t_train)
tmp = ashrae_data['meter_train'].loc[
    lambda d: d['building_id'].eq(60) & d['meter'].eq(1)
]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)
%%time
# Inspect the processed frame `_df` with the default building / meter choice.
it = inspection.InspectTimeseries(
    _df,
    dep_var='meter_reading_log1p',
    building=ashrae_data['building'],
)
it.inspect_boldly()
%%time
# Try `add_random_noise_features` on building 60 / meter 1.
tfms_config = dict(add_random_noise_features={})
processor = Processor()
tmp = ashrae_data['meter_train'].loc[
    lambda d: d['building_id'].eq(60) & d['meter'].eq(1)
]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)
%%time
# Try `add_building_features` on building 60 / meter 1; the building table
# is passed to the processor for this transform.
tfms_config = dict(add_building_features={})
processor = Processor()
tmp = ashrae_data['meter_train'].loc[
    lambda d: d['building_id'].eq(60) & d['meter'].eq(1)
]
_df, _vars = processor(
    tmp,
    df_building=ashrae_data['building'],
    tfms_configs=tfms_config,
)
display(_df.head(), _vars)
%%time
# Try `add_weather_features` on building 60 / meter 1 with every option off.
tfms_config = dict(
    add_weather_features=dict(
        fix_time_offset=False,
        add_na_indicators=False,
        impute_nas=False,
    ),
)
processor = Processor()
tmp = ashrae_data['meter_train'].loc[
    lambda d: d['building_id'].eq(60) & d['meter'].eq(1)
]
_df, _vars = processor(
    tmp,
    df_weather=ashrae_data['weather_train'],
    df_building=ashrae_data['building'],
    tfms_configs=tfms_config,
)
# Transposed so the columns read as rows.
display(_df.head().T, _vars)
%%time
# Try `add_weather_features` on building 60 / meter 1 with only
# `fix_time_offset` switched on.
tfms_config = dict(
    add_weather_features=dict(
        fix_time_offset=True,
        add_na_indicators=False,
        impute_nas=False,
    ),
)
processor = Processor()
tmp = ashrae_data['meter_train'].loc[
    lambda d: d['building_id'].eq(60) & d['meter'].eq(1)
]
_df, _vars = processor(
    tmp,
    df_weather=ashrae_data['weather_train'],
    df_building=ashrae_data['building'],
    tfms_configs=tfms_config,
)
# Transposed so the columns read as rows.
display(_df.head().T, _vars)
%%time
# Try `add_weather_features` on building 60 / meter 1 with `fix_time_offset`
# and `add_na_indicators` switched on, without imputation.
tfms_config = dict(
    add_weather_features=dict(
        fix_time_offset=True,
        add_na_indicators=True,
        impute_nas=False,
    ),
)
processor = Processor()
tmp = ashrae_data['meter_train'].loc[
    lambda d: d['building_id'].eq(60) & d['meter'].eq(1)
]
_df, _vars = processor(
    tmp,
    df_weather=ashrae_data['weather_train'],
    df_building=ashrae_data['building'],
    tfms_configs=tfms_config,
)
# Transposed so the columns read as rows.
display(_df.head().T, _vars)
%%time
# Try `add_weather_features` on building 60 / meter 1 with `fix_time_offset`
# and `impute_nas` switched on, without NA indicator columns.
tfms_config = dict(
    add_weather_features=dict(
        fix_time_offset=True,
        add_na_indicators=False,
        impute_nas=True,
    ),
)
processor = Processor()
tmp = ashrae_data['meter_train'].loc[
    lambda d: d['building_id'].eq(60) & d['meter'].eq(1)
]
_df, _vars = processor(
    tmp,
    df_weather=ashrae_data['weather_train'],
    df_building=ashrae_data['building'],
    tfms_configs=tfms_config,
)
# Transposed so the columns read as rows.
display(_df.head().T, _vars)
%%time
# Try `add_time_features` on building 60 / meter 1.
tfms_config = dict(add_time_features={})
processor = Processor()
tmp = ashrae_data['meter_train'].loc[
    lambda d: d['building_id'].eq(60) & d['meter'].eq(1)
]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
# Transposed so the columns read as rows.
display(_df.head().T, _vars)
%%time
# Try `add_dep_var_stats` on the training rows of building 60 / meter 1.
# `processor` and `tfms_config` are reused by the following test-set cell.
tfms_config = dict(add_dep_var_stats={})
processor = Processor()
mask = (
    ashrae_data['meter_train']['building_id'].eq(60)
    & ashrae_data['meter_train']['meter'].eq(1)
)
tmp = ashrae_data['meter_train'].loc[mask]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
# Transposed so the columns read as rows.
display(_df.head().T, _vars)
%%time
# Run the already-constructed `processor` with the current `tfms_config` on
# the test rows of building 60 / meter 1.
mask = (
    ashrae_data['meter_test']['building_id'].eq(60)
    & ashrae_data['meter_test']['meter'].eq(1)
)
tmp_test = ashrae_data['meter_test'].loc[mask]
_df_test, _ = processor(tmp_test, tfms_configs=tfms_config)
%%time
# Try `add_onehot_encoded` on all training rows with building_id <= 60.
tfms_config = dict(add_onehot_encoded={})
processor = Processor()
tmp = ashrae_data['meter_train'].loc[lambda d: d['building_id'].le(60)]
_df, _vars = processor(tmp, tfms_configs=tfms_config)
display(_df.head(), _vars)
Train set
%%time
# Build the training frame: time, weather and building features, with the
# processor constructed from the training timestamps `t_train`.
# The key order of the config is kept as-is.
tfms_config = dict(
    add_time_features={},
    add_weather_features=dict(
        fix_time_offset=True,
        add_na_indicators=True,
        impute_nas=True,
    ),
    add_building_features={},
)
processor = Processor(t_train=t_train)
tmp = ashrae_data['meter_train']
df_train, var_names = processor(
    tmp,
    df_building=ashrae_data['building'],
    df_weather=ashrae_data['weather_train'],
    tfms_configs=tfms_config,
)
Running through part of the test set
%%time
# Push the test meter table through the same `processor` and `tfms_config`,
# swapping in the test weather table.
tmp = ashrae_data['meter_test']
df_test, _ = processor(
    tmp,
    df_building=ashrae_data['building'],
    df_weather=ashrae_data['weather_test'],
    tfms_configs=tfms_config,
)
Making sure the columns are aligned in train/var and test
%%time
# Replace df_test with the version aligned against df_train / var_names
# (the exact alignment rules live in `align_test`).
df_test = align_test(df_train, var_names, df_test)
Done testing. Let's apply the transforms to the entire data set. This takes about 20 min with `remove_imputed_weeks`.
# Sanity-check the collected variable names (checks are defined in `test_var_names`).
test_var_names(var_names)
%%time
# Persist the variable names under data_path so later notebooks can reload them.
store_var_names(data_path, var_names)
%%time
# var_names = load_var_names(data_path/'var_names.pckl')
%%time
# Write the processed training frame to data_path/'X.parquet'.
store_df(data_path/'X.parquet', df_train)
%%time
# Write the processed test frame to data_path/'X_test.parquet'.
store_df(data_path/'X_test.parquet', df_test)
%%time
# Read the stored training frame back from data_path/'X.parquet' (round-trip check of the file written above).
df_train = load_df(data_path/'X.parquet')
%%time
# Full preprocessing run: noise, time, weather and building features applied
# through `preprocess_all`. The key order of the config is kept as-is.
tfms_config = dict(
    add_random_noise_features={},
    add_time_features={},
    add_weather_features=dict(
        fix_time_offset=True,
        add_na_indicators=True,
        impute_nas=True,
    ),
    add_building_features={},
)
df_train, df_test, var_names = preprocess_all(ashrae_data, tfms_config)