The current implementation of this notebook achieves the following private leaderboard scores:
- baseline (linear regression on dep_var_stats and one-hot encoded meter): 1.7
- RandomForest, tabular_learner and lgbm individually: ~1.45
- ensembling tabular_learner, RandomForest and lgbm: ~1.4
%load_ext autoreload
%autoreload 2
import subprocess
from multiprocessing import Pool

import tqdm
# Notebook-wide configuration.
pd.set_option("plotting.backend", "plotly")  # pandas .plot() calls go through plotly

# Switches for the optional test-set prediction and Kaggle submission steps.
do_test, do_submit = True, False

data_path = loading.DATA_PATH
# Presumably row limits applied by the loading module -- TODO confirm in `loading`.
loading.N_TRAIN = loading.N_TEST = 100_000
Loading from scratch
%%time
# Load every raw table at once; later cells read the 'meter_train', 'meter_test',
# 'weather_train', 'weather_test' and 'building' entries of the result.
ashrae_data = loading.load_all()
Takes about 3min30
%%time
# Build the training feature frame. `tfms_config` maps a transform name to its
# keyword arguments; commented-out entries are switched off.
# NOTE(review): presumably the transforms run in the listed order -- confirm in
# preprocessing.Processor before reordering entries.
processor = preprocessing.Processor() # t_train=t_train
tfms_config = {
# 'fix_bid_363':{},
# 'fix_bid_1099':{'threshold': 10.},
# 'remove_bad_meter0_readings_of_first_141days': {},
# 'remove_not_summer_0s_meter_2_and_3': {},
# 'remove_0s_meter0': {},
# 'remove_outliers':{'f':10,'dep_var':'meter_reading'},
# 'remove_imputed_weeks':{'dep_var':'meter_reading'},
# 'add_dep_var_stats':{},
'add_random_noise_features':{},
'add_time_features':{},
'add_weather_features':{'fix_time_offset':True,
'add_na_indicators':True,
'impute_nas':True},
'add_building_features':{},
# 'add_onehot_encoded':{},
}
# The call returns the processed frame and `var_names`, a dict whose 'dep_var',
# 'conts' and 'cats' entries are used further down.
df, var_names = processor(ashrae_data['meter_train'], tfms_configs=tfms_config,
df_weather=ashrae_data['weather_train'],
df_building=ashrae_data['building'])
display(df.head(), var_names)
%time
df_test, _ = processor(ashrae_data['meter_test'], tfms_configs=tfms_config,
df_weather=ashrae_data['weather_test'],
df_building=ashrae_data['building'])
df_test = preprocessing.align_test(df, var_names, df_test)
%%time
# Optional cache: write the processed frames and the variable names to disk.
# df.to_parquet(data_path/'X.parquet')
# df_test.to_parquet(data_path/'X_test.parquet')
# pickle.dump(var_names, open(data_path/'var_names.pkl', 'wb'))
%%time
# Optional cache: reload the variable names (same file name as the dump above).
# var_names = preprocessing.load_var_names(data_path/'var_names.pkl')
# var_names
%%time
# Optional cache: reload the processed frames.
# df = preprocessing.load_df(data_path/'X.parquet') #.sample(100000)
# if do_test:
#     df_test = preprocessing.load_df(data_path/'X_test.parquet') #.sample(100000)
len(df), len(df_test)
%%time
n = len(df)
if False: # per building_id and meter sampling
n_sample_per_bid = 500
replace = True
df = (df.groupby(['building_id', 'meter'])
.sample(n=n_sample_per_bid, replace=replace))
if False: # general sampling
frac_samples = .1
replace = False
df = (df.sample(frac=frac_samples, replace=replace))
print(f'using {len(df)} samples = {len(df)/n*100:.2f} %')
Split settings
%%time
# t_train = pd.read_parquet(data_path/'t_train.parquet')
t_train = None
%time
#split_kind = 'random'
#split_kind = 'time'
# split_kind = 'fix_time'
split_kind = 'time_split_day'
train_frac = .9
Splitting
# Split the frame and report the size of the first two sets relative to `df`.
splits = preprocessing.split_dataset(
    df,
    split_kind=split_kind,
    train_frac=train_frac,
    t_train=t_train,
)
n_total = len(df)
n_train, n_valid = len(splits[0]), len(splits[1])
print(f'sets {len(splits)}, train {n_train} = {n_train/n_total:.4f}, valid {n_valid} = {n_valid/n_total:.4f}')
var_names
# Subset: continuous features containing 'meter_reading' and categoricals
# whose name starts with 'meter_'.
var_names_no_anns = dict(
    dep_var=var_names['dep_var'],
    conts=[name for name in var_names['conts'] if 'meter_reading' in name],
    cats=[name for name in var_names['cats'] if name.startswith('meter_')],
)
var_names_no_anns
# Subset: all continuous features plus every categorical that does not start
# with 'meter_'.
var_names_anns = dict(
    dep_var=var_names['dep_var'],
    conts=var_names['conts'],
    cats=[name for name in var_names['cats'] if not name.startswith('meter_')],
)
var_names_anns
Takes about 6 minutes on 100% of the data
%%time
procs = [Categorify, FillMissing, Normalize]
to = feature_testing.get_tabular_object(df,
var_names,
splits=splits,
procs=procs)
Storing `to` (the tabular object)
%%time
# pickle.dump(to, open(data_path/'to.pkl', 'wb'))
Loading `to` (the tabular object)
%%time
# to = pickle.load(open(data_path/'to.pkl', 'rb'))
Creating data loaders
%%time
train_bs = 1000
val_bs = 1000
dls = to.dataloaders(bs=train_bs, val_bs=val_bs)
Saving dls
%%time
# torch.save(dls, data_path/'dls.pkl')
Warning: Takes about 14min with the test set
%%time
test_bs = 1000
if do_test:
test_dl = dls.test_dl(df_test, bs=test_bs)
%%time
# torch.save(test_dl, data_path/'test_dl.pkl')
Loading dls
%%time
# dls = torch.load(data_path/'dls.pkl')
%%time
# test_dl = torch.load(data_path/'test_dl.pkl')
Fastai finding: make sure the test set values are not out of domain. Example: when `timestampYear` is included as a training feature it only takes the value 2016.0 in the training set, whereas the test set contains 2017.0 and 2018.0; this caused the predictions to zero out everywhere.
# Observed target range across train and validation, to sanity-check `y_range`.
(
    np.min([to.train.ys.values.min(), to.valid.ys.values.min()]),
    np.max([to.train.ys.values.max(), to.valid.ys.values.max()]),
)
y_range = (-.1, 17)

# Network shape; larger variants tried: [1600, 800, 400, 200] and
# [4000, 2000, 1000, 500, 250].
layers = [50, 20]
embed_p = 0.  # tried: .01
ps = [.0] * len(layers)  # one dropout value per layer; tried: .1 each, ps[0] = .2

# Other configurations tried: config = None, or
# tabular_config(act_cls=...) with nn.ReLU / Swish / Sine (inplace=True).
config = tabular_config(embed_p=embed_p, ps=ps)

learn = tabular_learner(
    dls,
    y_range=y_range,
    layers=layers,
    n_out=1,
    config=config,
    loss_func=evaluate_torch,
)  # optional extras tried: wd=.01, .to_fp16()
run = -1 # a counter for `fit_one_cycle` executions
%%time
# Optional checkpoint: save the learner under a name describing its layers.
# learn.save('1600-800-400-200')
%%time
# Optional checkpoint: restore the learner saved above.
# learn = learn.load('1600-800-400-200')
# Each execution bumps the run counter (reported later in the submission
# message) and trains with fit_one_cycle(5, lr_max=1e-1).
run += 1
print(f'run #{run}')
learn.fit_one_cycle(5, lr_max=1e-1)
%%time
y_valid_pred, y_valid_true = learn.get_preds()
y_valid_pred_fast = cnr(y_valid_pred)
nb_score = evaluate_torch(y_valid_true,
y_valid_pred).item()
print(f'fastai loss {nb_score:.4f}')
%%time
if do_test:
y_test_pred, _ = learn.get_preds(dl=test_dl)
y_test_pred_fast = cnr(y_test_pred)
y_test_pred = cnr(y_test_pred)
# Pass the validation predictions and targets through `cnr` as well.
y_valid_pred, y_valid_true = cnr(y_valid_pred), cnr(y_valid_true)
Replacing the categorical features with learned embeddings for the tree models
trees_with_embeddings = False
if trees_with_embeddings:
X_emb_train = to.train.xs.copy()
X_emb_val = to.valid.xs.copy()
%%time
ef = EmbeddingFeatures(to, learn)
ef.df_embs['building_id'].iloc[:5,:3]
TODO: fix memory error in the creation of X_emb
%%time
if trees_with_embeddings:
X_emb_train = ef.replace_cat_features_with_embeddings(X_emb_train)
X_emb_val = ef.replace_cat_features_with_embeddings(X_emb_val)
%%time
params = {'n_estimators': 20, 'max_features': 'sqrt', 'n_jobs':1}
model = ensemble.RandomForestRegressor
# params = {}
# model = linear_model.LinearRegression
m = model(**params)
%%time
if trees_with_embeddings:
m.fit(X_emb_train, to.train.ys.values.ravel())
else:
m.fit(to.train.xs.values, to.train.ys.values.ravel())
%%time
if trees_with_embeddings:
y_valid_pred = m.predict(X_emb_val)
else:
y_valid_pred = m.predict(to.valid.xs.values)
y_valid_pred_sk = np.copy(y_valid_pred)
%%time
if do_test:
if trees_with_embeddings:
y_test_pred = ef.predict_with_embeddings(test_dl.xs, m.set_params(n_jobs=-1),
num_workers=1)
else:
y_test_pred = m.predict(test_dl.xs)
y_test_pred_sk = np.copy(y_test_pred)
y_valid_true = to.valid.ys.values.ravel()
nb_score = evaluate_torch(torch.from_numpy(y_valid_true),
torch.from_numpy(y_valid_pred)).item()
print(f'sklearn loss {nb_score:.4f}')
%%time
if trees_with_embeddings:
lgb_train = lgb.Dataset(X_emb_train, to.train.ys.values.ravel())
lgb_eval = lgb.Dataset(X_emb_val, to.valid.ys.values.ravel(),
reference=lgb_train)
else:
lgb_train = lgb.Dataset(to.train.xs.values, to.train.ys.values.ravel())
lgb_eval = lgb.Dataset(to.valid.xs.values, to.valid.ys.values.ravel(),
reference=lgb_train)
params = {
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': 'l2',
'num_leaves': 42,
'learning_rate': 0.5,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': 0
}
%%time
gbm = lgb.train(params, lgb_train,
num_boost_round=10,
valid_sets=lgb_eval,
early_stopping_rounds=5)
%%time
if trees_with_embeddings:
y_valid_pred_lgbm = gbm.predict(X_emb_val,
num_iteration=gbm.best_iteration)
else:
y_valid_pred_lgbm = gbm.predict(to.valid.xs.values,
num_iteration=gbm.best_iteration)
y_valid_pred = np.copy(y_valid_pred_lgbm)
%%time
if do_test:
if trees_with_embeddings:
y_test_pred_lgbm = gbm.predict(X_emb_test,
num_iteration=gbm.best_iteration)
else:
y_test_pred_lgbm = gbm.predict(test_dl.xs.values,
num_iteration=gbm.best_iteration)
y_test_pred = np.copy(y_test_pred_lgbm)
y_valid_true = to.valid.ys.values.ravel()
nb_score = evaluate_torch(torch.from_numpy(y_valid_true),
torch.from_numpy(y_valid_pred)).item()
print(f'lgbm loss {nb_score:.4f}')
y_valid_pred = np.mean([
y_valid_pred_sk,
y_valid_pred_fast,
y_valid_pred_lgbm
], axis=0)
y_valid_true = to.valid.ys.values.ravel()
nb_score = evaluate_torch(torch.from_numpy(y_valid_true),
torch.from_numpy(y_valid_pred)).item()
print(f'ensembling loss {nb_score:.4f}')
%%time
if do_test:
y_test_pred = np.mean([
y_test_pred_sk,
y_test_pred_fast,
y_test_pred_lgbm
], axis=0)
Truth vs. prediction distributions (validation set, and validation truth vs. test-set predictions)
# Histogram of validation targets against validation predictions.
feature_testing.hist_plot_preds(
    pick_random(y_valid_true),
    pick_random(y_valid_pred),
    label0='truth',
    label1='prediction',
)
# Histogram of validation targets against the test-set predictions.
if do_test:
    feature_testing.hist_plot_preds(
        pick_random(y_valid_true),
        pick_random(y_test_pred),
        label0='truth (validation)',
        label1='prediction (test set)',
    ).show()
%%time
miss_cols = [v for v in ['building_id', 'meter','timestamp'] if v not in to.valid.xs.columns]
tmp = to.valid.xs.join(df.loc[:,miss_cols]) if len(miss_cols)>0 else to.valid.xs
bwt = feature_testing.BoldlyWrongTimeseries(tmp, y_valid_true, y_valid_pred)
bwt.run_boldly()
%%time
if do_test:
y_test_pred_original = torch.exp(tensor(y_test_pred)) - 1
y_out = pd.DataFrame(cnr(y_test_pred_original),
columns=['meter_reading'],
index=test_dl.xs.index)
display(y_out.head())
assert len(y_out) == 41697600
%%time
if do_submit:
y_out.to_csv(data_path/'my_submission.csv',
float_format='%.4f')
kaggle competitions submit -c ashrae-energy-prediction -f submission.csv -m "Message"
# Assemble the free-text description that is attached to the Kaggle submission.
act = 'ReLu'
# lin_model_msg = f'baseline (linear regression)'

# `params` holds the LightGBM settings at this point (it is rebound when the
# LightGBM datasets are created), so the sklearn model's settings are read back
# from the estimator itself instead of from `params`.
# NOTE(review): n_jobs reflects any later `m.set_params(n_jobs=...)` call.
rf_param_keys = ('n_estimators', 'max_features', 'n_jobs')
rf_params = {k: v for k, v in m.get_params().items() if k in rf_param_keys}
rf_model_msg = f'RandomForest: {pretty_dictionary(rf_params)}'
if trees_with_embeddings:
    rf_model_msg += ' (with embeddings)'
lgbm_model_msg = f'LGBM: {pretty_dictionary(params)}'
fast_model_msg = f'tabular_learner (run #{run}): act {act}, layers {layers}, ps {ps}, embed_p {embed_p}'

# Alternatives for single-model or two-model submissions:
# model_msg = f'Ensembling tabular_learner and RandomForest ({fast_model_msg}, {rf_model_msg})'
# model_msg = rf_model_msg
# model_msg = lgbm_model_msg
model_msg = f'Ensembling LGBM, tabular_learner and RandomForest ({lgbm_model_msg}, {fast_model_msg}, {rf_model_msg})'

split_msg = f'split kind "{split_kind}" N_TRAIN {loading.N_TRAIN}'
# NOTE(review): the extra `/2` halves the reported percentage -- confirm it is intended.
samples_msg = f'num samples {len(dls.xs)} = {len(dls.xs)/20216100/2*100:.2f} %'
features_msg = 'weather and building features'
score_msg = f'nb score {nb_score:.4f}'

# Earlier hand-written messages, kept for reference:
# message = ['baseline (linear regression on dep_var_stats and 1hot meter) ', '500 obs/bid', f'nb score {nb_score:.4f}']
# message = ['random forest', '500 obs/bid', 'all features', f'nb score {nb_score:.4f}']
# message = ['lightgbm', '500 obs/bid', '100 rounds', '42 leaves', 'lr .5', f'nb score {nb_score:.4f}']
# message = ['tabular_learner', '500 obs/bid', 'all features', f'layers {layers}, embed_p .1, ps [.1,.1,.1]', f'nb score {nb_score:.4f}']
message = ' + '.join([model_msg, samples_msg, split_msg, features_msg, score_msg])
message
if do_test and do_submit:
    print('Submitting...')
    # `message` contains quote characters (e.g. around the split kind), which
    # would break a shell command that wraps it in quotes. Passing an argument
    # list hands every value to the kaggle CLI verbatim, with no shell parsing.
    submit_cmd = [
        'kaggle', 'competitions', 'submit',
        '-c', 'ashrae-energy-prediction',
        '-f', str(data_path / 'my_submission.csv'),
        '-m', message,
    ]
    submit_result = subprocess.run(submit_cmd,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT,
                                   universal_newlines=True)
    # Show the CLI output (stderr is merged in); like the shell escape it replaces,
    # a non-zero exit status is reported but does not raise.
    print(submit_result.stdout)
    if submit_result.returncode != 0:
        print(f'kaggle exited with status {submit_result.returncode}')