The current implementation of this notebook leads to the following private leaderboard scores:

- baseline (linear regression on dep_var_stats and one-hot encoded meter): 1.7
- RandomForest, tabular_learner and lgbm individually: ~1.45
- ensembling tabular_learner, RandomForest and lgbm: ~1.4

%load_ext autoreload
%autoreload 2

import tqdm
from multiprocessing import Pool

pd.options.plotting.backend = "plotly"

# Run switches: `do_test` gates every test-set step below (test dataloader,
# test predictions, submission frame); `do_submit` gates writing the CSV and
# the kaggle upload.
do_test = True
do_submit = False
data_path = loading.DATA_PATH

# NOTE(review): presumably these cap the number of rows that `loading` reads
# for the train / test sets — confirm against the `loading` module.
loading.N_TRAIN = 100_000
loading.N_TEST = 100_000

Loading from scratch

%%time
# `ashrae_data` is indexed below with the keys 'meter_train', 'meter_test',
# 'weather_train', 'weather_test' and 'building'.
ashrae_data = loading.load_all()

Takes about 3 min 30 s.

%%time
processor = preprocessing.Processor() # t_train=t_train
# Transform name -> keyword arguments handed to `processor` via `tfms_configs`.
# Commented-out entries are transforms that are currently switched off.
tfms_config = {
#     'fix_bid_363':{},
#     'fix_bid_1099':{'threshold': 10.},
#     'remove_bad_meter0_readings_of_first_141days': {},
#     'remove_not_summer_0s_meter_2_and_3': {},
#     'remove_0s_meter0': {},
#     'remove_outliers':{'f':10,'dep_var':'meter_reading'},
#     'remove_imputed_weeks':{'dep_var':'meter_reading'},
#     'add_dep_var_stats':{},
    'add_random_noise_features':{},
    'add_time_features':{},
    'add_weather_features':{'fix_time_offset':True,
                            'add_na_indicators':True,
                            'impute_nas':True},
    'add_building_features':{},
#     'add_onehot_encoded':{},
}

# Process the training meter readings together with the weather and building
# tables; `var_names` is later indexed with 'dep_var', 'conts' and 'cats'.
df, var_names = processor(ashrae_data['meter_train'], tfms_configs=tfms_config,
                          df_weather=ashrae_data['weather_train'],
                          df_building=ashrae_data['building'])
display(df.head(), var_names)

%time
df_test, _ = processor(ashrae_data['meter_test'], tfms_configs=tfms_config,
                         df_weather=ashrae_data['weather_test'],
                         df_building=ashrae_data['building'])
df_test = preprocessing.align_test(df, var_names, df_test)

%%time
# df.to_parquet(data_path/'X.parquet')
# df_test.to_parquet(data_path/'X_test.parquet')
# pickle.dump(var_names, open(data_path/'var_names.pkl', 'wb'))

Loading

%%time
# NOTE(review): the (commented-out) storing cell writes 'var_names.pkl' while
# this line reads 'var_names.pckl' — confirm which file name is intended
# before re-enabling either of them.
# var_names = preprocessing.load_var_names(data_path/'var_names.pckl')
# var_names

%%time
# df = preprocessing.load_df(data_path/'X.parquet') #.sample(100000)

# if do_test:
#     df_test = preprocessing.load_df(data_path/'X_test.parquet') #.sample(100000)

len(df), len(df_test)

Sampling `df`

%%time
n = len(df)

if False: # per building_id and meter sampling
    n_sample_per_bid = 500
    replace = True

    df = (df.groupby(['building_id', 'meter'])
         .sample(n=n_sample_per_bid, replace=replace))

if False: # general sampling
    frac_samples = .1
    replace = False

    df = (df.sample(frac=frac_samples, replace=replace))

print(f'using {len(df)} samples = {len(df)/n*100:.2f} %')

Split

Split settings

%%time
# `t_train` is forwarded to `preprocessing.split_dataset` below; None unless
# the stored frame is loaded via the commented-out line.
# t_train = pd.read_parquet(data_path/'t_train.parquet')
t_train = None

%time
#split_kind = 'random'
#split_kind = 'time'
# split_kind = 'fix_time'
split_kind = 'time_split_day'
train_frac = .9

Splitting

# `splits[0]` is reported as the training part and `splits[1]` as the
# validation part; the print shows each size and its share of `df`.
splits = preprocessing.split_dataset(df, split_kind=split_kind, train_frac=train_frac,
                                     t_train=t_train)
print(f'sets {len(splits)}, train {len(splits[0])} = {len(splits[0])/len(df):.4f}, valid {len(splits[1])} = {len(splits[1])/len(df):.4f}')

var_names

# Reduced variable set: only continuous variables whose name contains
# 'meter_reading' and categorical variables whose name starts with 'meter_'.
var_names_no_anns = dict(
    dep_var=var_names['dep_var'],
    conts=[name for name in var_names['conts'] if 'meter_reading' in name],
    cats=[name for name in var_names['cats'] if name.startswith('meter_')],
)
var_names_no_anns

# Variable set that keeps every continuous variable and drops the categorical
# variables whose name starts with 'meter_'.
var_names_anns = dict(
    dep_var=var_names['dep_var'],
    conts=var_names['conts'],
    cats=[name for name in var_names['cats'] if not name.startswith('meter_')],
)
var_names_anns

Takes about 6 minutes on 100% of the data

%%time
# Build the tabular object from `df` using the full `var_names`, the split
# indices computed above and the three preprocessing procs.
procs = [Categorify, FillMissing, Normalize]
to = feature_testing.get_tabular_object(df,
                                        var_names,
                                        splits=splits,
                                        procs=procs)

Storing to

%%time
# pickle.dump(to, open(data_path/'to.pkl', 'wb'))

Loading to

%%time
# to = pickle.load(open(data_path/'to.pkl', 'rb'))

Creating data loaders

%%time
# Batch sizes for the training and validation loaders created from `to`.
train_bs = 1000
val_bs = 1000

dls = to.dataloaders(bs=train_bs, val_bs=val_bs)

Saving dls

%%time
# torch.save(dls, data_path/'dls.pkl')

Warning: Takes about 14min with the test set

%%time
# Batch size for the test loader; the loader is only built when `do_test` is set.
test_bs = 1000

if do_test:
    test_dl = dls.test_dl(df_test, bs=test_bs)

%%time
# torch.save(test_dl, data_path/'test_dl.pkl')

Loading dls

%%time
# dls = torch.load(data_path/'dls.pkl')

%%time
# test_dl = torch.load(data_path/'test_dl.pkl')

Modelling with

`fastai`

Fastai finding: make sure the test set values are not out of domain. In this notebook, `timestampYear` was included as a training feature, but it only takes the value 2016.0 in the training set, whereas in the test set it is 2017.0 and 2018.0. This caused the predictions to zero out everywhere.

np.min([to.train.ys.values.min(), to.valid.ys.values.min()]), np.max([to.train.ys.values.max(), to.valid.ys.values.max()])

# Value range handed to `tabular_learner` through `y_range`.
y_range = (-.1, 17)

# Hidden layer sizes. Larger stacks tried before:
# [4000, 2000, 1000, 500, 250] and [1600, 800, 400, 200]
layers = [50, 20]

# Passed to `tabular_config` as `embed_p` (previously .01).
embed_p = 0.

# One entry per hidden layer, all zero
# (previously [.1, .1, .1, .1, .1], optionally with ps[0] = .2).
ps = [.0] * len(layers)

# Alternatives tried before: `config = None`, or `tabular_config` with
# act_cls=Swish(inplace=True), act_cls=nn.ReLU(inplace=True) or
# act_cls=Sine(inplace=True).
config = tabular_config(embed_p=embed_p, ps=ps)

learn = tabular_learner(
    dls,
    y_range=y_range,
    layers=layers,
    n_out=1,
    config=config,
    # wd=.01,
    loss_func=evaluate_torch,
)  # .to_fp16()
run = -1 # a counter for `fit_one_cycle` executions

%%time
# learn.save('1600-800-400-200')

%%time
# learn = learn.load('1600-800-400-200')

# Bump the run counter (it is reported in the submission message) each time
# this cell is executed, then train for 5 epochs.
run += 1
print(f'run #{run}')
learn.fit_one_cycle(5, lr_max=1e-1)

%%time
# Validation predictions of the network; the `cnr` version is kept under its
# own name because the ensembling step averages it with the other models.
y_valid_pred, y_valid_true = learn.get_preds()
y_valid_pred_fast = cnr(y_valid_pred)

nb_score = evaluate_torch(y_valid_true, 
                          y_valid_pred).item()
print(f'fastai loss {nb_score:.4f}')

%%time
if do_test:
    y_test_pred, _ = learn.get_preds(dl=test_dl)
    # Two separate `cnr` calls on the same tensor: one result is kept for the
    # ensemble, the other becomes the current `y_test_pred`.
    y_test_pred_fast = cnr(y_test_pred)
    y_test_pred = cnr(y_test_pred)

# Convert the validation tensors last, after the loss above was computed on them.
y_valid_pred, y_valid_true = cnr(y_valid_pred), cnr(y_valid_true)

Replacing categorical features for trees with learned embeddings

# Switch: when True, the tree models below are trained / evaluated on copies
# of the feature frames passed through `ef.replace_cat_features_with_embeddings`.
trees_with_embeddings = False

if trees_with_embeddings:
    # Work on copies so `to.train.xs` / `to.valid.xs` stay untouched.
    X_emb_train = to.train.xs.copy()
    X_emb_val = to.valid.xs.copy()

%%time
# Built unconditionally, i.e. also when `trees_with_embeddings` is False.
ef = EmbeddingFeatures(to, learn)

# Peek at the first 5 rows / 3 columns of the 'building_id' embedding table.
ef.df_embs['building_id'].iloc[:5,:3]

TODO: fix memory error in the creation of X_emb

%%time
if trees_with_embeddings:
    # Overwrite the copies made earlier with their embedding-expanded versions.
    X_emb_train = ef.replace_cat_features_with_embeddings(X_emb_train)
    X_emb_val = ef.replace_cat_features_with_embeddings(X_emb_val)

`sklearn`

%%time
# Estimator class and its constructor arguments; the commented-out pair is the
# linear-regression alternative.
# NOTE: the name `params` is assigned again in the lightgbm section.
params = {'n_estimators': 20, 'max_features': 'sqrt', 'n_jobs':1}
model = ensemble.RandomForestRegressor
# params = {}
# model = linear_model.LinearRegression

m = model(**params)

%%time
if trees_with_embeddings:
    m.fit(X_emb_train, to.train.ys.values.ravel())
else:
    m.fit(to.train.xs.values, to.train.ys.values.ravel())

%%time
if trees_with_embeddings:
    y_valid_pred = m.predict(X_emb_val)
else:
    y_valid_pred = m.predict(to.valid.xs.values)

y_valid_pred_sk = np.copy(y_valid_pred)

Note: the prediction with embeddings takes ~ 37 minutes.

%%time
if do_test:
    if trees_with_embeddings:
        y_test_pred = ef.predict_with_embeddings(test_dl.xs, m.set_params(n_jobs=-1), 
                                                 num_workers=1)
    else:
        y_test_pred = m.predict(test_dl.xs)
    y_test_pred_sk = np.copy(y_test_pred)

# Score the current `y_valid_pred` against the flattened validation targets.
y_valid_true = to.valid.ys.values.ravel()
nb_score = evaluate_torch(torch.from_numpy(y_valid_true), 
                          torch.from_numpy(y_valid_pred)).item()
print(f'sklearn loss {nb_score:.4f}')

`lightgbm`

%%time
# Training and evaluation datasets; the evaluation set references the
# training set via `reference=lgb_train`.
if trees_with_embeddings:
    lgb_train = lgb.Dataset(X_emb_train, to.train.ys.values.ravel())
    lgb_eval = lgb.Dataset(X_emb_val, to.valid.ys.values.ravel(), 
                           reference=lgb_train)
else:
    lgb_train = lgb.Dataset(to.train.xs.values, to.train.ys.values.ravel())
    lgb_eval = lgb.Dataset(to.valid.xs.values, to.valid.ys.values.ravel(), 
                           reference=lgb_train)

# NOTE: this re-binds `params`, which held the sklearn model's settings; from
# here on `params` refers to the LightGBM settings only.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'num_leaves': 42,
    'learning_rate': 0.5,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

%%time
# Train for at most 10 boosting rounds, monitoring `lgb_eval`.
# NOTE(review): newer LightGBM releases reportedly drop the
# `early_stopping_rounds` keyword of `lgb.train` in favour of an
# early-stopping callback — confirm against the installed version.
gbm = lgb.train(params, lgb_train,
                num_boost_round=10,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)

%%time
if trees_with_embeddings:
    y_valid_pred_lgbm = gbm.predict(X_emb_val,
                                    num_iteration=gbm.best_iteration)
else:
    y_valid_pred_lgbm = gbm.predict(to.valid.xs.values,
                                    num_iteration=gbm.best_iteration)

y_valid_pred = np.copy(y_valid_pred_lgbm)

%%time
if do_test:
    if trees_with_embeddings:
        y_test_pred_lgbm = gbm.predict(X_emb_test,
                                       num_iteration=gbm.best_iteration)
    else:
        y_test_pred_lgbm = gbm.predict(test_dl.xs.values,
                                       num_iteration=gbm.best_iteration)
    y_test_pred = np.copy(y_test_pred_lgbm)

# Score the current `y_valid_pred` (the lgbm predictions) against the
# flattened validation targets.
y_valid_true = to.valid.ys.values.ravel()
nb_score = evaluate_torch(torch.from_numpy(y_valid_true), 
                          torch.from_numpy(y_valid_pred)).item()
print(f'lgbm loss {nb_score:.4f}')

Ensembling

# Unweighted element-wise mean of the three models' validation predictions.
valid_members = [y_valid_pred_sk, y_valid_pred_fast, y_valid_pred_lgbm]
y_valid_pred = np.mean(valid_members, axis=0)

y_valid_true = to.valid.ys.values.ravel()
ensemble_loss = evaluate_torch(torch.from_numpy(y_valid_true),
                               torch.from_numpy(y_valid_pred))
nb_score = ensemble_loss.item()
print(f'ensembling loss {nb_score:.4f}')

%%time
if do_test:
    y_test_pred = np.mean([
        y_test_pred_sk, 
        y_test_pred_fast, 
        y_test_pred_lgbm
    ], axis=0)

Inspecting

`dep_var` distribution

Train vs validation distributions

# Histogram of `pick_random` samples: validation truth vs. validation prediction.
feature_testing.hist_plot_preds(pick_random(y_valid_true), 
                                pick_random(y_valid_pred), 
                                label0='truth', label1='prediction')

if do_test:
    # Same plot, comparing the validation truth with the test-set predictions.
    feature_testing.hist_plot_preds(pick_random(y_valid_true), 
                                    pick_random(y_test_pred), 
                                    label0='truth (validation)', 
                                    label1='prediction (test set)').show()

Boldly wrong predictions

%%time
miss_cols = [v for v in ['building_id', 'meter','timestamp'] if v not in to.valid.xs.columns]
tmp = to.valid.xs.join(df.loc[:,miss_cols]) if len(miss_cols)>0 else to.valid.xs
bwt = feature_testing.BoldlyWrongTimeseries(tmp, y_valid_true, y_valid_pred)

bwt.run_boldly()

Submission to kaggle

%%time
if do_test:
    # NOTE(review): exp(x) - 1 presumably undoes a log1p transform of the
    # target applied during preprocessing — confirm.
    y_test_pred_original = torch.exp(tensor(y_test_pred)) - 1

    # One 'meter_reading' column, indexed like the test features.
    y_out = pd.DataFrame(cnr(y_test_pred_original),
                         columns=['meter_reading'],
                         index=test_dl.xs.index)
    display(y_out.head())

    # NOTE(review): hard-coded expected row count, presumably the size of the
    # full test set; this fails whenever the test set was subsampled (see
    # `loading.N_TEST`) — confirm.
    assert len(y_out) == 41697600

Note: Writing to csv takes ~2min

%%time
# Only written when `do_submit` is set; values are formatted with 4 decimals.
if do_submit:
    y_out.to_csv(data_path/'my_submission.csv',
                 float_format='%.4f')

kaggle competitions submit -c ashrae-energy-prediction -f my_submission.csv -m "Message"

# Build the free-text message attached to the kaggle submission.
act = 'ReLu'

# lin_model_msg = f'baseline (linear regression)'
# Read the forest's settings back from the estimator `m` itself: the name
# `params` was re-bound to the LightGBM settings in the lightgbm section, so
# using it here would describe the RandomForest with LGBM's parameters.
rf_param_names = ('n_estimators', 'max_features', 'n_jobs')
rf_all_params = m.get_params()
rf_params = {k: rf_all_params[k] for k in rf_param_names if k in rf_all_params}
rf_model_msg = f'RandomForest: {pretty_dictionary(rf_params)}'
if trees_with_embeddings:
    rf_model_msg += ' (with embeddings)'
# At this point `params` holds the LightGBM settings.
lgbm_model_msg = f'LGBM: {pretty_dictionary(params)}'

fast_model_msg = f'tabular_learner (run #{run}): act {act}, layers {layers}, ps {ps}, embed_p {embed_p}'

# model_msg = f'Ensembling tabular_learner and RandomForest ({fast_model_msg}, {rf_model_msg})'
model_msg = f'Ensembling LGBM, tabular_learner and RandomForest ({lgbm_model_msg}, {fast_model_msg}, {rf_model_msg})'
# model_msg = rf_model_msg
# model_msg = lgbm_model_msg

split_msg = f'split kind "{split_kind}" N_TRAIN {loading.N_TRAIN}'
# NOTE(review): 20216100 is presumably the row count of the full training
# set; the extra `/2` is not explained here — confirm the percentage is
# computed as intended.
samples_msg = f'num samples {len(dls.xs)} = {len(dls.xs)/20216100/2*100:.2f} %'
features_msg = f'weather and building features'
score_msg = f'nb score {nb_score:.4f}'
# message = ['baseline (linear regression on dep_var_stats and 1hot meter) ', '500 obs/bid', f'nb score {nb_score:.4f}']
# message = ['random forest', '500 obs/bid', 'all features', f'nb score {nb_score:.4f}']
# message = ['lightgbm', '500 obs/bid', '100 rounds', '42 leaves', 'lr .5', f'nb score {nb_score:.4f}']
# message = ['tabular_learner', '500 obs/bid', 'all features', f'layers {layers}, embed_p .1, ps [.1,.1,.1]', f'nb score {nb_score:.4f}']
message = ' + '.join([model_msg, samples_msg, split_msg, features_msg, score_msg])
message

# Upload the CSV written above through the kaggle CLI; requires both switches.
if do_test and do_submit:
    print('Submitting...')
    !kaggle competitions submit -c ashrae-energy-prediction -f '{data_path}/my_submission.csv' -m '{message}'

Modelling & submitting

`evaluate_torch`[source]

Loading

Sampling `df`

Split

Modelling with

`fastai`

`class` `EmbeddingFeatures`[source]

`EmbeddingFeatures.replace_cat_features_with_embeddings`[source]

`sklearn`

`EmbeddingFeatures.embedding_assignment_func`[source]

`EmbeddingFeatures.predict_with_embeddings`[source]

`lightgbm`

Ensembling

Inspecting

`dep_var` distribution

`pick_random`[source]

Boldly wrong predictions

Submission to kaggle

`pretty_dictionary`[source]

Modelling & submitting

evaluate_torch[source]

Loading

Sampling df

Split

Modelling with

fastai

class EmbeddingFeatures[source]

EmbeddingFeatures.replace_cat_features_with_embeddings[source]

sklearn

EmbeddingFeatures.embedding_assignment_func[source]

EmbeddingFeatures.predict_with_embeddings[source]

lightgbm

Ensembling

Inspecting

dep_var distribution

pick_random[source]

Boldly wrong predictions

Submission to kaggle

pretty_dictionary[source]

`evaluate_torch`[source]

Sampling `df`

`fastai`

`class` `EmbeddingFeatures`[source]

`EmbeddingFeatures.replace_cat_features_with_embeddings`[source]

`sklearn`

`EmbeddingFeatures.embedding_assignment_func`[source]

`EmbeddingFeatures.predict_with_embeddings`[source]

`lightgbm`

`dep_var` distribution

`pick_random`[source]

`pretty_dictionary`[source]