Playing with different models and submitting predictions over the test set to kaggle.

Current implementation of this notebook leads to (private leaderboard score):

  • baseline (linear regression on dep_var_stats and one-hot encoded meter): ~1.7
  • RandomForest, tabular_learner and lgbm individually: ~1.45
  • ensembling tabular_learner, RandomForest and lgbm: ~1.4
%load_ext autoreload
%autoreload 2
import tqdm
from multiprocessing import Pool
# Render pandas `.plot()` calls with plotly.
pd.options.plotting.backend = "plotly"
# Toggles read by later cells: `do_test` gates every test-set prediction,
# `do_submit` gates writing the submission CSV and calling the kaggle CLI.
do_test = True
do_submit = False
data_path = loading.DATA_PATH
# NOTE(review): presumably caps the number of train/test rows that
# `loading.load_all()` reads -- confirm in the `loading` module.
loading.N_TRAIN = 100_000
loading.N_TEST = 100_000

Loading from scratch

%%time
# Load the raw tables; later cells read the keys 'meter_train', 'meter_test',
# 'weather_train', 'weather_test' and 'building' from the result.
ashrae_data = loading.load_all()

Takes about 3min30

%%time
# Build the training feature frame.
processor = preprocessing.Processor() # t_train=t_train
# Maps a transform name to its keyword arguments; commented-out entries are
# transforms that are currently switched off.
# NOTE(review): presumably `Processor` applies them in dict order -- confirm.
tfms_config = {
#     'fix_bid_363':{},
#     'fix_bid_1099':{'threshold': 10.},
#     'remove_bad_meter0_readings_of_first_141days': {},
#     'remove_not_summer_0s_meter_2_and_3': {},
#     'remove_0s_meter0': {},
#     'remove_outliers':{'f':10,'dep_var':'meter_reading'},
#     'remove_imputed_weeks':{'dep_var':'meter_reading'},
#     'add_dep_var_stats':{},
    'add_random_noise_features':{},
    'add_time_features':{},
    'add_weather_features':{'fix_time_offset':True,
                            'add_na_indicators':True,
                            'impute_nas':True},
    'add_building_features':{},
#     'add_onehot_encoded':{},
}

# `var_names` is indexed further down with the keys 'dep_var', 'conts' and 'cats'.
df, var_names = processor(ashrae_data['meter_train'], tfms_configs=tfms_config,
                          df_weather=ashrae_data['weather_train'],
                          df_building=ashrae_data['building'])
display(df.head(), var_names)

%time
df_test, _ = processor(ashrae_data['meter_test'], tfms_configs=tfms_config,
                         df_weather=ashrae_data['weather_test'],
                         df_building=ashrae_data['building'])
df_test = preprocessing.align_test(df, var_names, df_test)
%%time
# df.to_parquet(data_path/'X.parquet')
# df_test.to_parquet(data_path/'X_test.parquet')
# pickle.dump(var_names, open(data_path/'var_names.pkl', 'wb'))

evaluate_torch[source]

evaluate_torch(y_true:Tensor, y_pred:Tensor)

Loading

%%time
# var_names = preprocessing.load_var_names(data_path/'var_names.pckl')
# var_names
%%time
# df = preprocessing.load_df(data_path/'X.parquet') #.sample(100000)

# if do_test:
#     df_test = preprocessing.load_df(data_path/'X_test.parquet') #.sample(100000)
# Sanity check: number of rows in the train and test feature frames.
len(df), len(df_test)

Sampling df

%%time
n = len(df)

if False: # per building_id and meter sampling
    n_sample_per_bid = 500
    replace = True

    df = (df.groupby(['building_id', 'meter'])
         .sample(n=n_sample_per_bid, replace=replace))

if False: # general sampling
    frac_samples = .1
    replace = False

    df = (df.sample(frac=frac_samples, replace=replace))

print(f'using {len(df)} samples = {len(df)/n*100:.2f} %')

Split

Split settings

%%time
# t_train = pd.read_parquet(data_path/'t_train.parquet')
# No precomputed split timestamps are loaded; `None` is forwarded to
# `preprocessing.split_dataset` through its `t_train` argument.
t_train = None

%time
#split_kind = 'random'
#split_kind = 'time'
# split_kind = 'fix_time'
split_kind = 'time_split_day'
train_frac = .9

Splitting

# Split the rows into (train, valid) parts according to `split_kind`;
# `splits[0]` and `splits[1]` are reported below as the train and valid sets.
splits = preprocessing.split_dataset(df, split_kind=split_kind, train_frac=train_frac,
                                     t_train=t_train)
print(f'sets {len(splits)}, train {len(splits[0])} = {len(splits[0])/len(df):.4f}, valid {len(splits[1])} = {len(splits[1])/len(df):.4f}')
var_names
# Subset keeping only continuous columns whose name contains 'meter_reading'
# and categorical columns whose name starts with 'meter_'.
var_names_no_anns = {
    'dep_var': var_names['dep_var'],
    'conts': [v for v in var_names['conts'] if 'meter_reading' in v],
    'cats': [v for v in var_names['cats'] if v.startswith('meter_')]
}
var_names_no_anns
# Subset keeping all continuous columns and every categorical column that
# does NOT start with 'meter_'.
var_names_anns = {
    'dep_var': var_names['dep_var'],
    'conts': var_names['conts'],
    'cats': [v for v in var_names['cats'] if not v.startswith('meter_')]
}
var_names_anns

Takes about 6 minutes on 100% of the data

%%time
# Preprocessing steps handed to the tabular object.
procs = [Categorify, FillMissing, Normalize]
# `to` exposes `.train` / `.valid` with `.xs` and `.ys`, which the model cells
# below read.
to = feature_testing.get_tabular_object(df,
                                        var_names,
                                        splits=splits,
                                        procs=procs)

Storing to

%%time
# pickle.dump(to, open(data_path/'to.pkl', 'wb'))

Loading to

%%time
# to = pickle.load(open(data_path/'to.pkl', 'rb'))

Creating data loaders

%%time
# Batch sizes for the training and validation loaders.
train_bs = 1000
val_bs = 1000

dls = to.dataloaders(bs=train_bs, val_bs=val_bs)

Saving dls

%%time
# torch.save(dls, data_path/'dls.pkl')

Warning: Takes about 14min with the test set

%%time
test_bs = 1000

# Only built when test predictions are requested; every later use of
# `test_dl` is guarded by `do_test` as well.
if do_test:
    test_dl = dls.test_dl(df_test, bs=test_bs) 
%%time
# torch.save(test_dl, data_path/'test_dl.pkl')

Loading dls

%%time
# dls = torch.load(data_path/'dls.pkl')
%%time
# test_dl = torch.load(data_path/'test_dl.pkl')

Modelling with

fastai

Fastai finding: make sure the test-set values are not out of domain. In this notebook `timestampYear` is included as a training feature, but in the training set it only ever takes the value 2016.0, whereas in the test set it is 2017.0 and 2018.0 $\Rightarrow$ the predictions zero out everywhere.

# Observed range of the dependent variable over train and validation; used
# to pick `y_range` by hand below.
np.min([to.train.ys.values.min(), to.valid.ys.values.min()]), np.max([to.train.ys.values.max(), to.valid.ys.values.max()])
y_range = (-.1, 17)

# Hidden layer sizes of the tabular model.
# layers = [4000, 2000, 1000, 500, 250]
layers = [50, 20] # [1600, 800, 400, 200]

# Dropout on the embeddings (disabled).
# embed_p = .01
embed_p = 0.

# One dropout probability per hidden layer (all disabled).
# ps = [.1, .1, .1, .1, .1]
ps = [.0 for _ in layers]
# ps[0] = .2

# config = None
config = tabular_config(embed_p=embed_p, ps=ps,
#                         act_cls=Swish(inplace=True)
                        )
# config = tabular_config(act_cls=nn.ReLU(inplace=True))
# config = tabular_config(act_cls=Swish(inplace=True))
# config = tabular_config(act_cls=Sine(inplace=True))

# The competition metric `evaluate_torch` is used directly as the loss.
learn = tabular_learner(dls, y_range=y_range, 
                        layers=layers, n_out=1, 
                        config=config, 
#                         wd=.01,
                        loss_func=evaluate_torch) #.to_fp16()
run = -1 # a counter for `fit_one_cycle` executions
%%time
# learn.save('1600-800-400-200')
%%time
# learn = learn.load('1600-800-400-200')
 
# Re-running this cell continues training the same learner; `run` counts how
# often that happened and is reported in the submission message.
run += 1
print(f'run #{run}')
learn.fit_one_cycle(5, lr_max=1e-1)
%%time
# Validation predictions of the neural net; `y_valid_pred_fast` is kept for
# the ensembling section, since `y_valid_pred` is overwritten by later models.
y_valid_pred, y_valid_true = learn.get_preds()
y_valid_pred_fast = cnr(y_valid_pred)
nb_score = evaluate_torch(y_valid_true, 
                          y_valid_pred).item()
print(f'fastai loss {nb_score:.4f}')
%%time
if do_test:
    y_test_pred, _ = learn.get_preds(dl=test_dl)
    y_test_pred_fast = cnr(y_test_pred)
    y_test_pred = cnr(y_test_pred)
# NOTE(review): `cnr` presumably converts a tensor to a flat numpy array --
# confirm against its definition.
y_valid_pred, y_valid_true = cnr(y_valid_pred), cnr(y_valid_true)

replacing categorical features for trees with learned embeddings

# When True, the tree models below are trained on copies of the feature
# frames passed through `EmbeddingFeatures` instead of the raw `to.*.xs`.
trees_with_embeddings = False
if trees_with_embeddings:
    X_emb_train = to.train.xs.copy()
    X_emb_val = to.valid.xs.copy()

class EmbeddingFeatures[source]

EmbeddingFeatures(to:TabularPandas, learn:Learner)

%%time
# Built unconditionally, even when `trees_with_embeddings` is False.
ef = EmbeddingFeatures(to, learn)
# Peek at the first rows/columns of the embedding table for 'building_id'.
ef.df_embs['building_id'].iloc[:5,:3]

EmbeddingFeatures.replace_cat_features_with_embeddings[source]

EmbeddingFeatures.replace_cat_features_with_embeddings(X:DataFrame)

TODO: fix memory error in the creation of X_emb

%%time
# Transform the copied train/validation features; skipped by default.
if trees_with_embeddings:
    X_emb_train = ef.replace_cat_features_with_embeddings(X_emb_train)
    X_emb_val = ef.replace_cat_features_with_embeddings(X_emb_val)

sklearn

%%time
# NOTE: the name `params` is reused (and overwritten) by the lightgbm section.
params = {'n_estimators': 20, 'max_features': 'sqrt', 'n_jobs':1}
model = ensemble.RandomForestRegressor
# params = {}
# model = linear_model.LinearRegression

m = model(**params)
%%time
# Fit on either the embedding-transformed frame or the raw feature values.
if trees_with_embeddings:
    m.fit(X_emb_train, to.train.ys.values.ravel())
else:
    m.fit(to.train.xs.values, to.train.ys.values.ravel())

EmbeddingFeatures.embedding_assignment_func[source]

EmbeddingFeatures.embedding_assignment_func(stuff:tuple)

EmbeddingFeatures.predict_with_embeddings[source]

EmbeddingFeatures.predict_with_embeddings(X:DataFrame, m, num_rows:int=2000000, num_workers:int=1)

%%time
if trees_with_embeddings:
    y_valid_pred = m.predict(X_emb_val)
else:
    y_valid_pred = m.predict(to.valid.xs.values)

# Keep a copy for the ensembling section; `y_valid_pred` is overwritten later.
y_valid_pred_sk = np.copy(y_valid_pred)

%%time
if do_test:
    if trees_with_embeddings:
        y_test_pred = ef.predict_with_embeddings(test_dl.xs, m.set_params(n_jobs=-1), 
                                                 num_workers=1)
    else:
        y_test_pred = m.predict(test_dl.xs)
    y_test_pred_sk = np.copy(y_test_pred)
y_valid_true = to.valid.ys.values.ravel()
nb_score = evaluate_torch(torch.from_numpy(y_valid_true), 
                          torch.from_numpy(y_valid_pred)).item()
print(f'sklearn loss {nb_score:.4f}')

lightgbm

%%time
# Wrap train/validation data for lightgbm; the validation set references the
# training set.
if trees_with_embeddings:
    lgb_train = lgb.Dataset(X_emb_train, to.train.ys.values.ravel())
    lgb_eval = lgb.Dataset(X_emb_val, to.valid.ys.values.ravel(), 
                           reference=lgb_train)
else:
    lgb_train = lgb.Dataset(to.train.xs.values, to.train.ys.values.ravel())
    lgb_eval = lgb.Dataset(to.valid.xs.values, to.valid.ys.values.ravel(), 
                           reference=lgb_train)
# NOTE: this rebinds `params`, which previously held the sklearn settings.
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'l2',
    'num_leaves': 42,
    'learning_rate': 0.5,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
%%time
gbm = lgb.train(params, lgb_train,
                num_boost_round=10,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)
%%time
# Predict with the best iteration found during training.
if trees_with_embeddings:
    y_valid_pred_lgbm = gbm.predict(X_emb_val,
                                    num_iteration=gbm.best_iteration)
else:
    y_valid_pred_lgbm = gbm.predict(to.valid.xs.values,
                                    num_iteration=gbm.best_iteration)

# `y_valid_pred` is the generic "latest model" slot scored below.
y_valid_pred = np.copy(y_valid_pred_lgbm)
%%time
if do_test:
    if trees_with_embeddings:
        y_test_pred_lgbm = gbm.predict(X_emb_test,
                                       num_iteration=gbm.best_iteration)
    else:
        y_test_pred_lgbm = gbm.predict(test_dl.xs.values,
                                       num_iteration=gbm.best_iteration)
    y_test_pred = np.copy(y_test_pred_lgbm)
y_valid_true = to.valid.ys.values.ravel()
nb_score = evaluate_torch(torch.from_numpy(y_valid_true), 
                          torch.from_numpy(y_valid_pred)).item()
print(f'lgbm loss {nb_score:.4f}')

Ensembling

y_valid_pred = np.mean([
    y_valid_pred_sk, 
    y_valid_pred_fast,
    y_valid_pred_lgbm
], axis=0)
y_valid_true = to.valid.ys.values.ravel()
nb_score = evaluate_torch(torch.from_numpy(y_valid_true), 
                          torch.from_numpy(y_valid_pred)).item()
print(f'ensembling loss {nb_score:.4f}')
%%time
if do_test:
    y_test_pred = np.mean([
        y_test_pred_sk, 
        y_test_pred_fast, 
        y_test_pred_lgbm
    ], axis=0)

Inspecting

dep_var distribution

Train vs validation distributions

pick_random[source]

pick_random(x, s:int=50)

# Compare random subsets of the validation truth and the latest validation
# predictions. `.show()` is called explicitly (as for the test-set figure) so
# the plot renders even when this is not the last expression of its cell.
feature_testing.hist_plot_preds(pick_random(y_valid_true), 
                                pick_random(y_valid_pred), 
                                label0='truth', label1='prediction').show()
if do_test:
    # No test-set truth is available, so the validation truth is the reference.
    feature_testing.hist_plot_preds(pick_random(y_valid_true), 
                                    pick_random(y_test_pred), 
                                    label0='truth (validation)', 
                                    label1='prediction (test set)').show()

Boldly wrong predictions

%%time
miss_cols = [v for v in ['building_id', 'meter','timestamp'] if v not in to.valid.xs.columns]
tmp = to.valid.xs.join(df.loc[:,miss_cols]) if len(miss_cols)>0 else to.valid.xs
bwt = feature_testing.BoldlyWrongTimeseries(tmp, y_valid_true, y_valid_pred)
bwt.run_boldly()

Submission to kaggle

%%time
if do_test:
    y_test_pred_original = torch.exp(tensor(y_test_pred)) - 1

    y_out = pd.DataFrame(cnr(y_test_pred_original),
                         columns=['meter_reading'],
                         index=test_dl.xs.index)
    display(y_out.head())

    assert len(y_out) == 41697600

%%time
if do_submit:
    y_out.to_csv(data_path/'my_submission.csv',
                 float_format='%.4f')

kaggle competitions submit -c ashrae-energy-prediction -f <data_path>/my_submission.csv -m "Message"

pretty_dictionary[source]

pretty_dictionary(d:dict)

# Assemble the free-text description attached to the kaggle submission.
act = 'ReLu'

# lin_model_msg = f'baseline (linear regression)'
# `params` was rebound to the LightGBM settings further up, so the sklearn
# settings are read back from the estimator `m` itself; otherwise the
# RandomForest message would report the LightGBM parameters.
rf_param_names = ('n_estimators', 'max_features', 'n_jobs')
rf_all_params = m.get_params()
rf_params = {k: rf_all_params[k] for k in rf_param_names if k in rf_all_params}
rf_model_msg = f'RandomForest: {pretty_dictionary(rf_params)}'
if trees_with_embeddings:
    rf_model_msg += ' (with embeddings)'
# At this point `params` holds the LightGBM settings.
lgbm_model_msg = f'LGBM: {pretty_dictionary(params)}'

fast_model_msg = f'tabular_learner (run #{run}): act {act}, layers {layers}, ps {ps}, embed_p {embed_p}'

# model_msg = f'Ensembling tabular_learner and RandomForest ({fast_model_msg}, {rf_model_msg})'
model_msg = f'Ensembling LGBM, tabular_learner and RandomForest ({lgbm_model_msg}, {fast_model_msg}, {rf_model_msg})'
# model_msg = rf_model_msg
# model_msg = lgbm_model_msg

split_msg = f'split kind "{split_kind}" N_TRAIN {loading.N_TRAIN}'
# NOTE(review): the extra `/2` halves the reported percentage relative to the
# hard-coded 20216100 rows -- confirm this is intended.
samples_msg = f'num samples {len(dls.xs)} = {len(dls.xs)/20216100/2*100:.2f} %'
features_msg = 'weather and building features'
# `nb_score` is whatever model was scored last (the ensemble, if run in order).
score_msg = f'nb score {nb_score:.4f}'
# message = ['baseline (linear regression on dep_var_stats and 1hot meter) ', '500 obs/bid', f'nb score {nb_score:.4f}']
# message = ['random forest', '500 obs/bid', 'all features', f'nb score {nb_score:.4f}']
# message = ['lightgbm', '500 obs/bid', '100 rounds', '42 leaves', 'lr .5', f'nb score {nb_score:.4f}']
# message = ['tabular_learner', '500 obs/bid', 'all features', f'layers {layers}, embed_p .1, ps [.1,.1,.1]', f'nb score {nb_score:.4f}']
message = ' + '.join([model_msg, samples_msg, split_msg, features_msg, score_msg])
message
# Upload the CSV written to `data_path/'my_submission.csv'` via the kaggle CLI.
# NOTE(review): `message` is interpolated inside single quotes; a single quote
# inside the message would break the shell command -- verify its content.
if do_test and do_submit:
    print('Submitting...')
    !kaggle competitions submit -c ashrae-energy-prediction -f '{data_path}/my_submission.csv' -m '{message}'