Predicting Housing Prices 03 Preparing Test Set

import numpy as np
import pandas as pd
import pickle
import warnings; warnings.filterwarnings('ignore')
import lib.clean_helper as cl

Preparing Test Set

# Load the Kaggle test split indexed by house Id, renaming the one column
# whose name begins with a digit.
test = pd.read_csv('../data/kaggle_test.csv', index_col='Id').rename(
    columns={'3SsnPorch': 'ThreeSsnPorch'}
)

# Columns that are passed through cl.obj_to_ordinal below.
ordinal_f = [
    'MSSubClass', 'OverallQual', 'OverallCond',
    'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr',
    'TotRmsAbvGrd', 'Fireplaces', 'MoSold',
    'BsmtCond', 'BsmtQual',
    'ExterCond', 'ExterQual',
    'GarageQual', 'GarageCond',
    'HeatingQC', 'KitchenQual',
    'PoolQC',
]
test[ordinal_f] = test[ordinal_f].apply(cl.obj_to_ordinal)

# Clean the Exterior2nd labels, then let the helper rewrite missing values
# (presumably NaN -> 'none' where the feature is absent; see lib.clean_helper).
test['Exterior2nd'] = cl.clean_ext_typos(test['Exterior2nd'])
test = cl.nan_to_str_none(test)

Cleaning Years

In the test set, one house is documented as having a garage built in the year 2207. This is clearly an error. We did not find this error in the training set. The largest GarageYrBlt value in the training set is 2010.

Let’s replace 2207 with a NaN, which our code will impute later. We know this house has a garage because nan_to_str_none checks whether there is a garage and, if there were none, would have set the GarageYrBlt value to “none”.

def clean_years(year_df, max_year=2010):
    """Replace implausibly late years with NaN in every column of ``year_df``.

    The dataset only includes homes sold between 2006 and 2010, so any year
    later than ``max_year`` is treated as a data-entry error and blanked out
    so that it can be imputed afterwards.

    Entries that are already missing (NaN) or hold the string placeholder
    ``'none'`` are left untouched.

    Parameters
    ----------
    year_df : pandas.DataFrame
        Frame whose columns all hold years. It is modified in place.
    max_year : int, default 2010
        Largest year considered valid.

    Returns
    -------
    pandas.DataFrame
        The same ``year_df`` object, after cleaning. A frame with no columns
        is returned unchanged.
    """
    for col in year_df.columns:
        column = year_df[col]

        # Positions that hold a real year: neither NaN nor the 'none' marker.
        has_year = (column.notnull() & (column.astype('str') != 'none')).values

        # Flag years past the cutoff; only real years are converted with int(),
        # so the 'none' placeholder never reaches the conversion.
        too_late = np.zeros(len(column), dtype=bool)
        too_late[has_year] = [
            int(year) > max_year for year in column.values[has_year]
        ]

        # Skip the assignment when nothing is flagged so the column's dtype
        # is not disturbed needlessly.
        if too_late.any():
            year_df[col] = column.mask(too_late)

    return year_df
# Blank out impossible years in the year-valued columns.
# NOTE(review): this calls cl.clean_years from the helper module, not the
# clean_years defined earlier in this file -- presumably the same code; confirm.
yr_feats = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']
test[yr_feats] = cl.clean_years(test[yr_feats])

# Load the prepared training frame; its statistics are used to fill gaps in
# the test set. The context manager makes sure the file handle is closed.
# NOTE: pickle must only be used on trusted, locally produced files.
with open('../data/train_df.p', 'rb') as train_file:
    train_df = pickle.load(train_file)

# Numeric columns (float64 / int64): fill missing values with the training mean.
is_num = test.dtypes.isin([np.dtype('float64'), np.dtype('int64')])
num_feats = test.dtypes.index[is_num]
for f in num_feats:
    train_mean = train_df[f].mean()
    test[f] = test[f].fillna(train_mean)

# All remaining columns: fill missing values with the most frequent training value.
cat_feats = test.dtypes.index[~is_num]
for f in cat_feats:
    train_mode = train_df[f].mode()[0]
    test[f] = test[f].fillna(train_mode)

Pickling test

# Persist the fully prepared test frame for the later notebooks.
pd.to_pickle(test, '../data/test.p')