Predicting Housing Prices 02 Exploratory Data Analysis

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the `train` object used by every cell below from a pickle file; the
# path is relative to the current working directory.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
train = pd.read_pickle('../data/train.p')

Exploratory Data Analysis: Charts of Features vs. Target Variable

Scatter Plots of Numerical Features vs. SalePrice

# Numeric features are the columns whose dtype is exactly float64 or int64.
is_dtype_num = train.dtypes.isin([np.dtype('float64'), np.dtype('int64')])
num_feats = train.dtypes.index[is_dtype_num]

# SalePrice is the y variable of every panel, so it is not plotted against itself.
predictors = [f for f in num_feats if f != 'SalePrice']

# Three panels per row. Keep at least the original 20 rows so the layout is
# unchanged for up to 60 predictors, and grow the grid instead of failing in
# add_subplot when there are more.
n_cols = 3
n_rows = max(20, -(-len(predictors) // n_cols))

fig = plt.figure(figsize=(10, 60))
for i, col in enumerate(predictors):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    # x/y are passed by keyword: recent seaborn releases no longer accept
    # them positionally. The axes is passed explicitly rather than relying
    # on matplotlib's "current axes" state.
    sns.regplot(x=col,
                y='SalePrice',
                data=train,
                scatter_kws={'alpha': 0.2},
                ax=ax)
fig.tight_layout()
C:\Users\HungFeb2016\Anaconda3\lib\site-packages\scipy\stats\stats.py:1706: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval

png

Exploratory Data Analysis

Distribution of SalePrice by Feature

The following plots show the distribution of SalePrice by feature. For example, the first plot is for MSZoning: the green plot shows the distribution of SalePrice filtered for rows where MSZoning = C(all), and the green vertical line shows the mean SalePrice of that subset. The means are spread out, implying that MSZoning is related to SalePrice.

# Split the rows by whether both exterior columns hold the same value.
one_ext = train[train['Exterior1st'].eq(train['Exterior2nd'])].drop('Exterior2nd', axis=1)
two_ext = train[train['Exterior1st'].ne(train['Exterior2nd'])]

# Build (Exterior, SalePrice) pairs: rows with a single exterior value
# contribute once, rows with two different values contribute once per value.
c0 = one_ext.loc[:, ['Exterior1st', 'SalePrice']].rename(columns={'Exterior1st': 'Exterior'})
c1 = two_ext.loc[:, ['Exterior1st', 'SalePrice']].rename(columns={'Exterior1st': 'Exterior'})
c2 = two_ext.loc[:, ['Exterior2nd', 'SalePrice']].rename(columns={'Exterior2nd': 'Exterior'})

# Stack the three pieces row-wise; reset_index() keeps the original row
# labels in an 'index' column and gives the result a fresh 0..n-1 index.
data_to_plot = pd.concat([c0, c1, c2], axis=0).reset_index()

# Features that are recorded in two columns of `train`.
# Maps primary column -> (secondary column, label used for the merged column).
_PAIRED_FEATURES = {
    'Exterior1st': ('Exterior2nd', 'Exterior'),
    'Condition1': ('Condition2', 'Condn'),
}


def _stack_paired_feature(df, first, second, merged):
    """Return a long-format (merged, SalePrice) frame for a two-column feature.

    Rows where `first` and `second` compare equal contribute one record;
    rows where they differ contribute two records, one per column value.
    The original row labels are kept in an 'index' column by reset_index().
    """
    same = df[df[first] == df[second]]
    differs = df[df[first] != df[second]]
    pieces = [
        same[[first, 'SalePrice']].rename(columns={first: merged}),
        differs[[first, 'SalePrice']].rename(columns={first: merged}),
        differs[[second, 'SalePrice']].rename(columns={second: merged}),
    ]
    return pd.concat(pieces, axis='rows').reset_index()


def count_box_plots(categories):
    """Show a count plot and a SalePrice box plot for each categorical feature.

    For every name in `categories` the function prints the (possibly merged)
    feature name, then draws two panels: the number of rows per category and
    the distribution of SalePrice per category. Data is read from the
    module-level `train` frame.

    'Exterior1st' and 'Condition1' are merged with 'Exterior2nd' and
    'Condition2' respectively and plotted as 'Exterior' and 'Condn'; the
    secondary columns are skipped when they appear in `categories`.

    Parameters
    ----------
    categories : iterable of str
        Column names of `train` to plot.

    Returns
    -------
    None
    """
    secondary_columns = {second for second, _ in _PAIRED_FEATURES.values()}

    for category in categories:
        # Secondary columns are already covered by their primary column.
        if category in secondary_columns:
            continue

        if category in _PAIRED_FEATURES:
            second, merged = _PAIRED_FEATURES[category]
            data_to_plot = _stack_paired_feature(train, category, second, merged)
            category = merged
        else:
            # Every other column - including any unexpected 'Exterior*' or
            # 'Condition*' name - is plotted straight from `train`, so
            # data_to_plot is always bound to the right frame.
            data_to_plot = train

        print(category)

        # Many categories get two stacked wide panels, few get side-by-side.
        values = data_to_plot[category].drop_duplicates()
        if len(values) > 6:
            f, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 6))
        else:
            f, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 3))

        # Explicit sorted category order shared by both panels. Passing the
        # frame plus column names keeps each SalePrice paired with its own
        # row instead of relying on index alignment of a re-sorted Series.
        order = values.dropna().sort_values().tolist()

        sns.countplot(x=category, data=data_to_plot, order=order, ax=ax1)
        # Rotate the labels on the axes just drawn; plt.xticks() would act on
        # the figure's current axes, which is not ax1.
        ax1.tick_params(axis='x', labelrotation=45)

        sns.boxplot(x=category, y='SalePrice', data=data_to_plot,
                    order=order, ax=ax2)
        ax2.tick_params(axis='x', labelrotation=45)

        plt.show()

# Columns whose dtype is exactly float64 or int64 count as numeric.
is_dtype_num = train.dtypes.isin([np.dtype('float64'), np.dtype('int64')])
num_feats = train.columns[is_dtype_num]

# Everything else, minus GarageYrBlt, is plotted as a categorical feature.
cat_df = train.drop(list(num_feats) + ['GarageYrBlt'], axis=1)
count_box_plots(cat_df.columns.values)

# Pairwise correlations between the numeric columns, drawn on a diverging
# colour map centred on zero.
fig = plt.figure(figsize=(12, 12))
sns.heatmap(train.loc[:, num_feats].corr(), center=0, cmap=plt.get_cmap('PiYG'))
MSSubClass

png

MSZoning

png

Street

png

Alley

png

LotShape

png

LandContour

png

Utilities

png

LotConfig

png

LandSlope

png

Neighborhood

png

Condn

png

BldgType

png

HouseStyle

png

OverallQual

png

OverallCond

png

RoofStyle

png

RoofMatl

png

Exterior

png

MasVnrType

png

ExterQual

png

ExterCond

png

Foundation

png

BsmtQual

png

BsmtCond

png

BsmtExposure

png

BsmtFinType1

png

BsmtFinType2

png

Heating

png

HeatingQC

png

CentralAir

png

Electrical

png

BsmtFullBath

png

BsmtHalfBath

png

FullBath

png

HalfBath

png

BedroomAbvGr

png

KitchenAbvGr

png

KitchenQual

png

TotRmsAbvGrd

png

Functional

png

Fireplaces

png

FireplaceQu

png

GarageType

png

GarageFinish

png

GarageCars

png

GarageQual

png

GarageCond

png

PavedDrive

png

PoolQC

png

Fence

png

MiscFeature

png

MoSold

png

SaleType

png

SaleCondition

png

<matplotlib.axes._subplots.AxesSubplot at 0x166295516d8>

png