Market Segmentation Using Customer Data 02 Hunting For Outliers

library(mlbench); library(ggplot2); library(GGally); library(reshape2) # melt fn

source('../src/load_data.r')

Hunting for Outliers using Tukey’s Method

Tukey’s Method

Using Tukey’s Method, outliers are defined as values < $Q_{1} - 1.5IQR$ and values > $Q_{3} + 1.5IQR$ where IQR is the interquartile range.
$IQR = Q_{3} - Q{1}$
where $Q_{1}$ is the value in the dataset >= 25% of the values and <=75% of the values in the datset, $Q_{2}$ is the value >= 50% of the values and <= the rest of the values, and so forth.

options(repr.plot.width = 10, repr.plot.height = 2)

q <- quantile(wholesale_data$Detergents_Paper)
iqr <- q['75%'] - q['25%']
q[['Tukey floor']] <- q['25%'] - 1.5*iqr
q[['Tukey ceiling']] <- q['75%'] + 1.5* iqr

horiz_boxplot <- ggplot(wholesale_data, aes(x = 0, y = Detergents_Paper)) + 
                    geom_boxplot(outlier.alpha = 0, size = 1) + 
                    coord_flip(ylim = c(-5500, 10000)) 

points <- geom_jitter(width = 0.1, alpha = 0.1)

labels <- annotate(geom = 'text', y = q, x = c(0.25, -0.25, 0.25, 0.25, -0.25, 0.25, 0.25),  
                  label = names(q), check_overlap = FALSE, color = 'darkred')

tukey_window <- annotate(geom = 'line', y = c(q['Tukey floor'], q['Tukey ceiling']), 
                         x = 0, size = 20, color = 'darkblue', alpha = 0.5) 
text_in_window <- annotate(geom = 'text', x = 0, y = q['50%'],
                           color = 'white',
                           label = 'values not in this range [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are considered outliers')

blank_background <- theme(panel.background = element_blank(),
                          axis.line = element_blank(),
                          axis.ticks = element_blank(),
                          axis.text = element_blank(),
                          axis.title.x = element_blank(),
                          axis.title.y = element_blank()) 

tukey_graphic <- horiz_boxplot + points + labels + tukey_window + text_in_window + blank_background
print(tukey_graphic)

png

Changing Size of ‘Tukey Window’

Let’s call the range of values not considered outliers under Tukey’s method, Tukey’s windows for simplicity. It’s not a technical term. We will widen Tukey’s window by changing the factor from 1.5 to 4 because this dataset contains many outliers. There is a large spread so it would be more useful to consider the values on the more extreme ends.

# TODO: ADD display_outliers() to handy_fns or source
display_outliers <- function(feature, param = 4, df = wholesale_data) {
    feature_vec =  as.vector(wholesale_data[[feature]])
    Q1 <- quantile(feature_vec, .25)
    Q3 <- quantile(feature_vec, .75)
    tukey_window <- param*(Q3-Q1)

    # less_than_Q1 & greater_than_Q3 are masks, meaning they're vectors whose
    # elements are booleans
    less_than_Q1 <- wholesale_data[[feature]] < Q1 - tukey_window
    greater_than_Q3 <- wholesale_data[[feature]] > Q3 + tukey_window
    tukey_mask <- (less_than_Q1 | greater_than_Q3)
    outliers <- wholesale_data[tukey_mask,]
    return(outliers)
}

num_df <- Filter(is.numeric, wholesale_data)
outlier_dfs <- sapply(X = colnames(num_df), FUN = display_outliers, simplify = FALSE)

It is nice to be able to see the data frames filtered for outliers, but it’s very granular. Let’s get a count of the number of outliers for each feature.

count_outliers <- function(feature) {
    outlier_count <- dim(display_outliers(feature))[1]
    return(outlier_count)
}

sapply(colnames(Filter(is.numeric, wholesale_data)), count_outliers)

<dl class=dl-horizontal> <dt>Fresh</dt> <dd>2</dd> <dt>Milk</dt> <dd>7</dd> <dt>Grocery</dt> <dd>5</dd> <dt>Frozen</dt> <dd>12</dd> <dt>Detergents_Paper</dt> <dd>9</dd> <dt>Deli</dt> <dd>6</dd> </dl>

## sapply() is difficult because rownames() returns a vector and sapply does not append vectors to return a vector
## it will return a list which messes up following code

raw_outliers <- c()
for (feat in outlier_dfs) {
    ndx <- c(row.names(feat))
    raw_outliers <- c(raw_outliers, ndx)
}

freq_outliers <- as.data.frame(table(raw_outliers))

freq_outliers[freq_outliers$Freq > 1,]

<th scope=col>raw_outliers</th><th scope=col>Freq</th><th scope=row>2</th><th scope=row>3</th><th scope=row>4</th><th scope=row>7</th><th scope=row>12</th><th scope=row>17</th><th scope=row>19</th><th scope=row>20</th><th scope=row>22</th><th scope=row>23</th>


126	2
182	3
184	3
24	2
334	2
48	3
62	3
66	2
86	3
87	2

wholesale_data['outlier_for'] <- rownames(wholesale_data) %in% raw_outliers

outliers_many_feats <- freq_outliers[freq_outliers$Freq > 1,]$raw_outliers

many_feats <- rownames(wholesale_data) %in% outliers_many_feats
wholesale_data$outlier_for[many_feats] <- 'many_feats'

wholesale_data$outlier_for[wholesale_data$outlier_for == TRUE] <- 'one_feat'
wholesale_data$outlier_for[wholesale_data$outlier_for == FALSE] <- 'zero'

head(wholesale_data)

<th scope=col>Channel</th><th scope=col>Region</th><th scope=col>Fresh</th><th scope=col>Milk</th><th scope=col>Grocery</th><th scope=col>Frozen</th><th scope=col>Detergents_Paper</th><th scope=col>Deli</th><th scope=col>outlier_for</th>

Retail	Other	12669	9656	7561	214	2674	1338	zero
Retail	Other	7057	9810	9568	1762	3293	1776	zero
Retail	Other	6353	8808	7684	2405	3516	7844	one_feat
Horeca	Other	13265	1196	4221	6404	507	1788	zero
Retail	Other	22615	5410	7198	3915	1777	5185	zero
Retail	Other	9413	8259	5126	666	1795	1451	zero

From R Documentation:

ggpairs(data, mapping = NULL, columns = 1:ncol(data), title = NULL,
  upper = list(continuous = "cor", combo = "box_no_facet", discrete =
  "facetbar", na = "na"), lower = list(continuous = "points", combo =
  "facethist", discrete = "facetbar", na = "na"), diag = list(continuous =
  "densityDiag", discrete = "barDiag", na = "naDiag"), params = NULL, ...,
  xlab = NULL, ylab = NULL, axisLabels = c("show", "internal", "none"),
  columnLabels = colnames(data[columns]), labeller = "label_value",
  switch = NULL, showStrips = NULL, legend = NULL,
  cardinality_threshold = 15, legends = stop("deprecated"))

options(repr.plot.width = 10, repr.plot.height = 10)

pairplot_products <- function(data) {
    ggpairs(data = data, 
            columns = c('Fresh','Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Deli'), 
            mapping = aes(color = outlier_for),
            upper = list(continuous = wrap(ggally_cor, size = 2.5, alignPercent = 0.8)),
            lower = list(continuous = wrap(ggally_points, alpha = 0.2)),
           )    
}

pairplot_products(wholesale_data)

png

boxplot_product <- function(data) {
    ggplot(melt(data), aes(x = variable, y = value)) +
        geom_boxplot(outlier.alpha = 0) +
        geom_jitter(width = .2, aes(color = outlier_for))
}

boxplot_product(wholesale_data)

Using Channel, Region, outlier_for as id variables

png

show_outliers_in_scatter <- function(xFeat, yFeat) {
    ggplot(wholesale_data, aes_string(x = substitute(xFeat), y = substitute(yFeat), color = 'outlier_for',
                                alpha = 'outlier_for')) +
    geom_text(check_overlap = FALSE, size = 4, aes(label = rownames(wholesale_data))) +
    scale_alpha_manual(guide = 'none', values = list(zero = 0.02, one_feat = 0.3, many_feats = 1))
}

show_outliers_in_scatter(Milk, Grocery)
show_outliers_in_scatter(Milk, Fresh)
show_outliers_in_scatter(Frozen, Grocery)
show_outliers_in_scatter(Deli, Detergents_Paper)

png

Visualizing Data Without Outliers That Are Outliers for Multiple Features

Let’s first see how many observations are outliers for multiple features.

dim(wholesale_data[wholesale_data$outlier_for != 'many_feats',])

pairplot_products(wholesale_data[wholesale_data$outlier_for != 'many_feats',])

png

boxplot_product(wholesale_data[wholesale_data$outlier_for != 'many_feats',])

Using Channel, Region, outlier_for as id variables

png

Verdict: Not Quite Removing Outliers

Usually, we would remove some outliers but there appears that outliers for many features are also the most extreme outliers within each feature. In practice, these outliers are significant because they represent customers that bring in a lot of business. We would be wise to not ignore them when analyzing our customers. However, while doing cluster modeling, these outliers will just add noise to our analysis of most of our customers. Thus, we will not consider these outliers that are outliers for many features in our clustering analysis.

wholesale_data <- wholesale_data[wholesale_data$outlier_for != 'many_feats',]

clean_wholesale <- subset(wholesale_data, select = -c(outlier_for))
write.csv(clean_wholesale, file = '../data/clean_wholesale.csv')