-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathK_Mean_Clustering_for_Customer_Segmentation.R
172 lines (121 loc) · 5.42 KB
/
K_Mean_Clustering_for_Customer_Segmentation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# turn off locale-specific sorting for messages in English
Sys.setlocale("LC_TIME", "C")
# Loading the Packages
library(tidyverse)
library(lubridate)
library(knitr)
library(readxl)
library(broom)
library(umap)
library(ggrepel)
library(plotly)
# The Data
offers_tbl <- read_excel('../00_data/WineKMC.xlsx', sheet = 'OfferInformation')
offers_tbl <- offers_tbl %>%
set_names(c('offer', 'campaign', 'varietal', 'min_qty_kg',
'disc_pct','origin', 'past_peak'))
transac_tbl <- read_excel('../00_data/WineKMC.xlsx', sheet = 'Transactions')
transac_tbl <- transac_tbl %>%
set_names(c('customer', 'offer'))
# convert to `User-Item format`
wine_tbl <- transac_tbl %>%
left_join(offers_tbl) %>%
mutate(value = 1) %>%
spread(customer,value, fill = 0)
# Clustering the Customers
user_item_tbl <- wine_tbl[,8:107]
set.seed(196) # for reproducibility of clusters visualisation with UMAP
kmeans_obj <- user_item_tbl %>%
kmeans(centers = 5, # number of clusters to divide customer list into
nstart = 100, # specify number of random sets to be chosen
iter.max = 50) # maximum number of iterations allowed -
# quickly inspect the model
glance(kmeans_obj) %>% glimpse()
# I build a function for a set number of `centers` (4 in this case)
kmeans_map <- function(centers = 4) {
user_item_tbl %>%
kmeans(centers = centers, nstart = 100, iter.max = 50)
}
# and check that is working on `glance()`
4 %>% kmeans_map() %>% glance()
# Then, I create a __nested tibble__, which is a way of "nesting" columns inside a data frame.
kmeans_map_tbl <- tibble(centers = 1:15) %>% # create column with centres
mutate(k_means = centers %>%
map(kmeans_map)) %>% # iterate `kmeans_map` row-wise to gather
# kmeans models for each centre in column 2
mutate(glance = k_means %>%
map(glance)) # apply `glance()` row-wise to gather each
# model’s summary metrics in column 3
kmeans_map_tbl %>% glimpse()
# Last, I can build a `scree plot` and look for the "elbow" on the graph,
# the point where the number of additional clusters seem to level off.
kmeans_map_tbl %>%
unnest(glance) %>% # unnest the glance column
select(centers, tot.withinss) %>% # select centers and tot.withinss
ggplot(aes(x = centers, y = tot.withinss)) +
geom_line(colour = 'grey30', size = .8) +
geom_point(colour = 'green4', size = 3) +
geom_label_repel(aes(label = centers),
colour = 'grey30') +
theme_bw() +
labs(title = 'Scree Plot')
# Visualising the Segments
# First, I create a umap object and pull out the `layout` argument
# (containing coordinates that can be used to visualize the dataset),
# change its format to a tibble and attach the `offer` column from the `wine_tbl`.
umap_obj <- user_item_tbl %>% umap()
umap_tbl <- umap_obj$layout %>%
as_tibble() %>% # change to a tibble
set_names(c('x', 'y')) %>% # remane columns
bind_cols(wine_tbl %>% select(offer)) # attach offer reference
# Then, I `pluck` the 5th kmeans model from the __nested tibble__,
# attach the _cluster_ argument from the `kmeans` function to the output,
# and join offer and cluster to the umap_tbl.
umap_kmeans_5_tbl <- kmeans_map_tbl %>%
pull(k_means) %>%
pluck(5) %>% # pluck element 5
broom::augment(wine_tbl) %>% # attach .cluster to the tibble
select(offer, .cluster) %>%
left_join(umap_tbl, by = 'offer') # join umap_tbl to clusters by offer
# At last, I can visualise the UMAP'ed projections of the clusters.
g <- umap_kmeans_5_tbl %>%
mutate(label_text = str_glue('Offer: {offer}
Cluster: {.cluster}')) %>%
ggplot(aes(x,y, colour = .cluster)) +
geom_point(aes(text = label_text)) +
theme_light() +
labs(title = 'UMAP 2D Projections of K-Means Clusters',
caption = "") +
theme(legend.position = 'none')
ggplotly(g, tooltip = "label_text")
# Evaluating the Clusters
# Let's first bring all information together in one data frame.
cluster_trends_tbl <- wine_tbl %>%
left_join(umap_kmeans_5_tbl) %>%
arrange(.cluster) %>%
select(.cluster, offer:past_peak)
### Cluster 1 & 2
# cluster 1: high volumes of sparkling wines (Champagne and Prosecco)
# cluster 2: low volume purchases of different varieties
cluster_trends_tbl %>%
filter(.cluster ==1 | .cluster ==2) %>%
count(.cluster, varietal, origin, min_qty_kg, disc_pct) %>%
select(-n) %>%
kable()
### Cluster 3 & 4
# Cluster 3: Pinot Noir
# Cluster 4: French Champagne in high volumes
cluster_trends_tbl %>%
filter(.cluster ==3 | .cluster ==4 ) %>%
group_by() %>%
count(.cluster, varietal, origin, min_qty_kg, disc_pct) %>%
select(-n) %>%
kable()
### Cluster 5
# more difficult to categorise .
# only clear trend: segment picked up all available Cabernet Sauvignon offers.
cluster_trends_tbl %>%
filter(.cluster ==5 ) %>%
count(.cluster, varietal, origin, min_qty_kg, disc_pct) %>%
select(-n) %>%
kable()