-
Notifications
You must be signed in to change notification settings - Fork 0
/
Preprocessing.R
188 lines (160 loc) · 7.82 KB
/
Preprocessing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
#
# STEP 1: Preprocessing - MD Naseem Ashraf
# Updated: Also covers low/very-low transaction frequencies
#
#Setup Environment & Workspace
rm(list=ls())
#install.packages('openxlsx')
#install.packages('plyr')
library("openxlsx")
options(java.parameters = "-Xmx2048m")
setwd("C:/Users/Naseem Ashraf/Desktop/Fall 16/DM Project sets")
##Reading Input Data
mydf <- read.xlsx("OnlineRetail.xlsx", sheet = 1,startRow = 2, colNames = TRUE)
colnames(mydf) <- c("InvoiceNo","StockCode","Description","Quantity","InvoiceDate","UnitPrice","CustomerID","Country")
initialStatistics <- function(mydf){
nrow(mydf) #541,908 Total Transactions
head(mydf[!complete.cases(mydf[, c("InvoiceNo","StockCode","Description","Quantity","InvoiceDate","UnitPrice","CustomerID","Country")]), ])
missingDataCount <- nrow(mydf[!complete.cases(mydf[, c("InvoiceNo","StockCode","Description","Quantity","InvoiceDate","UnitPrice","CustomerID","Country")]), ])
missingInvoiceNoCount <- nrow(mydf[!complete.cases(mydf[, c("InvoiceNo")]), ]) #0 missing InvoiceNo
missingStockCodeCount <- nrow(mydf[!complete.cases(mydf[, c("StockCode")]), ]) #0 missing StockCode
missingNameCount <- nrow(mydf[!complete.cases(mydf[, c("Description")]), ]) #1454 missing Description
missingQuantityCount <- nrow(mydf[!complete.cases(mydf[, c("Quantity")]), ]) #0 missing Quantity
missingInvoiceDateCount <- nrow(mydf[!complete.cases(mydf[, c("InvoiceDate")]), ]) #0 missing InvoiceDate
missingUnitPriceCount <- nrow(mydf[!complete.cases(mydf[, c("UnitPrice")]), ]) #0 missing UnitPrice
missingCustomerIDCount <- nrow(mydf[!complete.cases(mydf[, c("CustomerID")]), ]) #135080 missing CustomerID
missingCountryCount <- nrow(mydf[!complete.cases(mydf[, c("Country")]), ]) #0 missing Country
# NA/Missing Counts = 13,5080
}
initialStatistics(mydf)
nrow(mydf) #541,908 Transactions
dim(mydf)
## Filtering Transactions of Missing Data and Redundancies
# COMPLETE-CASES
mydf <- mydf[complete.cases(mydf),]
nrow(mydf) #406,828 Complete Case Transactions
# UNIQUE-CASES
mydf <- unique(mydf) #
nrow(mydf) #401,603 Unique Case Transactions
# LOST
dim(mydf)
mydf$'tindex'<- seq.int(nrow(mydf))
dim(mydf)
lost <- subset(mydf[with(mydf, order(Quantity)),], Quantity <0, select = c(tindex, InvoiceNo, StockCode, Description, Quantity, InvoiceDate, UnitPrice, CustomerID, Country))
dim(lost) #8872 Cancelled or Lost Product Transactions
# FILTERING
mydf <- mydf[ ! mydf$tindex %in% lost$tindex, ]
dim(mydf) #392731 9
mydf$tindex <- NULL
dim(mydf) #392731 9
nrow(mydf) ##UPDATED: 392,731 Transactions
###PRODUCTS TABLE GENERATION [ACTUAL DATASET]
###Generating ProductID for Products
newdf <- transform(mydf, ProductID = match(mydf$'Description', unique(mydf$'Description')))
all.equal(mydf,newdf) ## Only first 8 Column Names are left uncoded.
rm(mydf)
#head(newdf[with(newdf,order(ProductID)),])
#dim(newdf)
#nrow(unique(newdf))
###TRANSACTIONS SUMMARY TABLE
# ProductStat[PID, Description, TotalQuantity, TransactionFreq, UnitPrice]
library(plyr)
getTotalQuantity <- function(pid, npar = FALSE, print = FALSE)
{
totalQuantity <- sum(subset(newdf, ProductID == pid, select = c(Quantity)))
return(totalQuantity)
}
getTotalCustomers <- function(pid, npar = FALSE, print = FALSE)
{
return(length(unique(newdf$CustomerID[newdf$ProductID==pid])))
}
getMeanQuantityPerTransaction <- function(pid, npar = FALSE, print = FALSE)
{
meanQuantity <- mean(subset(newdf, ProductID == pid, select = c(Quantity))$Quantity)
return(meanQuantity)
}
range(newdf$ProductID) #1-3877
##UPDATED
productstatdf <- count(subset(newdf, ProductID = 1:3877, select = c(ProductID, Description, UnitPrice)), c('ProductID', 'Description'))
colnames(productstatdf)[3] <- "TransactionFreq"
#head(productstatdf)
totQuantity <- c()
totCustomers <- c()
meanQuantityPerTransaction <- c()
for (netproductcount in 1:3877)
{
totQuantity[netproductcount] <- getTotalQuantity(netproductcount)
totCustomers[netproductcount] <- getTotalCustomers(netproductcount)
meanQuantityPerTransaction[netproductcount] <- getMeanQuantityPerTransaction(netproductcount)
}
productstatdf$"TotalQuantity" <- totQuantity
productstatdf$"Customers" <- totCustomers
productstatdf$"MeanQuantityPerTransaction" <- meanQuantityPerTransaction
meanQuantityPerCustomer <- c()
for (netproductcount in 1:3877)
{
meanQuantityPerCustomer[netproductcount] <- productstatdf$"TotalQuantity"[productstatdf$ProductID==netproductcount]/productstatdf$"Customers"[productstatdf$ProductID==netproductcount]
}
productstatdf$"MeanQuantityPerCustomer" <- meanQuantityPerCustomer
rm(totCustomers, totQuantity, lost, meanQuantityPerCustomer, meanQuantityPerTransaction)
plotProductsInitial <- function(){
head(productstatdf)
##Histogram of productstatdf attributes
for (n in c(1,3,4,5,6,7))
{
hist(productstatdf[,n], xlab = colnames(productstatdf)[n], xlim= c(100,500), breaks = 100)
hist(productstatdf[,n], xlab = colnames(productstatdf)[n])
}
hist(productstatdf[,3], xlab = colnames(productstatdf)[3], breaks=100, xlim = c(0,2015), ylab = "No. of Different Products")
max(productstatdf[,3])
hist(productstatdf[,4], xlab = colnames(productstatdf)[4], breaks=100, xlim = c(0,81000), ylab = "No. of Different Products")
max(productstatdf[,4])
}
###Splitting Dataset into High Frequency, Medium Frequency and Low Frequency Transactions
orderedallproducts <- productstatdf[with(productstatdf, order(-TransactionFreq, TotalQuantity, ProductID)),]
orderdProductsStats(orderedallproducts)
orderdProductsStats <- function(orderedallproducts){
head(orderedallproducts)
tail(orderedallproducts)
max(orderedallproducts$ProductID)
summary(orderedallproducts$TransactionFreq)
}
transfreqsummary <- summary(orderedallproducts$TransactionFreq)
#summary(orderedallproducts$TransactionFreq)
## UPDATED
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 10.0 43.0 101.3 127.0 2015.0
lowfreqproducts <- subset(orderedallproducts, (ProductID = 1:3877)&(TransactionFreq < transfreqsummary[4]), select = c(ProductID, Description, TransactionFreq, TotalQuantity, Customers, MeanQuantityPerTransaction, MeanQuantityPerCustomer))
#range(lowfreqproducts$TransactionFreq)
#hist(lowfreqproducts$TransactionFreq)
#dim(lowfreqproducts) ##2707 Products <101.3
midfreqproducts <- subset(orderedallproducts, (ProductID = 1:3877)&(TransactionFreq > transfreqsummary[4])&(TransactionFreq < transfreqsummary[5]), select = c(ProductID, Description, TransactionFreq, TotalQuantity, Customers, MeanQuantityPerTransaction, MeanQuantityPerCustomer))
#range(midfreqproducts$TransactionFreq) #102-126
#hist(midfreqproducts$TransactionFreq)
#dim(midfreqproducts) ##198 Products ~ 774
highfreqproducts <- subset(orderedallproducts, (ProductID = 1:3877)&(TransactionFreq >= transfreqsummary[5]), select = c(ProductID, Description, TransactionFreq, TotalQuantity, Customers, MeanQuantityPerTransaction, MeanQuantityPerCustomer))
#range(highfreqproducts$TransactionFreq) #127-2015
#hist(highfreqproducts$TransactionFreq)
#dim(highfreqproducts) ##972 Products
#UPDATED
finaldataset <- orderedallproducts
#finaldataset <- orderedallproducts[ ! orderedallproducts$ProductID %in% lowfreqproducts$ProductID, ]
##UPDATED
##dim(finaldataset) #3877 products and 7 Attributes
#dim(finaldataset) #1170 Products [198 + 972 + ]
#Reattach UnitPrice of each product
upricein <- c()
earningpertransactions <- c()
index <- 0
for (currentid in finaldataset$ProductID) #3877 Total Products
{
index = index + 1
upricein[index] <- newdf$UnitPrice[newdf$ProductID==currentid]
earningpertransactions[index] <- newdf$UnitPrice[newdf$ProductID==currentid] * finaldataset$MeanQuantityPerTransaction[finaldataset$ProductID==currentid]
}
rm(index)
finaldataset$"UnitPrice" <- upricein
finaldataset$"MeanEarningPerTransaction" <- earningpertransactions
rm(upricein, earningpertransactions)
write.xlsx(orderedallproducts, "orderedallproductsNEW.xlsx")
write.xlsx(finaldataset, "finaldatasetNEW.xlsx")