%-------------------------------------------------------------------------------
% Document & Package declarations
%-------------------------------------------------------------------------------
\documentclass[a4paper, 10pt, conference]{ieeeconf}
\usepackage{graphicx}
\usepackage[colorlinks=true, allcolors=black]{hyperref}
\usepackage{tabularx}
%% Language and font encodings
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\usepackage[T1]{fontenc}
%% Useful packages
\usepackage{amsmath}
\usepackage{graphicx}
\usepackage[colorinlistoftodos]{todonotes}
\usepackage[font=footnotesize,labelfont=bf]{caption}
\usepackage[font=footnotesize,labelfont=bf]{subcaption}
%% Packages for displaying source code
\usepackage{listings}
% \usepackage[framed,numbered,autolinebreaks,useliterate]{mcode}
\usepackage{float}
\usepackage{longtable}
%% Packages for displaying source code
\usepackage[numbered,framed]{matlab-prettifier}
\usepackage{color}
%%*************************************************************************
%% Legal Notice:
%% This code is offered as-is without any warranty either expressed or
%% implied; without even the implied warranty of MERCHANTABILITY or
%% FITNESS FOR A PARTICULAR PURPOSE!
%% User assumes all risk.
%% In no event shall IEEE or any contributor to this code be liable for
%% any damages or losses, including, but not limited to, incidental,
%% consequential, or any other damages, resulting from the use or misuse
%% of any information contained here.
%%
%% All comments are the opinions of their respective authors and are not
%% necessarily endorsed by the IEEE.
%%
%% This work is distributed under the LaTeX Project Public License (LPPL)
%% ( http://www.latex-project.org/ ) version 1.3, and may be freely used,
%% distributed and modified. A copy of the LPPL, version 1.3, is included
%% in the base LaTeX documentation of all distributions of LaTeX released
%% 2003/12/01 or later.
%% Retain all contribution notices and credits.
%% ** Modified files should be clearly indicated as such, including **
%% ** renaming them and changing author support contact information. **
%%
%% File list of work: IEEEtran.cls, IEEEtran_HOWTO.pdf, bare_adv.tex,
%% bare_conf.tex, bare_jrnl.tex, bare_jrnl_compsoc.tex,
%% bare_jrnl_transmag.tex
%%*************************************************************************
%-------------------------------------------------------------------------------
% Document Configuration
%-------------------------------------------------------------------------------
\begin{document}
\title{Pattern Recognition - Distance Metrics and Neural Networks}
\author{Michael~Hart (00818445) and
Meng~Kiang~Seah (00699092)
\\
Department of Electrical and Electronic Engineering,
Imperial College London,
SW7 2AZ
\\
E-mail: \{mh1613, mks211\}@imperial.ac.uk}
\date{\today}
%-------------------------------------------------------------------------------
% Plan on what to write
%-------------------------------------------------------------------------------
% See coursework instructions at:
% https://bb.imperial.ac.uk/bbcswebdav/pid-1009576-dt-content-rid-3417906_1/courses/DSS-EE4_68-16_17/PRCoursework2.pdf
%-------------------------------------------------------------------------------
% Information Banner
%-------------------------------------------------------------------------------
\maketitle
%-------------------------------------------------------------------------------
% Abstract
%-------------------------------------------------------------------------------
\begin{abstract}
This report examines three approaches to classifying a data set of 178 wine instances, each described by 13 attributes and belonging to one of three classes: nearest neighbour classification under a range of distance metrics, K-means clustering followed by nearest-centroid classification, and neural networks built with MATLAB's Neural Network Toolbox. The effect of L2-normalising the data by dimension and of Mahalanobis-style transformations derived from class covariance matrices is evaluated throughout, and the network type, training algorithm, number of hidden neurons, and data partitioning are varied to minimise the classification error.
\end{abstract}
%-------------------------------------------------------------------------------
% Introduction
%-------------------------------------------------------------------------------
\section{Introduction}
There are a number of methods that can be used to classify data. The data set used here consists of 178 vectors describing wine, each containing 13 elements. The data is divided among three classes, containing 59, 71, and 48 instances respectively.
The data was split into a training set, a validation set, and a testing set in the ratio $118:20:40$. To ensure that each set received an appropriate mix of each class, the classes themselves were split: Class 1 was split $39:7:13$ between training, validation, and testing, Class 2 was split $47:8:16$, and Class 3 was split $32:5:11$.
\subsection{Nearest Neighbour Distance Metrics Classification}
The nearest neighbour (NN) classification system works by finding the closest training point to the test point, and assigning it the same class. The notion of ``closest'' is explored further with different distance metrics.
\subsection{K-Means Clustering and Classification}
K-means clustering groups data points into a predetermined number of clusters. An initial set of $k$ centroids is first seeded, and each training point is assigned to the closest centroid. Each centroid is then moved to the mean of the points assigned to it. This is repeated until no point changes centroid, or until a set number of iterations has completed \cite{kmeans}.
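As an illustration only (the experiments later use MATLAB's built-in \texttt{kmeans()}), the procedure can be sketched as follows; the function and variable names here are purely illustrative.
\begin{lstlisting}[style=Matlab-editor]
% Minimal sketch of Lloyd-style K-means iterations.
% X is an n-by-p data matrix; k is the number of clusters.
function [centroids, assignment] = kmeans_sketch(X, k, max_iter)
    n = size(X, 1);
    centroids = X(randperm(n, k), :);          % seed centroids from the data
    for iter = 1:max_iter
        % Assign each point to its closest centroid (Euclidean distance).
        dists = pdist2(X, centroids, 'euclidean');
        [~, assignment] = min(dists, [], 2);
        % Move each centroid to the mean of its assigned points.
        new_centroids = centroids;
        for j = 1:k
            members = X(assignment == j, :);
            if ~isempty(members)
                new_centroids(j, :) = mean(members, 1);
            end
        end
        if isequal(new_centroids, centroids)   % assignments stable: stop
            break;
        end
        centroids = new_centroids;
    end
end
\end{lstlisting}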
\subsection{Neural Networks}
Neural networks are a widely used approach to pattern recognition. Inspired by biological neurons, they consist of interconnected layers of simple units whose weights are adjusted during training. In this report, networks are created, trained, and tested with MATLAB's Neural Network Toolbox, and their parameters are varied to minimise the classification error on the wine data.
%-------------------------------------------------------------------------------
% Distance Metrics
%-------------------------------------------------------------------------------
\section{Distance Metrics}
\subsection{Data Preparation}
The data can be analysed in its raw form, but it can also be L2-normalised, meaning that it is divided by its L2 norm, defined in Equation \ref{eq:l2} for a $p$-dimensional vector.
\begin{align}
L_2 &= \sqrt{\sum_{i = 1}^{p} \left( x_i \right) ^2 }\label{eq:l2}
\end{align}
The question then becomes whether to normalise all 178 instances by instance (per row) or by dimension (per column). Visualised, normalising by instance would place every point at a Euclidean distance of 1 from the origin, so all points would lie on the surface of a hypersphere. In contrast, normalising by dimension shrinks the spread of the data so that every value fits in the range $[-1, 1]$. Following the same source used for the Support Vector Machines (SVMs) \cite{scale}, the sensible choice is to normalise by dimension. This prevents any one dimension from dominating subsequent distance measurements. Scaling each instance against itself, by contrast, would still allow the dimensions with larger variance to dominate.
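A minimal sketch of the per-dimension normalisation, assuming the instances are stored as the rows of a matrix (the variable names are illustrative):
\begin{lstlisting}[style=Matlab-editor]
% L2-normalise each dimension (column) of the data.
% data is an n-by-p matrix with one instance per row.
col_norms  = sqrt(sum(data .^ 2, 1));                     % L2 norm of each column
normalised = data ./ repmat(col_norms, size(data, 1), 1);
% Each column of 'normalised' now has unit L2 norm, so all values lie
% in [-1, 1] and no single dimension dominates the distance measures.
\end{lstlisting}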
\subsection{Covariances}
From the unmodified data and the normalised data, covariances can be calculated using either the full set of training data or each of the three classes individually. Thus, for each of the two types of data, there are four covariance matrices: one from all the data, and one each for Class 1, Class 2, and Class 3. The covariance matrices were obtained with MATLAB's built-in \texttt{cov} function.
For context, a covariance matrix is a square, symmetric matrix whose dimensions equal the number of dimensions in each data instance. The matrix $S$ has elements $s_{i,j}$ equal to the covariance of the $i$-th and $j$-th dimensions. The main diagonal holds the covariance of each dimension with itself, which is its variance \cite{cov}.
Analysing 13-by-13 tables of numbers is not easy. Therefore, a useful alternative is to view the matrices as colour maps, which are easily produced with MATLAB.
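A colour map of a covariance matrix can be produced along the following lines (a sketch; \texttt{training\_data} is an illustrative variable name):
\begin{lstlisting}[style=Matlab-editor]
% Covariance of the training data (one instance per row) as a colour map.
S = cov(training_data);      % 13-by-13 covariance matrix
figure;
imagesc(S);                  % draw the matrix as a colour map
colorbar;                    % show the colour scale
xlabel('Dimension');
ylabel('Dimension');
\end{lstlisting}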
\subsubsection{Unmodified Training Data}
Figure \ref{fig:covtrainingone} shows the colour map for the covariance of the training data. The other three are placed in the Appendix as Figure \ref{fig:covtrainingall} to conserve space, but they all resemble Figure \ref{fig:covtrainingone}. Because no scaling was applied to the data, dimension 13, which has the largest range, dominates the diagram.
\begin{figure}[!ht]
\centering
\includegraphics[width=\linewidth]{pic/covtraining.png}
\caption{Colour map of covariance of training data.}
\label{fig:covtrainingone}
\end{figure}
This is also evident in the mean values of each dimension for the four sets of unmodified data, shown in Table \ref{tbl:trainingmean}. Dimension 13 has the largest magnitude, which is the cause of the skewed colour maps.
\begin{table}[!ht]
\centering
\caption{Means of Unmodified Data}
\label{tbl:trainingmean}
\begin{tabular}{|r|llll|}
\hline
\textbf{Dimension} & \textbf{All Training} & \textbf{Class 1} & \textbf{Class 2} & \textbf{Class 3} \\ \hline
1 & 12.719 & 13.484 & 11.989 & 12.859\\
2 & 2.262 & 1.959 & 1.846 & 3.242\\
3 & 2.379 & 2.487 & 2.269 & 2.409\\
4 & 19.506 & 17.541 & 20.013 & 21.156\\
5 & 99.542 & 105.000 & 94.106 & 100.875\\
6 & 2.277 & 2.783 & 2.286 & 1.647\\
7 & 2.006 & 2.924 & 2.078 & 0.782\\
8 & 0.357 & 0.291 & 0.358 & 0.438\\
9 & 1.561 & 1.815 & 1.673 & 1.088\\
10 & 4.824 & 5.253 & 2.959 & 7.040\\
11 & 0.959 & 1.059 & 1.073 & 0.668\\
12 & 2.601 & 3.132 & 2.799 & 1.663\\
13 & 737.110 & 1070.385 & 522.213 & 646.562\\ \hline
\end{tabular}
\end{table}
\subsubsection{Normalised Training Data}
The same process is repeated with the normalised data. To save space, the covariance matrices for the different classes can be found in the Appendix as Figure \ref{fig:covl2all}, but Figure \ref{fig:covl2one} shows the colour plot for the covariance matrix of all the normalised training data. Along with that, Table \ref{tbl:l2mean} shows the mean values.
\begin{figure}[!ht]
\centering
\includegraphics[width=\linewidth]{pic/covl2.png}
\caption{Colour map of covariance of normalised training data.}
\label{fig:covl2one}
\end{figure}
\begin{table}[!ht]
\centering
\caption{Means of Normalised Data}
\label{tbl:l2mean}
\begin{tabular}{|r|llll|}
\hline
\textbf{Dimension} & \textbf{All Training} & \textbf{Class 1} & \textbf{Class 2} & \textbf{Class 3} \\ \hline
1 & 0.073 & 0.078 & 0.069 & 0.074\\
2 & 0.066 & 0.057 & 0.053 & 0.094\\
3 & 0.075 & 0.078 & 0.071 & 0.076\\
4 & 0.074 & 0.066 & 0.076 & 0.080\\
5 & 0.074 & 0.078 & 0.070 & 0.075\\
6 & 0.072 & 0.088 & 0.072 & 0.052\\
7 & 0.067 & 0.097 & 0.069 & 0.026\\
8 & 0.070 & 0.057 & 0.070 & 0.086\\
9 & 0.069 & 0.080 & 0.074 & 0.048\\
10 & 0.065 & 0.071 & 0.040 & 0.095\\
11 & 0.073 & 0.081 & 0.082 & 0.051\\
12 & 0.072 & 0.087 & 0.078 & 0.046\\
13 & 0.068 & 0.099 & 0.048 & 0.060\\ \hline
\end{tabular}
\end{table}
The means of the normalised data all lie in the same range, so the covariances do as well, and the matrices can be displayed on a common scale in the colour map. Since a covariance of zero is a necessary, but not sufficient, condition for independence, the dimension pairs with the lowest covariances are the ones most likely to be independent.
\subsubsection{Correlation Matrices and Analysis}
Another, clearer way to view the covariance matrix is to convert it to a correlation matrix using the built-in function \texttt{corrcov()}. A correlation matrix has ones on its main diagonal, and its entries are dimensionless values in the range $[-1, 1]$, as each covariance is scaled by the standard deviations of the two dimensions involved. A value of $1$ indicates a positive linear relationship, $-1$ a negative linear relationship, and $0$ no linear relationship at all.
Because the correlation matrix is a scaled version of the covariance matrix, the normalised and unmodified data yield identical correlation matrices. Once again, they can be found in the Appendix as Figure \ref{fig:corrall}. The limitation of this measurement is that it only ``detects'' linear relationships; dimension pairs related through some other function will not be reflected.
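For reference, the conversion and its visualisation can be sketched as follows, assuming a covariance matrix \texttt{S} computed as above:
\begin{lstlisting}[style=Matlab-editor]
% Convert a covariance matrix to a correlation matrix and plot it.
R = corrcov(S);              % ones on the diagonal, entries in [-1, 1]
figure;
imagesc(R, [-1 1]);          % fix the colour scale to the full range
colorbar;
\end{lstlisting}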
While the images are not included in the main body of the report, the takeaway is that the covariance matrices, correlation matrices, and mean values by dimension all vary according to whether all the training data is used or it is separated by class. These statistical measurements characterise and describe groups of data, and can therefore be used for classification.
\subsection{Nearest Neighbour Classification}
The Nearest Neighbour (NN) method assigns each test point the class of the closest training point. Different measurements were used to represent this concept of ``closeness''. The first is the L1, or Manhattan, distance, expressed in Equation \ref{eq:l1} for two $p$-dimensional vectors $x$ and $y$ \cite{metrics}.
\begin{align}
L_1 \left( x, y \right) &= \sum_{i = 1}^{p} \lvert x_i - y_i \rvert \label{eq:l1}
\end{align}
The second is the L2, or Euclidean, distance in Equation \ref{eq:l22} (the square root is omitted, as it does not change which neighbour is nearest). Equation \ref{eq:chi} shows the Chi-Squared formula, Equation \ref{eq:intersection} the histogram intersection formula, and Equation \ref{eq:corr} the correlation formula \cite{metrics}. The functions written to implement these metrics are in the Appendix.
\begin{align}
L_2 \left( x, y \right) &= \sum_{i = 1}^{p} \left( x_i - y_i \right) ^2 \label{eq:l22}\\
\mathcal{X}^2 \left( x, y \right) &= \frac{1}{2} \sum_{i = 1}^{p} \frac{\left( x_i - y_i \right) ^2 }{\left( x_i + y_i \right)} \label{eq:chi}\\
\cap \left( x, y \right) &= \sum_{i = 1}^{p} \min \left( x_i, y_i \right) \label{eq:intersection}\\
CrossCorr &= \sum_{i = 1}^{p} x_i y_i \label{eq:corr}
\end{align}
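The full NN functions for each metric are listed in the Appendix; as an example, a minimal sketch of the Chi-Squared distance of Equation \ref{eq:chi} is shown below (the small constant guarding against division by zero is an addition for illustration only).
\begin{lstlisting}[style=Matlab-editor]
% Chi-squared distance between two p-dimensional row vectors x and y.
% A small epsilon guards against division by zero when x_i + y_i = 0.
function d = chi_squared_sketch(x, y)
    epsilon = 1e-12;
    d = 0.5 * sum(((x - y) .^ 2) ./ (x + y + epsilon));
end
\end{lstlisting}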
The fifth method, the Mahalanobis distance, is defined for vectors $\mathbf{x} = [x_1, x_2, \ldots, x_p]$ and $\mathbf{y} = [y_1, y_2, \ldots, y_p]$ as shown in Equation \ref{eq:maha}. Originally, $\mathbf{x}$ and $\mathbf{y}$ were drawn from the same set of data and $\Sigma$ was the covariance matrix of that set. However, the metric can be generalised to use any positive semi-definite matrix $A$ (needed to ensure that the distance is never negative), as shown in Equation \ref{eq:maha2} \cite{metrics}.
\begin{align}
Maha \left( \mathbf{x}, \mathbf{y} \right) &= \left( \mathbf{x} - \mathbf{y} \right) ^T \Sigma^{-1} \left( \mathbf{x} - \mathbf{y} \right)\label{eq:maha}\\
\Sigma &= \text{Covariance Matrix}\notag\\
Maha \left( \mathbf{x}, \mathbf{y} \right)&= \left( \mathbf{x} - \mathbf{y} \right) ^T A \left( \mathbf{x} - \mathbf{y}\right)\label{eq:maha2}\\
&= \left( \mathbf{x} - \mathbf{y} \right) ^T G^T G \left( \mathbf{x} - \mathbf{y}\right)\notag\\
&= \left( G\mathbf{x} - G\mathbf{y} \right) ^T \left( G\mathbf{x} - G\mathbf{y}\right)\notag\\
&= L_2\left( G\mathbf{x}, G\mathbf{y} \right) \label{eq:maha3}
\end{align}
As Equation \ref{eq:maha3} shows, if $A$ is decomposed as $G^TG$, the metric becomes an L2 distance computed on transformed data. This is the approach taken for the NN classification. Four covariance matrices were obtained from the normalised training data: one from all of it, and one from each of the three classes \cite{metrics}.
The inverses were taken (MATLAB's \texttt{inv()}) and decomposed using the Cholesky decomposition (MATLAB's \texttt{chol()}).
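A sketch of this transformation, assuming a covariance matrix \texttt{S} of the normalised training data and instances stored as rows (variable names illustrative):
\begin{lstlisting}[style=Matlab-editor]
% Turn the Mahalanobis metric into an L2 metric on transformed data.
A = inv(S);                          % inverse covariance matrix
G = chol(A);                         % A = G' * G (Cholesky decomposition)
% Transform every instance, then apply the plain L2 NN classifier as before.
transformed_train = (G * training_data')';
transformed_test  = (G * testing_data')';
\end{lstlisting}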
The five metrics were used for NN classification with both the normalised and the unmodified data. Table \ref{tbl:nn} shows the accuracy. Table \ref{tbl:nnmaha} shows the accuracy of the Mahalanobis distance with different covariance matrices, again for both normalised and unmodified data. A full set of the confusion matrices is in the Appendix as Figures \ref{fig:nnconfuseunmod} and \ref{fig:nnconfusemod}.
\begin{table}[!ht]
\centering
\caption{Accuracy of Distance Metrics in NN Classification}
\label{tbl:nn}
\begin{tabular}{|r|lllll|}
\hline
\textbf{Type of Data} & \textbf{L1} & \textbf{L2} & \textbf{Chi-Squared} & \textbf{Intersection} & \textbf{Correlation}\\ \hline
Unmodified & 85.00 & 80.00 & 90.00 & 90.00 & 87.50\\
Normalised & 90.00 & 87.50 & 95.00 & 87.50 & 95.00\\ \hline
\end{tabular}
\end{table}
The first observation is that, overall, the normalised data yields a higher accuracy, although the improvement is slight. The lowest accuracy is 80\%, which is quite impressive for such a simple classification system. The metric with the highest accuracy on both sets of data is the Chi-Squared metric, while the lowest is the L2 metric.
\begin{table}[!ht]
\centering
\caption{Accuracy of Mahalanobis Distance in NN Classification}
\label{tbl:nnmaha}
\begin{tabular}{|r|llll|}
\hline
\textbf{Covariance Used} & L2 Training & L2 Class 1 & L2 Class 2 & L2 Class 3\\ \hline
\textbf{Unmodified} & 82.50 & 77.50 & 80.00 & 82.50\\
\textbf{Normalised} & 80.00 & 90.00 & 77.50 & 97.50\\ \hline
\end{tabular}
\end{table}
However, with the Mahalanobis distance, the improvement from normalising the data is no longer so apparent, appearing only in certain cases. This is because applying the transformation already results in some scaling of the data \cite{metrics}.
Transforming the data with the covariance matrix of Class 3 yielded the highest classification accuracy when applied to normalised data. The colour map in the Appendix shows how small its covariance values are after normalisation. This means the resulting $G$ transformation was such that, on average, the classes moved further apart from one another.
%-------------------------------------------------------------------------------
% K-means clustering
%-------------------------------------------------------------------------------
\section{K-means clustering}
As explained in the Introduction, K-means is a way of classifying data. MATLAB's built-in \texttt{kmeans()} function takes in data along with a value of $k$ and returns the centroids found by K-means clustering. This was applied to the training data, before the testing data was classified using those centroids.
As with the different distance metrics in the previous section, there are different metrics that MATLAB uses for determining the centroid positions. They are \texttt{sqeuclidean} (L2), \texttt{cityblock} (L1), \texttt{cosine}, and \texttt{correlation} \cite{kmeans}. There are also corresponding \texttt{pdist()} variations, which return a distance value depending on the selected metric \cite{pdist}.
These two functions were combined in the function found in the Appendix. The K-means clustering is performed for all four metrics, and the testing data is then classified using the same metric to find the closest centroid. It would not make sense to test with a metric other than the one used for clustering, as the notion of ``closest'' would then differ between clustering and classification.
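The procedure can be sketched as follows (variable names and the cluster-to-class labelling are illustrative, and \texttt{pdist2()} is used here for the test-to-centroid distances; the full function in the Appendix may differ in detail):
\begin{lstlisting}[style=Matlab-editor]
% Cluster the training data, then classify test points by nearest centroid
% using the same metric.
metric = 'cityblock';
[idx, C] = kmeans(training_data, k, 'Distance', metric);
% Label each cluster with the most common class among its training points.
cluster_class = zeros(k, 1);
for j = 1:k
    cluster_class(j) = mode(training_labels(idx == j));
end
% Assign each test point the class of its closest centroid.
dists = pdist2(testing_data, C, metric);
[~, nearest] = min(dists, [], 2);
predicted = cluster_class(nearest);
\end{lstlisting}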
Different ``versions'' of the data can be used with this K-means clustering. The types are listed in Table \ref{tbl:types}, while Table \ref{tbl:k3} shows the classification accuracy for $k=3$.
% a is normal data
% b is l2norm data
% c is normal data mahalanobis transformed by covariance of l2 data
% d is l2norm data mahalanobis transformed by covariance of l2 data
% e is normal data mahalanobis transformed by covariance of class 1 l2 data
% f is l2norm data mahalanobis transformed by covariance of class 1 l2 data
% g is normal data mahalanobis transformed by covariance of class 2 l2 data
% h is l2norm data mahalanobis transformed by covariance of class 2 l2 data
% i is normal data mahalanobis transformed by covariance of class 3 l2 data
% j is l2norm data mahalanobis transformed by covariance of class 3 l2 data
\begin{table}[!ht]
\centering
\caption{Data types used in K-means clustering for classification.}
\label{tbl:types}
\begin{tabular}{|r|l|}
\hline
\textbf{Data} & \textbf{Description} \\ \hline
(a) & Unmodified data\\
(b) & Normalised data\\
(c) & Unmodified data, transformed by covariance of normalised data\\
(d) & Normalised data, transformed by covariance of normalised data\\
(e) & Unmodified data, transformed by covariance of normalised Class 1 data\\
(f) & Normalised data, transformed by covariance of normalised Class 1 data\\
(g) & Unmodified data, transformed by covariance of normalised Class 2 data\\
(h) & Normalised data, transformed by covariance of normalised Class 2 data\\
(i) & Unmodified data, transformed by covariance of normalised Class 3 data\\
(j) & Normalised data, transformed by covariance of normalised Class 3 data\\\hline
\end{tabular}
\end{table}
\begin{table}[!ht]
\centering
\caption{Accuracy of Distance Metrics in K-Means Clustering for $k = 3$}
\label{tbl:k3}
\begin{tabular}{|r|lllll|}
\hline
\textbf{Data} & \textbf{Euclidean} & \textbf{Cityblock} & \textbf{Cosine} & \textbf{Correlation} & \textbf{Average} \\ \hline
(a) & 75.00 & 75.00 & 75.00 & 72.50 & 74.38\\
(b) & 87.50 & 85.00 & 85.00 & 85.00 & 85.62\\
(c) & 75.00 & 75.00 & 72.50 & 72.50 & 73.75\\
(d) & 52.50 & 85.00 & 72.50 & 65.00 & 68.75\\
(e) & 72.50 & 75.00 & 72.50 & 72.50 & 73.12\\
(f) & 80.00 & 87.50 & 80.00 & 77.50 & 81.25\\
(g) & 75.00 & 75.00 & 72.50 & 62.50 & 71.25\\
(h) & 87.50 & 62.50 & 92.50 & 97.50 & 85.00\\
(i) & 75.00 & 75.00 & 72.50 & 62.50 & 71.25\\
(j) & 92.50 & 90.00 & 92.50 & 92.50 & 91.88\\
\textbf{Average} & 77.25 & 78.50 & 78.75 & 76.00 & NIL \\ \hline
\end{tabular}
\end{table}
Overall for $k=3$, the normalised data yielded better results than the unmodified data, with the only outlier being type (d). Judging by the average classification accuracy per metric, the best metric is a close tie between \texttt{cityblock} and \texttt{cosine}. Additionally, transforming the data through a covariance matrix only helps if the covariance comes from a specific class, as all of those data types outperformed (c) and (d). However, none of them outperformed the untransformed data, with the exception of those transformed by the Class 3 covariance. This matches the results in Table \ref{tbl:nnmaha}; the smaller values of the scaled Class 3 covariance matrix are again the likely cause, as explained earlier.
\begin{table}[!ht]
\centering
\caption{Accuracy of Distance Metrics in K-Means Clustering for $k = 10$}
\label{tbl:k10}
\begin{tabular}{|r|lllll|}
\hline
\textbf{Data} & \textbf{Euclidean} & \textbf{Cityblock} & \textbf{Cosine} & \textbf{Correlation} & \textbf{Average}\\ \hline
(a) & 72.50 & 75.00 & 72.50 & 70.00 & 72.50\\
(b) & 90.00 & 90.00 & 90.00 & 77.50 & 86.88\\
(c) & 67.50 & 75.00 & 72.50 & 70.00 & 71.25\\
(d) & 77.50 & 80.00 & 80.00 & 85.00 & 80.62\\
(e) & 77.50 & 72.50 & 67.50 & 72.50 & 72.50\\
(f) & 92.50 & 87.50 & 87.50 & 92.50 & 90.00\\
(g) & 70.00 & 77.50 & 65.00 & 60.00 & 68.12\\
(h) & 82.50 & 90.00 & 92.50 & 90.00 & 88.75\\
(i) & 75.00 & 72.50 & 70.00 & 70.00 & 71.88\\
(j) & 90.00 & 90.00 & 97.50 & 97.50 & 93.75\\
\textbf{Average} & 79.50 & 81.00 & 79.50 & 78.50 & NIL \\ \hline
\end{tabular}
\end{table}
For $k=10$, the difference is quite significant: there is better accuracy across the data types overall. Having more clusters than classes means that several clusters are assigned to a class. This can help capture the structure of the data, for example if a single class contains data from two different sources with different statistical characteristics.
The metric with the highest average classification accuracy is \texttt{cityblock}. Scaling the data yields an improvement overall. Once again, the clear ``winner'' is the data type (j), which has been transformed using the covariance of the Class 3 training data.
%-------------------------------------------------------------------------------
% Neural Network
%-------------------------------------------------------------------------------
\section{Neural Network}
% Using Matlab Neural Network toolbox create a network, train and test with the wine data. Vary the parameters of the network to minimize the classification error. Report the observations and compare to the results from Q1 and Q2. Generate another random split of train/test/validation data and repeat the experiment.
Neural networks are an often-used method of pattern recognition, owing to their high accuracy and ease of use. They are flexible and can be adapted to most pattern recognition problems; the use of Convolutional Neural Networks (CNNs) in image recognition, for example, is an industry standard.
Neural networks take inspiration from biological neurons, which are interconnected and transmit information through sudden electrochemical spikes. This process can be simulated in software, including with MATLAB's Neural Network Toolbox (NNT), which allows the creation, training, and testing of various types of neural network. Figures \ref{fig:nnt_perf} and \ref{fig:nnt_vis} show the NNT performance evaluation and the visualisation of an example network respectively, for a fitting network with two hidden layers of 10 neurons each.
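A network like the one visualised can be created and trained along the following lines (a sketch; the toolbox expects one instance per column, and the target encoding shown is one possible choice):
\begin{lstlisting}[style=Matlab-editor]
% Fitting network with two hidden layers of 10 neurons each.
% inputs:  13-by-n matrix, one instance per column.
% targets: 3-by-n matrix of one-hot class labels.
net = fitnet([10 10], 'trainlm');           % hidden sizes, training algorithm
[net, tr] = train(net, inputs, targets);
outputs = net(inputs);                      % network predictions
mse_val = perform(net, targets, outputs);   % mean squared error
\end{lstlisting}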
\begin{figure}[!ht]
\centering
\includegraphics[width=\linewidth]{pic/performance}
\caption{NNT Performance Evaluation}
\label{fig:nnt_perf}
\end{figure}
\begin{figure}[!ht]
\centering
\includegraphics[width=\linewidth]{pic/view_twolayer}
\caption{NNT Network Visualisation}
\label{fig:nnt_vis}
\end{figure}
This section uses neural networks to minimise the Mean Squared Error (MSE), taken as the measure of network performance. First, the number of hidden neurons in a single-layer fitting network is varied; results for the standard partition are shown in Table \ref{tbl:unmixed}. The fitting network was used because it is one of the most flexible types offered by the NNT, and most networks reach near-optimal results with only a single hidden layer. The code used to generate these results is shown in the Appendix.
The two training algorithms with the minimum MSE were determined; the results are shown in Figure \ref{fig:unmixed_full}. The effect of overfitting is clearly visible in the large spike for both algorithms after around 100 hidden neurons. The comparison between the two algorithms is seen more clearly in Figure \ref{fig:unmixed_limited}, which limits the x-axis to 100 hidden neurons.
\begin{figure}[!ht]
\centering
\includegraphics[width=\linewidth]{pic/unmixed_best_fullrange.png}
\caption{MSE of best performing algorithms}
\label{fig:unmixed_full}
\end{figure}
\begin{figure}[!ht]
\centering
\includegraphics[width=\linewidth]{pic/unmixed_best_limited.png}
\caption{MSE of best performing algorithms with limited axis}
\label{fig:unmixed_limited}
\end{figure}
The graphs show a large variation in MSE at lower numbers of hidden neurons, but overall there is a downward trend towards the apparent optimum of 25 hidden neurons; below this point the network is underfitting. After the lowest point there is an upward trend, which indicates overfitting. The optimal number of neurons is therefore likely between 10 and 50. The best performing algorithm on this data set is \textit{trainbfg}.
However, although the standard partition provides an even comparison with the other tasks, it is not the optimal way of training a network; a random partition is. Table \ref{tbl:mixed} therefore shows the results of repartitioning with the same ratios but in a random order. The corresponding graph is shown in Figure \ref{fig:mixed_limited}, already limited to 100 neurons because of the same overfitting effect seen with the standard partition.
With randomly partitioned data, the most effective training algorithm is \textit{trainlm} with 15 hidden neurons, but the graph shows that \textit{trainbr} is the more effective algorithm overall, even reducing the effects of overfitting. This contrasts with the standard partition, where other algorithms appear more effective; more surprisingly, the random partition also shows a lower MSE overall. This implies that random partitioning is a more effective way of training a neural network than a fixed or contiguous partition, further backed up by the first results reaching 100\% classification accuracy. The random-partition data also suggests that the optimum number of hidden neurons lies between 10 and 50.
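The two partitioning schemes correspond to different data-division settings on the network; a sketch of the distinction is shown below (the index vectors for the standard partition are illustrative placeholders):
\begin{lstlisting}[style=Matlab-editor]
% Random partition: let the toolbox split the data at random.
net.divideFcn = 'dividerand';
net.divideParam.trainRatio = 118/178;
net.divideParam.valRatio   = 20/178;
net.divideParam.testRatio  = 40/178;

% Standard (fixed) partition: supply explicit index sets instead.
% net.divideFcn = 'divideind';
% net.divideParam.trainInd = train_indices;
% net.divideParam.valInd   = val_indices;
% net.divideParam.testInd  = test_indices;
\end{lstlisting}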
\begin{table}
\centering
\caption{Standard Partition - Hidden Neurons Comparison}
\label{tbl:unmixed}
\begin{tabular}{lllll}
\hline
\textbf{Algorithm} & \textbf{Neurons} & \textbf{Time to Train (s)} & \textbf{Accuracy} & \textbf{MSE} \\ \hline
trainbfg & 15 & 0.15658 & 0.9 & 0.10695 \\ \hline
trainbfg & 20 & 0.23614 & 0.8 & 0.13496 \\ \hline
trainbfg & 25 & 0.82571 & 0.95 & 0.054725 \\ \hline
trainbfg & 30 & 0.76897 & 0.95 & 0.069127 \\ \hline
trainbfg & 35 & 0.70948 & 0.85 & 0.10906 \\ \hline
traincgp & 15 & 0.066146 & 0.825 & 0.16928 \\ \hline
traincgp & 20 & 0.09904 & 0.875 & 0.12706 \\ \hline
traincgp & 25 & 0.10311 & 0.95 & 0.05694 \\ \hline
traincgp & 30 & 0.075015 & 0.85 & 0.19212 \\ \hline
traincgp & 35 & 0.11164 & 0.85 & 0.13576 \\ \hline
\end{tabular}
\end{table}
\begin{figure}[!ht]
\centering
\includegraphics[width=\linewidth]{pic/mixed_best_limited.png}
\caption{MSE of best performing algorithms with limited axis with random partition}
\label{fig:mixed_limited}
\end{figure}
\begin{table}
\centering
\caption{Random Partition - Hidden Neurons Comparison}
\label{tbl:mixed}
\begin{tabular}{lllll}
\hline
\textbf{Algorithm} & \textbf{Neurons} & \textbf{Time to Train (s)} & \textbf{Accuracy} & \textbf{MSE} \\ \hline
trainbr & 15 & 5.8827 & 0.975 & 0.028823 \\ \hline
trainbr & 20 & 8.1333 & 0.975 & 0.020667 \\ \hline
trainbr & 25 & 14.6828 & 1 & 0.015294 \\ \hline
trainbr & 30 & 20.2646 & 0.975 & 0.021972 \\ \hline
trainbr & 35 & 28.5412 & 1 & 0.016915 \\ \hline
trainlm & 15 & 0.0657085 & 0.95 & 0.036084 \\ \hline
trainlm & 20 & 0.0607793 & 1 & 0.015435 \\ \hline
trainlm & 25 & 0.0679456 & 1 & 0.019838 \\ \hline
trainlm & 30 & 0.0846978 & 0.85 & 0.10142 \\ \hline
trainlm & 35 & 0.0921787 & 0.975 & 0.065321 \\ \hline
\end{tabular}
\end{table}
% TODO Decide whether to visualise matlab's performance stuff. Should just be in pic folder, but where does it fit?
The examination of neural networks thus far has shown that the most effective number of neurons for a single-layer fitting network lies approximately in the range 10--50. To further minimise the MSE, multiple types of network were investigated, along with the number of layers per network. The code for this may be found in the Appendix; the results with minimum MSE are shown in Tables \ref{tbl:unmixed_networks} and \ref{tbl:mixed_networks}, for the standard and random partitions respectively.
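The other network types are constructed in the same way as the fitting network; for example (a sketch with illustrative layer sizes):
\begin{lstlisting}[style=Matlab-editor]
% Pattern-recognition network: three hidden layers of 20 neurons,
% trained with scaled conjugate gradient.
net_pattern = patternnet(repmat(20, 1, 3), 'trainscg');

% Plain feed-forward network: three hidden layers of 15 neurons.
net_ff = feedforwardnet(repmat(15, 1, 3), 'traincgp');

% Both are then trained and evaluated exactly as the fitting network above.
\end{lstlisting}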
% Tables of interesting data from network and layers comparison
\begin{table}
\centering
\caption{Standard Partition - Network and Layer Comparison}
\label{tbl:unmixed_networks}
\begin{tabular}{llllll}
\hline
\textbf{Network} & \textbf{Algorithm} & \textbf{Layers} & \textbf{Neurons} & \textbf{$T_{train}$} & \textbf{MSE} \\ \hline
pattern & traincgf & 3 & 20 & 0.086992 & 0.034907 \\ \hline
pattern & trainscg & 3 & 20 & 0.065218 & 0.040891 \\ \hline
pattern & trainlm & 2 & 10 & 0.083229 & 0.042546 \\ \hline
pattern & traincgp & 1 & 40 & 0.053625 & 0.0484 \\ \hline
fit & trainbfg & 2 & 30 & 38.0862 & 0.050361 \\ \hline
\end{tabular}
\end{table}
\begin{table}
\centering
\caption{Random Partition - Network and Layer Comparison}
\label{tbl:mixed_networks}
\begin{tabular}{llllll}
\hline
\textbf{Network} & \textbf{Algorithm} & \textbf{Layers} & \textbf{Neurons} & \textbf{$T_{train}$} & \textbf{MSE} \\ \hline
fit & trainbr & 2 & 10 & 6.7953 & 0.01676 \\ \hline
fit & trainbr & 1 & 10 & 0.9048 & 0.01691 \\ \hline
pattern & trainlm & 3 & 25 & 4.2124 & 0.01707 \\ \hline
pattern & trainscg & 3 & 15 & 0.0668& 0.01846 \\ \hline
feedforward & traincgp & 3 & 15 & 0.1105 & 0.01892 \\ \hline
\end{tabular}
\end{table}
The standard partition shows that a different type of network proved most effective: \textit{patternnet}, the network intended for pattern recognition applications such as this one. However, the MSE with the standard partition is still greater than that attained with randomly partitioned data, for which a fitting network with the \textit{trainbr} algorithm is the most effective method, matching the results of the previous data set very closely. In fact, the previous data set showed a lower MSE using \textit{trainbr} with a single layer of 25 neurons, which was not investigated here because of the large training time required.
Overall, the data in this section suggests that the most effective way to minimise the error of a neural network on this task is to randomly partition the data and use a fitting network with a single layer of 25 neurons and the \textit{trainbr} training algorithm. More layers of 25 neurons might improve accuracy slightly, but at the cost of training and testing time for a minimal decrease in MSE.
% TODO compare MSE and accuracy with other tasks
%-------------------------------------------------------------------------------
% Conclusion
%-------------------------------------------------------------------------------
\section{Conclusion}
Three classification approaches were applied to the wine data set. Nearest neighbour classification performed best with the Chi-Squared metric on normalised data, and transforming the data with the Class 3 covariance raised the Mahalanobis-based accuracy to 97.5\%. K-means clustering benefited from the same normalisation and Class 3 transformation, with $k=10$ outperforming $k=3$. The neural networks achieved the lowest error overall, with a randomly partitioned fitting network trained by \textit{trainbr} reaching 100\% classification accuracy, making it the most effective of the methods examined.
%-------------------------------------------------------------------------------
% References
%-------------------------------------------------------------------------------
\bibliographystyle{unsrt}
\bibliography{pr_refs}
%-------------------------------------------------------------------------------
% Appendix(ces)
%-------------------------------------------------------------------------------
\onecolumn
\section*{Appendix}
\subsection*{All Colour Maps of Covariance Matrices}
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covtraining.png}
\caption{Covariance of all.}
\label{fig:covtraining}
\end{subfigure}
~
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covclass1.png}
\caption{Covariance of Class 1.}
\label{fig:covclass1}
\end{subfigure}
\\
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covclass2.png}
\caption{Covariance of Class 2.}
\label{fig:covclass2}
\end{subfigure}
~
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covclass3.png}
\caption{Covariance of Class 3.}
\label{fig:covclass3}
\end{subfigure}
\caption{Visualised colour maps of covariance matrices from unmodified data.}
\label{fig:covtrainingall}
\end{figure}
\newpage
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covl2.png}
\caption{Covariance of all.}
\label{fig:covl2}
\end{subfigure}
~
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covl2class1.png}
\caption{Covariance of Class 1.}
\label{fig:covl2class1}
\end{subfigure}
\\
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covl2class2.png}
\caption{Covariance of Class 2.}
\label{fig:covl2class2}
\end{subfigure}
~
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covl2class3.png}
\caption{Covariance of Class 3.}
\label{fig:covl2class3}
\end{subfigure}
\caption{Visualised colour maps of covariance matrices from L2-normalised data.}
\label{fig:covl2all}
\end{figure}
\newpage
\subsection*{All Colour Maps of Correlation Matrices}
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/corrl2.png}
\caption{Correlation of all.}
\label{fig:corrtraining}
\end{subfigure}
~
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/corrclass1.png}
\caption{Correlation of Class 1.}
\label{fig:corrclass1}
\end{subfigure}
\\
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/corrclass2.png}
\caption{Correlation of Class 2.}
\label{fig:corrclass2}
\end{subfigure}
~
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/corrclass3.png}
\caption{Correlation of Class 3.}
\label{fig:corrclass3}
\end{subfigure}
\caption{Visualised colour maps of correlation matrices. As correlation is a normalised value, the colour maps are identical for both the normalised and the unmodified data. }
\label{fig:corrall}
\end{figure}
\newpage
\subsection*{Confusion Matrices From NN Classification}
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_l1.png}
\caption{Unmodified data with L1 metric. }
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_l2.png}
\caption{Unmodified data with L2 metric.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_chi.png}
\caption{Unmodified data with Chi-Squared metric.}
\end{subfigure}
\\
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_inter.png}
\caption{Unmodified data with Intersection metric.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_corr.png}
\caption{Unmodified data with Correlation metric.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_cov_l2.png}
\caption{Unmodified data with Mahalanobis metric, L2 covariance.}
\end{subfigure}
\\
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_cov_l2_class1.png}
\caption{Unmodified data with Mahalanobis metric, L2 class 1 covariance.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_cov_l2_class2.png}
\caption{Unmodified data with Mahalanobis metric, L2 class 2 covariance.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_cov_l2_class3.png}
\caption{Unmodified data with Mahalanobis metric, L2 class 3 covariance.}
\end{subfigure}
\caption{Confusion matrices for the different NN classification metrics. The overall classification success and error can be seen in the bottom right square of each matrix. }
\label{fig:nnconfuseunmod}
\end{figure}
\newpage
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_l1.png}
\caption{Normalised data with L1 metric. }
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_l2.png}
\caption{Normalised data with L2 metric.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_chi.png}
\caption{Normalised data with Chi-Squared metric.}
\end{subfigure}
\\
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_inter.png}
\caption{Normalised data with Intersection metric.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_corr.png}
\caption{Normalised data with Correlation metric.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_cov_l2.png}
\caption{Normalised data with Mahalanobis metric, L2 covariance.}
\end{subfigure}
\\
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_cov_l2_class1.png}
\caption{Normalised data with Mahalanobis metric, L2 class 1 covariance.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_cov_l2_class2.png}
\caption{Normalised data with Mahalanobis metric, L2 class 2 covariance.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_cov_l2_class3.png}
\caption{Normalised data with Mahalanobis metric, L2 class 3 covariance.}
\end{subfigure}
\caption{Confusion matrices for the different NN classification metrics. The overall classification success and error can be seen in the bottom right square of each matrix. }
\label{fig:nnconfusemod}
\end{figure}
\newpage
\subsection*{Nearest Neighbour Function Using L1 Metric}
\lstinputlisting[style=Matlab-editor]{src/nearest_neighbour_l1.m}
\subsection*{Nearest Neighbour Function Using L2 Metric}
\lstinputlisting[style=Matlab-editor]{src/nearest_neighbour_l2.m}
\newpage
\subsection*{Nearest Neighbour Function Using Chi Squared Metric}
\lstinputlisting[style=Matlab-editor]{src/nearest_neighbour_chi.m}
\subsection*{Nearest Neighbour Function Using Histogram Intersection Metric}
\lstinputlisting[style=Matlab-editor]{src/nearest_neighbour_intersection.m}
\newpage
\subsection*{Nearest Neighbour Function Using Correlation Metric}
\lstinputlisting[style=Matlab-editor]{src/nearest_neighbour_corr.m}
\subsection*{Nearest Cluster Function Using \texttt{pdist()}}
\lstinputlisting[style=Matlab-editor]{src/nearest_cluster.m}
\newpage
\subsection*{K-Means Clustering Function Using All Four Metrics}
\lstinputlisting[style=Matlab-editor]{src/kmeanmetrics.m}
\newpage
\subsection*{Test set code for Task 3}
\lstinputlisting[style=Matlab-editor]{src/task3_all.m}
\newpage
\subsection*{Making and testing of neural network}
\lstinputlisting[style=Matlab-editor]{src/make_test_nn.m}
\newpage
\subsection*{Test code for various layers/networks for Task 3}
\lstinputlisting[style=Matlab-editor]{src/task3_networks.m}
\newpage
\end{document}