%-------------------------------------------------------------------------------
% Document & Package declarations
%-------------------------------------------------------------------------------
\documentclass[a4paper, 10pt, conference]{ieeeconf}
\usepackage{graphicx}
\usepackage[colorlinks=true, allcolors=black]{hyperref}
\usepackage{tabularx}
%% Language and font encodings
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\usepackage[T1]{fontenc}
%% Useful packages
\usepackage{amsmath}
\usepackage{graphicx}
\usepackage[colorinlistoftodos]{todonotes}
\usepackage[font=footnotesize,labelfont=bf]{caption}
\usepackage[font=footnotesize,labelfont=bf]{subcaption}
%% Packages for displaying source code
\usepackage{listings}
% \usepackage[framed,numbered,autolinebreaks,useliterate]{mcode}
\usepackage{float}
\usepackage{longtable}
%% Packages for displaying source code
\usepackage[numbered,framed]{matlab-prettifier}
\usepackage{color}
%%*************************************************************************
%% Legal Notice:
%% This code is offered as-is without any warranty either expressed or
%% implied; without even the implied warranty of MERCHANTABILITY or
%% FITNESS FOR A PARTICULAR PURPOSE!
%% User assumes all risk.
%% In no event shall IEEE or any contributor to this code be liable for
%% any damages or losses, including, but not limited to, incidental,
%% consequential, or any other damages, resulting from the use or misuse
%% of any information contained here.
%%
%% All comments are the opinions of their respective authors and are not
%% necessarily endorsed by the IEEE.
%%
%% This work is distributed under the LaTeX Project Public License (LPPL)
%% ( http://www.latex-project.org/ ) version 1.3, and may be freely used,
%% distributed and modified. A copy of the LPPL, version 1.3, is included
%% in the base LaTeX documentation of all distributions of LaTeX released
%% 2003/12/01 or later.
%% Retain all contribution notices and credits.
%% ** Modified files should be clearly indicated as such, including **
%% ** renaming them and changing author support contact information. **
%%
%% File list of work: IEEEtran.cls, IEEEtran_HOWTO.pdf, bare_adv.tex,
%% bare_conf.tex, bare_jrnl.tex, bare_jrnl_compsoc.tex,
%% bare_jrnl_transmag.tex
%%*************************************************************************
%-------------------------------------------------------------------------------
% Document Configuration
%-------------------------------------------------------------------------------
\begin{document}
\title{Pattern Recognition - Distance Metrics and Neural Networks}
\author{Michael~Hart (00818445) and
Meng~Kiang~Seah (00699092)
\\
Department of Electrical and Electronic Engineering,
Imperial College London,
SW7 2AZ
\\
E-mail: \{mh1613, mks211\}@imperial.ac.uk}
\date{\today}
%-------------------------------------------------------------------------------
% Plan on what to write
%-------------------------------------------------------------------------------
% See coursework instructions at:
% https://bb.imperial.ac.uk/bbcswebdav/pid-1009576-dt-content-rid-3417906_1/courses/DSS-EE4_68-16_17/PRCoursework2.pdf
%-------------------------------------------------------------------------------
% Information Banner
%-------------------------------------------------------------------------------
\maketitle
%-------------------------------------------------------------------------------
% Abstract
%-------------------------------------------------------------------------------
\begin{abstract}
This report examines three approaches to classifying a data set of 178 wine instances, each described by 13 attributes and belonging to one of three classes: nearest neighbour classification under a range of distance metrics, K-means clustering followed by nearest-centroid classification, and neural networks built with MATLAB's Neural Network Toolbox. The effect of L2-normalising the data by dimension and of Mahalanobis-style transformations derived from class covariance matrices is evaluated throughout, and the network type, training algorithm, number of hidden neurons, and data partitioning are varied to minimise the classification error.
\end{abstract}
%-------------------------------------------------------------------------------
% Introduction
%-------------------------------------------------------------------------------
\section{Introduction}
There are a number of methods that can be used to classify data. The data set used here consists of 178 vectors describing wine, each containing 13 elements. The data is divided among three classes, containing 59, 71, and 48 instances respectively.
The data was split into a training set, a validation set, and a testing set in the ratio $118:20:40$. To ensure that each set received an appropriate mix of each class, the classes themselves were split: Class 1 was split $39:7:13$ between training, validation, and testing, Class 2 was split $47:8:16$, and Class 3 was split $32:5:11$.
\subsection{Nearest Neighbour Distance Metrics Classification}
The nearest neighbour (NN) classification system works by finding the closest training point to the test point, and assigning it the same class. The notion of ``closest'' is explored further with different distance metrics.
\subsection{K-Means Clustering and Classification}
K-means clustering groups data points into a predetermined number of clusters. An initial set of $k$ centroids is first seeded, and each training point is assigned to the closest centroid. Each centroid is then moved to the mean of the points assigned to it. This is repeated until no point changes centroid, or until a set number of iterations has completed \cite{kmeans}.
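As an illustration only (the experiments later use MATLAB's built-in \texttt{kmeans()}), the procedure can be sketched as follows; the function and variable names here are purely illustrative.
\begin{lstlisting}[style=Matlab-editor]
% Minimal sketch of Lloyd-style K-means iterations.
% X is an n-by-p data matrix; k is the number of clusters.
function [centroids, assignment] = kmeans_sketch(X, k, max_iter)
    n = size(X, 1);
    centroids = X(randperm(n, k), :);          % seed centroids from the data
    for iter = 1:max_iter
        % Assign each point to its closest centroid (Euclidean distance).
        dists = pdist2(X, centroids, 'euclidean');
        [~, assignment] = min(dists, [], 2);
        % Move each centroid to the mean of its assigned points.
        new_centroids = centroids;
        for j = 1:k
            members = X(assignment == j, :);
            if ~isempty(members)
                new_centroids(j, :) = mean(members, 1);
            end
        end
        if isequal(new_centroids, centroids)   % assignments stable: stop
            break;
        end
        centroids = new_centroids;
    end
end
\end{lstlisting}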
\subsection{Neural Networks}
Neural networks are a widely used approach to pattern recognition. Inspired by biological neurons, they consist of interconnected layers of simple units whose weights are adjusted during training. In this report, networks are created, trained, and tested with MATLAB's Neural Network Toolbox, and their parameters are varied to minimise the classification error on the wine data.
%-------------------------------------------------------------------------------
% Distance Metrics
%-------------------------------------------------------------------------------
\section{Distance Metrics}
\subsection{Data Preparation}
The data can be analysed in its raw form, but it can also be L2-normalised, meaning that it is divided by its L2 norm, defined in Equation \ref{eq:l2} for a $p$-dimensional vector.
\begin{align}
L_2 &= \sqrt{\sum_{i = 1}^{p} \left( x_i \right) ^2 }\label{eq:l2}
\end{align}
The question then becomes whether to normalise all 178 instances by instance (per row) or by dimension (per column). Visualised, normalising by instance would place every point at a Euclidean distance of 1 from the origin, so all points would lie on the surface of a hypersphere. In contrast, normalising by dimension shrinks the spread of the data so that every value fits in the range $[-1, 1]$. Following the same source used for the Support Vector Machines (SVMs) \cite{scale}, the sensible choice is to normalise by dimension. This prevents any one dimension from dominating subsequent distance measurements. Scaling each instance against itself, by contrast, would still allow the dimensions with larger variance to dominate.
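A minimal sketch of the per-dimension normalisation, assuming the instances are stored as the rows of a matrix (the variable names are illustrative):
\begin{lstlisting}[style=Matlab-editor]
% L2-normalise each dimension (column) of the data.
% data is an n-by-p matrix with one instance per row.
col_norms  = sqrt(sum(data .^ 2, 1));                     % L2 norm of each column
normalised = data ./ repmat(col_norms, size(data, 1), 1);
% Each column of 'normalised' now has unit L2 norm, so all values lie
% in [-1, 1] and no single dimension dominates the distance measures.
\end{lstlisting}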
\subsection{Covariances}
From the unmodified data and the normalised data, covariances can be calculated using either the full set of training data or each of the three classes individually. Thus, for each of the two types of data, there are four covariance matrices: one from all the data, and one each for Class 1, Class 2, and Class 3. The covariance matrices were obtained with MATLAB's built-in \texttt{cov} function.
For context, a covariance matrix is a square, symmetric matrix whose dimensions equal the number of dimensions in each data instance. The matrix $S$ has elements $s_{i,j}$ equal to the covariance of the $i$-th and $j$-th dimensions. The main diagonal holds the covariance of each dimension with itself, which is its variance \cite{cov}.
Analysing 13-by-13 tables of numbers is not easy. Therefore, a useful alternative is to view the matrices as colour maps, which are easily produced with MATLAB.
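A colour map of a covariance matrix can be produced along the following lines (a sketch; \texttt{training\_data} is an illustrative variable name):
\begin{lstlisting}[style=Matlab-editor]
% Covariance of the training data (one instance per row) as a colour map.
S = cov(training_data);      % 13-by-13 covariance matrix
figure;
imagesc(S);                  % draw the matrix as a colour map
colorbar;                    % show the colour scale
xlabel('Dimension');
ylabel('Dimension');
\end{lstlisting}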
\subsubsection{Unmodified Training Data}
Figure \ref{fig:covtrainingone} shows the colour map for the covariance of the training data. The other three are placed in the Appendix as Figure \ref{fig:covtrainingall} to conserve space, but they all resemble Figure \ref{fig:covtrainingone}. Because no scaling was applied to the data, dimension 13, which has the largest range, dominates the diagram.
\begin{figure}[!ht]
\centering
\includegraphics[width=\linewidth]{pic/covtraining.png}
\caption{Colour map of covariance of training data.}
\label{fig:covtrainingone}
\end{figure}
This is also evident in the mean values of each dimension for the four sets of unmodified data, shown in Table \ref{tbl:trainingmean}. Dimension 13 has the largest magnitude, which is the cause of the skewed colour maps.
\begin{table}[!ht]
\centering
\caption{Means of Unmodified Data}
\label{tbl:trainingmean}
\begin{tabular}{|r|llll|}
\hline
\textbf{Dimension} & \textbf{All Training} & \textbf{Class 1} & \textbf{Class 2} & \textbf{Class 3} \\ \hline
1 & 12.719 & 13.484 & 11.989 & 12.859\\
2 & 2.262 & 1.959 & 1.846 & 3.242\\
3 & 2.379 & 2.487 & 2.269 & 2.409\\
4 & 19.506 & 17.541 & 20.013 & 21.156\\
5 & 99.542 & 105.000 & 94.106 & 100.875\\
6 & 2.277 & 2.783 & 2.286 & 1.647\\
7 & 2.006 & 2.924 & 2.078 & 0.782\\
8 & 0.357 & 0.291 & 0.358 & 0.438\\
9 & 1.561 & 1.815 & 1.673 & 1.088\\
10 & 4.824 & 5.253 & 2.959 & 7.040\\
11 & 0.959 & 1.059 & 1.073 & 0.668\\
12 & 2.601 & 3.132 & 2.799 & 1.663\\
13 & 737.110 & 1070.385 & 522.213 & 646.562\\ \hline
\end{tabular}
\end{table}
\subsubsection{Normalised Training Data}
The same process is repeated with the normalised data. To save space, the covariance matrices for the different classes can be found in the Appendix as Figure \ref{fig:covl2all}, but Figure \ref{fig:covl2one} shows the colour plot for the covariance matrix of all the normalised training data. Along with that, Table \ref{tbl:l2mean} shows the mean values.
\begin{figure}[!ht]
\centering
\includegraphics[width=\linewidth]{pic/covl2.png}
\caption{Colour map of covariance of normalised training data.}
\label{fig:covl2one}
\end{figure}
\begin{table}[!ht]
\centering
\caption{Means of Normalised Data}
\label{tbl:l2mean}
\begin{tabular}{|r|llll|}
\hline
\textbf{Dimension} & \textbf{All Training} & \textbf{Class 1} & \textbf{Class 2} & \textbf{Class 3} \\ \hline
1 & 0.073 & 0.078 & 0.069 & 0.074\\
2 & 0.066 & 0.057 & 0.053 & 0.094\\
3 & 0.075 & 0.078 & 0.071 & 0.076\\
4 & 0.074 & 0.066 & 0.076 & 0.080\\
5 & 0.074 & 0.078 & 0.070 & 0.075\\
6 & 0.072 & 0.088 & 0.072 & 0.052\\
7 & 0.067 & 0.097 & 0.069 & 0.026\\
8 & 0.070 & 0.057 & 0.070 & 0.086\\
9 & 0.069 & 0.080 & 0.074 & 0.048\\
10 & 0.065 & 0.071 & 0.040 & 0.095\\
11 & 0.073 & 0.081 & 0.082 & 0.051\\
12 & 0.072 & 0.087 & 0.078 & 0.046\\
13 & 0.068 & 0.099 & 0.048 & 0.060\\ \hline
\end{tabular}
\end{table}
The means of the normalised data all lie in the same range, so the covariances do as well, and the matrices can be displayed on a common scale in the colour map. Since a covariance of zero is a necessary, but not sufficient, condition for independence, the dimension pairs with the lowest covariances are the ones most likely to be independent.
\subsubsection{Correlation Matrices and Analysis}
Another, clearer way to view the covariance matrix is to convert it to a correlation matrix using the built-in function \texttt{corrcov()}. A correlation matrix has ones on its main diagonal, and its entries are dimensionless values in the range $[-1, 1]$, as each covariance is scaled by the standard deviations of the two dimensions involved. A value of $1$ indicates a positive linear relationship, $-1$ a negative linear relationship, and $0$ no linear relationship at all.
Because the correlation matrix is a scaled version of the covariance matrix, the normalised and unmodified data yield identical correlation matrices. Once again, they can be found in the Appendix as Figure \ref{fig:corrall}. The limitation of this measurement is that it only ``detects'' linear relationships; dimension pairs related through some other function will not be reflected.
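For reference, the conversion and its visualisation can be sketched as follows, assuming a covariance matrix \texttt{S} computed as above:
\begin{lstlisting}[style=Matlab-editor]
% Convert a covariance matrix to a correlation matrix and plot it.
R = corrcov(S);              % ones on the diagonal, entries in [-1, 1]
figure;
imagesc(R, [-1 1]);          % fix the colour scale to the full range
colorbar;
\end{lstlisting}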
While the images are not included in the main body of the report, the takeaway is that the covariance matrices, correlation matrices, and mean values by dimension all vary according to whether all the training data is used or it is separated by class. These statistical measurements characterise and describe groups of data, and can therefore be used for classification.
\subsection{Nearest Neighbour Classification}
The Nearest Neighbour (NN) method assigns each test point the class of the closest training point. Different measurements were used to represent this concept of ``closeness''. The first is the L1, or Manhattan, distance, expressed in Equation \ref{eq:l1} for two $p$-dimensional vectors $x$ and $y$ \cite{metrics}.
\begin{align}
L_1 \left( x, y \right) &= \sum_{i = 1}^{p} \lvert x_i - y_i \rvert \label{eq:l1}
\end{align}
The second is the L2, or Euclidean, distance in Equation \ref{eq:l22} (the square root is omitted, as it does not change which neighbour is nearest). Equation \ref{eq:chi} shows the Chi-Squared formula, Equation \ref{eq:intersection} the histogram intersection formula, and Equation \ref{eq:corr} the correlation formula \cite{metrics}. The functions written to implement these metrics are in the Appendix.
\begin{align}
L_2 \left( x, y \right) &= \sum_{i = 1}^{p} \left( x_i - y_i \right) ^2 \label{eq:l22}\\
\mathcal{X}^2 \left( x, y \right) &= \frac{1}{2} \sum_{i = 1}^{p} \frac{\left( x_i - y_i \right) ^2 }{\left( x_i + y_i \right)} \label{eq:chi}\\
\cap \left( x, y \right) &= \sum_{i = 1}^{p} \min \left( x_i, y_i \right) \label{eq:intersection}\\
CrossCorr &= \sum_{i = 1}^{p} x_i y_i \label{eq:corr}
\end{align}
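The full NN functions for each metric are listed in the Appendix; as an example, a minimal sketch of the Chi-Squared distance of Equation \ref{eq:chi} is shown below (the small constant guarding against division by zero is an addition for illustration only).
\begin{lstlisting}[style=Matlab-editor]
% Chi-squared distance between two p-dimensional row vectors x and y.
% A small epsilon guards against division by zero when x_i + y_i = 0.
function d = chi_squared_sketch(x, y)
    epsilon = 1e-12;
    d = 0.5 * sum(((x - y) .^ 2) ./ (x + y + epsilon));
end
\end{lstlisting}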
The fifth method, the Mahalanobis distance, is defined for vectors $\mathbf{x} = [x_1, x_2, \ldots, x_p]$ and $\mathbf{y} = [y_1, y_2, \ldots, y_p]$ as shown in Equation \ref{eq:maha}. Originally, $\mathbf{x}$ and $\mathbf{y}$ were drawn from the same set of data and $\Sigma$ was the covariance matrix of that set. However, the metric can be generalised to use any positive semi-definite matrix $A$ (needed to ensure that the distance is never negative), as shown in Equation \ref{eq:maha2} \cite{metrics}.
\begin{align}
Maha \left( \mathbf{x}, \mathbf{y} \right) &= \left( \mathbf{x} - \mathbf{y} \right) ^T \Sigma^{-1} \left( \mathbf{x} - \mathbf{y} \right)\label{eq:maha}\\
\Sigma &= \text{Covariance Matrix}\notag\\
Maha \left( \mathbf{x}, \mathbf{y} \right)&= \left( \mathbf{x} - \mathbf{y} \right) ^T A \left( \mathbf{x} - \mathbf{y}\right)\label{eq:maha2}\\
&= \left( \mathbf{x} - \mathbf{y} \right) ^T G^T G \left( \mathbf{x} - \mathbf{y}\right)\notag\\
&= \left( G\mathbf{x} - G\mathbf{y} \right) ^T \left( G\mathbf{x} - G\mathbf{y}\right)\notag\\
&= L_2\left( G\mathbf{x}, G\mathbf{y} \right) \label{eq:maha3}
\end{align}
As Equation \ref{eq:maha3} shows, if $A$ is decomposed as $G^TG$, the metric becomes an L2 distance computed on transformed data. This is the approach taken for the NN classification. Four covariance matrices were obtained from the normalised training data: one from all of it, and one from each of the three classes \cite{metrics}.
The inverses were taken (MATLAB's \texttt{inv()}) and decomposed using the Cholesky decomposition (MATLAB's \texttt{chol()}).
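A sketch of this transformation, assuming a covariance matrix \texttt{S} of the normalised training data and instances stored as rows (variable names illustrative):
\begin{lstlisting}[style=Matlab-editor]
% Turn the Mahalanobis metric into an L2 metric on transformed data.
A = inv(S);                          % inverse covariance matrix
G = chol(A);                         % A = G' * G (Cholesky decomposition)
% Transform every instance, then apply the plain L2 NN classifier as before.
transformed_train = (G * training_data')';
transformed_test  = (G * testing_data')';
\end{lstlisting}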
The five metrics were used for NN classification with both the normalised and the unmodified data. Table \ref{tbl:nn} shows the accuracy. Table \ref{tbl:nnmaha} shows the accuracy of the Mahalanobis distance with different covariance matrices, again for both normalised and unmodified data. A full set of the confusion matrices is in the Appendix as Figures \ref{fig:nnconfuseunmod} and \ref{fig:nnconfusemod}.
\begin{table}[!ht]
\centering
\caption{Accuracy of Distance Metrics in NN Classification}
\label{tbl:nn}
\begin{tabular}{|r|lllll|}
\hline
\textbf{Type of Data} & \textbf{L1} & \textbf{L2} & \textbf{Chi-Squared} & \textbf{Intersection} & \textbf{Correlation}\\ \hline
Unmodified & 85.00 & 80.00 & 90.00 & 90.00 & 87.50\\
Normalised & 90.00 & 87.50 & 95.00 & 87.50 & 95.00\\ \hline
\end{tabular}
\end{table}
The first observation is that, overall, the normalised data yields a higher accuracy, although the improvement is slight. The lowest accuracy is 80\%, which is quite impressive for such a simple classification system. The metric with the highest accuracy on both sets of data is the Chi-Squared metric, while the lowest is the L2 metric.
\begin{table}[!ht]
\centering
\caption{Accuracy of Mahalanobis Distance in NN Classification}
\label{tbl:nnmaha}
\begin{tabular}{|r|llll|}
\hline
\textbf{Covariance Used} & L2 Training & L2 Class 1 & L2 Class 2 & L2 Class 3\\ \hline
\textbf{Unmodified} & 82.50 & 77.50 & 80.00 & 82.50\\
\textbf{Normalised} & 80.00 & 90.00 & 77.50 & 97.50\\ \hline
\end{tabular}
\end{table}
However, with the Mahalanobis distance, the improvement from normalising the data is no longer so apparent, appearing only in certain cases. This is because applying the transformation already results in some scaling of the data \cite{metrics}.
Transforming the data with the covariance matrix of Class 3 yielded the highest classification accuracy when applied to normalised data. The colour map in the Appendix shows how small its covariance values are after normalisation. This means the resulting $G$ transformation was such that, on average, the classes moved further apart from one another.
%-------------------------------------------------------------------------------
% K-means clustering
%-------------------------------------------------------------------------------
\section{K-means clustering}
As explained in the Introduction, K-means is a way of classifying data. MATLAB's built-in \texttt{kmeans()} function takes in data along with a value of $k$ and returns the centroids found by K-means clustering. This was applied to the training data, before the testing data was classified using those centroids.
As with the different distance metrics in the previous section, there are different metrics that MATLAB uses for determining the centroid positions. They are \texttt{sqeuclidean} (L2), \texttt{cityblock} (L1), \texttt{cosine}, and \texttt{correlation} \cite{kmeans}. There are also corresponding \texttt{pdist()} variations, which return a distance value depending on the selected metric \cite{pdist}.
These two functions were combined in the function found in the Appendix. The K-means clustering is performed for all four metrics, and the testing data is then classified using the same metric to find the closest centroid. It would not make sense to test with a metric other than the one used for clustering, as the notion of ``closest'' would then differ between clustering and classification.
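The procedure can be sketched as follows (variable names and the cluster-to-class labelling are illustrative, and \texttt{pdist2()} is used here for the test-to-centroid distances; the full function in the Appendix may differ in detail):
\begin{lstlisting}[style=Matlab-editor]
% Cluster the training data, then classify test points by nearest centroid
% using the same metric.
metric = 'cityblock';
[idx, C] = kmeans(training_data, k, 'Distance', metric);
% Label each cluster with the most common class among its training points.
cluster_class = zeros(k, 1);
for j = 1:k
    cluster_class(j) = mode(training_labels(idx == j));
end
% Assign each test point the class of its closest centroid.
dists = pdist2(testing_data, C, metric);
[~, nearest] = min(dists, [], 2);
predicted = cluster_class(nearest);
\end{lstlisting}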
Different ``versions'' of the data can be used with this K-means clustering. The types are listed in Table \ref{tbl:types}, while Table \ref{tbl:k3} shows the classification accuracy for $k=3$.
% a is normal data
% b is l2norm data
% c is normal data mahalanobis transformed by covariance of l2 data
% d is l2norm data mahalanobis transformed by covariance of l2 data
% e is normal data mahalanobis transformed by covariance of class 1 l2 data
% f is l2norm data mahalanobis transformed by covariance of class 1 l2 data
% g is normal data mahalanobis transformed by covariance of class 2 l2 data
% h is l2norm data mahalanobis transformed by covariance of class 2 l2 data
% i is normal data mahalanobis transformed by covariance of class 3 l2 data
% j is l2norm data mahalanobis transformed by covariance of class 3 l2 data
\begin{table}[!ht]
\centering
\caption{Data types used in K-means clustering for classification.}
\label{tbl:types}
\begin{tabular}{|r|l|}
\hline
\textbf{Data} & \textbf{Description} \\ \hline
(a) & Unmodified data\\
(b) & Normalised data\\
(c) & Unmodified data, transformed by covariance of normalised data\\
(d) & Normalised data, transformed by covariance of normalised data\\
(e) & Unmodified data, transformed by covariance of normalised Class 1 data\\
(f) & Normalised data, transformed by covariance of normalised Class 1 data\\
(g) & Unmodified data, transformed by covariance of normalised Class 2 data\\
(h) & Normalised data, transformed by covariance of normalised Class 2 data\\
(i) & Unmodified data, transformed by covariance of normalised Class 3 data\\
(j) & Normalised data, transformed by covariance of normalised Class 3 data\\\hline
\end{tabular}
\end{table}
\begin{table}[!ht]
\centering
\caption{Accuracy of Distance Metrics in K-Means Clustering for $k = 3$}
\label{tbl:k3}
\begin{tabular}{|r|lllll|}
\hline
\textbf{Data} & \textbf{Euclidean} & \textbf{Cityblock} & \textbf{Cosine} & \textbf{Correlation} & \textbf{Average} \\ \hline
(a) & 75.00 & 75.00 & 75.00 & 72.50 & 74.38\\
(b) & 87.50 & 85.00 & 85.00 & 85.00 & 85.62\\
(c) & 75.00 & 75.00 & 72.50 & 72.50 & 73.75\\
(d) & 52.50 & 85.00 & 72.50 & 65.00 & 68.75\\
(e) & 72.50 & 75.00 & 72.50 & 72.50 & 73.12\\
(f) & 80.00 & 87.50 & 80.00 & 77.50 & 81.25\\
(g) & 75.00 & 75.00 & 72.50 & 62.50 & 71.25\\
(h) & 87.50 & 62.50 & 92.50 & 97.50 & 85.00\\
(i) & 75.00 & 75.00 & 72.50 & 62.50 & 71.25\\
(j) & 92.50 & 90.00 & 92.50 & 92.50 & 91.88\\
\textbf{Average} & 77.25 & 78.50 & 78.75 & 76.00 & NIL \\ \hline
\end{tabular}
\end{table}
Overall for $k=3$, the normalised data yielded better results than the unmodified data, with the only outlier being type (d). Judging by the average classification accuracy per metric, the best metric is a close tie between \texttt{cityblock} and \texttt{cosine}. Additionally, transforming the data through a covariance matrix only helps if the covariance comes from a specific class, as all of those data types outperformed (c) and (d). However, none of them outperformed the untransformed data, with the exception of those transformed by the Class 3 covariance. This matches the results in Table \ref{tbl:nnmaha}; the smaller values of the scaled Class 3 covariance matrix are again the likely cause, as explained earlier.
\begin{table}[!ht]
\centering
\caption{Accuracy of Distance Metrics in K-Means Clustering for $k = 10$}
\label{tbl:k10}
\begin{tabular}{|r|lllll|}
\hline
\textbf{Data} & \textbf{Euclidean} & \textbf{Cityblock} & \textbf{Cosine} & \textbf{Correlation} & \textbf{Average}\\ \hline
(a) & 72.50 & 75.00 & 72.50 & 70.00 & 72.50\\
(b) & 90.00 & 90.00 & 90.00 & 77.50 & 86.88\\
(c) & 67.50 & 75.00 & 72.50 & 70.00 & 71.25\\
(d) & 77.50 & 80.00 & 80.00 & 85.00 & 80.62\\
(e) & 77.50 & 72.50 & 67.50 & 72.50 & 72.50\\
(f) & 92.50 & 87.50 & 87.50 & 92.50 & 90.00\\
(g) & 70.00 & 77.50 & 65.00 & 60.00 & 68.12\\
(h) & 82.50 & 90.00 & 92.50 & 90.00 & 88.75\\
(i) & 75.00 & 72.50 & 70.00 & 70.00 & 71.88\\
(j) & 90.00 & 90.00 & 97.50 & 97.50 & 93.75\\
\textbf{Average} & 79.50 & 81.00 & 79.50 & 78.50 & NIL \\ \hline
\end{tabular}
\end{table}
For $k=10$, the difference is quite significant: there is better accuracy across the data types overall. Having more clusters than classes means that several clusters are assigned to a class. This can help capture the structure of the data, for example if a single class contains data from two different sources with different statistical characteristics.
The metric with the highest average classification accuracy is \texttt{cityblock}. Scaling the data yields an improvement overall. Once again, the clear ``winner'' is the data type (j), which has been transformed using the covariance of the Class 3 training data.
%-------------------------------------------------------------------------------
% Neural Network
%-------------------------------------------------------------------------------
\section{Neural Network}
% Using Matlab Neural Network toolbox create a network, train and test with the wine data. Vary the parameters of the network to minimize the classification error. Report the observations and compare to the results from Q1 and Q2. Generate another random split of train/test/validation data and repeat the experiment.
Neural networks are an often-used method of pattern recognition, owing to their high accuracy and ease of use. They are flexible and can be adapted to most pattern recognition problems; the use of Convolutional Neural Networks (CNNs) in image recognition, for example, is an industry standard.
Neural networks take inspiration from biological neurons, which are interconnected and transmit information through sudden electrochemical spikes. This process can be simulated in software, including with MATLAB's Neural Network Toolbox (NNT), which allows the creation, training, and testing of various types of neural network. Figures \ref{fig:nnt_perf} and \ref{fig:nnt_vis} show the NNT performance evaluation and the visualisation of an example network respectively, for a fitting network with two hidden layers of 10 neurons each.
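A network like the one visualised can be created and trained along the following lines (a sketch; the toolbox expects one instance per column, and the target encoding shown is one possible choice):
\begin{lstlisting}[style=Matlab-editor]
% Fitting network with two hidden layers of 10 neurons each.
% inputs:  13-by-n matrix, one instance per column.
% targets: 3-by-n matrix of one-hot class labels.
net = fitnet([10 10], 'trainlm');           % hidden sizes, training algorithm
[net, tr] = train(net, inputs, targets);
outputs = net(inputs);                      % network predictions
mse_val = perform(net, targets, outputs);   % mean squared error
\end{lstlisting}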
\begin{figure}[!ht]
\centering
\includegraphics[width=\linewidth]{pic/performance}
\caption{NNT Performance Evaluation}
\label{fig:nnt_perf}
\end{figure}
\begin{figure}[!ht]
\centering
\includegraphics[width=\linewidth]{pic/view_twolayer}
\caption{NNT Network Visualisation}
\label{fig:nnt_vis}
\end{figure}
This section uses neural networks to minimise the Mean Squared Error (MSE), taken as the measure of network performance. First, the number of hidden neurons in a single-layer fitting network is varied; results for the standard partition are shown in Table \ref{tbl:unmixed}. The fitting network was used because it is one of the most flexible types offered by the NNT, and most networks reach near-optimal results with only a single hidden layer. The code used to generate these results is shown in the Appendix.
The two training algorithms with the minimum MSE were determined; the results are shown in Figure \ref{fig:unmixed_full}. The effect of overfitting is clearly visible in the large spike for both algorithms after around 100 hidden neurons. The comparison between the two algorithms is seen more clearly in Figure \ref{fig:unmixed_limited}, which limits the x-axis to 100 hidden neurons.
\begin{figure}[!ht]
\centering
\includegraphics[width=\linewidth]{pic/unmixed_best_fullrange.png}
\caption{MSE of best performing algorithms}
\label{fig:unmixed_full}
\end{figure}
\begin{figure}[!ht]
\centering
\includegraphics[width=\linewidth]{pic/unmixed_best_limited.png}
\caption{MSE of best performing algorithms with limited axis}
\label{fig:unmixed_limited}
\end{figure}
The graphs show a large variation in MSE at lower numbers of hidden neurons, but overall there is a downward trend towards the apparent optimum of 25 hidden neurons; below this point the network is underfitting. After the lowest point there is an upward trend, which indicates overfitting. The optimal number of neurons is therefore likely between 10 and 50. The best performing algorithm on this data set is \textit{trainbfg}.
However, although the standard partition provides an even comparison with the other tasks, it is not the optimal way of training a network; a random partition is. Table \ref{tbl:mixed} therefore shows the results of repartitioning with the same ratios but in a random order. The corresponding graph is shown in Figure \ref{fig:mixed_limited}, already limited to 100 neurons because of the same overfitting effect seen with the standard partition.
With randomly partitioned data, the most effective training algorithm is \textit{trainlm} with 15 hidden neurons, but the graph shows that \textit{trainbr} is the more effective algorithm overall, even reducing the effects of overfitting. This contrasts with the standard partition, where other algorithms appear more effective; more surprisingly, the random partition also shows a lower MSE overall. This implies that random partitioning is a more effective way of training a neural network than a fixed or contiguous partition, further backed up by the first results reaching 100\% classification accuracy. The random-partition data also suggests that the optimum number of hidden neurons lies between 10 and 50.
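The two partitioning schemes correspond to different data-division settings on the network; a sketch of the distinction is shown below (the index vectors for the standard partition are illustrative placeholders):
\begin{lstlisting}[style=Matlab-editor]
% Random partition: let the toolbox split the data at random.
net.divideFcn = 'dividerand';
net.divideParam.trainRatio = 118/178;
net.divideParam.valRatio   = 20/178;
net.divideParam.testRatio  = 40/178;

% Standard (fixed) partition: supply explicit index sets instead.
% net.divideFcn = 'divideind';
% net.divideParam.trainInd = train_indices;
% net.divideParam.valInd   = val_indices;
% net.divideParam.testInd  = test_indices;
\end{lstlisting}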
\begin{table}
\centering
\caption{Standard Partition - Hidden Neurons Comparison}
\label{tbl:unmixed}
\begin{tabular}{lllll}
\hline
\textbf{Algorithm} & \textbf{Neurons} & \textbf{Time to Train (s)} & \textbf{Accuracy} & \textbf{MSE} \\ \hline
trainbfg & 15 & 0.15658 & 0.9 & 0.10695 \\ \hline
trainbfg & 20 & 0.23614 & 0.8 & 0.13496 \\ \hline
trainbfg & 25 & 0.82571 & 0.95 & 0.054725 \\ \hline
trainbfg & 30 & 0.76897 & 0.95 & 0.069127 \\ \hline
trainbfg & 35 & 0.70948 & 0.85 & 0.10906 \\ \hline
traincgp & 15 & 0.066146 & 0.825 & 0.16928 \\ \hline
traincgp & 20 & 0.09904 & 0.875 & 0.12706 \\ \hline
traincgp & 25 & 0.10311 & 0.95 & 0.05694 \\ \hline
traincgp & 30 & 0.075015 & 0.85 & 0.19212 \\ \hline
traincgp & 35 & 0.11164 & 0.85 & 0.13576 \\ \hline
\end{tabular}
\end{table}
\begin{figure}[!ht]
\centering
\includegraphics[width=\linewidth]{pic/mixed_best_limited.png}
\caption{MSE of best performing algorithms with limited axis with random partition}
\label{fig:mixed_limited}
\end{figure}
\begin{table}
\centering
\caption{Random Partition - Hidden Neurons Comparison}
\label{tbl:mixed}
\begin{tabular}{lllll}
\hline
\textbf{Algorithm} & \textbf{Neurons} & \textbf{Time to Train (s)} & \textbf{Accuracy} & \textbf{MSE} \\ \hline
trainbr & 15 & 5.8827 & 0.975 & 0.028823 \\ \hline
trainbr & 20 & 8.1333 & 0.975 & 0.020667 \\ \hline
trainbr & 25 & 14.6828 & 1 & 0.015294 \\ \hline
trainbr & 30 & 20.2646 & 0.975 & 0.021972 \\ \hline
trainbr & 35 & 28.5412 & 1 & 0.016915 \\ \hline
trainlm & 15 & 0.0657085 & 0.95 & 0.036084 \\ \hline
trainlm & 20 & 0.0607793 & 1 & 0.015435 \\ \hline
trainlm & 25 & 0.0679456 & 1 & 0.019838 \\ \hline
trainlm & 30 & 0.0846978 & 0.85 & 0.10142 \\ \hline
trainlm & 35 & 0.0921787 & 0.975 & 0.065321 \\ \hline
\end{tabular}
\end{table}
% TODO Decide whether to visualise matlab's performance stuff. Should just be in pic folder, but where does it fit?
The examination of neural networks thus far has shown that the most effective number of neurons for a single-layer fitting network lies approximately in the range 10--50. To further minimise the MSE, multiple types of network were investigated, along with the number of layers per network. The code for this may be found in the Appendix; the results with minimum MSE are shown in Tables \ref{tbl:unmixed_networks} and \ref{tbl:mixed_networks}, for the standard and random partitions respectively.
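The other network types are constructed in the same way as the fitting network; for example (a sketch with illustrative layer sizes):
\begin{lstlisting}[style=Matlab-editor]
% Pattern-recognition network: three hidden layers of 20 neurons,
% trained with scaled conjugate gradient.
net_pattern = patternnet(repmat(20, 1, 3), 'trainscg');

% Plain feed-forward network: three hidden layers of 15 neurons.
net_ff = feedforwardnet(repmat(15, 1, 3), 'traincgp');

% Both are then trained and evaluated exactly as the fitting network above.
\end{lstlisting}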
% Tables of interesting data from network and layers comparison
\begin{table}
\centering
\caption{Standard Partition - Network and Layer Comparison}
\label{tbl:unmixed_networks}
\begin{tabular}{llllll}
\hline
\textbf{Network} & \textbf{Algorithm} & \textbf{Layers} & \textbf{Neurons} & \textbf{$T_{train}$} & \textbf{MSE} \\ \hline
pattern & traincgf & 3 & 20 & 0.086992 & 0.034907 \\ \hline
pattern & trainscg & 3 & 20 & 0.065218 & 0.040891 \\ \hline
pattern & trainlm & 2 & 10 & 0.083229 & 0.042546 \\ \hline
pattern & traincgp & 1 & 40 & 0.053625 & 0.0484 \\ \hline
fit & trainbfg & 2 & 30 & 38.0862 & 0.050361 \\ \hline
\end{tabular}
\end{table}
\begin{table}
\centering
\caption{Random Partition - Network and Layer Comparison}
\label{tbl:mixed_networks}
\begin{tabular}{llllll}
\hline
\textbf{Network} & \textbf{Algorithm} & \textbf{Layers} & \textbf{Neurons} & \textbf{$T_{train}$} & \textbf{MSE} \\ \hline
fit & trainbr & 2 & 10 & 6.7953 & 0.01676 \\ \hline
fit & trainbr & 1 & 10 & 0.9048 & 0.01691 \\ \hline
pattern & trainlm & 3 & 25 & 4.2124 & 0.01707 \\ \hline
pattern & trainscg & 3 & 15 & 0.0668& 0.01846 \\ \hline
feedforward & traincgp & 3 & 15 & 0.1105 & 0.01892 \\ \hline
\end{tabular}
\end{table}
The standard partition shows that a different type of network proved most effective: \textit{patternnet}, the network intended for pattern recognition applications such as this one. However, the MSE with the standard partition is still greater than that attained with randomly partitioned data, for which a fitting network with the \textit{trainbr} algorithm is the most effective method, matching the results of the previous data set very closely. In fact, the previous data set showed a lower MSE using \textit{trainbr} with a single layer of 25 neurons, which was not investigated here because of the large training time required.
Overall, the data in this section suggests that the most effective way to minimise the error of a neural network on this task is to randomly partition the data and use a fitting network with a single layer of 25 neurons and the \textit{trainbr} training algorithm. More layers of 25 neurons might improve accuracy slightly, but at the cost of training and testing time for a minimal decrease in MSE.
% TODO compare MSE and accuracy with other tasks
%-------------------------------------------------------------------------------
% Conclusion
%-------------------------------------------------------------------------------
\section{Conclusion}
Three classification approaches were applied to the wine data set. Nearest neighbour classification performed best with the Chi-Squared metric on normalised data, and transforming the data with the Class 3 covariance raised the Mahalanobis-based accuracy to 97.5\%. K-means clustering benefited from the same normalisation and Class 3 transformation, with $k=10$ outperforming $k=3$. The neural networks achieved the lowest error overall, with a randomly partitioned fitting network trained by \textit{trainbr} reaching 100\% classification accuracy, making it the most effective of the methods examined.
%-------------------------------------------------------------------------------
% References
%-------------------------------------------------------------------------------
\bibliographystyle{unsrt}
\bibliography{pr_refs}
%-------------------------------------------------------------------------------
% Appendix(ces)
%-------------------------------------------------------------------------------
\onecolumn
\section*{Appendix}
\subsection*{All Colour Maps of Covariance Matrices}
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covtraining.png}
\caption{Covariance of all.}
\label{fig:covtraining}
\end{subfigure}
~
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covclass1.png}
\caption{Covariance of Class 1.}
\label{fig:covclass1}
\end{subfigure}
\\
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covclass2.png}
\caption{Covariance of Class 2.}
\label{fig:covclass2}
\end{subfigure}
~
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covclass3.png}
\caption{Covariance of Class 3.}
\label{fig:covclass3}
\end{subfigure}
\caption{Visualised colour maps of covariance matrices from unmodified data.}
\label{fig:covtrainingall}
\end{figure}
\newpage
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covl2.png}
\caption{Covariance of all.}
\label{fig:covl2}
\end{subfigure}
~
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covl2class1.png}
\caption{Covariance of Class 1.}
\label{fig:covl2class1}
\end{subfigure}
\\
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covl2class2.png}
\caption{Covariance of Class 2.}
\label{fig:covl2class2}
\end{subfigure}
~
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/covl2class3.png}
\caption{Covariance of Class 3.}
\label{fig:covl2class3}
\end{subfigure}
\caption{Visualised colour maps of covariance matrices from L2-normalised data.}
\label{fig:covl2all}
\end{figure}
\newpage
\subsection*{All Colour Maps of Correlation Matrices}
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/corrl2.png}
\caption{Correlation of all.}
\label{fig:corrtraining}
\end{subfigure}
~
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/corrclass1.png}
\caption{Correlation of Class 1.}
\label{fig:corrclass1}
\end{subfigure}
\\
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/corrclass2.png}
\caption{Correlation of Class 2.}
\label{fig:corrclass2}
\end{subfigure}
~
\begin{subfigure}{0.45\textwidth}
\includegraphics[width=\textwidth]{pic/corrclass3.png}
\caption{Correlation of Class 3.}
\label{fig:corrclass3}
\end{subfigure}
\caption{Visualised colour maps of correlation matrices. As correlation is a normalised value, the colour maps are identical for both the normalised and the unmodified data. }
\label{fig:corrall}
\end{figure}
\newpage
\subsection*{Confusion Matrices From NN Classification}
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_l1.png}
\caption{Unmodified data with L1 metric. }
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_l2.png}
\caption{Unmodified data with L2 metric.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_chi.png}
\caption{Unmodified data with Chi-Squared metric.}
\end{subfigure}
\\
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_inter.png}
\caption{Unmodified data with Intersection metric.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_corr.png}
\caption{Unmodified data with Correlation metric.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_cov_l2.png}
\caption{Unmodified data with Mahalanobis metric, L2 covariance.}
\end{subfigure}
\\
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_cov_l2_class1.png}
\caption{Unmodified data with Mahalanobis metric, L2 class 1 covariance.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_cov_l2_class2.png}
\caption{Unmodified data with Mahalanobis metric, L2 class 2 covariance.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/unmod_cov_l2_class3.png}
\caption{Unmodified data with Mahalanobis metric, L2 class 3 covariance.}
\end{subfigure}
\caption{Confusion matrices for the different NN classification metrics. The overall classification success and error can be seen in the bottom right square of each matrix. }
\label{fig:nnconfuseunmod}
\end{figure}
\newpage
\begin{figure}[!ht]
\captionsetup[subfigure]{position=b}
\centering
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_l1.png}
\caption{Normalised data with L1 metric. }
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_l2.png}
\caption{Normalised data with L2 metric.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_chi.png}
\caption{Normalised data with Chi-Squared metric.}
\end{subfigure}
\\
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_inter.png}
\caption{Normalised data with Intersection metric.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_corr.png}
\caption{Normalised data with Correlation metric.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_cov_l2.png}
\caption{Normalised data with Mahalanobis metric, L2 covariance.}
\end{subfigure}
\\
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_cov_l2_class1.png}
\caption{Normalised data with Mahalanobis metric, L2 class 1 covariance.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_cov_l2_class2.png}
\caption{Normalised data with Mahalanobis metric, L2 class 2 covariance.}
\end{subfigure}
~
\begin{subfigure}{0.32\textwidth}
\includegraphics[width=\textwidth]{pic/mod_cov_l2_class3.png}
\caption{Normalised data with Mahalanobis metric, L2 class 3 covariance.}
\end{subfigure}
\caption{Confusion matrices for the different NN classification metrics. The overall classification success and error can be seen in the bottom right square of each matrix. }
\label{fig:nnconfusemod}
\end{figure}
\newpage
\subsection*{Nearest Neighbour Function Using L1 Metric}
\lstinputlisting[style=Matlab-editor]{src/nearest_neighbour_l1.m}
\subsection*{Nearest Neighbour Function Using L2 Metric}
\lstinputlisting[style=Matlab-editor]{src/nearest_neighbour_l2.m}
\newpage
\subsection*{Nearest Neighbour Function Using Chi Squared Metric}
\lstinputlisting[style=Matlab-editor]{src/nearest_neighbour_chi.m}
\subsection*{Nearest Neighbour Function Using Histogram Intersection Metric}
\lstinputlisting[style=Matlab-editor]{src/nearest_neighbour_intersection.m}
\newpage
\subsection*{Nearest Neighbour Function Using Correlation Metric}
\lstinputlisting[style=Matlab-editor]{src/nearest_neighbour_corr.m}
\subsection*{Nearest Cluster Function Using \texttt{pdist()}}
\lstinputlisting[style=Matlab-editor]{src/nearest_cluster.m}
\newpage
\subsection*{K-Means Clustering Function Using All Four Metrics}
\lstinputlisting[style=Matlab-editor]{src/kmeanmetrics.m}
\newpage
\subsection*{Test set code for Task 3}
\lstinputlisting[style=Matlab-editor]{src/task3_all.m}
\newpage
\subsection*{Making and testing of neural network}
\lstinputlisting[style=Matlab-editor]{src/make_test_nn.m}
\newpage
\subsection*{Test code for various layers/networks for Task 3}
\lstinputlisting[style=Matlab-editor]{src/task3_networks.m}
\newpage
\end{document}