diff --git a/_config.yml b/_config.yml index b194a40..e4e3a28 100644 --- a/_config.yml +++ b/_config.yml @@ -60,6 +60,11 @@ ghub: edit: true repository: dsa + +analytics: + google: + tracking_id: UA-184894912-1 + future: true # Build settings diff --git a/_dsa/_config.yml b/_dsa/_config.yml deleted file mode 100755 index 529d800..0000000 --- a/_dsa/_config.yml +++ /dev/null @@ -1,27 +0,0 @@ -author: -- given: Neil D. - family: Lawrence - institution: University of Cambridge - gscholar: r3SJcvoAAAAJ - twitter: lawrennd - orcid: 0000-0001-9258-1030 - url: http://inverseprobability.com -layout: lecture -venue: Virtual (Zoom) -ipynb: True -talkcss: https://inverseprobability.com/assets/css/talks.css -postsdir: ../../../mlatcl/dsa/_lectures/ -slidesdir: ../../../mlatcl/dsa/slides/ -notesdir: ../../../mlatcl/dsa/_notes/ -notebooksdir: ../../../mlatcl/dsa/_notebooks/ -writediagramsdir: . -diagramsdir: ./slides/diagrams/ -baseurl: "dsa/" # the subpath of your site, e.g. /blog/ -url: "https://mlatcl.github.io/" # the base hostname & protocol for your site -transition: None -ghub: -- organization: lawrennd - repository: talks - branch: gh-pages - directory: _dsa - diff --git a/_lamd/_lamd.yml b/_lamd/_lamd.yml new file mode 100755 index 0000000..bda6e6f --- /dev/null +++ b/_lamd/_lamd.yml @@ -0,0 +1,43 @@ +author: +- family: Lawrence + given: Neil D. + gscholar: r3SJcvoAAAAJ + institute: University of Cambridge + twitter: lawrennd + orcid: 0000-0001-9258-1030 + url: http://inverseprobability.com +layout: lecture +room: +venue: Virtual (Zoom) +talkcss: https://inverseprobability.com/assets/css/talks.css +bibdir: $HOME/lawrennd/bibliography +postsdir: ../_lectures/ +slidesdir: ../slides/ +notesdir: ../_notes/ +notebooksdir: ../_notebooks/ +practicalsdir: ../_practicals/ +snippetsdir: $HOME/lawrennd/snippets +writediagramsdir: . +scriptsdir: ./scripts/ +diagramsdir: ./slides/diagrams/ +posts: True +docx: False +notespdf: False +slidesipynb: False +pptx: True +pdf: False +assignment: False +ipynb: True +reveal: True +transition: None +potx: custom-reference.potx +dotx: custom-reference.dotx +baseurl: "dsa/" # the subpath of your site, e.g. 
/blog/ +url: "https://mlatcl.github.io/" # the base hostname & protocol for your site +transition: None +ghub: +- organization: mlatcl + repository: dsa + branch: gh-pages + directory: _lamd + diff --git a/_dsa/bayesian-methods-abuja.md b/_lamd/bayesian-methods-abuja.md similarity index 95% rename from _dsa/bayesian-methods-abuja.md rename to _lamd/bayesian-methods-abuja.md index 1e4cbe3..6ccba1f 100755 --- a/_dsa/bayesian-methods-abuja.md +++ b/_lamd/bayesian-methods-abuja.md @@ -28,10 +28,8 @@ venue: DSA, Abuja transition: None --- -\include{talk-macros.tex} - \include{_ml/includes/what-is-ml.md} -\include{_ml/includes/nigeria-nmis-data.md} +\include{_datasets/includes/nigeria-nmis-data.md} \include{_ml/includes/probability-intro.md} \include{_ml/includes/probabilistic-modelling.md} diff --git a/_lamd/compile.sh b/_lamd/compile.sh new file mode 100755 index 0000000..bc67bab --- /dev/null +++ b/_lamd/compile.sh @@ -0,0 +1,11 @@ +#/bin/bash + +FILES="" +SKIP=true +while read stub; do + if $SKIP; then + SKIP=false + else + maketalk $stub + fi +done < lectures.csv diff --git a/_dsa/gaussian-processes.md b/_lamd/gaussian-processes.md similarity index 98% rename from _dsa/gaussian-processes.md rename to _lamd/gaussian-processes.md index 76c7338..ac9a8f7 100755 --- a/_dsa/gaussian-processes.md +++ b/_lamd/gaussian-processes.md @@ -11,7 +11,6 @@ time: "15:00 (West Africa Standard Time)" transition: None --- -\include{talk-macros.tex} \include{_mlai/includes/mlai-notebook-setup.md} \include{_gp/includes/gp-book.md} @@ -78,7 +77,7 @@ $$ \include{_gp/includes/gp-optimize.md} \include{_kern/includes/eq-covariance.md} \include{_gp/includes/gp-summer-school.md} -\include{_gp/includes/gpy-software.md} +\include{_software/includes/gpy-software.md} \include{_gp/includes/gpy-tutorial.md} \subsection{Review} diff --git a/_lamd/lectures.csv b/_lamd/lectures.csv new file mode 100644 index 0000000..6c6fa3c --- /dev/null +++ b/_lamd/lectures.csv @@ -0,0 +1,6 @@ +lectureId +what-is-machine-learning +probabilistic-machine-learning +ml-systems +bayesian-methods-abuja +gaussian-processes diff --git a/_dsa/ml-systems-kimberley.md b/_lamd/ml-systems-kimberley.md similarity index 100% rename from _dsa/ml-systems-kimberley.md rename to _lamd/ml-systems-kimberley.md diff --git a/_dsa/ml-systems.md b/_lamd/ml-systems.md similarity index 98% rename from _dsa/ml-systems.md rename to _lamd/ml-systems.md index 83d09cd..3c443f8 100755 --- a/_dsa/ml-systems.md +++ b/_lamd/ml-systems.md @@ -20,8 +20,6 @@ venue: Virtual DSA transition: None --- -\include{talk-macros.tex} - \slides{\section{AI via ML Systems} \include{_ai/includes/supply-chain-system.md} diff --git a/_dsa/probabilistic-machine-learning.md b/_lamd/probabilistic-machine-learning.md similarity index 98% rename from _dsa/probabilistic-machine-learning.md rename to _lamd/probabilistic-machine-learning.md index 8c80d10..50b51d7 100755 --- a/_dsa/probabilistic-machine-learning.md +++ b/_lamd/probabilistic-machine-learning.md @@ -30,8 +30,6 @@ https://www.kaggle.com/alaowerre/nigeria-nmis-health-facility-data %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -\include{talk-macros.tex} - \include{_ml/includes/what-is-ml.md} \include{_ml/includes/probability-intro.md} \include{_ml/includes/probabilistic-modelling.md} diff --git a/_dsa/what-is-machine-learning-ashesi.md b/_lamd/what-is-machine-learning-ashesi.md similarity index 100% rename from _dsa/what-is-machine-learning-ashesi.md rename to _lamd/what-is-machine-learning-ashesi.md diff --git 
a/_dsa/what-is-machine-learning.md b/_lamd/what-is-machine-learning.md similarity index 97% rename from _dsa/what-is-machine-learning.md rename to _lamd/what-is-machine-learning.md index 9d5eb49..e3a5175 100755 --- a/_dsa/what-is-machine-learning.md +++ b/_lamd/what-is-machine-learning.md @@ -20,8 +20,6 @@ papersize: a4paper transition: None --- -\include{../talk-macros.gpp} - \section{Introduction} \include{_data-science/includes/data-science-africa.md} @@ -54,7 +52,7 @@ $$ \notes{Stephen Kiprotich, the 2012 gold medal winner from the London Olympics, comes from Kapchorwa district, in eastern Uganda, near the border with Kenya.} \include{_ml/includes/olympic-marathon-polynomial.md} -\include{../_ml/includes/what-does-machine-learning-do.md} +\include{_ml/includes/what-does-machine-learning-do.md} \include{_ml/includes/what-is-ml-2.md} \include{_ai/includes/ai-vs-data-science-2.md} diff --git a/_lectures/01-what-is-machine-learning.html b/_lectures/01-what-is-machine-learning.html index 912fd2d..dee2826 100644 --- a/_lectures/01-what-is-machine-learning.html +++ b/_lectures/01-what-is-machine-learning.html @@ -1,7 +1,12 @@ --- title: "What is Machine Learning?" venue: "Data Science Africa Summer School, Addis Ababa, Ethiopia" -abstract: "

In this talk we will introduce the fundamental ideas in machine learning. We’ll develop our exposition around the ideas of prediction function and the objective function. We don’t so much focus on the derivation of particular algorithms, but more the general principles involved to give an idea of the machine learning landscape.

" +abstract: "

In this talk we will introduce the fundamental ideas in +machine learning. We’ll develop our exposition around the ideas of +prediction function and the objective function. We don’t so much focus +on the derivation of particular algorithms, but more the general +principles involved to give an idea of the machine learning +landscape.

" author: - given: Neil D. family: Lawrence @@ -9,13 +14,15 @@ institute: Amazon Cambridge and University of Sheffield twitter: lawrennd gscholar: r3SJcvoAAAAJ - orchid: null + orcid: +edit_url: https://github.com/mlatcl/dsa/edit/gh-pages/_lamd/what-is-machine-learning.md date: 2019-06-03 published: 2019-06-03 -week: 0 session: 1 reveal: 01-what-is-machine-learning.slides.html +transition: None ipynb: 01-what-is-machine-learning.ipynb +pptx: 01-what-is-machine-learning.pptx layout: lecture categories: - notes @@ -33,41 +40,93 @@ -->

Introduction

Data Science Africa

-

[edit]

+
+[edit] +
- +
-
+
-

Figure: Data Science Africa http://datascienceafrica.org is a ground up initiative for capacity building around data science, machine learning and artificial intelligence on the African continent.

+

Figure: Data Science Africa http://datascienceafrica.org is a ground up initiative +for capacity building around data science, machine learning and +artificial intelligence on the African continent.

-

Data Science Africa is a bottom up initiative for capacity building in data science, machine learning and artificial intelligence on the African continent.

-

As of 2019 there have been five workshops and five schools, located in Nyeri, Kenya (twice); Kampala, Uganda; Arusha, Tanzania; Abuja, Nigeria; Addis Ababa, Ethiopia and Accra, Ghana. The next event is scheduled for June 2020 in Kampala, Uganda.

-

The main notion is end-to-end data science. For example, going from data collection in the farmer’s field to decision making in the Ministry of Agriculture. Or going from malaria disease counts in health centers to medicine distribution.

-

The philosophy is laid out in (Lawrence 2015). The key idea is that the modern information infrastructure presents new solutions to old problems. Modes of development change because less capital investment is required to take advantage of this infrastructure. The philosophy is that local capacity building is the right way to leverage these challenges in addressing data science problems in the African context.

-

Data Science Africa is now a non-govermental organization registered in Kenya. The organising board of the meeting is entirely made up of scientists and academics based on the African continent.

+
+
+ + +
+
+ +
+
+

Figure: Data Science Africa meetings held up to October 2021.

+
+
+

Data Science Africa is a bottom up initiative for capacity building +in data science, machine learning and artificial intelligence on the +African continent.

+

As of May 2023 there have been eleven workshops and schools, located +in seven different countries: Nyeri, Kenya (twice); Kampala, Uganda; +Arusha, Tanzania; Abuja, Nigeria; Addis Ababa, Ethiopia; Accra, Ghana; +Kampala, Uganda and Kimberley, South Africa (virtual), and in Kigali, +Rwanda.

+

The main notion is end-to-end data science. For example, +going from data collection in the farmer’s field to decision making in +the Ministry of Agriculture. Or going from malaria disease counts in +health centers to medicine distribution.

+

The philosophy is laid out in (Lawrence, 2015). The key idea is +that the modern information infrastructure presents new +solutions to old problems. Modes of development change because less +capital investment is required to take advantage of this infrastructure. +The philosophy is that local capacity building is the right way to +leverage these challenges in addressing data science problems in the +African context.

+

Data Science Africa is now a non-governmental organization registered +in Kenya. The organising board of the meeting is entirely made up of +scientists and academics based on the African continent.

- +
-
+
-

Figure: The lack of existing physical infrastructure on the African continent makes it a particularly interesting environment for deploying solutions based on the information infrastructure. The idea is explored more in this Guardian op-ed on Guardian article on How African can benefit from the data revolution.

-
+

Figure: The lack of existing physical infrastructure on the African +continent makes it a particularly interesting environment for deploying +solutions based on the information infrastructure. The idea is +explored more in this Guardian op-ed on Guardian article on How +African can benefit from the data revolution.

+
+
+

Guardian article on Data +Science Africa

+

Example: +Prediction of Malaria Incidence in Uganda

+
+[edit]
-

Guardian article on Data Science Africa

-

Example: Prediction of Malaria Incidence in Uganda

-

[edit]

@@ -80,7 +139,7 @@

Example: Prediction o Martin Mubangizi - + @@ -91,9 +150,9 @@

Example: Prediction o -Ricardo Andrade Pacheco +Ricardo Andrade Pacecho - + @@ -106,61 +165,113 @@

Example: Prediction o John Quinn - +

-

As an example of using Gaussian process models within the full pipeline from data to decsion, we’ll consider the prediction of Malaria incidence in Uganda. For the purposes of this study malaria reports come in two forms, HMIS reports from health centres and Sentinel data, which is curated by the WHO. There are limited sentinel sites and many HMIS sites.

-

The work is from Ricardo Andrade Pacheco’s PhD thesis, completed in collaboration with John Quinn and Martin Mubangizi (Andrade-Pacheco et al. 2014; Mubangizi et al. 2014). John and Martin were initally from the AI-DEV group from the University of Makerere in Kampala and more latterly they were based at UN Global Pulse in Kampala.

-

Malaria data is spatial data. Uganda is split into districts, and health reports can be found for each district. This suggests that models such as conditional random fields could be used for spatial modelling, but there are two complexities with this. First of all, occasionally districts split into two. Secondly, sentinel sites are a specific location within a district, such as Nagongera which is a sentinel site based in the Tororo district.

+

As an example of using Gaussian process models within the full +pipeline from data to decision, we’ll consider the prediction of Malaria +incidence in Uganda. For the purposes of this study, malaria reports come +in two forms, HMIS reports from health centres and Sentinel data, which +is curated by the WHO. There are limited sentinel sites and many HMIS +sites.

+

The work is from Ricardo Andrade Pacheco’s PhD thesis, completed in +collaboration with John Quinn and Martin Mubangizi (Andrade-Pacheco +et al., 2014; Mubangizi et al., 2014). John and Martin were +initially from the AI-DEV group at the University of Makerere in +Kampala; they were later based at UN Global Pulse in Kampala. +You can see the work summarized on the UN Global Pulse disease +outbreaks project site here.

+ +

Malaria data is spatial data. Uganda is split into districts, and +health reports can be found for each district. This suggests that models +such as conditional random fields could be used for spatial modelling, +but there are two complexities with this. First of all, occasionally +districts split into two. Secondly, sentinel sites are a specific +location within a district, such as Nagongera which is a sentinel site +based in the Tororo district.

- +
-
+
-

Figure: Ugandan districs. Data SRTM/NASA from https://dds.cr.usgs.gov/srtm/version2_1.

+

Figure: Ugandan districts. Data SRTM/NASA from https://dds.cr.usgs.gov/srtm/version2_1.

-

(Andrade-Pacheco et al. 2014; Mubangizi et al. 2014)

-

The common standard for collecting health data on the African continent is from the Health management information systems (HMIS). However, this data suffers from missing values (Gething et al. 2006) and diagnosis of diseases like typhoid and malaria may be confounded.

+
+(Andrade-Pacheco +et al., 2014; Mubangizi et al., 2014) +
+

The common standard for collecting health data on the African +continent is from the Health management information systems (HMIS). +However, this data suffers from missing values (Gething et al., 2006) and diagnosis +of diseases like typhoid and malaria may be confounded.

- + -
+
-

Figure: The Tororo district, where the sentinel site, Nagongera, is located.

+

Figure: The Tororo district, where the sentinel site, Nagongera, is +located.

-

World Health Organization Sentinel Surveillance systems are set up “when high-quality data are needed about a particular disease that cannot be obtained through a passive system”. Several sentinel sites give accurate assessment of malaria disease levels in Uganda, including a site in Nagongera.

+

World +Health Organization Sentinel Surveillance systems are set up “when +high-quality data are needed about a particular disease that cannot be +obtained through a passive system”. Several sentinel sites give accurate +assessment of malaria disease levels in Uganda, including a site in +Nagongera.

- +
-
+
-

Figure: Sentinel and HMIS data along with rainfall and temperature for the Nagongera sentinel station in the Tororo district.

-
-
-

In collaboration with the AI Research Group at Makerere we chose to investigate whether Gaussian process models could be used to assimilate information from these two different sources of disease informaton. Further, we were interested in whether local information on rainfall and temperature could be used to improve malaria estimates.

-

The aim of the project was to use WHO Sentinel sites, alongside rainfall and temperature, to improve predictions from HMIS data of levels of malaria.

+

Figure: Sentinel and HMIS data along with rainfall and temperature +for the Nagongera sentinel station in the Tororo district.

+
+ +

In collaboration with the AI Research Group at Makerere we chose to +investigate whether Gaussian process models could be used to assimilate +information from these two different sources of disease information. +Further, we were interested in whether local information on rainfall and +temperature could be used to improve malaria estimates.

+

The aim of the project was to use WHO Sentinel sites, alongside +rainfall and temperature, to improve predictions from HMIS data of +levels of malaria.

- + -
+
@@ -170,10 +281,11 @@

Example: Prediction o
- +
-
+
@@ -182,24 +294,27 @@

Example: Prediction o

-
- +
+
-

Figure: The project arose out of the Gaussian process summer school held at Makerere in Kampala in 2013. The school led, in turn, to the Data Science Africa initiative.

+

Figure: The project arose out of the Gaussian process summer school +held at Makerere in Kampala in 2013. The school led, in turn, to the +Data Science Africa initiative.

Early Warning Systems

- + -
+
@@ -209,347 +324,940 @@

Early Warning Systems

- +
-
+
-

Figure: Estimate of the current disease situation in the Kabarole district over time. Estimate is constructed with a Gaussian process with an additive covariance funciton.

-
-
-

Health monitoring system for the Kabarole district. Here we have fitted the reports with a Gaussian process with an additive covariance function. It has two components, one is a long time scale component (in red above) the other is a short time scale component (in blue).

-

Monitoring proceeds by considering two aspects of the curve. Is the blue line (the short term report signal) above the red (which represents the long term trend? If so we have higher than expected reports. If this is the case and the gradient is still positive (i.e. reports are going up) we encode this with a red color. If it is the case and the gradient of the blue line is negative (i.e. reports are going down) we encode this with an amber color. Conversely, if the blue line is below the red and decreasing, we color green. On the other hand if it is below red but increasing, we color yellow.

-

This gives us an early warning system for disease. Red is a bad situation getting worse, amber is bad, but improving. Green is good and getting better and yellow good but degrading.

-

Finally, there is a gray region which represents when the scale of the effect is small.

+

Figure: Estimate of the current disease situation in the Kabarole +district over time. Estimate is constructed with a Gaussian process with +an additive covariance function.

+
+
+

Health monitoring system for the Kabarole district. Here we have +fitted the reports with a Gaussian process with an additive covariance +function. It has two components, one is a long time scale component (in +red above) the other is a short time scale component (in blue).

+

Monitoring proceeds by considering two aspects of the curve. Is the +blue line (the short term report signal) above the red (which represents +the long term trend)? If so, we have higher than expected reports. If this +is the case and the gradient is still positive (i.e. reports +are going up) we encode this with a red color. If it is the +case and the gradient of the blue line is negative (i.e. reports are +going down) we encode this with an amber color. Conversely, if +the blue line is below the red and decreasing, we color +green. On the other hand if it is below red but increasing, we +color yellow.

+

This gives us an early warning system for disease. Red is a bad +situation getting worse, amber is bad, but improving. Green is good and +getting better and yellow good but degrading.

+

Finally, there is a gray region which represents when the scale of +the effect is small.
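This colour-coding rule is simple to write down directly. The sketch below is a minimal illustration of the logic just described, not the code used in the project; it assumes we already have the posterior means of the short and long time-scale GP components on a common grid of times.

import numpy as np

def warning_color(short_term, long_term, min_scale=1.0):
    """Assign an alert color per time point from the two GP components."""
    short_term = np.asarray(short_term, dtype=float).ravel()
    long_term = np.asarray(long_term, dtype=float).ravel()
    gradient = np.gradient(short_term)      # is the short time scale signal rising or falling?
    excess = short_term - long_term         # above or below the long term trend?
    colors = np.full(short_term.shape, 'gray', dtype=object)
    active = np.abs(excess) >= min_scale    # only color effects that are large enough
    colors[active & (excess > 0) & (gradient > 0)] = 'red'      # high and rising
    colors[active & (excess > 0) & (gradient <= 0)] = 'amber'   # high but falling
    colors[active & (excess <= 0) & (gradient <= 0)] = 'green'  # low and falling
    colors[active & (excess <= 0) & (gradient > 0)] = 'yellow'  # low but rising
    return colors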

- +
-
+
-

Figure: The map of Ugandan districts with an overview of the Malaria situation in each district.

+

Figure: The map of Ugandan districts with an overview of the Malaria +situation in each district.

-

These colors can now be observed directly on a spatial map of the districts to give an immediate impression of the current status of the disease across the country.

+

These colors can now be observed directly on a spatial map of the +districts to give an immediate impression of the current status of the +disease across the country.

Machine Learning

-

This talk is a general introduction to machine learning, we will highlight the technical challenges and the current solutions. We will give an overview of what is machine learning and why it is important.

+

This talk is a general introduction to machine learning, we will +highlight the technical challenges and the current solutions. We will +give an overview of what is machine learning and why it is +important.

Rise of Machine Learning

-

Machine learning is the combination of data and models, through computation, to make predictions.
$$ -\text{data} + \text{model} \stackrel{\text{compute}}{\rightarrow} \text{prediction} -$$

+

Machine learning is the combination of data and models, through +computation, to make predictions. \[ +\text{data} + \text{model} \stackrel{\text{compute}}{\rightarrow} +\text{prediction} +\]

Data Revolution

-Machine learning has risen in prominence due to the rise in data availability, and its interconnection with computers. The high bandwidth connection between data and computer leads to a new interaction between us and data via the computer. It is that channel that is being mediated by machine learning techniques. +Machine learning has risen in prominence due to the rise in data +availability, and its interconnection with computers. The high bandwidth +connection between data and computer leads to a new interaction between +us and data via the computer. It is that channel that is being mediated +by machine learning techniques.
- + -
+
-

Figure: Large amounts of data and high interconnection bandwidth mean that we receive much of our information about the world around us through computers.

+

Figure: Large amounts of data and high interconnection bandwidth mean +that we receive much of our information about the world around us +through computers.

Supply Chain

-

[edit]

+
+[edit] +
-
- +
+
-
+
-

Figure: Packhorse Bridge under Burbage Edge. This packhorse route climbs steeply out of Hathersage and heads towards Sheffield. Packhorses were the main route for transporting goods across the Peak District. The high cost of transport is one driver of the ‘smith’ model, where there is a local skilled person responsible for assembling or creating goods (e.g. a blacksmith).

-
-
-

On Sunday mornings in Sheffield, I often used to run across Packhorse Bridge in Burbage valley. The bridge is part of an ancient network of trails crossing the Pennines that, before Turnpike roads arrived in the 18th century, was the main way in which goods were moved. Given that the moors around Sheffield were home to sand quarries, tin mines, lead mines and the villages in the Derwent valley were known for nail and pin manufacture, this wasn’t simply movement of agricultural goods, but it was the infrastructure for industrial transport.

-

The profession of leading the horses was known as a Jagger and leading out of the village of Hathersage is Jagger’s Lane, a trail that headed underneath Stanage Edge and into Sheffield.

-

The movement of goods from regions of supply to areas of demand is fundamental to our society. The physical infrastructure of supply chain has evolved a great deal over the last 300 years.

+

Figure: Packhorse Bridge under Burbage Edge. This packhorse route +climbs steeply out of Hathersage and heads towards Sheffield. Packhorses +were the main route for transporting goods across the Peak District. The +high cost of transport is one driver of the ‘smith’ model, where there +is a local skilled person responsible for assembling or creating goods +(e.g. a blacksmith).

+
+
+

On Sunday mornings in Sheffield, I often used to run across Packhorse +Bridge in Burbage valley. The bridge is part of an ancient network of +trails crossing the Pennines that, before Turnpike roads arrived in the +18th century, was the main way in which goods were moved. Given that the +moors around Sheffield were home to sand quarries, tin mines, lead mines +and the villages in the Derwent valley were known for nail and pin +manufacture, this wasn’t simply movement of agricultural goods, but it +was the infrastructure for industrial transport.

+

The profession of leading the horses was known as a Jagger and +leading out of the village of Hathersage is Jagger’s Lane, a trail that +headed underneath Stanage Edge and into Sheffield.

+

The movement of goods from regions of supply to areas of demand is +fundamental to our society. The physical infrastructure of supply chain +has evolved a great deal over the last 300 years.

Cromford

-

[edit]

+
+[edit] +
-
- +
+
-
+
-

Figure: Richard Arkwright is regarded of the founder of the modern factory system. Factories exploit distribution networks to centralize production of goods. Arkwright located his factory in Cromford due to proximity to Nottingham Weavers (his market) and availability of water power from the tributaries of the Derwent river. When he first arrived there was almost no transportation network. Over the following 200 years The Cromford Canal (1790s), a Turnpike (now the A6, 1816-18) and the High Peak Railway (now closed, 1820s) were all constructed to improve transportation access as the factory blossomed.

-
-
-

Richard Arkwright is known as the father of the modern factory system. In 1771 he set up a Mill for spinning cotton yarn in the village of Cromford, in the Derwent Valley. The Derwent valley is relatively inaccessible. Raw cotton arrived in Liverpool from the US and India. It needed to be transported on packhorse across the bridleways of the Pennines. But Cromford was a good location due to proximity to Nottingham, where weavers where consuming the finished thread, and the availability of water power from small tributaries of the Derwent river for Arkwright’s water frames which automated the production of yarn from raw cotton.

-

By 1794 the Cromford Canal was opened to bring coal in to Cromford and give better transport to Nottingham. The construction of the canals was driven by the need to improve the transport infrastructure, facilitating the movement of goods across the UK. Canals, roads and railways were initially constructed by the economic need for moving goods. To improve supply chain.

-

The A6 now does pass through Cromford, but at the time he moved there there was merely a track. The High Peak Railway was opened in 1832, it is now converted to the High Peak Trail, but it remains the highest railway built in Britain.

-

Cooper (1991)

+

Figure: Richard Arkwright is regarded as the founder of the modern +factory system. Factories exploit distribution networks to centralize +production of goods. Arkwright located his factory in Cromford due to +proximity to Nottingham Weavers (his market) and availability of water +power from the tributaries of the Derwent river. When he first arrived +there was almost no transportation network. Over the following 200 years, +the Cromford Canal (1790s), a Turnpike (now the A6, 1816-18) and the +High Peak Railway (now closed, 1820s) were all constructed to improve +transportation access as the factory blossomed.

+
+
+

Richard Arkwright is known as the father of the modern factory +system. In 1771 he set up a Mill for spinning +cotton yarn in the village of Cromford, in the Derwent Valley. The +Derwent valley is relatively inaccessible. Raw cotton arrived in +Liverpool from the US and India. It needed to be transported on +packhorse across the bridleways of the Pennines. But Cromford was a good +location due to proximity to Nottingham, where weavers were consuming +the finished thread, and the availability of water power from small +tributaries of the Derwent river for Arkwright’s water frames +which automated the production of yarn from raw cotton.

+

By 1794 the Cromford Canal +was opened to bring coal in to Cromford and give better transport to +Nottingham. The construction of the canals was driven by the need to +improve the transport infrastructure, facilitating the movement of goods +across the UK. Canals, roads and railways were initially constructed by +the economic need for moving goods. To improve supply chain.

+

The A6 now does pass through Cromford, but at the time he moved there +there was merely a track. The High Peak Railway was opened in 1832, it +is now converted to the High Peak Trail, but it remains the highest +railway built in Britain.

+

Cooper +(1991)

Containerization

-

[edit]

+
+[edit] +
-
- +
+
-
+
-

Figure: The container is one of the major drivers of globalization, and arguably the largest agent of social change in the last 100 years. It reduces the cost of transportation, significantly changing the appropriate topology of distribution networks. The container makes it possible to ship goods halfway around the world for cheaper than it costs to process those goods, leading to an extended distribution topology.

-
-
-

Containerization has had a dramatic effect on global economics, placing many people in the developing world at the end of the supply chain.

+

Figure: The container is one of the major drivers of globalization, +and arguably the largest agent of social change in the last 100 years. +It reduces the cost of transportation, significantly changing the +appropriate topology of distribution networks. The container makes it +possible to ship goods halfway around the world for cheaper than it +costs to process those goods, leading to an extended distribution +topology.

+
+
+

Containerization has had a dramatic effect on global economics, +placing many people in the developing world at the end of the supply +chain.

-
- +
+
-
- +
+
-
+
-

Figure: Wild Alaskan Cod, being solid in the Pacific Northwest, that is a product of China. It is cheaper to ship the deep frozen fish thousands of kilometers for processing than to process locally.

+

Figure: Wild Alaskan Cod, being sold in the Pacific Northwest, that +is a product of China. It is cheaper to ship the deep frozen fish +thousands of kilometers for processing than to process locally.

+
+
+

For example, you can buy Wild Alaskan Cod fished from Alaska, +processed in China, sold in North America. This is driven by the low +cost of transport for frozen cod vs the higher relative cost of cod +processing in the US versus China. Similarly, +Scottish +prawns are also processed in China for sale in the UK.

+
+
+
+ +
+
+
-

For example, you can buy Wild Alaskan Cod fished from Alaska, processed in China, sold in North America. This is driven by the low cost of transport for frozen cod vs the higher relative cost of cod processing in the US versus China. Similarly, Scottish prawns are also processed in China for sale in the UK.

-

This effect on cost of transport vs cost of processing is the main driver of the topology of the modern supply chain and the associated effect of globalization. If transport is much cheaper than processing, then processing will tend to agglomerate in places where processing costs can be minimized.

-

Large scale global economic change has principally been driven by changes in the technology that drives supply chain.

-

Supply chain is a large-scale automated decision making network. Our aim is to make decisions not only based on our models of customer behavior (as observed through data), but also by accounting for the structure of our fulfilment center, and delivery network.

-

Many of the most important questions in supply chain take the form of counterfactuals. E.g. “What would happen if we opened a manufacturing facility in Cambridge?” A counter factual is a question that implies a mechanistic understanding of a system. It goes beyond simple smoothness assumptions or translation invariants. It requires a physical, or mechanistic understanding of the supply chain network. For this reason, the type of models we deploy in supply chain often involve simulations or more mechanistic understanding of the network.

-

In supply chain Machine Learning alone is not enough, we need to bridge between models that contain real mechanisms and models that are entirely data driven.

-

This is challenging, because as we introduce more mechanism to the models we use, it becomes harder to develop efficient algorithms to match those models to data.

+
+

Figure: The transport cost of most foods is a very small portion of +the total cost. The exception is if foods are air freighted. Source: https://ourworldindata.org/food-choice-vs-eating-local +by Hannah Ritchie CC-BY

+
+
+

This effect on cost of transport vs cost of processing is the main +driver of the topology of the modern supply chain and the associated +effect of globalization. If transport is much cheaper than processing, +then processing will tend to agglomerate in places where processing +costs can be minimized.

+

Large scale global economic change has principally been driven by +changes in the technology that drives supply chain.

+

Supply chain is a large-scale automated decision making network. Our +aim is to make decisions not only based on our models of customer +behavior (as observed through data), but also by accounting for the +structure of our fulfilment center, and delivery network.

+

Many of the most important questions in supply chain take the form of +counterfactuals. E.g. “What would happen if we opened a manufacturing +facility in Cambridge?” A counterfactual is a question that implies a +mechanistic understanding of a system. It goes beyond simple smoothness +assumptions or translation invariants. It requires a physical, or +mechanistic understanding of the supply chain network. For this +reason, the types of models we deploy in supply chain often involve +simulations or a more mechanistic understanding of the network.

+

In supply chain Machine Learning alone is not enough, we need to +bridge between models that contain real mechanisms and models that are +entirely data driven.

+

This is challenging, because as we introduce more mechanism to the +models we use, it becomes harder to develop efficient algorithms to +match those models to data.

For Africa

-

[edit]

-

There is a large opportunity because infrastructures around automation are moving from physical infrastructure towards information infrastructures. How can African countries benefit from a modern information infrastructure? The aim of Data Science Africa is to answer this question, with the answers coming from the attendees.

-

Machine learning aims to replicate processes through the direct use of data. When deployed in the domain of ‘artificial intelligence’, the processes that it is replicating, or emulating, are cognitive processes.

-

The first trick in machine learning is to convert the process itself into a mathematical function. That function has a set of parameters which control its behaviour. What we call learning is the adaption of these parameters to change the behavior of the function. The choice of mathematical function we use is a vital component of the model.

+
+[edit] +
+

There is a large opportunity because infrastructures around +automation are moving from physical infrastructure towards information +infrastructures. How can African countries benefit from a modern +information infrastructure? The aim of Data Science Africa is to answer +this question, with the answers coming from the attendees.

+

Machine learning aims to replicate processes through the direct use +of data. When deployed in the domain of ‘artificial intelligence’, the +processes that it is replicating, or emulating, are cognitive +processes.

+

The first trick in machine learning is to convert the process itself +into a mathematical function. That function has a set of +parameters which control its behaviour. What we call learning is the +adaption of these parameters to change the behavior of the function. The +choice of mathematical function we use is a vital component of the +model.

- + -
+
-

Figure: The Kapchorwa District, home district of Stephen Kiprotich.

+

Figure: The Kapchorwa District, home district of Stephen +Kiprotich.

-

Stephen Kiprotich, the 2012 gold medal winner from the London Olympics, comes from Kapchorwa district, in eastern Uganda, near the border with Kenya.

+

Stephen Kiprotich, the 2012 gold medal winner from the London +Olympics, comes from Kapchorwa district, in eastern Uganda, near the +border with Kenya.

Olympic Marathon Data

-

[edit]

+
+[edit] +
  • Gold medal times for Olympic Marathon since 1896.
  • -
  • Marathons before 1924 didn’t have a standardised distance.
  • +
  • Marathons before 1924 didn’t have a standardized distance.
  • Present results using pace per km.
  • -
  • In 1904 Marathon was badly organised leading to very slow times.
  • +
  • In 1904 Marathon was badly organized leading to very slow +times.
-
- +
+
-Image from Wikimedia Commons http://bit.ly/16kMKHQ +Image from Wikimedia Commons http://bit.ly/16kMKHQ
-

The first thing we will do is load a standard data set for regression modelling. The data consists of the pace of Olympic Gold Medal Marathon winners for the Olympics from 1896 to present. First we load in the data and plot.

-
%pip install --upgrade git+https://github.com/sods/ods
-
import numpy as np
-import pods
-
data = pods.datasets.olympic_marathon_men()
-x = data['X']
-y = data['Y']
-
-offset = y.mean()
-scale = np.sqrt(y.var())
+

The first thing we will do is load a standard data set for regression +modelling. The data consists of the pace of Olympic Gold Medal Marathon +winners for the Olympics from 1896 to present. Let’s load in the data +and plot.

+
%pip install pods
+
import numpy as np
+import pods
+
data = pods.datasets.olympic_marathon_men()
+x = data['X']
+y = data['Y']
+
+offset = y.mean()
+scale = np.sqrt(y.var())
+yhat = (y - offset)/scale
- + -
+
-

Figure: Olympic marathon pace times since 1892.

-
-
-

Things to notice about the data include the outlier in 1904, in this year, the olympics was in St Louis, USA. Organizational problems and challenges with dust kicked up by the cars following the race meant that participants got lost, and only very few participants completed.

-

More recent years see more consistently quick marathons.

-

Polynomial Fits to Olympic Data

-

[edit]

-
import numpy as np
-from matplotlib import pyplot as plt
-import mlai
-import pods
-
basis = mlai.polynomial
-
-data = pods.datasets.olympic_marathon_men()
-
-x = data['X']
-y = data['Y']
-
-xlim = [1892, 2020]
-
-basis=mlai.Basis(mlai.polynomial, number=1, data_limits=xlim)
-
import numpy as np
-from matplotlib import pyplot as plt
-import teaching_plots as plot
-import mlai
-import pods
-
basis = mlai.polynomial
-
-data = pods.datasets.olympic_marathon_men()
-
-x = data['X']
-y = data['Y']
-
-xlim = [1892, 2020]
-max_basis = 27
-
-ll = np.array([np.nan]*(max_basis))
-sum_squares = np.array([np.nan]*(max_basis))
-basis=mlai.Basis(mlai.polynomial, number=1, data_limits=xlim)
-
plot.rmse_fit(x, y, param_name='number', param_range=(1, 28), 
-              model=mlai.LM, basis=basis, 
-              xlim=xlim, objective_ylim=[0, 0.8],
-              diagrams='../slides/diagrams/ml')
+

Figure: Olympic marathon pace times since 1896.

+ + +

Things to notice about the data include the outlier in 1904, in that +year the Olympics was in St Louis, USA. Organizational problems and +challenges with dust kicked up by the cars following the race meant that +participants got lost, and only very few participants completed. More +recent years see more consistently quick marathons.
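A quick way to see that outlier in the arrays loaded above (a small added check, assuming, as in the cell above, that x holds the years and y the winning pace):

print(y[x[:, 0] == 1904])  # the 1904 winning pace stands out
print(y.mean())            # compare with the average winning pace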

+

Polynomial Fits to +Olympic Marathon Data

+
+[edit] +
+
import numpy as np
+

Define the polynomial basis function.

+
import mlai
+

+from mlai import polynomial
+
def polynomial(x, num_basis=4, data_limits=[-1., 1.]):
+    "Polynomial basis"
+    centre = data_limits[0]/2. + data_limits[1]/2.
+    span = data_limits[1] - data_limits[0]
+    z = np.asarray(x, dtype=float) - centre
+    z = 2*z/span   # scale the inputs to be within -1, 1 where polynomials are well behaved
+    Phi = np.zeros((x.shape[0], num_basis))
+    for i in range(num_basis):
+        Phi[:, i:i+1] = z**i
+    return Phi
+

Now we include the solution for the linear regression through +QR-decomposition.

+
def basis_fit(Phi, y):
+    """Use QR decomposition to fit the basis."""
+    from scipy import linalg  # solve_triangular exploits the upper triangular R
+    Q, R = np.linalg.qr(Phi)
+    return linalg.solve_triangular(R, Q.T@y)
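As a minimal sanity check (an aside added here, assuming x and y are the arrays loaded earlier), the QR route should give the same weights as numpy’s least squares solver:

Phi_check = polynomial(x, num_basis=2, data_limits=[1892, 2020])
w_check = basis_fit(Phi_check, y)
w_lstsq, *_ = np.linalg.lstsq(Phi_check, y, rcond=None)
print(np.allclose(w_check, w_lstsq))  # should print True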
+

Linear Fit

+
xlim = [1892, 2020]  # year range used to scale the polynomial basis
+poly_args = {'num_basis':2, # two basis functions (1 and x)
+             'data_limits':xlim}
+Phi = polynomial(x, **poly_args)
+w = basis_fit(Phi, y)
+

Now we make some predictions for the fit.

+
x_pred = np.linspace(xlim[0], xlim[1], 400)[:, np.newaxis]
+Phi_pred = polynomial(x_pred, **poly_args)
+f_pred = Phi_pred@w
+
+
+ + +
+
+ +
+
+

Figure: Fit of a 1-degree polynomial (a linear model) to the Olympic +marathon data.

+
+
+

Cubic Fit

+
poly_args = {'num_basis':4, # four basis: 1, x, x^2, x^3
+             'data_limits':xlim}
+Phi = polynomial(x, **poly_args)
+w = basis_fit(Phi, y)
+
Phi_pred = polynomial(x_pred, **poly_args)
+f_pred = Phi_pred@w
+
+
+ + +
+
+ +
+
+

Figure: Fit of a 3-degree polynomial (a cubic model) to the Olympic +marathon data.

+
+
+

9th Degree Polynomial Fit

+

Now we’ll try a 9th degree polynomial fit to the data.

+
poly_args = {'num_basis':10, # basis up to x^9
+             'data_limits':xlim}
+Phi = polynomial(x, **poly_args)
+w = basis_fit(Phi, y)
+
Phi_pred = polynomial(x_pred, **poly_args)
+f_pred = Phi_pred@w
+
+
+ + +
+
+ +
+
+

Figure: Fit of a 9-degree polynomial to the Olympic marathon +data.

+
+
+

16th Degree Polynomial Fit

+

Now we’ll try a 16th degree polynomial fit to the data.

+
poly_args = {'num_basis':17, # basis up to x^16
+             'data_limits':xlim}
+Phi = polynomial(x, **poly_args)
+w = basis_fit(Phi, y)
+
Phi_pred = polynomial(x_pred, **poly_args)
+f_pred = Phi_pred@w
-
- +
+
-
+
-
-

Figure: Fit of a 1 degree polynomial to the olympic marathon data.

+
+

Figure: Fit of a 16-degree polynomial to the Olympic marathon +data.

+
+
+

26th Degree Polynomial Fit

+

Now we’ll try a 26th degree polynomial fit to the data.

+
poly_args = {'num_basis':27, # basis up to x^26
+             'data_limits':xlim}
+Phi = polynomial(x, **poly_args)
+w = basis_fit(Phi, y)
+
Phi_pred = polynomial(x_pred, **poly_args)
+f_pred = Phi_pred@w
+
+
+ +
+
+ +
+
+

Figure: Fit of a 26-degree polynomial to the Olympic marathon +data.
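To make the effect of increasing the degree concrete, a short loop (added here as an illustration; it reuses the polynomial and basis_fit functions and the x, y data from above, with the same data limits) computes the training error of each fit:

for num_basis in [2, 4, 10, 17, 27]:
    Phi_d = polynomial(x, num_basis=num_basis, data_limits=[1892, 2020])
    w_d = basis_fit(Phi_d, y)
    rmse = np.sqrt(((y - Phi_d@w_d)**2).mean())
    print('degree', num_basis - 1, 'training RMSE', rmse)

Because the bases are nested and each fit is an exact least squares solution, the training error can only decrease (up to numerical precision) as the degree grows; the wild behaviour of the high-degree fits shows up between and beyond the data points, not in this number.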

+
+
+

What does Machine Learning +do?

+
+[edit] +
+

Any process of automation allows us to scale what we do by codifying +a process in some way that makes it efficient and repeatable. Machine +learning automates by emulating human (or other) actions found in data. +Machine learning codifies in the form of a mathematical function that is +learnt by a computer. If we can create these mathematical functions in +ways in which they can interconnect, then we can also build systems.

+

Machine learning works through codifying a prediction of interest +into a mathematical function. For example, we can try and predict the +probability that a customer wants to buy a jersey given knowledge of +their age, and the latitude where they live. The technique known as +logistic regression estimates the odds that someone will buy a jumper as +a linear weighted sum of the features of interest.

+

\[ \text{odds} = +\frac{p(\text{bought})}{p(\text{not bought})} \]

+

\[ \log \text{odds} = w_0 + w_1 +\text{age} + w_2 \text{latitude}.\] Here \(w_0\), \(w_1\) and \(w_2\) are the parameters of the model. If +\(w_1\) and \(w_2\) are both positive, then the log-odds +that someone will buy a jumper increase with increasing latitude and +age, so the further north you are and the older you are the more likely +you are to buy a jumper. The parameter \(w_0\) is an offset parameter and gives the +log-odds of buying a jumper at zero age and on the equator. It is likely +to be negative1 indicating that the purchase is +odds-against. This is also a classical statistical model, and models +like logistic regression are widely used to estimate probabilities from +ad-click prediction to disease risk.

+

This is called a generalized linear model, we can also think of it as +estimating the probability of a purchase as a nonlinear +function of the features (age, latitude) and the parameters (the \(w\) values). The function is known as the +sigmoid or logistic +function, thus the name logistic regression.
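A minimal numerical sketch of this model is below; the weights are made-up values for illustration only, not fitted parameters.

import numpy as np

def sigmoid(z):
    "Logistic (sigmoid) squashing function."
    return 1.0/(1.0 + np.exp(-z))

w_0, w_1, w_2 = -6.0, 0.05, 0.05    # illustrative offset, age and latitude weights
age, latitude = 60.0, 55.0          # e.g. a sixty year old living at latitude 55

log_odds = w_0 + w_1*age + w_2*latitude
p_bought = sigmoid(log_odds)
odds = p_bought/(1 - p_bought)      # equals exp(log_odds)
print(log_odds, odds, p_bought)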

+

Sigmoid Function

+
+[edit]
-
- +
+
-
+
-
-

Figure: Fit of a 2 degree polynomial to the olympic marathon data.

-
-
-

What does Machine Learning do?

-

[edit]

-

Any process of automation allows us to scale what we do by codifying a process in some way that makes it efficient and repeatable. Machine learning automates by emulating human (or other actions) found in data. Machine learning codifies in the form of a mathematical function that is learnt by a computer. If we can create these mathematical functions in ways in which they can interconnect, then we can also build systems.

-

Machine learning works through codifing a prediction of interest into a mathematical function. For example, we can try and predict the probability that a customer wants to by a jersey given knowledge of their age, and the latitude where they live. The technique known as logistic regression estimates the odds that someone will by a jumper as a linear weighted sum of the features of interest.

-


$$ \text{odds} = \frac{p(\text{bought})}{p(\text{not bought})} $$

-


log odds = β0 + β1age + β2latitude.
Here β0, β1 and β2 are the parameters of the model. If β1 and β2 are both positive, then the log-odds that someone will buy a jumper increase with increasing latitude and age, so the further north you are and the older you are the more likely you are to buy a jumper. The parameter β0 is an offset parameter, and gives the log-odds of buying a jumper at zero age and on the equator. It is likely to be negative1 indicating that the purchase is odds-against. This is actually a classical statistical model, and models like logistic regression are widely used to estimate probabilities from ad-click prediction to risk of disease.

-

This is called a generalized linear model, we can also think of it as estimating the probability of a purchase as a nonlinear function of the features (age, lattitude) and the parameters (the β values). The function is known as the sigmoid or logistic function, thus the name logistic regression.

-


$$ p(\text{bought}) = \sigmoid{\beta_0 + \beta_1 \text{age} + \beta_2 \text{latitude}}.$$
In the case where we have features to help us predict, we sometimes denote such features as a vector, $\inputVector$, and we then use an inner product between the features and the parameters, $\boldsymbol{\beta}^\top \inputVector = \beta_1 \inputScalar_1 + \beta_2 \inputScalar_2 + \beta_3 \inputScalar_3 ...$, to represent the argument of the sigmoid.

-


$$ p(\text{bought}) = \sigmoid{\boldsymbol{\beta}^\top \inputVector}.$$
More generally, we aim to predict some aspect of our data, $\dataScalar$, by relating it through a mathematical function, $\mappingFunction(\cdot)$, to the parameters, β and the data, $\inputVector$.

-


$$ \dataScalar = \mappingFunction\left(\inputVector, \boldsymbol{\beta}\right).$$
We call $\mappingFunction(\cdot)$ the prediction function.

-

To obtain the fit to data, we use a separate function called the objective function that gives us a mathematical representation of the difference between our predictions and the real data.

-


$$\errorFunction(\boldsymbol{\beta}, \dataMatrix, \inputMatrix)$$
A commonly used examples (for example in a regression problem) is least squares,
$$\errorFunction(\boldsymbol{\beta}, \dataMatrix, \inputMatrix) = \sum_{i=1}^\numData \left(\dataScalar_i - \mappingFunction(\inputVector_i, \boldsymbol{\beta})\right)^2.$$

-

If a linear prediction function is combined with the least squares objective function then that gives us a classical linear regression, another classical statistical model. Statistics often focusses on linear models because it makes interpretation of the model easier. Interpretation is key in statistics because the aim is normally to validate questions by analysis of data. Machine learning has typically focussed more on the prediction function itself and worried less about the interpretation of parameters, which are normally denoted by w instead of β. As a result non-linear functions are explored more often as they tend to improve quality of predictions but at the expense of interpretability.

+
+

Figure: The logistic function.

+
+ +

The function has this characteristic ‘s’-shape (which is where the term +sigmoid, as in sigma, comes from). It also takes the input from the +entire real line and ‘squashes’ it into an output that is between zero +and one. For this reason it is sometimes also called a ‘squashing +function’.

+

The sigmoid comes from inverting the odds ratio, \[ +\frac{\pi}{(1-\pi)} +\] where \(\pi\) is the +probability of a positive outcome and \(1-\pi\) is the probability of a negative +outcome.
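Making that inversion explicit (an added step in the same notation): if the log odds are set to a score \(z\), then \[ \log \frac{\pi}{1-\pi} = z \quad\Rightarrow\quad \frac{\pi}{1-\pi} = e^{z} \quad\Rightarrow\quad \pi = \frac{e^{z}}{1 + e^{z}} = \frac{1}{1 + e^{-z}} = \sigma(z), \] which is exactly the logistic function plotted above.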

+

\[ p(\text{bought}) = \sigma\left(w_0 + +w_1 \text{age} + w_2 \text{latitude}\right).\]

+

In the case where we have features to help us predict, we +sometimes denote such features as a vector, \(\mathbf{ x}\), and we then use an inner +product between the features and the parameters, \(\mathbf{ w}^\top \mathbf{ x}= w_1 x_1 + w_2 x_2 + +w_3 x_3 ...\), to represent the argument of the sigmoid.

+

\[ p(\text{bought}) += \sigma\left(\mathbf{ w}^\top \mathbf{ x}\right).\] More +generally, we aim to predict some aspect of our data, \(y\), by relating it through a mathematical +function, \(f(\cdot)\), to the +parameters, \(\mathbf{ w}\) and the +data, \(\mathbf{ x}\).

+

\[ y= f\left(\mathbf{ x}, \mathbf{ +w}\right).\] We call \(f(\cdot)\) the prediction +function.

+

To obtain the fit to data, we use a separate function called the +objective function that gives us a mathematical representation +of the difference between our predictions and the real data.

+

\[E(\mathbf{ w}, \mathbf{Y}, +\mathbf{X})\] A commonly used example (for example in a +regression problem) is least squares, \[E(\mathbf{ w}, \mathbf{Y}, \mathbf{X}) = +\sum_{i=1}^n\left(y_i - f(\mathbf{ x}_i, \mathbf{ +w})\right)^2.\]
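Written directly in code, a minimal sketch of this objective for a linear prediction function looks as follows (the toy numbers are for illustration only):

import numpy as np

def f(X, w):
    "Linear prediction function: one prediction per row of X."
    return X@w

def objective(w, y, X):
    "Least squares error: the sum of squared residuals."
    return ((y - f(X, w))**2).sum()

X_toy = np.array([[1.0, 0.5], [1.0, 1.5], [1.0, 2.5]])  # first column is a constant feature
y_toy = np.array([1.0, 2.0, 2.9])
w_toy = np.array([0.5, 1.0])
print(objective(w_toy, y_toy, X_toy))  # about 0.01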

+

If a linear prediction function is combined with the least squares +objective function, then that gives us a classical linear +regression, another classical statistical model. Statistics often +focusses on linear models because it makes interpretation of the model +easier. Interpretation is key in statistics because the aim is normally +to validate questions by analysis of data. Machine learning has +typically focused more on the prediction function itself and worried +less about the interpretation of parameters. In statistics, where +interpretation is typically more important than prediction, parameters +are normally denoted by \(\boldsymbol{\beta}\) instead of \(\mathbf{ w}\).

+

A key difference between statistics and machine learning is that +(traditionally) machine learning has focussed on predictive capability +and statistics has focussed on interpretability. That means that in a +statistics class far more emphasis will be placed on interpretation of +the parameters. In machine learning, the parameters, \(\mathbf{ w}\), are just a means +to an end. But in statistics, when we denote the parameters by \(\boldsymbol{\beta}\), we often use the +parameters to tell us something about the disease.

+

So we move between \[ p(\text{bought}) += \sigma\left(w_0 + w_1 \text{age} + w_2 +\text{latitude}\right).\]

+

when the emphasis is on predictive power, to

+

\[ p(\text{bought}) = \sigma\left(\beta_0 ++ \beta_1 \text{age} + \beta_2 \text{latitude}\right).\]

+

when the emphasis is on interpretation of the parameters.

+

Another effect of the focus on prediction in machine learning is that +non-linear approaches, which can be harder to interpret, are +more widely deployed – they tend to improve quality +of predictions at the expense of interpretability.

What is Machine Learning?

-

[edit]

-

Machine learning allows us to extract knowledge from data to form a prediction.

-


$$\text{data} + \text{model} \stackrel{\text{compute}}{\rightarrow} \text{prediction}$$

-

A machine learning prediction is made by combining a model with data to form the prediction. The manner in which this is done gives us the machine learning algorithm.

-

Machine learning models are mathematical models which make weak assumptions about data, e.g. smoothness assumptions. By combining these assumptions with the data, we observe we can interpolate between data points or, occasionally, extrapolate into the future.

-

Machine learning is a technology which strongly overlaps with the methodology of statistics. From a historical/philosophical view point, machine learning differs from statistics in that the focus in the machine learning community has been primarily on accuracy of prediction, whereas the focus in statistics is typically on the interpretability of a model and/or validating a hypothesis through data collection.

-

The rapid increase in the availability of compute and data has led to the increased prominence of machine learning. This prominence is surfacing in two different but overlapping domains: data science and artificial intelligence.

+
+[edit] +
+

Machine learning allows us to extract knowledge from data to form a +prediction.

+

\[\text{data} + \text{model} +\stackrel{\text{compute}}{\rightarrow} \text{prediction}\]

+

A machine learning prediction is made by combining a model with data +to form the prediction. The manner in which this is done gives us the +machine learning algorithm.

+

Machine learning models are mathematical models which make +weak assumptions about data, e.g. smoothness assumptions. By combining +these assumptions with the data, we observe we can interpolate between +data points or, occasionally, extrapolate into the future.

+

Machine learning is a technology which strongly overlaps with the +methodology of statistics. From a historical/philosophical view point, +machine learning differs from statistics in that the focus in the +machine learning community has been primarily on accuracy of prediction, +whereas the focus in statistics is typically on the interpretability of +a model and/or validating a hypothesis through data collection.

+

The rapid increase in the availability of compute and data has led to +the increased prominence of machine learning. This prominence is +surfacing in two different but overlapping domains: data science and +artificial intelligence.

From Model to Decision

[edit]

The real challenge, however, is end-to-end decision making. Taking information from the environment and using it to drive decision making to achieve goals.

Artificial Intelligence and Data Science

[edit]

Artificial intelligence has the objective of endowing computers with human-like intelligent capabilities. For example, understanding an image (computer vision) or the contents of some speech (speech recognition), the meaning of a sentence (natural language processing) or the translation of a sentence (machine translation).

Supervised Learning for AI

The machine learning approach to artificial intelligence is to collect and annotate a large data set from humans. The problem is characterized by input data (e.g. a particular image) and a label (e.g. is there a car in the image, yes/no). The machine learning algorithm fits a mathematical function (I call this the prediction function) to map from the input image to the label. The parameters of the prediction function are set by minimizing an error between the function’s predictions and the true data. The mathematical function that encapsulates this error is known as the objective function.

This approach to machine learning is known as supervised learning. Various approaches to supervised learning use different prediction functions, objective functions or different optimization algorithms to fit them.

For example, deep learning makes use of neural networks to form the predictions. A neural network is a particular type of mathematical function that allows the algorithm designer to introduce invariances into the function.

An invariance is an important way of including prior understanding in a machine learning model. For example, in an image, a car is still a car regardless of whether it’s in the upper left or lower right corner of the image. This is known as translation invariance. A neural network encodes translation invariance in convolutional layers. Convolutional neural networks are widely used in image recognition tasks.

An alternative structure is known as a recurrent neural network (RNN). RNNs encode temporal structure. Because they use auto-regressive connections in their hidden layers, they can be seen as time series models with non-linear auto-regressive basis functions. They are widely used in speech recognition and machine translation.

Machine learning has been deployed in speech recognition (e.g. Alexa: deep neural networks and convolutional neural networks for speech recognition) and in computer vision (e.g. Amazon Go: convolutional neural networks for person recognition and pose detection).

The field of data science is related to AI, but philosophically different. It arises because we are increasingly creating large amounts of data through happenstance rather than active collection. In the modern era data is laid down by almost all our activities. The objective of data science is to extract insights from this data.

Classically, in the field of statistics, data analysis proceeds by assuming that the question (or scientific hypothesis) comes before the data is created. E.g., if I want to determine the effectiveness of a particular drug, I perform a design for my data collection. I use foundational approaches such as randomization to account for confounders. This made a lot of sense in an era where data had to be actively collected. The reduction in cost of data collection and storage now means that many data sets are available which weren’t collected with a particular question in mind. This is a challenge because bias in the way data was acquired can corrupt the insights we derive. We can perform randomized control trials (or A/B tests) to verify our conclusions, but the opportunity is to use data science techniques to better guide our question selection or even answer a question without the expense of a full randomized control trial (referred to as A/B testing in modern internet parlance).

Neural Networks and Prediction Functions

[edit]

Neural networks are adaptive non-linear function models. Originally, they were studied (by McCulloch and Pitts (McCulloch and Pitts, 1943)) as simple models for neurons, but over the last decade they have become popular because they are a flexible approach to modelling complex data. A particular characteristic of neural network models is that they can be composed to form highly complex functions which encode many of our expectations of the real world. They allow us to encode our assumptions about how the world works.

We will return to composition later, but for the moment, let’s focus on a one hidden layer neural network. We are interested in the prediction function, so we’ll ignore the objective function (which is often called an error function) for the moment, and just describe the mathematical object of interest

\[
f(\mathbf{x}) = \mathbf{W}^\top \boldsymbol{\phi}(\mathbf{V}, \mathbf{x})
\]

Where in this case \(f(\cdot)\) is a scalar function with vector inputs, and \(\boldsymbol{\phi}(\cdot)\) is a vector function with vector inputs. The dimensionality of the vector function is known as the number of hidden units, or the number of neurons. The elements of this vector function are known as the activation function of the neural network and \(\mathbf{V}\) are the parameters of the activation functions.
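To make the notation concrete, here is a minimal numpy sketch of such a one-hidden-layer prediction function. The ReLU activation, the function names and the random parameter values are illustrative assumptions, not part of the lecture.

import numpy as np

def relu(z):
    "One possible choice of activation function."
    return np.maximum(z, 0.)

def neural_network(x, W, V):
    "One-hidden-layer prediction function f(x) = W^T phi(V, x)."
    phi = relu(V @ x)   # vector of hidden-unit activations, phi(V, x)
    return W.T @ phi    # scalar prediction

# Toy usage: 4 input dimensions, 10 hidden units.
x = np.random.randn(4)
V = np.random.randn(10, 4)   # parameters of the activation functions
W = np.random.randn(10)      # output weights
print(neural_network(x, W, V))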

Relations with Classical Statistics

In statistics, activation functions are traditionally known as basis functions, and we would think of this as a linear model. It doesn’t make linear predictions, but it’s linear because in statistics estimation focuses on the parameters, \(\mathbf{W}\), not the parameters, \(\mathbf{V}\). The linear model terminology refers to the fact that the model is linear in the parameters, but it is not linear in the data unless the activation functions are chosen to be linear.

Adaptive Basis Functions

The first difference between the (early) neural network literature and the classical statistical literature is the decision to optimize these parameters, \(\mathbf{V}\), as well as the parameters, \(\mathbf{W}\) (which would normally be denoted in statistics by \(\boldsymbol{\beta}\))2.

Machine Learning

The key idea in machine learning is to observe the system in practice, and then emulate its behavior with mathematics. That leads to a design challenge as to where to place the mathematical function. The placement of the mathematical function leads to the different domains of machine learning.

  1. Supervised learning
  2. Unsupervised learning
  3. Reinforcement learning

Supervised Learning

[edit]

Supervised learning is one of the most widely deployed machine learning technologies, and a particular domain of success has been classification. Classification is the process of taking an input (which might be an image) and categorizing it into one of a number of different classes (e.g. dog or cat). This simple idea underpins a lot of machine learning. By scanning across the image we can also determine where the animal is in the image.

Introduction to Classification

[edit]

Classification is perhaps the technique most closely associated with machine learning. In speech-based agents, on-device classifiers are used to determine when the wake word is used. A wake word is a word that wakes up the device. For the Amazon Echo it is “Alexa”, for Siri it is “Hey Siri”. Once the wake word is detected with a classifier, the speech can be uploaded to the cloud for full processing, the speech recognition stages.

This isn’t just useful for intelligent agents; the UN Global Pulse project on public discussion on radio also uses wake word detection for recording radio conversations.

A major breakthrough in image classification came in 2012 with the ImageNet result of Alex Krizhevsky, Ilya Sutskever and Geoff Hinton from the University of Toronto. ImageNet is a large database of 14 million images with many thousands of classes. The data is used in a community-wide challenge for object categorization. Krizhevsky et al. used convolutional neural networks to outperform all previous approaches on the challenge. They formed a company which was purchased shortly after by Google. This challenge, known as object categorisation, was a major obstacle for practical computer vision systems. Modern object categorization systems are close to human performance.

Machine learning problems normally involve a prediction function and an objective function. Regression is the case where the prediction function is over the real numbers, so the codomain of the function, \(f(\mathbf{X})\), is the real numbers or sometimes real vectors. The classification problem consists of predicting whether or not a particular example is a member of a particular class. So we may want to know if a particular image represents a digit 6 or if a particular user will click on a given advert. These are classification problems, and they require us to map to yes or no answers. That makes them naturally discrete mappings.

In classification we are given an input vector, \(\mathbf{x}\), and an associated label, \(y\), which either takes the value \(-1\) to represent no or \(1\) to represent yes.

In supervised learning the inputs, \(\mathbf{x}\), are mapped to a label, \(y\), through a function \(f(\cdot)\) that is dependent on a set of parameters, \(\mathbf{w}\), \[
y = f(\mathbf{x}; \mathbf{w}).
\] The function \(f(\cdot)\) is known as the prediction function. The key challenges are (1) choosing which features, \(\mathbf{x}\), are relevant in the prediction, (2) defining the appropriate class of function, \(f(\cdot)\), to use and (3) selecting the right parameters, \(\mathbf{w}\).
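As an illustrative sketch, not the notes’ own prediction function (that is defined for logistic regression below), a minimal linear prediction function mapping an input vector to a label in \(\{-1, 1\}\) might look like the following; the name predict_label and the bias term b are assumptions added for illustration.

import numpy as np

def predict_label(x, w, b=0.0):
    "Linear prediction function mapping x to a label in {-1, 1}: sign(w^T x + b)."
    return 1 if np.dot(w, x) + b >= 0 else -1

# Toy usage with made-up parameters.
w = np.array([0.5, -1.0])
print(predict_label(np.array([2.0, 0.5]), w))   # prints 1
print(predict_label(np.array([-1.0, 2.0]), w))  # prints -1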

Classification Examples

[edit]

  • Classifying hand written digits from binary images (automatic zip code reading)
  • Detecting faces in images (e.g. digital cameras).
  • Who a detected face belongs to (e.g. Facebook, DeepFace)
  • Classifying type of cancer given gene expression data.
  • Categorization of document types (different types of news article on the internet)


Logistic Regression

[edit]

A logistic regression is an approach to classification which extends the linear basis function models we’ve already explored. Rather than modeling the output of the function directly, the assumption is that we model the log-odds with the basis functions.

The odds are defined as the ratio of the probability of a positive outcome to the probability of a negative outcome. If the probability of a positive outcome is denoted by \(\pi\), then the odds are computed as \(\frac{\pi}{1-\pi}\). Odds are widely used by bookmakers in gambling, although a bookmaker’s odds won’t normalise: i.e. if you look at the equivalent probabilities, and sum over the probability of all outcomes the bookmakers are considering, then you won’t get one. This is how the bookmaker makes a profit. Because a probability is always between zero and one, the odds are always between \(0\) and \(\infty\). If the positive outcome is unlikely the odds are close to zero; if it is very likely then the odds become close to infinite. Taking the logarithm of the odds maps the odds from the positive half space to being across the entire real line. Odds that were between 0 and 1 (where the negative outcome was more likely) are mapped to the range between \(-\infty\) and \(0\). Odds that are greater than 1 are mapped to the range between \(0\) and \(\infty\). Considering the log odds therefore takes a number between 0 and 1 (the probability of positive outcome) and maps it to the entire real line. The function that does this is known as the logit function, \(g^{-1}(p_i) = \log\frac{p_i}{1-p_i}\). This function is known as a link function.

For a standard regression we take, \[
f(\mathbf{x}) = \mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x}),
\] if we want to perform classification we perform a logistic regression, \[
\log \frac{\pi}{(1-\pi)} = \mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x})
\] where the odds ratio between the positive class and the negative class is given by \[
\frac{\pi}{(1-\pi)}.
\] The odds can never be negative, but can take any value from 0 to \(\infty\). We have defined the link function as taking the form \(g^{-1}(\cdot)\), implying that the inverse link function is given by \(g(\cdot)\). Since we have defined, \[
g^{-1}(\pi) = \mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x})
\] we can write \(\pi\) in terms of the inverse link function, \(g(\cdot)\), as \[
\pi = g(\mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x})).
\]
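A small numerical check of this relationship (a sketch, not part of the original notes; the function names are illustrative) confirms that the logit and the sigmoid are inverses of one another.

import numpy as np

def logit(p):
    "Link function g^{-1}(p) = log(p/(1-p)), maps (0, 1) to the whole real line."
    return np.log(p/(1-p))

def sigmoid(f):
    "Inverse link g(f) = 1/(1+exp(-f)), maps the real line back to (0, 1)."
    return 1./(1.+np.exp(-f))

p = np.array([0.1, 0.5, 0.9])
print(logit(p))           # log-odds: roughly [-2.2, 0.0, 2.2]
print(sigmoid(logit(p)))  # recovers [0.1, 0.5, 0.9]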

Basis Function

We’ll define our prediction, objective and gradient functions below. But before we start, we need to define a basis function for our model. Let’s start with the linear basis.

import numpy as np

import mlai

from mlai import linear

Prediction Function

Now we have the basis function let’s define the prediction function.

import numpy as np

def predict(w, x, basis=linear, **kwargs):
    "Generates the prediction function and the basis matrix."
    Phi = basis(x, **kwargs)
    f = np.dot(Phi, w)
    return 1./(1+np.exp(-f)), Phi

This inverse of the link function is known as the logistic (thus the name logistic regression) or sometimes it is called the sigmoid function. For a particular value of the input to the link function, \(f_i = \mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x}_i)\), we can plot the value of the inverse link function as below.

Sigmoid Function

[edit]

Figure: The logistic function.

The function has this characteristic ‘s’-shape (from where the term sigmoid, as in sigma, comes from). It also takes the input from the entire real line and ‘squashes’ it into an output that is between zero and one. For this reason it is sometimes also called a ‘squashing function’.

By replacing the inverse link with the sigmoid we can write \(\pi\) as a function of the input and the parameter vector as, \[
\pi(\mathbf{x},\mathbf{w}) = \frac{1}{1+\exp\left(-\mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x})\right)}.
\] The process for logistic regression is as follows. Compute the output of a standard linear basis function composition (\(\mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x})\), as we did for linear regression) and then apply the inverse link function, \(g(\mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x}))\). In logistic regression this involves squashing it with the logistic (or sigmoid) function. Use this value, which now has an interpretation as a probability in a Bernoulli distribution, to form the likelihood. Then we can assume conditional independence of each data point given the parameters and develop a likelihood for the entire data set.

As we discussed last time, the Bernoulli likelihood is of the form, \[
P(y_i|\mathbf{w}, \mathbf{x}) = \pi_i^{y_i} (1-\pi_i)^{1-y_i}
\] which we can think of as a clever trick for mathematically switching between two probabilities. If we were to write it as code it would be better described as

def bernoulli(x, y, pi):
    if y == 1:
        return pi(x)
    else:
        return 1-pi(x)

but writing it mathematically makes it easier to write our objective function within a single mathematical equation.

Maximum Likelihood

To obtain the parameters of the model, we need to maximize the likelihood, or minimize the objective function, normally taken to be the negative log likelihood. With a data conditional independence assumption the likelihood has the form, \[
P(\mathbf{y}|\mathbf{w}, \mathbf{X}) = \prod_{i=1}^n P(y_i|\mathbf{w}, \mathbf{x}_i),
\] which can be written as a log likelihood in the form \[
\log P(\mathbf{y}|\mathbf{w}, \mathbf{X}) = \sum_{i=1}^n \log P(y_i|\mathbf{w}, \mathbf{x}_i) = \sum_{i=1}^n y_i \log \pi_i + \sum_{i=1}^n (1-y_i)\log (1-\pi_i)
\] and if we take the probability of positive outcome for the \(i\)th data point to be given by \[
\pi_i = g\left(\mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x}_i)\right),
\] where \(g(\cdot)\) is the inverse link function, then this leads to an objective function of the form, \[
E(\mathbf{w}) = - \sum_{i=1}^n y_i \log g\left(\mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x}_i)\right) - \sum_{i=1}^n (1-y_i)\log \left(1-g\left(\mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x}_i)\right)\right).
\]

import numpy as np

def objective(g, y):
    "Computes the objective function."
    labs = np.asarray(y, dtype=float).flatten()
    posind = np.where(labs==1)
    negind = np.where(labs==0)
    return -np.log(g[posind, :]).sum() - np.log(1-g[negind, :]).sum()

As normal, we would like to minimize this objective. This can be done by differentiating with respect to the parameters of our prediction function, \(\pi(\mathbf{x};\mathbf{w})\), for optimisation. The gradient of the likelihood with respect to \(\pi(\mathbf{x};\mathbf{w})\) is of the form, \[
\frac{\text{d}E(\mathbf{w})}{\text{d}\mathbf{w}} = -\sum_{i=1}^n \frac{y_i}{g\left(\mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x}_i)\right)}\frac{\text{d}g(f_i)}{\text{d}f_i} \boldsymbol{\phi}(\mathbf{x}_i) + \sum_{i=1}^n \frac{1-y_i}{1-g\left(\mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x}_i)\right)}\frac{\text{d}g(f_i)}{\text{d}f_i} \boldsymbol{\phi}(\mathbf{x}_i)
\] where we used the chain rule to develop the derivative in terms of \(\frac{\text{d}g(f_i)}{\text{d}f_i}\), which is the gradient of the inverse link function (in our case the gradient of the sigmoid function).

So the gradient of the objective function depends on the gradient of the inverse link function, on the gradient of the log likelihood, and naturally on the gradient of the argument of the inverse link function with respect to the parameters, which is simply \(\boldsymbol{\phi}(\mathbf{x}_i)\).

The only missing term is the gradient of the inverse link function. For the sigmoid squashing function we have, \[\begin{align*}
g(f_i) &= \frac{1}{1+\exp(-f_i)}\\
&=(1+\exp(-f_i))^{-1}
\end{align*}\] and the gradient can be computed as \[\begin{align*}
\frac{\text{d}g(f_i)}{\text{d} f_i} & = \exp(-f_i)(1+\exp(-f_i))^{-2}\\
& = \frac{1}{1+\exp(-f_i)} \frac{\exp(-f_i)}{1+\exp(-f_i)} \\
& = g(f_i) (1-g(f_i))
\end{align*}\] so the full gradient can be written down as \[
\frac{\text{d}E(\mathbf{w})}{\text{d}\mathbf{w}} = -\sum_{i=1}^n y_i\left(1-g\left(\mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x}_i)\right)\right) \boldsymbol{\phi}(\mathbf{x}_i) + \sum_{i=1}^n (1-y_i)\left(g\left(\mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x}_i)\right)\right) \boldsymbol{\phi}(\mathbf{x}_i).
\]

import numpy as np

def gradient(g, Phi, y):
    "Generates the gradient of the parameter vector."
    labs = np.asarray(y, dtype=float).flatten()
    posind = np.where(labs==1)
    dw = -(Phi[posind]*(1-g[posind])).sum(0)
    negind = np.where(labs==0)
    dw += (Phi[negind]*g[negind]).sum(0)
    return dw[:, None]
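As a quick sanity check (a sketch, not part of the original notes), the analytic gradient above can be compared against a central finite-difference approximation of the objective. This assumes the predict, objective and gradient functions defined above, and that the linear basis from mlai prepends a bias column, as the batch gradient descent code below also assumes.

import numpy as np

# Toy data purely for checking gradients: 20 points, 3 features.
np.random.seed(0)
x_toy = np.random.randn(20, 3)
y_toy = np.random.rand(20, 1) > 0.5
w_toy = np.random.normal(size=(4, 1), scale=0.01)  # 3 features + assumed bias column

g, Phi = predict(w_toy, x_toy, linear)
dw_analytic = gradient(g, Phi, y_toy)

# Central finite differences of the objective with respect to each weight.
eps = 1e-6
dw_numeric = np.zeros_like(w_toy)
for j in range(w_toy.shape[0]):
    w_plus, w_minus = w_toy.copy(), w_toy.copy()
    w_plus[j] += eps
    w_minus[j] -= eps
    g_plus, _ = predict(w_plus, x_toy, linear)
    g_minus, _ = predict(w_minus, x_toy, linear)
    dw_numeric[j] = (objective(g_plus, y_toy) - objective(g_minus, y_toy))/(2*eps)

print(np.max(np.abs(dw_analytic - dw_numeric)))  # should be close to zero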

Optimization of the Function

Reorganizing the gradient to find a stationary point of the function with respect to the parameters \(\mathbf{w}\) turns out to be impossible. Optimization has to proceed by numerical methods. Options include the multidimensional variant of Newton’s method or gradient based optimization methods like we used for optimizing matrix factorization for the movie recommender system. We recall from matrix factorization that, for large data, stochastic gradient descent or the Robbins Munro (Robbins and Monro, 1951) optimization procedure worked best for function minimization.

Nigeria NMIS Data

[edit]

As an example data set we will use Nigerian Millennium Development Goals Information System Health Facility data (The Office of the Senior Special Assistant to the President on the Millennium Development Goals (OSSAP-MDGs) and Columbia University, 2014). It can be found here: https://energydata.info/dataset/nigeria-nmis-education-facility-data-2014.

Taking from the information on the site,

The Nigeria MDG (Millennium Development Goals) Information System – NMIS health facility data is collected by the Office of the Senior Special Assistant to the President on the Millennium Development Goals (OSSAP-MDGs) in partner with the Sustainable Engineering Lab at Columbia University. A rigorous, geo-referenced baseline facility inventory across Nigeria is created spanning from 2009 to 2011 with an additional survey effort to increase coverage in 2014, to build Nigeria’s first nation-wide inventory of health facility. The database includes 34,139 health facilities info in Nigeria.

The goal of this database is to make the data collected available to planners, government officials, and the public, to be used to make strategic decisions for planning relevant interventions.

For data inquiry, please contact Ms. Funlola Osinupebi, Performance Monitoring & Communications, Advisory Power Team, Office of the Vice President at funlola.osinupebi@aptovp.org

To learn more, please visit http://csd.columbia.edu/2014/03/10/the-nigeria-mdg-information-system-nmis-takes-open-data-further/

Suggested citation: Nigeria NMIS facility database (2014), the Office of the Senior Special Assistant to the President on the Millennium Development Goals (OSSAP-MDGs) & Columbia University

For ease of use we’ve packaged this data set in the pods library.

pods

[edit]

In Sheffield we created a suite of software tools for ‘Open Data Science’. Open data science is an approach to sharing code, models and data that should make it easier for companies, health professionals and scientists to gain access to data science techniques.

You can also check this blog post on Open Data Science.

The software can be installed using
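pip install pods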

from the command prompt where you can access your python installation.

The code is also available on GitHub: https://github.com/lawrennd/ods

Once pods is installed, it can be imported in the usual manner.

import pods

data = pods.datasets.nigeria_nmis()['Y']
data.head()

Alternatively, you can access the data directly with the following commands.

import urllib.request
urllib.request.urlretrieve('https://energydata.info/dataset/f85d1796-e7f2-4630-be84-79420174e3bd/resource/6e640a13-cab4-457b-b9e6-0336051bac27/download/healthmopupandbaselinenmisfacility.csv', 'healthmopupandbaselinenmisfacility.csv')

import pandas as pd
data = pd.read_csv('healthmopupandbaselinenmisfacility.csv')

Once it is loaded in, the data can be summarized using the describe method in pandas.

data.describe()

We can also find out the dimensions of the dataset using the shape property.

data.shape

Dataframes have different functions that you can use to explore and understand your data. In python and the Jupyter notebook it is possible to see a list of all possible functions and attributes by typing the name of the object followed by .<Tab>. For example, in the above case if we type data.<Tab> it shows the columns available (these are attributes in pandas dataframes) such as num_nurses_fulltime, and also functions, such as .describe().

For functions we can also see the documentation about the function by following the name with a question mark. This will open a box with documentation at the bottom which can be closed with the x button.

data.describe?
Figure: Location of the over thirty-four thousand health facilities registered in the NMIS data across Nigeria. Each facility plotted according to its latitude and longitude.

Nigeria NMIS Data Classification

[edit]

Our aim will be to predict whether a center has maternal health delivery services given the attributes in the data. We will predict this from the number of nurses, the number of doctors, location etc.

Now we will convert this data into a form which we can use as inputs X, and labels y.

import pandas as pd
import numpy as np

data = data[~pd.isnull(data['maternal_health_delivery_services'])]
data = data.dropna() # Remove entries with missing values
X = data[['emergency_transport',
          'num_chews_fulltime', 
          'phcn_electricity',
          'child_health_measles_immun_calc',
          'num_nurses_fulltime',
          'num_doctors_fulltime', 
          'improved_water_supply', 
          'improved_sanitation',
          'antenatal_care_yn', 
          'family_planning_yn',
          'malaria_treatment_artemisinin', 
          'latitude', 
          'longitude']].copy()
y = data['maternal_health_delivery_services']==True  # set label to be whether there's a maternal health delivery service

# Create series of health center types with the relevant index
s = data['facility_type_display'].apply(pd.Series, 1).stack() 
s.index = s.index.droplevel(-1) # to line up with df's index

# Extract from the series the unique list of types.
types = s.unique()

# For each type extract the indices where it is present and add a column to X
type_names = []
for htype in types:
    index = s[s==htype].index.tolist()
    type_col=htype.replace(' ', '_').replace('/','-').lower()
    type_names.append(type_col)
    X.loc[:, type_col] = 0.0 
    X.loc[index, type_col] = 1.0

This has given us a new data frame X which contains the different facility types in different columns.

X.describe()

Batch Gradient Descent

[edit]

We will need to define some initial random values for our vector and then minimize the objective by descending the gradient.

# Separate train and test
indices = np.random.permutation(X.shape[0])
num_train = int(np.ceil(X.shape[0]/2))
train_indices = indices[:num_train]
test_indices = indices[num_train:]
X_train = X.iloc[train_indices]
y_train = y.iloc[train_indices]==True
X_test = X.iloc[test_indices]
y_test = y.iloc[test_indices]==True

import numpy as np

# gradient descent algorithm
w = np.random.normal(size=(X.shape[1]+1, 1), scale = 0.001)
eta = 1e-9
iters = 10000
for i in range(iters):
    g, Phi = predict(w, X_train, linear)
    w -= eta*gradient(g, Phi, y_train) + 0.001*w
    if not i % 100:
        print("Iter", i, "Objective", objective(g, y_train))

Let’s look at the weights and how they relate to the inputs.

import matplotlib.pyplot as plt

print(w)

What does the magnitude of the weights tell you about the different parameters and their influence on outcome? Are the weights of roughly the same size? If not, how might you fix this?

g_test, Phi_test = predict(w, X_test, linear)
np.sum(g_test[y_test]>0.5)
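As the question above hints, features measured on very different scales (counts of nurses versus latitude, for example) lead to weights of very different magnitudes. One common remedy, sketched here as an assumption rather than part of the original notebook, is to standardize the columns of X using the training statistics before re-running the gradient descent loop.

# Standardize each column to zero mean, unit variance using training statistics only.
X_mean = X_train.mean(0)
X_std = X_train.std(0).replace(0, 1.0)  # guard against constant columns
X_train_std = (X_train - X_mean)/X_std
X_test_std = (X_test - X_mean)/X_std
# The gradient descent loop above can then be re-run with X_train_std in place of X_train.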

Stochastic Gradient Descent

Exercise 1

Now construct a stochastic gradient descent algorithm and run it on the data. Is it faster or slower than batch gradient descent? What can you do to improve convergence speed?

Regression

[edit]

Classification is the case where our prediction function gives a discrete valued output, normally associated with a ‘class’. Regression is an alternative approach where the aim is to predict a continuous output.

The name is a historical accident; it would be better to call regression ‘curve fitting’, or even to split it into two parts: ‘interpolation’, which is the practice of predicting a function value between existing data, and ‘extrapolation’, which is the practice of predicting a function value beyond the regime where we have data.

Regression Examples

[edit]

Regression involves predicting a real value, \(y_i\), given an input vector, \(\mathbf{x}_i\). For example, the Tecator data involves predicting the quality of meat given spectral measurements. Or in radiocarbon dating, the C14 calibration curve maps from radiocarbon age to age measured through a back-trace of tree rings. Regression has also been used to predict the quality of board game moves given expert rated training data.

Supervised Learning Challenges

[edit]

There are three principal challenges in constructing a problem for supervised learning.

  1. choosing which features, \(\mathbf{x}\), are relevant in the prediction
  2. defining the appropriate class of function, \(f(\cdot)\).
  3. selecting the right parameters, \(\mathbf{w}\).

Feature Selection

[edit]

Feature selection is a critical stage in the algorithm design process. In the Olympic prediction example above we’re only using time to predict the pace of the runners. In practice we might also want to use characteristics of the course: how hilly it is, what the temperature was when the race was run. In 1904 the runners actually got lost during the race. Should we include ‘lost’ as a feature? It would certainly help explain the particularly slow time in 1904. The features we select should be ones we expect to correlate with the prediction. In statistics, these features are even called predictors, which highlights their role in developing the prediction function. For the Facebook newsfeed, we might use features that include how close your friendship is with the poster, or how often you react to that poster, or whether a photo is included in the post.

Sometimes we use feature selection algorithms, algorithms that automate the process of finding the features that we need. Classification is often used to rank search results, to decide which adverts to serve or, at Facebook, to determine what appears at the top of your newsfeed. In the Facebook example features might include how many likes a post has had, whether it has an image in it, whether you regularly interact with the friend who has posted. A good newsfeed ranking algorithm is critical to Facebook’s success, just as good ad serving choice is critical to Google’s success. These algorithms are in turn highly dependent on the feature sets used. Facebook in particular has made heavy investments in machine learning pipelines for evaluation of the feature utility.

Class of Function, \(f(\cdot)\)

[edit]

By class of function we mean: what are the characteristics of the mapping between \(\mathbf{x}\) and \(y\)? Often, we might choose it to be a smooth function. Sometimes we will choose it to be a linear function. If the prediction is a forecast, for example the demand of a particular product, then the function would need some periodic components to reflect seasonal or weekly effects.

Analysis of US Birth Rates

[edit]

Figure: This is a retrospective analysis of US births by Aki Vehtari. The challenges of forecasting. Even with seasonal and weekly effects removed there are significant effects on holidays, weekends, etc.

-

There’s a nice analysis of US birth rates by Gaussian processes with additive covariances in Gelman et al. (2013). A combination of covariance functions are used to take account of weekly and yearly trends. The analysis is summarized on the cover of the book.

+

There’s a nice analysis of US birth rates by Gaussian processes with +additive covariances in Gelman et al. (2013). A +combination of covariance functions are used to take account of weekly +and yearly trends. The analysis is summarized on the cover of the +book.

- +
- +
-
+
-

Figure: Two different editions of Bayesian Data Analysis (Gelman et al. 2013).

-
-
-

In the ImageNet challenge the input, $\inputVector$, was in the form of an image. And the form of the prediction function was a convolutional neural network (more on this later). A convolutional neural network introduces invariances into the function that are particular to image classification. An invariance is a transformation of the input that we don’t want to affect the output. For example, a cat in an image is still a cat no matter where it’s located in the image (translation). The cat is also a cat regardless of how large it is (scale), or whether it’s upside-down (rotation). Convolutional neural networks encode these invariances: scale invariance, rotation invariance and translation invariance; in the mathematical function.

-

Encoding invariance in the prediction function is like encoding knowledge in the model. If we don’t specify these invariances, then the model must learn them. This will require a lot more data to achieve the same performance, making the model less data efficient. Note that one invariance that is not encoded in a convolutional network is invariance to camera type. As a result, practitioners need to be careful to ensure that their training data is representative of the type of cameras that will be used when the model is deployed.

-

In general the prediction function could be any set of parameterized functions. In the Olympic marathon data example above we used a polynomial fit,
$$ -\mappingFunction(\inputScalar) = \weightScalar_0 + \weightScalar_1 \inputScalar+ \weightScalar_2 \inputScalar^2 + \weightScalar_3 \inputScalar^3 + \weightScalar_4 \inputScalar^4. -$$
The Olympic example is also a supervised learning challenge. But it is a regression problem. A regression problem is one where the output is a continuous value (such as the pace in the marathon). In classification the output is constrained to be discrete. For example, classifying whether or not an image contains a dog implies the output is binary. An early example of a regression problem used in machine learning was the Tecator data, where the fat, water and protein content of meat samples was predicted as a function of the absorption of infrared light.

-

Class of Function: Neural Networks

-

[edit]

-

One class of function that has become popular recently is neural network functions, in particular deep neural networks. The ImageNet challenge uses convolutional neural networks which introduce a translation invariance to the prediction function.

-

It’s impressive that only this additional invariance is enough to improve performance so much, particularly when we know that rotational invariances and scale invariances are also applicable for object detection in images.

+

Figure: Two different editions of Bayesian Data Analysis (Gelman et al., +2013).

+
+
+

In the ImageNet challenge the input, \(\mathbf{ x}\), was in the form of an image. +And the form of the prediction function was a convolutional neural +network (more on this later). A convolutional neural network +introduces invariances into the function that are particular to +image classification. An invariance is a transformation of the input +that we don’t want to affect the output. For example, a cat in an image +is still a cat no matter where it’s located in the image (translation). +The cat is also a cat regardless of how large it is (scale), or whether +it’s upside-down (rotation). Convolutional neural networks encode these +invariances: scale invariance, rotation invariance and translation +invariance; in the mathematical function.

+

Encoding invariance in the prediction function is like encoding +knowledge in the model. If we don’t specify these invariances, then the +model must learn them. This will require a lot more data to achieve the +same performance, making the model less data efficient. Note that one +invariance that is not encoded in a convolutional network is +invariance to camera type. As a result, practitioners need to be careful +to ensure that their training data is representative of the type of +cameras that will be used when the model is deployed.

+

In general the prediction function could be any set of parameterized +functions. In the Olympic marathon data example above we used a +polynomial fit, \[ +f(x) = w_0 + w_1 x+ w_2 x^2 + w_3 x^3 + w_4 x^4. +\] The Olympic example is also a supervised learning challenge. +But it is a regression problem. A regression problem is one +where the output is a continuous value (such as the pace in the +marathon). In classification the output is constrained to be discrete. +For example, classifying whether or not an image contains a dog implies +the output is binary. An early example of a regression problem used in +machine learning was the Tecator data, +where the fat, water and protein content of meat samples was predicted +as a function of the absorption of infrared light.
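To make the polynomial prediction function concrete, here is a minimal sketch of evaluating it at a few inputs (the coefficient values are invented for illustration):

import numpy as np

# Hypothetical coefficients [w_0, w_1, w_2, w_3, w_4]
w = np.array([2.5, -0.1, 0.03, -0.002, 0.0001])

def f(x, w):
    """Evaluate f(x) = w_0 + w_1 x + ... + w_4 x^4 at each input."""
    degrees = np.arange(len(w))
    return ((x[:, None] ** degrees) * w).sum(axis=1)

print(f(np.array([0.0, 1.0, 2.0]), w))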

+

Class of Function: Neural +Networks

+
+[edit] +
+

One class of function that has become popular recently is neural +network functions, in particular deep neural networks. The ImageNet +challenge uses convolutional neural networks which introduce a +translation invariance to the prediction function.

+

It’s impressive that only this additional invariance is enough to +improve performance so much, particularly when we know that rotational +invariances and scale invariances are also applicable for object +detection in images.

Deep Learning

-

[edit]

-

Classical statistical models and simple machine learning models have a great deal in common. The main difference between the fields is philosophical. Machine learning practitioners are typically more concerned with the quality of prediciton (e.g. measured by ROC curve) while statisticians tend to focus more on the interpretability of the model and the validity of any decisions drawn from that interpretation. For example, a statistical model may be used to validate whether a large scale intervention (such as the mass provision of mosquito nets) has had a long term effect on disease (such as malaria). In this case one of the covariates is likely to be the provision level of nets in a particular region. The response variable would be the rate of malaria disease in the region. The parmaeter, β1 associated with that covariate will demonstrate a positive or negative effect which would be validated in answering the question. The focus in statistics would be less on the accuracy of the response variable and more on the validity of the interpretation of the effect variable, β1.

-

A machine learning practitioner on the other hand would typically denote the parameter w1, instead of β1 and would only be interested in the output of the prediction function, $\mappingFunction(\cdot)$ rather than the parameter itself. The general formalism of the prediction function allows for non-linear models. In machine learning, the emphasis on prediction over interpretability means that non-linear models are often used. The parameters, w, are a means to an end (good prediction) rather than an end in themselves (interpretable).

+
+[edit] +
+

Classical statistical models and simple machine learning models have +a great deal in common. The main difference between the fields is +philosophical. Machine learning practitioners are typically more +concerned with the quality of prediction (e.g. measured by ROC curve) +while statisticians tend to focus more on the interpretability of the +model and the validity of any decisions drawn from that interpretation. +For example, a statistical model may be used to validate whether a large +scale intervention (such as the mass provision of mosquito nets) has had +a long term effect on disease (such as malaria). In this case one of the +covariates is likely to be the provision level of nets in a particular +region. The response variable would be the rate of malaria disease in +the region. The parameter, \(\beta_1\), +associated with that covariate will demonstrate a positive or negative +effect which would be validated in answering the question. The focus in +statistics would be less on the accuracy of the response variable and +more on the validity of the interpretation of the effect variable, \(\beta_1\).

+

A machine learning practitioner on the other hand would typically +denote the parameter \(w_1\), instead +of \(\beta_1\) and would only be +interested in the output of the prediction function, \(f(\cdot)\) rather than the parameter +itself. The general formalism of the prediction function allows for +non-linear models. In machine learning, the emphasis on +prediction over interpretability means that non-linear models are often +used. The parameters, \(\mathbf{w}\), +are a means to an end (good prediction) rather than an end in themselves +(interpretable).

DeepFace

-

[edit]

+
+[edit] +
- +
-
+
-

Figure: The DeepFace architecture (Taigman et al. 2014), visualized through colors to represent the functional mappings at each layer. There are 120 million parameters in the model.

-
-
-

The DeepFace architecture (Taigman et al. 2014) consists of layers that deal with translation and rotational invariances. These layers are followed by three locally-connected layers and two fully-connected layers. Color illustrates feature maps produced at each layer. The neural network includes more than 120 million parameters, where more than 95% come from the local and fully connected layers.

+

Figure: The DeepFace architecture (Taigman et al., 2014), +visualized through colors to represent the functional mappings at each +layer. There are 120 million parameters in the model.

+
+
+

The DeepFace architecture (Taigman et al., 2014) consists +of layers that deal with translation invariances, known as +convolutional layers. These layers are followed by three +locally-connected layers and two fully-connected layers. Color +illustrates feature maps produced at each layer. The neural network +includes more than 120 million parameters, where more than 95% come from +the local and fully connected layers.

Deep Learning as Pinball

-

[edit]

+
+[edit] +
-
- +
+
-
+
-

Figure: Deep learning models are composition of simple functions. We can think of a pinball machine as an analogy. Each layer of pins corresponds to one of the layers of functions in the model. Input data is represented by the location of the ball from left to right when it is dropped in from the top. Output class comes from the position of the ball as it leaves the pins at the bottom.

-
-
-

Sometimes deep learning models are described as being like the brain, or too complex to understand, but one analogy I find useful to help the gist of these models is to think of them as being similar to early pin ball machines.

-

In a deep neural network, we input a number (or numbers), whereas in pinball, we input a ball.

-

Think of the location of the ball on the left-right axis as a single number. Our simple pinball machine can only take one number at a time. As the ball falls through the machine, each layer of pins can be thought of as a different layer of ‘neurons’. Each layer acts to move the ball from left to right.

-

In a pinball machine, when the ball gets to the bottom it might fall into a hole defining a score, in a neural network, that is equivalent to the decision: a classification of the input object.

-

An image has more than one number associated with it, so it is like playing pinball in a hyper-space.

+

Figure: Deep learning models are composition of simple functions. We +can think of a pinball machine as an analogy. Each layer of pins +corresponds to one of the layers of functions in the model. Input data +is represented by the location of the ball from left to right when it is +dropped in from the top. Output class comes from the position of the +ball as it leaves the pins at the bottom.

+
+
+

Sometimes deep learning models are described as being like the brain, +or too complex to understand, but one analogy I find useful to help convey the +gist of these models is to think of them as being similar to early pinball +machines.

+

In a deep neural network, we input a number (or numbers), whereas in +pinball, we input a ball.

+

Think of the location of the ball on the left-right axis as a single +number. Our simple pinball machine can only take one number at a time. +As the ball falls through the machine, each layer of pins can be thought +of as a different layer of ‘neurons’. Each layer acts to move the ball +from left to right.

+

In a pinball machine, when the ball gets to the bottom it might fall +into a hole defining a score, in a neural network, that is equivalent to +the decision: a classification of the input object.

+

An image has more than one number associated with it, so it is like +playing pinball in a hyper-space.

+
-

Figure: At initialization, the pins, which represent the parameters of the function, aren’t in the right place to bring the balls to the correct decisions.

+

Figure: At initialization, the pins, which represent the parameters +of the function, aren’t in the right place to bring the balls to the +correct decisions.

+
-

Figure: After learning the pins are now in the right place to bring the balls to the correct decisions.

-
-
-

Learning involves moving all the pins to be in the correct position, so that the ball ends up in the right place when it’s fallen through the machine. But moving all these pins in hyperspace can be difficult.

-

In a hyper-space you have to put a lot of data through the machine for to explore the positions of all the pins. Even when you feed many millions of data points through the machine, there are likely to be regions in the hyper-space where no ball has passed. When future test data passes through the machine in a new route unusual things can happen.

-

Adversarial examples exploit this high dimensional space. If you have access to the pinball machine, you can use gradient methods to find a position for the ball in the hyper space where the image looks like one thing, but will be classified as another.

-

Probabilistic methods explore more of the space by considering a range of possible paths for the ball through the machine. This helps to make them more data efficient and gives some robustness to adversarial examples.

+

Figure: After learning the pins are now in the right place to bring +the balls to the correct decisions.


Learning involves moving all the pins to be in the correct position, +so that the ball ends up in the right place when it’s fallen through the +machine. But moving all these pins in hyperspace can be difficult.

+

In a hyper-space you have to put a lot of data through the machine +to explore the positions of all the pins. Even when you feed many +millions of data points through the machine, there are likely to be +regions in the hyper-space where no ball has passed. When future test +data passes through the machine in a new route, unusual things can +happen.

+

Adversarial examples exploit this high dimensional space. If +you have access to the pinball machine, you can use gradient methods to +find a position for the ball in the hyper space where the image looks +like one thing, but will be classified as another.

+

Probabilistic methods explore more of the space by considering a +range of possible paths for the ball through the machine. This helps to +make them more data efficient and gives some robustness to adversarial +examples.

Encoding Knowledge

-

Knowledge that is not encoded in the prediction function must be learned through data. So any unspecified invariance (such as rotational or scale invariances) must be learned through the data. This means that learning would require a lot more data than otherwise would be necessary and results in less data efficient algorithms.

-

The choice of predication funciton and invariances is therefore a critical stage in designing your machine learning algorithm. Unfortunately many invariances are non-trivial to incorporate and many machine learning algorithms focus on simpler concepts such as linearity or smoothness.

-

Parameter Estimation: Objective Functions

-

[edit]

-

Once we have a set of features, and the class of functions we use is determined, we need to find the parameters of the model.

-

The parameters of the model, $\weightVector$, are estimated by specifying an objective function. The objective function specifies the quality of the match between the prediction function and the training data. In supervised learning the objective function incorporates both the input data (in the ImageNet data the image, in the Olympic marathon data the year of the marathon) and a label.

-

The label is where the term supervised learning comes from. The idea being that a supervisor, or annotator, has already looked at the data and given it labels. For regression problem, a typical objective function is the squared error,
$$ -\errorFunction(\weightVector) = \sum_{i=1}^\numData (\dataScalar_i - \mappingFunction(\inputVector_i))^2 -$$
where the data is provided to us as a set of n inputs, $\inputVector_1$, $\inputVector_2$, $\inputVector_3$, , $\inputVector_n$ each one with an associated label, $\dataScalar_1$, $\dataScalar_2$, $\dataScalar_3$, , $\dataScalar_\numData$. Sometimes the label is cheap to acquire. For example, in Newsfeed ranking Facebook are acquiring a label each time a user clicks on a post in their Newsfeed. Similarly, in ad-click prediction labels are obtained whenever an advert is clicked. More generally though, we have to employ human annotators to label the data. For example, ImageNet, the breakthrough deep learning result was annotated using Amazon’s Mechanical Turk. Without such large scale human input, we would not have the breakthrough results on image categorization we have today.

-

Some tasks are easier to annotate than others. For example, in the Tecator data, to acquire the actual values of water, protein and fat content in the meat samples further experiments may be required. It is not simply a matter of human labelling. Even if the task is easy for humans to solve there can be problems. For example, humans will extrapolate the context of an image. A colleague mentioned once to me a challenge where humans were labelling images as containing swimming pools, even though none was visible, because they could infer there must be a pool nearby, perhaps because there are kids wearing bathing suits. But there is no swimming pool in the image for the computer to find. The quality of any machine learning solution is very sensitive to the quality of annotated data we have. Investing in processes and tools to improve annotation of data is therefore priority for improving the quality of machine learning solutions.

-

There can also be significant problems with misrepresentation in the data set. If data isn’t collected carefully, then it can reflect biases about the population that we don’t want our models to have. For example, if we design a face detector using Californians may not perform well when deployed in Kampala, Uganda.

-

Generalization and Overfitting

-

[edit]

-

Once a supervised learning system is trained it can be placed in a sequential pipeline to automate a process that used to be done manually.

-

Supervised learning is one of the dominant approaches to learning. But the cost and time associated with labeling data is a major bottleneck for deploying machine learning systems. The process for creating training data requires significant human intervention. For example, internationalization of a speech recognition system would require large speech corpora in new languages.

-

An important distinction in machine learning is the separation between training data and test data (or production data). Training data is the data that was used to find the model parameters. Test data (or production data) is the data that is used with the live system. The ability of a machine learning system to predict well on production systems given only its training data is known as its generalization ability. This is the system’s ability to predict in areas where it hasn’t previously seen data.

-

Hold Out Validation on Olympic Marathon Data

-

[edit]

+

Knowledge that is not encoded in the prediction function must be +learned through data. So any unspecified invariance (such as rotational +or scale invariances) must be learned through the data. This means that +learning would require a lot more data than otherwise would be necessary +and results in less data efficient algorithms.

+

The choice of prediction function and invariances is therefore a +critical stage in designing your machine learning algorithm. +Unfortunately, many invariances are non-trivial to incorporate and many +machine learning algorithms focus on simpler concepts such as linearity +or smoothness.

+

Parameter Estimation: +Objective Functions

+
+[edit] +
+

Once we have a set of features, and the class of functions we use is +determined, we need to find the parameters of the model.

+

The parameters of the model, \(\mathbf{ +w}\), are estimated by specifying an objective function. +The objective function specifies the quality of the match between the +prediction function and the training data. In supervised +learning the objective function incorporates both the input data (in the +ImageNet data the image, in the Olympic marathon data the year of the +marathon) and a label.

+

The label is where the term supervised learning comes from. The idea +being that a supervisor, or annotator, has already looked at the data +and given it labels. For a regression problem, a typical objective +function is the squared error, \[ +E(\mathbf{ w}) = \sum_{i=1}^n(y_i - f(\mathbf{ x}_i))^2 +\] where the data is provided to us as a set of \(n\) inputs, \(\mathbf{ x}_1\), \(\mathbf{ x}_2\), \(\mathbf{ x}_3\), \(\dots\), \(\mathbf{ x}_n\), each one with an associated +label, \(y_1\), \(y_2\), \(y_3\), \(\dots\), \(y_n\). Sometimes the label is cheap to +acquire. For example, in Newsfeed ranking Facebook are acquiring a label +each time a user clicks on a post in their Newsfeed. Similarly, in +ad-click prediction labels are obtained whenever an advert is clicked. +More generally though, we have to employ human annotators to label the +data. For example, ImageNet, the breakthrough deep learning result, was +annotated using Amazon’s Mechanical Turk. Without such large scale human +input, we would not have the breakthrough results on image +categorization we have today.
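A minimal sketch of this objective for a polynomial prediction function (the data and the helper name are illustrative, not taken from the lecture code):

import numpy as np

def squared_error(w, x, y):
    """Sum of squared differences between the labels y and the predictions f(x)."""
    f_x = np.polyval(w[::-1], x)  # w is stored as [w_0, w_1, ..., w_d]
    return np.sum((y - f_x) ** 2)

w = np.array([0.5, -0.1])        # hypothetical w_0, w_1 for a linear fit
x = np.array([0.0, 1.0, 2.0])
y = np.array([0.4, 0.5, 0.2])
print(squared_error(w, x, y))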

+

Some tasks are easier to annotate than others. For example, in the +Tecator data, to acquire the actual values of water, protein and fat +content in the meat samples further experiments may be required. It is +not simply a matter of human labelling. Even if the task is easy for +humans to solve there can be problems. For example, humans will +extrapolate the context of an image. A colleague once mentioned to me a +challenge where humans were labelling images as containing swimming +pools, even though none was visible, because they could infer there must +be a pool nearby, perhaps because there are kids wearing bathing suits. +But there is no swimming pool in the image for the computer to find. The +quality of any machine learning solution is very sensitive to the +quality of annotated data we have. Investing in processes and tools to +improve annotation of data is therefore a priority for improving the +quality of machine learning solutions.

+

There can also be significant problems with misrepresentation in the +data set. If data isn’t collected carefully, then it can reflect biases +about the population that we don’t want our models to have. For example, +a face detector designed using only data from Californians may not perform well +when deployed in Kampala, Uganda.

+

Generalization and +Overfitting

+
+[edit] +
+

Once a supervised learning system is trained it can be placed in a +sequential pipeline to automate a process that used to be done +manually.

+

Supervised learning is one of the dominant approaches to learning. +But the cost and time associated with labeling data is a major +bottleneck for deploying machine learning systems. The process for +creating training data requires significant human intervention. For +example, internationalization of a speech recognition system would +require large speech corpora in new languages.

+

An important distinction in machine learning is the separation +between training data and test data (or production data). Training data +is the data that was used to find the model parameters. Test data (or +production data) is the data that is used with the live system. The +ability of a machine learning system to predict well on production +systems given only its training data is known as its +generalization ability. This is the system’s ability to predict +in areas where it hasn’t previously seen data.

+

Hold Out +Validation on Olympic Marathon Data

+
+[edit] +
-
- +
+
-
+
-
-

Figure: Olympic marathon data with validation error for extrapolation.

+
+

Figure: Olympic marathon data with validation error for +extrapolation.

Extrapolation

Interpolation

-
- +
+
-
+
-
-

Figure: Olympic marathon data with validation error for interpolation.

+
+

Figure: Olympic marathon data with validation error for +interpolation.

Choice of Validation Set

Hold Out Data

-

You have a conclusion as to which model fits best under the training error, but how do the two models perform in terms of validation? In this section we consider hold out validation. In hold out validation we remove a portion of the training data for validating the model on. The remaining data is used for fitting the model (training). Because this is a time series prediction, it makes sense for us to hold out data at the end of the time series. This means that we are validating on future predictions. We will hold out data from after 1980 and fit the model to the data before 1980.

-
# select indices of data to 'hold out'
-indices_hold_out = np.flatnonzero(x>1980)
-
-# Create a training set
-x_train = np.delete(x, indices_hold_out, axis=0)
-y_train = np.delete(y, indices_hold_out, axis=0)
-
-# Create a hold out set
-x_valid = np.take(x, indices_hold_out, axis=0)
-y_valid = np.take(y, indices_hold_out, axis=0)
-

Exercise 3

-

For both the linear and quadratic models, fit the model to the data up until 1980 and then compute the error on the held out data (from 1980 onwards). Which model performs better on the validation data?

+

You have a conclusion as to which model fits best under the training +error, but how do the two models perform in terms of validation? In this +section we consider hold out validation. In hold out validation +we remove a portion of the training data for validating the +model on. The remaining data is used for fitting the model (training). +Because this is a time series prediction, it makes sense for us to hold +out data at the end of the time series. This means that we are +validating on future predictions. We will hold out data from after 1980 +and fit the model to the data before 1980.

+
# select indices of data to 'hold out'
+indices_hold_out = np.flatnonzero(x>1980)
+
+# Create a training set
+x_train = np.delete(x, indices_hold_out, axis=0)
+y_train = np.delete(y, indices_hold_out, axis=0)
+
+# Create a hold out set
+x_valid = np.take(x, indices_hold_out, axis=0)
+y_valid = np.take(y, indices_hold_out, axis=0)
+

Exercise 2

+

For both the linear and quadratic models, fit the model to the data +up until 1980 and then compute the error on the held out data (from 1980 +onwards). Which model performs better on the validation data?
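One possible way to approach the exercise (a sketch that assumes the x_train, y_train, x_valid, y_valid arrays created above and uses numpy’s polyfit rather than the lab’s own fitting code):

import numpy as np

def validation_error(degree, x_train, y_train, x_valid, y_valid):
    """Fit a polynomial of the given degree to the training data and
    return the sum of squared errors on the held out data."""
    w = np.polyfit(x_train.flatten(), y_train.flatten(), degree)
    predictions = np.polyval(w, x_valid.flatten())
    return np.sum((y_valid.flatten() - predictions) ** 2)

for degree in [1, 2]:  # linear and quadratic models
    print(degree, validation_error(degree, x_train, y_train, x_valid, y_valid))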

Richer Basis Set

-

Now we have an approach for deciding which model to retain, we can consider the entire family of polynomial bases, with arbitrary degrees.

-

Exercise 4

-

Now we are going to build a more sophisticated form of basis function, one that can accept arguments to its inputs (similar to those we used in this lab). Here we will start with a polynomial basis.

+

Now we have an approach for deciding which model to retain, we can +consider the entire family of polynomial bases, with arbitrary +degrees.

+

Exercise 3

+

Now we are going to build a more sophisticated form of basis +function, one that can accept arguments to its inputs (similar to those +we used in this lab). Here we will start +with a polynomial basis.

def polynomial(x, degree, loc, scale):
     degrees =np.arange(degree+1)
     return ((x-loc)/scale)**degrees
-

The basis as we’ve defined it has three arguments as well as the input. The degree of the polynomial, the scale of the polynomial and the offset. These arguments need to be passed to the basis functions whenever they are called. Modify your code to pass these additional arguments to the python function for creating the basis. Do this for each of your functions predict, fit and objective. You will find *args (or **kwargs) useful.

-

Write code that tries to fit different models to the data with polynomial basis. Use a maximum degree for your basis from 0 to 17. For each polynomial store the hold out validation error and the training error. When you have finished the computation plot the hold out error for your models and the training error for your p. When computing your polynomial basis use offset=1956. and scale=120. to ensure that the data is mapped (roughly) to the -1, 1 range.

-

Which polynomial has the minimum training error? Which polynomial has the minimum validation error?

+

The basis as we’ve defined it has three arguments as well as the +input. The degree of the polynomial, the scale of the polynomial and the +offset. These arguments need to be passed to the basis functions +whenever they are called. Modify your code to pass these additional +arguments to the python function for creating the basis. Do this for +each of your functions predict, fit and +objective. You will find *args (or +**kwargs) useful.
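For example, the prediction function might simply pass the extra basis arguments through to the basis (a sketch; the predict helper here is hypothetical, not the lab’s definition):

def predict(w, x, basis, **kwargs):
    """Build the basis matrix, passing through any extra arguments,
    and take its product with the weight vector."""
    Phi = basis(x, **kwargs)
    return Phi @ w

# e.g. predict(w, x, polynomial, degree=4, loc=1956., scale=120.)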

+

Write code that tries to fit different models to the data with a +polynomial basis. Use a maximum degree for your basis from 0 to 17. For +each polynomial store the hold out validation error and the +training error. When you have finished the computation, plot the +hold out error for your models and the training error for your polynomials. When +computing your polynomial basis use offset=1956. and +scale=120. to ensure that the data is mapped (roughly) to +the -1, 1 range.

+

Which polynomial has the minimum training error? Which polynomial has +the minimum validation error?

Bias Variance Decomposition

-

[edit]

-

The bias-variance decomposition considers the expected test error for different variations of the training data sampled from, $\Pr(\dataVector, \dataScalar)$
$$ -\mathbb{E}\left[ \left(\dataScalar - \mappingFunction^*(\dataVector)\right)^2 \right]. -$$
This can be decomposed into two parts,
$$ -\mathbb{E}\left[ \left(\dataScalar - \mappingFunction(\dataVector)\right)^2 \right] = \text{bias}\left[\mappingFunction^*(\dataVector)\right]^2 + \text{variance}\left[\mappingFunction^*(\dataVector)\right] +\sigma^2, -$$
where the bias is given by
$$ - \text{bias}\left[\mappingFunction^*(\dataVector)\right] = -\mathbb{E}\left[\mappingFunction^*(\dataVector)\right] * \mappingFunction(\dataVector) -$$
and it summarizes error that arises from the model’s inability to represent the underlying complexity of the data. For example, if we were to model the marathon pace of the winning runner from the Olympics by computing the average pace across time, then that model would exhibit bias error because the reality of Olympic marathon pace is it is changing (typically getting faster).

-

The variance term is given by
$$ - \text{variance}\left[\mappingFunction^*(\dataVector)\right] = \mathbb{E}\left[\left(\mappingFunction^*(\dataVector) - \mathbb{E}\left[\mappingFunction^*(\dataVector)\right]\right)^2\right]. - $$
The variance term is often described as arising from a model that is too complex, but we have to be careful with this idea. Is the model really too complex relative to the real world that generates the data? The real world is a complex place, and it is rare that we are constructing mathematical models that are more complex than the world around us. Rather, the ‘too complex’ refers to ability to estimate the parameters of the model given the data we have. Slight variations in the training set cause changes in prediction.

-

Models that exhibit high variance are sometimes said to ‘overfit’ the data whereas models that exhibit high bias are sometimes described as ‘underfitting’ the data.

+
+[edit] +
+

One of Breiman’s ideas for improving predictive performance is known +as bagging (Breiman:bagging96?). +The idea is to train a number of models on the data such that they +overfit (high variance). Then average the predictions of these models. +The models are trained on different bootstrap samples (Efron, 1979) and +their predictions are aggregated giving us the acronym, Bagging. By +combining decision trees with bagging, we recover random forests (Breiman, +2001).
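As a brief aside (not from the original notes), both ideas are available directly in scikit-learn; the data below is synthetic and only there to make the snippet runnable:

import numpy as np
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 2))
y = (X[:, 0] * X[:, 1] > 0).astype(int)  # synthetic labels

# Bagging: aggregate many high-variance trees, each fit to a bootstrap sample.
bagged = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50).fit(X, y)

# Random forest: bagging plus random feature sub-sampling at each split.
forest = RandomForestClassifier(n_estimators=50).fit(X, y)
print(bagged.score(X, y), forest.score(X, y))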

+

Bias and variance can also be estimated through Efron’s bootstrap +(Efron, +1979), and the traditional view has been that there’s a form of +Goldilocks effect, where the best predictions are given by the model +that is ‘just right’ for the amount of data available. Not too simple, +not too complex. The idea is that bias decreases with increasing model +complexity and variance increases with increasing model complexity. +Typically plots begin with the Mummy bear on the left (too much bias) and +end with the Daddy bear on the right (too much variance) and show a dip +in the middle where the Baby bear (just right) finds themselves.

+

The Daddy bear is typically positioned at the point where the model +can exactly interpolate the data. For a generalized linear model (McCullagh and +Nelder, 1989), this is the point at which the number of +parameters is equal to the number of data3.

+

The bias-variance decomposition (Geman:biasvariance92?) +considers the expected test error for different variations of the +training data sampled from, \(\mathbb{P}(\mathbf{ x}, y)\) \[\begin{align*} +R(\mathbf{ w}) = & \int \left(y- f^*(\mathbf{ x})\right)^2 +\mathbb{P}(y, \mathbf{ x}) \text{d}y\text{d}\mathbf{ x}\\ +& \triangleq \mathbb{E}\left[ \left(y- f^*(\mathbf{ x})\right)^2 +\right]. +\end{align*}\]

+

This can be decomposed into two parts, \[ +\begin{align*} +\mathbb{E}\left[ \left(y- f(\mathbf{ x})\right)^2 \right] = & +\text{bias}\left[f^*(\mathbf{ x})\right]^2 + +\text{variance}\left[f^*(\mathbf{ x})\right] +\sigma^2, +\end{align*} +\] where the bias is given by \[ + \text{bias}\left[f^*(\mathbf{ x})\right] = +\mathbb{E}\left[f^*(\mathbf{ x})\right] - f(\mathbf{ x}) +\] and it summarizes error that arises from the model’s inability +to represent the underlying complexity of the data. For example, if we +were to model the marathon pace of the winning runner from the Olympics +by computing the average pace across time, then that model would exhibit +bias error because the reality of Olympic marathon pace is it +is changing (typically getting faster).

+

The variance term is given by \[ + \text{variance}\left[f^*(\mathbf{ x})\right] = +\mathbb{E}\left[\left(f^*(\mathbf{ x}) - \mathbb{E}\left[f^*(\mathbf{ +x})\right]\right)^2\right]. + \] The variance term is often described as arising from a model +that is too complex, but we must be careful with this idea. Is the model +really too complex relative to the real world that generates the data? +The real world is a complex place, and it is rare that we are +constructing mathematical models that are more complex than the world +around us. Rather, the ‘too complex’ refers to ability to estimate the +parameters of the model given the data we have. Slight variations in the +training set cause changes in prediction.

+

Models that exhibit high variance are sometimes said to ‘overfit’ the +data whereas models that exhibit high bias are sometimes described as +‘underfitting’ the data.
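The decomposition can be illustrated numerically by repeatedly sampling training sets from a known generating function and looking at how polynomial fits of different complexity behave (a rough sketch under invented settings, not code from the notes):

import numpy as np

rng = np.random.default_rng(1)
x_test = np.linspace(0, 1, 50)
f_true = np.sin(2 * np.pi * x_test)  # the underlying function

def sample_fit(degree):
    """Fit a polynomial of the given degree to one freshly sampled
    training set and return its predictions at the test inputs."""
    x = rng.uniform(0, 1, 20)
    y = np.sin(2 * np.pi * x) + rng.normal(0, 0.3, 20)
    w = np.polyfit(x, y, degree)
    return np.polyval(w, x_test)

for degree in [1, 4, 12]:
    preds = np.array([sample_fit(degree) for _ in range(200)])
    bias2 = np.mean((preds.mean(axis=0) - f_true) ** 2)
    variance = np.mean(preds.var(axis=0))
    print(degree, round(bias2, 3), round(variance, 3))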

Bias vs Variance Error Plots

-

[edit]

+
+[edit] +

Helper function for sampling data from two different classes.

-
import numpy as np
+
import numpy as np

Helper function for plotting the decision boundary of the SVM.

-
import urllib.request
-
urllib.request.urlretrieve('https://raw.githubusercontent.com/lawrennd/talks/gh-pages/mlai.py','mlai.py')
-
import matplotlib
-font = {'family' : 'sans',
-        'weight' : 'bold',
-        'size'   : 22}
-
-matplotlib.rc('font', **font)
-import matplotlib.pyplot as plt
-
from sklearn import svm
-
# Create an instance of SVM and fit the data. 
-C = 100.0  # SVM regularization parameter
-gammas = [0.001, 0.01, 0.1, 1]
-
-
-per_class=30
-num_samps = 20
-# Set-up 2x2 grid for plotting.
-fig, ax = plt.subplots(1, 4, figsize=(10,3))
-xlim=None
-ylim=None
-for samp in range(num_samps):
-    X, y=create_data(per_class)
-    models = []
-    titles = []
-    for gamma in gammas:
-        models.append(svm.SVC(kernel='rbf', gamma=gamma, C=C))
-        titles.append('$\gamma={}$'.format(gamma))
-    models = (cl.fit(X, y) for cl in models)
-    xlim, ylim = decision_boundary_plot(models, X, y, 
-                           axs=ax, 
-                           filename='bias-variance{samp:0>3}.svg'.format(samp=samp), 
-                           directory='../slides/diagrams/ml'
-                           titles=titles,
-                          xlim=xlim,
-                          ylim=ylim)
+
import urllib.request
+
urllib.request.urlretrieve('https://raw.githubusercontent.com/lawrennd/talks/gh-pages/mlai.py','mlai.py')
+
import matplotlib
+font = {'family' : 'sans',
+        'weight' : 'bold',
+        'size'   : 22}
+
+matplotlib.rc('font', **font)
+import matplotlib.pyplot as plt
+
from sklearn import svm
+
# Create an instance of SVM and fit the data. 
+C = 100.0  # SVM regularization parameter
+gammas = [0.001, 0.01, 0.1, 1]
+
+
+per_class=30
+num_samps = 20
+# Set-up 2x2 grid for plotting.
+fig, ax = plt.subplots(1, 4, figsize=(10,3))
+xlim=None
+ylim=None
+for samp in range(num_samps):
+    X, y=create_data(per_class)
+    models = []
+    titles = []
+    for gamma in gammas:
+        models.append(svm.SVC(kernel='rbf', gamma=gamma, C=C))
+        titles.append('$\gamma={}$'.format(gamma))
+    models = (cl.fit(X, y) for cl in models)
+    xlim, ylim = decision_boundary_plot(models, X, y, 
+                           axs=ax, 
+                           filename='bias-variance{samp:0>3}.svg'.format(samp=samp), 
+                           directory='./ml',
+                           titles=titles,
+                          xlim=xlim,
+                          ylim=ylim)
-
+
-

Figure: In each figure the simpler model is on the left, and the more complex model is on the right. Each fit is done to a different version of the data set. The simpler model is more consistent in its errors (bias error), whereas the more complex model is varying in its errors (variance error).

+

Figure: In each figure the simpler model is on the left, and the more +complex model is on the right. Each fit is done to a different version +of the data set. The simpler model is more consistent in its errors +(bias error), whereas the more complex model is varying in its errors +(variance error).

Overfitting

-
-
+

Figure: Alex Ihler discusses polynomials and overfitting.

-

We can easily develop a simple prediction function that reconstructs the training data exactly, you can just use a look up table. But how would the lookup table predict between the training data, where examples haven’t been seen before? The choice of the class of prediction functions is critical in ensuring that the model generalizes well.

-

The generalization error is normally estimated by applying the objective function to a set of data that the model wasn’t trained on, the test data. To ensure good performance we normally want a model that gives us a low generalization error. If we weren’t sure of the right prediction function to use, then we could try 1,000 different prediction functions. Then we could use the one that gives us the lowest error on the test data. But you have to be careful. Selecting a model in this way is like a further stage of training where you are using the test data in the training.3 So when this is done, the data used for this is not known as test data, it is known as validation data. And the associated error is the validation error. Using the validation error for model selection is a standard machine learning technique, but it can be misleading about the final generalization error. Almost all machine learning practitioners know not to use the test data in your training procedure, but sometimes people forget that when validation data is used for model selection that validation error cannot be used as an unbiased estimate of the generalization performance.

-

Olympic Data with Bayesian Polynomials

-

[edit]

-

Five fold cross validation tests the ability of the model to interpolate.

-
import mlai
-import pods
+

We can easily develop a simple prediction function that reconstructs +the training data exactly: you can just use a look-up table. But how +would the lookup table predict between the training data, where examples +haven’t been seen before? The choice of the class of prediction +functions is critical in ensuring that the model generalizes well.

+

The generalization error is normally estimated by applying the +objective function to a set of data that the model wasn’t +trained on, the test data. To ensure good performance we normally want a +model that gives us a low generalization error. If we weren’t sure of +the right prediction function to use, then we could try 1,000 different +prediction functions. Then we could use the one that gives us the lowest +error on the test data. But you have to be careful. Selecting a model in +this way is like a further stage of training where you are using the +test data in the training.4 So when this is done, +the data used for this is not known as test data, it is known as +validation data. And the associated error is the validation +error. Using the validation error for model selection is a standard +machine learning technique, but it can be misleading about the final +generalization error. Almost all machine learning practitioners know not +to use the test data in your training procedure, but sometimes people +forget that when validation data is used for model selection that +validation error cannot be used as an unbiased estimate of the +generalization performance.
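A minimal sketch of keeping the three roles of data separate (the arrays and the split proportions here are invented for illustration):

import numpy as np

rng = np.random.default_rng(2)
X = rng.normal(size=(100, 2))          # stand-in inputs
y = rng.integers(0, 2, size=100)       # stand-in labels

indices = rng.permutation(len(X))
n_test = n_valid = len(X) // 5

test_idx = indices[:n_test]                    # only touched once, at the end
valid_idx = indices[n_test:n_test + n_valid]   # used to choose between models
train_idx = indices[n_test + n_valid:]         # used to fit the parameters

X_train, y_train = X[train_idx], y[train_idx]
X_valid, y_valid = X[valid_idx], y[valid_idx]
X_test, y_test = X[test_idx], y[test_idx]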

+

Olympic Data with +Bayesian Polynomials

+
+[edit] +
+

Five fold cross validation tests the ability of the model to +interpolate.

+
import mlai
+import pods
+
-
-

Figure: Bayesian fit with 26th degree polynomial and negative marginal log likelihood.

+
+

Figure: Bayesian fit with 26th degree polynomial and negative +marginal log likelihood.

Hold Out Validation

-

For the polynomial fit, we will now look at hold out validation, where we are holding out some of the most recent points. This tests the abilit of our model to extrapolate.

+

For the polynomial fit, we will now look at hold out +validation, where we are holding out some of the most recent points. +This tests the ability of our model to extrapolate.

-
- +
+
-
+
-
-

Figure: Bayesian fit with 26th degree polynomial and hold out validation scores.

+
+

Figure: Bayesian fit with 26th degree polynomial and hold out +validation scores.

5-fold Cross Validation

-

Five fold cross validation tests the ability of the model to interpolate.

+

Five fold cross validation tests the ability of the model to +interpolate.
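As an aside on how such a five fold split can be constructed (a sketch using scikit-learn and an invented data set, not the plotting code used for the figures):

import numpy as np
from sklearn.model_selection import KFold

X = np.arange(20, dtype=float).reshape(-1, 1)  # stand-in inputs
y = np.sin(X[:, 0])                            # stand-in targets

errors = []
for train_idx, valid_idx in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    w = np.polyfit(X[train_idx, 0], y[train_idx], deg=2)
    preds = np.polyval(w, X[valid_idx, 0])
    errors.append(np.mean((y[valid_idx] - preds) ** 2))

print(np.mean(errors))  # average validation error over the five folds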

-
- +
+
-
+
-
-

Figure: Bayesian fit with 26th degree polynomial and five fold cross validation scores.

+
+

Figure: Bayesian fit with 26th degree polynomial and five fold cross +validation scores.

Unsupervised Learning

-

[edit]

-

In unsupervised learning you have data, $\inputVector$, but no labels $\dataScalar$. The aim in unsupervised learning is to extract structure from data. The type of structure you are interested in is dependent on the broader context of the task. In supervised learning that context is very much driven by the labels. Supervised learning algorithms try and focus on the aspects of the data which are relevant to predicting the labels. But in unsupervised learning there are no labels.

+
+[edit] +
+

Unsupervised Learning

+

Supervised learning is when your data is provided with labels. Now we +are going to turn to a different form of learning, commonly known as +unsupervised learning. In unsupervised learning our data isn’t +necessarily labelled in any form, but we want models that give us a +better understanding of the data. We’ve actually seen an example of this +already with , which we introduced in the context of objective +functions. Now we will introduce a more probabilistic approach to +such models, specifically we are interested in latent variable +modelling.

+

In unsupervised learning you have data, \(\mathbf{ x}\), but no labels \(y\). The aim in unsupervised learning is to +extract structure from data. The type of structure you are interested in +is dependent on the broader context of the task. In supervised learning +that context is very much driven by the labels. Supervised learning +algorithms try and focus on the aspects of the data which are relevant +to predicting the labels. But in unsupervised learning there are no +labels.

Context

-

Humans can easily sort a number of objects into objects that share similar characteristics. We easily categorize animals or vehicles. But if the data is very large this is too slow. Even for smaller data, it may be that it is presented in a form that is unintelligible for humans. We are good at dealing with high dimensional data when it’s presented in images, but if it’s presented as a series of numbers, we find it hard to interpret. In unsupervised learning we want the computer to do the sorting for us. For example, an e-commerce company might need an algorithm that can go through its entire list of products and automatically sort them into groups such that similar products are located together.

+

Humans can easily sort a number of objects into objects that share +similar characteristics. We easily categorize animals or vehicles. But +if the data is very large this is too slow. Even for smaller data, it +may be that it is presented in a form that is unintelligible for humans. +We are good at dealing with high dimensional data when it’s presented in +images, but if it’s presented as a series of numbers, we find it hard to +interpret. In unsupervised learning we want the computer to do the +sorting for us. For example, an e-commerce company might need an +algorithm that can go through its entire list of products and +automatically sort them into groups such that similar products are +located together.

Discrete vs Continuous

-

Supervised learning is broadly divided into classification: i.e. wake word classification in the Amazon Echo, and regression, e.g. shelf life prediction for perishable goods. Similarly, unsupervised learning can be broadly split into methods that cluster the data (i.e. provide a discrete label) and methods that represent the data as a continuous value.

+

Supervised learning is broadly divided into classification: i.e. wake +word classification in the Amazon Echo, and regression, e.g. shelf life +prediction for perishable goods. Similarly, unsupervised learning can be +broadly split into methods that cluster the data (i.e. provide a +discrete label) and methods that represent the data as a continuous +value.

Clustering

-

[edit]

-

Clustering methods associate each data point with a different label. Unlike in classification the label is not provided by a human annotator. It is allocated by the computer. Clustering is quite intuitive for humans, we do it naturally with our observations of the real world. For example, we cluster animals into different groups. If we encounter a new animal, we can immediately assign it to a group: bird, mammal, insect. These are certainly labels that can be provided by humans, but they were also originally invented by humans. With clustering we want the computer to recreate that process of inventing the label.

-

Unsupervised learning enables computers to form similar categorizations on data that is too large scale for us to process. When the Greek philosopher, Plato, was thinking about ideas, he considered the concept of the Platonic ideal. The Platonic ideal bird is the bird that is most bird-like or the chair that is most chair-like. In some sense, the task in clustering is to define different clusters, by finding their Platonic ideal (known as the cluster center) and allocate each data point to the relevant cluster center. So, allocate each animal to the class defined by its nearest cluster center.

-

To perform clustering on a computer we need to define a notion of either similarity or distance between the objects and their Platonic ideal, the cluster center. We normally assume that our objects are represented by vectors of data, $\inputVector_i$. Similarly, we represent our cluster center for category j by a vector $\meanVector_j$. This vector contains the ideal features of a bird, a chair, or whatever category j is. In clustering we can either think in terms of similarity of the objects, or distances. We want objects that are similar to each other to cluster together. We want objects that are distant from each other to cluster apart.

-

This requires us to formalize our notion of similarity or distance. Let’s focus on distances. A definition of distance between an object, i, and the cluster center of class j is a function of two vectors, the data point, $\inputVector_i$ and the cluster center, $\meanVector_j$,
$$ -d_{ij} = f(\inputVector_i, \meanVector_j). -$$
Our objective is then to find cluster centers that are close to as many data points as possible. For example, we might want to cluster customers into their different tastes. We could represent each customer by the products they’ve purchased in the past. This could be a binary vector $\inputVector_i$. We can then define a distance between the cluster center and the customer.

+
+[edit] +
+
    +
  • One common approach, not deeply covered in this course.

  • +
  • Associate each data point, \(\mathbf{ +y}_{i, :}\) with one of \(k\) +different discrete groups.

  • +
  • For example:

    +
      +
    • Clustering animals into discrete groups. Are animals discrete or +continuous?
    • +
    • Clustering into different political +affiliations.
    • +
  • +
  • Humans do seem to like clusters:

    +
      +
    • Very useful when interacting with biologists.
    • +
  • +
  • Subtle difference between clustering and vector +quantisation

  • +
  • Little anecdote.

  • +
  • To my mind the difference is that in clustering there should be a +reduction in data density between samples.

  • +
  • This definition is not universally applied.

  • +
  • For today’s purposes we merge them:

    +
      +
    • Determine how to allocate each point to a group and, harder, the +total number of groups.
    • +
  • +
  • Simple algorithm for allocating points to groups.

  • +
  • Require: Set of \(k\) +cluster centres & assignment of each points to a cluster.

  • +
+
    +
  1. Initialize cluster centres as randomly selected data points.
  2. Assign each data point to nearest cluster centre.
  3. Update each cluster centre by setting it to the mean of assigned data points.
  4. Repeat 2 and 3 until cluster allocations do not change.
+
    +
  • This minimizes the objective \[ +E=\sum_{j=1}^K \sum_{i\ \text{allocated to}\ j} \left(\mathbf{ y}_{i, +:} - \boldsymbol{ \mu}_{j, :}\right)^\top\left(\mathbf{ y}_{i, :} - +\boldsymbol{ \mu}_{j, :}\right) +\] i.e. it minimizes the sum of Euclidean squared +distances between points and their associated centres.
  • +
  • The minimum is not guaranteed to be global or +unique.
  • +
  • This objective is a non-convex optimization problem.
  • +
+

Clustering methods associate each data point with a different label. +Unlike in classification the label is not provided by a human annotator. +It is allocated by the computer. Clustering is quite intuitive for +humans, we do it naturally with our observations of the real world. For +example, we cluster animals into different groups. If we encounter a new +animal, we can immediately assign it to a group: bird, mammal, insect. +These are certainly labels that can be provided by humans, but they were +also originally invented by humans. With clustering we want the computer +to recreate that process of inventing the label.

+

Unsupervised learning enables computers to form similar +categorizations on data that is too large scale for us to process. When +the Greek philosopher, Plato, was thinking about ideas, he considered +the concept of the Platonic ideal. The Platonic ideal bird is the bird +that is most bird-like or the chair that is most chair-like. In some +sense, the task in clustering is to define different clusters, by +finding their Platonic ideal (known as the cluster center) and allocate +each data point to the relevant cluster center. So, allocate each animal +to the class defined by its nearest cluster center.

+

To perform clustering on a computer we need to define a notion of +either similarity or distance between the objects and their Platonic +ideal, the cluster center. We normally assume that our objects are +represented by vectors of data, \(\mathbf{ +x}_i\). Similarly, we represent our cluster center for category +\(j\) by a vector \(\boldsymbol{ \mu}_j\). This vector contains +the ideal features of a bird, a chair, or whatever category \(j\) is. In clustering we can either think +in terms of similarity of the objects, or distances. We want objects +that are similar to each other to cluster together. We want objects that +are distant from each other to cluster apart.

+

This requires us to formalize our notion of similarity or distance. +Let’s focus on distances. A definition of distance between an object, +\(i\), and the cluster center of class +\(j\) is a function of two vectors, the +data point, \(\mathbf{ x}_i\) and the +cluster center, \(\boldsymbol{ +\mu}_j\), \[ +d_{ij} = f(\mathbf{ x}_i, \boldsymbol{ \mu}_j). +\] Our objective is then to find cluster centers that are close +to as many data points as possible. For example, we might want to +cluster customers into their different tastes. We could represent each +customer by the products they’ve purchased in the past. This could be a +binary vector \(\mathbf{ x}_i\). We can +then define a distance between the cluster center and the customer.

Squared Distance

-

A commonly used distance is the squared distance,
$$ -\distanceScalar_{ij} = (\inputVector_i - \meanVector_j)^2. -$$
The squared distance comes up a lot in machine learning. In unsupervised learning it was used to measure dissimilarity between predictions and observed data. Here its being used to measure the dissimilarity between a cluster center and the data.

-

Once we have decided on the distance or similarity function, we can decide a number of cluster centers, K. We find their location by allocating each center to a sub-set of the points and minimizing the sum of the squared errors,
$$ -\errorFunction(\meanMatrix) = \sum_{i \in \mathbf{i}_j} (\inputVector_i - \meanVector_j)^2 -$$
where the notation ij represents all the indices of each data point which has been allocated to the jth cluster represented by the center $\meanVector_j$.

-

k-Means Clustering

-

One approach to minimizing this objective function is known as k-means clustering. It is simple and relatively quick to implement, but it is an initialization sensitive algorithm. Initialization is the process of choosing an initial set of parameters before optimization. For k-means clustering you need to choose an initial set of centers. In k-means clustering your final set of clusters is very sensitive to the initial choice of centers. For more technical details on k-means clustering you can watch a video of Alex Ihler introducing the algorithm here.

-

k-Means Clustering

+

A commonly used distance is the squared distance, \[ +d_{ij} = (\mathbf{ x}_i - \boldsymbol{ \mu}_j)^2. +\] The squared distance comes up a lot in machine learning. In +supervised learning it was used to measure dissimilarity between +predictions and observed data. Here it’s being used to measure the +dissimilarity between a cluster center and the data.

+

Once we have decided on the distance or similarity function, we can +decide on a number of cluster centers, \(K\). We find their location by allocating +each center to a sub-set of the points and minimizing the sum of the +squared errors, \[ +E(\mathbf{M}) = \sum_{j=1}^K \sum_{i \in \mathbf{i}_j} (\mathbf{ x}_i - \boldsymbol{ +\mu}_j)^2 +\] where the notation \(\mathbf{i}_j\) represents all the indices +of the data points which have been allocated to the \(j\)th cluster represented by the center +\(\boldsymbol{ \mu}_j\).
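As a rough illustration of this objective (a sketch added here, not part of the original lecture; the array names `X`, `centres` and `labels` are invented for the example), the sum of squared errors can be evaluated in a few lines of numpy:

```python
import numpy as np

def kmeans_objective(X, centres, labels):
    """Sum of squared errors between each point and its allocated centre.

    X: (n, d) data, centres: (K, d) cluster centres,
    labels: (n,) index of the centre each point is allocated to.
    """
    diffs = X - centres[labels]      # difference to the allocated centre
    return np.sum(diffs ** 2)        # E(M): sum over clusters and their allocated points

# tiny example with made-up numbers: two tight groups, two centres
X = np.array([[0.0, 0.0], [0.1, 0.2], [5.0, 5.1], [4.9, 5.0]])
centres = np.array([[0.0, 0.1], [5.0, 5.0]])
labels = np.array([0, 0, 1, 1])
print(kmeans_objective(X, centres, labels))
```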

+

\(k\)-Means +Clustering

+

One approach to minimizing this objective function is known as +\(k\)-means clustering. It is +simple and relatively quick to implement, but it is an initialization +sensitive algorithm. Initialization is the process of choosing an +initial set of parameters before optimization. For \(k\)-means clustering you need to choose an +initial set of centers. In \(k\)-means +clustering your final set of clusters is very sensitive to the initial +choice of centers. For more technical details on \(k\)-means clustering you can watch a video +of Alex Ihler introducing the algorithm here.
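Below is a minimal sketch of the standard \(k\)-means iteration (often called Lloyd's algorithm), written only to illustrate the idea; a library implementation such as scikit-learn's `KMeans` would normally be used in practice. Because the algorithm is initialization sensitive, running it with different random seeds can produce different final clusterings.

```python
import numpy as np

def kmeans(X, K, n_iters=100, seed=0):
    """Minimal k-means: alternate between allocating points and updating centres."""
    rng = np.random.default_rng(seed)
    # initialise by picking K distinct data points as the starting centres
    centres = X[rng.choice(len(X), size=K, replace=False)]
    for _ in range(n_iters):
        # squared distance from every point to every centre, shape (n, K)
        d2 = ((X[:, None, :] - centres[None, :, :]) ** 2).sum(axis=2)
        labels = d2.argmin(axis=1)   # allocate each point to its nearest centre
        # move each centre to the mean of its allocated points (keep it if none allocated)
        new_centres = np.array([X[labels == k].mean(axis=0) if np.any(labels == k)
                                else centres[k] for k in range(K)])
        if np.allclose(new_centres, centres):
            break
        centres = new_centres
    return centres, labels

# different seeds give different initial centres, and possibly different clusterings
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0.0, 1.0, (50, 2)), rng.normal(4.0, 1.0, (50, 2))])
for seed in (0, 1, 2):
    centres, labels = kmeans(X, K=2, seed=seed)
    print(seed, centres.round(2))
```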

+

\(k\)-Means Clustering

- + -
+
-

Figure: Clustering with the k-means clustering algorithm.

+

Figure: Clustering with the \(k\)-means clustering algorithm.

-
-
+
-

Figure: k-means clustering by Alex Ihler.

+

Figure: \(k\)-means clustering by +Alex Ihler.

Hierarchical Clustering

-

Other approaches to clustering involve forming taxonomies of the cluster centers, like humans apply to animals, to form trees. You can learn more about agglomerative clustering in this video from Alex Ihler.

+

Other approaches to clustering involve forming taxonomies of the +cluster centers, like humans apply to animals, to form trees. You can +learn more about agglomerative clustering in this video from Alex +Ihler.
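As a hedged sketch of the agglomerative idea (assuming SciPy is available; the data here are invented), the algorithm starts with every point as its own cluster and repeatedly merges the two closest clusters, building a tree of merges:

```python
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

# invented two-group data for illustration
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0.0, 1.0, (10, 2)), rng.normal(5.0, 1.0, (10, 2))])

# agglomerative clustering: repeatedly merge the two closest clusters,
# recording the sequence of merges in Z
Z = linkage(X, method="average")

# cut the tree to recover a flat clustering with two groups
labels = fcluster(Z, t=2, criterion="maxclust")
print(labels)

# dendrogram(Z) would draw the tree if a matplotlib backend is available
```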

-
-
-
+
-
+

Figure: Hierarchical Clustering by Alex Ihler.

Phylogenetic Trees

-

Indeed, one application of machine learning techniques is performing a hierarchical clustering based on genetic data, i.e. the actual contents of the genome. If we do this across a number of species then we can produce a phylogeny. The phylogeny aims to represent the actual evolution of the species and some phylogenies even estimate the timing of the common ancestor between two species4. Similar methods are used to estimate the origin of viruses like AIDS or Bird flu which mutate very quickly. Determining the origin of viruses can be important in containing or treating outbreaks.

+

Indeed, one application of machine learning techniques is performing +a hierarchical clustering based on genetic data, i.e. the actual +contents of the genome. If we do this across a number of species then we +can produce a phylogeny. The phylogeny aims to represent the +actual evolution of the species and some phylogenies even estimate the +timing of the common ancestor between two species5. +Similar methods are used to estimate the origin of viruses like HIV or +bird flu, which mutate very quickly. Determining the origin of viruses +can be important in containing or treating outbreaks.

Product Clustering

-

An e-commerce company could apply hierarchical clustering to all its products. That would give a phylogeny of products. Each cluster of products would be split into sub-clusters of products until we got down to individual products. For example, we might expect a high level split to be Electronics/Clothing. Of course, a challenge with these tree-like structures is that many products belong in more than one parent cluster: for example running shoes should be in more than one group, they are ‘sporting goods’ and they are ‘apparel’. A tree structure doesn’t allow this allocation.

-

Hierarchical Clustering Challenge

-

Our own psychological grouping capabilities are studied as a domain of cognitive science. Researchers like Josh Tenenbaum have developed algorithms that decompose data in more complex ways, but they can normally only be applied to smaller data sets.

+

An e-commerce company could apply hierarchical clustering to all its +products. That would give a phylogeny of products. Each cluster of +products would be split into sub-clusters of products until we got down +to individual products. For example, we might expect a high level split +to be Electronics/Clothing. Of course, a challenge with these tree-like +structures is that many products belong in more than one parent cluster: +for example running shoes should be in more than one group, they are +‘sporting goods’ and they are ‘apparel’. A tree structure doesn’t allow +this allocation.

+

Hierarchical Clustering +Challenge

+

Our own psychological grouping capabilities are studied as a domain +of cognitive science. Researchers like Josh Tenenbaum have developed +algorithms that decompose data in more complex ways, but they can +normally only be applied to smaller data sets.

+

Other Clustering Approaches

+
  • Spectral clustering (Shi and Malik (2000), Ng et al. (n.d.))
      • Allows clusters which aren’t convex hulls (see the sketch below).
  • Dirichlet process
      • A probabilistic formulation for a clustering algorithm that is non-parametric.
      • Loosely speaking it allows infinite clusters.
      • In practice useful for dealing with previously unknown species (e.g. a “Black Swan Event”).
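The sketch below (assuming scikit-learn is available; all settings are chosen purely for illustration) shows the point about non-convex clusters: two concentric rings defeat plain \(k\)-means but are separated by a graph-based spectral clustering.

```python
import numpy as np
from sklearn.datasets import make_circles
from sklearn.cluster import KMeans, SpectralClustering

# two concentric rings: the clusters are not convex hulls
X, _ = make_circles(n_samples=400, factor=0.4, noise=0.05, random_state=0)

kmeans_labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
spectral_labels = SpectralClustering(n_clusters=2, affinity="nearest_neighbors",
                                     n_neighbors=10, random_state=0).fit_predict(X)

# k-means splits the rings with a straight boundary,
# spectral clustering follows the graph structure and recovers the two rings
print(np.bincount(kmeans_labels), np.bincount(spectral_labels))
```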

Dimensionality Reduction

-

[edit]

-

Dimensionality reduction methods compress the data by replacing the original data with a reduced number of continuous variables. One way of thinking of these methods is to imagine a marionette.

+
+[edit] +
+

Dimensionality reduction methods compress the data by replacing the +original data with a reduced number of continuous variables. One way of +thinking of these methods is to imagine a marionette.

- + -
+
-

Figure: Thinking of dimensionality reduction as a marionette. We observe the high dimensional pose of the puppet, $\inputVector$, but the movement of the puppeteer’s hand, $\latentVector$ remains hidden to us. Dimensionality reduction aims to recover those hidden movements which generated the observations.

-
-
-

The position of each body part of a marionette could be thought of as our data, $\inputVector_i$. So, each data point consists of the 3-D co-ordinates of all the different body parts of the marionette. Let’s say there are 13 different body parts (2 each of feet, knees, hips, hands, elbows, shoulders, one head). Each body part has an x, y, z position in Cartesian coordinates. So that’s 39 numbers associated with each observation.

-

The movement of these 39 parts is determined by the puppeteer via strings. Let’s assume it’s a very simple puppet, with just one stick to control it. The puppeteer can move the stick up and down, left and right. And they can twist it. This gives three parameters in the puppeteers control. This implies that the 39 variables we see moving are controlled by only 3 variables. These 3 variables are often called the hidden or latent variables.

-

Dimensionality reduction assumes something similar for real world data. It assumes that the data we observe is generated from some lower dimensional underlying process. It then seeks to recover the values associated with this low dimensional process.

+

Figure: Thinking of dimensionality reduction as a marionette. We +observe the high dimensional pose of the puppet, \(\mathbf{ x}\), but the movement of the +puppeteer’s hand, \(\mathbf{ z}\) +remains hidden to us. Dimensionality reduction aims to recover those +hidden movements which generated the observations.

+ + +

The position of each body part of a marionette could be thought of as +our data, \(\mathbf{ x}_i\). So, each +data point consists of the 3-D co-ordinates of all the different body +parts of the marionette. Let’s say there are 13 different body parts (2 +each of feet, knees, hips, hands, elbows, shoulders, one head). Each +body part has an x, y, z position in Cartesian coordinates. So that’s 39 +numbers associated with each observation.

+

The movement of these 39 parts is determined by the puppeteer via +strings. Let’s assume it’s a very simple puppet, with just one stick to +control it. The puppeteer can move the stick up and down, left and +right. And they can twist it. This gives three parameters in the +puppeteer’s control. This implies that the 39 variables we see moving are +controlled by only 3 variables. These 3 variables are often called the +hidden or latent variables.

+

Dimensionality reduction assumes something similar for real world +data. It assumes that the data we observe is generated from some lower +dimensional underlying process. It then seeks to recover the values +associated with this low dimensional process.
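To make the marionette picture concrete, here is a small numpy sketch (dimensions chosen to match the text: 3 latent controls driving 39 observed coordinates; the linear map `W` is invented) of data generated by a low-dimensional process:

```python
import numpy as np

rng = np.random.default_rng(0)
n, latent_dim, observed_dim = 200, 3, 39   # 3 controls; 13 body parts x 3 coordinates

Z = rng.normal(size=(n, latent_dim))             # hidden movements of the puppeteer's hand
W = rng.normal(size=(observed_dim, latent_dim))  # how each control moves each coordinate
noise = 0.1 * rng.normal(size=(n, observed_dim))

X = Z @ W.T + noise   # observed 39-dimensional poses driven by only 3 latent variables
print(X.shape)        # (200, 39): high-dimensional data with low-dimensional structure
```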

Examples in Social Sciences

-

Dimensionality reduction techniques underpin a lot of psychological scoring tests such as IQ tests or personality tests. An IQ test can involve several hundred questions, potentially giving a rich, high dimensional, characterization of some aspects of your intelligence. It is then summarized by a single number. Similarly, the Myers-Briggs personality test involves answering questions about preferences which are reduced to a set of numbers reflecting personality.

-

These tests are assuming that our intelligence is implicitly one-dimensional and that our personality is implicitly four dimensional. Other examples include political belief which is typically represented on a left to right scale. A one-dimensional distillation of an entire philosophy about how a country should be run. Our own leadership principles imply that our decisions have a fourteen-dimensional space underlying them. Each decision could be characterized by judging to what extent it embodies each of the principles.

-

Political belief, personality, intelligence, leadership. None of these exist as a directly measurable quantity in the real world, rather they are inferred based on measurables. Dimensionality reduction is the process of allowing the computer to automatically find such underlying dimensions. This automatically allowing us to characterize each data point according to those explanatory variables. Each of these characteristics can be scored, and individuals can then be turned into vectors.

-

This doesn’t only apply to individuals, in recent years work on language modeling has taken a similar approach to words. The word2vec algorithm performed a dimensionality reduction on words, now you can take any word and map it to a latent space where similar words exhibit similar characteristics. A personality space for words.

+

Dimensionality reduction techniques underpin a lot of psychological +scoring tests such as IQ tests or personality tests. An IQ test can +involve several hundred questions, potentially giving a rich, high +dimensional, characterization of some aspects of your intelligence. It +is then summarized by a single number. Similarly, the Myers-Briggs +personality test involves answering questions about preferences which +are reduced to a set of numbers reflecting personality.

+

These tests are assuming that our intelligence is implicitly +one-dimensional and that our personality is implicitly four dimensional. +Other examples include political belief which is typically represented +on a left to right scale. A one-dimensional distillation of an entire +philosophy about how a country should be run. Our own leadership +principles imply that our decisions have a fourteen-dimensional space +underlying them. Each decision could be characterized by judging to what +extent it embodies each of the principles.

+

Political belief, personality, intelligence, leadership. None of +these exist as a directly measurable quantity in the real world; rather +they are inferred based on measurables. Dimensionality reduction is the +process of allowing the computer to automatically find such underlying +dimensions. This automatically allows us to characterize each data +point according to those explanatory variables. Each of these +characteristics can be scored, and individuals can then be turned into +vectors.

+

This doesn’t only apply to individuals, in recent years work on +language modeling has taken a similar approach to words. The word2vec algorithm performed +a dimensionality reduction on words, now you can take any word and map +it to a latent space where similar words exhibit similar +characteristics. A ‘personality space’ for words.

Principal Component Analysis

-

Principal component analysis (PCA) is arguably the queen of dimensionality reduction techniques. PCA was developed as an approach to dimensionality reduction in 1930s by Hotelling as a method for the social sciences. In Hotelling’s formulation of PCA it was assumed that any data point, x could be represented as a weighted sum of the latent factors of interest, so that Hotelling described prediction functions (like in regression and classification above), only the regression is now multiple output. And instead of predicting a label, yi, we now try and force the regression to predict the observed feature vector, $\dataVector_i$. So, for example, on an IQ test we would try and predict subject i’s answer to the jth question with the following function
$$ -\dataScalar_{ij} = \mappingFunction_j(\latentScalar_i; \weightVector). -$$
Here zi would be the IQ of subject i and $\mappingFunction_j(\cdot)$ would be a function representing the relationship between the subject’s IQ and their score on the answer to question j. This function is the same for all subjects, but the subject’s IQ is assumed to differ leading to different scores for each subject.

+

Principal component analysis (PCA) is arguably the queen of +dimensionality reduction techniques. PCA was developed as an approach to +dimensionality reduction in the 1930s by Hotelling as a method for the +social sciences. In Hotelling’s formulation of PCA it was assumed that +any data point, \(\mathbf{x}\), could be +represented as a weighted sum of the latent factors of interest, so that +Hotelling described prediction functions (like in regression and +classification above), only the regression is now multiple +output. And instead of predicting a label, \(y_i\), we now try and force the regression +to predict the observed feature vector, \(\mathbf{ y}_i\). So, for example, on an IQ +test we would try and predict subject \(i\)’s answer to the \(j\)th question with the following function +\[ +y_{ij} = f_j(z_i; \mathbf{ w}). +\] Here \(z_i\) would be the IQ +of subject \(i\) and \(f_j(\cdot)\) would be a function +representing the relationship between the subject’s IQ and their score +on the answer to question \(j\). This +function is the same for all subjects, but the subject’s IQ is assumed +to differ, leading to different scores for each subject.

- + -
+
-

Figure: Visualization of the first two principal components of an artificial data set. The data was generated by taking an image of a handwritten digit, 6, and rotating it 360 times, one degree each time. The first two principal components have been extracted in the diagram. The underlying circular shape is derived from the rotation of the data. Each image in the data set is projected on to the location its projected to in the latent space.

+

Figure: Visualization of the first two principal components of an +artificial data set. The data was generated by taking an image of a +handwritten digit, 6, and rotating it 360 times, one degree each time. +The first two principal components have been extracted in the diagram. +The underlying circular shape is derived from the rotation of the data. +Each image in the data set is shown at the location it is projected +to in the latent space.

Hotelling’s PCA

-

In Hotelling’s formulation he assumed that the function was a linear function. This idea is taken from a wider field known as factor analysis, so Hotelling described the challenge as
$$ -\mappingFunction_j(\latentScalar_i; \weightVector) = \weightScalar_j \latentScalar_i -$$
so the answer to the jth question is predicted to be a scaling of the subject’s IQ. The scale factor is given by $\weightScalar_j$. If there are more latent dimensions then a matrix of parameters, $\weightMatrix$ is used, for example if there were two latent dimensions, we’d have
$$ -\mappingFunction_j(\mathbf{\latentScalar}_i; \weightMatrix) = \weightScalar_{1j} \latentScalar_{1i} + \weightScalar_{2j} \latentScalar_{2i} -$$
where, if this were a personality test, then $\latentScalar_{1i}$ might represent the spectrum over a subject’s extrovert/introvert and $\latentScalar_{2i}$ might represent where the subject was on the rational/perceptual scale. The function would make a prediction about the subjects answer to a particular question on the test (e.g. preference for office job vs preference for outdoor job). In factor analysis the parameters $\weightMatrix$ are known as the factor loadings and in PCA they are known as the principal components.

+

In Hotelling’s formulation he assumed that the function was a linear +function. This idea is taken from a wider field known as factor +analysis, so Hotelling described the challenge as \[ +f_j(z_i; \mathbf{ w}) = w_j z_i +\] so the answer to the \(j\)th +question is predicted to be a scaling of the subject’s IQ. The scale +factor is given by \(w_j\). If there +are more latent dimensions then a matrix of parameters, \(\mathbf{W}\), is used; for example, if there +were two latent dimensions, we’d have \[ +f_j(\mathbf{z}_i; \mathbf{W}) = w_{1j} z_{1i} + w_{2j} z_{2i} +\] where, if this were a personality test, then \(z_{1i}\) might represent where +a subject sits on the extrovert/introvert spectrum and \(z_{2i}\) might represent where the subject +was on the rational/perceptual scale. The function would make a +prediction about the subject’s answer to a particular question on the +test (e.g. preference for office job vs preference for outdoor job). In +factor analysis the parameters \(\mathbf{W}\) are known as the factor +loadings and in PCA they are known as the principal +components.
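A tiny numerical sketch of this two-factor prediction (the loadings and trait values below are invented for the example): each question's predicted answer is a weighted combination of the subject's two latent traits.

```python
import numpy as np

# invented example: 4 questions loading on 2 latent traits
# (e.g. extrovert/introvert and rational/perceptual)
W = np.array([[ 0.9, -0.2],
              [ 0.1,  0.8],
              [-0.7,  0.3],
              [ 0.4,  0.6]])

z = np.array([1.2, -0.5])   # one subject's position on the two latent dimensions

# f_j(z; W) = w_{1j} z_1 + w_{2j} z_2, evaluated for every question j at once
predicted_answers = W @ z
print(predicted_answers)
```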

Parameters

-

Fitting the model involves finding estimates for the loadings, $\weightMatrix$, and latent variables, $\latentMatrix$. There are different approaches including least squares. The least squares approach is used, for example, in recommender systems. In recommender systems this method is called matrix factorization. The customer characteristics, $\dataVector_i$ is the customer rating for each different product (or item) and the latent variables can be seen as a space of customer preferences. In the recommender system case, the loadings matrix also has an interpretation as product similarities.5 Recommender systems have a particular characteristic in that most of the entries of the vector $\dataVector_i$ are missing most of the time.

-

In PCA and factor analysis the unknown latent factors are dealt with through a probability distribution. They are each assumed to be drawn from a zero mean, unit variance normal distribution. This leaves the factor loadings to be estimated. For PCA the maximum likelihood solution for the factor loadings can be shown to be given by the eigenvalue decomposition of the data covariance matrix. This is algorithmically simple and convenient, although slow to compute for very large data sets with many features and many subjects. The eigenvalue problem can also be derived from many other starting points: e.g. the directions of maximum variance in the data or finding a latent space that best preserves inter-point distances between the data, or the optimal linear compression of the data given a linear reconstruction. These many and varied justifications for the eigenvalue decomposition may account for the popularity of PCA. Indeed, there is even an interpretation for Google’s original PageRank algorithm (which computed the smallest eigenvector of the internet’s linkage matrix) as seeking the dominant principal component of the web.6

-

Characterizing users according to past buying behavior and combining this with characteristics about products, is key to making good recommendations and returning useful search results. Further advances can be made if we understand the context of a particular session. For example, if a user is buying Christmas presents and searches for a dress, then it could be the case that the user is willing to spend a little more on the dress than in normal circumstances. Characterizing these effects requires more data and more complex algorithms. However, in domains such a search we are normally constrained by the speed with which we need to return results. Accounting for each of these factors while returning results with acceptable latency is a particular challenge.

+

Fitting the model involves finding estimates for the loadings, \(\mathbf{W}\), and latent variables, \(\mathbf{Z}\). There are different +approaches including least squares. The least squares approach is used, +for example, in recommender systems. In recommender systems this method +is called matrix factorization. The customer characteristics, +\(\mathbf{ y}_i\), are the customer’s +ratings for the different products (or items) and the latent variables can +be seen as a space of customer preferences. In the recommender system +case, the loadings matrix also has an interpretation as product +similarities.6 Recommender systems have a +particular characteristic in that most of the entries of the vector +\(\mathbf{ y}_i\) are missing most of +the time.
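A hedged sketch of the least squares matrix factorization idea for recommender data (the ratings are invented, and the simple gradient updates stand in for whatever solver a real system would use); only the observed entries contribute to the error:

```python
import numpy as np

rng = np.random.default_rng(0)
n_users, n_items, q = 20, 15, 2

# invented ratings matrix with most entries missing (np.nan = not rated)
Y = rng.integers(1, 6, size=(n_users, n_items)).astype(float)
Y[rng.random(Y.shape) < 0.7] = np.nan
observed = ~np.isnan(Y)

Z = 0.1 * rng.normal(size=(n_users, q))   # latent customer preferences
W = 0.1 * rng.normal(size=(n_items, q))   # latent product characteristics

lr = 0.01
for it in range(500):
    err = np.where(observed, Y - Z @ W.T, 0.0)   # error only on observed ratings
    Z += lr * err @ W     # gradient step on the squared error for the preferences
    W += lr * err.T @ Z   # and for the product characteristics

rmse = np.sqrt(np.nanmean((Y - Z @ W.T) ** 2))   # nanmean skips the missing entries
print("observed-entry RMSE:", round(float(rmse), 3))
```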

+

In PCA and factor analysis the unknown latent factors are dealt with +through a probability distribution. They are each assumed to be drawn +from a zero mean, unit variance normal distribution. This leaves the +factor loadings to be estimated. For PCA the maximum likelihood solution +for the factor loadings can be shown to be given by the eigenvalue +decomposition of the data covariance matrix. This is +algorithmically simple and convenient, although slow to compute for very +large data sets with many features and many subjects. The eigenvalue +problem can also be derived from many other starting points: e.g. the +directions of maximum variance in the data or finding a latent space +that best preserves inter-point distances between the data, or the +optimal linear compression of the data given a linear reconstruction. +These many and varied justifications for the eigenvalue decomposition +may account for the popularity of PCA. Indeed, there is even an +interpretation for Google’s original PageRank algorithm (which computed +the smallest eigenvector of the internet’s linkage matrix) as +seeking the dominant principal component of the web.7
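The eigenvalue route can be sketched in a few lines of numpy (a simplified illustration that ignores the noise variance appearing in the full probabilistic PCA solution; the example data are invented):

```python
import numpy as np

def pca(Y, q):
    """Return the first q principal components and the latent coordinates."""
    Y_centred = Y - Y.mean(axis=0)
    C = np.cov(Y_centred, rowvar=False)    # data covariance matrix
    eigvals, eigvecs = np.linalg.eigh(C)   # eigh because the covariance is symmetric
    order = np.argsort(eigvals)[::-1]      # largest eigenvalues first
    W = eigvecs[:, order[:q]]              # principal components (factor loadings)
    Z = Y_centred @ W                      # latent representation of each data point
    return W, Z

# invented 5-dimensional data with 2 underlying directions plus a little noise
rng = np.random.default_rng(0)
Y = rng.normal(size=(100, 2)) @ rng.normal(size=(2, 5)) + 0.05 * rng.normal(size=(100, 5))
W, Z = pca(Y, q=2)
print(W.shape, Z.shape)   # (5, 2) loadings, (100, 2) latent coordinates
```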

+

Characterizing users according to past buying behavior and combining +this with characteristics about products is key to making good +recommendations and returning useful search results. Further advances +can be made if we understand the context of a particular session. For +example, if a user is buying Christmas presents and searches for a +dress, then it could be the case that the user is willing to spend a +little more on the dress than in normal circumstances. Characterizing +these effects requires more data and more complex algorithms. However, +in domains such as search we are normally constrained by the speed with +which we need to return results. Accounting for each of these factors +while returning results with acceptable latency is a particular +challenge.

Reinforcement Learning

-

[edit]

-

The final domain of learning we will review is known as reinforcement learning. The domain of reinforcement learning is one that many researchers seem to believe is offering a route to general intelligence. The idea of general intelligence is to develop algorithms that are adaptable to many different circumstances. Supervised learning algorithms are designed to resolve particular challenges. Data is annotated with those challenges in mind. Unsupervised attempts to build representations without any context. But normally the algorithm designer has an understanding of what the broader objective is and designs the algorithms accordingly (for example, characterizing users). In reinforcement learning some context is given, in the form of a reward, but the reward is normally delayed. There may have been many actions that affected the outcome, but which actions had a positive effect and which a negative effect?

+
+[edit] +
+

The final domain of learning we will review is known as reinforcement +learning. The domain of reinforcement learning is one that many +researchers seem to believe is offering a route to general +intelligence. The idea of general intelligence is to develop +algorithms that are adaptable to many different circumstances. +Supervised learning algorithms are designed to resolve particular +challenges. Data is annotated with those challenges in mind. +Unsupervised learning attempts to build representations without any context. But +normally the algorithm designer has an understanding of what the broader +objective is and designs the algorithms accordingly (for example, +characterizing users). In reinforcement learning some context is given, +in the form of a reward, but the reward is normally delayed. There may +have been many actions that affected the outcome, but which actions had +a positive effect and which a negative effect?

“Reward”

    -
  • In reinforcement learning some context is given, in the form of a reward. But it is often delayed

  • -
  • Credit allocation problem: many actions that affected the outcome, but which actions had a positive effect and which a negative effect?

  • +
  • In reinforcement learning some context is given, in the form of a +reward. But it is often delayed

  • +
  • Credit allocation problem: many actions that affected the +outcome, but which actions had a positive effect and which a negative +effect?

-

One issue for many companies is that the best way of testing the customer experience, A/B testing, prioritizes short term reward. The internet is currently being driven by short term rewards which make it distracting in the short term, but perhaps less useful in the long term. Click-bait is an example, but there are more subtle effects. The success of Facebook is driven by its ability to draw us in when likely we should be doing something else. This is driven by large scale A/B testing.

-

One open question is how to drive non-visual interfaces through equivalents to A/B testing. Speech interfaces, such as those used in intelligent agents, are less amenable to A/B testing when determining the quality of the interface. Improving interaction with them is therefore less exact science than the visual interface. Data efficient reinforcement learning methods are likely to be key to improving these agent’s ability to interact with the user and understand intent. However, they are not yet mature enough to be deployed in this application.

+

One issue for many companies is that the best way of testing the +customer experience, A/B testing, prioritizes short term reward. The +internet is currently being driven by short term rewards which make it +distracting in the short term, but perhaps less useful in the long term. +Click-bait is an example, but there are more subtle effects. The success +of Facebook is driven by its ability to draw us in when we likely should +be doing something else. This is driven by large scale A/B testing.

+

One open question is how to drive non-visual interfaces through +equivalents to A/B testing. Speech interfaces, such as those used in +intelligent agents, are less amenable to A/B testing when determining +the quality of the interface. Improving interaction with them is +therefore a less exact science than for the visual interface. Data efficient +reinforcement learning methods are likely to be key to improving these +agents’ ability to interact with the user and understand intent. +However, they are not yet mature enough to be deployed in this +application.

Game Play

-

An area where reinforcement learning methods have been deployed with high profile success is game play. In game play the reward is delayed to the end of the game, and it comes in the form of victory or defeat. A significant advantage of game play as an application area is that, through simulation of the game, it is possible to generate as much data as is required to solve the problem. For this reason, many of the recent advances in reinforcement learning have occurred with methods that are not data efficient.

-

The company DeepMind is set up around reinforcement learning as an approach to general intelligence. All their most well-known achievements are centered around artificial intelligence in game play. In reinforcement learning a decision made at any given time have a downstream effect on the result. Whether the effect if beneficial or not is unknown until a future moment.

-

We can think of reinforcement learning as providing a label, but the label is associated with a series of data involving a number of decisions taken. Each decision was taken given the understanding of game play at any given moment. Understanding which of these decisions was important in victory or defeat is a hard problem.

-

In machine learning the process of understanding which decisions were beneficial and which were detrimental is known as the credit allocation problem. You wish to reward decisions that led to success to encourage them, but punish decisions that lead to failure.

-

Broadly speaking, DeepMind uses an approach to Machine Learning where there are two mathematical functions at work. One determines the action to be taken at any given moment, the other estimates the quality of the board position at any given time. These are respectively known as the policy network and the value network.7 DeepMind made use of convolutional neural networks for both these models.

+

An area where reinforcement learning methods have been deployed with +high profile success is game play. In game play the reward is delayed to +the end of the game, and it comes in the form of victory or defeat. A +significant advantage of game play as an application area is that, +through simulation of the game, it is possible to generate as much data +as is required to solve the problem. For this reason, many of the recent +advances in reinforcement learning have occurred with methods that are +not data efficient.

+

The company DeepMind is set up around reinforcement learning as an +approach to general intelligence. All their most well-known achievements +are centered around artificial intelligence in game play. In +reinforcement learning a decision made at any given time has a +downstream effect on the result. Whether the effect is beneficial or not +is unknown until a future moment.

+

We can think of reinforcement learning as providing a label, but the +label is associated with a series of data involving a number of +decisions taken. Each decision was taken given the understanding of game +play at any given moment. Understanding which of these decisions was +important in victory or defeat is a hard problem.

+

In machine learning the process of understanding which decisions were +beneficial and which were detrimental is known as the credit allocation +problem. You wish to reward decisions that led to success to encourage +them, but punish decisions that lead to failure.

+

Broadly speaking, DeepMind uses an approach to Machine Learning where +there are two mathematical functions at work. One determines the action +to be taken at any given moment, the other estimates the quality of the +board position at any given time. These are respectively known as the +policy network and the value network.8 +DeepMind made use of convolutional neural networks for both these +models.

AlphaGo

-

The ancient Chinese game of Go was considered a challenge for artificial intelligence for two reasons. Firstly, the game tree has a very high branching factor. The game tree is a discrete representation of the game. Every node in the game tree is associated with a board position. You can move through the game tree by making legal a move on the board to change the position. In Go, there are so many legal moves that the game tree increases exponentially. This challenge in Go was addressed by using stochastic game tree search. Rather than exploring the game tree exhaustively they explored it randomly.

-

Secondly, evaluating the quality of any given board position was deemed to be very hard.8 The value function determines for each player whether they are winning or losing. Skilled Go players can assess a board position, but they do it by instinct, by intuition. Just as early AI researchers struggled to give rules for detecting cancer, it is challenging to give rules to assess a Go board. The machine learning approach that AlphaGo took is to train a value function network to make this assessment.

-

The approach that DeepMind took to conquering Go is a model-free approach known as Q-learning.9 The model-free approach refers to the fact that they don’t directly include a model of how the world evolves in the reinforcement learning algorithm. They make extensive use of the game tree, but they don’t model how it evolves. They do model the expected reward of each position in the game tree (the value function) but that is not the same as modeling how the game will proceed.

-

Reinforcement Learning and Classical Control

-

An alternative approach to reinforcement learning is to use a prediction function to suggest how the world will evolve in response to your actions. To predict how the game tree will evolve. You can then use this prediction to indirectly infer the expected reward associated with any action. This is known as model-based reinforcement learning.

-

This model-based approach is also closer to a control system. A classical control system is one where you give the system a set point. For example, a thermostat in the house. You set the temperature and the boiler switches off when it reaches it. Optimal control is about getting the house to the right temperature as quickly as possible. Classical control is widely used in robotic control and flight control.

-

One interesting crossover between classical control and machine learning arises because classical optimal control can be seen as a form of model-based reinforcement learning. One where the reward is recovered when the set point is reached. In control engineering the prediction function is known as the transfer function. The process of fitting the transfer function in control is known as system identification.

-

There is some exciting work emerging at the interface between the areas of control and reinforcement learning. Results at this interface could be very important for improving the quality of robotic and drone control.

+

The ancient Chinese game of Go was considered a challenge for +artificial intelligence for two reasons. Firstly, the game tree has a +very high branching factor. The game tree is a discrete representation +of the game. Every node in the game tree is associated with a board +position. You can move through the game tree by making a legal move on +the board to change the position. In Go, there are so many legal moves +that the size of the game tree grows exponentially. This challenge in Go was +addressed by using stochastic game tree search. Rather than exploring +the game tree exhaustively they explored it randomly.

+

Secondly, evaluating the quality of any given board position was +deemed to be very hard.9 The value function determines for +each player whether they are winning or losing. Skilled Go players can +assess a board position, but they do it by instinct, by intuition. Just +as early AI researchers struggled to give rules for detecting cancer, it +is challenging to give rules to assess a Go board. The machine learning +approach that AlphaGo took is to train a value function network to make +this assessment.

+

The approach that DeepMind took to conquering Go is a +model-free approach known as Q-learning.10 The model-free approach refers to +the fact that they don’t directly include a model of how the world +evolves in the reinforcement learning algorithm. They make extensive use +of the game tree, but they don’t model how it evolves. They do model the +expected reward of each position in the game tree (the value function) +but that is not the same as modeling how the game will proceed.
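For reference, the core update of tabular Q-learning looks like the sketch below (a generic toy illustration, not DeepMind's actual system, which combined these ideas with deep networks and game tree search): the estimated value of a state-action pair is nudged towards the observed reward plus the discounted value of the best next action.

```python
import numpy as np

def q_learning_update(Q, s, a, reward, s_next, alpha=0.1, gamma=0.99):
    """One model-free Q-learning update for a tabular value estimate Q[s, a]."""
    td_target = reward + gamma * np.max(Q[s_next])   # reward plus best estimated future value
    Q[s, a] += alpha * (td_target - Q[s, a])         # move the estimate towards the target
    return Q

# tiny illustration: 3 states, 2 actions, one observed transition
Q = np.zeros((3, 2))
Q = q_learning_update(Q, s=0, a=1, reward=1.0, s_next=2)
print(Q)
```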

+

Reinforcement +Learning and Classical Control

+

An alternative approach to reinforcement learning is to use a +prediction function to suggest how the world will evolve in response to +your actions. To predict how the game tree will evolve. You can then use +this prediction to indirectly infer the expected reward associated with +any action. This is known as model-based reinforcement +learning.

+

This model-based approach is also closer to a control system. A +classical control system is one where you give the system a set point. +For example, a thermostat in the house. You set the temperature and the +boiler switches off when it reaches it. Optimal control is about getting +the house to the right temperature as quickly as possible. Classical +control is widely used in robotic control and flight control.

+

One interesting crossover between classical control and machine +learning arises because classical optimal control can be seen as a form +of model-based reinforcement learning. One where the reward is recovered +when the set point is reached. In control engineering the prediction +function is known as the transfer function. The process of +fitting the transfer function in control is known as system +identification.

+

There is some exciting work emerging at the interface between the +areas of control and reinforcement learning. Results at this interface +could be very important for improving the quality of robotic and drone +control.

Optimization Methods

-

As we implied above, reinforcement learning can also used to improve user experience. In that case the reward is gained when the user buys a product from us. This makes it closely allied to the area of optimization. Optimization of our user interfaces can be seen as a reinforcement learning task, but more commonly it is thought about separately in the domains of Bayesian optimization or bandit learning.

-

We use optimization in machine learning to find the parameters of our models. We can do that because we have a mathematical representation of our objective function as a direct function of the parameters.

-

Examples in this form of optimization include, what is the best user interface for presenting adverts? What is the best design for a front wing for an F1 racing car? Which product should I return top of the list in response to this user’s search?

-

Bayesian optimization arises when we can’t directly relate the parameters in the system of interest to our objective through a mathematical function. For example, what is the mathematical function that relates a user’s experience to the probability that they will buy a product?

+

As we implied above, reinforcement learning can also be used to improve +user experience. In that case the reward is gained when the user buys a +product from us. This makes it closely allied to the area of +optimization. Optimization of our user interfaces can be seen as a +reinforcement learning task, but more commonly it is thought about +separately in the domains of Bayesian optimization or +bandit learning.

+

We use optimization in machine learning to find the parameters of our +models. We can do that because we have a mathematical representation of +our objective function as a direct function of the parameters.

+

Examples in this form of optimization include, what is the best user +interface for presenting adverts? What is the best design for a front +wing for an F1 racing car? Which product should I return top of the list +in response to this user’s search?

+

Bayesian optimization arises when we can’t directly relate the +parameters in the system of interest to our objective through a +mathematical function. For example, what is the mathematical function +that relates a user’s experience to the probability that they will buy a +product?

Bayesian Optimization

-

One approach to these problems is to use machine learning methods to develop a surrogate model for the optimization task. The surrogate model is a prediction function that attempts to recreate the process we are finding hard to model. We try to simultaneously fit the surrogate model and optimize the process.

+

One approach to these problems is to use machine learning methods to +develop a surrogate model for the optimization task. The +surrogate model is a prediction function that attempts to recreate the +process we are finding hard to model. We try to simultaneously fit the +surrogate model and optimize the process.

Surrogate Models

-

Bayesian optimization methods use a surrogate model (normally a specific form of regression model). They use this to predict how the real system will perform. The surrogate model makes a prediction (with an estimate of the uncertainty) of what the response will be to any given input. Parameters to test are chosen by considering this prediction. Similar to reinforcement learning, this can be viewed as a model-based approach because the surrogate model can be seen as a model of the real world. In bandit methods strategies are determined without turning to a model to motivate them. They are model free methods.

-

Model-Based and Model Free: Performance

-

Because of their different philosophies, if a class of prediction functions is chosen, then a model-based approach might have better average case performance. At least in terms of data efficiency. A model free approach may well have better worst-case performance though, because it makes less assumptions about the nature of the data. To put it another way, making assumptions about the data is helpful if they are right: and if the model is sensible they’ll be right on average. However, it is unhelpful if the model is wrong. Indeed, it could be actively damaging. Since we can’t usually guarantee the model is absolutely right, the worst-case performance of a model-based approach would be poor.

-

We have introduced a range of machine learning approaches by focusing on their use of mathematical functions to replace manually coded systems of rules. The important characteristic of machine learning is that the form of these functions, as dictated by their parameters, is determined by acquiring data from the real world.

+

Bayesian optimization methods use a surrogate model +(normally a specific form of regression model). They use this to predict +how the real system will perform. The surrogate model makes a prediction +(with an estimate of the uncertainty) of what the response will be to +any given input. Parameters to test are chosen by considering this +prediction. Similar to reinforcement learning, this can be viewed as a +model-based approach because the surrogate model can be seen as +a model of the real world. In bandit methods strategies are determined +without turning to a model to motivate them. They are model +free methods.
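The loop below is a hedged sketch of this surrogate-model idea (assuming scikit-learn and SciPy are available; the objective function and all settings are invented for illustration): fit a Gaussian process regression to the evaluations made so far, use an expected improvement acquisition to pick the next parameters to test, evaluate them, and repeat.

```python
import numpy as np
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

def objective(x):
    """Stand-in for the expensive process we cannot write down (invented for illustration)."""
    return -np.sin(3 * x) - x ** 2 + 0.7 * x

X_grid = np.linspace(-2, 2, 200).reshape(-1, 1)
X = np.array([[-1.5], [0.0], [1.2]])        # a few initial evaluations
y = objective(X).ravel()

for it in range(10):
    # surrogate model: a GP regression fit to the evaluations made so far
    gp = GaussianProcessRegressor(kernel=RBF(length_scale=0.5), alpha=1e-6).fit(X, y)
    mu, std = gp.predict(X_grid, return_std=True)

    # expected improvement: trade off a high predicted mean against high uncertainty
    best = y.max()
    z = (mu - best) / np.maximum(std, 1e-9)
    ei = (mu - best) * norm.cdf(z) + std * norm.pdf(z)

    x_next = X_grid[np.argmax(ei)]          # test the parameters the surrogate recommends
    X = np.vstack([X, x_next.reshape(1, 1)])
    y = np.append(y, objective(x_next))

print("best input found:", X[np.argmax(y)], "best value:", y.max())
```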

+

Model-Based and Model +Free: Performance

+

Because of their different philosophies, if a class of prediction +functions is chosen, then a model-based approach might have better +average case performance. At least in terms of data efficiency. +A model free approach may well have better worst-case performance +though, because it makes fewer assumptions about the nature of the data. +To put it another way, making assumptions about the data is helpful if +they are right: and if the model is sensible they’ll be right on +average. However, it is unhelpful if the model is wrong. Indeed, it +could be actively damaging. Since we can’t usually guarantee the model +is absolutely right, the worst-case performance of a model-based +approach would be poor.

+

We have introduced a range of machine learning approaches by focusing +on their use of mathematical functions to replace manually coded systems +of rules. The important characteristic of machine learning is that the +form of these functions, as dictated by their parameters, is determined +by acquiring data from the real world.

Deployment

-

[edit]

-

The methods we have introduced are roughly speaking introduced in order of difficulty of deployment. While supervised learning is more involved in terms of collection of data, it is the most straightforward method to deploy once that data is recovered. For this reason, a major focus with supervised learning should always be on maintaining data quality, increasing the efficiency and accountability10 of the data collection pipeline and the quality of features used.

-

You can also check my blog post on Data Readiness Levels. and my blog post on The 3Ds of Machine Learning Systems Design..

+
+[edit] +
+

The methods we have introduced are roughly speaking introduced in +order of difficulty of deployment. While supervised learning is more +involved in terms of collection of data, it is the most straightforward +method to deploy once that data is recovered. For this reason, a major +focus with supervised learning should always be on maintaining data +quality, increasing the efficiency and accountability11 +of the data collection pipeline and the quality of features used.

+

You can also check my blog post on Data +Readiness Levels and my blog post on The +3Ds of Machine Learning Systems Design.

Where to Deploy?

-

In relation to what AI can and can’t do today Andrew Ng is quoted as saying:

+

In relation to what AI can and can’t do today Andrew Ng is quoted as +saying:

-

If a typical person can do a mental task with less than one second of thought, we can probably automate it using AI either now or in the near future.11 Andrew Ng

+

If a typical person can do a mental task with less than one second of +thought, we can probably automate it using AI either now or in the near +future.12 Andrew Ng

Is this Right?

-

I would broadly agree with this quote but only in the context of supervised learning. If a human expert takes around that amount of time, then it’s also likely we can acquire the data necessary to build a supervised learning algorithm that can emulate that human’s response.

-

The picture with regard to unsupervised learning and reinforcement learning is more clouded.

-

One observation is that for supervised learning we seem to be moving beyond the era where very deep machine learning expertise is required to deploy methods. A solid understanding of machine learning (say to Masters level) is certainly required, but the quality of the final result is likely more dependent on domain expertise and the quality of the data and the information processing pipeline. This seems part of a wider trend where some of the big successes in machine learning are moving rapidly from the domain of science to that of engineering.12

-

You can check my blog post on New Directions in Kernels and Gaussian Processes..

-

So if we can only emulate tasks that humans take around a second to do, how are we managing to deliver on self driving cars? The answer is that we are constructing engineered systems from sub-components, each of which is a machine learning subsystem. But they are tied together as a component based system in line with our traditional engineering approach. This has an advantage that each component in the system can be verified before its inclusion. This is important for debugging and safety. But in practice we can expect these systems to be very brittle. A human adapts the way in which they drive the car across their lifetime. A human can react to other road users. In extreme situations, such as a car jacking, a human can set to one side normal patterns of behavior, and purposely crash their car to draw attention to the situation.

-

Supervised machine learning solutions are normally trained offline. They do not adapt when deployed because this makes them less verifiable. But this compounds the brittleness of our solutions. By deploying our solutions we actually change the environment in which they operate. Therefore, it’s important that they can be quickly updated to reflect changing circumstances. This updating happens offline. For a complex mechanical system, such as a delivery drone, extensive testing of the system may be required when any component is updated. It is therefore imperative that these data processing pipelines are well documented so that they can be redeployed on demand.

-

In practice there can be challenges with the false dichotomy between reproducibility and performance. It is likely that most of our data scientists are caring less about their ability to redeploy their pipelines and only about their ability to produce an algorithm that achieves a particular performance. A key question is how reproducible is that process? There is a false dichotomy because ensuring reproducibility will typically improve performance as it will make it easier to run a rigorous set of explorative experiments. A worry is that, currently, we do not have a way to quantify the scale of this potential problem within companies.

+

I would broadly agree with this quote but only in the context of +supervised learning. If a human expert takes around that amount of time, +then it’s also likely we can acquire the data necessary to build a +supervised learning algorithm that can emulate that human’s +response.

+

The picture with regard to unsupervised learning and reinforcement +learning is more clouded.

+

One observation is that for supervised learning we seem to +be moving beyond the era where very deep machine learning expertise is +required to deploy methods. A solid understanding of machine learning +(say to Masters level) is certainly required, but the quality of the +final result is likely more dependent on domain expertise and the +quality of the data and the information processing pipeline. This seems +part of a wider trend where some of the big successes in machine +learning are moving rapidly from the domain of science to that of +engineering.13

+

You can check my blog post on New +Directions in Kernels and Gaussian Processes.

+

So if we can only emulate tasks that humans take around a second to +do, how are we managing to deliver on self driving cars? The answer is +that we are constructing engineered systems from sub-components, each of +which is a machine learning subsystem. But they are tied together as a +component based system in line with our traditional engineering +approach. This has an advantage that each component in the system can be +verified before its inclusion. This is important for debugging and +safety. But in practice we can expect these systems to be very brittle. +A human adapts the way in which they drive the car across their +lifetime. A human can react to other road users. In extreme situations, +such as a car jacking, a human can set to one side normal patterns of +behavior, and purposely crash their car to draw attention to the +situation.

+

Supervised machine learning solutions are normally trained offline. +They do not adapt when deployed because this makes them less verifiable. +But this compounds the brittleness of our solutions. By deploying our +solutions we actually change the environment in which they operate. +Therefore, it’s important that they can be quickly updated to reflect +changing circumstances. This updating happens offline. For a complex +mechanical system, such as a delivery drone, extensive testing of the +system may be required when any component is updated. It is therefore +imperative that these data processing pipelines are well documented so +that they can be redeployed on demand.

+

In practice there can be challenges with the false dichotomy between +reproducibility and performance. It is likely that most of our data +scientists care less about their ability to redeploy their +pipelines than about their ability to produce an algorithm that +achieves a particular performance. A key question is: how reproducible is +that process? There is a false dichotomy because ensuring +reproducibility will typically improve performance as it will make it +easier to run a rigorous set of explorative experiments. A worry is +that, currently, we do not have a way to quantify the scale of this +potential problem within companies.

Model Choice

-

Common to all machine learning methods is the initial choice of useful classes of functions. The deep learning revolution is associated with a particular class of mathematical functions that is proving very successful in what were seen to be challenging domains: speech, vision, language. This has meant that significant advances in problems that have been seen as hard have occurred in artificial intelligence.

+

Common to all machine learning methods is the initial choice of +useful classes of functions. The deep learning revolution is associated +with a particular class of mathematical functions that is proving very +successful in what were seen to be challenging domains: speech, vision, +language. This has meant that significant advances in problems that have +been seen as hard have occurred in artificial intelligence.

Thanks!

-

For more information on these subjects and more you might want to check the following resources.

+

For more information on these subjects and more you might want to +check the following resources.

References

-
-
-

Andrade-Pacheco, Ricardo, Martin Mubangizi, John Quinn, and Neil D. Lawrence. 2014. “Consistent Mapping of Government Malaria Records Across a Changing Territory Delimitation.” Malaria Journal 13 (Suppl 1). https://doi.org/10.1186/1475-2875-13-S1-P5.

-
-
-

Cooper, Brian. 1991. Transformation of a Valley: Derbyshire Derwent. Scarthin Books.

-
-
-

Gelman, Andrew, John B. Carlin, Hal S. Stern, and Donald B. Rubin. 2013. Bayesian Data Analysis. 3rd ed. Chapman; Hall.

-
-
-

Gething, Peter W., Abdisalan M. Noor, Priscilla W. Gikandi, Esther A. A. Ogara, Simon I. Hay, Mark S. Nixon, Robert W. Snow, and Peter M. Atkinson. 2006. “Improving Imperfect Data from Health Management Information Systems in Africa Using Space–Time Geostatistics.” PLoS Medicine 3 (6). https://doi.org/10.1371/journal.pmed.0030271.

-
-
-

Lawrence, Neil D. 2015. “How Africa Can Benefit from the Data Revolution.” The Guardian Media & Tech Network. https://www.theguardian.com/media-network/2015/aug/25/africa-benefit-data-science-information.

-
-
-

McCulloch, Warren S., and Walter Pitts. 1943. “A Logical Calculus of the Ideas Immanent in Nervous Activity.” Bulletin of Mathematical Biophysics 5: 115–33.

-
-
-

Mubangizi, Martin, Ricardo Andrade-Pacheco, Michael Thomas Smith, John Quinn, and Neil D. Lawrence. 2014. “Malaria Surveillance with Multiple Data Sources Using Gaussian Process Models.” In 1st International Conference on the Use of Mobile ICT in Africa.

-
-
-

Robbins, H., and S. Monro. 1951. “A Stochastic Approximation Method.” Annals of Mathematical Statistics 22: 400–407.

-
-
-

Taigman, Yaniv, Ming Yang, Marc’Aurelio Ranzato, and Lior Wolf. 2014. “DeepFace: Closing the Gap to Human-Level Performance in Face Verification.” In Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition. https://doi.org/10.1109/CVPR.2014.220.

-
-
-
+
+
+Andrade-Pacheco, R., Mubangizi, M., Quinn, J., Lawrence, N.D., 2014. +Consistent mapping of government malaria records across a changing +territory delimitation. Malaria Journal 13. https://doi.org/10.1186/1475-2875-13-S1-P5 +
+
+Breiman, L., 2001. Random forests. Mach. Learn. 45, 5–32. https://doi.org/10.1023/A:1010933404324 +
+
+Cooper, B., 1991. Transformation of a valley: Derbyshire derwent. +Scarthin Books. +
+
+Efron, B., 1979. Bootstrap methods: Another look at the jackknife. Annals +of Statistics 7, 1–26. +
+
+Gelman, A., Carlin, J.B., Stern, H.S., Dunson, D.B., Vehtari, A., Rubin, +D.B., 2013. Bayesian data analysis, 3rd ed. Chapman; Hall. +
+
+Gething, P.W., Noor, A.M., Gikandi, P.W., Ogara, E.A.A., Hay, S.I., +Nixon, M.S., Snow, R.W., Atkinson, P.M., 2006. Improving imperfect data +from health management information systems in Africa using +space–time geostatistics. PLoS Medicine 3. https://doi.org/10.1371/journal.pmed.0030271 +
+
+Lawrence, N.D., 2015. How +Africa can benefit from the data revolution. +
+
+McCullagh, P., Nelder, J.A., 1989. Generalized linear models, 2nd ed. +Chapman; Hall. +
+
+McCulloch, W.S., Pitts, W., 1943. A logical calculus of the ideas +immanent in nervous activity. Bulletin of Mathematical Biophysics 5, +115–133. https://doi.org/10.1007/BF02478259 +
+
+Mubangizi, M., Andrade-Pacheco, R., Smith, M.T., Quinn, J., Lawrence, +N.D., 2014. Malaria surveillance with multiple data sources using +Gaussian process models, in: 1st International Conference +on the Use of Mobile ICT in Africa. +
+
+Ng, A.Y., Jordan, M.I., Weiss, Y., n.d. On spectral clustering: Analysis +and an algorithm. +
+
+Robbins, H., Monro, S., 1951. A stochastic approximation method. Annals +of Mathematical Statistics 22, 400–407. +
+
+Shi, J., Malik, J., 2000. Normalized cuts and image segmentation. IEEE +Transactions on Pattern Analysis and Machine Intelligence 22, 888–905. +
+
+Taigman, Y., Yang, M., Ranzato, M., Wolf, L., 2014. +DeepFace: Closing the gap to human-level performance in +face verification, in: Proceedings of the IEEE Computer +Society Conference on Computer Vision and Pattern Recognition. https://doi.org/10.1109/CVPR.2014.220 +
+
+The Office of the Senior Special Assistant to the President on the +Millennium Development Goals (OSSAP-MDGs), Columbia University, 2014. +Nigeria NMIS facility database. +
+
+
+ diff --git a/_lectures/02-ml-systems.html b/_lectures/02-ml-systems.html index 4b15b07..6aa0e85 100644 --- a/_lectures/02-ml-systems.html +++ b/_lectures/02-ml-systems.html @@ -1,7 +1,14 @@ --- title: "Introduction to Machine Learning Systems" venue: "Virtual DSA" -abstract: "This notebook introduces some of the challenges of building machine learning data systems. It will introduce you to concepts around joining of databases together. The storage and manipulation of data is at the core of machine learning systems and data science. The goal of this notebook is to introduce the reader to these concepts, not to authoritatively answer any questions about the state of Nigerian health facilities or Covid19, but it may give you ideas about how to try and do that in your own country." +abstract: "This notebook introduces some of the challenges of building +machine learning data systems. It will introduce you to concepts around +joining of databases together. The storage and manipulation of data is +at the core of machine learning systems and data science. The goal of +this notebook is to introduce the reader to these concepts, not to +authoritatively answer any questions about the state of Nigerian health +facilities or Covid19, but it may give you ideas about how to try and do +that in your own country." author: - given: Eric family: Meissner @@ -9,27 +16,29 @@ institute: twitter: meissner_eric_7 gscholar: - orchid: + orcid: - given: Andrei family: Paleyes url: https://www.linkedin.com/in/andreipaleyes/ institute: twitter: gscholar: - orchid: + orcid: - given: Neil D. family: Lawrence url: http://inverseprobability.com institute: twitter: lawrennd gscholar: - orchid: + orcid: +edit_url: https://github.com/mlatcl/dsa/edit/gh-pages/_lamd/ml-systems.md date: 2020-07-24 published: 2020-07-24 -week: 0 session: 2 reveal: 02-ml-systems.slides.html +transition: None ipynb: 02-ml-systems.ipynb +pptx: 02-ml-systems.pptx layout: lecture categories: - notes @@ -45,253 +54,953 @@ -

Question

-

In this notebook, we explore the question of health facility distribution in Nigeria, spatially, and in relation to population density.

-

We answer and visualize the question “How does the number of health facilities per capita vary across Nigeria?”

-

Rather than focussing purely on using tools like pandas to manipulate the data, our focus will be on introducing some concepts from databases.

-

Machine learning can be summarized as
$$ -\text{model} + \text{data} \xrightarrow{\text{compute}} \text{prediction} -$$
and many machine learning courses focus a lot on the model part. But to build a machine learning system in practice, a lot of work has to be put into the data part. This notebook gives some pointers on that work and how to think about your machine learning systems design.

+

Nigerian Health Facility +Distribution

+

In this notebook, we explore the question of health facility +distribution in Nigeria, spatially, and in relation to population +density.

+

We explore and visualize the question “How does the number of health +facilities per capita vary across Nigeria?”

+

Rather than focussing purely on using tools like pandas +to manipulate the data, our focus will be on introducing some concepts +from databases.

+

Machine learning can be summarized as \[ \text{model} + \text{data} \xrightarrow{\text{compute}} \text{prediction} \] and many machine learning courses focus a lot on the model part. But to build a machine learning system in practice, a lot of work must be put into the data part. This notebook gives some pointers on that work and how to think about your machine learning systems design.
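As a minimal illustration of this formula (a sketch, not part of the original notebook, with made-up parameters), the ‘model’ could be as simple as a parameterised function, the ‘data’ some inputs, and ‘compute’ the code that combines them into a prediction:

import numpy as np

w, b = 2.0, 1.0                  # 'model': a linear function with illustrative parameters
x = np.array([1.0, 2.0, 3.0])    # 'data': some input values

# 'compute': combining model and data gives the prediction.
prediction = w * x + b
print(prediction)                # [3. 5. 7.]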

Datasets

-

In this notebook , we download 4 datasets:

+

In this notebook, we download 4 datasets:

  • Nigeria NMIS health facility data
  • -
  • Population data for Administrative Zone 1 (states) areas in Nigeria
  • +
  • Population data for Administrative Zone 1 (states) areas in +Nigeria
  • Map boundaries for Nigerian states (for plotting and binning)
  • Covid cases across Nigeria (as of May 20, 2020)
-

But joining these data sets together is just an example. As another example, you could think of SafeBoda, a ride-hailing app that’s available in Lagos and Kampala. As well as looking at the health examples, try to imagine how SafeBoda may have had to design their systems to be scalable and reliable for storing and sharing data.

-

Imports, Installs, and Downloads

-

[edit]

-

First, we’re going to download some particular python libraries for dealing with geospatial data. We’re dowloading geopandas which will help us deal with ‘shape files’ that give the geographical lay out of Nigeria. And to get a small database set up running quickly, we’re installing csv-to-sqlite which allows us to convert CSV data to a simple database.

-
%pip install geopandas
+

But joining these data sets together is just an example. As another +example, you could think of SafeBoda, a ride-hailing app that’s +available in Lagos and Kampala. As well as looking at the health +examples, try to imagine how SafeBoda may have had to design their +systems to be scalable and reliable for storing and sharing data.

+

Imports, Installs, and +Downloads

+
+[edit] +
+

First, we’re going to download some particular python libraries for dealing with geospatial data. We’re downloading geopandas, which will help us deal with ‘shape files’ that give the geographical layout of Nigeria. We also need pygeos for indexing.

+
%pip install geopandas
+
%pip install pygeos
+

Setup

+
+[edit] +
+ +

notutils

+
+[edit] +
+

This small package is a helper package for various notebook utilities +used below.

+

The software can be installed using

+
%pip install notutils
+

from the command prompt where you can access your python +installation.

+

The code is also available on GitHub: https://github.com/lawrennd/notutils

+

Once notutils is installed, it can be imported in the +usual manner.

+
import notutils
+

pods

+
+[edit] +
+

In Sheffield we created a suite of software tools for ‘Open Data +Science’. Open data science is an approach to sharing code, models and +data that should make it easier for companies, health professionals and +scientists to gain access to data science techniques.

+

You can also check this blog post on Open +Data Science.

+

The software can be installed using

+
%pip install pods
+

from the command prompt where you can access your python +installation.

+

The code is also available on GitHub: https://github.com/lawrennd/ods

+

Once pods is installed, it can be imported in the usual +manner.

+
import pods
+

mlai

+
+[edit] +
+

The mlai software is a suite of helper functions for +teaching and demonstrating machine learning algorithms. It was first +used in the Machine Learning and Adaptive Intelligence course in +Sheffield in 2013.

+

The software can be installed using

+
%pip install mlai
+

from the command prompt where you can access your python +installation.

+

The code is also available on GitHub: https://github.com/lawrennd/mlai

+

Once mlai is installed, it can be imported in the usual +manner.

+
import mlai

Databases and Joins

-

[edit]

-

The main idea we will be working with today is called the ‘join’. A join does exactly what it sounds like, it combines two database tables.

-

You have already started to look at data structures, in particular you have been learning about pandas which is a great way of storing and structuring your data set to make it easier to plot and manipulate your data.

-

Pandas is great for the data scientist to analyze data because it makes many operations easier. But it is not so good for building the machine learning system. In a machine learning system, you may have to handle a lot of data. Even if you start with building a system where you only have a few customers, perhaps you build an online taxi system (like SafeBoda) for Kampala. Maybe you will have 50 customers. Then maybe your system can be handled with some python scripts and pandas.

+
+[edit] +
+

The main idea we will be working with in this practical is the ‘join’. A join does exactly what it sounds like: it combines two database tables.

+

You may have already started to look at data structures and to learn about pandas, which is a great way of storing and structuring your data set to make it easier to plot and manipulate your data.

+

Pandas is great for the data scientist to analyze data because it +makes many operations easier. But it is not so good for building the +machine learning system. In a machine learning system, you may have to +handle a lot of data. Even if you start with building a system where you +only have a few customers, perhaps you build an online taxi system (like +SafeBoda) for Kampala. Maybe you +will have 50 customers. Then maybe your system can be handled with some +python scripts and pandas.

Scaling ML Systems

-

But what if you are succesful? What if everyone in Kampala wants to use your system? There are 1.5 million people in Kampala and maybe 100,000 Boda Boda drivers.

-

What if you are even more succesful? What if everyone in Lagos wants to use your system? There are around 20 million people in Lagos … and maybe as many Okada drivers as people in Kampala!

-

We want to build safe and reliable machine learning systems. Building them from pandas and python is about as safe and reliable as taking six children to school on a boda boda.

-

To build a reliable system, we need to turn to databases. In this notebook we’ll be focussing on SQL databases and how you bring together different streams of data in a Machine Learning System.

-

In a machine learning system, you will need to bring different data sets together. In database terminology this is known as a ‘join’. You have two different data sets, and you want to join them together. Just like you can join two pieces of metal using a welder, or two pieces of wood with screws.

-

But instead of using a welder or screws to join data, we join it using particular columns of the data. We can join data together using people’s names. One database may contain where people live, another database may contain where they go to school. If we join these two databases we can have a database which shows where people live and where they got to school.

-

In the notebook, we will join together some data about where the health centres are in Nigeria and where the have been cases of Covid19. There are other challenges in the ML System Design that are not going to be covered here. They include: how to update the data bases, and how to control access to the data bases from different users (boda boda drivers, riders, administrators etc).

-

Hospital Data

-

[edit]

-

The first and primary dataset we use is the NMIS health facility dataset, which contains data on the location, type, and staffing of health facilities across Nigeria.

-
import urllib.request
-import pandas as pd
-
urllib.request.urlretrieve('https://energydata.info/dataset/f85d1796-e7f2-4630-be84-79420174e3bd/resource/6e640a13-cab4-457b-b9e6-0336051bac27/download/healthmopupandbaselinenmisfacility.csv', 'healthmopupandbaselinenmisfacility.csv')
-hospital_data = pd.read_csv('healthmopupandbaselinenmisfacility.csv')
-

It’s always a good idea to inspect your data once it’s downloaded to check it contains what you expect. In pandas you can do this with the .head() method. That allows us to see the first few entries of the pandas data structure.

-
hospital_data.head()
-

We can also check in pandas what the different columns of the data frame are to see what it contains.

-
hospital_data.columns
-

We can immiediately see that there are facility names, dates, and some characteristics of each health center such as number of doctors etc. As well as all that, we have two fields, latitude and longitude that likely give us the hospital locaiton. Let’s plot them to have a look.

-
import matplotlib.pyplot as plt
-
plt.plot(hospital_data.longitude, hospital_data.latitude,'ro', alpha=0.01)
-

There we have the location of these different hospitals. We set alpha in the plot to 0.01 to make the dots transparent, so we can see the locations of each health center.

+

But what if you are successful? What if everyone in Kampala wants to +use your system? There are 1.5 million people in Kampala and maybe +100,000 Boda Boda drivers.1

+

What if you are even more successful? What if everyone in Lagos wants to use your system? There are around 20 million people in Lagos … and maybe as many Okada drivers as people in Kampala!

+

(In Lagos the Boda Boda is called an Okada.)

+

We want to build safe and reliable machine learning systems. Building +them from pandas and python is about as safe and reliable +as taking +six children to school on a boda boda.

+

To build a reliable system, we need to turn to databases. In +this notebook we’ll +be focusing on SQL databases and how you bring together different +streams of data in a Machine Learning System.

+

In a machine learning system, you will need to bring different data +sets together. In database terminology this is known as a ‘join’. You +have two different data sets, and you want to join them together. Just +like you can join two pieces of metal using a welder, or two pieces of +wood with screws.

+

But instead of using a welder or screws to join data, we join it +using columns of the data. We can join data together using people’s +names. One database may contain where people live, another database may +contain where they go to school. If we join these two databases, we can +have a database which shows where people live and where they got to +school.
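As a toy illustration of this idea (a sketch with made-up names, not part of the original notebook), two small pandas data frames can be joined on a shared name column:

import pandas as pd

# Two tiny, made-up tables that share a 'name' column.
lives = pd.DataFrame({'name': ['Amina', 'Bola', 'Chidi'],
                      'city': ['Kano', 'Lagos', 'Abuja']})
schools = pd.DataFrame({'name': ['Amina', 'Bola', 'Chidi'],
                        'school': ['Kano High', 'Lagos Grammar', 'Abuja College']})

# The join (a 'merge' in pandas) matches rows with the same name, giving one
# table with both where people live and where they go to school.
print(pd.merge(lives, schools, on='name'))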

+

In the notebook, we will join some data about where the health +centers are in Nigeria with data about where there have been cases of +Covid19. There are other challenges in the ML System Design that are not +going to be covered here. They include how to update the databases and +how to control access to the databases from different users (boda boda +drivers, riders, administrators etc).

+

Nigeria NMIS Data

+
+[edit] +
+

As an example data set we will use the Nigerian Millennium Development Goals Information System Health Facility data (The Office of the Senior Special Assistant to the President on the Millennium Development Goals (OSSAP-MDGs) and Columbia University, 2014). It can be found at https://energydata.info/dataset/nigeria-nmis-education-facility-data-2014.

+

Taking from the information on the site,

+
+

The Nigeria MDG (Millennium Development Goals) Information System – +NMIS health facility data is collected by the Office of the Senior +Special Assistant to the President on the Millennium Development Goals +(OSSAP-MDGs) in partner with the Sustainable Engineering Lab at Columbia +University. A rigorous, geo-referenced baseline facility inventory +across Nigeria is created spanning from 2009 to 2011 with an additional +survey effort to increase coverage in 2014, to build Nigeria’s first +nation-wide inventory of health facility. The database includes 34,139 +health facilities info in Nigeria.

+

The goal of this database is to make the data collected available to +planners, government officials, and the public, to be used to make +strategic decisions for planning relevant interventions.

+

For data inquiry, please contact Ms. Funlola Osinupebi, Performance +Monitoring & Communications, Advisory Power Team, Office of the Vice +President at funlola.osinupebi@aptovp.org

+

To learn more, please visit http://csd.columbia.edu/2014/03/10/the-nigeria-mdg-information-system-nmis-takes-open-data-further/

+

Suggested citation: Nigeria NMIS facility database (2014), the Office +of the Senior Special Assistant to the President on the Millennium +Development Goals (OSSAP-MDGs) & Columbia University

+
+

For ease of use we’ve packaged this data set in the pods +library

+
data = pods.datasets.nigeria_nmis()['Y']
+data.head()
+

Alternatively, you can access the data directly with the following +commands.

+
import urllib.request
+urllib.request.urlretrieve('https://energydata.info/dataset/f85d1796-e7f2-4630-be84-79420174e3bd/resource/6e640a13-cab4-457b-b9e6-0336051bac27/download/healthmopupandbaselinenmisfacility.csv', 'healthmopupandbaselinenmisfacility.csv')
+
+import pandas as pd
+data = pd.read_csv('healthmopupandbaselinenmisfacility.csv')
+

Once it is loaded in, the data can be summarized using the describe method in pandas.

+
data.describe()
+

We can also find out the dimensions of the dataset using the +shape property.

+
data.shape
+

Dataframes have different functions that you can use to explore and understand your data. In python and the Jupyter notebook it is possible to see a list of all possible functions and attributes by typing the name of the object followed by .<Tab>. For example, in the above case if we type data.<Tab> it shows the columns available (these are attributes in pandas dataframes), such as num_nurses_fulltime, and also functions, such as .describe().

+

For functions we can also see the documentation about the function by +following the name with a question mark. This will open a box with +documentation at the bottom which can be closed with the x button.

+
data.describe?
+
+
+
+ +
+
+
+ +
+
+

Figure: Location of the over thirty-four thousand health facilities +registered in the NMIS data across Nigeria. Each facility plotted +according to its latitude and longitude.

+
+
+
hospital_data = data

Administrative Zone Geo Data

-

[edit]

-

A very common operation is the need to map from locations in a country to the administrative regions. If we were building a ride sharing app, we might also want to map riders to locations in the city, so that we could know how many riders we had in different city areas.

-

Administrative regions have various names like cities, counties, districts or states. These conversions for the administrative regions are important for getting the right information to the right people.

-

Of course, if we had a knowlegdeable Nigerian, we could ask her about what the right location for each of these health facilities is, which state is it in? But given that we have the latitude and longitude, we should be able to find out automatically what the different states are.

-

This is where “geo” data becomes important. We need to download a dataset that stores the location of the different states in Nigeria. These files are known as ‘outline’ files. Because they draw the different states of different countries in outline.

-

There are special databases for storing this type of information, the database we are using is in the gdb or GeoDataBase format. It comes in a zip file. Let’s download the outline files for the Nigerian states. They have been made available by the Humanitarian Data Exchange, you can also find other states data from the same site.

-
import zipfile
-
admin_zones_url = 'https://data.humdata.org/dataset/81ac1d38-f603-4a98-804d-325c658599a3/resource/0bc2f7bb-9ff6-40db-a569-1989b8ffd3bc/download/nga_admbnda_osgof_eha_itos.gdb.zip'
-_, msg = urllib.request.urlretrieve(admin_zones_url, 'nga_admbnda_osgof_eha_itos.gdb.zip')
-with zipfile.ZipFile('/content/nga_admbnda_osgof_eha_itos.gdb.zip', 'r') as zip_ref:
-    zip_ref.extractall('/content/nga_admbnda_osgof_eha_itos.gdb')
-

Now we have this data of the outlines of the different states in Nigeria.

-

The next thing we need to know is how these health facilities map onto different states in Nigeria. Without “binning” facilities somehow, it’s difficult to effectively see how they are distributed across the country.

-

We do this by finding a “geo” dataset that contains the spatial outlay of Nigerian states by latitude/longitude coordinates. The dataset we use is of the “gdb” (GeoDataBase) type and comes as a zip file. We don’t need to worry much about this datatype for this notebook, only noting that geopandas knows how to load in the dataset, and that it contains different “layers” for the same map. In this case, each layer is a different degree of granularity of the political boundaries, with layer 0 being the whole country, 1 is by state, or 2 is by local government. We’ll go with a state level view for simplicity, but as an excercise you can change it to layer 2 to view the distribution by local government.

-

Once we have these MultiPolygon objects that define the boundaries of different states, we can perform a spatial join (sjoin) from the coordinates of individual health facilities (which we already converted to the appropriate Point type when moving the health data to a GeoDataFrame.)

+
+[edit] +
+

A very common operation is the need to map from locations in a +country to the administrative regions. If we were building a ride +sharing app, we might also want to map riders to locations in the city, +so that we could know how many riders we had in different city +areas.

+

Administrative regions have various names like cities, counties, +districts, or states. These conversions for the administrative regions +are important for getting the right information to the right people.

+

Of course, if we had a knowledgeable Nigerian, we could ask her about +what the right location for each of these health facilities is, which +state is it in? But given that we have the latitude and longitude, we +should be able to find out automatically what the different states +are.

+

This is where “geo” data becomes important. We need to download a dataset that stores the location of the different states in Nigeria. These files are known as ‘outline’ files, because they draw the different states of different countries in outline.

+

There are special databases for storing this type of information; the database we are using is in the gdb or GeoDataBase format. It comes in a zip file. Let’s download the outline files for the Nigerian states. They have been made available by the Humanitarian Data Exchange; you can also find other states data from the same site.

+

Nigerian Administrative +Zones Data

+
+[edit] +
+

For ease of use we’ve packaged this data set in the pods +library

+
data = pods.datasets.nigerian_administrative_zones()['Y']
+data.set_index("admin1Name_en", inplace=True)
+data.head()
+

Alternatively you can access the data directly with the following +commands.

+
import zipfile
+
+admin_zones_url = 'https://data.humdata.org/dataset/81ac1d38-f603-4a98-804d-325c658599a3/resource/0bc2f7bb-9ff6-40db-a569-1989b8ffd3bc/download/nga_admbnda_osgof_eha_itos.gdb.zip'
+_, msg = urllib.request.urlretrieve(admin_zones_url, 'nga_admbnda_osgof_eha_itos.gdb.zip')
+with zipfile.ZipFile('/content/nga_admbnda_osgof_eha_itos.gdb.zip', 'r') as zip_ref:
+    zip_ref.extractall('/content/nga_admbnda_osgof_eha_itos.gdb')
+
+import geopandas as gpd
+import fiona
+
+states_file = "./nga_admbnda_osgof_eha_itos.gdb/nga_admbnda_osgof_eha_itos.gdb/nga_admbnda_osgof_eha_itos.gdb/nga_admbnda_osgof_eha_itos.gdb/"
+
+layers = fiona.listlayers(states_file)
+data = gpd.read_file(states_file, layer=1)
+data.crs = "EPSG:4326"
+data = data.set_index('admin1Name_en')
+    
+
+
+ + +
+
+ +
+
+

Figure: Border locations for the thirty-six different states of +Nigeria.

+
+
+
zones_gdf = data
+zones_gdf['admin1Name_en'] = zones_gdf.index
+

Now we have this data of the outlines of the different states in +Nigeria.

+

The next thing we need to know is how these health facilities map +onto different states in Nigeria. Without “binning” facilities somehow, +it’s difficult to effectively see how they are distributed across the +country.

+

We do this by finding a “geo” dataset that contains the spatial layout of Nigerian states by latitude/longitude coordinates. The dataset we use is of the “gdb” (GeoDataBase) type and comes as a zip file. We don’t need to worry much about this datatype for this notebook, only noting that geopandas knows how to load in the dataset, and that it contains different “layers” for the same map. In this case, each layer is a different degree of granularity of the political boundaries, with layer 0 being the whole country, 1 being by state, and 2 being by local government. We’ll go with a state level view for simplicity, but as an exercise you can change it to layer 2 to view the distribution by local government (a sketch of this is given below).
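As a sketch of that exercise (assuming the states_file path defined above, and that layer 2 of this GeoDataBase does hold the local government boundaries):

import geopandas as gpd

# Read layer 2 (assumed to be local government areas) instead of layer 1 (states).
lga_gdf = gpd.read_file(states_file, layer=2)
lga_gdf.crs = "EPSG:4326"
lga_gdf.head()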

+

Once we have these MultiPolygon objects that define the +boundaries of different states, we can perform a spatial join (sjoin) +from the coordinates of individual health facilities (which we already +converted to the appropriate Point type when moving the +health data to a GeoDataFrame.)

Joining a GeoDataFrame

-

The first database join we’re going to do is a special one, it’s a ‘spatial join’. We’re going to join together the locations of the hospitals with their states.

-

This join is unusual because it requires some mathematics to get right. The outline files give us the borders of the different states in latitude and longitude, the health facilities have given locations in the country.

-

A spatial join involves finding out which state each health facility belongs to. Fortunately, the mathematics you need is already programmed for you in GeoPandas. That means all we need to do is convert our pandas dataframe of health facilities into a GeoDataFrame which allows us to do the spatial join.

-
import geopandas as gpd
-
hosp_gdf = gpd.GeoDataFrame(
-    hospital_data, geometry=gpd.points_from_xy(hospital_data.longitude, hospital_data.latitude))
-hosp_gdf.crs = "EPSG:4326"
-

There are some technial details here: the crs refers to the coordinate system in use by a particular GeoDataFrame. EPSG:4326 is the standard coordinate system of latitude/longitude.

-

Your First Join: Converting GPS Coordinates to States

-

Now we have the data in the GeoPandas format, we can start converting into states. We will use the fiona library for reading the right layers from the files. Before we do the join, lets plot the location of health centers and states on the same map.

-
import fiona
-
states_file = "/content/nga_admbnda_osgof_eha_itos.gdb/nga_admbnda_osgof_eha_itos.gdb/nga_admbnda_osgof_eha_itos.gdb/nga_admbnda_osgof_eha_itos.gdb/"
-
-# geopandas included map, filtered to just Nigeria
-world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
-world.crs = "EPSG:4326"
-nigeria = world[(world['name'] == 'Nigeria')]
-base = nigeria.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))
-
-layers = fiona.listlayers(states_file)
-zones_gdf = gpd.read_file(states_file, layer=1)
-zones_gdf.crs = "EPSG:4326"
-zones_gdf = zones_gdf.set_index('admin1Name_en')
-zones_gdf.plot(ax=base, color='white', edgecolor='black')
-
-# We can now plot our ``GeoDataFrame``.
-hosp_gdf.plot(ax=base, color='b', alpha=0.02, )
-
-plt.show()
+

The first database join we’re going to do is a special one, it’s a +‘spatial join’. We’re going to join the locations of the hospitals with +their states.

+

This join is unusual because it requires some mathematics to get +right. The outline files give us the borders of the different states in +latitude and longitude, the health facilities have given locations in +the country.

+

A spatial join involves finding out which state each health facility +belongs to. Fortunately, the mathematics you need is already programmed +for you in GeoPandas. That means all we need to do is convert our +pandas dataframe of health facilities into a +GeoDataFrame which allows us to do the spatial join.

+

First, we convert the hospital data to a geopandas data +frame.

+
import geopandas as gpd
+
geometry = gpd.points_from_xy(hospital_data.longitude, hospital_data.latitude)
+hosp_gdf = gpd.GeoDataFrame(hospital_data, 
+                            geometry=geometry)
+hosp_gdf.crs = "EPSG:4326"
+

There are some technical details here: the crs refers to the coordinate system in use by a particular GeoDataFrame. EPSG:4326 is the standard coordinate system of latitude/longitude.

+

Your First +Join: Converting GPS Coordinates to States

+

Now that we have the data in the GeoPandas format, we can start converting locations into states. We will use the fiona library for reading the right layers from the files. Before we do the join, let’s plot the locations of health centers and states on the same map.

+
world_gdf = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
+world_gdf.crs = "EPSG:4326"
+nigeria_gdf = world_gdf[(world_gdf['name'] == 'Nigeria')]
+
+
+ + +
+
+ +
+
+

Figure: The outline of the thirty-six different states of Nigeria with the locations of the health centers plotted on the map.

+
+

Performing the Spatial Join

-

We’ve now plotted the different health center locations across the states. You can clearly see that each of the dots falls within a different state. For helping the visualisation, we’ve made the dots somewhat transparent (we set the alpha in the plot). This means that we can see the regions where there are more health centers, you should be able to spot where the major cities in Nigeria are given the increased number of health centers in those regions.

-

Of course, we can now see by eye, which of the states each of the health centers belongs to. But we want the computer to do our join for us. GeoPandas provides us with the spatial join. Here we’re going to do a left or outer join.

-
from geopandas.tools import sjoin
-

We have two GeoPandas data frames, hosp_gdf and zones_gdf. Let’s have a look at the columns the contain.

-
hosp_gdf.columns
-

We can see that this is the GeoDataFrame containing the information about the hospital. Now let’s have a look at the zones_gdf data frame.

-
zones_gdf.columns
-

You can see that this data frame has a different set of columns. It has all the different administrative regions. But there is one column name that overlaps. We can find it by looking for the intersection between the two sets.

-
set(hosp_gdf.columns).intersection(set(zones_gdf.columns))
-

Here we’ve converted the lists of columns into python ‘sets’, and then looked for the intersection. The join will occur on the intersection between these columns. It will try and match the geometry of the hospitals (their location) to the geometry of the states (their outlines). This match is done in one line in GeoPandas.

-

We’re having to use GeoPandas because this join is a special one based on geographical locations, if the join was on customer name or some other discrete variable, we could do the join in pandas or directly in SQL.

-
hosp_state_joined = sjoin(hosp_gdf, zones_gdf, how='left')
-

The intersection of the two data frames indicates how the two data frames will be joined (if there’s no intersection, they can’t be joined). It’s like indicating the two holes that would need to be bolted together on two pieces of metal. If the holes don’t match, the join can’t be done. There has to be an intersection.

-

But what will the result look like? Well the join should be the ‘union’ of the two data frames. We can have a look at what the union should be by (again) converting the columns to sets.

-
set(hosp_gdf.columns).union(set(zones_gdf.columns))
-

That gives a list of all the columns (notice that ‘geometry’ only appears once).

-

Let’s check that’s what the join command did, by looking at the columns of our new data frame, hosp_state_joined. Notice also that there’s a new column: index_right. The two original data bases had separate indices. The index_right column represents the index from the zones_gdf, which is the Nigerian state.

-
set(hosp_state_joined.columns)
-

Great! They are all there! We have completed our join. We had two separate data frames with information about states and information about hospitals. But by performing an ‘outer’ or a ‘left’ join, we now have a single data frame with all the information in the same place! Let’s have a look at the first frew entries in the new data frame.

-
hosp_state_joined.head()
+

We’ve now plotted the different health center locations across the states. You can clearly see that each of the dots falls within a state. To help with the visualization, we’ve made the dots somewhat transparent (we set the alpha in the plot). This means that we can see the regions where there are more health centers; you should be able to spot where the major cities in Nigeria are, given the increased number of health centers in those regions.

+

Of course, we can now see by eye, which of the states each of the +health centers belongs to. But we want the computer to do our join for +us. GeoPandas provides us with the spatial join. Here we’re +going to do a left +or outer join.

+
from geopandas.tools import sjoin
+

We have two GeoPandas data frames, hosp_gdf and zones_gdf. Let’s have a look at the columns they contain.

+
hosp_gdf.columns
+

We can see that this is the GeoDataFrame containing the information +about the hospital. Now let’s have a look at the zones_gdf +data frame.

+
zones_gdf.columns
+

You can see that this data frame has a different set of columns. It +has all the different administrative regions. But there is one column +name that overlaps. We can find it by looking for the intersection +between the two sets.

+
set(hosp_gdf.columns).intersection(set(zones_gdf.columns))
+

Here we’ve converted the lists of columns into python ‘sets’, and +then looked for the intersection. The join will occur on the +intersection between these columns. It will try and match the geometry +of the hospitals (their location) to the geometry of the states (their +outlines). This match is done in one line in GeoPandas.

+

We’re having to use GeoPandas because this join is a special one +based on geographical locations, if the join was on customer name or +some other discrete variable, we could do the join in pandas or directly +in SQL.

+
hosp_state_joined = sjoin(hosp_gdf, zones_gdf, how='left')
+

The intersection of the two data frames indicates how the two data +frames will be joined (if there’s no intersection, they can’t be +joined). It’s like indicating the two holes that would need to be bolted +together on two pieces of metal. If the holes don’t match, the join +can’t be done. There has to be an intersection.

+

But what will the result look like? Well, the join should be the +‘union’ of the two data frames. We can have a look at what the union +should be by (again) converting the columns to sets.

+
set(hosp_gdf.columns).union(set(zones_gdf.columns))
+

That gives a list of all the columns (notice that ‘geometry’ only +appears once).

+

Let’s check that’s what the join command did, by looking at the +columns of our new data frame, hosp_state_joined. Notice +also that there’s a new column: index_right. The two +original data bases had separate indices. The index_right +column represents the index from the zones_gdf, which is +the Nigerian state.

+
set(hosp_state_joined.columns)
+

Great! They are all there! We have completed our join. We had two separate data frames with information about states and information about hospitals. But by performing an ‘outer’ or a ‘left’ join, we now have a single data frame with all the information in the same place! Let’s have a look at the first few entries in the new data frame.

+
hosp_state_joined.head()

SQL Database

-

[edit]

-

Our first join was a special one, because it involved spatial data. That meant using the special gdb format and the GeoPandas tool for manipulating that data. But we’ve now saved our updated data in a new file.

-

To do this, we use the command line utility that comes standard for SQLite database creation. SQLite is a simple database that’s useful for playing with database commands on your local machine. For a real system, you would need to set up a server to run the database. The server is a separate machine with the job of answering database queries. SQLite pretends to be a proper database, but doesn’t require us to go to the extra work of setting up a server. Popular SQL server software includes MySQL which is free or Microsoft’s SQL Server.

-

A typical machine learning installation might have you running a database from a cloud service (such as AWS, Azure or Google Cloud Platform). That cloud service would host the database for you and you would pay according to the number of queries made.

-

Many start-up companies were formed on the back of a MySQL server hosted on top of AWS. You can read how to do that here.

-

If you were designing your own ride hailing app, or any other major commercial software you would want to investigate whether you would need to set up a central SQL server in one of these frameworks.

-

Today though, we’ll just stick to SQLite which gives you a sense of the database without the time and expense of setting it up on the cloud. As well as showing you the SQL commands (which is often what’s used in a production ML system) we’ll also give the equivalent pandas commands, which would often be what you would use when you’re doing data analysis in python and Jupyter.

+
+[edit] +
+

Our first join was a special one, because it involved spatial data. +That meant using the special gdb format and the +GeoPandas tool for manipulating that data. But we’ve now +saved our updated data in a new file.

+

To do this, we use the command line utility that comes standard for +SQLite database creation. SQLite is a simple database that’s useful for +playing with database commands on your local machine. For a real system, +you would need to set up a server to run the database. The server is a +separate machine with the job of answering database queries. SQLite +pretends to be a proper database but doesn’t require us to go to the +extra work of setting up a server. Popular SQL server software includes +MariaDB which is open +source, or Microsoft’s +SQL Server.

+

A typical machine learning installation might have you running a +database from a cloud service (such as AWS, Azure or Google Cloud +Platform). That cloud service would host the database for you, and you +would pay according to the number of queries made.

+

Many start-up companies were formed on the back of a MySQL server hosted on top of AWS. Since MySQL was sold to Sun, and then passed on to Oracle, the open source community has turned its attention to MariaDB; here are the AWS instructions on how to set up MariaDB.

+

If you were designing your own ride hailing app, or any other major +commercial software you would want to investigate whether you would need +to set up a central SQL server in one of these frameworks.

+

Today though, we’ll just stick to SQLite which gives you a sense of +the database without the time and expense of setting it up on the cloud. +As well as showing you the SQL commands (which is often what’s used in a +production ML system) we’ll also give the equivalent pandas +commands, which would often be what you would use when you’re doing data +analysis in python and Jupyter.

Create the SQLite Database

-

The beautiful thing about SQLite is that it allows us to play with SQL without going to the work of setting up a proper SQL server. Creating a data base in SQLite is as simple as writing a new file. To create the database, we’ll first write our joined data to a CSV file, then we’ll use a little utility to convert our hospital database into a SQLite database.

-
hosp_state_joined.to_csv('facilities.csv')
-
%pip install csv-to-sqlite
-
!csv-to-sqlite -f facilities.csv -t full -o db.sqlite
-

Rather than being installed on a separate server, SQLite simply stores the database locally in a file called db.sqlite.

-

In the database there can be several ‘tables’. Each table can be thought of as like a separate dataframe. The table name we’ve just saved is ‘hospitals_zones_joined’.

+
+[edit] +
+

The beautiful thing about SQLite is that it allows us to play with +SQL without going to the work of setting up a proper SQL server. +Creating a data base in SQLite is as simple as writing a new file. To +create the database, we’ll first write our joined data to a CSV file, +then we’ll use a little utility to convert our hospital database into a +SQLite database.

+
hosp_state_joined.to_csv("hospitals_zones_joined.csv")
+
%pip install csv-to-sqlite
+
!csv-to-sqlite -f hospitals_zones_joined.csv -t full -o db.sqlite
+

Rather than being installed on a separate server, SQLite simply +stores the database locally in a file called db.sqlite.

+

In the database there can be several ‘tables’. Each table can be +thought of as like a separate dataframe. The table name we’ve just saved +is ‘hospitals_zones_joined’.
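If you want to check which tables the file now contains, one option (a sketch using Python’s built-in sqlite3 module; the exact table name depends on what csv-to-sqlite derived from the CSV file name) is to query SQLite’s sqlite_master catalogue.

import sqlite3

# Open the local database file and list the tables it contains.
conn = sqlite3.connect("db.sqlite")
print(conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall())
conn.close()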

Accessing the SQL Database

-

Now that we have a SQL database, we can create a connection to it and query it using SQL commands. Let’s try to simply select the data we wrote to it, to make sure its the same.

-

Start by making a connection to the database. This will often be done via remote connections, but for this example we’ll connect locally to the database using the filepath directly.

-
conn = create_connection("db.sqlite")
-

Now that we have a connection, we can write a command and pass it to the database.

-

To access a data base, the first thing that is made is a connection. Then SQL is used to extract the information required. A typical SQL command is SELECT. It allows us to extract rows from a given table. It operates a bit like the .head() method in pandas, it will return the first N rows (by default the .head() command returns the first 5 rows, but you can set n to whatever you like. Here we’ve included a default value of 5 to make it match the pandas command.

-

The python library, sqlite3, allows us to access the SQL database directly from python. We do this using an execute command on the connection.

-

Typically, its good software engineering practice to ‘wrap’ the database command in some python code. This allows the commands to be maintained. Below we wrap the SQL command

-
SELECT * FROM [table_name] LIMIT : N
-

in python code. This SQL command selects the first N entries from a given database called table_name.

-

We can pass the table_name and number of rows, N to the python command.

-

Let’s have a go at calling the command to extract the first three facilities from our health center database. Let’s try creating a function that does the same thing the pandas .head() method does so we can inspect our database.

-
def head(conn, table, n=5):
-  rows = select_top(conn, table, n)
-  for r in rows:
-      print(r)
-
head(conn, 'facilities')
-

Great! We now have the data base in SQLite, and some python functions that operate on the data base by wrapping SQL commands.

-

We will return to the SQL command style after download and add the other datasets to the database using a combination of pandas and the csv-to-sqlite utility.

-

Our next task will be to introduce data on COVID19 so that we can join that to our other data sets.

+

Now that we have a SQL database, we can create a connection to it and +query it using SQL commands. Let’s try to simply select the data we +wrote to it, to make sure it’s the same.

+

Start by making a connection to the database. This will often be done +via remote connections, but for this example we’ll connect locally to +the database using the filepath directly.

+

To access a data base, the first thing to do is make a connection. Then SQL is used to extract the information required. A typical SQL command is SELECT. It allows us to extract rows from a given table. It operates a bit like the .head() method in pandas: it will return the first N rows (by default the .head() command returns the first 5 rows, but you can set N to whatever you like). Here we’ve included a default value of 5 to make it match the pandas command.

+

We do this using an execute command on the +connection.

+

Typically, it’s good software engineering practice to ‘wrap’ the database command in some python code. This allows the commands to be maintained. You will also be asked to do this in your final assessment, including re-writing some of the code; pay attention to the slight syntax differences and multi-statement queries. Below we wrap the SQL command

+
SELECT * FROM table_name LIMIT N
+

in python code. This SQL command selects the first N entries from a given table called table_name.

+

We can pass the table_name and number of rows, +n, to the python command.
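The helper functions create_connection and select_top used below are not defined in this extract of the notebook. A minimal sketch of what they might look like, based on Python’s built-in sqlite3 module (an assumption, not necessarily the implementation used in the original course materials), is:

import sqlite3

def create_connection(db_file):
    """Create a connection to the SQLite database stored in db_file."""
    return sqlite3.connect(db_file)

def select_top(conn, table, n):
    """Return the first n rows of the given table."""
    cur = conn.cursor()
    # The table name cannot be passed as a query parameter, so it is
    # interpolated into the statement; the row limit is passed safely as a parameter.
    cur.execute(f"SELECT * FROM [{table}] LIMIT ?", (n,))
    return cur.fetchall()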

+
conn = create_connection("db.sqlite")
+

Now that we have a connection, we can write a command and pass it to +the database.

+


+

The python library, sqlite3, allows us to access the SQL +database directly from python.

+

Let’s have a go at calling the command to extract the first three +facilities from our health center database. Let’s try creating a +function that does the same thing the pandas .head() method +does so we can inspect our database.

+
def head(conn, table, n=5):
+  rows = select_top(conn, table, n)
+  for r in rows:
+      print(r)
+
head(conn, "hospitals_zones_joined")
+

Great! We now have the database in SQLite, and some python functions that operate on the database by wrapping SQL commands.

+

We will return to the SQL command style after we download and add the other datasets to the database using a combination of pandas and the database utilities.

+

Our next task will be to introduce data on COVID19 so that we can +join that to our other data sets.

Covid Data

-

[edit]

-

Now we have the health data, we’re going to combine it with data about COVID-19 cases in Nigeria over time. This data is kindly provided by Africa open COVID-19 data working group, which Elaine Nsoesie has been working with. The data is taken from Twitter, and only goes up until May 2020.

-

They provide their data in github. We can access the cases we’re interested in from the following URL.

-

For convenience, we’ll load the data into pandas first, but our next step will be to create a new SQLite table containing the data. Then we’ll join that table to our existing tables.

-
covid_data_url = 'https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-nigeria.csv'
-covid_data_csv = 'cases.csv'
-urllib.request.urlretrieve(covid_data_url, covid_data_csv)
-covid_data = pd.read_csv(covid_data_csv)
-

As normal, we should inspect our data to check that it contains what we expect.

-
covid_data.head()
-

And we can get an idea of all the information in the data from looking at the columns.

-
covid_data.columns
-

Now we convert this CSV file we’ve downloaded into a new table in the database file. We can do this, again, with the csv-to-sqlite script.

-
!csv-to-sqlite -f cases.csv -t full -o db.sqlite
+
+[edit] +
+

Now that we have the health data, we’re going to combine it with data about COVID-19 cases in Nigeria over time. This data is kindly provided by the Africa open COVID-19 data working group, which Elaine Nsoesie has been working with. The data is taken from Twitter, and only goes up until May 2020.

+

They provide their data on GitHub. We can access the cases we’re interested in from the following URL.

+

https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-nigeria.csv

+

For convenience, we’ll load the data into pandas first, but our next +step will be to create a new SQLite table containing the data. Then +we’ll join that table to our existing tables.

+

Nigerian COVID Data

+
+[edit] +
+

At the beginning of the COVID-19 outbreak, the Consortium for African +COVID-19 Data formed to bring together data from across the African +continent on COVID-19 cases (Marivate et al., 2020). These +cases are recorded in the following GitHub repository: https://github.com/dsfsi/covid19africa.

+

For ease of use we’ve packaged this data set in the pods +library

+
import pods
+
data = pods.datasets.nigerian_covid()['Y']
+data.head()
+

Alternatively, you can access the data directly with the following +commands.

+
import urllib.request
+import pandas as pd
+
+urllib.request.urlretrieve('https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-nigeria.csv', 'line-list-nigeria.csv')
+data = pd.read_csv('line-list-nigeria.csv', parse_dates=['date', 
+                                                         'date_confirmation', 
+                                                         'date_admission_hospital', 
+                                                         'date_onset_symptoms',
+                                                         'death_date'])
+

Once it is loaded in, the data can be summarized using the describe method in pandas.

+
data.describe()
+
+
+ + +
+
+ +
+
+

Figure: Evolution of COVID-19 cases in Nigeria.

+
+
+
covid_data=data
+covid_data.to_csv('cases.csv')
+

Now we convert this CSV file we’ve downloaded into a new table in the +database file.

+

We can do this, again, with the csv-to-sqlite script.

+
!csv-to-sqlite -f cases.csv -t full -o db.sqlite

Population Data

-

Now we have information about COVID cases, and we have information about how many health centers and how many doctors and nurses there are in each health center. But unless we understand how many people there are in each state, then we cannot make decisions about where they may be problems with the disease.

-

If we were running our ride hailing service, we would also need information about how many people there were in different areas, so we could understand what the demand for the boda boda rides might be.

-

To access the number of people we can get population statistics from the Humanitarian Data Exchange.

-

We also want to have population data for each state in Nigeria, so that we can see attributes like whether there are zones of high health facility density but low population density.

-
pop_url = 'https://data.humdata.org/dataset/a7c3de5e-ff27-4746-99cd-05f2ad9b1066/resource/d9fc551a-b5e4-4bed-9d0d-b047b6961817/download/nga_pop_adm1_2016.csv'
-_, msg = urllib.request.urlretrieve(pop_url,'nga_pop_adm1_2016.csv')
-pop_data = pd.read_csv('nga_pop_adm1_2016.csv')
-
pop_data.head()
-

To do joins with this data, we must first make sure that the columns have the right names. The name should match the same name of the column in our existing data. So we reset the column names, and the name of the index, as follows.

-
pop_data.columns = ['admin1Name_en', 'admin1Pcode', 'admin0Name_en', 'admin0Pcode', 'population']
-pop_data = pop_data.set_index('admin1Name_en')
-

When doing this for real world data, you should also make sure that the names used in the rows are the same across the different data bases. For example, has someone decided to use an abbreviation for ‘Federal Capital Territory’ and set it as ‘FCT’. The computer won’t understand these are the same states, and if you do a join with such data you can get duplicate entries or missing entries. This sort of thing happens a lot in real world data and takes a lot of time to sort out. Fortunately, in this case, the data is well curated and we don’t have these problems.

+
+[edit] +
+

Now we have information about COVID cases, and we have information about how many health centers and how many doctors and nurses there are in each health center. But unless we understand how many people there are in each state, we cannot make decisions about where there may be problems with the disease.

+

If we were running our ride hailing service, we would also need +information about how many people there were in different areas, so we +could understand what the demand for the boda boda rides might be.

+

To access the number of people we can get population statistics from +the Humanitarian Data +Exchange.

+

We also want to have population data for each state in Nigeria, so +that we can see attributes like whether there are zones of high health +facility density but low population density.

+
import urllib
+
+pop_url = "https://data.humdata.org/dataset/a7c3de5e-ff27-4746-99cd-05f2ad9b1066/resource/d9fc551a-b5e4-4bed-9d0d-b047b6961817/download/nga_admpop_adm1_2020.csv"
+_, msg = urllib.request.urlretrieve(pop_url,"nga_admpop_adm1_2020.csv")
+data = pd.read_csv("nga_admpop_adm1_2020.csv")
+

To do joins with this data, we must first make sure that the columns have the right names. The names should match those of the columns in our existing data. So we reset the column names, and the name of the index, as follows.

+
data.dropna(axis=0, how="all", inplace=True)
+data.dropna(axis=1, how="all", inplace=True)
+data.rename(columns = {"ADM0_NAME" : "admin0Name_en", 
+                       "ADM0_PCODE" : "admin0Pcode", 
+                       "ADM1_NAME" : "admin1Name_en", 
+                       "ADM1_PCODE" : "admin1Pcode", 
+                       "T_TL" : "population"},
+            inplace=True)
+data["admin0Name_en"] = data["admin0Name_en"].str.title()
+data["admin1Name_en"] = data["admin1Name_en"].str.title()
+    
+data = data.set_index("admin1Name_en")
+
data = pods.datasets.nigerian_population()["Y"]
+
data.head()
+
pop_data=data
+

When doing this for real world data, you should also make sure that +the names used in the rows are the same across the different data bases. +For example, has someone decided to use an abbreviation for ‘Federal +Capital Territory’ and set it as ‘FCT’. The computer won’t understand +these are the same states, and if you do a join with such data, you can +get duplicate entries or missing entries. This sort of thing happens a +lot in real world data and takes a lot of time to sort out. Fortunately, +in this case, the data is well curated, and we don’t have these +problems.
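A quick way to spot this kind of problem before joining is to compare the keys on each side. This is a sketch that assumes both data frames are indexed by the state name (admin1Name_en); any names that appear in only one of the sets would be lost or mismatched by a join.

# States present on one side but not the other.
print(set(zones_gdf.index) - set(pop_data.index))
print(set(pop_data.index) - set(zones_gdf.index))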

Save to database file

-

The next step is to add this new CSV file as an additional table in our SQLite database. This is done using the script as before.

-
pop_data.to_csv('pop_data.csv')
-
!csv-to-sqlite -f pop_data.csv -t full -o db.sqlite
-

Computing per capita hospitals and COVID

-

The Minister of Health in Abuja may be interested in which states are most vulnerable to COVID19. We now have all the information in our SQL data bases to compute what our health center provision is per capita, and what the COVID19 situation is.

-

To do this, we will use the JOIN operation from SQL and introduce a new operation called GROUPBY.

-

Joining in Pandas

-

As before, these operations can be done in pandas or GeoPandas. Before we create the SQL commands, we’ll show how you can do that in pandas.

-

In pandas, the equivalent of a database table is a dataframe. So the JOIN operation takes two dataframes and joins them based on the key. The key is that special shared column between the two tables. The place where the ‘holes align’ so the two databases can be joined together.

-

In GeoPandas we used an outer join. In an outer join you keep all rows from both tables, even if there is no match on the key. In an inner join, you only keep the rows if the two tables have a matching key.

-

This is sometimes where problems can creep in. If in one table Abuja’s state is encoded as ‘FCT’ or ‘FCT-Abuja’, and in another table it’s encoded as ‘Federal Capital Territory’, they won’t match and that data wouldn’t appear in the joined table.

-

In simple terms, a JOIN operation takes two tables (or dataframes) and combines them based on some key, in this case the index of the Pandas data frame which is the state name.

-
pop_joined = zones_gdf.join(pop_data['population'], how='inner')
+

The next step is to add this new CSV file as an additional table in +our database.

+

Loading +the Population Data into the SQLite Database

+
+[edit] +
+

We can load the data into the SQLite database using the script as +before.

+
pop_data.to_csv('pop_data.csv')
+
!csv-to-sqlite -f pop_data.csv -t full -o db.sqlite
+

Computing per capita +hospitals and COVID

+

The Minister of Health in Abuja may be interested in which states are most vulnerable to COVID19. We now have all the information in our SQL databases to compute what our health center provision is per capita, and what the COVID19 situation is.

+

To do this, we will use the JOIN operation from SQL and +introduce a new operation called GROUPBY.

+

Joining in Pandas

+

As before, these operations can be done in pandas or GeoPandas. +Before we create the SQL commands, we’ll show how you can do that in +pandas.

+

In pandas, the equivalent of a database table is a +dataframe. So, the JOIN operation takes two dataframes and joins them +based on the key. The key is that special shared column between the two +tables. The place where the ‘holes align’ so the two databases can be +joined together.

+

In GeoPandas we used an outer join. In an outer join you keep all +rows from both tables, even if there is no match on the key. In an inner +join, you only keep the rows if the two tables have a matching key.
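As a toy illustration of the difference, using two hypothetical mini dataframes rather than the NMIS data:

import pandas as pd

left = pd.DataFrame({"population": [100, 200]}, index=["Abuja", "Lagos"])
right = pd.DataFrame({"hospitals": [5]}, index=["Lagos"])
print(left.join(right, how="outer"))   # keeps Abuja, with NaN for hospitals
print(left.join(right, how="inner"))   # keeps only Lagos, the matching key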

+

This is sometimes where problems can creep in. If in one table +Abuja’s state is encoded as ‘FCT’ or ‘FCT-Abuja’, and in another table +it’s encoded as ‘Federal Capital Territory’, they won’t match, and that +data wouldn’t appear in the joined table.

+

In simple terms, a JOIN operation takes two tables (or dataframes) +and combines them based on some key, in this case the index of the +Pandas data frame which is the state name.

+
zones_gdf.set_index("admin1Name_en", inplace=True)
+pop_joined = zones_gdf.join(pop_data['population'], how='inner')

GroupBy in Pandas

-

Our COVID19 data is in the form of individual cases. But we are interested in total case counts for each state. There is a special data base operation known as GROUP BY for collecting information about the individual states. The type of information you might want could be a sum, the maximum value, an average, the minimum value. We can use a GroupBy operation in pandas and SQL to summarize the counts of covid cases in each state.

-

A GROUPBY operation groups rows with the same key (in this case ‘province/state’) into separate objects, that we can operate on further such as to count the rows in each group, or to sum or take the mean over the values in some column (imagine each case row had the age of the patient, and you were interested in the mean age of patients.)

-
covid_cases_by_state = covid_data.groupby(['province/state']).count()['case_id']
-

The .groupby() method on the dataframe has now given us a new data series that contains the total number of covid cases in each state. We can examine it to check we have something sensible.

-
covid_cases_by_state
-

Now we have this new data series, it can be added to the pandas data frame as a new column.

-
pop_joined['covid_cases_by_state'] = covid_cases_by_state
-

The spatial join we did on the original data frame to obtain hosp_state_joined introduced a new column, index_right which contains the state of each of the hospitals. Let’s have a quick look at it below.

-
hosp_state_joined['index_right']
-

To count the hospitals in each of the states, we first create a grouped series where we’ve grouped on these states.

-
grouped = hosp_state_joined.groupby('index_right')
-

This python operation now goes through each of the groups and counts how many hospitals there are in each state. It stores the result in a dictionary. If you’re new to Python, then to understand this code you need to understand what a ‘dictionary comprehension’ is. In this case the dictionary comprehension is being used to create a python dictionary of states and total hospital counts. That’s then being converted into a pandas Data Series and added to the pop_joined dataframe.

-
counted_groups = {k: len(v) for k, v in grouped.groups.items()}
-pop_joined['hosp_state'] = pd.Series(counted_groups)
-

For convenience, we can now add a new data series to the data frame that contains the per capita information about hospitals. that makes it easy to retrieve later.

-
pop_joined['hosp_per_capita_10k'] = (pop_joined['hosp_state'] * 10000 )/ pop_joined['population']
+

Our COVID19 data is in the form of individual cases. But we are +interested in total case counts for each state. There is a special data +base operation known as GROUP BY for collecting information +about the individual states. The type of information you might want +could be a sum, the maximum value, an average, the minimum value. We can +use a GroupBy operation in pandas and SQL to summarize the +counts of covid cases in each state.

+

A GROUPBY operation groups rows with the same key (in +this case ‘province/state’) into separate objects, that we can operate +on further such as to count the rows in each group, or to sum or take +the mean over the values in some column (imagine each case row had the +age of the patient, and you were interested in the mean age of +patients.)
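For comparison, the same summary could be expressed directly in SQL through the sqlite3 module. This is only a sketch, and it assumes the COVID cases were loaded into a table called covid_data with the column names used above:

import sqlite3

conn = sqlite3.connect("db.sqlite")
# assumption: the covid cases live in a table called covid_data
sql = 'SELECT "province/state", COUNT(case_id) FROM covid_data GROUP BY "province/state"'
for state, cases in conn.execute(sql):
    print(state, cases)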

+
covid_cases_by_state = covid_data.groupby(['province/state']).count()['case_id']
+

The .groupby() method on the dataframe has now given us +a new data series that contains the total number of covid cases in each +state. We can examine it to check we have something sensible.

+
covid_cases_by_state
+

Now we have this new data series, it can be added to the pandas +dataframe as a new column.

+
pop_joined['covid_cases_by_state'] = covid_cases_by_state
+

The spatial join we did on the original data frame to obtain +hosp_state_joined introduced a new column, index_right that +contains the state of each of the hospitals. Let’s have a quick look at +it below.

+
hosp_state_joined['index_right']
+

To count the hospitals in each of the states, we first create a +grouped series where we’ve grouped on these states.

+
grouped = hosp_state_joined.groupby('admin1Name_en')
+

This python operation now goes through each of the groups and counts +how many hospitals there are in each state. It stores the result in a +dictionary. If you’re new to python, then to understand this code you +need to understand what a ‘dictionary comprehension’ is. In this case +the dictionary comprehension is being used to create a python dictionary +of states and total hospital counts. That’s then being converted into a +pandas Data Series and added to the pop_joined +dataframe.

+
import pandas as pd
+
counted_groups = {k: len(v) for k, v in grouped.groups.items()}
+pop_joined['hosp_state'] = pd.Series(counted_groups)
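As an aside, pandas offers an equivalent shortcut: the size method of the grouped object returns the same per-state counts as a pandas Series.

# same counts as the dictionary comprehension above
pop_joined['hosp_state'] = grouped.size()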
+

For convenience, we can now add a new data series to the data frame that contains the per capita information about hospitals. That makes it easy to retrieve later.

+
pop_joined['hosp_per_capita_10k'] = (pop_joined['hosp_state'] * 10000 )/ pop_joined['population']

SQL-style

-

That’s the pandas approach to doing it. But pandas itself is inspired by database language, in particular relational databases such as SQL. To do these types of joins at scale, e.g. for our ride hailing app, we need to see how to do these joins in a database.

-

As before, we’ll wrap the underlying SQL commands with a convenient python command.

-

What you see below gives the full SQL command. There is a SELECT command, which extracts FROM a particular table. It then completes an INNER JOIN using particular columns (provice/state and index_right)

-

Now we’ve created our python wrapper, we can connect to the data base and run our SQL command on the database using the wrapper.

-
conn = create_connection("db.sqlite")
-
state_cases_hosps = join_counts(conn)
-
for row in state_cases_hosps:
-    print("State {} \t\t Covid Cases {} \t\t Health Facilities {}".format(row[0], row[1], row[2]))
-
base = nigeria.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))
-pop_joined.plot(ax=base, column='population', edgecolor='black', legend=True)
-base.set_title("Population of Nigerian States")
-
base = nigeria.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))
-pop_joined.plot(ax=base, column='hosp_per_capita_10k', edgecolor='black', legend=True)
-base.set_title("Hospitals Per Capita (10k) of Nigerian States")
-

Exercise

-
  1. Add a new column to the dataframe for covid cases per 10,000 population, in the same way we computed health facilities per 10k capita.

  2. Add a new column for covid cases per health facility.
-

Do this in both the SQL and the Pandas styles to get a feel for how they differ.

-

# pop_joined['cases_per_capita_10k'] = ???
# pop_joined['cases_per_facility'] = ???

-
base = nigeria.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))
-pop_joined.plot(ax=base, column='cases_per_capita_10k', edgecolor='black', legend=True)
-base.set_title("Covid Cases Per Capita (10k) of Nigerian States")
-
base = nigeria.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))
-pop_joined.plot(ax=base, column='covid_cases_by_state', edgecolor='black', legend=True)
-base.set_title("Covid Cases by State")
-
base = nigeria.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))
-pop_joined.plot(ax=base, column='cases_per_facility', edgecolor='black', legend=True)
-base.set_title("Covid Cases per Health Facility")
+

That’s the pandas approach to doing it. But +pandas itself is inspired by database languages, in +particular relational databases such as SQL. To do these types of joins +at scale, e.g., for a ride hailing app, we need to do these joins in a +database.

+

As before, we’ll wrap the underlying SQL commands with a convenient +python command.

+

What you see below gives the full SQL command. There is a SELECT command, which extracts FROM a particular table. It then completes an INNER JOIN using particular columns (province/state and admin1Name_en).
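The wrapper itself is not reproduced in these notes. A minimal sketch of what it might look like is given below; the table names (covid_data, hospital_data) and column names are assumptions for illustration, not the actual schema produced by csv-to-sqlite here:

import sqlite3

def create_connection(db_file):
    # sketch of the connection wrapper used below
    return sqlite3.connect(db_file)

def join_counts(conn):
    # sketch only: the table and column names are assumptions for illustration
    sql = """
    SELECT c."province/state",
           COUNT(DISTINCT c.case_id)       AS covid_cases,
           COUNT(DISTINCT h.facility_name) AS health_facilities
    FROM covid_data AS c
    INNER JOIN hospital_data AS h
        ON c."province/state" = h.admin1Name_en
    GROUP BY c."province/state"
    """
    return conn.execute(sql).fetchall()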

+

Now we’ve created our python wrapper, we can connect to the database and run our SQL command on the database using the wrapper.

+
conn = create_connection("db.sqlite")
+
state_cases_hosps = join_counts(conn)
+
for row in state_cases_hosps:
+    print("State {} \t\t Covid Cases {} \t\t Health Facilities {}".format(row[0], row[1], row[2]))
+
base = nigeria_gdf.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))
+pop_joined.plot(ax=base, column='population', edgecolor='black', legend=True)
+base.set_title("Population of Nigerian States")
+
base = nigeria_gdf.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))
+pop_joined.plot(ax=base, column='hosp_per_capita_10k', edgecolor='black', legend=True)
+base.set_title("Hospitals Per Capita (10k) of Nigerian States")
+

Exercise 1

+

Add a new column to the dataframe for covid cases per 10,000 population, in the same way we computed health facilities per 10k capita.

+

Exercise 2

+

Add a new column for covid cases per health facility.

+

Exercise 3

+

Do this in both the SQL and the Pandas styles to get a feel for how +they differ.

+

Exercise 4

+

Perform an inner join using SQL on your databases and convert the +result into a pandas DataFrame.

+

+# pop_joined['cases_per_capita_10k'] = ???
+# pop_joined['cases_per_facility'] = ???
+
base = nigeria_gdf.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))
+pop_joined.plot(ax=base, column='cases_per_capita_10k', edgecolor='black', legend=True)
+base.set_title("Covid Cases Per Capita (10k) of Nigerian States")
+
base = nigeria_gdf.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))
+pop_joined.plot(ax=base, column='covid_cases_by_state', edgecolor='black', legend=True)
+base.set_title("Covid Cases by State")
+
base = nigeria_gdf.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))
+pop_joined.plot(ax=base, column='cases_per_facility', edgecolor='black', legend=True)
+base.set_title("Covid Cases per Health Facility")

Thanks!

-

For more information on these subjects and more you might want to check the following resources.

+

For more information on these subjects and more you might want to +check the following resources.

-

References

+

References

+
+
+Marivate, V., Nsoesie, E., Bekele, E., Africa open COVID-19 data working +group, 2020. Coronavirus COVID-19 (2019-nCoV) Data +Repository for Africa. https://doi.org/10.5281/zenodo.3757554 +
+
+The Office of the Senior Special Assistant to the President on the +Millennium Development Goals (OSSAP-MDGs), Columbia University, 2014. +Nigeria NMIS facility database. +
+
+ diff --git a/_lectures/03-bayesian-methods-abuja.html b/_lectures/03-bayesian-methods-abuja.html index 6ea002d..39a949d 100644 --- a/_lectures/03-bayesian-methods-abuja.html +++ b/_lectures/03-bayesian-methods-abuja.html @@ -1,7 +1,12 @@ --- title: "Bayesian Methods" venue: "DSA, Abuja" -abstract: "

In this session we review the probabilistic approach to machine learning. We start with a review of probability, and introduce the concepts of probabilistic modelling. We then apply the approach in practice to Naive Bayesian classification. In this session we review the probabilistic formulation of a classification model, reviewing initially maximum likelihood and the naive Bayes model.

" +abstract: "

In this session we review the probabilistic +approach to machine learning. We start with a review of probability, and +introduce the concepts of probabilistic modelling. We then apply the +approach in practice to Naive Bayesian classification. In this session +we review the probabilistic formulation of a classification model, +reviewing initially maximum likelihood and the naive Bayes model.

" author: - given: Neil D. family: Lawrence @@ -9,20 +14,22 @@ institute: Amazon Cambridge and University of Sheffield twitter: lawrennd gscholar: r3SJcvoAAAAJ - orchid: + orcid: - given: Oluwasanmi family: Koyejo url: https://sanmi.cs.illinois.edu/ institute: Google and University of Illinois twitter: gscholar: EaaOeJwAAAAJ - orchid: + orcid: +edit_url: https://github.com/mlatcl/dsa/edit/gh-pages/_lamd/bayesian-methods-abuja.md date: 2018-11-14 published: 2018-11-14 -week: 0 session: 3 reveal: 03-bayesian-methods-abuja.slides.html +transition: None ipynb: 03-bayesian-methods-abuja.ipynb +pptx: 03-bayesian-methods-abuja.pptx layout: lecture categories: - notes @@ -39,98 +46,334 @@ -->

What is Machine Learning?

-

[edit]

-

What is machine learning? At its most basic level machine learning is a combination of

-


$$\text{data} + \text{model} \stackrel{\text{compute}}{\rightarrow} \text{prediction}$$

-

where data is our observations. They can be actively or passively acquired (meta-data). The model contains our assumptions, based on previous experience. That experience can be other data, it can come from transfer learning, or it can merely be our beliefs about the regularities of the universe. In humans our models include our inductive biases. The prediction is an action to be taken or a categorization or a quality score. The reason that machine learning has become a mainstay of artificial intelligence is the importance of predictions in artificial intelligence. The data and the model are combined through computation.

-

In practice we normally perform machine learning using two functions. To combine data with a model we typically make use of:

-

a prediction function a function which is used to make the predictions. It includes our beliefs about the regularities of the universe, our assumptions about how the world works, e.g. smoothness, spatial similarities, temporal similarities.

-

an objective function a function which defines the cost of misprediction. Typically it includes knowledge about the world’s generating processes (probabilistic objectives) or the costs we pay for mispredictions (empiricial risk minimization).

-

The combination of data and model through the prediction function and the objective function leads to a learning algorithm. The class of prediction functions and objective functions we can make use of is restricted by the algorithms they lead to. If the prediction function or the objective function are too complex, then it can be difficult to find an appropriate learning algorithm. Much of the acdemic field of machine learning is the quest for new learning algorithms that allow us to bring different types of models and data together.

-

A useful reference for state of the art in machine learning is the UK Royal Society Report, Machine Learning: Power and Promise of Computers that Learn by Example.

-

You can also check my post blog post on What is Machine Learning?..

-

Nigerian NMIS Data

-

[edit]

-

As an example data set we will use Nigerian NMIS Health Facility data from openAFRICA. It can be found here https://africaopendata.org/dataset/nigeria-nmis-health-facility-data-2014

+
+[edit] +
+

What is machine learning? At its most basic level machine learning is +a combination of

+

\[\text{data} + \text{model} +\stackrel{\text{compute}}{\rightarrow} \text{prediction}\]

+

where data is our observations. They can be actively or +passively acquired (meta-data). The model contains our +assumptions, based on previous experience. That experience can be other +data, it can come from transfer learning, or it can merely be our +beliefs about the regularities of the universe. In humans our models +include our inductive biases. The prediction is an action to be +taken or a categorization or a quality score. The reason that machine +learning has become a mainstay of artificial intelligence is the +importance of predictions in artificial intelligence. The data and the +model are combined through computation.

+

In practice we normally perform machine learning using two functions. +To combine data with a model we typically make use of:

+

a prediction function, which is used to make the predictions. It includes our beliefs about the regularities of the universe, our assumptions about how the world works, e.g., smoothness, spatial similarities, temporal similarities.

+

an objective function, which defines the ‘cost’ of misprediction. Typically, it includes knowledge about the world’s generating processes (probabilistic objectives) or the costs we pay for mispredictions (empirical risk minimization).

+

The combination of data and model through the prediction function and +the objective function leads to a learning algorithm. The class +of prediction functions and objective functions we can make use of is +restricted by the algorithms they lead to. If the prediction function or +the objective function are too complex, then it can be difficult to find +an appropriate learning algorithm. Much of the academic field of machine +learning is the quest for new learning algorithms that allow us to bring +different types of models and data together.
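As a toy sketch (not part of the lecture) of what these two functions might look like for a simple linear model:

import numpy as np

def prediction_function(w, x):
    # modelling assumption: predictions vary linearly with the input
    return w[0] + w[1]*x

def objective_function(w, x, y):
    # empirical risk: summed squared cost of mispredicting y
    return np.sum((y - prediction_function(w, x))**2)

x = np.array([0.0, 1.0, 2.0])
y = np.array([0.1, 1.9, 4.2])
print(objective_function(np.array([0.0, 2.0]), x, y))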

+

A useful reference for state of the art in machine learning is the UK +Royal Society Report, Machine +Learning: Power and Promise of Computers that Learn by Example.

+

You can also check my blog post on What is Machine Learning?

+

Nigeria NMIS Data

+
+[edit] +
+

As an example data set we will use Nigerian Millennium Development Goals Information System Health Facility data (The Office of the Senior Special Assistant to the President on the Millennium Development Goals (OSSAP-MDGs) and Columbia University, 2014). It can be found here https://energydata.info/dataset/nigeria-nmis-education-facility-data-2014.

Quoting from the information on the site,

-

The Nigeria MDG (Millennium Development Goals) Information System – NMIS health facility data is collected by the Office of the Senior Special Assistant to the President on the Millennium Development Goals (OSSAP-MDGs) in partner with the Sustainable Engineering Lab at Columbia University. A rigorous, geo-referenced baseline facility inventory across Nigeria is created spanning from 2009 to 2011 with an additional survey effort to increase coverage in 2014, to build Nigeria’s first nation-wide inventory of health facility. The database includes 34,139 health facilities info in Nigeria.

-

The goal of this database is to make the data collected available to planners, government officials, and the public, to be used to make strategic decisions for planning relevant interventions.

-

For data inquiry, please contact Ms. Funlola Osinupebi, Performance Monitoring & Communications, Advisory Power Team, Office of the Vice President at funlola.osinupebi@aptovp.org

-

To learn more, please visit http://csd.columbia.edu/2014/03/10/the-nigeria-mdg-information-system-nmis-takes-open-data-further/

-

Suggested citation: Nigeria NMIS facility database (2014), the Office of the Senior Special Assistant to the President on the Millennium Development Goals (OSSAP-MDGs) & Columbia University

+

The Nigeria MDG (Millennium Development Goals) Information System – +NMIS health facility data is collected by the Office of the Senior +Special Assistant to the President on the Millennium Development Goals +(OSSAP-MDGs) in partner with the Sustainable Engineering Lab at Columbia +University. A rigorous, geo-referenced baseline facility inventory +across Nigeria is created spanning from 2009 to 2011 with an additional +survey effort to increase coverage in 2014, to build Nigeria’s first +nation-wide inventory of health facility. The database includes 34,139 +health facilities info in Nigeria.

+

The goal of this database is to make the data collected available to +planners, government officials, and the public, to be used to make +strategic decisions for planning relevant interventions.

+

For data inquiry, please contact Ms. Funlola Osinupebi, Performance +Monitoring & Communications, Advisory Power Team, Office of the Vice +President at funlola.osinupebi@aptovp.org

+

To learn more, please visit http://csd.columbia.edu/2014/03/10/the-nigeria-mdg-information-system-nmis-takes-open-data-further/

+

Suggested citation: Nigeria NMIS facility database (2014), the Office +of the Senior Special Assistant to the President on the Millennium +Development Goals (OSSAP-MDGs) & Columbia University

-
import urllib.request
-
urllib.request.urlretrieve('https://energydata.info/dataset/f85d1796-e7f2-4630-be84-79420174e3bd/resource/6e640a13-cab4-457b-b9e6-0336051bac27/download/healthmopupandbaselinenmisfacility.csv', 'healthmopupandbaselinenmisfacility.csv')
-
import pandas as pd
-
data = pd.read_csv('healthmopupandbaselinenmisfacility.csv')
-

Once it is loaded in the data can be summarized using the describe method in pandas.

-
data.describe()
-

In python and jupyter notebook it is possible to see a list of all possible functions and attributes by typing the name of the object followed by .<Tab> for example in the above case if we type data.<Tab> it show the columns available (these are attributes in pandas dataframes) such as num_nurses_fulltime, and also functions, such as .describe().

-

For functions we can also see the documentation about the function by following the name with a question mark. This will open a box with documentation at the bottom which can be closed with the x button.

-
data.describe?
-

The NMIS facility data is stored in an object known as a ‘data frame’. Data frames come from the statistical family of programming languages based on S, the most widely used of which is R. The data frame gives us a convenient object for manipulating data. The describe method summarizes which columns there are in the data frame and gives us counts, means, standard deviations and percentiles for the values in those columns. To access a column directly we can write

-
print(data['num_doctors_fulltime'])
-#print(data['num_nurses_fulltime'])
-

This shows the number of doctors per facility, number of nurses and number of community health workers (CHEWS). We can plot the number of doctors against the number of nurses as follows.

-
# this ensures the plot appears in the web browser
-%matplotlib inline 
-import matplotlib.pyplot as plt # this imports the plotting library in python
-
_ = plt.plot(data['num_doctors_fulltime'], data['num_nurses_fulltime'], 'rx')
-

You may be curious what the arguments we give to plt.plot are for, now is the perfect time to look at the documentation

-
plt.plot?
-

We immediately note that some facilities have a lot of nurses, which prevent’s us seeing the detail of the main number of facilities. First lets identify the facilities with the most nurses.

-
data[data['num_nurses_fulltime']>100]
-

Here we are using the command data['num_nurses_fulltime']>100 to index the facilities in the pandas data frame which have over 100 nurses. To sort them in order we can also use the sort command. The result of this command on its own is a data Series of True and False values. However, when it is passed to the data data frame it returns a new data frame which contains only those values for which the data series is True. We can also sort the result. To sort the result by the values in the num_nurses_fulltime column in descending order we use the following command.

-
data[data['num_nurses_fulltime']>100].sort_values(by='num_nurses_fulltime', ascending=False)
-

We now see that the ‘University of Calabar Teaching Hospital’ is a large outlier with 513 nurses. We can try and determine how much of an outlier by histograming the data.

-

Plotting the Data

-
data['num_nurses_fulltime'].hist(bins=20) # histogram the data with 20 bins.
-plt.title('Histogram of Number of Nurses')
-

We can’t see very much here. Two things are happening. There are so many facilities with zero or one nurse that we don’t see the histogram for hospitals with many nurses. We can try more bins and using a log scale on the y-axis.

-
data['num_nurses_fulltime'].hist(bins=100) # histogram the data with 20 bins.
-plt.title('Histogram of Number of Nurses')
-ax = plt.gca()
-ax.set_yscale('log')
-

Exercise 1

-

Read on the internet about the following python libraries: numpy, matplotlib, scipy and pandas. What functionality does each provide python?

-

Let’s try and see how the number of nurses relates to the number of doctors.

-
fig, ax = plt.subplots(figsize=(10, 7)) 
-ax.plot(data['num_doctors_fulltime'], data['num_nurses_fulltime'], 'rx')
-ax.set_xscale('log') # use a logarithmic x scale
-ax.set_yscale('log') # use a logarithmic Y scale
-# give the plot some titles and labels
-plt.title('Number of Nurses against Number of Doctors')
-plt.ylabel('number of nurses')
-plt.xlabel('number of doctors')
-

Note a few things. We are interacting with our data. In particular, we are replotting the data according to what we have learned so far. We are using the progamming language as a scripting language to give the computer one command or another, and then the next command we enter is dependent on the result of the previous. This is a very different paradigm to classical software engineering. In classical software engineering we normally write many lines of code (entire object classes or functions) before compiling the code and running it. Our approach is more similar to the approach we take whilst debugging. Historically, researchers interacted with data using a console. A command line window which allowed command entry. The notebook format we are using is slightly different. Each of the code entry boxes acts like a separate console window. We can move up and down the notebook and run each part in a different order. The state of the program is always as we left it after running the previous part.

+

For ease of use we’ve packaged this data set in the pods +library

+

pods

+
+[edit] +
+

In Sheffield we created a suite of software tools for ‘Open Data +Science’. Open data science is an approach to sharing code, models and +data that should make it easier for companies, health professionals and +scientists to gain access to data science techniques.

+

You can also check this blog post on Open +Data Science.

+

The software can be installed using

+
%pip install pods
+

from the command prompt where you can access your python +installation.

+

The code is also available on GitHub: https://github.com/lawrennd/ods

+

Once pods is installed, it can be imported in the usual +manner.

+
import pods
+
data = pods.datasets.nigeria_nmis()['Y']
+data.head()
+

Alternatively, you can access the data directly with the following +commands.

+
import urllib.request
+urllib.request.urlretrieve('https://energydata.info/dataset/f85d1796-e7f2-4630-be84-79420174e3bd/resource/6e640a13-cab4-457b-b9e6-0336051bac27/download/healthmopupandbaselinenmisfacility.csv', 'healthmopupandbaselinenmisfacility.csv')
+
+import pandas as pd
+data = pd.read_csv('healthmopupandbaselinenmisfacility.csv')
+

Once it is loaded in the data can be summarized using the +describe method in pandas.

+
data.describe()
+

We can also find out the dimensions of the dataset using the +shape property.

+
data.shape
+

Dataframes have different functions that you can use to explore and understand your data. In python and the Jupyter notebook it is possible to see a list of all possible functions and attributes by typing the name of the object followed by .<Tab>. For example, in the above case, if we type data.<Tab> it shows the columns available (these are attributes in pandas dataframes) such as num_nurses_fulltime, and also functions, such as .describe().

+

For functions we can also see the documentation about the function by +following the name with a question mark. This will open a box with +documentation at the bottom which can be closed with the x button.

+
data.describe?
+
+
+
+ +
+
+
+ +
+
+

Figure: Location of the over thirty-four thousand health facilities +registered in the NMIS data across Nigeria. Each facility plotted +according to its latitude and longitude.

+
+

Probabilities

-

[edit]

-

We are now going to do some simple review of probabilities and use this review to explore some aspects of our data.

-

A probability distribution expresses uncertainty about the outcome of an event. We often encode this uncertainty in a variable. So if we are considering the outcome of an event, Y, to be a coin toss, then we might consider Y = 1 to be heads and Y = 0 to be tails. We represent the probability of a given outcome with the notation:
P(Y = 1) = 0.5
The first rule of probability is that the probability must normalize. The sum of the probability of all events must equal 1. So if the probability of heads (Y = 1) is 0.5, then the probability of tails (the only other possible outcome) is given by
P(Y = 0) = 1 − P(Y = 1) = 0.5

-

Probabilities are often defined as the limit of the ratio between the number of positive outcomes (e.g. heads) given the number of trials. If the number of positive outcomes for event y is denoted by n and the number of trials is denoted by N then this gives the ratio
$$ +

+[edit] +
+

We are now going to do some simple review of probabilities and use +this review to explore some aspects of our data.

+

A probability distribution expresses uncertainty about the outcome of +an event. We often encode this uncertainty in a variable. So if we are +considering the outcome of an event, \(Y\), to be a coin toss, then we might +consider \(Y=1\) to be heads and \(Y=0\) to be tails. We represent the +probability of a given outcome with the notation: \[ +P(Y=1) = 0.5 +\] The first rule of probability is that the probability must +normalize. The sum of the probability of all events must equal 1. So if +the probability of heads (\(Y=1\)) is +0.5, then the probability of tails (the only other possible outcome) is +given by \[ +P(Y=0) = 1-P(Y=1) = 0.5 +\]

+

Probabilities are often defined as the limit of the ratio between the +number of positive outcomes (e.g. heads) given the number of +trials. If the number of positive outcomes for event \(y\) is denoted by \(n\) and the number of trials is denoted by +\(N\) then this gives the ratio \[ P(Y=y) = \lim_{N\rightarrow \infty}\frac{n_y}{N}. -$$
In practice we never get to observe an event infinite times, so rather than considering this we often use the following estimate
$$ +\] In practice we never get to observe an event infinite times, +so rather than considering this we often use the following estimate +\[ P(Y=y) \approx \frac{n_y}{N}. -$$

-

Probability and the NMIS Data

-

[edit]

-

Let’s use the sum rule to compute the estimate the probability that a facility has more than two nurses.

-
large = (data.num_nurses_fulltime>2).sum()  # number of positive outcomes (in sum True counts as 1, False counts as 0)
-total_facilities = data.num_nurses_fulltime.count()
-
-prob_large = float(large)/float(total_facilities)
-print("Probability of number of nurses being greather than 2 is:", prob_large)
+\]
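As a quick toy check of this frequency estimate (a simulation sketch, separate from the NMIS analysis), we can simulate coin tosses and compare the empirical ratio with the true value of 0.5:

import numpy as np

N = 100000
tosses = np.random.rand(N) < 0.5             # simulate N fair coin tosses, True counts as heads
print("Estimate of P(Y=1):", tosses.sum()/N)  # should be close to 0.5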

+

Exploring the NMIS Data

+
+[edit] +
+

The NMIS facility data is stored in an object known as a ‘data +frame’. Data frames come from the statistical family of programming +languages based on S, the most widely used of which is R. +The data frame gives us a convenient object for manipulating data. The +describe method summarizes which columns there are in the data frame and +gives us counts, means, standard deviations and percentiles for the +values in those columns. To access a column directly we can write

+
print(data['num_doctors_fulltime'])
+#print(data['num_nurses_fulltime'])
+

This shows the number of doctors per facility, number of nurses and +number of community health workers (CHEWS). We can plot the number of +doctors against the number of nurses as follows.

+
import matplotlib.pyplot as plt # this imports the plotting library in python
+
_ = plt.plot(data['num_doctors_fulltime'], data['num_nurses_fulltime'], 'rx')
+

You may be curious what the arguments we give to +plt.plot are for, now is the perfect time to look at the +documentation

+
plt.plot?
+

We immediately note that some facilities have a lot of nurses, which prevents us from seeing the detail of the main number of facilities. First let’s identify the facilities with the most nurses.

+
data[data['num_nurses_fulltime']>100]
+

Here we are using the command +data['num_nurses_fulltime']>100 to index the facilities +in the pandas data frame which have over 100 nurses. To sort them in +order we can also use the sort command. The result of this +command on its own is a data Series of True +and False values. However, when it is passed to the +data data frame it returns a new data frame which contains +only those values for which the data series is True. We can +also sort the result. To sort the result by the values in the +num_nurses_fulltime column in descending order we +use the following command.

+
data[data['num_nurses_fulltime']>100].sort_values(by='num_nurses_fulltime', ascending=False)
+

We now see that the ‘University of Calabar Teaching Hospital’ is a large outlier with 513 nurses. We can try and determine how much of an outlier it is by histogramming the data.

+

Plotting the Data

+
data['num_nurses_fulltime'].hist(bins=20) # histogram the data with 20 bins.
+plt.title('Histogram of Number of Nurses')
+

We can’t see very much here. Two things are happening. There are so many facilities with zero or one nurse that we don’t see the histogram for hospitals with many nurses. We can try more bins and a log scale on the \(y\)-axis.

+
+data['num_nurses_fulltime'].hist(bins=100) # histogram the data with 100 bins.
+plt.title('Histogram of Number of Nurses')
+ax = plt.gca()
+ax.set_yscale('log')
+

Let’s try and see how the number of nurses relates to the number of +doctors.

+
fig, ax = plt.subplots(figsize=(10, 7)) 
+ax.plot(data['num_doctors_fulltime'], data['num_nurses_fulltime'], 'rx')
+ax.set_xscale('log') # use a logarithmic x scale
+ax.set_yscale('log') # use a logarithmic Y scale
+# give the plot some titles and labels
+plt.title('Number of Nurses against Number of Doctors')
+plt.ylabel('number of nurses')
+plt.xlabel('number of doctors')
+

Note a few things. We are interacting with our data. In particular, we are replotting the data according to what we have learned so far. We are using the programming language as a scripting language to give the computer one command or another, and then the next command we enter is dependent on the result of the previous. This is a very different paradigm to classical software engineering. In classical software engineering we normally write many lines of code (entire object classes or functions) before compiling the code and running it. Our approach is more similar to the approach we take whilst debugging. Historically, researchers interacted with data using a console, a command line window which allowed command entry. The notebook format we are using is slightly different. Each of the code entry boxes acts like a separate console window. We can move up and down the notebook and run each part in a different order. The state of the program is always as we left it after running the previous part.

+

Probability and the NMIS +Data

+
+[edit] +
+

Let’s use the sum rule to compute the estimate of the probability that a facility has more than two nurses.

+
large = (data.num_nurses_fulltime>2).sum()  # number of positive outcomes (in sum True counts as 1, False counts as 0)
+total_facilities = data.num_nurses_fulltime.count()
+
+prob_large = float(large)/float(total_facilities)
+print("Probability of number of nurses being greather than 2 is:", prob_large)

Conditioning

-

When predicting whether a coin turns up head or tails, we might think that this event is independent of the year or time of day. If we include an observation such as time, then in a probability this is known as condtioning. We use this notation, P(Y = y|X = x), to condition the outcome on a second variable (in this case the number of doctors). Or, often, for a shorthand we use P(y|x) to represent this distribution (the Y= and X= being implicit). If two variables are independent then we find that
P(y|x) = p(y).
However, we might believe that the number of nurses is dependent on the number of doctors. For this we can try estimating P(Y > 2|X > 1) and compare the result, for example to P(Y > 2|X ≤ 1) using our empirical estimate of the probability.

-
large = ((data.num_nurses_fulltime>2) & (data.num_doctors_fulltime>1)).sum()
-total_large_doctors = (data.num_doctors_fulltime>1).sum()
-prob_both_large = large/total_large_doctors
-print("Probability of number of nurses being greater than 2 given number of doctors is greater than 1 is:", prob_both_large)
-

Exercise 2

-

Write code that prints out the probability of nurses being greater than 2 for different numbers of doctors.

-

Make sure the plot is included in this notebook file (the Jupyter magic command %matplotlib inline we ran above will do that for you, it only needs to be run once per file).

+

When predicting whether a coin turns up heads or tails, we might think that this event is independent of the year or time of day. If we include an observation such as time, then in a probability this is known as conditioning. We use this notation, \(P(Y=y|X=x)\), to condition the outcome on a second variable (in this case the number of doctors). Or, often, for a shorthand we use \(P(y|x)\) to represent this distribution (the \(Y=\) and \(X=\) being implicit). If two variables are independent then we find that \[
P(y|x) = p(y).
\] However, we might believe that the number of nurses is dependent on the number of doctors. For this we can try estimating \(P(Y>2 | X>1)\) and compare the result, for example to \(P(Y>2|X\leq 1)\) using our empirical estimate of the probability.

+
large = ((data.num_nurses_fulltime>2) & (data.num_doctors_fulltime>1)).sum()
+total_large_doctors = (data.num_doctors_fulltime>1).sum()
+prob_both_large = large/total_large_doctors
+print("Probability of number of nurses being greater than 2 given number of doctors is greater than 1 is:", prob_both_large)
+

Exercise 1

+

Write code that prints out the probability of nurses being greater +than 2 for different numbers of doctors.

+

Make sure the plot is included in this notebook file (the +Jupyter magic command %matplotlib inline we ran above will +do that for you, it only needs to be run once per file).

@@ -142,17 +385,17 @@

Exercise 2

- + - + - + @@ -160,22 +403,33 @@

Exercise 2

The different basic probability distributions.
-

A Pictorial Definition of Probability

-

[edit]

+

A Pictorial Definition of +Probability

+
+[edit] +
- + -
+
-

Figure: Diagram representing the different probabilities, joint, marginal and conditional. This diagram was inspired by lectures given by Christopher Bishop.

+

Figure: Diagram representing the different probabilities, joint, +marginal and conditional. This diagram was inspired by lectures given by +Christopher Bishop.

-

Inspired by lectures from Christopher Bishop

-

Definition of probability distributions

+
+Inspired by lectures from Christopher Bishop +
+

Definition of +probability distributions

jointP(X = x, Y = y)\(P(X=x, Y=y)\) prob. that X=x and Y=y
marginalP(X = x)\(P(X=x)\) prob. that X=x regardless of Y
conditionalP(X = x|Y = y)\(P(X=x\vert Y=y)\) prob. that X=x given that Y=y
@@ -192,216 +446,537 @@

Definition of probability distr

- - + + - - + + - - + +
Joint Probability: \(P\left(X=3,Y=4\right) = \lim_{N\rightarrow\infty}\frac{n_{X=3,Y=4}}{N}\)
Marginal Probability: \(P\left(X=5\right) = \lim_{N\rightarrow\infty}\frac{n_{X=5}}{N}\)
Conditional Probability: \(P\left(X=3\vert Y=4\right) = \lim_{N\rightarrow\infty}\frac{n_{X=3,Y=4}}{n_{Y=4}}\)

Notational Details

-

Typically we should write out P(X=x,Y=y), but in practice we often shorten this to P(x,y). This looks very much like we might write a multivariate function, e.g.
$$ +

Typically we should write out \(P\left(X=x,Y=y\right)\), but in practice we +often shorten this to \(P\left(x,y\right)\). This looks very much +like we might write a multivariate function, e.g. \[ f\left(x,y\right)=\frac{x}{y}, -$$
but for a multivariate function
f(x,y) ≠ f(y,x).
However,
P(x,y) = P(y,x)
because
P(X=x,Y=y) = P(Y=y,X=x).
Sometimes I think of this as akin to the way in Python we can write ‘keyword arguments’ in functions. If we use keyword arguments, the ordering of arguments doesn’t matter.

-

We’ve now introduced conditioning and independence to the notion of probability and computed some conditional probabilities on a practical example The scatter plot of deaths vs year that we created above can be seen as a joint probability distribution. We represent a joint probability using the notation P(Y = y, X = x) or P(y, x) for short. Computing a joint probability is equivalent to answering the simultaneous questions, what’s the probability that the number of nurses was over 2 and the number of doctors was 1? Or any other question that may occur to us. Again we can easily use pandas to ask such questions.

-
num_doctors = 1
-large = (data.num_nurses_fulltime[data.num_doctors_fulltime==num_doctors]>2).sum()
-total_facilities = data.num_nurses_fulltime.count() # this is total number of films
-prob_large = float(large)/float(total_facilities)
-print("Probability of nurses being greater than 2 and number of doctors being", num_doctors, "is:", prob_large)
+\] but for a multivariate function \[ +f\left(x,y\right)\neq f\left(y,x\right). +\] However, \[ +P\left(x,y\right)=P\left(y,x\right) +\] because \[ +P\left(X=x,Y=y\right)=P\left(Y=y,X=x\right). +\] Sometimes I think of this as akin to the way in Python we can +write ‘keyword arguments’ in functions. If we use keyword arguments, the +ordering of arguments doesn’t matter.

+

We’ve now introduced conditioning and independence to the notion of probability and computed some conditional probabilities on a practical example. The scatter plot of the number of nurses against the number of doctors that we created above can be seen as a joint probability distribution. We represent a joint probability using the notation \(P(Y=y, X=x)\) or \(P(y, x)\) for short. Computing a joint probability is equivalent to answering the simultaneous questions, what’s the probability that the number of nurses was over 2 and the number of doctors was 1? Or any other question that may occur to us. Again we can easily use pandas to ask such questions.

+
num_doctors = 1
+large = (data.num_nurses_fulltime[data.num_doctors_fulltime==num_doctors]>2).sum()
+total_facilities = data.num_nurses_fulltime.count() # this is the total number of facilities
+prob_large = float(large)/float(total_facilities)
+print("Probability of nurses being greater than 2 and number of doctors being", num_doctors, "is:", prob_large)

The Product Rule

-

This number is the joint probability, P(Y, X) which is much smaller than the conditional probability. The number can never be bigger than the conditional probabililty because it is computed using the product rule.
p(Y = y, X = x) = p(Y = y|X = x)p(X = x)
and
p(X = x)
is a probability distribution, which is equal or less than 1, ensuring the joint distribution is typically smaller than the conditional distribution.

-

The product rule is a fundamental rule of probability, and you must remember it! It gives the relationship between the two questions: 1) What’s the probability that a facility has over two nurses and one doctor? and 2) What’s the probability that a facility has over two nurses given that it has one doctor?

-

In our shorter notation we can write the product rule as
p(y, x) = p(y|x)p(x)
We can see the relation working in practice for our data above by computing the different values for x = 1.

-
num_doctors=1
-num_nurses=2
-p_x = float((data.num_doctors_fulltime==num_doctors).sum())/float(data.num_nurses_fulltime.count())
-p_y_given_x = float((data.num_nurses_fulltime[data.num_doctors_fulltime==num_doctors]>num_nurses).sum())/float((data.num_doctors_fulltime==num_doctors).sum())
-p_y_and_x = float((data.num_nurses_fulltime[data.num_doctors_fulltime==num_doctors]>num_nurses).sum())/float(data.num_nurses_fulltime.count())
-
-print("P(x) is", p_x)
-print("P(y|x) is", p_y_given_x)
-print("P(y,x) is", p_y_and_x)
+

This number is the joint probability, \(P(Y, X)\) which is much smaller than the conditional probability. The number can never be bigger than the conditional probability because it is computed using the product rule. \[
p(Y=y, X=x) = p(Y=y|X=x)p(X=x)
\] and \[p(X=x)\] is a probability distribution, which is less than or equal to 1, ensuring the joint distribution is typically smaller than the conditional distribution.

+

The product rule is a fundamental rule of probability, and +you must remember it! It gives the relationship between the two +questions: 1) What’s the probability that a facility has over two nurses +and one doctor? and 2) What’s the probability that a facility +has over two nurses given that it has one doctor?

+

In our shorter notation we can write the product rule as \[ +p(y, x) = p(y|x)p(x) +\] We can see the relation working in practice for our data above +by computing the different values for \(x=1\).

+
num_doctors=1
+num_nurses=2
+p_x = float((data.num_doctors_fulltime==num_doctors).sum())/float(data.num_doctors_fulltime.count())
+p_y_given_x = float((data.num_nurses_fulltime[data.num_doctors_fulltime==num_doctors]>num_nurses).sum())/float((data.num_doctors_fulltime==num_doctors).sum())
+p_y_and_x = float((data.num_nurses_fulltime[data.num_doctors_fulltime==num_doctors]>num_nurses).sum())/float(data.num_nurses_fulltime.count())
+
+print("P(x) is", p_x)
+print("P(y|x) is", p_y_given_x)
+print("P(y,x) is", p_y_and_x)

The Sum Rule

-

The other fundamental rule of probability is the sum rule this tells us how to get a marginal distribution from the joint distribution. Simply put it says that we need to sum across the value we’d like to remove.
P(Y = y) = ∑xP(Y = y, X = x)
Or in our shortened notation
P(y) = ∑xP(y, x)

-

Exercise 3

-

Write code that computes P(y) by adding P(y, x) for all values of x.

+

The other fundamental rule of probability is the sum rule. This tells us how to get a marginal distribution from the joint distribution. Simply put, it says that we need to sum across the value we’d like to remove. \[
P(Y=y) = \sum_{x} P(Y=y, X=x)
\] Or in our shortened notation \[
P(y) = \sum_{x} P(y, x)
\]

+

Exercise 2

+

Write code that computes \(P(y)\) by +adding \(P(y, x)\) for all values of +\(x\).

Bayes’ Rule

-

Bayes’ rule is a very simple rule, it’s hardly worth the name of a rule at all. It follows directly from the product rule of probability. Because P(y, x) = P(y|x)P(x) and by symmetry P(y, x) = P(x, y) = P(x|y)P(y) then by equating these two equations and dividing through by P(y) we have
$$ +

Bayes’ rule is a very simple rule, it’s hardly worth the name of a +rule at all. It follows directly from the product rule of probability. +Because \(P(y, x) = P(y|x)P(x)\) and by +symmetry \(P(y,x)=P(x,y)=P(x|y)P(y)\) +then by equating these two equations and dividing through by \(P(y)\) we have \[ P(x|y) = \frac{P(y|x)P(x)}{P(y)} -$$
which is known as Bayes’ rule (or Bayes’s rule, it depends how you choose to pronounce it). It’s not difficult to derive, and its importance is more to do with the semantic operation that it enables. Each of these probability distributions represents the answer to a question we have about the world. Bayes rule (via the product rule) tells us how to invert the probability.

+\] which is known as Bayes’ rule (or Bayes’s rule, it depends how +you choose to pronounce it). It’s not difficult to derive, and its +importance is more to do with the semantic operation that it enables. +Each of these probability distributions represents the answer to a +question we have about the world. Bayes rule (via the product rule) +tells us how to invert the probability.
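As a quick numerical illustration (reusing num_nurses, p_x and p_y_given_x from the product rule snippet above), Bayes’ rule converts the probability of many nurses given one doctor into the probability of one doctor given many nurses:

# Bayes' rule: P(x|y) = P(y|x)P(x)/P(y)
p_y = float((data.num_nurses_fulltime>num_nurses).sum())/float(data.num_nurses_fulltime.count())
p_x_given_y = p_y_given_x*p_x/p_y
print("P(x|y) is", p_x_given_y)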

Further Reading

    -
  • Probability distributions: page 12–17 (Section 1.2) of Bishop (2006)
  • +
  • Probability distributions: page 12–17 (Section 1.2) of Bishop (2006)

Exercises

    -
  • Exercise 1.3 of Bishop (2006)
  • +
  • Exercise 1.3 of Bishop (2006)
-

Probabilities for Extracting Information from Data

-

[edit]

-

What use is all this probability in data science? Let’s think about how we might use the probabilities to do some decision making. Let’s look at the information data.

-
data.columns
-

Exercise 1

-

Now we see we have several additional features. Let’s assume we want to predict maternal_health_delivery_services. How would we go about doing it?

-

Using what you’ve learnt about joint, conditional and marginal probabilities, as well as the sum and product rule, how would you formulate the question you want to answer in terms of probabilities? Should you be using a joint or a conditional distribution? If it’s conditional, what should the distribution be over, and what should it be conditioned on?

+

Probabilities +for Extracting Information from Data

+
+[edit] +
+

What use is all this probability in data science? Let’s think about +how we might use the probabilities to do some decision making. Let’s +look at the information data.

+
data.columns
+

Exercise 3

+

Now we see we have several additional features. Let’s assume we want +to predict maternal_health_delivery_services. How would we +go about doing it?

+

Using what you’ve learnt about joint, conditional and marginal +probabilities, as well as the sum and product rule, how would you +formulate the question you want to answer in terms of probabilities? +Should you be using a joint or a conditional distribution? If it’s +conditional, what should the distribution be over, and what should it be +conditioned on?

Probabilistic Modelling

-

[edit]

-

This Bayesian approach is designed to deal with uncertainty arising from fitting our prediction function to the data we have, a reduced data set.

-

The Bayesian approach can be derived from a broader understanding of what our objective is. If we accept that we can jointly represent all things that happen in the world with a probability distribution, then we can interogate that probability to make predictions. So, if we are interested in predictions, $\dataScalar_*$ at future points input locations of interest, $\inputVector_*$ given previously training data, $\dataVector$ and corresponding inputs, $\inputMatrix$, then we are really interogating the following probability density,
$$ -p(\dataScalar_*|\dataVector, \inputMatrix, \inputVector_*), -$$
there is nothing controversial here, as long as you accept that you have a good joint model of the world around you that relates test data to training data, $p(\dataScalar_*, \dataVector, \inputMatrix, \inputVector_*)$ then this conditional distribution can be recovered through standard rules of probability (data + model → prediction).

-

We can construct this joint density through the use of the following decomposition:
$$ -p(\dataScalar_*|\dataVector, \inputMatrix, \inputVector_*) = \int p(\dataScalar_*|\inputVector_*, \mappingMatrix) p(\mappingMatrix | \dataVector, \inputMatrix) \text{d} \mappingMatrix -$$

-

where, for convenience, we are assuming all the parameters of the model are now represented by $\parameterVector$ (which contains $\mappingMatrix$ and $\mappingMatrixTwo$) and $p(\parameterVector | \dataVector, \inputMatrix)$ is recognised as the posterior density of the parameters given data and $p(\dataScalar_*|\inputVector_*, \parameterVector)$ is the likelihood of an individual test data point given the parameters.

-

The likelihood of the data is normally assumed to be independent across the parameters,
$$ -p(\dataVector|\inputMatrix, \mappingMatrix) = \prod_{i=1}^\numData p(\dataScalar_i|\inputVector_i, \mappingMatrix),$$

-

and if that is so, it is easy to extend our predictions across all future, potential, locations,
$$ -p(\dataVector_*|\dataVector, \inputMatrix, \inputMatrix_*) = \int p(\dataVector_*|\inputMatrix_*, \parameterVector) p(\parameterVector | \dataVector, \inputMatrix) \text{d} \parameterVector. -$$

-

The likelihood is also where the prediction function is incorporated. For example in the regression case, we consider an objective based around the Gaussian density,
$$ -p(\dataScalar_i | \mappingFunction(\inputVector_i)) = \frac{1}{\sqrt{2\pi \dataStd^2}} \exp\left(-\frac{\left(\dataScalar_i - \mappingFunction(\inputVector_i)\right)^2}{2\dataStd^2}\right) -$$

-

In short, that is the classical approach to probabilistic inference, and all approaches to Bayesian neural networks fall within this path. For a deep probabilistic model, we can simply take this one stage further and place a probability distribution over the input locations,
$$ -p(\dataVector_*|\dataVector) = \int p(\dataVector_*|\inputMatrix_*, \parameterVector) p(\parameterVector | \dataVector, \inputMatrix) p(\inputMatrix) p(\inputMatrix_*) \text{d} \parameterVector \text{d} \inputMatrix \text{d}\inputMatrix_* -$$
and we have unsupervised learning (from where we can get deep generative models).

+
+[edit] +
+

This Bayesian approach is designed to deal with uncertainty arising +from fitting our prediction function to the data we have, a reduced data +set.

+

The Bayesian approach can be derived from a broader understanding of what our objective is. If we accept that we can jointly represent all things that happen in the world with a probability distribution, then we can interrogate that probability to make predictions. So, if we are interested in predictions, \(y_*\) at future input locations of interest, \(\mathbf{ x}_*\) given previous training data, \(\mathbf{ y}\) and corresponding inputs, \(\mathbf{X}\), then we are really interrogating the following probability density, \[
p(y_*|\mathbf{ y}, \mathbf{X}, \mathbf{ x}_*),
\] there is nothing controversial here, as long as you accept that you have a good joint model of the world around you that relates test data to training data, \(p(y_*, \mathbf{ y}, \mathbf{X}, \mathbf{ x}_*)\) then this conditional distribution can be recovered through standard rules of probability (\(\text{data} + \text{model} \rightarrow \text{prediction}\)).

+

We can construct this joint density through the use of the following +decomposition: \[ +p(y_*|\mathbf{ y}, \mathbf{X}, \mathbf{ x}_*) = \int p(y_*|\mathbf{ +x}_*, \mathbf{W}) p(\mathbf{W}| \mathbf{ y}, \mathbf{X}) \text{d} +\mathbf{W} +\]

+

where, for convenience, we are assuming all the parameters +of the model are now represented by \(\boldsymbol{ \theta}\) (which contains +\(\mathbf{W}\) and \(\mathbf{V}\)) and \(p(\boldsymbol{ \theta}| \mathbf{ y}, +\mathbf{X})\) is recognised as the posterior density of the +parameters given data and \(p(y_*|\mathbf{ +x}_*, \boldsymbol{ \theta})\) is the likelihood of an +individual test data point given the parameters.

+

The likelihood of the data is normally assumed to factorize across the data points, conditioned on the parameters, \[
p(\mathbf{ y}|\mathbf{X}, \mathbf{W}) = \prod_{i=1}^{n} p(y_i|\mathbf{ x}_i, \mathbf{W}),
\]

+

and if that is so, it is easy to extend our predictions across all +future, potential, locations, \[ +p(\mathbf{ y}_*|\mathbf{ y}, \mathbf{X}, \mathbf{X}_*) = \int p(\mathbf{ +y}_*|\mathbf{X}_*, \boldsymbol{ \theta}) p(\boldsymbol{ \theta}| +\mathbf{ y}, \mathbf{X}) \text{d} \boldsymbol{ \theta}. +\]

+

The likelihood is also where the prediction function is +incorporated. For example in the regression case, we consider an +objective based around the Gaussian density, \[ +p(y_i | f(\mathbf{ x}_i)) = \frac{1}{\sqrt{2\pi \sigma^2}} +\exp\left(-\frac{\left(y_i - f(\mathbf{ +x}_i)\right)^2}{2\sigma^2}\right) +\]

+

In short, that is the classical approach to probabilistic inference, +and all approaches to Bayesian neural networks fall within this path. +For a deep probabilistic model, we can simply take this one stage +further and place a probability distribution over the input locations, +\[ +p(\mathbf{ y}_*|\mathbf{ y}) = \int p(\mathbf{ y}_*|\mathbf{X}_*, +\boldsymbol{ \theta}) p(\boldsymbol{ \theta}| \mathbf{ y}, \mathbf{X}) +p(\mathbf{X}) p(\mathbf{X}_*) \text{d} \boldsymbol{ \theta}\text{d} +\mathbf{X}\text{d}\mathbf{X}_* +\] and we have unsupervised learning (from where we can +get deep generative models).
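
To make the marginalisation over \(\boldsymbol{ \theta}\) concrete, here is a minimal sketch (not part of the original notes) that approximates the predictive density \(p(y_*|\mathbf{ y}, \mathbf{X}, \mathbf{ x}_*)\) by Monte Carlo, assuming we already have samples from the posterior \(p(\boldsymbol{ \theta}|\mathbf{ y}, \mathbf{X})\); the names posterior_samples and likelihood below are illustrative placeholders.

import numpy as np

def monte_carlo_predictive(x_star, posterior_samples, likelihood):
    """Approximate p(y*=1 | x*, data) by averaging the likelihood over
    posterior samples of the parameters, theta_s ~ p(theta | y, X)."""
    return np.mean([likelihood(x_star, theta_s) for theta_s in posterior_samples])

# Illustrative use with a logistic likelihood and made-up posterior samples.
def likelihood(x_star, theta):
    return 1.0 / (1.0 + np.exp(-np.dot(theta, x_star)))

posterior_samples = np.random.normal(loc=1.0, scale=0.5, size=(100, 2))
print(monte_carlo_predictive(np.array([0.5, -0.2]), posterior_samples, likelihood))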

Graphical Models

-

[edit]

-

One way of representing a joint distribution is to consider conditional dependencies between data. Conditional dependencies allow us to factorize the distribution. For example, a Markov chain is a factorization of a distribution into components that represent the conditional relationships between points that are neighboring, often in time or space. It can be decomposed in the following form.
$$p(\dataVector) = p(\dataScalar_\numData | \dataScalar_{\numData-1}) p(\dataScalar_{\numData-1}|\dataScalar_{\numData-2}) \dots p(\dataScalar_{2} | \dataScalar_{1})$$

+
+[edit] +
+

One way of representing a joint distribution is to consider +conditional dependencies between data. Conditional dependencies allow us +to factorize the distribution. For example, a Markov chain is a +factorization of a distribution into components that represent the +conditional relationships between points that are neighboring, often in +time or space. It can be decomposed in the following form. \[p(\mathbf{ y}) = p(y_n| y_{n-1}) +p(y_{n-1}|y_{n-2}) \dots p(y_{2} | y_{1})\]

+
-

Figure: A Markov chain is a simple form of probabilistic graphical model providing a particular decomposition of the joint density.

+

Figure: A Markov chain is a simple form of probabilistic graphical +model providing a particular decomposition of the joint density.
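
As a small illustration (not in the original notes) of how this factorization turns the joint density into a product of neighbouring conditionals, the sketch below evaluates the log joint probability of a binary sequence from a transition matrix; for completeness it also includes the initial term \(p(y_1)\), and all the numbers are made up.

import numpy as np

# trans[a, b] = p(y_t = b | y_{t-1} = a); each row sums to one.
trans = np.array([[0.9, 0.1],
                  [0.3, 0.7]])
initial = np.array([0.5, 0.5])  # p(y_1)

def log_joint(y, trans, initial):
    """log p(y) = log p(y_1) + sum_t log p(y_t | y_{t-1}) under the Markov factorization."""
    logp = np.log(initial[y[0]])
    for prev, curr in zip(y[:-1], y[1:]):
        logp += np.log(trans[prev, curr])
    return logp

print(log_joint([0, 0, 1, 1, 1], trans, initial))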

-

By specifying conditional independencies we can reduce the parameterization required for our data, instead of directly specifying the parameters of the joint distribution, we can specify each set of parameters of the conditonal independently. This can also give an advantage in terms of interpretability. Understanding a conditional independence structure gives a structured understanding of data. If developed correctly, according to causal methodology, it can even inform how we should intervene in the system to drive a desired result (Pearl 1995).

-

However, a challenge arises when the data becomes more complex. Consider the graphical model shown below, used to predict the perioperative risk of C Difficile infection following colon surgery (Steele et al. 2012).

+

By specifying conditional independencies we can reduce the parameterization required for our data: instead of directly specifying the parameters of the joint distribution, we can specify the parameters of each conditional independently. This can also give an advantage in terms of interpretability. Understanding a conditional independence structure gives a structured understanding of data. If developed correctly, according to causal methodology, it can even inform how we should intervene in the system to drive a desired result (Pearl, 1995).

+

However, a challenge arises when the data becomes more complex. Consider the graphical model shown below, used to predict the perioperative risk of C. difficile infection following colon surgery (Steele et al., 2012).

-
+
-

Figure: A probabilistic directed graph used to predict the perioperative risk of C Difficile infection following colon surgery. When these models have good predictive performance they are often difficult to interpret. This may be due to the limited representation capability of the conditional densities in the model.

-
-
-

To capture the complexity in the interelationship between the data, the graph itself becomes more complex, and less interpretable.

-

Introduction to Classification

-

[edit]

-

Classification is perhaps the technique most closely assocated with machine learning. In the speech based agents, on-device classifiers are used to determine when the wake word is used. A wake word is a word that wakes up the device. For the Amazon Echo it is “Alexa”, for Siri it is “Hey Siri”. Once the wake word detected with a classifier, the speech can be uploaded to the cloud for full processing, the speech recognition stages.

-

This isn’t just useful for intelligent agents, the UN global pulse project on public discussion on radio also uses wake word detection for recording radio conversations.

-

A major breakthrough in image classification came in 2012 with the ImageNet result of Alex Krizhevsky, Ilya Sutskever and Geoff Hinton from the University of Toronto. ImageNet is a large data base of 14 million images with many thousands of classes. The data is used in a community-wide challenge for object categorization. Krizhevsky et al used convolutional neural networks to outperform all previous approaches on the challenge. They formed a company which was purchased shortly after by Google. This challenge, known as object categorisation, was a major obstacle for practical computer vision systems. Modern object categorization systems are close to human performance.

-

Machine learning problems normally involve a prediction function and an objective function. Regression is the case where the prediction function iss over the real numbers, so the codomain of the functions, $\mappingFunction(\inputMatrix)$ was the real numbers or sometimes real vectors. The classification problem consists of predicting whether or not a particular example is a member of a particular class. So we may want to know if a particular image represents a digit 6 or if a particular user will click on a given advert. These are classification problems, and they require us to map to yes or no answers. That makes them naturally discrete mappings.

-

In classification we are given an input vector, $\inputVector$, and an associated label, $\dataScalar$ which either takes the value  − 1 to represent no or 1 to represent yes.

-

In supervised learning the inputs, $\inputVector$, are mapped to a label, $\dataScalar$, through a function $\mappingFunction(\cdot)$ that is dependent on a set of parameters, $\weightVector$,
$$ -\dataScalar = \mappingFunction(\inputVector; \weightVector). -$$
The function $\mappingFunction(\cdot)$ is known as the prediction function. The key challenges are (1) choosing which features, $\inputVector$, are relevant in the prediction, (2) defining the appropriate class of function, $\mappingFunction(\cdot)$, to use and (3) selecting the right parameters, $\weightVector$.

+

Figure: A probabilistic directed graph used to predict the perioperative risk of C. difficile infection following colon surgery. When these models have good predictive performance they are often difficult to interpret. This may be due to the limited representation capability of the conditional densities in the model.

+

To capture the complexity in the interrelationships within the data, the graph itself becomes more complex, and less interpretable.

+

Introduction to +Classification

+
+[edit] +
+

Classification is perhaps the technique most closely associated with machine learning. In speech-based agents, on-device classifiers are used to determine when the wake word is used. A wake word is a word that wakes up the device. For the Amazon Echo it is “Alexa”, for Siri it is “Hey Siri”. Once the wake word is detected with a classifier, the speech can be uploaded to the cloud for full processing, i.e. the speech recognition stages.

+

This isn’t just useful for intelligent agents; the UN Global Pulse project on public discussion on radio also uses wake word detection for recording radio conversations.

+

A major breakthrough in image classification came in 2012 with the ImageNet result of Alex Krizhevsky, Ilya Sutskever and Geoff Hinton from the University of Toronto. ImageNet is a large database of 14 million images with many thousands of classes. The data is used in a community-wide challenge for object categorization. Krizhevsky et al. used convolutional neural networks to outperform all previous approaches on the challenge. They formed a company which was purchased shortly after by Google. This challenge, known as object categorization, was a major obstacle for practical computer vision systems. Modern object categorization systems are close to human performance.

+

Machine learning problems normally involve a prediction function and an objective function. Regression is the case where the prediction function is over the real numbers, so the codomain of the function, \(f(\mathbf{X})\), was the real numbers or sometimes real vectors. The classification problem consists of predicting whether or not a particular example is a member of a particular class. So we may want to know if a particular image represents a digit 6 or if a particular user will click on a given advert. These are classification problems, and they require us to map to yes or no answers. That makes them naturally discrete mappings.

+

In classification we are given an input vector, \(\mathbf{ x}\), and an associated label, +\(y\) which either takes the value +\(-1\) to represent no or +\(1\) to represent yes.

+

In supervised learning the inputs, \(\mathbf{ x}\), are mapped to a label, \(y\), through a function \(f(\cdot)\) that is dependent on a set of +parameters, \(\mathbf{ w}\), \[ +y= f(\mathbf{ x}; \mathbf{ w}). +\] The function \(f(\cdot)\) is +known as the prediction function. The key challenges are (1) +choosing which features, \(\mathbf{ +x}\), are relevant in the prediction, (2) defining the +appropriate class of function, \(f(\cdot)\), to use and (3) selecting the +right parameters, \(\mathbf{ w}\).
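
As a toy sketch (not from the notes) of such a prediction function, a linear score thresholded at zero maps the inputs to a discrete label in \(\{-1, 1\}\); the weights here are arbitrary.

import numpy as np

def f(x, w, b=0.0):
    """Toy prediction function: threshold a linear score to give a label in {-1, 1}."""
    return 1 if np.dot(w, x) + b > 0 else -1

w = np.array([0.8, -0.5])          # illustrative parameters
print(f(np.array([1.0, 0.2]), w))  # prints 1, since 0.8*1.0 - 0.5*0.2 = 0.7 > 0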

Classification Examples

-

[edit]

+
+[edit] +
    -
  • Classifiying hand written digits from binary images (automatic zip code reading)
  • +
  • Classifying handwritten digits from binary images (automatic zip code reading)
  • Detecting faces in images (e.g. digital cameras).
  • Who a detected face belongs to (e.g. Facebook, DeepFace)
  • Classifying type of cancer given gene expression data.
  • -
  • Categorization of document types (different types of news article on the internet)
  • +
  • Categorization of document types (different types of news article on +the internet)

Bernoulli Distribution

-

[edit]

-

Our focus has been on models where the objective function is inspired by a probabilistic analysis of the problem. In particular we’ve argued that we answer questions about the data set by placing probability distributions over the various quantities of interest. For the case of binary classification this will normally involve introducing probability distributions for discrete variables. Such probability distributions, are in some senses easier than those for continuous variables, in particular we can represent a probability distribution over $\dataScalar$, where $\dataScalar$ is binary, with one value. If we specify the probability that $\dataScalar=1$ with a number that is between 0 and 1, i.e. let’s say that $P(\dataScalar=1) = \pi$ (here we don’t mean π the number, we are setting π to be a variable) then we can specify the probability distribution through a table.

+
+[edit] +
+

Our focus has been on models where the objective function is inspired by a probabilistic analysis of the problem. In particular we’ve argued that we answer questions about the data set by placing probability distributions over the various quantities of interest. For the case of binary classification this will normally involve introducing probability distributions for discrete variables. Such probability distributions are in some senses easier than those for continuous variables; in particular, we can represent a probability distribution over \(y\), where \(y\) is binary, with one value. If we specify the probability that \(y=1\) with a number that is between 0 and 1, i.e. let’s say that \(P(y=1) = \pi\) (here we don’t mean \(\pi\) the number, we are setting \(\pi\) to be a variable), then we can specify the probability distribution through a table.

| \(y\) | 0 | 1 |
|-------|---|---|
| \(P(y)\) | \((1-\pi)\) | \(\pi\) |
-

Mathematically we can use a trick to implement this same table. We can use the value $\dataScalar$ as a mathematical switch and write that
$$
P(\dataScalar) = \pi^\dataScalar (1-\pi)^{(1-\dataScalar)}
$$
where our probability distribution is now written as a function of $\dataScalar$. This probability distribution is known as the Bernoulli distribution. The Bernoulli distribution is a clever trick for mathematically switching between two probabilities; if we were to write it as code, it would be better described as

-
def bernoulli(y_i, pi):
    if y_i == 1:
        return pi
    else:
        return 1-pi
-

If we insert $\dataScalar=1$ then the function is equal to π, and if we insert $\dataScalar=0$ then the function is equal to 1 − π. So the function recreates the table for the distribution given above.

-

The probability distribution is named for Jacob Bernoulli, the Swiss mathematician. In his book Ars Conjectandi he considered the distribution and the result of a number of ‘trials’ under the Bernoulli distribution to form the binomial distribution. Below is the page where he considers Pascal’s triangle in forming combinations of the Bernoulli distribution to realise the binomial distribution for the outcome of positive trials.

-
+
-

Figure: Jacob Bernoulli described the Bernoulli distribution through an urn in which there are black and red balls.

+

Figure: Jacob Bernoulli described the Bernoulli distribution through +an urn in which there are black and red balls.

-

Thomas Bayes also described the Bernoulli distribution, only he didn’t refer to Jacob Bernoulli’s work, so he didn’t call it by that name. He described the distribution in terms of a table (think of a billiard table) and two balls. Bayes suggests that each ball can be rolled across the table such that it comes to rest at a position that is uniformly distributed between the sides of the table.

-

Let’s assume that the first ball is rolled, and that it comes to reset at a position that is π times the width of the table from the left hand side.

-

Now, we roll the second ball. We are interested if the second ball ends up on the left side (+ve result) or the right side (-ve result) of the first ball. We use the Bernoulli distribution to determine this.

-

For this reason in Bayes’s distribution there is considered to be aleatoric uncertainty about the distribution parameter.

+

Thomas Bayes also described the Bernoulli distribution, only he +didn’t refer to Jacob Bernoulli’s work, so he didn’t call it by that +name. He described the distribution in terms of a table (think of a +billiard table) and two balls. Bayes suggests that each ball +can be rolled across the table such that it comes to rest at a position +that is uniformly distributed between the sides of the +table.

+

Let’s assume that the first ball is rolled, and that it comes to rest at a position that is \(\pi\) times the width of the table from the left-hand side.

+

Now, we roll the second ball. We are interested in whether the second ball ends up on the left side (+ve result) or the right side (-ve result) of the first ball. We use the Bernoulli distribution to determine this.

+

For this reason in Bayes’s distribution there is considered to be +aleatoric uncertainty about the distribution parameter.

+
-

Figure: Thomas Bayes described the Bernoulli distribution independently of Jacob Bernoulli. He used the analogy of a billiard table. Any ball on the table is given a uniformly random position between the left and right side of the table. The first ball (in the figure) gives the parameter of the Bernoulli distribution. The second ball (in the figure) gives the outcome as either left or right (relative to the first ball). This is the origin of the term Bayesian because the parameter of the distribution is drawn from a probsbility.

-
-
-

Maximum Likelihood in the Bernoulli

-

[edit]

-

Maximum likelihood in the Bernoulli distribution is straightforward. Let’s assume we have data, $\dataVector$ which consists of a vector of binary values of length n. If we assume each value was sampled independently from the Bernoulli distribution, conditioned on the parameter π then our joint probability density has the form
$$ -p(\dataVector|\pi) = \prod_{i=1}^{\numData} \pi^{\dataScalar_i} (1-\pi)^{1-\dataScalar_i}. -$$
As normal in maximum likelihood we consider the negative log likelihood as our objective,
$$\begin{align*} - \errorFunction(\pi)& = -\log p(\dataVector|\pi)\\ - & = -\sum_{i=1}^{\numData} \dataScalar_i \log \pi - \sum_{i=1}^{\numData} (1-\dataScalar_i) \log(1-\pi), - \end{align*}$$

-

and we can derive the gradient with respect to the parameter π.
$$\frac{\text{d}\errorFunction(\pi)}{\text{d}\pi} = -\frac{\sum_{i=1}^{\numData} \dataScalar_i}{\pi} + \frac{\sum_{i=1}^{\numData} (1-\dataScalar_i)}{1-\pi},$$

-

and as normal we look for a stationary point for the log likelihood by setting this derivative to zero,
$$0 = -\frac{\sum_{i=1}^{\numData} \dataScalar_i}{\pi} + \frac{\sum_{i=1}^{\numData} (1-\dataScalar_i)}{1-\pi},$$
rearranging we form
$$(1-\pi)\sum_{i=1}^{\numData} \dataScalar_i = \pi\sum_{i=1}^{\numData} (1-\dataScalar_i),$$
which implies
$$\sum_{i=1}^{\numData} \dataScalar_i = \pi\left(\sum_{i=1}^{\numData} (1-\dataScalar_i) + \sum_{i=1}^{\numData} \dataScalar_i\right),$$

-

and now we recognise that $\sum_{i=1}^{\numData} (1-\dataScalar_i) + \sum_{i=1}^{\numData} \dataScalar_i = \numData$ so we have
$$\pi = \frac{\sum_{i=1}^{\numData} \dataScalar_i}{\numData}$$

-

so in other words we estimate the probability associated with the Bernoulli by setting it to the number of observed positives, divided by the total length of $\dataScalar$. This makes intiutive sense. If I asked you to estimate the probability of a coin being heads, and you tossed the coin 100 times, and recovered 47 heads, then the estimate of the probability of heads should be $\frac{47}{100}$.

+

Figure: Thomas Bayes described the Bernoulli distribution independently of Jacob Bernoulli. He used the analogy of a billiard table. Any ball on the table is given a uniformly random position between the left and right side of the table. The first ball (in the figure) gives the parameter of the Bernoulli distribution. The second ball (in the figure) gives the outcome as either left or right (relative to the first ball). This is the origin of the term Bayesian because the parameter of the distribution is drawn from a probability.


Maximum Likelihood in the +Bernoulli

+
+[edit] +
+

Maximum likelihood in the Bernoulli distribution is straightforward. +Let’s assume we have data, \(\mathbf{ +y}\) which consists of a vector of binary values of length \(n\). If we assume each value was sampled +independently from the Bernoulli distribution, conditioned on the +parameter \(\pi\) then our joint +probability density has the form \[ +p(\mathbf{ y}|\pi) = \prod_{i=1}^{n} \pi^{y_i} (1-\pi)^{1-y_i}. +\] As normal in maximum likelihood we consider the negative log +likelihood as our objective, \[\begin{align*} + E(\pi)& = -\log p(\mathbf{ y}|\pi)\\ + & = -\sum_{i=1}^{n} y_i \log \pi - +\sum_{i=1}^{n} (1-y_i) \log(1-\pi), + \end{align*}\]

+

and we can derive the gradient with respect to the parameter \(\pi\). \[\frac{\text{d}E(\pi)}{\text{d}\pi} = +-\frac{\sum_{i=1}^{n} y_i}{\pi} + \frac{\sum_{i=1}^{n} +(1-y_i)}{1-\pi},\]

+

and as normal we look for a stationary point for the log likelihood +by setting this derivative to zero, \[0 = +-\frac{\sum_{i=1}^{n} y_i}{\pi} + \frac{\sum_{i=1}^{n} +(1-y_i)}{1-\pi},\] rearranging we form \[(1-\pi)\sum_{i=1}^{n} y_i = \pi\sum_{i=1}^{n} +(1-y_i),\] which implies \[\sum_{i=1}^{n} y_i = \pi\left(\sum_{i=1}^{n} +(1-y_i) + \sum_{i=1}^{n} y_i\right),\]

+

and now we recognise that \(\sum_{i=1}^{n} +(1-y_i) + \sum_{i=1}^{n} y_i = n\) so we have \[\pi = \frac{\sum_{i=1}^{n} y_i}{n}\]

+

So, in other words, we estimate the probability associated with the Bernoulli by setting it to the number of observed positives divided by the total length of \(\mathbf{ y}\). This makes intuitive sense. If I asked you to estimate the probability of a coin coming up heads, and you tossed the coin 100 times and recovered 47 heads, then the estimate of the probability of heads should be \(\frac{47}{100}\).
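
As a quick numerical check (not part of the original notes), the maximum likelihood estimate is just the sample proportion of positives, which for the coin example gives 0.47.

import numpy as np

y = np.array([1]*47 + [0]*53)   # 47 heads in 100 tosses
pi_mle = y.sum() / len(y)       # maximum likelihood estimate of pi
print(pi_mle)                   # 0.47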

Exercise 4

-

Show that the maximum likelihood solution we have found is a minimum for our objective.

-


Show that the maximum likelihood solution we have found is a minimum for our objective.

\[
\text{posterior} = \frac{\text{likelihood}\times\text{prior}}{\text{marginal likelihood}}
\]

Four components:

  1. Prior distribution
  2.

@@ -410,276 +985,655 @@

    Exercise 4

  3. Marginal likelihood
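
As a toy numerical illustration (not in the original notes) of how these components combine for a parameter taking one of two values:

import numpy as np

prior = np.array([0.5, 0.5])            # p(theta) for theta in {0, 1}
likelihood = np.array([0.2, 0.8])       # p(data | theta) for each value of theta
marginal = np.sum(likelihood * prior)   # p(data), the marginal likelihood
posterior = likelihood * prior / marginal
print(posterior)                        # [0.2 0.8]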

Naive Bayes Classifiers

-

[edit]

-

Note: Everything we do below is possible using standard packages like scikit-learn, our purpose in this session is to help you understand how those engines are constructed. In practice for an application you should use a library like scikit-learn.

-

In probabilistic machine learning we place probability distributions (or densities) over all the variables of interest, our first classification algorithm will do just that. We will consider how to form a classification by making assumptions about the joint density of our observations. We need to make assumptions to reduce the number of parameters we need to optimise.

-

In the ideal world, given label data $\dataVector$ and the inputs $\inputMatrix$ we should be able to specify the joint density of all potential values of $\dataVector$ and $\inputMatrix$, $p(\dataVector, \inputMatrix)$. If $\inputMatrix$ and $\dataVector$ are our training data, and we can somehow extend our density to incorporate future test data (by augmenting $\dataVector$ with a new observation $\dataScalar^*$ and $\inputMatrix$ with the corresponding inputs, $\inputVector^*$), then we can answer any given question about a future test point $\dataScalar^*$ given its covariates $\inputVector^*$ by conditioning on the training variables to recover,
$$ -p(\dataScalar^*|\inputMatrix, \dataVector, \inputVector^*), -$$

-

We can compute this distribution using the product and sum rules. However, to specify this density we must give the probability associated with all possible combinations of $\dataVector$ and $\inputMatrix$. There are $2^{\numData}$ possible combinations for the vector $\dataVector$ and the probability for each of these combinations must be jointly specified along with the joint density of the matrix $\inputMatrix$, as well as being able to extend the density for any chosen test location $\inputVector^*$.

-

In naive Bayes we make certain simplifying assumptions that allow us to perform all of the above in practice.

-

Data Conditional Independence

-

If we are given model parameters $\paramVector$ we assume that conditioned on all these parameters that all data points in the model are independent. In other words we have,
$$ - p(\dataScalar^*, \inputVector^*, \dataVector, \inputMatrix|\paramVector) = p(\dataScalar^*, \inputVector^*|\paramVector)\prod_{i=1}^{\numData} p(\dataScalar_i, \inputVector_i | \paramVector). - $$
This is a conditional independence assumption because we are not assuming our data are purely independent. If we were to assume that, then there would be nothing to learn about our test data given our training data. We are assuming that they are independent given our parameters, $\paramVector$. We made similar assumptions for regression, where our parameter set included $\mappingVector$ and $\dataStd^2$. Given those parameters we assumed that the density over $\dataVector, \dataScalar^*$ was independent. Here we are going a little further with that assumption because we are assuming the joint density of $\dataVector$ and $\inputMatrix$ is independent across the data given the parameters.

-

Computing posterior distribution in this case becomes easier, this is known as the ‘Bayes classifier’.

-

Feature Conditional Independence

-


$$ -p(\inputVector_i | \dataScalar_i, \paramVector) = \prod_{j=1}^{\dataDim} p(\inputScalar_{i,j}|\dataScalar_i, \paramVector) -$$
where $\dataDim$ is the dimensionality of our inputs.

-

The assumption that is particular to naive Bayes is to now consider that the features are also conditionally independent, but not only given the parameters. We assume that the features are independent given the parameters and the label. So for each data point we have
$$p(\inputVector_i | \dataScalar_i, \paramVector) = \prod_{j=1}^{\dataDim} p(\inputScalar_{i,j}|\dataScalar_i,\paramVector)$$
where $\dataDim$ is the dimensionality of our inputs.

-

Marginal Density for $\dataScalar_i$

-


$$ -p(\inputScalar_{i,j},\dataScalar_i| \paramVector) = p(\inputScalar_{i,j}|\dataScalar_i, \paramVector)p(\dataScalar_i). -$$

-

We now have nearly all of the components we need to specify the full joint density. However, the feature conditional independence doesn’t yet give us the joint density over $p(\dataScalar_i, \inputVector_i)$ which is required to subsitute in to our data conditional independence to give us the full density. To recover the joint density given the conditional distribution of each feature, $p(\inputScalar_{i,j}|\dataScalar_i, \paramVector)$, we need to make use of the product rule and combine it with a marginal density for $\dataScalar_i$,

-


$$p(\inputScalar_{i,j},\dataScalar_i| \paramVector) = p(\inputScalar_{i,j}|\dataScalar_i, \paramVector)p(\dataScalar_i).$$
Because $\dataScalar_i$ is binary the Bernoulli density makes a suitable choice for our prior over $\dataScalar_i$,
$$p(\dataScalar_i|\pi) = \pi^{\dataScalar_i} (1-\pi)^{1-\dataScalar_i}$$
where π now has the interpretation as being the prior probability that the classification should be positive.

-

Joint Density for Naive Bayes

-

This allows us to write down the full joint density of the training data,
$$ - p(\dataVector, \inputMatrix|\paramVector, \pi) = \prod_{i=1}^{\numData} \prod_{j=1}^{\dataDim} p(\inputScalar_{i,j}|\dataScalar_i, \paramVector)p(\dataScalar_i|\pi) - $$

-

which can now be fit by maximum likelihood. As normal we form our objective as the negative log likelihood,

-


$$\begin{align*} -\errorFunction(\paramVector, \pi)& = -\log p(\dataVector, \inputMatrix|\paramVector, \pi) \\ &= -\sum_{i=1}^{\numData} \sum_{j=1}^{\dataDim} \log p(\inputScalar_{i, j}|\dataScalar_i, \paramVector) - \sum_{i=1}^{\numData} \log p(\dataScalar_i|\pi), -\end{align*}$$
which we note decomposes into two objective functions, one which is dependent on π alone and one which is dependent on $\paramVector$ alone so we have,
$$ -\errorFunction(\pi, \paramVector) = \errorFunction(\paramVector) + \errorFunction(\pi). -$$
Since the two objective functions are separately dependent on the parameters π and $\paramVector$ we can minimize them independently. Firstly, minimizing the Bernoulli likelihood over the labels we have,
$$ -\errorFunction(\pi) = -\sum_{i=1}^{\numData}\log p(\dataScalar_i|\pi) = -\sum_{i=1}^{\numData} \dataScalar_i \log \pi - \sum_{i=1}^{\numData} (1-\dataScalar_i) \log (1-\pi) -$$
which we already minimized above recovering
$$ -\pi = \frac{\sum_{i=1}^{\numData} \dataScalar_i}{\numData}. -$$

-

We now need to minimize the objective associated with the conditional distributions for the features,
$$ -\errorFunction(\paramVector) = -\sum_{i=1}^{\numData} \sum_{j=1}^{\dataDim} \log p(\inputScalar_{i, j} |\dataScalar_i, \paramVector), -$$
which necessarily implies making some assumptions about the form of the conditional distributions. The right assumption will depend on the nature of our input data. For example, if we have an input which is real valued, we could use a Gaussian density and we could allow the mean and variance of the Gaussian to be different according to whether the class was positive or negative and according to which feature we were measuring. That would give us the form,
$$ -p(\inputScalar_{i, j} | \dataScalar_i,\paramVector) = \frac{1}{\sqrt{2\pi \dataStd_{\dataScalar_i,j}^2}} \exp \left(-\frac{(\inputScalar_{i,j} - \mu_{\dataScalar_i, j})^2}{\dataStd_{\dataScalar_i,j}^2}\right), -$$
where $\dataStd_{1, j}^2$ is the variance of the density for the jth output and the class $\dataScalar_i=1$ and $\dataStd_{0, j}^2$ is the variance if the class is 0. The means can vary similarly. Our parameters, $\paramVector$ would consist of all the means and all the variances for the different dimensions.

-

As normal we form our objective as the negative log likelihood,
$$ -\errorFunction(\paramVector, \pi) = -\log p(\dataVector, \inputMatrix|\paramVector, \pi) = -\sum_{i=1}^{\numData} \sum_{j=1}^{\dataDim} \log p(\inputScalar_{i, j}|\dataScalar_i, \paramVector) - \sum_{i=1}^{\numData} \log p(\dataScalar_i|\pi), -$$
which we note decomposes into two objective functions, one which is dependent on π alone and one which is dependent on $\paramVector$ alone so we have,
$$ -\errorFunction(\pi, \paramVector) = \errorFunction(\paramVector) + \errorFunction(\pi). -$$

-

Nigerian NMIS Data

-

[edit]

-

First we will load in the Nigerian NMIS health data. Our aim will be to predict whether a center has maternal health delivery services given the attributes in the data. We will predict of the number of nurses, the number of doctors, location etc.

-

Let’s first remind ourselves of the data.

-
data.head()
-

Now we will convert this data into a form which we can use as inputs X, and labels y.

-
import pandas as pd
-import numpy as np
-
data = data[~pd.isnull(data['maternal_health_delivery_services'])]
-data = data.dropna() # Remove entries with missing values
-X = data[['emergency_transport',
-          'num_chews_fulltime', 
-          'phcn_electricity',
-          'child_health_measles_immun_calc',
-          'num_nurses_fulltime',
-          'num_doctors_fulltime', 
-          'improved_water_supply', 
-          'improved_sanitation',
-          'antenatal_care_yn', 
-          'family_planning_yn',
-          'malaria_treatment_artemisinin', 
-          'latitude', 
-          'longitude']].copy()
-y = data['maternal_health_delivery_services']==True  # set label to be whether there's a maternal health delivery service
-
-# Create series of health center types with the relevant index
-s = data['facility_type_display'].apply(pd.Series, 1).stack() 
-s.index = s.index.droplevel(-1) # to line up with df's index
-
-# Extract from the series the unique list of types.
-types = s.unique()
-
-# For each type extract the indices where it is present and add a column to X
-type_names = []
-for htype in types:
-    index = s[s==htype].index.tolist()
-    type_col=htype.replace(' ', '_').replace('/','-').lower()
-    type_names.append(type_col)
-    X.loc[:, type_col] = 0.0 
-    X.loc[index, type_col] = 1.0
-

This has given us a new data frame X which contains the different facility types in different columns.

-
X.describe()
+
+[edit] +
+

Note: Everything we do below is possible using standard packages like scikit-learn; our purpose in this session is to help you understand how those engines are constructed. In practice, for an application, you should use a library like scikit-learn.

+

In probabilistic machine learning we place probability distributions (or densities) over all the variables of interest; our first classification algorithm will do just that. We will consider how to form a classification by making assumptions about the joint density of our observations. We need to make assumptions to reduce the number of parameters we need to optimise.

+

In the ideal world, given label data \(\mathbf{ y}\) and the inputs \(\mathbf{X}\) we should be able to specify +the joint density of all potential values of \(\mathbf{ y}\) and \(\mathbf{X}\), \(p(\mathbf{ y}, \mathbf{X})\). If \(\mathbf{X}\) and \(\mathbf{ y}\) are our training data, and we +can somehow extend our density to incorporate future test data (by +augmenting \(\mathbf{ y}\) with a new +observation \(y^*\) and \(\mathbf{X}\) with the corresponding inputs, +\(\mathbf{ x}^*\)), then we can answer +any given question about a future test point \(y^*\) given its covariates \(\mathbf{ x}^*\) by conditioning on the +training variables to recover, \[ +p(y^*|\mathbf{X}, \mathbf{ y}, \mathbf{ x}^*), +\]

+

We can compute this distribution using the product and sum rules. +However, to specify this density we must give the probability associated +with all possible combinations of \(\mathbf{ +y}\) and \(\mathbf{X}\). There +are \(2^{n}\) possible combinations for +the vector \(\mathbf{ y}\) and the +probability for each of these combinations must be jointly specified +along with the joint density of the matrix \(\mathbf{X}\), as well as being able to +extend the density for any chosen test location \(\mathbf{ x}^*\).

+

In naive Bayes we make certain simplifying assumptions that allow us +to perform all of the above in practice.

+

Data Conditional +Independence

+

If we are given model parameters \(\boldsymbol{ \theta}\) we assume that +conditioned on all these parameters that all data points in the model +are independent. In other words we have, \[ + p(y^*, \mathbf{ x}^*, \mathbf{ y}, \mathbf{X}|\boldsymbol{ \theta}) = +p(y^*, \mathbf{ x}^*|\boldsymbol{ \theta})\prod_{i=1}^{n} p(y_i, +\mathbf{ x}_i | \boldsymbol{ \theta}). + \] This is a conditional independence assumption because we are +not assuming our data are purely independent. If we were to assume that, +then there would be nothing to learn about our test data given our +training data. We are assuming that they are independent given +our parameters, \(\boldsymbol{ +\theta}\). We made similar assumptions for regression, where our +parameter set included \(\mathbf{ w}\) +and \(\sigma^2\). Given those +parameters we assumed that the density over \(\mathbf{ y}, y^*\) was +independent. Here we are going a little further with that +assumption because we are assuming the joint density of \(\mathbf{ y}\) and \(\mathbf{X}\) is independent across the data +given the parameters.

+

Computing the posterior distribution in this case becomes easier; this is known as the ‘Bayes classifier’.

+

Feature Conditional +Independence

+

\[ +p(\mathbf{ x}_i | y_i, \boldsymbol{ \theta}) = \prod_{j=1}^{p} +p(x_{i,j}|y_i, \boldsymbol{ \theta}) +\] where \(p\) is the +dimensionality of our inputs.

+

The assumption that is particular to naive Bayes is to now consider +that the features are also conditionally independent, but not +only given the parameters. We assume that the features are independent +given the parameters and the label. So for each data point we +have \[p(\mathbf{ x}_i | y_i, \boldsymbol{ +\theta}) = \prod_{j=1}^{p} p(x_{i,j}|y_i,\boldsymbol{ \theta})\] +where \(p\) is the dimensionality of +our inputs.

+

Marginal Density for \(y_i\)

+

\[ +p(x_{i,j},y_i| \boldsymbol{ \theta}) = p(x_{i,j}|y_i, \boldsymbol{ +\theta})p(y_i). +\]

+

We now have nearly all of the components we need to specify the full joint density. However, the feature conditional independence doesn’t yet give us the joint density \(p(y_i, \mathbf{ x}_i)\), which is required to substitute into our data conditional independence to give us the full density. To recover the joint density given the conditional distribution of each feature, \(p(x_{i,j}|y_i, \boldsymbol{ \theta})\), we need to make use of the product rule and combine it with a marginal density for \(y_i\),

+

\[p(x_{i,j},y_i| \boldsymbol{ \theta}) = +p(x_{i,j}|y_i, \boldsymbol{ \theta})p(y_i).\] Because \(y_i\) is binary the Bernoulli +density makes a suitable choice for our prior over \(y_i\), \[p(y_i|\pi) = \pi^{y_i} (1-\pi)^{1-y_i}\] +where \(\pi\) now has the +interpretation as being the prior probability that the +classification should be positive.

+

Joint Density for Naive +Bayes

+

This allows us to write down the full joint density of the training +data, \[ + p(\mathbf{ y}, \mathbf{X}|\boldsymbol{ \theta}, \pi) = \prod_{i=1}^{n} +\prod_{j=1}^{p} p(x_{i,j}|y_i, \boldsymbol{ \theta})p(y_i|\pi) + \]

+

which can now be fit by maximum likelihood. As normal we form our +objective as the negative log likelihood,

+

\[\begin{align*} +E(\boldsymbol{ \theta}, \pi)& = -\log p(\mathbf{ y}, +\mathbf{X}|\boldsymbol{ \theta}, \pi) \\ &= -\sum_{i=1}^{n} +\sum_{j=1}^{p} \log p(x_{i, j}|y_i, \boldsymbol{ \theta}) +- \sum_{i=1}^{n} \log p(y_i|\pi), +\end{align*}\] which we note decomposes into two +objective functions, one which is dependent on \(\pi\) alone and one which is dependent on +\(\boldsymbol{ \theta}\) alone so we +have, \[ +E(\pi, \boldsymbol{ \theta}) = E(\boldsymbol{ \theta}) + E(\pi). +\] Since the two objective functions are separately dependent on +the parameters \(\pi\) and \(\boldsymbol{ \theta}\) we can minimize them +independently. Firstly, minimizing the Bernoulli likelihood over the +labels we have, \[ +E(\pi) = -\sum_{i=1}^{n}\log p(y_i|\pi) = -\sum_{i=1}^{n} y_i \log \pi - +\sum_{i=1}^{n} (1-y_i) \log (1-\pi) +\] which we already minimized above recovering \[ +\pi = \frac{\sum_{i=1}^{n} y_i}{n}. +\]

+

We now need to minimize the objective associated with the conditional distributions for the features, \[
E(\boldsymbol{ \theta}) = -\sum_{i=1}^{n} \sum_{j=1}^{p} \log p(x_{i, j} |y_i, \boldsymbol{ \theta}),
\] which necessarily implies making some assumptions about the form of the conditional distributions. The right assumption will depend on the nature of our input data. For example, if we have an input which is real valued, we could use a Gaussian density and we could allow the mean and variance of the Gaussian to be different according to whether the class was positive or negative and according to which feature we were measuring. That would give us the form, \[
p(x_{i, j} | y_i,\boldsymbol{ \theta}) = \frac{1}{\sqrt{2\pi \sigma_{y_i,j}^2}} \exp \left(-\frac{(x_{i,j} - \mu_{y_i, j})^2}{2\sigma_{y_i,j}^2}\right),
\] where \(\sigma_{1, j}^2\) is the variance of the density for the \(j\)th input and the class \(y_i=1\), and \(\sigma_{0, j}^2\) is the variance if the class is 0. The means can vary similarly. Our parameters, \(\boldsymbol{ \theta}\), would consist of all the means and all the variances for the different dimensions.

+

As normal we form our objective as the negative log likelihood, \[ +E(\boldsymbol{ \theta}, \pi) = -\log p(\mathbf{ y}, +\mathbf{X}|\boldsymbol{ \theta}, \pi) = -\sum_{i=1}^{n} \sum_{j=1}^{p} +\log p(x_{i, j}|y_i, \boldsymbol{ \theta}) - \sum_{i=1}^{n} \log +p(y_i|\pi), +\] which we note decomposes into two objective +functions, one which is dependent on \(\pi\) alone and one which is dependent on +\(\boldsymbol{ \theta}\) alone so we +have, \[ +E(\pi, \boldsymbol{ \theta}) = E(\boldsymbol{ \theta}) + E(\pi). +\]

+

Nigeria NMIS Data +Classification

+
+[edit] +
+

Our aim will be to predict whether a center has maternal health delivery services given the attributes in the data. We will predict it from the number of nurses, the number of doctors, location, etc.

+

Now we will convert this data into a form which we can use as inputs +X, and labels y.

+
import pandas as pd
+import numpy as np
+
data = data[~pd.isnull(data['maternal_health_delivery_services'])]
+data = data.dropna() # Remove entries with missing values
+X = data[['emergency_transport',
+          'num_chews_fulltime', 
+          'phcn_electricity',
+          'child_health_measles_immun_calc',
+          'num_nurses_fulltime',
+          'num_doctors_fulltime', 
+          'improved_water_supply', 
+          'improved_sanitation',
+          'antenatal_care_yn', 
+          'family_planning_yn',
+          'malaria_treatment_artemisinin', 
+          'latitude', 
+          'longitude']].copy()
+y = data['maternal_health_delivery_services']==True  # set label to be whether there's a maternal health delivery service
+
+# Create series of health center types with the relevant index
+s = data['facility_type_display'].apply(pd.Series, 1).stack() 
+s.index = s.index.droplevel(-1) # to line up with df's index
+
+# Extract from the series the unique list of types.
+types = s.unique()
+
+# For each type extract the indices where it is present and add a column to X
+type_names = []
+for htype in types:
+    index = s[s==htype].index.tolist()
+    type_col=htype.replace(' ', '_').replace('/','-').lower()
+    type_names.append(type_col)
+    X.loc[:, type_col] = 0.0 
+    X.loc[index, type_col] = 1.0
+

This has given us a new data frame X which contains the +different facility types in different columns.

+
X.describe()

Naive Bayes NMIS

-

[edit]

-

We can now specify the naive Bayes model. For the genres we want to model the data as Bernoulli distributed, and for the year and body count we want to model the data as Gaussian distributed. We set up two data frames to contain the parameters for the rows and the columns below.

-
# assume data is binary or real.
-# this list encodes whether it is binary or real (1 for binary, 0 for real)
-binary_columns = ['emergency_transport',
-          'phcn_electricity',
-          'child_health_measles_immun_calc',
-          'improved_water_supply', 
-          'improved_sanitation',
-          'antenatal_care_yn', 
-          'family_planning_yn',
-          'malaria_treatment_artemisinin'] + type_names
-real_columns = ['num_chews_fulltime', 
-                'num_nurses_fulltime', 
-                'num_doctors_fulltime', 
-                'latitude', 
-                'longitude']
-Bernoulli = pd.DataFrame(data=np.zeros((2,len(binary_columns))), columns=binary_columns, index=['theta_0', 'theta_1'])
-Gaussian = pd.DataFrame(data=np.zeros((4,len(real_columns))), columns=real_columns, index=['mu_0', 'sigma2_0', 'mu_1', 'sigma2_1'])
-

Now we have the data in a form ready for analysis, let’s construct our data matrix.

-
num_train = 20000
-indices = np.random.permutation(X.shape[0])
-train_indices = indices[:num_train]
-test_indices = indices[num_train:]
-X_train = X.iloc[train_indices]
-y_train = y.iloc[train_indices]==True
-X_test = X.iloc[test_indices]
-y_test = y.iloc[test_indices]==True
-

And we can now train the model. For each feature we can make the fit independently. The fit is given by either counting the number of positives (for binary data) which gives us the maximum likelihood solution for the Bernoulli. Or by computing the empirical mean and variance of the data for the Gaussian, which also gives us the maximum likelihood solution.

-
for column in X_train:
-    if column in Gaussian:
-        Gaussian[column]['mu_0'] = X_train[column][~y_train].mean()
-        Gaussian[column]['mu_1'] = X_train[column][y_train].mean()
-        Gaussian[column]['sigma2_0'] = X_train[column][~y_train].var(ddof=0)
-        Gaussian[column]['sigma2_1'] = X_train[column][y_train].var(ddof=0)
-    if column in Bernoulli:
-        Bernoulli[column]['theta_0'] = X_train[column][~y_train].sum()/(~y_train).sum()
-        Bernoulli[column]['theta_1'] = X_train[column][y_train].sum()/(y_train).sum()
-

We can examine the nature of the distributions we’ve fitted to the model by looking at the entries in these data frames.

-
Bernoulli
-

The distributions show the parameters of the independent class conditional probabilities for no maternity services. It is a Bernoulli distribution with the parameter, π, given by (theta_0) for the facilities without maternity services and theta_1 for the facilities with maternity services. The parameters whow that, facilities with maternity services also are more likely to have other services such as grid electricity, emergency transport, immunization programs etc.

-

The naive Bayes assumption says that the joint probability for these services is given by the product of each of these Bernoulli distributions.

-
Gaussian
-

We have modelled the numbers in our table with a Gaussian density. Since several of these numbers are counts, a more appropriate distribution might be the Poisson distribution. But here we can see that the average number of nurses, healthworkers and doctors is higher in the facilities with maternal services (mu_1) than those without maternal services (mu_0). There is also a small difference between the mean latitude and longitudes. However, the standard deviation which would be given by the square root of the variance parameters (sigma_0 and sigma_1) is large, implying that a difference in latitude and longitude may be due to sampling error. To be sure more analysis would be required.

-

The final model parameter is the prior probability of the positive class, π, which is computed by maximum likelihood.

-
prior = float(y_train.sum())/len(y_train)
-

The prior probability tells us that slightly more facilities have maternity services than those that don’t.

+
+[edit] +
+

We can now specify the naive Bayes model. For the binary features we want to model the data as Bernoulli distributed, and for the real-valued features we want to model the data as Gaussian distributed. We set up two data frames to contain the parameters for the rows and the columns below.

+
# assume data is binary or real.
+# this list encodes whether it is binary or real (1 for binary, 0 for real)
+binary_columns = ['emergency_transport',
+          'phcn_electricity',
+          'child_health_measles_immun_calc',
+          'improved_water_supply', 
+          'improved_sanitation',
+          'antenatal_care_yn', 
+          'family_planning_yn',
+          'malaria_treatment_artemisinin'] + type_names
+real_columns = ['num_chews_fulltime', 
+                'num_nurses_fulltime', 
+                'num_doctors_fulltime', 
+                'latitude', 
+                'longitude']
+Bernoulli = pd.DataFrame(data=np.zeros((2,len(binary_columns))), columns=binary_columns, index=['theta_0', 'theta_1'])
+Gaussian = pd.DataFrame(data=np.zeros((4,len(real_columns))), columns=real_columns, index=['mu_0', 'sigma2_0', 'mu_1', 'sigma2_1'])
+

Now we have the data in a form ready for analysis, let’s construct +our data matrix.

+
num_train = 20000
+indices = np.random.permutation(X.shape[0])
+train_indices = indices[:num_train]
+test_indices = indices[num_train:]
+X_train = X.iloc[train_indices]
+y_train = y.iloc[train_indices]==True
+X_test = X.iloc[test_indices]
+y_test = y.iloc[test_indices]==True
+

And we can now train the model. For each feature we can make the fit independently. The fit is given either by counting the number of positives (for binary data), which gives us the maximum likelihood solution for the Bernoulli, or by computing the empirical mean and variance of the data for the Gaussian, which also gives us the maximum likelihood solution.

+
for column in X_train:
    if column in Gaussian:
        # write with .loc to avoid pandas chained-assignment issues
        Gaussian.loc['mu_0', column] = X_train[column][~y_train].mean()
        Gaussian.loc['mu_1', column] = X_train[column][y_train].mean()
        Gaussian.loc['sigma2_0', column] = X_train[column][~y_train].var(ddof=0)
        Gaussian.loc['sigma2_1', column] = X_train[column][y_train].var(ddof=0)
    if column in Bernoulli:
        Bernoulli.loc['theta_0', column] = X_train[column][~y_train].sum()/(~y_train).sum()
        Bernoulli.loc['theta_1', column] = X_train[column][y_train].sum()/(y_train).sum()
+

We can examine the nature of the distributions we’ve fitted to the +model by looking at the entries in these data frames.

+
Bernoulli
+

The distributions show the parameters of the independent class conditional probabilities. Each is a Bernoulli distribution with parameter \(\pi\) given by theta_0 for the facilities without maternity services and theta_1 for the facilities with maternity services. The parameters show that facilities with maternity services are also more likely to have other services such as grid electricity, emergency transport, immunization programs etc.

+

The naive Bayes assumption says that the joint probability for these +services is given by the product of each of these Bernoulli +distributions.

+
Gaussian
+

We have modelled the numbers in our table with a Gaussian density. Since several of these numbers are counts, a more appropriate distribution might be the Poisson distribution. But here we can see that the average number of nurses, health workers and doctors is higher in the facilities with maternal services (mu_1) than in those without maternal services (mu_0). There is also a small difference between the mean latitudes and longitudes. However, the standard deviation, which would be given by the square root of the variance parameters (sigma2_0 and sigma2_1), is large, implying that a difference in latitude and longitude may be due to sampling error. To be sure, more analysis would be required.

+

The final model parameter is the prior probability of the positive +class, \(\pi\), which is computed by +maximum likelihood.

+
prior = float(y_train.sum())/len(y_train)
+

The prior probability tells us that slightly more facilities have +maternity services than those that don’t.

Making Predictions

-

Naive Bayes has given us the class conditional densities: $p(\inputVector_i | \dataScalar_i, \paramVector)$. To make predictions with these densities we need to form the distribution given by
$$ -P(\dataScalar^*| \dataVector, \inputMatrix, \inputVector^*, \paramVector) -$$
This can be computed by using the product rule. We know that
$$ -P(\dataScalar^*| \dataVector, \inputMatrix, \inputVector^*, \paramVector)p(\dataVector, \inputMatrix, \inputVector^*|\paramVector) = p(\dataScalar*, \dataVector, \inputMatrix, \inputVector^*| \paramVector) -$$
implying that
$$ -P(\dataScalar^*| \dataVector, \inputMatrix, \inputVector^*, \paramVector) = \frac{p(\dataScalar*, \dataVector, \inputMatrix, \inputVector^*| \paramVector)}{p(\dataVector, \inputMatrix, \inputVector^*|\paramVector)} -$$
and we’ve already defined $p(\dataScalar^*, \dataVector, \inputMatrix, \inputVector^*| \paramVector)$ using our conditional independence assumptions above
$$ -p(\dataScalar^*, \dataVector, \inputMatrix, \inputVector^*| \paramVector) = \prod_{j=1}^{\dataDim} p(\inputScalar^*_{j}|\dataScalar^*, \paramVector)p(\dataScalar^*|\pi)\prod_{i=1}^{\numData} \prod_{j=1}^{\dataDim} p(\inputScalar_{i,j}|\dataScalar_i, \paramVector)p(\dataScalar_i|\pi) -$$
The other required density is
$$ -p(\dataVector, \inputMatrix, \inputVector^*|\paramVector) -$$
which can be found from
$$p(\dataScalar^*, \dataVector, \inputMatrix, \inputVector^*| \paramVector)$$
using the sum rule of probability,
$$ -p(\dataVector, \inputMatrix, \inputVector^*|\paramVector) = \sum_{\dataScalar^*=0}^1 p(\dataScalar^*, \dataVector, \inputMatrix, \inputVector^*| \paramVector). -$$
Because of our independence assumptions that is simply equal to
$$ -p(\dataVector, \inputMatrix, \inputVector^*| \paramVector) = \sum_{\dataScalar^*=0}^1 \prod_{j=1}^{\dataDim} p(\inputScalar^*_{j}|\dataScalar^*_i, \paramVector)p(\dataScalar^*|\pi)\prod_{i=1}^{\numData} \prod_{j=1}^{\dataDim} p(\inputScalar_{i,j}|\dataScalar_i, \paramVector)p(\dataScalar_i|\pi). -$$
Substituting both forms in to recover our distribution over the test label conditioned on the training data we have,
$$ -P(\dataScalar^*| \dataVector, \inputMatrix, \inputVector^*, \paramVector) = \frac{\prod_{j=1}^{\dataDim} p(\inputScalar^*_{j}|\dataScalar^*_i, \paramVector)p(\dataScalar^*|\pi)\prod_{i=1}^{\numData} \prod_{j=1}^{\dataDim} p(\inputScalar_{i,j}|\dataScalar_i, \paramVector)p(\dataScalar_i|\pi)}{\sum_{\dataScalar^*=0}^1 \prod_{j=1}^{\dataDim} p(\inputScalar^*_{j}|\dataScalar^*_i, \paramVector)p(\dataScalar^*|\pi)\prod_{i=1}^{\numData} \prod_{j=1}^{\dataDim} p(\inputScalar_{i,j}|\dataScalar_i, \paramVector)p(\dataScalar_i|\pi)} -$$
and we notice that all the terms associated with the training data actually cancel, the test prediction is conditionally independent of the training data given the parameters. This is a result of our conditional independence assumptions over the data points.
$$ -p(\dataScalar^*| \inputVector^*, \paramVector) = \frac{\prod_{j=1}^{\dataDim} p(\inputScalar^*_{j}|\dataScalar^*_i, -\paramVector)p(\dataScalar^*|\pi)}{\sum_{\dataScalar^*=0}^1 \prod_{j=1}^{\dataDim} p(\inputScalar^*_{j}|\dataScalar^*_i, \paramVector)p(\dataScalar^*|\pi)} -$$
This formula is also fairly straightforward to implement. First we implement the log probabilities for the Gaussian density.

-
def log_gaussian(x, mu, sigma2):
-    return -0.5* np.log(2*np.pi*sigma2)-((x-mu)**2)/(2*sigma2)
-

Now for any test point we compute the joint distribution of the Gaussian features by summing their log probabilities. Working in log space can be a considerable advantage over computing the probabilities directly: as the number of features we include goes up, because all the probabilities are less than 1, the joint probability will become smaller and smaller, and may be difficult to represent accurately (or even underflow). Working in log space can ameliorate this problem. We can also compute the log probability for the Bernoulli distribution.

-
def log_bernoulli(x, theta):
-    return x*np.log(theta) + (1-x)*np.log(1-theta)
+

Naive Bayes has given us the class conditional densities: \(p(\mathbf{ x}_i | y_i, \boldsymbol{ \theta})\). To make predictions with these densities we need to form the distribution given by \[
P(y^*| \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*, \boldsymbol{ \theta}).
\] This can be computed by using the product rule. We know that \[
P(y^*| \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*, \boldsymbol{ \theta})p(\mathbf{ y}, \mathbf{X}, \mathbf{ x}^*|\boldsymbol{ \theta}) = p(y^*, \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*| \boldsymbol{ \theta}),
\] implying that \[
P(y^*| \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*, \boldsymbol{ \theta}) = \frac{p(y^*, \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*| \boldsymbol{ \theta})}{p(\mathbf{ y}, \mathbf{X}, \mathbf{ x}^*|\boldsymbol{ \theta})},
\] and we’ve already defined \(p(y^*, \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*| \boldsymbol{ \theta})\) using our conditional independence assumptions above, \[
p(y^*, \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*| \boldsymbol{ \theta}) = \prod_{j=1}^{p} p(x^*_{j}|y^*, \boldsymbol{ \theta})p(y^*|\pi)\prod_{i=1}^{n} \prod_{j=1}^{p} p(x_{i,j}|y_i, \boldsymbol{ \theta})p(y_i|\pi).
\] The other required density is \[
p(\mathbf{ y}, \mathbf{X}, \mathbf{ x}^*|\boldsymbol{ \theta}),
\] which can be found from \(p(y^*, \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*| \boldsymbol{ \theta})\) using the sum rule of probability, \[
p(\mathbf{ y}, \mathbf{X}, \mathbf{ x}^*|\boldsymbol{ \theta}) = \sum_{y^*=0}^1 p(y^*, \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*| \boldsymbol{ \theta}).
\] Because of our independence assumptions that is simply equal to \[
p(\mathbf{ y}, \mathbf{X}, \mathbf{ x}^*| \boldsymbol{ \theta}) = \sum_{y^*=0}^1 \prod_{j=1}^{p} p(x^*_{j}|y^*, \boldsymbol{ \theta})p(y^*|\pi)\prod_{i=1}^{n} \prod_{j=1}^{p} p(x_{i,j}|y_i, \boldsymbol{ \theta})p(y_i|\pi).
\] Substituting both forms in to recover our distribution over the test label conditioned on the training data we have, \[
P(y^*| \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*, \boldsymbol{ \theta}) = \frac{\prod_{j=1}^{p} p(x^*_{j}|y^*, \boldsymbol{ \theta})p(y^*|\pi)\prod_{i=1}^{n} \prod_{j=1}^{p} p(x_{i,j}|y_i, \boldsymbol{ \theta})p(y_i|\pi)}{\sum_{y^*=0}^1 \prod_{j=1}^{p} p(x^*_{j}|y^*, \boldsymbol{ \theta})p(y^*|\pi)\prod_{i=1}^{n} \prod_{j=1}^{p} p(x_{i,j}|y_i, \boldsymbol{ \theta})p(y_i|\pi)},
\] and we notice that all the terms associated with the training data actually cancel: the test prediction is conditionally independent of the training data given the parameters. This is a result of our conditional independence assumptions over the data points, \[
p(y^*| \mathbf{ x}^*, \boldsymbol{ \theta}) = \frac{\prod_{j=1}^{p} p(x^*_{j}|y^*, \boldsymbol{ \theta})p(y^*|\pi)}{\sum_{y^*=0}^1 \prod_{j=1}^{p} p(x^*_{j}|y^*, \boldsymbol{ \theta})p(y^*|\pi)}.
\] This formula is also fairly straightforward to implement. First we implement the log probabilities for the Gaussian density.

+
def log_gaussian(x, mu, sigma2):
+    return -0.5* np.log(2*np.pi*sigma2)-((x-mu)**2)/(2*sigma2)
+

Now for any test point we compute the joint distribution of the +Gaussian features by summing their log probabilities. Working +in log space can be a considerable advantage over computing the +probabilities directly: as the number of features we include goes up, +because all the probabilities are less than 1, the joint probability +will become smaller and smaller, and may be difficult to represent +accurately (or even underflow). Working in log space can ameliorate this +problem. We can also compute the log probability for the Bernoulli +distribution.

+
def log_bernoulli(x, theta):
+    return x*np.log(theta) + (1-x)*np.log(1-theta)
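As an aside (my addition, not part of the original notes), the final division that converts the two summed log probabilities back into a posterior can itself underflow if implemented naively. A minimal numpy sketch of the standard fix, subtracting the maximum before exponentiating, is given below; the function name is just for exposition.

import numpy as np

def posterior_from_logs(log_p_pos, log_p_neg):
    # p(y=1|x) = exp(a)/(exp(a)+exp(b)) computed stably by subtracting
    # the larger of the two log values before exponentiating.
    m = max(log_p_pos, log_p_neg)
    return np.exp(log_p_pos - m)/(np.exp(log_p_pos - m) + np.exp(log_p_neg - m))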

Laplace Smoothing

Before we proceed, let’s just pause and think for a moment what will happen if theta here is either zero or one. This will result in log 0 = −∞ and cause numerical problems. This definitely can happen in practice. If some of the features are rare or very common across the data set then the maximum likelihood solution could find values of zero or one respectively. Such values are problematic because they cause posterior probabilities of class membership of either one or zero. In practice we deal with this using Laplace smoothing (which actually has an interpretation as a Bayesian fit of the Bernoulli distribution). Laplace used an example of the sun rising each day, and a wish to predict the sun rise the following day, to describe his idea of smoothing, which can be found at the bottom of the following page from Laplace’s ‘Essai Philosophique …’

Laplace suggests that when computing the probability of an event where a success or failure is rare (he uses an example of the sun rising across the last 5,000 years, or 1,826,213 days) that even though only successes have been observed (in the sun rising case) the odds for tomorrow shouldn’t be given as \[
\frac{1,826,213}{1,826,213} = 1
\] but rather by adding one to the numerator and two to the denominator, \[
\frac{1,826,213 + 1}{1,826,213 + 2} = 0.99999945.
\] This technique is sometimes called a ‘pseudocount technique’ because it has an interpretation of assuming some observations before you start: it’s as if instead of observing \(\sum_{i}y_i\) successes you have an additional success, \(\sum_{i}y_i + 1\), and instead of having observed \(n\) events you’ve observed \(n + 2\). So we can think of Laplace’s idea as saying (before we start) that we have ‘two observations worth of belief that the odds are 50/50’, because before we start (i.e. when \(n=0\)) our estimate is 0.5, yet because the effective \(n\) is only 2, this estimate is quickly overwhelmed by data. Laplace used ideas like this a lot, and it is known as his ‘principle of insufficient reason’. His idea was that in the absence of knowledge (i.e. before we start) we should assume that all possible outcomes are equally likely. This idea has a modern counterpart, known as the principle of maximum entropy. A lot of the theory of this approach was developed by Ed Jaynes, who according to his erstwhile collaborator and friend, John Skilling, learnt French as an undergraduate by reading the works of Laplace. Although John also related that Jaynes’s spoken French was not up to the standard of his scientific French. For me Ed Jaynes’s work very much carries on the tradition of Laplace into the modern era, in particular his focus on Bayesian approaches. I’m very proud to have met those that knew and worked with him. It turns out that Laplace’s idea also has a Bayesian interpretation (as Laplace understood): it comes from assuming a particular prior density for the parameter \(\pi\), but we won’t explore that interpretation for the moment, and merely choose to estimate the probability as, \[
\pi = \frac{\sum_{i=1}^{n} y_i + 1}{n + 2}
\] to prevent problems with certainty causing numerical issues and misclassifications. Let’s refit the Bernoulli features now.

# fit the Bernoulli with Laplace smoothing.
for column in X_train:
    if column in Bernoulli:
        Bernoulli[column]['theta_0'] = (X_train[column][~y_train].sum() + 1)/((~y_train).sum() + 2)
        Bernoulli[column]['theta_1'] = (X_train[column][y_train].sum() + 1)/((y_train).sum() + 2)

That places us in a position to write the prediction function.

import numpy as np
import pandas as pd

def predict(X_test, Gaussian, Bernoulli, prior):
    # accumulate the log probability of each test point under each class
    log_positive = pd.Series(data=np.zeros(X_test.shape[0]), index=X_test.index)
    log_negative = pd.Series(data=np.zeros(X_test.shape[0]), index=X_test.index)
    for column in X_test.columns:
        if column in Gaussian:
            log_positive += log_gaussian(X_test[column], Gaussian[column]['mu_1'], Gaussian[column]['sigma2_1'])
            log_negative += log_gaussian(X_test[column], Gaussian[column]['mu_0'], Gaussian[column]['sigma2_0'])
        elif column in Bernoulli:
            log_positive += log_bernoulli(X_test[column], Bernoulli[column]['theta_1'])
            log_negative += log_bernoulli(X_test[column], Bernoulli[column]['theta_0'])

    # convert the summed log probabilities into posterior probabilities of the positive class
    v = np.zeros_like(log_positive.values)
    for i in range(X_test.shape[0]):
        v[i] = np.exp(log_positive.values[i] + np.log(prior))/(np.exp(log_positive.values[i] + np.log(prior))
                                                               + np.exp(log_negative.values[i] + np.log(1-prior)))
    return v
    #return np.exp(log_positive + np.log(prior))/(np.exp(log_positive + np.log(prior)) + np.exp(log_negative + np.log(1-prior)))

Now we are in a position to make the predictions for the test data.

p_y = predict(X_test, Gaussian, Bernoulli, prior)

We can test the quality of the predictions in the following way. Firstly, we can threshold our probabilities at 0.5, allocating points with greater than 50% probability of membership of the positive class to the positive class. We can then compare to the true values, and see how many of these values we got correct. This is our total number correct.

correct = y_test.eq(p_y>0.5)
total_correct = sum(correct)
print("Total correct", total_correct, " out of ", len(y_test), "which is", 100*float(total_correct)/len(y_test), "%")

We can also now plot the confusion matrix. A confusion matrix tells us where we are making mistakes. Along the diagonal it stores the true positives, the points that were positive class that we classified correctly, and the true negatives, the points that were negative class and that we classified correctly. The off-diagonal terms contain the false positives and the false negatives. Along the rows of the matrix we place the actual class, and along the columns we place our predicted class.

confusion_matrix = pd.DataFrame(data=np.zeros((2,2)),
                                columns=['predicted no maternity', 'predicted maternity'],
                                index=['actual no maternity','actual maternity'])
confusion_matrix['predicted maternity']['actual maternity'] = (y_test & (p_y>0.5)).sum()
confusion_matrix['predicted maternity']['actual no maternity'] = (~y_test & (p_y>0.5)).sum()
confusion_matrix['predicted no maternity']['actual maternity'] = (y_test & ~(p_y>0.5)).sum()
confusion_matrix['predicted no maternity']['actual no maternity'] = (~y_test & ~(p_y>0.5)).sum()
confusion_matrix
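As a small addition of my own (not in the original notes), the same boolean expressions used to fill the matrix can be reused to compute summary scores such as precision and recall; the sketch below assumes y_test and p_y as computed above.

true_positives = (y_test & (p_y>0.5)).sum()
false_positives = (~y_test & (p_y>0.5)).sum()
false_negatives = (y_test & ~(p_y>0.5)).sum()
# precision: fraction of predicted-maternity facilities that really offer maternity
precision = true_positives/(true_positives + false_positives)
# recall: fraction of maternity facilities that we actually found
recall = true_positives/(true_positives + false_negatives)
print("Precision", precision, "Recall", recall)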

Exercise 5

How can you improve your classification? Are all the features equally valid? Are some features more helpful than others? What happens if you remove features that appear to be less helpful? How might you select such features?

Exercise 6

We have decided to classify positive if the probability of maternity is greater than 0.5. This has led us to accidentally classify some facilities as not having maternity services when in fact they do. Imagine you wish to ensure that a facility handles maternity. With your test set, how low do you have to set the threshold to avoid all the false negatives (i.e. facilities where you predicted there was no maternity, but in actuality there was)?
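One possible way to explore this question numerically (a sketch, not a prescribed solution) is to sweep the threshold downwards and count the false negatives at each value; it assumes y_test and p_y from above.

import numpy as np
# sweep thresholds from 0.5 down to 0 and stop at the first with no false negatives
for threshold in np.linspace(0.5, 0.0, 51):
    false_negatives = (y_test & ~(p_y>threshold)).sum()
    if false_negatives == 0:
        print("No false negatives at threshold", threshold)
        break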

Making Predictions

Naive Bayes has given us the class conditional densities: \(p(\mathbf{ x}_i | y_i, \boldsymbol{ \theta})\). To make predictions with these densities we need to form the distribution given by \[
P(y^*| \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*, \boldsymbol{ \theta})
\]

Exercise 7

Write down the negative log likelihood of the Gaussian density over a vector of variables \(\mathbf{ x}\). Assume independence between each variable. Minimize this objective to obtain the maximum likelihood solution of the form \[
\mu = \frac{\sum_{i=1}^{n} x_i}{n}
\] \[
\sigma^2 = \frac{\sum_{i=1}^{n} (x_i - \mu)^2}{n}
\]
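As a sanity check (not part of the exercise itself), the two formulas can be verified numerically on synthetic data; the sample size and parameters below are arbitrary.

import numpy as np
x = np.random.normal(loc=2.0, scale=3.0, size=10000)
mu_ml = x.sum()/x.shape[0]                      # sum_i x_i / n
sigma2_ml = ((x - mu_ml)**2).sum()/x.shape[0]   # sum_i (x_i - mu)^2 / n
print(mu_ml, sigma2_ml)                         # should be close to 2.0 and 9.0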

If the input data was binary then we could also make use of the Bernoulli distribution for the features. For that case we would have the form, \[
p(x_{i, j} | y_i,\boldsymbol{ \theta}) = \theta_{y_i, j}^{x_{i,j}}(1-\theta_{y_i, j})^{(1-x_{i,j})},
\] where \(\theta_{1, j}\) is the probability that the \(j\)th feature is on if \(y_i\) is 1.

In either case, maximum likelihood fitting would proceed in the same way. The objective has the form, \[
E(\boldsymbol{ \theta}) = -\sum_{j=1}^{p} \sum_{i=1}^{n} \log p(x_{i,j} |y_i, \boldsymbol{ \theta}),
\] and if, as above, the parameters of the distributions are specific to each feature vector (we had means and variances for each continuous feature, and a probability for each binary feature) then we can use the fact that these parameters separate into disjoint subsets across the features to write, \[
\begin{align*}
E(\boldsymbol{ \theta}) &= -\sum_{j=1}^{p} \sum_{i=1}^{n} \log p(x_{i,j} |y_i, \boldsymbol{ \theta}_j)\\
&= \sum_{j=1}^{p} E(\boldsymbol{ \theta}_j),
\end{align*}
\] which means we can minimize our objective on each feature independently.

These characteristics mean that naive Bayes scales very well with big data. To fit the model we consider each feature in turn, we select the positive class and fit parameters for that class, then we select each negative class and fit features for that class. We have code below.

Naive Bayes Summary

Naive Bayes is making very simple assumptions about the data, in particular it is modeling the full joint probability of the data set, \(p(\mathbf{ y}, \mathbf{X}| \boldsymbol{ \theta}, \pi)\), by very strong assumptions about factorizations that are unlikely to be true in practice. The data conditional independence assumption is common, and relies on a rich parameter vector to absorb all the information in the training data. The additional assumption of naive Bayes is that features are conditionally independent given the class label \(y_i\) (and the parameter vector, \(\boldsymbol{ \theta}\)). This is quite a strong assumption. However, it causes the objective function to decompose into parts which can be independently fitted to the different feature vectors, meaning it is very easy to fit the model to large data. It is also clear how we should handle streaming data and missing data. This means that the model can be run ‘live’, adapting parameters and information as it arrives. Indeed, the model is even capable of dealing with new features that might arrive at run time. Such is the strength of modeling the joint probability density. However, the factorization assumption that allows us to do this efficiently is very strong and may lead to poor decision boundaries in practice.

Other Reading

  • Chapter 5 of Rogers and Girolami (2011) up to pg 179 (Section 5.1, and 5.2 up to 5.2.2).

References

Thanks!

For more information on these subjects and more you might want to check the following resources.

Bishop, C.M., 2006. Pattern recognition and machine learning. Springer.

Pearl, J., 1995. From Bayesian networks to causal networks, in: Gammerman, A. (Ed.), Probabilistic Reasoning and Bayesian Belief Networks. Alfred Waller, pp. 1–31.

Rogers, S., Girolami, M., 2011. A first course in machine learning. CRC Press.

Steele, S., Bilchik, A., Eberhardt, J., Kalina, P., Nissan, A., Johnson, E., Avital, I., Stojadinovic, A., 2012. Using machine-learned Bayesian belief networks to predict perioperative risk of Clostridium difficile infection following colon surgery. Interact J Med Res 1, e6. https://doi.org/10.2196/ijmr.2131

The Office of the Senior Special Assistant to the President on the Millennium Development Goals (OSSAP-MDGs), Columbia University, 2014. Nigeria NMIS facility database.
diff --git a/_lectures/04-gaussian-processes.html b/_lectures/04-gaussian-processes.html

---
title: "Gaussian Processes"
venue: "Virtual Data Science Nigeria"
abstract: "Classical machine learning and statistical approaches to learning, such as neural networks and linear regression, assume a parametric form for functions. Gaussian process models are an alternative approach that assumes a probabilistic prior over functions. This brings benefits, in that uncertainty of function estimation is sustained throughout inference, and some challenges: algorithms for fitting Gaussian processes tend to be more complex than parametric models. In this session I will introduce Gaussian processes and explain why sustaining uncertainty is important."
edit_url: https://github.com/mlatcl/dsa/edit/gh-pages/_lamd/gaussian-processes.md
date: 2020-11-13
published: 2020-11-13
time: "15:00 (West Africa Standard Time)"
session: 4
reveal: 04-gaussian-processes.slides.html
transition: None
ipynb: 04-gaussian-processes.ipynb
pptx: 04-gaussian-processes.pptx
layout: lecture
categories:
- notes
---

Setup

notutils

This small package is a helper package for various notebook utilities used below.

The software can be installed using

%pip install notutils

from the command prompt where you can access your python installation.

The code is also available on GitHub: https://github.com/lawrennd/notutils

Once notutils is installed, it can be imported in the usual manner.

import notutils

pods

In Sheffield we created a suite of software tools for ‘Open Data Science’. Open data science is an approach to sharing code, models and data that should make it easier for companies, health professionals and scientists to gain access to data science techniques.

You can also check this blog post on Open Data Science.

The software can be installed using

%pip install pods

from the command prompt where you can access your python installation.

The code is also available on GitHub: https://github.com/lawrennd/ods

Once pods is installed, it can be imported in the usual manner.

import pods

mlai

The mlai software is a suite of helper functions for teaching and demonstrating machine learning algorithms. It was first used in the Machine Learning and Adaptive Intelligence course in Sheffield in 2013.

The software can be installed using

%pip install mlai

from the command prompt where you can access your python installation.

The code is also available on GitHub: https://github.com/lawrennd/mlai

Once mlai is installed, it can be imported in the usual manner.

import mlai
Figure: A key reference for Gaussian process models remains the excellent book “Gaussian Processes for Machine Learning” (Rasmussen and Williams (2006)). The book is also freely available online.

Rasmussen and Williams (2006) is still one of the most important references on Gaussian process models. It is available freely online.

A First Course in Machine Learning

Figure: The main course text is “A First Course in Machine Learning” by Rogers and Girolami (2011).

Example: Prediction of Malaria Incidence in Uganda

Martin Mubangizi, Ricardo Andrade Pacheco, John Quinn

As an example of using Gaussian process models within the full pipeline from data to decision, we’ll consider the prediction of Malaria incidence in Uganda. For the purposes of this study malaria reports come in two forms, HMIS reports from health centres and Sentinel data, which is curated by the WHO. There are limited sentinel sites and many HMIS sites.

The work is from Ricardo Andrade Pacheco’s PhD thesis, completed in collaboration with John Quinn and Martin Mubangizi (Andrade-Pacheco et al., 2014; Mubangizi et al., 2014). John and Martin were initially from the AI-DEV group from the University of Makerere in Kampala and more latterly they were based at UN Global Pulse in Kampala. You can see the work summarized on the UN Global Pulse disease outbreaks project site here.

Malaria data is spatial data. Uganda is split into districts, and health reports can be found for each district. This suggests that models such as conditional random fields could be used for spatial modelling, but there are two complexities with this. First of all, occasionally districts split into two. Secondly, sentinel sites are a specific location within a district, such as Nagongera which is a sentinel site based in the Tororo district.

Figure: Ugandan districts. Data SRTM/NASA from https://dds.cr.usgs.gov/srtm/version2_1.

(Andrade-Pacheco et al., 2014; Mubangizi et al., 2014)

-

The common standard for collecting health data on the African continent is from the Health management information systems (HMIS). However, this data suffers from missing values (Gething et al. 2006) and diagnosis of diseases like typhoid and malaria may be confounded.

+

The common standard for collecting health data on the African +continent is from the Health management information systems (HMIS). +However, this data suffers from missing values (Gething et al., 2006) and diagnosis +of diseases like typhoid and malaria may be confounded.

- + -
+
-

Figure: The Tororo district, where the sentinel site, Nagongera, is located.

+

Figure: The Tororo district, where the sentinel site, Nagongera, is +located.

-

World Health Organization Sentinel Surveillance systems are set up “when high-quality data are needed about a particular disease that cannot be obtained through a passive system”. Several sentinel sites give accurate assessment of malaria disease levels in Uganda, including a site in Nagongera.

+

World +Health Organization Sentinel Surveillance systems are set up “when +high-quality data are needed about a particular disease that cannot be +obtained through a passive system”. Several sentinel sites give accurate +assessment of malaria disease levels in Uganda, including a site in +Nagongera.

Figure: Sentinel and HMIS data along with rainfall and temperature for the Nagongera sentinel station in the Tororo district.

In collaboration with the AI Research Group at Makerere we chose to investigate whether Gaussian process models could be used to assimilate information from these two different sources of disease information. Further, we were interested in whether local information on rainfall and temperature could be used to improve malaria estimates.

The aim of the project was to use WHO Sentinel sites, alongside rainfall and temperature, to improve predictions from HMIS data of levels of malaria.
Figure: The project arose out of the Gaussian process summer school held at Makerere in Kampala in 2013. The school led, in turn, to the Data Science Africa initiative.

Early Warning Systems

Figure: Estimate of the current disease situation in the Kabarole district over time. Estimate is constructed with a Gaussian process with an additive covariance function.

Health monitoring system for the Kabarole district. Here we have fitted the reports with a Gaussian process with an additive covariance function. It has two components, one is a long time scale component (in red above) the other is a short time scale component (in blue).

Monitoring proceeds by considering two aspects of the curve. Is the blue line (the short term report signal) above the red (which represents the long term trend)? If so we have higher than expected reports. If this is the case and the gradient is still positive (i.e. reports are going up) we encode this with a red color. If it is the case and the gradient of the blue line is negative (i.e. reports are going down) we encode this with an amber color. Conversely, if the blue line is below the red and decreasing, we color green. On the other hand if it is below red but increasing, we color yellow.

This gives us an early warning system for disease. Red is a bad situation getting worse, amber is bad, but improving. Green is good and getting better and yellow good but degrading.

Finally, there is a gray region which represents when the scale of the effect is small.
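For readers who want to experiment, a minimal sketch of this kind of additive covariance can be written with GPy. The time axis, the stand-in report data and the lengthscales below are illustrative placeholders, not the actual Kabarole pipeline.

# Sketch of an additive covariance: a long-lengthscale trend plus a
# short-lengthscale component. The data here are synthetic stand-ins.
import numpy as np
import GPy

t = np.linspace(0., 365., 100)[:, None]           # time in days
y = np.sin(t/50.) + 0.1*np.random.randn(100, 1)   # stand-in for case reports

kernel = GPy.kern.RBF(1, lengthscale=100.) + GPy.kern.RBF(1, lengthscale=10.)
model = GPy.models.GPRegression(t, y, kernel)
model.optimize()
mean, variance = model.predict(t)                 # posterior mean and variance of the combined fit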

Figure: The map of Ugandan districts with an overview of the Malaria situation in each district.

These colors can now be observed directly on a spatial map of the districts to give an immediate impression of the current status of the disease across the country.

What is Machine Learning?

What is machine learning? At its most basic level machine learning is a combination of

\[\text{data} + \text{model} \stackrel{\text{compute}}{\rightarrow} \text{prediction}\]

where data is our observations. They can be actively or passively acquired (meta-data). The model contains our assumptions, based on previous experience. That experience can be other data, it can come from transfer learning, or it can merely be our beliefs about the regularities of the universe. In humans our models include our inductive biases. The prediction is an action to be taken or a categorization or a quality score. The reason that machine learning has become a mainstay of artificial intelligence is the importance of predictions in artificial intelligence. The data and the model are combined through computation.

In practice we normally perform machine learning using two functions. To combine data with a model we typically make use of:

a prediction function, which is used to make the predictions. It includes our beliefs about the regularities of the universe, our assumptions about how the world works, e.g., smoothness, spatial similarities, temporal similarities.

an objective function, which defines the ‘cost’ of misprediction. Typically, it includes knowledge about the world’s generating processes (probabilistic objectives) or the costs we pay for mispredictions (empirical risk minimization).

The combination of data and model through the prediction function and the objective function leads to a learning algorithm. The class of prediction functions and objective functions we can make use of is restricted by the algorithms they lead to. If the prediction function or the objective function are too complex, then it can be difficult to find an appropriate learning algorithm. Much of the academic field of machine learning is the quest for new learning algorithms that allow us to bring different types of models and data together.

A useful reference for state of the art in machine learning is the UK Royal Society Report, Machine Learning: Power and Promise of Computers that Learn by Example.

You can also check my blog post on What is Machine Learning?
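As a deliberately small illustration of the two functions (my own, not from the notes), for a straight-line model they might look as follows; the function names are just for exposition.

import numpy as np

def prediction_function(x, m, c):
    # our beliefs about the regularities: a straight line
    return m*x + c

def objective_function(m, c, x, y):
    # the cost of misprediction: here the sum of squared errors
    return ((y - prediction_function(x, m, c))**2).sum()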

Overdetermined System

The challenge with a linear model is that it has two unknowns, \(m\), and \(c\). Observing data allows us to write down a system of simultaneous linear equations. So, for example if we observe two data points, the first with the input value, \(x_1 = 1\) and the output value, \(y_1 = 3\) and a second data point, \(x = 3\), \(y = 1\), then we can write two simultaneous linear equations of the form.

point 1: \(x = 1\), \(y = 3\) \[
3 = m + c
\] point 2: \(x = 3\), \(y = 1\) \[
1 = 3m + c
\]

The solution to these two simultaneous equations can be represented graphically as

Figure: The solution of two linear equations represented as the fit of a straight line through two data points.

The challenge comes when a third data point is observed, and it doesn’t fit on the straight line.

point 3: \(x = 2\), \(y = 2.5\) \[
2.5 = 2m + c
\]

Figure: A third observation of data is inconsistent with the solution dictated by the first two observations.

Now there are three candidate lines, each consistent with our data.

Figure: Three solutions to the problem, each consistent with two points of the three observations.

This is known as an overdetermined system because there are more data than we need to determine our parameters. The problem arises because the model is a simplification of the real world, and the data we observe is therefore inconsistent with our model.

Pierre-Simon Laplace

The solution was proposed by Pierre-Simon Laplace. His idea was to accept that the model was an incomplete representation of the real world, and the way it was incomplete is unknown. His idea was that such unknowns could be dealt with through probability.

Figure: Pierre-Simon Laplace 1749-1827.

Famously, Laplace considered the idea of a deterministic Universe, one in which the model is known, or as the below translation refers to it, “an intelligence which could comprehend all the forces by which nature is animated”. He speculates on an “intelligence” that can submit this vast data to analysis and proposes that such an entity would be able to predict the future.

Given for one instant an intelligence which could comprehend all the forces by which nature is animated and the respective situation of the beings who compose it—an intelligence sufficiently vast to submit these data to analysis—it would embrace in the same formulate the movements of the greatest bodies of the universe and those of the lightest atom; for it, nothing would be uncertain and the future, as the past, would be present in its eyes.

This notion is known as Laplace’s demon or Laplace’s superman.

Figure: Laplace’s determinism in English translation.

-

Unfortunately, most analyses of his ideas stop at that point, whereas his real point is that such a notion is unreachable. Not so much superman as strawman. Just three pages later in the “Philosophical Essay on Probabilities” (Laplace 1814), Laplace goes on to observe:

+

Laplace’s Gremlin

+
+[edit] +
+

Unfortunately, most analyses of his ideas stop at that point, whereas +his real point is that such a notion is unreachable. Not so much +superman as strawman. Just three pages later in the +“Philosophical Essay on Probabilities” (Laplace, 1814), Laplace goes on to +observe:

-

The curve described by a simple molecule of air or vapor is regulated in a manner just as certain as the planetary orbits; the only difference between them is that which comes from our ignorance.

-

Probability is relative, in part to this ignorance, in part to our knowledge.

+

The curve described by a simple molecule of air or vapor is regulated +in a manner just as certain as the planetary orbits; the only difference +between them is that which comes from our ignorance.

+

Probability is relative, in part to this ignorance, in part to our +knowledge.

-
- +
-
+
-

Figure: To Laplace, determinism is a strawman. Ignorance of mechanism and data leads to uncertainty which should be dealt with through probability.

-
-
-

In other words, we can never make use of the idealistic deterministic Universe due to our ignorance about the world, Laplace’s suggestion, and focus in this essay is that we turn to probability to deal with this uncertainty. This is also our inspiration for using probability in machine learning.

-

The “forces by which nature is animated” is our model, the “situation of beings that compose it” is our data and the “intelligence sufficiently vast enough to submit these data to analysis” is our compute. The fly in the ointment is our ignorance about these aspects. And probability is the tool we use to incorporate this ignorance leading to uncertainty or doubt in our predictions.

-

Laplace’s concept was that the reason that the data doesn’t match up to the model is because of unconsidered factors, and that these might be well represented through probability densities. He tackles the challenge of the unknown factors by adding a variable, $\noiseScalar$, that represents the unknown. In modern parlance we would call this a latent variable. But in the context Laplace uses it, the variable is so common that it has other names such as a “slack” variable or the noise in the system.

-

point 1: $\inputScalar = 1$, $\dataScalar=3$
$$ -3 = m + c + \noiseScalar_1 -$$
point 2: $\inputScalar = 3$, $\dataScalar=1$
$$ -1 = 3m + c + \noiseScalar_2 -$$
point 3: $\inputScalar = 2$, $\dataScalar=2.5$
$$ -2.5 = 2m + c + \noiseScalar_3 -$$

-

Laplace’s trick has converted the overdetermined system into an underdetermined system. He has now added three variables, $\{\noiseScalar_i\}_{i=1}^3$, which represent the unknown corruptions of the real world. Laplace’s idea is that we should represent that unknown corruption with a probability distribution.

+

Figure: To Laplace, determinism is a strawman. Ignorance of mechanism +and data leads to uncertainty which should be dealt with through +probability.

+
+
+

In other words, we can never make use of the idealistic deterministic +Universe due to our ignorance about the world, Laplace’s suggestion, and +focus in this essay is that we turn to probability to deal with this +uncertainty. This is also our inspiration for using probability in +machine learning. This is the true message of Laplace’s essay, not +determinism, but the gremlin of uncertainty that emerges from our +ignorance.

+

The “forces by which nature is animated” is our model, the +“situation of beings that compose it” is our data and the +“intelligence sufficiently vast enough to submit these data to analysis” +is our compute. The fly in the ointment is our ignorance about +these aspects. And probability is the tool we use to +incorporate this ignorance leading to uncertainty or doubt in +our predictions.

+

Latent Variables

+
+[edit] +
+

Laplace’s concept was that the reason that the data doesn’t match up +to the model is because of unconsidered factors, and that these might be +well represented through probability densities. He tackles the challenge +of the unknown factors by adding a variable, \(\epsilon\), that represents the unknown. In +modern parlance we would call this a latent variable. But in +the context Laplace uses it, the variable is so common that it has other +names such as a “slack” variable or the noise in the +system.

+

point 1: \(x= 1\), \(y=3\) [ 3 = m + c + _1 ] point 2: \(x= 3\), \(y=1\) [ 1 = 3m + c + _2 ] point 3: \(x= 2\), \(y=2.5\) [ 2.5 = 2m + c + _3 ]

+

Laplace’s trick has converted the overdetermined system into +an underdetermined system. He has now added three variables, +\(\{\epsilon_i\}_{i=1}^3\), which +represent the unknown corruptions of the real world. Laplace’s idea is +that we should represent that unknown corruption with a probability +distribution.
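Although the notes develop the probabilistic treatment next, it may help to see that once the corruptions are treated as independent Gaussian noise, a natural estimate of \(m\) and \(c\) minimizes the squared error over the three points. A minimal numpy sketch of that calculation (my own, not from the notes) is:

import numpy as np
# design matrix with one column for m (the inputs) and one for c (a constant)
X = np.array([[1., 1.], [3., 1.], [2., 1.]])
y = np.array([3., 1., 2.5])
(m, c), residual, rank, sv = np.linalg.lstsq(X, y, rcond=None)
print(m, c)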

A Probabilistic Process

However, it was left to an admirer of Laplace to develop a practical probability density for that purpose. It was Carl Friedrich Gauss who suggested that the Gaussian density (which at the time was unnamed!) should be used to represent this error.

The result is a noisy function, a function which has a deterministic part, and a stochastic part. This type of function is sometimes known as a probabilistic or stochastic process, to distinguish it from a deterministic process.

Two Important Gaussian Properties

The Gaussian density has many important properties, but for the moment we’ll review two of them.

Sum of Gaussians

If we assume that a variable, \(y_i\), is sampled from a Gaussian density,

\[y_i \sim \mathcal{N}\left(\mu_i,\sigma_i^2\right)\]

then we can show that the sum of a set of variables, each drawn independently from such a density, is also distributed as Gaussian. The mean of the resulting density is the sum of the means, and the variance is the sum of the variances,

\[
\sum_{i=1}^{n} y_i \sim \mathcal{N}\left(\sum_{i=1}^n\mu_i,\sum_{i=1}^n\sigma_i^2\right)
\]

Since we are very familiar with the Gaussian density and its properties, it is not immediately apparent how unusual this is. Most random variables, when you add them together, change the family of density they are drawn from; the Gaussian is exceptional in this regard. Indeed, other random variables, if they are independently drawn and summed together, tend to a Gaussian density. That is the central limit theorem, which is a major justification for the use of a Gaussian density.
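A quick numerical illustration of the summing property (not in the original notes), with arbitrary means and variances:

import numpy as np
mus = np.array([1.0, -2.0, 0.5])
sigma2s = np.array([0.5, 1.5, 2.0])
# draw many independent triples and sum each one
samples = np.random.normal(mus, np.sqrt(sigma2s), size=(100000, 3)).sum(axis=1)
print(samples.mean(), samples.var())  # close to -0.5 and 4.0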

Scaling a Gaussian

-

Less unusual is the scaling property of a Gaussian density. If a variable, $\dataScalar$, is sampled from a Gaussian density,

-


$$\dataScalar \sim \gaussianSamp{\meanScalar}{\sigma^2}$$
and we choose to scale that variable by a deterministic value, $\mappingScalar$, then the scaled variable is distributed as

-


$$\mappingScalar \dataScalar \sim \gaussianSamp{\mappingScalar\meanScalar}{\mappingScalar^2 \sigma^2}.$$
Unlike the summing properties, where adding two or more random variables independently sampled from a family of densitites typically brings the summed variable outside that family, scaling many densities leaves the distribution of that variable in the same family of densities. Indeed, many densities include a scale parameter (e.g. the Gamma density) which is purely for this purpose. In the Gaussian the standard deviation, $\dataStd$, is the scale parameter. To see why this makes sense, let’s consider,
$$z \sim \gaussianSamp{0}{1},$$
then if we scale by $\dataStd$ so we have, $\dataScalar=\dataStd z$, we can write,
$$\dataScalar =\dataStd z \sim \gaussianSamp{0}{\dataStd^2}$$

-

Let’s first of all review the properties of the multivariate Gaussian distribution that make linear Gaussian models easier to deal with. We’ll return to the, perhaps surprising, result on the parameters within the nonlinearity, $\parameterVector$, shortly.

-

To work with linear Gaussian models, to find the marginal likelihood all you need to know is the following rules. If
$$ -\dataVector = \mappingMatrix \inputVector + \noiseVector, -$$
where $\dataVector$, $\inputVector$ and $\noiseVector$ are vectors and we assume that $\inputVector$ and $\noiseVector$ are drawn from multivariate Gaussians,
$$ +

Less unusual is the scaling property of a Gaussian density. +If a variable, \(y\), is sampled from a +Gaussian density,

+

\[y\sim +\mathcal{N}\left(\mu,\sigma^2\right)\] and we choose to scale +that variable by a deterministic value, \(w\), then the scaled variable is +distributed as

+

\[wy\sim \mathcal{N}\left(w\mu,w^2 \sigma^2\right).\] Unlike the summing properties, where adding two or more random variables independently sampled from a family of densities typically brings the summed variable outside that family, scaling many densities leaves the distribution of that variable in the same family of densities. Indeed, many densities include a scale parameter (e.g. the Gamma density) which is purely for this purpose. In the Gaussian the standard deviation, \(\sigma\), is the scale parameter. To see why this makes sense, let’s consider, \[z \sim \mathcal{N}\left(0,1\right),\] then if we scale by \(\sigma\) so we have, \(y=\sigma z\), we can write, \[y=\sigma z \sim \mathcal{N}\left(0,\sigma^2\right).\]
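As a quick check of the scaling property, the sketch below (with an assumed value of \(\sigma\)) scales standard normal samples and compares the empirical variance with \(\sigma^2\).

import numpy as np

rng = np.random.default_rng(0)
sigma = 3.0                          # assumed scale, for illustration only
z = rng.standard_normal(100000)      # z ~ N(0, 1)
y = sigma * z                        # y = sigma * z should have variance sigma**2
print(y.var(), sigma**2)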

+

Let’s first of all review the properties of the multivariate Gaussian +distribution that make linear Gaussian models easier to deal with. We’ll +return to the, perhaps surprising, result on the parameters within the +nonlinearity, \(\boldsymbol{ \theta}\), +shortly.

+

To work with linear Gaussian models, to find the marginal likelihood +all you need to know is the following rules. If \[ +\mathbf{ y}= \mathbf{W}\mathbf{ x}+ \boldsymbol{ \epsilon}, +\] where \(\mathbf{ y}\), \(\mathbf{ x}\) and \(\boldsymbol{ \epsilon}\) are vectors and we +assume that \(\mathbf{ x}\) and \(\boldsymbol{ \epsilon}\) are drawn from +multivariate Gaussians, \[ \begin{align} -\inputVector & \sim \gaussianSamp{\meanVector}{\covarianceMatrix}\\ -\noiseVector & \sim \gaussianSamp{\zerosVector}{\covarianceMatrixTwo} +\mathbf{ x}& \sim \mathcal{N}\left(\boldsymbol{ +\mu},\mathbf{C}\right)\\ +\boldsymbol{ \epsilon}& \sim +\mathcal{N}\left(\mathbf{0},\boldsymbol{ \Sigma}\right) \end{align} -$$
then we know that $\dataVector$ is also drawn from a multivariate Gaussian with,
$$ -\dataVector \sim \gaussianSamp{\mappingMatrix\meanVector}{\mappingMatrix\covarianceMatrix\mappingMatrix^\top + \covarianceMatrixTwo}. -$$

-

With appropriately defined covariance, $\covarianceMatrixTwo$, this is actually the marginal likelihood for Factor Analysis, or Probabilistic Principal Component Analysis (Tipping and Bishop 1999), because we integrated out the inputs (or latent variables they would be called in that case).

+\] then we know that \(\mathbf{ +y}\) is also drawn from a multivariate Gaussian with, \[ +\mathbf{ y}\sim \mathcal{N}\left(\mathbf{W}\boldsymbol{ +\mu},\mathbf{W}\mathbf{C}\mathbf{W}^\top + \boldsymbol{ \Sigma}\right). +\]

+

With appropriately defined covariance, \(\boldsymbol{ \Sigma}\), this is actually +the marginal likelihood for Factor Analysis, or Probabilistic Principal +Component Analysis (Tipping and Bishop, 1999), +because we integrated out the inputs (or latent variables they +would be called in that case).
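The rule above is easy to verify numerically. The sketch below uses small, assumed values for \(\mathbf{W}\), \(\boldsymbol{\mu}\), \(\mathbf{C}\) and \(\boldsymbol{\Sigma}\) and compares the analytic marginal for \(\mathbf{y}\) with a Monte Carlo estimate.

import numpy as np

rng = np.random.default_rng(0)
W = np.array([[1.0, 2.0],
              [0.5, -1.0],
              [0.0, 3.0]])                 # assumed 3x2 linear map
mu = np.array([1.0, -1.0])                 # assumed mean of x
C = np.array([[1.0, 0.3],
              [0.3, 0.5]])                 # assumed covariance of x
Sigma = 0.1 * np.eye(3)                    # assumed noise covariance

# analytic marginal for y = W x + eps
mean_y = W @ mu
cov_y = W @ C @ W.T + Sigma

# Monte Carlo check
x = rng.multivariate_normal(mu, C, size=200000)
eps = rng.multivariate_normal(np.zeros(3), Sigma, size=200000)
y = x @ W.T + eps
print(np.round(y.mean(axis=0), 2), np.round(mean_y, 2))
print(np.round(np.cov(y.T), 2), np.round(cov_y, 2))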

Laplace’s Idea

-

[edit]

-

Laplace had the idea to augment the observations by noise, that is equivalent to considering a probability density whose mean is given by the prediction function
$$p\left(\dataScalar_i|\inputScalar_i\right)=\frac{1}{\sqrt{2\pi\dataStd^2}}\exp\left(-\frac{\left(\dataScalar_i-f\left(\inputScalar_i\right)\right)^{2}}{2\dataStd^2}\right).$$

-

This is known as stochastic process. It is a function that is corrupted by noise. Laplace didn’t suggest the Gaussian density for that purpose, that was an innovation from Carl Friederich Gauss, which is what gives the Gaussian density its name.

-

Height as a Function of Weight

-

In the standard Gaussian, parametized by mean and variance.

-

Make the mean a linear function of an input.

-

This leads to a regression model.
$$ +

+[edit] +
+

Laplace had the idea to augment the observations by noise, which is equivalent to considering a probability density whose mean is given by the prediction function \[p\left(y_i|x_i\right)=\frac{1}{\sqrt{2\pi\sigma^2}}\exp\left(-\frac{\left(y_i-f\left(x_i\right)\right)^{2}}{2\sigma^2}\right).\]
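As a small illustration of this density, the sketch below evaluates it for an assumed prediction function, input, observation and noise variance (all of these values are made up for the example).

import numpy as np

def gaussian_density(y, f, sigma2):
    """Gaussian density of observation y with mean f and variance sigma2."""
    return np.exp(-(y - f)**2 / (2 * sigma2)) / np.sqrt(2 * np.pi * sigma2)

f = lambda x: 2.0 * x + 1.0   # assumed prediction function
x_i, y_i = 0.5, 2.3           # assumed input/observation pair
print(gaussian_density(y_i, f(x_i), sigma2=0.25))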

+

This is known as a stochastic process. It is a function that is corrupted by noise. Laplace didn’t suggest the Gaussian density for that purpose, that was an innovation from Carl Friedrich Gauss, which is what gives the Gaussian density its name.

+

Height as a Function of +Weight

+

In the standard Gaussian, parameterized by mean and variance, make +the mean a linear function of an input.

+

This leads to a regression model. \[ \begin{align*} - \dataScalar_i=&\mappingFunction\left(\inputScalar_i\right)+\noiseScalar_i,\\ - \noiseScalar_i \sim & \gaussianSamp{0}{\dataStd^2}. + y_i=&f\left(x_i\right)+\epsilon_i,\\ + \epsilon_i \sim & \mathcal{N}\left(0,\sigma^2\right). \end{align*} -$$

-

Assume $\dataScalar_i$ is height and $\inputScalar_i$ is weight.

-

Linear Algebra

-

[edit]

-

Linear algebra provides a very similar role, when we introduce linear algebra, it is because we are faced with a large number of addition and multiplication operations. These operations need to be done together and would be very tedious to write down as a group. So the first reason we reach for linear algebra is for a more compact representation of our mathematical formulae.

-

Running Example: Olympic Marathons

-

Now we will load in the Olympic marathon data. This is data of the olympic marath times for the men’s marathon from the first olympics in 1896 up until the London 2012 olympics.

-
import pods
-
data = pods.datasets.olympic_marathon_men()
-x = data['X']
-y = data['Y']
-

You can see what these values are by typing:

-
print(x)
-print(y)
-

Note that they are not pandas data frames for this example, they are just arrays of dimensionality $\numData\times 1$, where $\numData$ is the number of data.

-

The aim of this lab is to have you coding linear regression in python. We will do it in two ways, once using iterative updates (coordinate ascent) and then using linear algebra. The linear algebra approach will not only work much better, it is easy to extend to multiple input linear regression and non-linear regression using basis functions.

-

Plotting the Data

-

You can make a plot of $\dataScalar$ vs $\inputScalar$ with the following command:

-
%matplotlib inline 
-import matplotlib.pyplot as plt
-
plt.plot(x, y, 'rx')
-plt.xlabel('year')
-plt.ylabel('pace in min/km')
-

Maximum Likelihood: Iterative Solution

-

Now we will take the maximum likelihood approach we derived in the lecture to fit a line, $\dataScalar_i=m\inputScalar_i + c$, to the data you’ve plotted. We are trying to minimize the error function:
$$ -\errorFunction(m, c) = \sum_{i=1}^\numData(\dataScalar_i-m\inputScalar_i-c)^2 -$$
with respect to m, c and σ2. We can start with an initial guess for m,

-
m = -0.4
-c = 80
-

Then we use the maximum likelihood update to find an estimate for the offset, c.

-

Log Likelihood for Multivariate Regression

-

[edit]

-

Multiple Input Solution with Linear Algebra

-

You’ve now seen how slow it can be to perform a coordinate ascent on a system. Another approach to solving the system (which is not always possible, particularly in non-linear systems) is to go direct to the minimum. To do this we need to introduce linear algebra. We will represent all our errors and functions in the form of linear algebra. As we mentioned above, linear algebra is just a shorthand for performing lots of multiplications and additions simultaneously. What does it have to do with our system then? Well the first thing to note is that the linear function we were trying to fit has the following form:
$$ -\mappingFunction(x) = mx + c -$$
the classical form for a straight line. From a linear algebraic perspective we are looking for multiplications and additions. We are also looking to separate our parameters from our data. The data is the givens remember, in French the word is données literally translated means givens that’s great, because we don’t need to change the data, what we need to change are the parameters (or variables) of the model. In this function the data comes in through x, and the parameters are m and c.

-

What we’d like to create is a vector of parameters and a vector of data. Then we could represent the system with vectors that represent the data, and vectors that represent the parameters.

-

We look to turn the multiplications and additions into a linear algebraic form, we have one multiplication (m × c) and one addition (mx + c). But we can turn this into a inner product by writing it in the following way,
$$ -\mappingFunction(x) = m \times x + +\]

+

Assume \(y_i\) is height and \(x_i\) is weight.

+

Olympic Marathon Data

+
+[edit] +
+ + + + + +
+
    +
  • Gold medal times for Olympic Marathon since 1896.
  • Marathons before 1924 didn’t have a standardized distance.
  • Present results using pace per km.
  • In 1904 Marathon was badly organized leading to very slow times.

Image from Wikimedia Commons http://bit.ly/16kMKHQ
+

The first thing we will do is load a standard data set for regression +modelling. The data consists of the pace of Olympic Gold Medal Marathon +winners for the Olympics from 1896 to present. Let’s load in the data +and plot.

+
import numpy as np
+import pods
+
data = pods.datasets.olympic_marathon_men()
+x = data['X']
+y = data['Y']
+
+offset = y.mean()
+scale = np.sqrt(y.var())
+yhat = (y - offset)/scale
+
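A minimal matplotlib sketch of the plot itself, assuming x and y as loaded above (it mirrors the plotting code that the removed version of these notes contained):

import matplotlib.pyplot as plt

plt.plot(x, y, 'rx')          # one red cross per Olympics
plt.xlabel('year')
plt.ylabel('pace in min/km')
plt.show()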
+

Figure: Olympic marathon pace times since 1896.

+
+
+

Things to notice about the data include the outlier in 1904; in that year the Olympics were held in St Louis, USA. Organizational problems and challenges with dust kicked up by the cars following the race meant that participants got lost, and only very few completed. More recent years see more consistently quick marathons.

+

Running Example: Olympic +Marathons

+
+[edit] +
+

Note that x and y are not +pandas data frames for this example, they are just arrays +of dimensionality \(n\times 1\), where +\(n\) is the number of data.

+

The aim of this lab is to have you coding linear regression in +python. We will do it in two ways, once using iterative updates +(coordinate ascent) and then using linear algebra. The linear algebra +approach will not only work much better, it is also easy to extend to +multiple input linear regression and non-linear regression +using basis functions.

+

Maximum Likelihood: +Iterative Solution

+

Now we will take the maximum likelihood approach we derived in the +lecture to fit a line, \(y_i=mx_i + +c\), to the data you’ve plotted. We are trying to minimize the +error function: \[ +E(m, c) = \sum_{i=1}^n(y_i-mx_i-c)^2 +\] with respect to \(m\), \(c\) and \(\sigma^2\). We can start with an initial +guess for \(m\),

+
m = -0.4
+c = 80
+

Then we use the maximum likelihood update to find an estimate for the +offset, \(c\).
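A minimal sketch of those updates, assuming x, y, m and c as defined above; the closed forms below come from setting each partial derivative of the error function to zero in turn, which is the standard coordinate-ascent derivation rather than code taken from the notes.

import numpy as np

# Coordinate updates: solve dE/dc = 0 with m fixed, then dE/dm = 0 with c fixed.
c = (y - m * x).sum() / len(y)
m = ((y - c) * x).sum() / (x * x).sum()
print(m, c)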

+

Log Likelihood for +Multivariate Regression

+
+[edit] +
+

Quadratic Loss

+
+[edit] +
+

Now we’ve identified the empirical risk with the loss, we’ll use +\(E(\mathbf{ w})\) to represent our +objective function. \[ +E(\mathbf{ w}) = \sum_{i=1}^n\left(y_i - f(\mathbf{ x}_i, \mathbf{ +w})\right)^2 +\] gives us our objective.

+

In the case of the linear prediction function, we can substitute +\(f(\mathbf{ x}_i, \mathbf{ w}) = \mathbf{ +w}^\top \mathbf{ x}_i\). \[ +E(\mathbf{ w}) = \sum_{i=1}^n\left(y_i - \mathbf{ w}^\top \mathbf{ +x}_i\right)^2 +\] To compute the gradient of the objective, we first expand the +brackets.
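Before expanding the brackets, it can help to see the objective evaluated directly from its definition. This sketch uses a tiny, assumed design matrix and target vector purely for illustration.

import numpy as np

def quadratic_loss(w, Phi, y):
    """Sum of squared errors for the linear predictor f(x) = w^T x."""
    residuals = y - Phi @ w      # y_i - w^T x_i for every data point
    return (residuals**2).sum()

# assumed toy values: two features (bias and one input) and three data points
Phi = np.array([[1.0, 0.0],
                [1.0, 1.0],
                [1.0, 2.0]])
y = np.array([[0.5], [1.9], [3.1]])
w = np.array([[0.4], [1.3]])
print(quadratic_loss(w, Phi, y))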

+

Bracket Expansion

+

\[ +\begin{align*} + E(\mathbf{ w},\sigma^2) = & +\frac{n}{2}\log \sigma^2 + \frac{1}{2\sigma^2}\sum +_{i=1}^{n}y_i^{2}-\frac{1}{\sigma^2}\sum +_{i=1}^{n}y_i\mathbf{ w}^{\top}\mathbf{ +x}_i\\&+\frac{1}{2\sigma^2}\sum +_{i=1}^{n}\mathbf{ w}^{\top}\mathbf{ x}_i\mathbf{ x}_i^{\top}\mathbf{ w} ++\text{const}.\\ + = & \frac{n}{2}\log \sigma^2 + \frac{1}{2\sigma^2}\sum +_{i=1}^{n}y_i^{2}-\frac{1}{\sigma^2} +\mathbf{ w}^\top\sum_{i=1}^{n}\mathbf{ +x}_iy_i\\&+\frac{1}{2\sigma^2} +\mathbf{ w}^{\top}\left[\sum +_{i=1}^{n}\mathbf{ x}_i\mathbf{ x}_i^{\top}\right]\mathbf{ +w}+\text{const}. +\end{align*} +\]

+

Solution with Linear Algebra

+

In this section we’re going to compute the minimum of the quadratic loss with respect to the parameters. When we do this, we’ll also review linear algebra. We will represent all our errors and functions in the form of matrices and vectors.

+

Linear algebra is just a shorthand for performing lots of multiplications and additions simultaneously. What does it have to do with our system then? Well, the first thing to note is that the classic linear function we fit for a one-dimensional regression has the form: \[
f(x) = mx + c
\] the classical form for a straight line. From a linear algebraic perspective, we are looking for multiplications and additions. We are also looking to separate our parameters from our data. The data is the givens. In French the word for data is données, which literally translated means givens. That’s great, because we don’t need to change the data; what we need to change are the parameters (or variables) of the model. In this function the data comes in through \(x\), and the parameters are \(m\) and \(c\).

+

What we’d like to create is a vector of parameters and a vector of +data. Then we could represent the system with vectors that represent the +data, and vectors that represent the parameters.

+

We look to turn the multiplications and additions into a linear +algebraic form, we have one multiplication (\(m\times c\)) and one addition (\(mx + c\)). But we can turn this into an +inner product by writing it in the following way, \[ +f(x) = m \times x + c \times 1, -$$
in other words we’ve extracted the unit value, from the offset, c. We can think of this unit value like an extra item of data, because it is always given to us, and it is always set to 1 (unlike regular data, which is likely to vary!). We can therefore write each input data location, $\inputVector$, as a vector
$$ -\inputVector = \begin{bmatrix} 1\\ x\end{bmatrix}. -$$

-

Now we choose to also turn our parameters into a vector. The parameter vector will be defined to contain
$$ -\mappingVector = \begin{bmatrix} c \\ m\end{bmatrix} -$$
because if we now take the inner product between these to vectors we recover
$$ -\inputVector\cdot\mappingVector = 1 \times c + x \times m = mx + c -$$
In numpy we can define this vector as follows

-
import numpy as np
-
# define the vector w
-w = np.zeros(shape=(2, 1))
-w[0] = m
-w[1] = c
-

This gives us the equivalence between original operation and an operation in vector space. Whilst the notation here isn’t a lot shorter, the beauty is that we will be able to add as many features as we like and still keep the seame representation. In general, we are now moving to a system where each of our predictions is given by an inner product. When we want to represent a linear product in linear algebra, we tend to do it with the transpose operation, so since we have a ⋅ b = ab we can write
$$ -\mappingFunction(\inputVector_i) = \inputVector_i^\top\mappingVector. -$$
Where we’ve assumed that each data point, $\inputVector_i$, is now written by appending a 1 onto the original vector
$$ -\inputVector_i = \begin{bmatrix} +\] in other words, we’ve extracted the unit value from the +offset, \(c\). We can think of this +unit value like an extra item of data, because it is always given to us, +and it is always set to 1 (unlike regular data, which is likely to +vary!). We can therefore write each input data location, \(\mathbf{ x}\), as a vector \[ +\mathbf{ x}= \begin{bmatrix} 1\\ x\end{bmatrix}. +\]

+

Now we choose to also turn our parameters into a vector. The +parameter vector will be defined to contain \[ +\mathbf{ w}= \begin{bmatrix} c \\ m\end{bmatrix} +\] because if we now take the inner product between these two +vectors we recover \[ +\mathbf{ x}\cdot\mathbf{ w}= 1 \times c + x \times m = mx + c +\] In numpy we can define this vector as follows

+
import numpy as np
+
# define the vector w
+w = np.zeros(shape=(2, 1))
+w[0] = c  # offset c is the first element, matching x = [1, x] and w = [c, m]
+w[1] = m
+

This gives us the equivalence between original operation and an +operation in vector space. Whilst the notation here isn’t a lot shorter, +the beauty is that we will be able to add as many features as we like +and keep the same representation. In general, we are now moving to a +system where each of our predictions is given by an inner product. When +we want to represent a linear product in linear algebra, we tend to do +it with the transpose operation, so since we have \(\mathbf{a}\cdot\mathbf{b} = +\mathbf{a}^\top\mathbf{b}\) we can write \[ +f(\mathbf{ x}_i) = \mathbf{ x}_i^\top\mathbf{ w}. +\] Where we’ve assumed that each data point, \(\mathbf{ x}_i\), is now written by +appending a 1 onto the original vector \[ +\mathbf{ x}_i = \begin{bmatrix} 1 \\ -\inputScalar_i +x_i \end{bmatrix} -$$

+\]

Design Matrix

-

We can do this for the entire data set to form a design matrix $\inputMatrix$,

-


$$\inputMatrix -= \begin{bmatrix} -\inputVector_1^\top \\\ -\inputVector_2^\top \\\ +

We can do this for the entire data set to form a design +matrix \(\boldsymbol{ \Phi}\), +\[ +\boldsymbol{ \Phi} += \begin{bmatrix} +\mathbf{ x}_1^\top \\\ +\mathbf{ x}_2^\top \\\ \vdots \\\ -\inputVector_\numData^\top +\mathbf{ x}_n^\top \end{bmatrix} = \begin{bmatrix} -1 & \inputScalar_1 \\\ -1 & \inputScalar_2 \\\ +1 & x_1 \\\ +1 & x_2 \\\ \vdots & \vdots \\\ -1 & \inputScalar_\numData -\end{bmatrix},$$

-

which in numpy can be done with the following commands:

-
import numpy as np
-
X = np.hstack((np.ones_like(x), x))
-print(X)
-

Writing the Objective with Linear Algebra

-

When we think of the objective function, we can think of it as the errors where the error is defined in a similar way to what it was in Legendre’s day $\dataScalar_i - \mappingFunction(\inputVector_i)$, in statistics these errors are also sometimes called residuals. So we can think as the objective and the prediction function as two separate parts, first we have,
$$ -\errorFunction(\mappingVector) = \sum_{i=1}^\numData (\dataScalar_i - \mappingFunction(\inputVector_i; \mappingVector))^2, -$$
where we’ve made the function $\mappingFunction(\cdot)$’s dependence on the parameters $\mappingVector$ explicit in this equation. Then we have the definition of the function itself,
$$ -\mappingFunction(\inputVector_i; \mappingVector) = \inputVector_i^\top \mappingVector. -$$
Let’s look again at these two equations and see if we can identify any inner products. The first equation is a sum of squares, which is promising. Any sum of squares can be represented by an inner product,
$$ -a = \sum_{i=1}^{k} b^2_i = \mathbf{b}^\top\mathbf{b}, -$$
so if we wish to represent $\errorFunction(\mappingVector)$ in this way, all we need to do is convert the sum operator to an inner product. We can get a vector from that sum operator by placing both $\dataScalar_i$ and $\mappingFunction(\inputVector_i; \mappingVector)$ into vectors, which we do by defining
$$ -\dataVector = \begin{bmatrix}\dataScalar_1\\ \dataScalar_2\\ \vdots \\ \dataScalar_\numData\end{bmatrix} -$$
and defining
$$ -\mappingFunctionVector(\inputVector_1; \mappingVector) = \begin{bmatrix}\mappingFunction(\inputVector_1; \mappingVector)\\ \mappingFunction(\inputVector_2; \mappingVector)\\ \vdots \\ \mappingFunction(\inputVector_\numData; \mappingVector)\end{bmatrix}. -$$
The second of these is actually a vector-valued function. This term may appear intimidating, but the idea is straightforward. A vector valued function is simply a vector whose elements are themselves defined as functions, i.e. it is a vector of functions, rather than a vector of scalars. The idea is so straightforward, that we are going to ignore it for the moment, and barely use it in the derivation. But it will reappear later when we introduce basis functions. So we will, for the moment, ignore the dependence of $\mappingFunctionVector$ on $\mappingVector$ and $\inputMatrix$ and simply summarise it by a vector of numbers
$$ -\mappingFunctionVector = \begin{bmatrix}\mappingFunction_1\\\mappingFunction_2\\ -\vdots \\ \mappingFunction_\numData\end{bmatrix}. -$$
This allows us to write our objective in the folowing, linear algebraic form,
$$ -\errorFunction(\mappingVector) = (\dataVector - \mappingFunctionVector)^\top(\dataVector - \mappingFunctionVector) -$$
from the rules of inner products. But what of our matrix $\inputMatrix$ of input data? At this point, we need to dust off matrix-vector multiplication. Matrix multiplication is simply a convenient way of performing many inner products together, and it’s exactly what we need to summarise the operation
$$ -f_i = \inputVector_i^\top\mappingVector. -$$
This operation tells us that each element of the vector $\mappingFunctionVector$ (our vector valued function) is given by an inner product between $\inputVector_i$ and $\mappingVector$. In other words it is a series of inner products. Let’s look at the definition of matrix multiplication, it takes the form
c = Ba
where c might be a k dimensional vector (which we can intepret as a k × 1 dimensional matrix), and B is a k × k dimensional matrix and a is a k dimensional vector (k × 1 dimensional matrix).

-

The result of this multiplication is of the form
$$ +1 & x_n +\end{bmatrix}, +\] which in numpy can be done with the following +commands:

+
import numpy as np
+
Phi = np.hstack((np.ones_like(x), x))
+print(Phi)
+

Writing the Objective +with Linear Algebra

+

When we think of the objective function, we can think of it as the errors where the error is defined in a similar way to what it was in Legendre’s day, \(y_i - f(\mathbf{ x}_i)\); in statistics these errors are also sometimes called residuals. So, we can think of the objective and the prediction function as two separate parts. First we have, \[
E(\mathbf{ w}) = \sum_{i=1}^n(y_i - f(\mathbf{ x}_i; \mathbf{ w}))^2,
\] where we’ve made the function \(f(\cdot)\)’s dependence on the parameters \(\mathbf{ w}\) explicit in this equation. Then we have the definition of the function itself, \[
f(\mathbf{ x}_i; \mathbf{ w}) = \mathbf{ x}_i^\top \mathbf{ w}.
\] Let’s look again at these two equations and see if we can identify any inner products. The first equation is a sum of squares, which is promising. Any sum of squares can be represented by an inner product, \[
a = \sum_{i=1}^{k} b^2_i = \mathbf{b}^\top\mathbf{b}.
\] If we wish to represent \(E(\mathbf{ w})\) in this way, all we need to do is convert the sum operator to an inner product. We can get a vector from that sum operator by placing both \(y_i\) and \(f(\mathbf{ x}_i; \mathbf{ w})\) into vectors, which we do by defining \[
\mathbf{ y}= \begin{bmatrix}y_1\\ y_2\\ \vdots \\ y_n\end{bmatrix}
\] and defining \[
\mathbf{ f}(\mathbf{ x}_1; \mathbf{ w}) = \begin{bmatrix}f(\mathbf{ x}_1; \mathbf{ w})\\ f(\mathbf{ x}_2; \mathbf{ w})\\ \vdots \\ f(\mathbf{ x}_n; \mathbf{ w})\end{bmatrix}.
\] The second of these is a vector-valued function. This term may appear intimidating, but the idea is straightforward. A vector valued function is simply a vector whose elements are themselves defined as functions, i.e., it is a vector of functions, rather than a vector of scalars. The idea is so straightforward, that we are going to ignore it for the moment, and barely use it in the derivation. But it will reappear later when we introduce basis functions. So, we will for the moment ignore the dependence of \(\mathbf{ f}\) on \(\mathbf{ w}\) and \(\boldsymbol{ \Phi}\) and simply summarise it by a vector of numbers \[
\mathbf{ f}= \begin{bmatrix}f_1\\f_2\\ \vdots \\ f_n\end{bmatrix}.
\] This allows us to write our objective in the following, linear algebraic form, \[
E(\mathbf{ w}) = (\mathbf{ y}- \mathbf{ f})^\top(\mathbf{ y}- \mathbf{ f})
\] from the rules of inner products. But what of our matrix \(\boldsymbol{ \Phi}\) of input data? At this point, we need to dust off matrix-vector multiplication. Matrix multiplication is simply a convenient way of performing many inner products together, and it’s exactly what we need to summarize the operation \[
f_i = \mathbf{ x}_i^\top\mathbf{ w}.
\] This operation tells us that each element of the vector \(\mathbf{ f}\) (our vector valued function) is given by an inner product between \(\mathbf{ x}_i\) and \(\mathbf{ w}\). In other words, it is a series of inner products. Let’s look at the definition of matrix multiplication, it takes the form \[
\mathbf{c} = \mathbf{B}\mathbf{a},
\] where \(\mathbf{c}\) might be a \(k\) dimensional vector (which we can interpret as a \(k\times 1\) dimensional matrix), and \(\mathbf{B}\) is a \(k\times k\) dimensional matrix and \(\mathbf{a}\) is a \(k\) dimensional vector (\(k\times 1\) dimensional matrix).
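A small numpy illustration of that definition, with assumed values, showing that each element of \(\mathbf{c}\) is an inner product between a row of \(\mathbf{B}\) and the vector \(\mathbf{a}\):

import numpy as np

B = np.array([[1.0, 2.0],
              [3.0, 4.0]])      # assumed 2x2 matrix
a = np.array([0.5, -1.0])       # assumed 2-vector
c = B @ a                       # matrix-vector product
row_products = np.array([B[0] @ a, B[1] @ a])  # the same result, one inner product per row
print(c, row_products)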

+

The result of this multiplication is of the form \[ \begin{bmatrix}c_1\\c_2 \\ \vdots \\ -a_k\end{bmatrix} = +a_k\end{bmatrix} = \begin{bmatrix} b_{1,1} & b_{1, 2} & \dots & b_{1, k} \\ b_{2, 1} & b_{2, 2} & \dots & b_{2, k} \\ \vdots & \vdots & \ddots & \vdots \\ -b_{k, 1} & b_{k, 2} & \dots & b_{k, k} \end{bmatrix} \begin{bmatrix}a_1\\a_2 \\ -\vdots\\ c_k\end{bmatrix} = \begin{bmatrix} b_{1, 1}a_1 + b_{1, 2}a_2 + \dots + +b_{k, 1} & b_{k, 2} & \dots & b_{k, k} \end{bmatrix} +\begin{bmatrix}a_1\\a_2 \\ +\vdots\\ c_k\end{bmatrix} = \begin{bmatrix} b_{1, 1}a_1 + b_{1, 2}a_2 + +\dots + b_{1, k}a_k\\ -b_{2, 1}a_1 + b_{2, 2}a_2 + \dots + b_{2, k}a_k \\ +b_{2, 1}a_1 + b_{2, 2}a_2 + \dots + b_{2, k}a_k \\ \vdots\\ -b_{k, 1}a_1 + b_{k, 2}a_2 + \dots + b_{k, k}a_k\end{bmatrix} -$$
so we see that each element of the result, a is simply the inner product between each row of B and the vector c. Because we have defined each element of $\mappingFunctionVector$ to be given by the inner product between each row of the design matrix and the vector $\mappingVector$ we now can write the full operation in one matrix multiplication,
$$ -\mappingFunctionVector = \inputMatrix\mappingVector. -$$

-
import numpy as np
-
f = X@w # The @ sign performs matrix multiplication
-

Combining this result with our objective function,
$$ -\errorFunction(\mappingVector) = (\dataVector - \mappingFunctionVector)^\top(\dataVector - \mappingFunctionVector) -$$
we find we have defined the model with two equations. One equation tells us the form of our predictive function and how it depends on its parameters, the other tells us the form of our objective function.

-
resid = (y-f)
-E = np.dot(resid.T, resid) # matrix multiplication on a single vector is equivalent to a dot product.
-print("Error function is:", E)
-

Exercise 0

-

The prediction for our movie recommender system had the form
fi, j = uivj
and the objective function was then
$$ -E = \sum_{i,j} s_{i,j}(\dataScalar_{i,j} - f_{i, j})^2 -$$
Try writing this down in matrix and vector form. How many of the terms can you do? For each variable and parameter carefully think about whether it should be represented as a matrix or vector. Do as many of the terms as you can. Use $\LaTeX$ to give your answers and give the dimensions of any matrices you create.

-

Objective Optimisation

-

Our model has now been defined with two equations, the prediction function and the objective function. Next we will use multivariate calculus to define an algorithm to fit the model. The separation between model and algorithm is important and is often overlooked. Our model contains a function that shows how it will be used for prediction, and a function that describes the objective function we need to optimise to obtain a good set of parameters.

-

The model linear regression model we have described is still the same as the one we fitted above with a coordinate ascent algorithm. We have only played with the notation to obtain the same model in a matrix and vector notation. However, we will now fit this model with a different algorithm, one that is much faster. It is such a widely used algorithm that from the end user’s perspective it doesn’t even look like an algorithm, it just appears to be a single operation (or function). However, underneath the computer calls an algorithm to find the solution. Further, the algorithm we obtain is very widely used, and because of this it turns out to be highly optimised.

-

Once again we are going to try and find the stationary points of our objective by finding the stationary points. However, the stationary points of a multivariate function, are a little bit more complext to find. Once again we need to find the point at which the derivative is zero, but now we need to use multivariate calculus to find it. This involves learning a few additional rules of differentiation (that allow you to do the derivatives of a function with respect to vector), but in the end it makes things quite a bit easier. We define vectorial derivatives as follows,
$$ -\frac{\text{d}\errorFunction(\mappingVector)}{\text{d}\mappingVector} = -\begin{bmatrix}\frac{\text{d}\errorFunction(\mappingVector)}{\text{d}\mappingScalar_1}\\\frac{\text{d}\errorFunction(\mappingVector)}{\text{d}\mappingScalar_2}\end{bmatrix}. -$$
where $\frac{\text{d}\errorFunction(\mappingVector)}{\text{d}\mappingScalar_1}$ is the partial derivative of the error function with respect to $\mappingScalar_1$.

-

Differentiation through multiplications and additions is relatively straightforward, and since linear algebra is just multiplication and addition, then its rules of diffentiation are quite straightforward too, but slightly more complex than regular derivatives.

+b_{k, 1}a_1 + b_{k, 2}a_2 + \dots + b_{k, k}a_k\end{bmatrix}.
\] We see that each element of the result, \(\mathbf{c}\), is simply the inner product between each row of \(\mathbf{B}\) and the vector \(\mathbf{a}\). Because we have defined each element of \(\mathbf{ f}\) to be given by the inner product between each row of the design matrix and the vector \(\mathbf{ w}\) we now can write the full operation in one matrix multiplication,

+

\[ +\mathbf{ f}= \boldsymbol{ \Phi}\mathbf{ w}. +\]

+
import numpy as np
+
f = Phi@w # The @ sign performs matrix multiplication
+

Combining this result with our objective function, \[ +E(\mathbf{ w}) = (\mathbf{ y}- \mathbf{ f})^\top(\mathbf{ y}- \mathbf{ +f}) +\] we find we have defined the model with two equations. +One equation tells us the form of our predictive function and how it +depends on its parameters, the other tells us the form of our objective +function.

+
resid = (y-f)
+E = np.dot(resid.T, resid) # matrix multiplication on a single vector is equivalent to a dot product.
+print("Error function is:", E)
+

Objective Optimization

+
+[edit] +
+

Our model has now been defined with two equations: the +prediction function and the objective function. Now we will use +multivariate calculus to define an algorithm to fit the model. +The separation between model and algorithm is important and is often +overlooked. Our model contains a function that shows how it will be used +for prediction, and a function that describes the objective function we +need to optimize to obtain a good set of parameters.

+

The linear regression model we have described is still the same as the one we fitted above with a coordinate ascent algorithm. We have only played with the notation to obtain the same model in a matrix and vector notation. However, we will now fit this model with a different algorithm, one that is much faster. It is such a widely used algorithm that from the end user’s perspective it doesn’t even look like an algorithm, it just appears to be a single operation (or function). However, underneath the computer calls an algorithm to find the solution. Further, the algorithm we obtain is very widely used, and because of this it turns out to be highly optimized.

+

Once again, we are going to find the minimum of our objective by looking for the stationary points. However, the stationary points of a multivariate function are a little bit more complex to find. As before we need to find the point at which the gradient is zero, but now we need to use multivariate calculus to find it. This involves learning a few additional rules of differentiation (that allow you to do the derivatives of a function with respect to a vector), but in the end it makes things quite a bit easier. We define vectorial derivatives as follows, \[
\frac{\text{d}E(\mathbf{ w})}{\text{d}\mathbf{ w}} =
\begin{bmatrix}\frac{\text{d}E(\mathbf{ w})}{\text{d}w_1}\\\frac{\text{d}E(\mathbf{ w})}{\text{d}w_2}\end{bmatrix},
\] where \(\frac{\text{d}E(\mathbf{ w})}{\text{d}w_1}\) is the partial derivative of the error function with respect to \(w_1\).

+

Differentiation through multiplications and additions is relatively +straightforward, and since linear algebra is just multiplication and +addition, then its rules of differentiation are quite straightforward +too, but slightly more complex than regular derivatives.

Multivariate Derivatives

-

We will need two rules of multivariate or matrix differentiation. The first is diffentiation of an inner product. By remembering that the inner product is made up of multiplication and addition, we can hope that its derivative is quite straightforward, and so it proves to be. We can start by thinking about the definition of the inner product,
az = ∑iaizi,
which if we were to take the derivative with respect to zk would simply return the gradient of the one term in the sum for which the derivative was non zero, that of ak, so we know that
$$ +

We will need two rules of multivariate or matrix +differentiation. The first is differentiation of an inner product. By +remembering that the inner product is made up of multiplication and +addition, we can hope that its derivative is quite straightforward, and +so it proves to be. We can start by thinking about the definition of the +inner product, \[ +\mathbf{a}^\top\mathbf{z} = \sum_{i} a_i +z_i, +\] which if we were to take the derivative with respect to \(z_k\) would simply return the gradient of +the one term in the sum for which the derivative was non-zero, that of +\(a_k\), so we know that \[ \frac{\text{d}}{\text{d}z_k} \mathbf{a}^\top \mathbf{z} = a_k -$$
and by our definition of multivariate derivatives we can simply stack all the partial derivatives of this form in a vector to obtain the result that
$$ +\] and by our definition for multivariate derivatives, we can +simply stack all the partial derivatives of this form in a vector to +obtain the result that \[ \frac{\text{d}}{\text{d}\mathbf{z}} \mathbf{a}^\top \mathbf{z} = \mathbf{a}. -$$
The second rule that’s required is differentiation of a ‘matrix quadratic’. A scalar quadratic in z with coefficient c has the form cz2. If z is a k × 1 vector and C is a k × k matrix of coefficients then the matrix quadratic form is written as zCz, which is itself a scalar quantity, but it is a function of a vector.

-

Matching Dimensions in Matrix Multiplications

-

There’s a trick for telling that it’s a scalar result. When you are doing maths with matrices, it’s always worth pausing to perform a quick sanity check on the dimensions. Matrix multplication only works when the dimensions match. To be precise, the ‘inner’ dimension of the matrix must match. What is the inner dimension. If we multiply two matrices A and B, the first of which has k rows and columns and the second of which has p rows and q columns, then we can check whether the multiplication works by writing the dimensionalities next to each other,
$$ +\] The second rule that’s required is differentiation of a +‘matrix quadratic’. A scalar quadratic in \(z\) with coefficient \(c\) has the form \(cz^2\). If \(\mathbf{z}\) is a \(k\times 1\) vector and \(\mathbf{C}\) is a \(k \times k\) matrix of +coefficients then the matrix quadratic form is written as \(\mathbf{z}^\top \mathbf{C}\mathbf{z}\), +which is itself a scalar quantity, but it is a function of a +vector.

+

Matching +Dimensions in Matrix Multiplications

+

There’s a trick for telling whether a multiplication leads to a scalar +result. When you are doing mathematics with matrices, it’s always worth +pausing to perform a quick sanity check on the dimensions. Matrix +multiplication only works when the dimensions match. To be precise, the +‘inner’ dimension of the matrix must match. What is the inner dimension? +If we multiply two matrices \(\mathbf{A}\) and \(\mathbf{B}\), the first of which has \(k\) rows and \(\ell\) columns and the second of which has +\(p\) rows and \(q\) columns, then we can check whether the +multiplication works by writing the dimensionalities next to each other, +\[
\mathbf{A} \mathbf{B} \rightarrow (k \times
-\underbrace{\ell)(p}_\text{inner dimensions} \times q) \rightarrow (k\times q).
-$$
The inner dimensions are the two inside dimensions, and p. The multiplication will only work if ℓ = p. The result of the multiplication will then be a k × q matrix: this dimensionality comes from the ‘outer dimensions’. Note that matrix multiplication is not commutative. And if you change the order of the multiplication,
$$ -\mathbf{B} \mathbf{A} \rightarrow (\ell \times \underbrace{k)(q}_\text{inner dimensions} \times p) \rightarrow (\ell \times p). -$$
firstly it may no longer even work, because now the condition is that k = q, and secondly the result could be of a different dimensionality. An exception is if the matrices are square matrices (e.g. same number of rows as columns) and they are both symmetric. A symmetric matrix is one for which A = A, or equivalently, ai, j = aj, i for all i and j.

-

You will need to get used to working with matrices and vectors applying and developing new machine learning techniques. You should have come across them before, but you may not have used them as extensively as we will now do in this course. You should get used to using this trick to check your work and ensure you know what the dimension of an output matrix should be. For our matrix quadratic form, it turns out that we can see it as a special type of inner product.
$$ +\underbrace{\ell)(p}_\text{inner dimensions} \times q) \rightarrow +(k\times q). +\] The inner dimensions are the two inside dimensions, \(\ell\) and \(p\). The multiplication will only work if +\(\ell=p\). The result of the +multiplication will then be a \(k\times +q\) matrix: this dimensionality comes from the ‘outer +dimensions’. Note that matrix multiplication is not commutative. +And if you change the order of the multiplication, \[ +\mathbf{B} \mathbf{A} \rightarrow (\ell \times +\underbrace{k)(q}_\text{inner dimensions} \times p) \rightarrow (\ell +\times p). +\] Firstly, it may no longer even work, because now the condition +is that \(k=q\), and secondly the +result could be of a different dimensionality. An exception is if the +matrices are square matrices (e.g., same number of rows as columns) and +they are both symmetric. A symmetric matrix is one for which +\(\mathbf{A}=\mathbf{A}^\top\), or +equivalently, \(a_{i,j} = a_{j,i}\) for +all \(i\) and \(j\).

+

For applying and developing machine learning algorithms you should +get familiar with working with matrices and vectors. You should have +come across them before, but you may not have used them as extensively +as we are doing now. It’s worth getting used to using this trick to +check your work and ensure you know what the dimension of an output +matrix should be. For our matrix quadratic form, it turns out that we +can see it as a special type of inner product. \[ \mathbf{z}^\top\mathbf{C}\mathbf{z} \rightarrow (1\times -\underbrace{k) (k}_\text{inner dimensions}\times k) (k\times 1) \rightarrow +\underbrace{k) (k}_\text{inner dimensions}\times k) (k\times 1) +\rightarrow \mathbf{b}^\top\mathbf{z} -$$
where b = Cz so therefore the result is a scalar,
$$ +\] where \(\mathbf{b} = +\mathbf{C}\mathbf{z}\) so therefore the result is a scalar, \[ \mathbf{b}^\top\mathbf{z} \rightarrow (1\times \underbrace{k) (k}_\text{inner dimensions}\times 1) \rightarrow (1\times 1) -$$
where a (1 × 1) matrix is recognised as a scalar.

-

This implies that we should be able to differentiate this form, and indeed the rule for its differentiation is slightly more complex than the inner product, but still quite simple,
$$ +\] where a \((1\times 1)\) +matrix is recognised as a scalar.

+

This implies that we should be able to differentiate this form, and +indeed the rule for its differentiation is slightly more complex than +the inner product, but still quite simple, \[ \frac{\text{d}}{\text{d}\mathbf{z}} -\mathbf{z}^\top\mathbf{C}\mathbf{z}= \mathbf{C}\mathbf{z} + \mathbf{C}^\top +\mathbf{z}^\top\mathbf{C}\mathbf{z}= \mathbf{C}\mathbf{z} + +\mathbf{C}^\top \mathbf{z}. -$$
Note that in the special case where C is symmetric then we have C = C and the derivative simplifies to
$$ +\] Note that in the special case where \(\mathbf{C}\) is symmetric then we have +\(\mathbf{C} = \mathbf{C}^\top\) and +the derivative simplifies to \[ \frac{\text{d}}{\text{d}\mathbf{z}} \mathbf{z}^\top\mathbf{C}\mathbf{z}= 2\mathbf{C}\mathbf{z}. -$$

+\]
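These two rules are easy to sanity check numerically; the sketch below compares the matrix-quadratic rule against central finite differences for an assumed, non-symmetric \(\mathbf{C}\).

import numpy as np

rng = np.random.default_rng(0)
C = rng.normal(size=(3, 3))     # assumed (non-symmetric) coefficient matrix
z = rng.normal(size=3)

analytic = C @ z + C.T @ z      # the rule for d/dz of z^T C z

eps = 1e-6
numeric = np.array([
    ((z + eps * e) @ C @ (z + eps * e) - (z - eps * e) @ C @ (z - eps * e)) / (2 * eps)
    for e in np.eye(3)          # perturb one component of z at a time
])
print(np.allclose(analytic, numeric, atol=1e-4))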

Differentiate the Objective

-

First, we need to compute the full objective by substituting our prediction function into the objective function to obtain the objective in terms of $\mappingVector$. Doing this we obtain
$$ -\errorFunction(\mappingVector)= (\dataVector - \inputMatrix\mappingVector)^\top (\dataVector - \inputMatrix\mappingVector). -$$
We now need to differentiate this quadratic form to find the minimum. We differentiate with respect to the vector $\mappingVector$. But before we do that, we’ll expand the brackets in the quadratic form to obtain a series of scalar terms. The rules for bracket expansion across the vectors are similar to those for the scalar system giving,
(a − b)(c − d) = ac − ad − bc + bd
which substituting for $\mathbf{a} = \mathbf{c} = \dataVector$ and $\mathbf{b}=\mathbf{d} = \inputMatrix\mappingVector$ gives
$$ -\errorFunction(\mappingVector)= -\dataVector^\top\dataVector - 2\dataVector^\top\inputMatrix\mappingVector + -\mappingVector^\top\inputMatrix^\top\inputMatrix\mappingVector -$$
where we used the fact that $\dataVector^\top\inputMatrix\mappingVector=\mappingVector^\top\inputMatrix^\top\dataVector$. Now we can use our rules of differentiation to compute the derivative of this form, which is,
$$ -\frac{\text{d}}{\text{d}\mappingVector}\errorFunction(\mappingVector)=- 2\inputMatrix^\top \dataVector + -2\inputMatrix^\top\inputMatrix\mappingVector, -$$
where we have exploited the fact that $\inputMatrix^\top\inputMatrix$ is symmetric to obtain this result.

-

Exercise 0

-

Use the equivalence between our vector and our matrix formulations of linear regression, alongside our definition of vector derivates, to match the gradients we’ve computed directly for $\frac{\text{d}\errorFunction(c, m)}{\text{d}c}$ and $\frac{\text{d}\errorFunction(c, m)}{\text{d}m}$ to those for $\frac{\text{d}\errorFunction(\mappingVector)}{\text{d}\mappingVector}$.

-

Update Equation for Global Optimum

-

Once again, we need to find the minimum of our objective function. Using our likelihood for multiple input regression we can now minimize for our parameter vector $\mappingVector$. Firstly, just as in the single input case, we seek stationary points by find parameter vectors that solve for when the gradients are zero,
$$ -\mathbf{0}=- 2\inputMatrix^\top -\dataVector + 2\inputMatrix^\top\inputMatrix\mappingVector, -$$
where 0 is a vector of zeros. Rearranging this equation we find the solution to be
$$ -\mappingVector = \left[\inputMatrix^\top \inputMatrix\right]^{-1} \inputMatrix^\top -\dataVector -$$
where A − 1 denotes matrix inverse.

-

Solving the Multivariate System

-

The solution for $\mappingVector$ is given in terms of a matrix inverse, but computation of a matrix inverse requires, in itself, an algorithm to resolve it. You’ll know this if you had to invert, by hand, a 3 × 3 matrix in high school. From a numerical stability perspective, it is also best not to compute the matrix inverse directly, but rather to ask the computer to solve the system of linear equations given by
$$\inputMatrix^\top\inputMatrix \mappingVector = \inputMatrix^\top\dataVector$$
for $\mappingVector$. This can be done in numpy using the command

-
import numpy as np
-
np.linalg.solve?
-

so we can obtain the solution using

-
w = np.linalg.solve(X.T@X, X.T@y)
-print(w)
-

We can map it back to the liner regression and plot the fit as follows

-

Multivariate Linear Regression

-

A major advantage of the new system is that we can build a linear regression on a multivariate system. The matrix calculus didn’t specify what the length of the vector $\inputVector$ should be, or equivalently the size of the design matrix.

+

First, we need to compute the full objective by substituting our +prediction function into the objective function to obtain the objective +in terms of \(\mathbf{ w}\). Doing this +we obtain \[ +E(\mathbf{ w})= (\mathbf{ y}- \boldsymbol{ \Phi}\mathbf{ w})^\top +(\mathbf{ y}- \boldsymbol{ \Phi}\mathbf{ w}). +\] We now need to differentiate this quadratic form to +find the minimum. We differentiate with respect to the vector +\(\mathbf{ w}\). But before we do that, +we’ll expand the brackets in the quadratic form to obtain a series of +scalar terms. The rules for bracket expansion across the vectors are +similar to those for the scalar system giving, \[ +(\mathbf{a} - \mathbf{b})^\top +(\mathbf{c} - \mathbf{d}) = \mathbf{a}^\top \mathbf{c} - \mathbf{a}^\top +\mathbf{d} - \mathbf{b}^\top \mathbf{c} + \mathbf{b}^\top \mathbf{d} +\] which substituting for \(\mathbf{a} += \mathbf{c} = \mathbf{ y}\) and \(\mathbf{b}=\mathbf{d} = \boldsymbol{ \Phi}\mathbf{ +w}\) gives \[ +E(\mathbf{ w})= +\mathbf{ y}^\top\mathbf{ y}- 2\mathbf{ y}^\top\boldsymbol{ \Phi}\mathbf{ +w}+ +\mathbf{ w}^\top\boldsymbol{ \Phi}^\top\boldsymbol{ \Phi}\mathbf{ w} +\] where we used the fact that \(\mathbf{ y}^\top\boldsymbol{ \Phi}\mathbf{ +w}=\mathbf{ w}^\top\boldsymbol{ \Phi}^\top\mathbf{ y}\).

+

Now we can use our rules of differentiation to compute the derivative +of this form, which is, \[ +\frac{\text{d}}{\text{d}\mathbf{ w}}E(\mathbf{ w})=- 2\boldsymbol{ +\Phi}^\top \mathbf{ y}+ +2\boldsymbol{ \Phi}^\top\boldsymbol{ \Phi}\mathbf{ w}, +\] where we have exploited the fact that \(\boldsymbol{ \Phi}^\top\boldsymbol{ \Phi}\) +is symmetric to obtain this result.

+

Exercise 1

+

Use the equivalence between our vector and our matrix formulations of +linear regression, alongside our definition of vector derivates, to +match the gradients we’ve computed directly for \(\frac{\text{d}E(c, m)}{\text{d}c}\) and +\(\frac{\text{d}E(c, m)}{\text{d}m}\) +to those for \(\frac{\text{d}E(\mathbf{ +w})}{\text{d}\mathbf{ w}}\).

+

Update Equation for Global +Optimum

+

We need to find the minimum of our objective function. Using our objective function, we can minimize for our parameter vector \(\mathbf{ w}\). Firstly, we seek stationary points by finding parameter vectors that solve for when the gradients are zero, \[
\mathbf{0}=- 2\boldsymbol{ \Phi}^\top \mathbf{ y}+ 2\boldsymbol{ \Phi}^\top\boldsymbol{ \Phi}\mathbf{ w},
\] where \(\mathbf{0}\) is a vector of zeros. Rearranging this equation, we find the solution to be \[
\boldsymbol{ \Phi}^\top \boldsymbol{ \Phi}\mathbf{ w}= \boldsymbol{ \Phi}^\top \mathbf{ y},
\] which is a matrix equation of the familiar form \(\mathbf{A}\mathbf{x} = \mathbf{b}\).

+

Solving the Multivariate +System

+

The solution for \(\mathbf{ w}\) can +be written mathematically in terms of a matrix inverse of \(\boldsymbol{ \Phi}^\top\boldsymbol{ +\Phi}\), but computation of a matrix inverse requires an +algorithm to resolve it. You’ll know this if you had to invert, by hand, +a \(3\times 3\) matrix in high school. +From a numerical stability perspective, it is also best not to compute +the matrix inverse directly, but rather to ask the computer to +solve the system of linear equations given by \[ +\boldsymbol{ \Phi}^\top\boldsymbol{ \Phi}\mathbf{ w}= \boldsymbol{ +\Phi}^\top\mathbf{ y} +\] for \(\mathbf{ w}\).
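In numpy this is a single call. A minimal sketch, mirroring the code that was removed above and assuming Phi and y as constructed earlier for the Olympic marathon data:

import numpy as np

w = np.linalg.solve(Phi.T @ Phi, Phi.T @ y)   # solve the normal equations for w
print(w)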

+

Multivariate Linear +Regression

+

A major advantage of the new system is that we can build a linear +regression on a multivariate system. The matrix calculus didn’t specify +what the length of the vector \(\mathbf{ +x}\) should be, or equivalently the size of the design +matrix.

Movie Body Count Data

-

Let’s consider the movie body count data.

-
import pods
-
data = pods.datasets.movie_body_count()
-movies = data['Y']
-

Let’s remind ourselves of the features we’ve been provided with.

-
print(', '.join(movies.columns))
-

Now we will build a design matrix based on the numeric features: year, Body_Count, Length_Minutes in an effort to predict the rating. We build the design matrix as follows:

-

Relation to Single Input System

+
+[edit] +
+

This is a data set created by Simon Garnier and Randy Olson for exploring the differences between R and Python for data science. The data contains information about different movies augmented by estimates of how many on-screen deaths are contained in the movie. The data is scraped from http://www.moviebodycounts.com. The data contains the following features for each movie: Year, Body_Count, MPAA_Rating, Genre, Director, Actors, Length_Minutes, IMDB_Rating.

+
import pods
+
data = pods.datasets.movie_body_count()
+movies = data['Y']
+

The data is provided to us in the form of a pandas data frame, we can +see the features we’re provided with by inspecting the columns of the +data frame.

+
print(', '.join(movies.columns))
+

Multivariate +Regression on Movie Body Count Data

+
+[edit] +
+

Now we will build a design matrix based on the numeric features: +year, Body_Count, Length_Minutes in an effort to predict the rating. We +build the design matrix as follows:

Bias as an additional feature.

-
select_features = ['Year', 'Body_Count', 'Length_Minutes']
-X = movies[select_features]
-X['Eins'] = 1 # add a column for the offset
-y = movies[['IMDB_Rating']]
-

Now let’s perform a linear regression. But this time, we will create a pandas data frame for the result so we can store it in a form that we can visualise easily.

-
import pandas as pd
-
w = pd.DataFrame(data=np.linalg.solve(X.T@X, X.T@y),  # solve linear regression here
-                 index = X.columns,  # columns of X become rows of w
-                 columns=['regression_coefficient']) # the column of X is the value of regression coefficient
-

We can check the residuals to see how good our estimates are

-
(y - X@w).hist()
-

Which shows our model hasn’t yet done a great job of representation, because the spread of values is large. We can check what the rating is dominated by in terms of regression coefficients.

-
w
-

Although we have to be a little careful about interpretation because our input values live on different scales, however it looks like we are dominated by the bias, with a small negative effect for later films (but bear in mind the years are large, so this effect is probably larger than it looks) and a positive effect for length. So it looks like long earlier films generally do better, but the residuals are so high that we probably haven’t modelled the system very well.

+
select_features = ['Year', 'Body_Count', 'Length_Minutes']
+Phi = movies[select_features]
+Phi['Eins'] = 1 # add a column for the offset
+y = movies[['IMDB_Rating']]
+

Now let’s perform a linear regression. But this time, we will create +a pandas data frame for the result so we can store it in a form that we +can visualise easily.

+
import pandas as pd
+
w = pd.DataFrame(data=np.linalg.solve(Phi.T@Phi, Phi.T@y),  # solve linear regression here
+                 index = Phi.columns,  # columns of Phi become rows of w
+                 columns=['regression_coefficient']) # the column of Phi is the value of regression coefficient
+

We can check the residuals to see how good our estimates are. First +we create a pandas data frame containing the predictions and use it to +compute the residuals.

+
ypred = pd.DataFrame(data=(Phi@w).values, columns=['IMDB_Rating'])
+resid = y-ypred
+
+

Figure: Residual values for the ratings from the prediction of the +movie rating given the data from the film.

+
+
+

This shows our model hasn’t yet done a great job of representation, because the spread of values is large. We can check what the rating is dominated by in terms of regression coefficients.

+
w
+

Although we have to be a little careful about interpretation because our input values live on different scales, it looks like we are dominated by the bias, with a small negative effect for later films (but bear in mind the years are large, so this effect is probably larger than it looks) and a positive effect for length. So it looks like long, earlier films generally do better, but the residuals are so high that we probably haven’t modelled the system very well.

Underdetermined System

What about the situation where you have more parameters than data in your simultaneous equation? This is known as an underdetermined system. In fact, this set up is in some sense easier to solve, because we don't need to think about introducing a slack variable (although it might make a lot of sense from a modelling perspective to do so).

The way Laplace proposed resolving an overdetermined system was to introduce slack variables, \(\epsilon_i\), which needed to be estimated for each point. The slack variable represented the difference between our actual prediction and the true observation. This is known as the residual. By introducing the slack variable, we now have an additional \(n\) variables to estimate, one for each data point, \(\{\epsilon_i\}\). This turns the overdetermined system into an underdetermined system. Introduction of \(n\) variables, plus the original \(m\) and \(c\), gives us \(n+2\) parameters to be estimated from \(n\) observations, which makes the system underdetermined. However, we then made a probabilistic assumption about the slack variables: we assumed that they were distributed according to a probability density. And for the moment we have been assuming that density was the Gaussian, \[
\epsilon_i \sim \mathcal{N}\left(0,\sigma^2\right),
\] with zero mean and variance \(\sigma^2\).

The follow-up question is whether we can do the same thing with the parameters. If we have two parameters and only one observation, can we place a probability distribution over the parameters as we did with the slack variables? The answer is yes.

Underdetermined System

Figure: An underdetermined system can be fit by considering uncertainty. Multiple solutions are consistent with one specified point.
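To make the point concrete, here is a small illustrative sketch (not part of the original notes): with one observation and two parameters, \(m\) and \(c\), numpy's least-squares routine returns the minimum-norm solution, but a whole family of other solutions fits the single point exactly.

import numpy as np

# One observation, two unknowns: y = m*x + c is underdetermined.
x_obs, y_obs = 2.0, 3.0
A = np.array([[x_obs, 1.0]])          # single equation in (m, c)
b = np.array([y_obs])

# lstsq returns the minimum-norm solution for an underdetermined system.
m_min, c_min = np.linalg.lstsq(A, b, rcond=None)[0]
print(m_min, c_min)

# But any (m, c) on the line m*x_obs + c = y_obs fits the observation exactly.
for m in np.linspace(-2, 2, 5):
    print(m, y_obs - m*x_obs)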

Two Dimensional Gaussian

Consider the distribution of height (in meters) of an adult male human population. We will approximate the marginal density of heights as a Gaussian density with mean given by \(1.7\text{m}\) and a standard deviation of \(0.15\text{m}\), implying a variance of \(\sigma^2=0.0225\), \[
p(h) \sim \mathcal{N}\left(1.7,0.0225\right).
\] Similarly, we assume that weights of the population are distributed as a Gaussian density with a mean of \(75\text{kg}\) and a standard deviation of \(6\text{kg}\) (implying a variance of 36), \[
p(w) \sim \mathcal{N}\left(75,36\right).
\]

Two Dimensional Gaussian

Independence Assumption

First of all, we make an independence assumption: we assume that height and weight are independent. The definition of probabilistic independence is that the joint density, \(p(w, h)\), factorizes into its marginal densities, \[
p(w, h) = p(w)p(h).
\] Given this assumption we can sample from the joint distribution by independently sampling weights and heights.

Figure: Samples from independent Gaussian variables that might represent heights and weights.
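As a small sketch of this independent sampling (the sample size and random seed are arbitrary choices, not from the original notes):

import numpy as np

np.random.seed(4)
num_samples = 1000
h_samples = np.random.normal(1.7, 0.15, size=num_samples)   # heights in metres
w_samples = np.random.normal(75., 6., size=num_samples)     # weights in kg
# Under independence each variable is drawn without reference to the other.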

In reality height and weight are not independent. Taller people tend on average to be heavier, and heavier people are likely to be taller. This is reflected by the body mass index, a ratio suggested by one of the fathers of statistics, Adolphe Quetelet. Quetelet was interested in the notion of the average man and collected various statistics about people. He defined the BMI to be, \[
\text{BMI} = \frac{w}{h^2}.
\] To deal with this dependence we now introduce the notion of correlation to the multivariate Gaussian density.

Sampling Two Dimensional Variables

Figure: Samples from correlated Gaussian variables that might represent heights and weights.
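A sketch of how such correlated samples can be drawn, by writing down a joint covariance with a non-zero off-diagonal term (the correlation value of 0.6 is purely illustrative):

import numpy as np

mean = np.array([1.7, 75.])
std = np.array([0.15, 6.])
rho = 0.6                                    # assumed correlation, for illustration
cov = np.array([[std[0]**2,         rho*std[0]*std[1]],
                [rho*std[0]*std[1], std[1]**2]])
samples = np.random.multivariate_normal(mean, cov, size=1000)
h_samples, w_samples = samples[:, 0], samples[:, 1]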

Independent Gaussians

\[
p(w, h) = p(w)p(h)
\]

\[
p(w, h) = \frac{1}{\sqrt{2\pi \sigma_1^2}\sqrt{2\pi\sigma_2^2}} \exp\left(-\frac{1}{2}\left(\frac{(w-\mu_1)^2}{\sigma_1^2} + \frac{(h-\mu_2)^2}{\sigma_2^2}\right)\right)
\]

\[
p(w, h) = \frac{1}{\sqrt{2\pi\sigma_1^2 \, 2\pi\sigma_2^2}} \exp\left(-\frac{1}{2}\left(\begin{bmatrix}w \\ h\end{bmatrix} - \begin{bmatrix}\mu_1 \\ \mu_2\end{bmatrix}\right)^\top\begin{bmatrix}\sigma_1^2& 0\\0&\sigma_2^2\end{bmatrix}^{-1}\left(\begin{bmatrix}w \\ h\end{bmatrix} - \begin{bmatrix}\mu_1 \\ \mu_2\end{bmatrix}\right)\right)
\]

\[
p(\mathbf{y}) = \frac{1}{\det{2\pi \mathbf{D}}^{\frac{1}{2}}} \exp\left(-\frac{1}{2}(\mathbf{y} - \boldsymbol{\mu})^\top\mathbf{D}^{-1}(\mathbf{y} - \boldsymbol{\mu})\right)
\]

Correlated Gaussian

Form correlated from original by rotating the data space using matrix \(\mathbf{R}\).

\[
p(\mathbf{y}) = \frac{1}{\det{2\pi\mathbf{D}}^{\frac{1}{2}}} \exp\left(-\frac{1}{2}(\mathbf{y} - \boldsymbol{\mu})^\top\mathbf{D}^{-1}(\mathbf{y} - \boldsymbol{\mu})\right)
\]

\[
p(\mathbf{y}) = \frac{1}{\det{2\pi\mathbf{D}}^{\frac{1}{2}}} \exp\left(-\frac{1}{2}(\mathbf{R}^\top\mathbf{y} - \mathbf{R}^\top\boldsymbol{\mu})^\top\mathbf{D}^{-1}(\mathbf{R}^\top\mathbf{y} - \mathbf{R}^\top\boldsymbol{\mu})\right)
\]

\[
p(\mathbf{y}) = \frac{1}{\det{2\pi\mathbf{D}}^{\frac{1}{2}}} \exp\left(-\frac{1}{2}(\mathbf{y} - \boldsymbol{\mu})^\top\mathbf{R}\mathbf{D}^{-1}\mathbf{R}^\top(\mathbf{y} - \boldsymbol{\mu})\right)
\] this gives a covariance matrix: \[
\mathbf{C}^{-1} = \mathbf{R}\mathbf{D}^{-1}\mathbf{R}^\top
\]

\[
p(\mathbf{y}) = \frac{1}{\det{2\pi\mathbf{C}}^{\frac{1}{2}}} \exp\left(-\frac{1}{2}(\mathbf{y} - \boldsymbol{\mu})^\top\mathbf{C}^{-1}(\mathbf{y} - \boldsymbol{\mu})\right)
\] this gives a covariance matrix: \[
\mathbf{C} = \mathbf{R}\mathbf{D}\mathbf{R}^\top
\]
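A short numerical sketch of this construction (the rotation angle and diagonal variances are illustrative): rotate a diagonal covariance \(\mathbf{D}\) with \(\mathbf{R}\) to obtain \(\mathbf{C} = \mathbf{R}\mathbf{D}\mathbf{R}^\top\), then sample from the resulting correlated Gaussian.

import numpy as np

theta = np.pi/4                                   # rotation angle
R = np.array([[np.cos(theta), -np.sin(theta)],
              [np.sin(theta),  np.cos(theta)]])
D = np.diag([1.0, 0.1])                           # axis-aligned variances
C = R@D@R.T                                       # correlated covariance
samples = np.random.multivariate_normal(np.zeros(2), C, size=500)
print(np.cov(samples.T))                          # empirically close to C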

Basis Functions

Here's the idea: instead of working directly on the original input space, \(\mathbf{x}\), we build models in a new space, \(\boldsymbol{\phi}(\mathbf{x})\), where \(\boldsymbol{\phi}(\cdot)\) is a vector-valued function that is defined on the space \(\mathbf{x}\).

Quadratic Basis

Remember that a vector-valued function is just a vector that contains functions instead of values. Here's an example for a one-dimensional input space, \(x\), being projected to a quadratic basis. First we consider each basis function in turn; we can think of the elements of our vector as being indexed so that we have \[
\begin{align*}
\phi_1(x) & = 1, \\
\phi_2(x) & = x, \\
\phi_3(x) & = x^2.
\end{align*}
\] Now we can consider them together by placing them in a vector, \[
\boldsymbol{\phi}(x) = \begin{bmatrix} 1\\ x \\ x^2\end{bmatrix}.
\] For the vector-valued function, we have simply collected the different functions together in the same vector, making them notationally easier to deal with in our mathematics.

When we consider the vector-valued function for each data point, then we place all the data into a matrix. The result is a matrix-valued function, \[
\boldsymbol{\Phi}(\mathbf{x}) =
\begin{bmatrix}
1 & x_1 & x_1^2 \\
1 & x_2 & x_2^2\\
\vdots & \vdots & \vdots \\
1 & x_n & x_n^2
\end{bmatrix}
\] where we are still in the one-dimensional input setting, so \(\mathbf{x}\) here represents a vector of our inputs with \(n\) elements.

Let's try constructing such a matrix for a set of inputs. First of all, we create a function that returns the matrix-valued function.

import numpy as np

def quadratic(x, **kwargs):
    """Take in a vector of input values and return the design matrix associated 
    with the basis functions."""
    return np.hstack([np.ones((x.shape[0], 1)), x, x**2])
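As a quick usage check of this function (the inputs here are an arbitrary illustration), remember that x must be a column vector:

x_check = np.linspace(-1, 1, 5)[:, np.newaxis]   # shape (5, 1)
Phi_check = quadratic(x_check)
print(Phi_check.shape)                           # (5, 3): columns are 1, x, x**2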

Functions Derived from Quadratic Basis

\[
f(x) = {\color{red}{w_0}} + {\color{magenta}{w_1 x}} + {\color{blue}{w_2 x^2}}
\]

Figure: The set of functions which are combined to form a quadratic basis.

This function takes in an \(n\times 1\) dimensional vector and returns an \(n\times 3\) dimensional design matrix containing the basis functions. We can plot those basis functions against their inputs as follows.

The actual function we observe is then made up of a sum of these functions. This is the reason for the name basis. The term basis means 'the underlying support or foundation for an idea, argument, or process', and in this context they form the underlying support for our prediction function. Our prediction function can only be composed of a weighted linear sum of our basis functions.

Quadratic Functions

Figure: Functions constructed by weighted sum of the components of a quadratic basis.

Rectified Linear Units

The rectified linear unit is a basis function that emerged out of the deep learning community. Rectified linear units are popular in the current generation of multilayer perceptron models, or deep networks. These basis functions start flat, and then become linear functions at a certain threshold, \[
\phi_j(x) = xH(v_j x + v_0),
\] where \(H(\cdot)\) is the Heaviside step function.

import numpy as np
import mlai

from mlai import relu
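For readers who want to see the shape of such a basis without the mlai helper, here is a stand-alone sketch (it is not the mlai.relu implementation; the thresholds match the ones used in the figure below):

import numpy as np

def relu_basis(x, centres):
    """Columns of x*H(x - centre): zero below each centre, linear above it."""
    return np.hstack([x*(x > centre) for centre in centres])

x_check = np.linspace(-2, 2, 9)[:, np.newaxis]
Phi_relu = relu_basis(x_check, centres=[-1.0, -0.33, 0.33, 1.0])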
Figure: The set of functions which are combined to form a rectified linear unit basis.

Functions Derived from Relu Basis

\[
f(x) = \color{red}{w_0} + \color{magenta}{w_1 xH(x+1.0)} + \color{blue}{w_2 xH(x+0.33)} + \color{green}{w_3 xH(x-0.33)} + \color{cyan}{w_4 xH(x-1.0)}
\]

Figure: A rectified linear unit basis is made up of different rectified linear unit functions centered at different points.

Gaussian Processes

Models where we model the entire joint distribution of our training data, \(p(\mathbf{y}, \mathbf{X})\), are sometimes described as generative models, because we can use sampling to generate data sets that represent all our assumptions. However, as we discussed in earlier sessions, this can be a bad idea, because if our assumptions are wrong then we can make poor predictions. We can try to make more complex assumptions about data to alleviate the problem, but then this typically leads to challenges for tractable application of the sum and product rules of probability that are needed to compute the relevant marginal and conditional densities. If we know the form of the question we wish to answer then we typically try and represent that directly, through \(p(\mathbf{y}|\mathbf{X})\). In practice, we have also been making assumptions of conditional independence given the model parameters, \[
p(\mathbf{y}|\mathbf{X}, \mathbf{w}) = \prod_{i=1}^{n} p(y_i | \mathbf{x}_i, \mathbf{w}).
\] Gaussian processes are not normally considered to be generative models, but we will be much more interested in the principles of conditioning in Gaussian processes because we will use conditioning to make predictions between our test and training data. We will avoid the data conditional independence assumption in favour of a richer assumption about the data: in a Gaussian process we assume data is jointly Gaussian with a particular mean and covariance, \[
\mathbf{y}|\mathbf{X} \sim \mathcal{N}\left(\mathbf{m}(\mathbf{X}),\mathbf{K}(\mathbf{X})\right),
\] where the conditioning is on the inputs \(\mathbf{X}\), which are used for computing the mean and covariance. For this reason they are known as mean and covariance functions.

Linear Model Overview

However, we are focussing on what happens in models which are non-linear in the inputs, whereas the above would be linear in the inputs. To consider these, we introduce a matrix, called the design matrix. We set each activation function computed at each data point to be \[
\phi_{i,j} = \phi(\mathbf{w}^{(1)}_{j}, \mathbf{x}_{i})
\] and define the matrix of activations (known as the design matrix in statistics) to be, \[
\boldsymbol{\Phi} =
\begin{bmatrix}
\phi_{1, 1} & \phi_{1, 2} & \dots & \phi_{1, h} \\
\phi_{2, 1} & \phi_{2, 2} & \dots & \phi_{2, h} \\
\vdots & \vdots & \ddots & \vdots \\
\phi_{n, 1} & \phi_{n, 2} & \dots & \phi_{n, h}
\end{bmatrix}.
\] By convention this matrix always has \(n\) rows and \(h\) columns. Now define the vector of all noise corruptions, \(\boldsymbol{\epsilon} = \left[\epsilon_1, \dots, \epsilon_n\right]^\top\).

If we define the prior distribution over the vector \(\mathbf{w}\) to be Gaussian, \[
\mathbf{w} \sim \mathcal{N}\left(\mathbf{0},\alpha\mathbf{I}\right),
\] then we can use rules of multivariate Gaussians to see that, \[
\mathbf{y} \sim \mathcal{N}\left(\mathbf{0},\alpha \boldsymbol{\Phi}\boldsymbol{\Phi}^\top + \sigma^2 \mathbf{I}\right).
\]

In other words, our training data is distributed as a multivariate Gaussian, with zero mean and a covariance given by \[
\mathbf{K} = \alpha \boldsymbol{\Phi}\boldsymbol{\Phi}^\top + \sigma^2 \mathbf{I}.
\]

This is an \(n\times n\) matrix. Its elements are in the form of a function. The maths shows that any element, indexed by \(i\) and \(j\), is a function only of the inputs associated with data points \(i\) and \(j\), \(\mathbf{x}_i\) and \(\mathbf{x}_j\): \(k_{i,j} = k\left(\mathbf{x}_i, \mathbf{x}_j\right)\).

If we look at the portion of this function associated only with \(f(\cdot)\), i.e. we remove the noise, then we can write down the covariance associated with our neural network, \[
k_f\left(\mathbf{x}_i, \mathbf{x}_j\right) = \alpha \boldsymbol{\phi}\left(\mathbf{W}_1, \mathbf{x}_i\right)^\top \boldsymbol{\phi}\left(\mathbf{W}_1, \mathbf{x}_j\right)
\] so the elements of the covariance or kernel matrix are formed by inner products of the rows of the design matrix.
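A small sketch of this covariance construction in code (the design matrix here is a random stand-in; alpha and sigma2 are illustrative values):

import numpy as np

alpha, sigma2 = 1.0, 0.01
Phi_demo = np.random.randn(10, 3)                     # stand-in design matrix, n=10, h=3
K_demo = alpha*Phi_demo@Phi_demo.T + sigma2*np.eye(Phi_demo.shape[0])
# Off-diagonal elements are inner products of rows of the design matrix.
i, j = 2, 5
print(np.isclose(K_demo[i, j], alpha*Phi_demo[i]@Phi_demo[j]))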

Gaussian Process

This is the essence of a Gaussian process. Instead of making assumptions about our density over each data point, \(y_i\), as i.i.d. we make a joint Gaussian assumption over our data. The covariance matrix is now a function of both the parameters of the activation function, \(\mathbf{V}\), and the input variables, \(\mathbf{X}\). This comes about through integrating out the parameters of the model, \(\mathbf{w}\).

Basis Functions

We can basically put anything inside the basis functions, and many people do. These can be deep kernels (Cho and Saul, 2009) or we can learn the parameters of a convolutional neural network inside there.

Viewing a neural network in this way is also what allows us to perform sensible batch normalizations (Ioffe and Szegedy, 2015).

Radial Basis Functions

Another type of basis is sometimes known as a 'radial basis' because the basis functions are constructed on 'centres' and the effect of each basis function decreases as the radial distance from each centre increases, \[
\phi_j(x) = \exp\left(-\frac{(x-\mu_j)^2}{\ell^2}\right).
\]

import mlai

from mlai import radial
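Again, as a stand-alone sketch of the formula above (an illustration rather than the mlai.radial implementation; the centres and lengthscale are arbitrary):

import numpy as np

def radial_basis(x, centres, lengthscale=1.0):
    """Columns of exp(-(x - mu_j)**2/lengthscale**2) for each centre mu_j."""
    return np.hstack([np.exp(-(x - mu)**2/lengthscale**2) for mu in centres])

x_check = np.linspace(-2, 2, 9)[:, np.newaxis]
Phi_radial = radial_basis(x_check, centres=[-1.0, 0.0, 1.0], lengthscale=0.5)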
Figure: The set of functions which are combined to form the radial basis.

Functions Derived from Radial Basis

\[
f(x) = \color{red}{w_1 e^{-2(x+1)^2}} + \color{magenta}{w_2 e^{-2x^2}} + \color{blue}{w_3 e^{-2(x-1)^2}}
\]

Figure: A radial basis is made up of different locally effective functions centered at different points.

Marginal Likelihood

To understand the Gaussian process we're going to build on our understanding of the marginal likelihood for Bayesian regression. In an earlier session we sampled directly from the weight vector, \(\mathbf{w}\), and applied it to the basis matrix \(\boldsymbol{\Phi}\) to obtain a sample from the prior and a sample from the posterior. It is often helpful to think of modeling techniques as generative models, and to give some thought as to what the process for obtaining data from the model is. From the perspective of Gaussian processes, we want to start by thinking of basis function models, where the parameters are sampled from a prior, but move to thinking about sampling from the marginal likelihood directly.

Sampling from the Prior

The first thing we'll do is to set up the parameters of the model; these include the parameters of the prior, the parameters of the basis functions and the noise level.

# set prior variance on w
alpha = 4.
# set the order of the polynomial basis set
degree = 5
# set the noise variance
sigma2 = 0.01

Now we have the variance, we can sample from the prior distribution to see what form we are imposing on the functions a priori.

Let's now compute a range of values to make predictions at, spanning the new space of inputs,

import numpy as np

def polynomial(x, degree, loc, scale):
    """Build a polynomial basis: rescale the inputs and raise them to each degree."""
    degrees = np.arange(degree+1)
    return ((x-loc)/scale)**degrees

Now let's build the basis matrices. First we load in the data.

import pods

data = pods.datasets.olympic_marathon_men()
x = data['X']
y = data['Y']

loc = 1950.
scale = 100.
num_data = x.shape[0]
num_pred_data = 100 # how many points to use for plotting predictions
x_pred = np.linspace(1880, 2030, num_pred_data)[:, np.newaxis] # input locations for predictions
Phi_pred = polynomial(x_pred, degree=degree, loc=loc, scale=scale)
Phi = polynomial(x, degree=degree, loc=loc, scale=scale)

Weight Space View

To generate typical functional predictions from the model, we need a set of model parameters. We assume that the parameters are drawn independently from a Gaussian density, \[
\mathbf{w} \sim \mathcal{N}\left(\mathbf{0},\alpha\mathbf{I}\right),
\] then we can combine this with the definition of our prediction function \(f(\mathbf{x})\), \[
f(\mathbf{x}) = \mathbf{w}^\top \boldsymbol{\phi}(\mathbf{x}).
\] We can now sample from the prior density to obtain a vector \(\mathbf{w}\) using the function np.random.normal and combine these parameters with our basis to create some samples of what \(f(\mathbf{x})\) looks like, as sketched below.
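A minimal sketch of that sampling step, assuming the alpha, Phi_pred and x_pred variables defined above (the plotting details are kept deliberately simple):

import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
for i in range(10):
    w_sample = np.sqrt(alpha)*np.random.normal(size=(Phi_pred.shape[1], 1))
    f_sample = Phi_pred@w_sample                 # prediction function evaluated at x_pred
    ax.plot(x_pred.flatten(), f_sample.flatten(), linewidth=2)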

Function Space View

The process we have used to generate the samples is a two-stage process. To obtain each function, we first generated a sample from the prior, \[
\mathbf{w} \sim \mathcal{N}\left(\mathbf{0},\alpha \mathbf{I}\right),
\] then, if we compose our basis matrix, \(\boldsymbol{\Phi}\), from the basis functions associated with each row, we get, \[
\boldsymbol{\Phi} = \begin{bmatrix}\boldsymbol{\phi}(\mathbf{x}_1) \\
\vdots \\
\boldsymbol{\phi}(\mathbf{x}_n)\end{bmatrix}
\] and we can write down the vector of function values, \[
\mathbf{f} = \begin{bmatrix} f_1 \\ \vdots \\ f_n\end{bmatrix},
\] in the form \[
\mathbf{f} = \boldsymbol{\Phi}\mathbf{w}.
\]

Now we can use standard properties of multivariate Gaussians to write down the probability density that is implied over \(\mathbf{f}\). In particular, we know that if \(\mathbf{w}\) is sampled from a multivariate normal (or multivariate Gaussian) with covariance \(\alpha \mathbf{I}\) and zero mean, then, assuming that \(\boldsymbol{\Phi}\) is a deterministic matrix (i.e. it is not sampled from a probability density), the vector \(\mathbf{f}\) will also be distributed according to a zero mean multivariate normal as follows, \[
\mathbf{f} \sim \mathcal{N}\left(\mathbf{0},\alpha \boldsymbol{\Phi}\boldsymbol{\Phi}^\top\right).
\]

The question now is, what happens if we sample \(\mathbf{f}\) directly from this density, rather than first sampling \(\mathbf{w}\) and then multiplying by \(\boldsymbol{\Phi}\)? Let's try this. First of all, we define the covariance as \[
\mathbf{K} = \alpha \boldsymbol{\Phi}\boldsymbol{\Phi}^\top.
\]

K = alpha*Phi_pred@Phi_pred.T

Now we can use the np.random.multivariate_normal command for sampling from a multivariate normal with covariance given by \(\mathbf{K}\) and zero mean,

import matplotlib.pyplot as plt
import mlai
import mlai.plot as plot  # assumed import for the plotting helpers (e.g. big_wide_figsize) used in these notes

fig, ax = plt.subplots(figsize=plot.big_wide_figsize)
for i in range(10):
    f_sample = np.random.multivariate_normal(mean=np.zeros(x_pred.size), cov=K)
    ax.plot(x_pred.flatten(), f_sample.flatten(), linewidth=2)
    
mlai.write_figure('gp-sample-basis-function.svg', directory='./kern')

Figure: Samples directly from the covariance function implied by the basis function based covariance, \(\alpha \boldsymbol{\Phi}\boldsymbol{\Phi}^\top\).

The samples appear very similar to those which we obtained indirectly. That is no surprise because they are effectively drawn from the same multivariate normal density. However, when sampling \(\mathbf{f}\) directly we created the covariance for \(\mathbf{f}\). We can visualise the form of this covariance in an image in python with a colorbar to show scale.

Figure: Covariance of the function implied by the basis set \(\alpha\boldsymbol{\Phi}\boldsymbol{\Phi}^\top\).

This image is the covariance expressed between different points on the function. In regression we normally also add independent Gaussian noise to obtain our observations \(\mathbf{y}\), \[
\mathbf{y} = \mathbf{f} + \boldsymbol{\epsilon},
\] where the noise is sampled from an independent Gaussian distribution with variance \(\sigma^2\), \[
\boldsymbol{\epsilon} \sim \mathcal{N}\left(\mathbf{0},\sigma^2\mathbf{I}\right).
\] We can use properties of Gaussian variables, i.e. the fact that the sum of two Gaussian variables is also Gaussian, and that its covariance is given by the sum of the two covariances, whilst the mean is given by the sum of the means, to write down the marginal likelihood, \[
\mathbf{y} \sim \mathcal{N}\left(\mathbf{0},\alpha\boldsymbol{\Phi}\boldsymbol{\Phi}^\top + \sigma^2\mathbf{I}\right).
\] Sampling directly from this density gives us the noise corrupted functions,

K = alpha*Phi_pred@Phi_pred.T + sigma2*np.eye(x_pred.size)
for i in range(10):
    y_sample = np.random.multivariate_normal(mean=np.zeros(x_pred.size), cov=K)
    ax.plot(x_pred.flatten(), y_sample.flatten())
    
mlai.write_figure('gp-sample-basis-function-plus-noise.svg', 
                  directory='./kern')

Figure: Samples directly from the covariance function implied by the noise corrupted basis function based covariance, \(\alpha \boldsymbol{\Phi}\boldsymbol{\Phi}^\top + \sigma^2 \mathbf{I}\).

Here the effect of our noise term is to roughen the sampled functions. We can also increase the variance of the noise to see a different effect,

sigma2 = 1.
K = alpha*Phi_pred@Phi_pred.T + sigma2*np.eye(x_pred.size)

fig, ax = plt.subplots(figsize=plot.big_wide_figsize)
for i in range(10):
    y_sample = np.random.multivariate_normal(mean=np.zeros(x_pred.size), cov=K)
    plt.plot(x_pred.flatten(), y_sample.flatten())
    
mlai.write_figure('gp-sample-basis-function-plus-large-noise.svg', 
                  directory='./kern')

Figure: Samples directly from the covariance function implied by the noise corrupted basis function based covariance, \(\alpha \boldsymbol{\Phi}\boldsymbol{\Phi}^\top + \mathbf{I}\).

Non-degenerate Gaussian Processes

The process described above is degenerate. The covariance function is of rank at most \(h\) and, since the theoretical amount of data could always increase, \(n\rightarrow \infty\), the covariance function is not full rank. This means as we increase the amount of data to infinity, there will come a point where we can't normalize the process because the multivariate Gaussian has the form, \[
\mathcal{N}\left(\mathbf{f}|\mathbf{0},\mathbf{K}\right) = \frac{1}{\left(2\pi\right)^{\frac{n}{2}}\det{\mathbf{K}}^\frac{1}{2}} \exp\left(-\frac{\mathbf{f}^\top\mathbf{K}^{-1}\mathbf{f}}{2}\right)
\] and a degenerate kernel matrix leads to \(\det{\mathbf{K}} = 0\), defeating the normalization (it's equivalent to finding a projection in the high dimensional Gaussian where the variance of the resulting univariate Gaussian is zero, i.e. there is a null space on the covariance, or alternatively you can imagine there are one or more directions where the Gaussian has become the delta function).

Non-degenerate Gaussian Processes

Radford Neal

In the machine learning field, it was Radford Neal (Neal, 1994) who realized the potential of the next step. In his 1994 thesis, he was considering Bayesian neural networks, of the type we described above, and considered what would happen if you took the number of hidden nodes, or neurons, to infinity, i.e. \(h\rightarrow \infty\).

Figure: Page 37 of Radford Neal's 1994 thesis

In loose terms, what Radford considers is what happens to the elements of the covariance function, \[
\begin{align*}
k_f\left(\mathbf{x}_i, \mathbf{x}_j\right) & = \alpha \boldsymbol{\phi}\left(\mathbf{W}_1, \mathbf{x}_i\right)^\top \boldsymbol{\phi}\left(\mathbf{W}_1, \mathbf{x}_j\right)\\
& = \alpha \sum_k \phi\left(\mathbf{w}^{(1)}_k, \mathbf{x}_i\right) \phi\left(\mathbf{w}^{(1)}_k, \mathbf{x}_j\right)
\end{align*}
\] if, instead of considering a finite number, you sample infinitely many of these activation functions, sampling parameters from a prior density, \(p(\mathbf{v})\), for each one, \[
k_f\left(\mathbf{x}_i, \mathbf{x}_j\right) = \alpha \int \phi\left(\mathbf{w}^{(1)}, \mathbf{x}_i\right) \phi\left(\mathbf{w}^{(1)}, \mathbf{x}_j\right) p(\mathbf{w}^{(1)}) \text{d}\mathbf{w}^{(1)}.
\] And that's not only for Gaussian \(p(\mathbf{v})\); in fact this result holds for a range of activations, and a range of prior densities, because of the central limit theorem.

To write it in the form of a probabilistic program: as long as the distribution for \(\phi_i\) implied by this short probabilistic program, \[
\begin{align*}
\mathbf{v} & \sim p(\cdot)\\
\phi_i & = \phi\left(\mathbf{v}, \mathbf{x}_i\right),
\end{align*}
\] has finite variance, then the result of taking the number of hidden units to infinity, with appropriate scaling, is also a Gaussian process.
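A rough numerical illustration of this integral (all choices here, the ReLU-style activation and the standard normal prior, are assumptions for the sketch): averaging over many sampled hidden units approximates the covariance element.

import numpy as np

alpha = 1.0
num_hidden = 100000
V = np.random.randn(num_hidden, 2)               # prior samples of (bias, weight)
x_i, x_j = 0.5, -0.3

phi_i = np.maximum(0., V[:, 0] + V[:, 1]*x_i)    # activation at x_i for every unit
phi_j = np.maximum(0., V[:, 0] + V[:, 1]*x_j)
k_ij = alpha*np.mean(phi_i*phi_j)                # Monte Carlo estimate of k_f(x_i, x_j)
print(k_ij)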

Further Reading

To understand this argument in more detail, I highly recommend reading chapter 2 of Neal's thesis (Neal, 1994), which remains easy to read and clear today. Indeed, for readers interested in Bayesian neural networks, both Radford Neal's and David MacKay's PhD theses (MacKay, 1992) remain essential reading. Both theses embody a clarity of thought, and an ability to weave together threads from different fields, that was the business of machine learning in the 1990s. Radford and David were also pioneers in making their software widely available and publishing material on the web.

Gaussian Process

Above we sampled from the prior over parameters. Through the properties of multivariate Gaussian densities this prior over parameters implies a particular density for our data observations, \(\mathbf{y}\). In this session we sampled directly from this distribution for our data, avoiding the intermediate weight-space representation. This is the approach taken by Gaussian processes. In a Gaussian process you specify the covariance function directly, rather than implicitly through a basis matrix and a prior over parameters. Gaussian processes have the advantage that they can be nonparametric, which in simple terms means that they can have infinite basis functions. In the lectures we introduced the exponentiated quadratic covariance, also known as the RBF or the Gaussian or the squared exponential covariance function. This covariance function is specified by \[
k(\mathbf{x}, \mathbf{x}^\prime) = \alpha \exp\left(-\frac{\left\Vert \mathbf{x}-\mathbf{x}^\prime\right\Vert^2}{2\ell^2}\right),
\] where \(\left\Vert\mathbf{x} - \mathbf{x}^\prime\right\Vert^2\) is the squared distance between the two input vectors, \[
\left\Vert\mathbf{x} - \mathbf{x}^\prime\right\Vert^2 = (\mathbf{x} - \mathbf{x}^\prime)^\top (\mathbf{x} - \mathbf{x}^\prime).
\] Let's build a covariance matrix based on this function. First we define the form of the covariance function,

import mlai

from mlai import eq_cov
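For reference, here is a stand-alone sketch of what such a covariance function computes (this is simply the formula above written out in numpy; the exact interface of the mlai eq_cov function may differ):

import numpy as np

def eq_cov_sketch(x, x_prime, variance=1., lengthscale=1.):
    """Exponentiated quadratic: variance*exp(-||x - x'||^2/(2*lengthscale**2))."""
    diff = np.asarray(x, dtype=float) - np.asarray(x_prime, dtype=float)
    return variance*np.exp(-0.5*np.dot(diff, diff)/lengthscale**2)

print(eq_cov_sketch([1.0], [2.5], variance=1., lengthscale=10.))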

We can use the eq_cov function to compute directly the covariance for \(\mathbf{f}\) at the points given by x_pred. Let's define a new function K() which does this,

import mlai

from mlai import Kernel

Now we can compute the resulting covariance,

kernel = Kernel(function=eq_cov, variance=1., lengthscale=10.)
K = kernel.K(x_pred, x_pred)

To visualise the covariance between the points we can use the imshow function in matplotlib, for example as in the sketch below.
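A minimal sketch (figure sizing and colour map left at their defaults):

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
im = ax.imshow(K, interpolation='none')
fig.colorbar(im, ax=ax)             # colorbar to show the scale of the covariance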

Finally, we can sample functions from the marginal likelihood.

Exercise 2

Moving Parameters Have a play with the parameters for this covariance function (the lengthscale and the variance) and see what effects the parameters have on the types of functions you observe.

Bayesian Inference by Rejection Sampling

One view of Bayesian inference is to assume we are given a mechanism for generating samples, where we assume that mechanism is representing an accurate view on the way we believe the world works.

This mechanism is known as our prior belief.

-

We combine our prior belief with our observations of the real world by discarding all those samples that are inconsistent with our prior. The likelihood defines mathematically what we mean by inconsistent with the prior. The higher the noise level in the likelihood, the looser the notion of consistent.

-

The samples that remain are considered to be samples from the posterior.

-

This approach to Bayesian inference is closely related to two sampling techniques known as rejection sampling and importance sampling. It is realized in practice in an approach known as approximate Bayesian computation (ABC) or likelihood-free inference.

-

In practice, the algorithm is often too slow to be practical, because most samples will be inconsistent with the data and as a result the mechanism has to be operated many times to obtain a few posterior samples.

-

However, in the Gaussian process case, when the likelihood also assumes Gaussian noise, we can operate this mechanism mathematically, and obtain the posterior density analytically. This is the benefit of Gaussian processes.

-

First we will load in two python functions for computing the covariance function.

-

Next we sample from a multivariate normal density (a multivariate Gaussian), using the covariance function as the covariance matrix.

-
plot.rejection_samples(kernel=kernel, 
-    diagrams='../slides/diagrams/gp')
+

We combine our prior belief with our observations of the real world +by discarding all those prior samples that are inconsistent with our +observations. The likelihood defines mathematically what we +mean by inconsistent with the observations. The higher the noise level +in the likelihood, the looser the notion of consistent.

+

The samples that remain are samples from the posterior.

+

This approach to Bayesian inference is closely related to two +sampling techniques known as rejection sampling and +importance sampling. It is realized in practice in an approach +known as approximate Bayesian computation (ABC) or +likelihood-free inference.

+

In practice, the algorithm is often too slow to be practical, because +most samples will be inconsistent with the observations and as a result +the mechanism must be operated many times to obtain a few posterior +samples.

+

However, in the Gaussian process case, when the likelihood also +assumes Gaussian noise, we can operate this mechanism mathematically, +and obtain the posterior density analytically. This is the +benefit of Gaussian processes.

+

First, we will load in two python functions for computing the +covariance function.

+

Next, we sample from a multivariate normal density (a multivariate +Gaussian), using the covariance function as the covariance matrix.
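As a hypothetical sketch of the rejection-sampling view described above (this is not the plot.rejection_samples routine used in the notebook; the observation indices, values and noise level below are invented for illustration, and K is assumed to be the covariance computed over x_pred earlier):

import numpy as np

num_samples = 10000
# Samples from the prior: functions drawn from the zero-mean Gaussian with covariance K.
prior_samples = np.random.multivariate_normal(np.zeros(K.shape[0]), K, size=num_samples)

x_obs_indices = [10, 25, 40]        # hypothetical indices of x_pred where we 'observe' data
y_obs = np.array([0.5, -1.0, 0.3])  # hypothetical observed values
noise_std = 0.1                     # likelihood noise level: larger means a looser notion of 'consistent'

# Discard prior samples that are inconsistent with the observations.
residuals = prior_samples[:, x_obs_indices] - y_obs
keep = np.all(np.abs(residuals) < 2*noise_std, axis=1)
posterior_samples = prior_samples[keep]
print('Accepted', posterior_samples.shape[0], 'of', num_samples, 'prior samples')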


Figure: One view of Bayesian inference is we have a machine for generating samples (the prior), and we discard all samples inconsistent with our data, leaving the samples of interest (the posterior). This is a rejection sampling view of Bayesian inference. The Gaussian process allows us to do this analytically by multiplying the prior by the likelihood.

+

Figure: One view of Bayesian inference is we have a machine for +generating samples (the prior), and we discard all samples +inconsistent with our data, leaving the samples of interest (the +posterior). This is a rejection sampling view of Bayesian +inference. The Gaussian process allows us to do this analytically by +multiplying the prior by the likelihood.

Gaussian Process

-

The Gaussian process perspective takes the marginal likelihood of the data to be a joint Gaussian density with a covariance given by $\kernelMatrix$. So the model likelihood is of the form,
$$ -p(\dataVector|\inputMatrix) = -\frac{1}{(2\pi)^{\frac{\numData}{2}}|\kernelMatrix|^{\frac{1}{2}}} -\exp\left(-\frac{1}{2}\dataVector^\top \left(\kernelMatrix+\dataStd^2 -\eye\right)^{-1}\dataVector\right) -$$
where the input data, $\inputMatrix$, influences the density through the covariance matrix, $\kernelMatrix$ whose elements are computed through the covariance function, $\kernelScalar(\inputVector, \inputVector^\prime)$.

-

This means that the negative log likelihood (the objective function) is given by,
$$ -\errorFunction(\boldsymbol{\theta}) = \frac{1}{2} \log |\kernelMatrix| -+ \frac{1}{2} \dataVector^\top \left(\kernelMatrix + -\dataStd^2\eye\right)^{-1}\dataVector -$$
where the parameters of the model are also embedded in the covariance function, they include the parameters of the kernel (such as lengthscale and variance), and the noise variance, $\dataStd^2$. Let’s create a class in python for storing these variables.

-

-from mlai import GP
+

The Gaussian process perspective takes the marginal likelihood of the +data to be a joint Gaussian density with a covariance given by \(\mathbf{K}\). So the model likelihood is of +the form, \[ +p(\mathbf{ y}|\mathbf{X}) = +\frac{1}{(2\pi)^{\frac{n}{2}}|\mathbf{K}|^{\frac{1}{2}}} +\exp\left(-\frac{1}{2}\mathbf{ y}^\top \left(\mathbf{K}+\sigma^2 +\mathbf{I}\right)^{-1}\mathbf{ y}\right) +\] where the input data, \(\mathbf{X}\), influences the density +through the covariance matrix, \(\mathbf{K}\) whose elements are computed +through the covariance function, \(k(\mathbf{ +x}, \mathbf{ x}^\prime)\).

+

This means that the negative log likelihood (the objective function) is given by, \[
E(\boldsymbol{ \theta}) = \frac{1}{2} \log |\mathbf{K}| + \frac{1}{2} \mathbf{ y}^\top \left(\mathbf{K}+ \sigma^2\mathbf{I}\right)^{-1}\mathbf{ y}
\] where the parameters of the model are embedded in the covariance function; they include the parameters of the kernel (such as the lengthscale and variance) and the noise variance, \(\sigma^2\). Let’s create a set of classes in python for storing these variables.

+
import mlai
+

+from mlai import Model
+
import mlai
+

+from mlai import MapModel
+
import mlai
+

+from mlai import ProbModel
+
import mlai
+

+from mlai import ProbMapModel
+
import mlai
+

+from mlai import GP
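As a rough sketch of the objective defined above (assuming a kernel matrix K, observations y and noise variance sigma2 are already available; this mirrors, but is not, the mlai GP implementation):

import numpy as np

def gp_negative_log_likelihood(K, y, sigma2):
    # E(theta) = 0.5*log|K + sigma2*I| + 0.5*y' (K + sigma2*I)^{-1} y,
    # dropping the constant n/2 log(2*pi).
    n = K.shape[0]
    K_noise = K + sigma2*np.eye(n)
    # slogdet is numerically preferable to log(det(...)); sign should be positive
    # for a valid covariance matrix.
    sign, logdet = np.linalg.slogdet(K_noise)
    data_fit = 0.5*np.squeeze(y.T @ np.linalg.solve(K_noise, y))
    return 0.5*logdet + data_fit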

Making Predictions

-

We now have a probability density that represents functions. How do we make predictions with this density? The density is known as a process because it is consistent. By consistency, here, we mean that the model makes predictions for $\mappingFunctionVector$ that are unaffected by future values of $\mappingFunctionVector^*$ that are currently unobserved (such as test points). If we think of $\mappingFunctionVector^*$ as test points, we can still write down a joint probability density over the training observations, $\mappingFunctionVector$ and the test observations, $\mappingFunctionVector^*$. This joint probability density will be Gaussian, with a covariance matrix given by our covariance function, $\kernelScalar(\inputVector_i, \inputVector_j)$.
$$ -\begin{bmatrix}\mappingFunctionVector \\ \mappingFunctionVector^*\end{bmatrix} \sim \gaussianSamp{\zerosVector}{\begin{bmatrix} \kernelMatrix & \kernelMatrix_\ast \\ -\kernelMatrix_\ast^\top & \kernelMatrix_{\ast,\ast}\end{bmatrix}} -$$
where here $\kernelMatrix$ is the covariance computed between all the training points, $\kernelMatrix_\ast$ is the covariance matrix computed between the training points and the test points and $\kernelMatrix_{\ast,\ast}$ is the covariance matrix computed betwen all the tests points and themselves. To be clear, let’s compute these now for our example, using x and y for the training data (although y doesn’t enter the covariance) and x_pred as the test locations.

-
# set covariance function parameters
-variance = 16.0
-lengthscale = 8
-# set noise variance
-sigma2 = 0.05
-
-kernel = Kernel(eq_cov, variance=variance, lengthscale=lengthscale)
-K = kernel.K(x, x)
-K_star = kernel.K(x, x_pred)
-K_starstar = kernel.K(x_pred, x_pred)
-

Now we use this structure to visualise the covariance between test data and training data. This structure is how information is passed between test and training data. Unlike the maximum likelihood formalisms we’ve been considering so far, the structure expresses correlation between our different data points. However, just like the we now have a joint density between some variables of interest. In particular we have the joint density over $p(\mappingFunctionVector, \mappingFunctionVector^*)$. The joint density is Gaussian and zero mean. It is specified entirely by the covariance matrix, $\kernelMatrix$. That covariance matrix is, in turn, defined by a covariance function. Now we will visualise the form of that covariance in the form of the matrix,
$$ -\begin{bmatrix} \kernelMatrix & \kernelMatrix_\ast \\ \kernelMatrix_\ast^\top -& \kernelMatrix_{\ast,\ast}\end{bmatrix} -$$

-

There are four blocks to this color plot. The upper left block is the covariance of the training data with itself, $\kernelMatrix$. We see some structure here due to the missing data from the first and second world wars. Alongside this covariance (to the right and below) we see the cross covariance between the training and the test data ($\kernelMatrix_*$ and $\kernelMatrix_*^\top$). This is giving us the covariation between our training and our test data. Finally the lower right block The banded structure we now observe is because some of the training points are near to some of the test points. This is how we obtain ‘communication’ between our training data and our test data. If there is no structure in $\kernelMatrix_*$ then our belief about the test data simply matches our prior.

-

Prediction Across Two Points with GPs

-

[edit]

-
import numpy as np
-np.random.seed(4949)
-
import teaching_plots as plot
-import pods
-

Sampling a Function from a Gaussian

-

[edit]

+

We now have a probability density that represents functions. How do we make predictions with this density? The density is known as a process because it is consistent. By consistency, here, we mean that the model makes predictions for \(\mathbf{ f}\) that are unaffected by future values of \(\mathbf{ f}^*\) that are currently unobserved (such as test points). If we think of \(\mathbf{ f}^*\) as test points, we can still write down a joint probability density over the training observations, \(\mathbf{ f}\), and the test observations, \(\mathbf{ f}^*\). This joint probability density will be Gaussian, with a covariance matrix given by our covariance function, \(k(\mathbf{ x}_i, \mathbf{ x}_j)\). \[
\begin{bmatrix}\mathbf{ f}\\ \mathbf{ f}^*\end{bmatrix} \sim
\mathcal{N}\left(\mathbf{0},\begin{bmatrix} \mathbf{K}& \mathbf{K}_\ast \\
\mathbf{K}_\ast^\top & \mathbf{K}_{\ast,\ast}\end{bmatrix}\right)
\] where here \(\mathbf{K}\) is the covariance computed between all the training points, \(\mathbf{K}_\ast\) is the covariance matrix computed between the training points and the test points and \(\mathbf{K}_{\ast,\ast}\) is the covariance matrix computed between all the test points and themselves. To be clear, let’s compute these now for our example, using x and y for the training data (although y doesn’t enter the covariance) and x_pred as the test locations.

+
# set covariance function parameters
+variance = 16.0
+lengthscale = 8
+# set noise variance
+sigma2 = 0.05
+
+kernel = Kernel(eq_cov, variance=variance, lengthscale=lengthscale)
+K = kernel.K(x, x)
+K_star = kernel.K(x, x_pred)
+K_starstar = kernel.K(x_pred, x_pred)
+

Now we use this structure to visualise the covariance between test data and training data. This structure is how information is passed between test and training data. Unlike the maximum likelihood formalisms we’ve been considering so far, the structure expresses correlation between our different data points. However, just as before, we now have a joint density over some variables of interest. In particular we have the joint density \(p(\mathbf{ f}, \mathbf{ f}^*)\). The joint density is Gaussian and zero mean. It is specified entirely by the covariance matrix, \(\mathbf{K}\). That covariance matrix is, in turn, defined by a covariance function. Now we will visualise the form of that covariance in the form of the matrix, \[
\begin{bmatrix} \mathbf{K}& \mathbf{K}_\ast \\ \mathbf{K}_\ast^\top
& \mathbf{K}_{\ast,\ast}\end{bmatrix}
\]
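A small sketch of how this block matrix could be assembled and visualised (assuming K, K_star and K_starstar from the snippet above, and matplotlib available):

import numpy as np
import matplotlib.pyplot as plt

# Assemble the joint covariance over training and test points as a block matrix.
full_K = np.block([[K,         K_star],
                   [K_star.T,  K_starstar]])

# Visualise the block structure; the off-diagonal blocks carry the
# 'communication' between training and test locations.
plt.imshow(full_K, interpolation='none')
plt.colorbar()
plt.show()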


Figure: Different blocks of the covariance function. The upper left +block is the covariance of the training data with itself, \(\mathbf{K}\). The top right is the cross +covariance between training data (rows) and prediction locations +(columns). The lower left is the same matrix transposed. The bottom +right is the covariance matrix of the test data with itself.

+
+
+

There are four blocks to this plot. The upper left block is the covariance of the training data with itself, \(\mathbf{K}\). We see some structure here due to the missing data from the first and second world wars. Alongside this covariance (to the right and below) we see the cross covariance between the training and the test data (\(\mathbf{K}_*\) and \(\mathbf{K}_*^\top\)). This is giving us the covariation between our training and our test data. Finally, the lower right block is the covariance of the test data with itself. The banded structure we now observe is because some of the training points are near to some of the test points. This is how we obtain ‘communication’ between our training data and our test data. If there is no structure in \(\mathbf{K}_*\) then our belief about the test data simply matches our prior.

+

Prediction Across Two +Points with GPs

+
+[edit] +
+
import numpy as np
+np.random.seed(4949)
+
import mlai.plot as plot
+import pods
+

Sampling a Function

+
+[edit] +
+

We will consider a Gaussian distribution with a particular structure +of covariance matrix. We will generate one sample from a +25-dimensional Gaussian density. \[ +\mathbf{ f}=\left[f_{1},f_{2}\dots f_{25}\right]. +\] in the figure below we plot these data on the \(y\)-axis against their indices on +the \(x\)-axis.

+
import mlai
+

+from mlai import Kernel
+
import mlai
+

+from mlai import polynomial_cov
+
import mlai
+

+from mlai import exponentiated_quadratic
+

Figure: A 25 dimensional correlated random variable (values plotted against index)

+
+
+

Sampling a Function from a +Gaussian

+
+[edit] +
- + -
+
-

Figure: The joint Gaussian over $\mappingFunction_1$ and $\mappingFunction_2$ along with the conditional distribution of $\mappingFunction_2$ given $\mappingFunction_1$

+

Figure: The joint Gaussian over \(f_1\) and \(f_2\) along with the conditional +distribution of \(f_2\) given \(f_1\)

-

Joint Density of f1 and f2

+

Joint Density of \(f_1\) and \(f_2\)

- + -
+
-

Figure: The joint Gaussian over $\mappingFunction_1$ and $\mappingFunction_2$ along with the conditional distribution of $\mappingFunction_2$ given $\mappingFunction_1$

+

Figure: The joint Gaussian over \(f_1\) and \(f_2\) along with the conditional +distribution of \(f_2\) given \(f_1\)

-
    -
  • The single contour of the Gaussian density represents the joint distribution, $p(\mappingFunction_1, \mappingFunction_2)$
  • -
-
-
    -
  • We observe that $\mappingFunction_1=?$
  • -
+

Uluru

+
+
+
+
-
-
    -
  • Conditional density: $p(\mappingFunction_2|\mappingFunction_1=?)$

  • -
  • Prediction of $\mappingFunction_2$ from $\mappingFunction_1$ requires conditional density.

  • -
  • Conditional density is also Gaussian.
    $$ -p(\mappingFunction_2|\mappingFunction_1) = {\mathcal{N}\left(\mappingFunction_2|\frac{\kernelScalar_{1, 2}}{\kernelScalar_{1, 1}}\mappingFunction_1,\kernelScalar_{2, 2} - \frac{\kernelScalar_{1,2}^2}{\kernelScalar_{1,1}}\right)} -$$
    where covariance of joint density is given by
    $$ -\kernelMatrix= \begin{bmatrix} \kernelScalar_{1, 1} & \kernelScalar_{1, 2}\\ \kernelScalar_{2, 1} & \kernelScalar_{2, 2}\end{bmatrix} -$$

  • -
-

Joint Density of f1 and f8

-

[edit]


Figure: Uluru, the sacred rock in Australia. If we think of it as a +probability density, viewing it from this side gives us one +marginal from the density. Figuratively speaking, slicing +through the rock would give a conditional density.

+
+
+

When viewing these contour plots, I sometimes find it helpful to think of Uluru, the prominent rock formation in Australia. The rock rises above the surface of the plain, just like a probability density rising above the zero line. The rock is three dimensional, but when we view Uluru from the classical position, we are looking at one side of it. This is equivalent to viewing the marginal density.

+

The joint density can be viewed from above, using contours. The +conditional density is equivalent to slicing the rock. Uluru is +a holy rock, so this has to be an imaginary slice. Imagine we cut down a +vertical plane orthogonal to our view point (e.g. coming across our view +point). This would give a profile of the rock, which when renormalized, +would give us the conditional distribution, the value of conditioning +would be the location of the slice in the direction we are facing.

+

Prediction with Correlated +Gaussians

+

Of course in practice, rather than manipulating mountains physically, +the advantage of the Gaussian density is that we can perform these +manipulations mathematically.

+

Prediction of \(f_2\) given \(f_1\) requires the conditional density, \(p(f_2|f_1)\). Another remarkable property of the Gaussian density is that this conditional distribution is also guaranteed to be a Gaussian density. It has the form, \[
p(f_2|f_1) = \mathcal{N}\left(f_2|\frac{k_{1, 2}}{k_{1, 1}}f_1, k_{2, 2} - \frac{k_{1,2}^2}{k_{1,1}}\right)
\] where we have assumed that the covariance of the original joint density was given by \[
\mathbf{K}= \begin{bmatrix} k_{1, 1} & k_{1, 2}\\ k_{2, 1} & k_{2, 2}\end{bmatrix}.
\]

+

Using these formulae we can determine the conditional density for any +of the elements of our vector \(\mathbf{ +f}\). For example, the variable \(f_8\) is less correlated with \(f_1\) than \(f_2\). If we consider this variable we see +the conditional density is more diffuse.
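These conditional formulae are easy to check numerically. The sketch below (assuming a kernel matrix K computed over the 25 sample locations, as in the earlier snippet; the conditioning value is hypothetical) compares the conditional variance of \(f_2\) and of \(f_8\) given \(f_1\):

import numpy as np

def conditional_given_f1(K, i, f1_value):
    # Mean and variance of f_{i+1} conditioned on f_1 = f1_value for a zero-mean
    # Gaussian with covariance K (python indices are zero-based, so f_1 is K[0, 0]).
    k11 = K[0, 0]
    k1i = K[0, i]
    kii = K[i, i]
    mean = (k1i/k11)*f1_value
    variance = kii - k1i**2/k11
    return mean, variance

# f_2 (index 1) is more strongly tied to f_1 than f_8 (index 7), so its
# conditional variance should be smaller.
print(conditional_given_f1(K, 1, f1_value=-1.0))
print(conditional_given_f1(K, 7, f1_value=-1.0))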

+

Joint Density of \(f_1\) and \(f_8\)

+
+[edit] +
- + -
+
-

Figure: Sample from the joint Gaussian model, points indexed by 1 and 8 highlighted.

+

Figure: Sample from the joint Gaussian model, points indexed by 1 and +8 highlighted.

-

Prediction of $\mappingFunction_{8}$ from $\mappingFunction_{1}$

+

Prediction of \(f_{8}\) from \(f_{1}\)


Figure: The joint Gaussian over $\mappingFunction_1$ and $\mappingFunction_8$ along with the conditional distribution of $\mappingFunction_8$ given $\mappingFunction_1$

+

Figure: The joint Gaussian over \(f_1\) and \(f_8\) along with the conditional +distribution of \(f_8\) given \(f_1\)

    -
  • The single contour of the Gaussian density represents the joint distribution, $p(\mappingFunction_1, \mappingFunction_8)$
  • +
  • The single contour of the Gaussian density represents the +joint distribution, \(p(f_1, f_8)\)
-
+

. . .

    -
  • We observe a value for $\mappingFunction_1=-?$
  • +
  • We observe a value for \(f_1=-?\)
-
-
+

. . .

    -
  • Conditional density: $p(\mappingFunction_5|\mappingFunction_1=?)$.

  • -
  • Prediction of $\mappingFunctionVector_*$ from $\mappingFunctionVector$ requires multivariate conditional density.

  • -
  • Multivariate conditional density is also Gaussian.
    $$ -p(\mappingFunctionVector_*|\mappingFunctionVector) = {\mathcal{N}\left(\mappingFunctionVector_*|\kernelMatrix_{*,\mappingFunctionVector}\kernelMatrix_{\mappingFunctionVector,\mappingFunctionVector}^{-1}\mappingFunctionVector,\kernelMatrix_{*,*}-\kernelMatrix_{*,\mappingFunctionVector} \kernelMatrix_{\mappingFunctionVector,\mappingFunctionVector}^{-1}\kernelMatrix_{\mappingFunctionVector,*}\right)} -$$

  • -
  • Here covariance of joint density is given by
    $$ -\kernelMatrix= \begin{bmatrix} \kernelMatrix_{\mappingFunctionVector, \mappingFunctionVector} & \kernelMatrix_{*, \mappingFunctionVector}\\ \kernelMatrix_{\mappingFunctionVector, *} & \kernelMatrix_{*, *}\end{bmatrix} -$$

  • -
  • Prediction of $\mappingFunctionVector_*$ from $\mappingFunctionVector$ requires multivariate conditional density.

  • -
  • Multivariate conditional density is also Gaussian.
    $$ -p(\mappingFunctionVector_*|\mappingFunctionVector) = {\mathcal{N}\left(\mappingFunctionVector_*|\meanVector,\conditionalCovariance\right)} -$$

    $$ -\meanVector= \kernelMatrix_{*,\mappingFunctionVector}\kernelMatrix_{\mappingFunctionVector,\mappingFunctionVector}^{-1}\mappingFunctionVector -$$

    $$ -\conditionalCovariance = \kernelMatrix_{*,*}-\kernelMatrix_{*,\mappingFunctionVector} \kernelMatrix_{\mappingFunctionVector,\mappingFunctionVector}^{-1}\kernelMatrix_{\mappingFunctionVector,*} -$$

  • -
  • Here covariance of joint density is given by
    $$ -\kernelMatrix= \begin{bmatrix} \kernelMatrix_{\mappingFunctionVector, \mappingFunctionVector} & \kernelMatrix_{*, \mappingFunctionVector}\\ \kernelMatrix_{\mappingFunctionVector, *} & \kernelMatrix_{*, *}\end{bmatrix} -$$

  • +
  • Conditional density: \(p(f_8|f_1=?)\).

  • +
  • Prediction of \(\mathbf{ f}_*\) +from \(\mathbf{ f}\) requires +multivariate conditional density.

  • +
  • Multivariate conditional density is also Gaussian. + \[ +p(\mathbf{ f}_*|\mathbf{ f}) = {\mathcal{N}\left(\mathbf{ +f}_*|\mathbf{K}_{*,\mathbf{ f}}\mathbf{K}_{\mathbf{ f},\mathbf{ +f}}^{-1}\mathbf{ f},\mathbf{K}_{*,*}-\mathbf{K}_{*,\mathbf{ f}} +\mathbf{K}_{\mathbf{ f},\mathbf{ f}}^{-1}\mathbf{K}_{\mathbf{ +f},*}\right)} +\]

  • +
  • Here covariance of joint density is given by \[ +\mathbf{K}= \begin{bmatrix} \mathbf{K}_{\mathbf{ f}, \mathbf{ f}} & +\mathbf{K}_{*, \mathbf{ f}}\\ \mathbf{K}_{\mathbf{ f}, *} & +\mathbf{K}_{*, *}\end{bmatrix} +\]

  • +
  • Prediction of \(\mathbf{ f}_*\) +from \(\mathbf{ f}\) requires +multivariate conditional density.

  • +
  • Multivariate conditional density is also Gaussian. + \[ +p(\mathbf{ f}_*|\mathbf{ f}) = {\mathcal{N}\left(\mathbf{ +f}_*|\boldsymbol{ \mu},\boldsymbol{ \Sigma}\right)} +\] \[ +\boldsymbol{ \mu}= \mathbf{K}_{*,\mathbf{ f}}\mathbf{K}_{\mathbf{ +f},\mathbf{ f}}^{-1}\mathbf{ f} +\] \[ +\boldsymbol{ \Sigma}= \mathbf{K}_{*,*}-\mathbf{K}_{*,\mathbf{ f}} +\mathbf{K}_{\mathbf{ f},\mathbf{ f}}^{-1}\mathbf{K}_{\mathbf{ f},*} +\]

  • +
  • Here covariance of joint density is given by \[ +\mathbf{K}= \begin{bmatrix} \mathbf{K}_{\mathbf{ f}, \mathbf{ f}} & +\mathbf{K}_{*, \mathbf{ f}}\\ \mathbf{K}_{\mathbf{ f}, *} & +\mathbf{K}_{*, *}\end{bmatrix} +\]

-
-

The Importance of the Covariance Function

-

[edit]

-

The covariance function encapsulates our assumptions about the data. The equations for the distribution of the prediction function, given the training observations, are highly sensitive to the covariation between the test locations and the training locations as expressed by the matrix $\kernelMatrix_*$. We defined a matrix A which allowed us to express our conditional mean in the form,
$$ -\meanVector_\mappingFunction = \mathbf{A}^\top \dataVector, -$$
where $\dataVector$ were our training observations. In other words our mean predictions are always a linear weighted combination of our training data. The weights are given by computing the covariation between the training and the test data ($\kernelMatrix_*$) and scaling it by the inverse covariance of the training data observations, $\left[\kernelMatrix + \dataStd^2 \eye\right]^{-1}$. This inverse is the main computational object that needs to be resolved for a Gaussian process. It has a computational burden which is $O(\numData^3)$ and a storage burden which is $O(\numData^2)$. This makes working with Gaussian processes computationally intensive for the situation where $\numData>10,000$.

+

The Importance of the +Covariance Function

+
+[edit] +
+

The covariance function encapsulates our assumptions about the data. +The equations for the distribution of the prediction function, given the +training observations, are highly sensitive to the covariation between +the test locations and the training locations as expressed by the matrix +\(\mathbf{K}_*\). We defined a matrix +\(\mathbf{A}\) which allowed us to +express our conditional mean in the form, \[ +\boldsymbol{ \mu}_f= \mathbf{A}^\top \mathbf{ y}, +\] where \(\mathbf{ y}\) were +our training observations. In other words our mean predictions +are always a linear weighted combination of our training data. +The weights are given by computing the covariation between the training +and the test data (\(\mathbf{K}_*\)) +and scaling it by the inverse covariance of the training data +observations, \(\left[\mathbf{K}+ \sigma^2 +\mathbf{I}\right]^{-1}\). This inverse is the main computational +object that needs to be resolved for a Gaussian process. It has a +computational burden which is \(O(n^3)\) and a storage burden which is +\(O(n^2)\). This makes working with +Gaussian processes computationally intensive for the situation where +\(n>10,000\).
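As a sketch of these equations (assuming K, K_star, K_starstar and sigma2 from the earlier snippet, plus the training targets y; this mirrors, but is not, the mlai GP implementation):

import numpy as np

# Posterior mean and covariance of the GP at the test locations.
K_noise = K + sigma2*np.eye(K.shape[0])
A = np.linalg.solve(K_noise, K_star)      # equivalent to [K + sigma^2 I]^{-1} K_*
mu_f = A.T @ y                            # posterior mean: a linear weighted combination of y
C_f = K_starstar - K_star.T @ A           # posterior covariance at the test points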

-
+
-

Figure: Introduction to Gaussian processes given by Neil Lawrence at the 2014 Gaussian process Winter School at the University of Sheffield.

+

Figure: Introduction to Gaussian processes given by Neil Lawrence at +the 2014 Gaussian process Winter School at the University of +Sheffield.

Improving the Numerics

-

[edit]

-

In practice we shouldn’t be using matrix inverse directly to solve the GP system. One more stable way is to compute the Cholesky decomposition of the kernel matrix. The log determinant of the covariance can also be derived from the Cholesky decomposition.

-

-from mlai import update_inverse
-
GP.update_inverse = update_inverse
+
+[edit] +
+

In practice we shouldn’t be using the matrix inverse directly to solve the GP system. A more stable way is to compute the Cholesky decomposition of the kernel matrix. The log determinant of the covariance can also be derived from the Cholesky decomposition.

+
import mlai
+

+from mlai import update_inverse
+
GP.update_inverse = update_inverse
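A minimal sketch of the Cholesky approach (illustrative, not the mlai update_inverse routine; it assumes K, sigma2 and y are already defined):

import numpy as np
from scipy.linalg import cho_factor, cho_solve

K_noise = K + sigma2*np.eye(K.shape[0])
L_factor = cho_factor(K_noise, lower=True)        # Cholesky factorisation K_noise = L L^T
alpha = cho_solve(L_factor, y)                    # solves K_noise alpha = y without an explicit inverse
log_det = 2*np.sum(np.log(np.diag(L_factor[0])))  # log determinant from the Cholesky diagonal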

Capacity Control

-

Gaussian processes are sometimes seen as part of a wider family of methods known as kernel methods. Kernel methods are also based around covariance functions, but in the field they are known as Mercer kernels. Mercer kernels have interpretations as inner products in potentially infinite dimensional Hilbert spaces. This interpretation arises because, if we take α = 1, then the kernel can be expressed as
$$ -\kernelMatrix = \basisMatrix\basisMatrix^\top -$$
which imples the elements of the kernel are given by,
$$ -\kernelScalar(\inputVector, \inputVector^\prime) = \basisVector(\inputVector)^\top \basisVector(\inputVector^\prime). -$$
So we see that the kernel function is developed from an inner product between the basis functions. Mercer’s theorem tells us that any valid positive definite function can be expressed as this inner product but with the caveat that the inner product could be infinite length. This idea has been used quite widely to kernelize algorithms that depend on inner products. The kernel functions are equivalent to covariance functions and they are parameterized accordingly. In the kernel modeling community it is generally accepted that kernel parameter estimation is a difficult problem and the normal solution is to cross validate to obtain parameters. This can cause difficulties when a large number of kernel parameters need to be estimated. In Gaussian process modelling kernel parameter estimation (in the simplest case proceeds) by maximum likelihood. This involves taking gradients of the likelihood with respect to the parameters of the covariance function.

+

Gaussian processes are sometimes seen as part of a wider family of methods known as kernel methods. Kernel methods are also based around covariance functions, but in the field they are known as Mercer kernels. Mercer kernels have interpretations as inner products in potentially infinite dimensional Hilbert spaces. This interpretation arises because, if we take \(\alpha=1\), then the kernel can be expressed as \[
\mathbf{K}= \boldsymbol{ \Phi}\boldsymbol{ \Phi}^\top
\] which implies the elements of the kernel are given by, \[
k(\mathbf{ x}, \mathbf{ x}^\prime) = \boldsymbol{ \phi}(\mathbf{ x})^\top \boldsymbol{ \phi}(\mathbf{ x}^\prime).
\] So we see that the kernel function is developed from an inner product between the basis functions. Mercer’s theorem tells us that any valid positive definite function can be expressed as this inner product, but with the caveat that the inner product could be infinite length. This idea has been used quite widely to kernelize algorithms that depend on inner products. The kernel functions are equivalent to covariance functions and they are parameterized accordingly. In the kernel modeling community it is generally accepted that kernel parameter estimation is a difficult problem and the normal solution is to cross validate to obtain parameters. This can cause difficulties when a large number of kernel parameters need to be estimated. In Gaussian process modelling, kernel parameter estimation (in the simplest case) proceeds by maximum likelihood. This involves taking gradients of the likelihood with respect to the parameters of the covariance function.
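A quick numerical illustration of \(\mathbf{K}= \boldsymbol{ \Phi}\boldsymbol{ \Phi}^\top\) (a sketch using a hypothetical polynomial basis; any finite feature map would do):

import numpy as np

# Hypothetical polynomial basis evaluated at a few inputs.
x = np.linspace(-1, 1, 5)[:, None]
Phi = np.hstack([np.ones_like(x), x, x**2])   # basis functions phi(x) = [1, x, x^2]

# The kernel implied by this (finite) basis is the inner product of the features.
K_basis = Phi @ Phi.T
print(K_basis)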

Gradients of the Likelihood

-

The easiest conceptual way to obtain the gradients is a two step process. The first step involves taking the gradient of the likelihood with respect to the covariance function, the second step involves considering the gradient of the covariance function with respect to its parameters.

+

The easiest conceptual way to obtain the gradients is a two step +process. The first step involves taking the gradient of the likelihood +with respect to the covariance function, the second step involves +considering the gradient of the covariance function with respect to its +parameters.
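As a sketch of this two-step chain rule (using the standard result for the Gaussian log likelihood, see e.g. Rasmussen and Williams (2006); it assumes K already includes the noise term and that dK_dtheta, the gradient of the covariance matrix with respect to one parameter, has been computed elsewhere):

import numpy as np

def nll_gradient(K, y, dK_dtheta):
    # Gradient of E(theta) = 0.5 log|K| + 0.5 y' K^{-1} y for one covariance parameter.
    K_inv = np.linalg.inv(K)                  # illustration only; prefer Cholesky solves in practice
    alpha = (K_inv @ y).reshape(-1, 1)
    # Step 1: gradient of the objective with respect to the covariance matrix,
    # dE/dK = 0.5*(K^{-1} - alpha alpha^T).
    dE_dK = 0.5*(K_inv - alpha @ alpha.T)
    # Step 2: chain rule with the gradient of the covariance w.r.t. the parameter.
    return np.trace(dE_dK @ dK_dtheta)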

Overall Process Scale

-

In general we won’t be able to find parameters of the covariance function through fixed point equations, we will need to do gradient based optimization.

-

Capacity Control and Data Fit

-

The objective function can be decomposed into two terms, a capacity control term, and a data fit term. The capacity control term is the log determinant of the covariance. The data fit term is the matrix inner product between the data and the inverse covariance.

-

Learning Covariance Parameters

-

[edit]

+

In general we won’t be able to find parameters of the covariance function through fixed point equations, so we will need to do gradient-based optimization.

+

Capacity Control and Data +Fit

+

The objective function can be decomposed into two terms, a capacity +control term, and a data fit term. The capacity control term is the log +determinant of the covariance. The data fit term is the matrix inner +product between the data and the inverse covariance.

+

Learning Covariance +Parameters

+
+[edit] +

Can we determine covariance parameters from the data?

-


$$ -\gaussianDist{\dataVector}{\mathbf{0}}{\kernelMatrix}=\frac{1}{(2\pi)^\frac{\numData}{2}{\det{\kernelMatrix}^{\frac{1}{2}}}}{\exp\left(-\frac{\dataVector^{\top}\kernelMatrix^{-1}\dataVector}{2}\right)} -$$

-


$$ +

\[ +\mathcal{N}\left(\mathbf{ +y}|\mathbf{0},\mathbf{K}\right)=\frac{1}{(2\pi)^\frac{n}{2}{\det{\mathbf{K}}^{\frac{1}{2}}}}{\exp\left(-\frac{\mathbf{ +y}^{\top}\mathbf{K}^{-1}\mathbf{ y}}{2}\right)} +\]

+

\[ \begin{aligned} - \gaussianDist{\dataVector}{\mathbf{0}}{\kernelMatrix}=\frac{1}{(2\pi)^\frac{\numData}{2}\color{blue}{\det{\kernelMatrix}^{\frac{1}{2}}}}\color{red}{\exp\left(-\frac{\dataVector^{\top}\kernelMatrix^{-1}\dataVector}{2}\right)} + \mathcal{N}\left(\mathbf{ +y}|\mathbf{0},\mathbf{K}\right)=\frac{1}{(2\pi)^\frac{n}{2}\color{blue}{\det{\mathbf{K}}^{\frac{1}{2}}}}\color{red}{\exp\left(-\frac{\mathbf{ +y}^{\top}\mathbf{K}^{-1}\mathbf{ y}}{2}\right)} \end{aligned} -$$

-


$$ +\]

+

\[ \begin{aligned} - \log \gaussianDist{\dataVector}{\mathbf{0}}{\kernelMatrix}=&\color{blue}{-\frac{1}{2}\log\det{\kernelMatrix}}\color{red}{-\frac{\dataVector^{\top}\kernelMatrix^{-1}\dataVector}{2}} \\ &-\frac{\numData}{2}\log2\pi + \log \mathcal{N}\left(\mathbf{ +y}|\mathbf{0},\mathbf{K}\right)=&\color{blue}{-\frac{1}{2}\log\det{\mathbf{K}}}\color{red}{-\frac{\mathbf{ +y}^{\top}\mathbf{K}^{-1}\mathbf{ y}}{2}} \\ &-\frac{n}{2}\log2\pi \end{aligned} -$$

-


$$ -\errorFunction(\parameterVector) = \color{blue}{\frac{1}{2}\log\det{\kernelMatrix}} + \color{red}{\frac{\dataVector^{\top}\kernelMatrix^{-1}\dataVector}{2}} -$$

-

Capacity Control through the Determinant

-

[edit]

-

The parameters are inside the covariance function (matrix).
$$\kernelScalar_{i, j} = \kernelScalar(\inputVals_i, \inputVals_j; \parameterVector)$$

-


$$\kernelMatrix = \rotationMatrix \eigenvalueMatrix^2 \rotationMatrix^\top$$

-
gpoptimizePlot1
+\]

+

\[ +E(\boldsymbol{ \theta}) = \color{blue}{\frac{1}{2}\log\det{\mathbf{K}}} ++ \color{red}{\frac{\mathbf{ y}^{\top}\mathbf{K}^{-1}\mathbf{ y}}{2}} +\]

+

Capacity Control +through the Determinant

+
+[edit] +
+

The parameters are inside the covariance function (matrix). +\[k_{i, j} = k(\mathbf{ x}_i, \mathbf{ x}_j; +\boldsymbol{ \theta})\]

+

\[\mathbf{K}= +\mathbf{R}\boldsymbol{ \Lambda}^2 \mathbf{R}^\top\]

- +
-$\eigenvalueMatrix$ represents distance on axes. $\rotationMatrix$ gives rotation. +\(\boldsymbol{ \Lambda}\) represents +distance on axes. \(\mathbf{R}\) gives +rotation.
    -
  • $\eigenvalueMatrix$ is diagonal, $\rotationMatrix^\top\rotationMatrix = \eye$.
  • -
  • Useful representation since $\det{\kernelMatrix} = \det{\eigenvalueMatrix^2} = \det{\eigenvalueMatrix}^2$.
  • +
  • \(\boldsymbol{ \Lambda}\) is +diagonal, \(\mathbf{R}^\top\mathbf{R}= +\mathbf{I}\).
  • +
  • Useful representation since \(\det{\mathbf{K}} = \det{\boldsymbol{ \Lambda}^2} = +\det{\boldsymbol{ \Lambda}}^2\).
-
diagrams = './gp/'
- + -
+
-

Figure: The determinant of the covariance is dependent only on the eigenvalues. It represents the ‘footprint’ of the Gaussian.

+

Figure: The determinant of the covariance is dependent only on the +eigenvalues. It represents the ‘footprint’ of the Gaussian.

+

Quadratic Data Fit

+
+[edit] +
- + -
+
-

Figure: The data fit term of the Gaussian process is a quadratic loss centered around zero. This has eliptical contours, the principal axes of which are given by the covariance matrix.

+

Figure: The data fit term of the Gaussian process is a quadratic loss centered around zero. This has elliptical contours, the principal axes of which are given by the covariance matrix.

-

Quadratic Data Fit

-

[edit]

Data Fit Term

-

[edit]

+
+[edit] +
@@ -1403,31 +3068,52 @@

Data Fit Term

- + - +
- + - +
-
+
-

Figure: Variation in the data fit term, the capacity term and the negative log likelihood for different lengthscales.

-
-
-

Exponentiated Quadratic Covariance

-

[edit]

-

The exponentiated quadratic covariance, also known as the Gaussian covariance or the RBF covariance and the squared exponential. Covariance between two points is related to the negative exponential of the squared distnace between those points. This covariance function can be derived in a few different ways: as the infinite limit of a radial basis function neural network, as diffusion in the heat equation, as a Gaussian filter in Fourier space or as the composition as a series of linear filters applied to a base function.

-

The covariance takes the following form,
$$ -\kernelScalar(\inputVector, \inputVector^\prime) = \alpha \exp\left(-\frac{\ltwoNorm{\inputVector-\inputVector^\prime}^2}{2\lengthScale^2}\right) -$$
where is the length scale or time scale of the process and α represents the overall process variance.

+

Figure: Variation in the data fit term, the capacity term and the +negative log likelihood for different lengthscales.

+
+ +

Exponentiated Quadratic +Covariance

+
+[edit] +
+

The exponentiated quadratic covariance is also known as the Gaussian covariance, the RBF covariance or the squared exponential covariance. Covariance between two points is related to the negative exponential of the squared distance between those points. This covariance function can be derived in a few different ways: as the infinite limit of a radial basis function neural network, as diffusion in the heat equation, as a Gaussian filter in Fourier space or as the composition of a series of linear filters applied to a base function.

+

The covariance takes the following form, \[ +k(\mathbf{ x}, \mathbf{ x}^\prime) = \alpha \exp\left(-\frac{\left\Vert +\mathbf{ x}-\mathbf{ x}^\prime \right\Vert_2^2}{2\ell^2}\right) +\] where \(\ell\) is the +length scale or time scale of the process and \(\alpha\) represents the overall process +variance.
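A small sketch of this covariance implemented directly in numpy (illustrative; the lecture's own code uses the mlai Kernel/eq_cov utilities):

import numpy as np

def exp_quadratic(x, x_prime, variance=1.0, lengthscale=1.0):
    # k(x, x') = alpha * exp(-||x - x'||^2 / (2 l^2))
    squared_distance = np.sum((x - x_prime)**2)
    return variance*np.exp(-0.5*squared_distance/lengthscale**2)

print(exp_quadratic(np.array([0.0]), np.array([1.0]), variance=1.0, lengthscale=2.0))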

-
$$\kernelScalar(\inputVector, \inputVector^\prime) = \alpha \exp\left(-\frac{\ltwoNorm{\inputVector-\inputVector^\prime}^2}{2\lengthScale^2}\right)$$
+\[k(\mathbf{ x}, \mathbf{ x}^\prime) = \alpha +\exp\left(-\frac{\left\Vert \mathbf{ x}-\mathbf{ x}^\prime +\right\Vert_2^2}{2\ell^2}\right)\]
@@ -1445,41 +3131,79 @@

Exponentiated Quadratic Covariance

-
+

Figure: The exponentiated quadratic covariance function.

-

GPSS: Gaussian Process Summer School

-

[edit]

+

GPSS: Gaussian Process +Summer School

+
+[edit] +
-

If you’re interested in finding out more about Gaussian processes, you can attend the Gaussian process summer school, or view the lectures and material on line. Details of the school, future events and past events can be found at the website http://gpss.cc.

-

GPy: A Gaussian Process Framework in Python

-

[edit]

-

Gaussian processes are a flexible tool for non-parametric analysis with uncertainty. The GPy software was started in Sheffield to provide a easy to use interface to GPs. One which allowed the user to focus on the modelling rather than the mathematics.

+

If you’re interested in finding out more about Gaussian processes, +you can attend the Gaussian process summer school, or view the lectures +and material on line. Details of the school, future events and past +events can be found at the website http://gpss.cc.

+
%pip install gpy
+

GPy: A Gaussian +Process Framework in Python

+
+[edit] +
+

Gaussian processes are a flexible tool for non-parametric analysis with uncertainty. The GPy software was started in Sheffield to provide an easy to use interface to GPs, one which allowed the user to focus on the modelling rather than the mathematics.


Figure: GPy is a BSD licensed software code base for implementing Gaussian process models in Python. It is designed for teaching and modelling. We welcome contributions which can be made through the Github repository https://github.com/SheffieldML/GPy

-
-
-

GPy is a BSD licensed software code base for implementing Gaussian process models in python. This allows GPs to be combined with a wide variety of software libraries.

-

The software itself is available on GitHub and the team welcomes contributions.

-

The aim for GPy is to be a probabilistic-style programming language, i.e. you specify the model rather than the algorithm. As well as a large range of covariance functions the software allows for non-Gaussian likelihoods, multivariate outputs, dimensionality reduction and approximations for larger data sets.

-

The documentation for GPy can be found here.

+

Figure: GPy is a BSD licensed software code base for implementing +Gaussian process models in Python. It is designed for teaching and +modelling. We welcome contributions which can be made through the GitHub +repository https://github.com/SheffieldML/GPy

+
+
+

GPy is a BSD licensed software code base for implementing Gaussian +process models in python. This allows GPs to be combined with a wide +variety of software libraries.

+

The software itself is available on GitHub and the team +welcomes contributions.

+

The aim for GPy is to be a probabilistic-style programming language, +i.e., you specify the model rather than the algorithm. As well as a +large range of covariance functions the software allows for non-Gaussian +likelihoods, multivariate outputs, dimensionality reduction and +approximations for larger data sets.

+

The documentation for GPy can be found here.

GPy Tutorial

-

[edit]

+
+[edit] +
@@ -1492,7 +3216,7 @@

GPy Tutorial

James Hensman - + @@ -1505,225 +3229,363 @@

GPy Tutorial

Nicolas Durrande - +
-

This GPy tutorial is based on material we share in the Gaussian process summer school for teaching these models https://gpss.cc. It contains material from various members and former members of the Sheffield machine learning group, but particular mention should be made of Nicolas Durrande and James Hensman, see http://gpss.cc/gpss17/labs/GPSS_Lab1_2017.ipynb.

-
%pip install gpy
-
urllib.request.urlretrieve('https://raw.githubusercontent.com/lawrennd/talks/gh-pages/mlai.py','mlai.py')
-
urllib.request.urlretrieve('https://raw.githubusercontent.com/lawrennd/talks/gh-pages/teaching_plots.py','teaching_plots.py')
-
urllib.request.urlretrieve('https://raw.githubusercontent.com/lawrennd/talks/gh-pages/gp_tutorial.py','gp_tutorial.py')
-
import numpy as np
-import GPy
-

To give a feel for the sofware we’ll start by creating an exponentiated quadratic covariance function,
$$ -\kernelScalar(\inputVector, \inputVector^\prime) = \alpha \exp\left(-\frac{\ltwoNorm{\inputVector - \inputVector^\prime}^2}{2\ell^2}\right), -$$
where the length scale is and the variance is α.

+

This GPy tutorial is based on material we share in the Gaussian +process summer school for teaching these models https://gpss.cc. It contains +material from various members and former members of the Sheffield +machine learning group, but particular mention should be made of Nicolas +Durrande and James +Hensman, see http://gpss.cc/gpss17/labs/GPSS_Lab1_2017.ipynb.

+
import numpy as np
+import GPy
+

To give a feel for the software we’ll start by creating an +exponentiated quadratic covariance function, \[ +k(\mathbf{ x}, \mathbf{ x}^\prime) = \alpha \exp\left(-\frac{\left\Vert +\mathbf{ x}- \mathbf{ x}^\prime \right\Vert_2^2}{2\ell^2}\right), +\] where the length scale is \(\ell\) and the variance is \(\alpha\).

To set this up in GPy we create a kernel in the following manner.

-
input_dim=1
-alpha = 1.0
-lengthscale = 2.0
-kern = GPy.kern.RBF(input_dim=input_dim, variance=alpha, lengthscale=lengthscale)
+
input_dim=1
+alpha = 1.0
+lengthscale = 2.0
+kern = GPy.kern.RBF(input_dim=input_dim, variance=alpha, lengthscale=lengthscale)

That builds a kernel object for us. The kernel can be displayed.

-
display(kern)
-

Or because it’s one dimensional, you can also plot the kernel as a function of its inputs (while the other is fixed).

+
display(kern)
+

Or because it’s one dimensional, you can also plot the kernel as a +function of its inputs (while the other is fixed).

- + -
+
-

Figure: The exponentiated quadratic covariance function as plotted by the GPy.kern.plot command.

+

Figure: The exponentiated quadratic covariance function as plotted by +the GPy.kern.plot command.

-

You can set the lengthscale of the covariance to different values and plot the result.

-
kern = GPy.kern.RBF(input_dim=input_dim)     # By default, the parameters are set to 1.
-lengthscales = np.asarray([0.2,0.5,1.,2.,4.])
+

You can set the length scale of the covariance to different values +and plot the result.

+
kern = GPy.kern.RBF(input_dim=input_dim)     # By default, the parameters are set to 1.
+lengthscales = np.asarray([0.2,0.5,1.,2.,4.])
- + -
+
-

Figure: The exponentiated quadratic covariance function plotted for different lengthscales by GPy.kern.plot command.

+

Figure: The exponentiated quadratic covariance function plotted for +different length scales by GPy.kern.plot command.

Covariance Functions in GPy

-

Many covariance functions are already implemented in GPy. Instead of rbf, try constructing and plotting the following covariance functions: exponential, Matern32, Matern52, Brownian, linear, bias, rbfcos, periodic_Matern32, etc. Some of these covariance functions, such as rbfcos, are not parametrized by a variance and a lengthscale. Furthermore, not all kernels are stationary (i.e., they can’t all be written as $\kernelScalar(\inputVector, \inputVector^\prime) = f(\inputVector-\inputVector^\prime)$, see for example the Brownian covariance function). For plotting so it may be interesting to change the value of the fixed input.

-

Combining Covariance Functions in GPy

-

In GPy you can easily combine covariance functions you have created using the sum and product operators, + and *. So, for example, if we wish to combine an exponentiated quadratic covariance with a Matern 5/2 then we can write

-
kern1 = GPy.kern.RBF(1, variance=1., lengthscale=2.)
-kern2 = GPy.kern.Matern52(1, variance=2., lengthscale=4.)
-kern = kern1 + kern2
-display(kern)
+

Many covariance functions are already implemented in GPy. Instead of +rbf, try constructing and plotting the following covariance functions: +exponential, Matern32, Matern52, +Brownian, linear, bias, +rbfcos, periodic_Matern32, etc. Some of these +covariance functions, such as rbfcos, are not parametrized +by a variance and a length scale. Further, not all kernels are +stationary (i.e., they can’t all be written as \(k(\mathbf{ x}, \mathbf{ x}^\prime) = f(\mathbf{ +x}-\mathbf{ x}^\prime)\), see for example the Brownian covariance +function). So for plotting it may be interesting to change the value of +the fixed input.
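For example, a Matern 5/2 or a Brownian covariance can be constructed in the same way as the RBF kernel above (a sketch; the parameter choices are arbitrary and plotting assumes the same kern.plot() interface used earlier):

kern_matern = GPy.kern.Matern52(input_dim=1, variance=1., lengthscale=2.)
display(kern_matern)
kern_matern.plot()

kern_brownian = GPy.kern.Brownian(input_dim=1)
display(kern_brownian)
kern_brownian.plot()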

+

Combining Covariance +Functions in GPy

+

In GPy you can easily combine covariance functions you have created +using the sum and product operators, + and *. +So, for example, if we wish to combine an exponentiated quadratic +covariance with a Matern 5/2 then we can write

+
kern1 = GPy.kern.RBF(1, variance=1., lengthscale=2.)
+kern2 = GPy.kern.Matern52(1, variance=2., lengthscale=4.)
+kern = kern1 + kern2
+display(kern)
- + -
+
-

Figure: A combination of the exponentiated quadratic covariance plus the Matern 5/2 covariance.

+

Figure: A combination of the exponentiated quadratic covariance plus +the Matern \(5/2\) covariance.

-

Or if we wanted to multiply them we can write

-
kern1 = GPy.kern.RBF(1, variance=1., lengthscale=2.)
-kern2 = GPy.kern.Matern52(1, variance=2., lengthscale=4.)
-kern = kern1 * kern2
-display(kern)
+

Or if we wanted to multiply them, we can write

+
kern1 = GPy.kern.RBF(1, variance=1., lengthscale=2.)
+kern2 = GPy.kern.Matern52(1, variance=2., lengthscale=4.)
+kern = kern1 * kern2
+display(kern)
- + -
+
-
-

Figure: A combination of the exponentiated quadratic covariance multiplied by the Matern 5/2 covariance.

+
+

Figure: A combination of the exponentiated quadratic covariance +multiplied by the Matern \(5/2\) +covariance.

-

You can learn about how to implement new kernel objects in GPy here.

+

You can learn about how to implement new +kernel objects in GPy here.

-
+
-
-

Figure: Designing the covariance function for your Gaussian process is a key place in which you introduce your understanding of the data problem. To learn more about the design of covariance functions, see this talk from Nicolas Durrande at GPSS in 2016.

-
-
-

A Gaussian Process Regression Model

-

We will now combine the Gaussian process prior with some data to form a GP regression model with GPy. We will generate data from the function
$$ -\mappingFunction( \inputScalar ) = − \cos(\pi \inputScalar ) + \sin(4\pi \inputScalar ) -$$
over the domain [0, 1], adding some noise to gives
$$ -\dataScalar(\inputScalar) = \mappingFunction(\inputScalar) + \noiseScalar, -$$
with the noise being Gaussian distributed, $\noiseScalar \sim \gaussianSamp{0}{0.01}$.

-
X = np.linspace(0.05,0.95,10)[:,np.newaxis]
-Y = -np.cos(np.pi*X) + np.sin(4*np.pi*X) + np.random.normal(loc=0.0, scale=0.1, size=(10,1))
+
+

Figure: Designing the covariance function for your Gaussian process +is a key place in which you introduce your understanding of the data +problem. To learn more about the design of covariance functions, see +this talk from Nicolas Durrande at GPSS in 2016.

+
+
+

A Gaussian Process +Regression Model

+

We will now combine the Gaussian process prior with some data to form a GP regression model with GPy. We will generate data from the function \[
f(x) = -\cos(\pi x) + \sin(4\pi x)
\] over the domain \([0, 1]\), adding some noise to give \[
y(x) = f(x) + \epsilon,
\] with the noise being Gaussian distributed, \(\epsilon\sim \mathcal{N}\left(0,0.01\right)\).

+
X = np.linspace(0.05,0.95,10)[:,np.newaxis]
+Y = -np.cos(np.pi*X) + np.sin(4*np.pi*X) + np.random.normal(loc=0.0, scale=0.1, size=(10,1))
- + -
+
-

Figure: Data from the noisy sine wave for fitting with a GPy model.

-
-
-

A GP regression model based on an exponentiated quadratic covariance function can be defined by first defining a covariance function.

-
kern = GPy.kern.RBF(input_dim=1, variance=1., lengthscale=1.)
-

And then combining it with the data to form a Gaussian process model.

-
model = GPy.models.GPRegression(X,Y,kern)
-

Just as for the covariance function object, we can find out about the model using the command display(model).

-
display(model)
-

Note that by default the model includes some observation noise with variance 1. We can see the posterior mean prediction and visualize the marginal posterior variances using model.plot().

+

Figure: Data from the noisy sine wave for fitting with a GPy +model.

+ + +

A GP regression model based on an exponentiated quadratic covariance +function can be defined by first defining a covariance function.

+
kern = GPy.kern.RBF(input_dim=1, variance=1., lengthscale=1.)
+

And then combining it with the data to form a Gaussian process +model.

+
model = GPy.models.GPRegression(X,Y,kern)
+

Just as for the covariance function object, we can find out about the +model using the command display(model).

+
display(model)
+

Note that by default the model includes some observation noise with +variance 1. We can see the posterior mean prediction and visualize the +marginal posterior variances using model.plot().

- + -
+
-

Figure: A Gaussian process fit to the noisy sine data. Here the parameters of the process and the covariance function haven’t yet been optimized.

-
-
-

You can also look directly at the predictions for the model using.

-
Xstar = np.linspace(0, 10, 100)[:, np.newaxis]
-Ystar, Vstar = model.predict(Xstar)
-

Which gives you the mean (Ystar), the variance (Vstar) at the locations given by Xstar.

-

Covariance Function Parameter Estimation

-

As we have seen during the lectures, the parameters values can be estimated by maximizing the likelihood of the observations. Since we don’t want one of the variance to become negative during the optimization, we can constrain all parameters to be positive before running the optimisation.

-
model.constrain_positive()
-

The warnings are because the parameters are already constrained by default, the software is warning us that they are being reconstrained.

-

Now we can optimize the model using the model.optimize() method. Here we switch messages on, which allows us to see the progession of the optimization.

-
model.optimize(messages=True)
-

By default the optimization is using a limited memory BFGS optimizer (Byrd, Lu, and Nocedal 1995).

-

Once again we can display the model, now to see how the parameters have changed.

-
display(model)
-

The lengthscale is much smaller, as well as the noise level. The variance of the exponentiated quadratic has also reduced.

+

Figure: A Gaussian process fit to the noisy sine data. Here the +parameters of the process and the covariance function haven’t yet been +optimized.

+ + +

You can also look directly at the predictions for the model +using.

+
Xstar = np.linspace(0, 10, 100)[:, np.newaxis]
+Ystar, Vstar = model.predict(Xstar)
+

Which gives you the mean (Ystar), the variance +(Vstar) at the locations given by Xstar.

+

Covariance Function +Parameter Estimation

+

As we have seen during the lectures, the parameter values can be estimated by maximizing the likelihood of the observations. Since we don’t want any of the variances to become negative during the optimization, we can constrain all parameters to be positive before running the optimization.

+
model.constrain_positive()
+

The warnings are because the parameters are already constrained by +default, the software is warning us that they are being +reconstrained.

+

Now we can optimize the model using the model.optimize() +method. Here we switch messages on, which allows us to see the +progression of the optimization.

+
model.optimize(messages=True)
+

By default, the optimization is using a limited memory BFGS optimizer +(Byrd et al., +1995).

+

Once again, we can display the model, now to see how the parameters +have changed.

+
display(model)
+

The length scale is much smaller, as is the noise level. The variance of the exponentiated quadratic has also reduced.

- + -
+
-

Figure: A Gaussian process fit to the noisy sine data with parameters optimized.

+

Figure: A Gaussian process fit to the noisy sine data with parameters +optimized.

Review

Other Software

-

[edit]

-

GPy has inspired other software solutions, first of all GPflow, which uses Tensor Flow’s automatic differentiation engine to allow rapid prototyping of new covariance functions and algorithms. More recently, GPyTorch uses PyTorch for the same purpose.

-

The Probabilistic programming language pyro also has GP support.

+
+[edit] +
+

GPy has inspired other software solutions, first of all GPflow, which uses Tensor +Flow’s automatic differentiation engine to allow rapid prototyping of +new covariance functions and algorithms. More recently, GPyTorch uses +PyTorch for the same purpose.

+

The Probabilistic programming language pyro also has GP support.

Further Reading

    -
  • Chapter 2 of Neal (1994)

  • -
  • Rest of Neal (1994)

  • -
  • All of MacKay (1992)

  • +
  • Chapter 2 of Neal (1994)

  • +
  • Rest of Neal +(1994)

  • +
  • All of MacKay (1992)

Thanks!

-

For more information on these subjects and more you might want to check the following resources.

+

For more information on these subjects and more you might want to +check the following resources.

References

-
-
-

Andrade-Pacheco, Ricardo, Martin Mubangizi, John Quinn, and Neil D. Lawrence. 2014. “Consistent Mapping of Government Malaria Records Across a Changing Territory Delimitation.” Malaria Journal 13 (Suppl 1). https://doi.org/10.1186/1475-2875-13-S1-P5.

-
-
-

Byrd, Richard H., Peihuang Lu, and Jorge Nocedal. 1995. “A Limited Memory Algorithm for Bound Constrained Optimization.” SIAM Journal on Scientific and Statistical Computing 16 (5): 1190–1208.

-
-
-

Cho, Youngmin, and Lawrence K. Saul. 2009. “Kernel Methods for Deep Learning.” In Advances in Neural Information Processing Systems 22, edited by Y. Bengio, D. Schuurmans, J. D. Lafferty, C. K. I. Williams, and A. Culotta, 342–50. Curran Associates, Inc. http://papers.nips.cc/paper/3628-kernel-methods-for-deep-learning.pdf.

-
-
-

Gething, Peter W., Abdisalan M. Noor, Priscilla W. Gikandi, Esther A. A. Ogara, Simon I. Hay, Mark S. Nixon, Robert W. Snow, and Peter M. Atkinson. 2006. “Improving Imperfect Data from Health Management Information Systems in Africa Using Space–Time Geostatistics.” PLoS Medicine 3 (6). https://doi.org/10.1371/journal.pmed.0030271.

-
-
-

Ioffe, Sergey, and Christian Szegedy. 2015. “Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift.” In Proceedings of the 32nd International Conference on Machine Learning, edited by Francis Bach and David Blei, 37:448–56. Proceedings of Machine Learning Research. Lille, France: PMLR. http://proceedings.mlr.press/v37/ioffe15.html.

-
-
-

Laplace, Pierre Simon. 1814. Essai Philosophique Sur Les Probabilités. 2nd ed. Paris: Courcier.

-
-
-

MacKay, David J. C. 1992. “Bayesian Methods for Adaptive Models.” PhD thesis, California Institute of Technology.

-
-
-

Mubangizi, Martin, Ricardo Andrade-Pacheco, Michael Thomas Smith, John Quinn, and Neil D. Lawrence. 2014. “Malaria Surveillance with Multiple Data Sources Using Gaussian Process Models.” In 1st International Conference on the Use of Mobile ICT in Africa.

-
-
-

Neal, Radford M. 1994. “Bayesian Learning for Neural Networks.” PhD thesis, Dept. of Computer Science, University of Toronto.

-
-
-

Rasmussen, Carl Edward, and Christopher K. I. Williams. 2006. Gaussian Processes for Machine Learning. Cambridge, MA: mit.

-
-
-

Rogers, Simon, and Mark Girolami. 2011. A First Course in Machine Learning. CRC Press.

-
-
-

Tipping, Michael E., and Christopher M. Bishop. 1999. “Probabilistic Principal Component Analysis.” Journal of the Royal Statistical Society, B 6 (3): 611–22. https://doi.org/doi:10.1111/1467-9868.00196.

+
+
+Andrade-Pacheco, R., Mubangizi, M., Quinn, J., Lawrence, N.D., 2014. +Consistent mapping of government malaria records across a changing +territory delimitation. Malaria Journal 13. https://doi.org/10.1186/1475-2875-13-S1-P5 +
+
+Byrd, R.H., Lu, P., Nocedal, J., 1995. A limited memory algorithm for +bound constrained optimization. SIAM Journal on Scientific and +Statistical Computing 16, 1190–1208. +
+
+Cho, Y., Saul, L.K., 2009. Kernel +methods for deep learning, in: Bengio, Y., Schuurmans, D., Lafferty, +J.D., Williams, C.K.I., Culotta, A. (Eds.), Advances in Neural +Information Processing Systems 22. Curran Associates, Inc., pp. 342–350. +
+
+Gething, P.W., Noor, A.M., Gikandi, P.W., Ogara, E.A.A., Hay, S.I., +Nixon, M.S., Snow, R.W., Atkinson, P.M., 2006. Improving imperfect data +from health management information systems in Africa using +space–time geostatistics. PLoS Medicine 3. https://doi.org/10.1371/journal.pmed.0030271 +
+
+Ioffe, S., Szegedy, C., 2015. Batch +normalization: Accelerating deep network training by reducing internal +covariate shift, in: Bach, F., Blei, D. (Eds.), Proceedings of the +32nd International Conference on Machine Learning, Proceedings of +Machine Learning Research. PMLR, Lille, France, pp. 448–456. +
+
+Laplace, P.S., 1814. Essai philosophique sur les probabilités, 2nd ed. +Courcier, Paris. +
+
+MacKay, D.J.C., 1992. Bayesian methods for adaptive models (PhD thesis). +California Institute of Technology. +
+
+Mubangizi, M., Andrade-Pacheco, R., Smith, M.T., Quinn, J., Lawrence, +N.D., 2014. Malaria surveillance with multiple data sources using +Gaussian process models, in: 1st International Conference +on the Use of Mobile ICT in Africa. +
+
+Neal, R.M., 1994. Bayesian learning for neural networks (PhD thesis). +Dept. of Computer Science, University of Toronto. +
+
+Rasmussen, C.E., Williams, C.K.I., 2006. Gaussian processes for machine +learning. mit, Cambridge, MA. +
+
+Rogers, S., Girolami, M., 2011. A first course in machine learning. CRC +Press. +
+
+Tipping, M.E., Bishop, C.M., 1999. Probabilistic principal component +analysis. Journal of the Royal Statistical Society, B 6, 611–622. https://doi.org/doi:10.1111/1467-9868.00196
diff --git a/_notebooks/01-what-is-machine-learning.ipynb b/_notebooks/01-what-is-machine-learning.ipynb index b17ff89..99396c6 100644 --- a/_notebooks/01-what-is-machine-learning.ipynb +++ b/_notebooks/01-what-is-machine-learning.ipynb @@ -4,13 +4,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "What is Machine Learning?\n", - "=========================\n", + "# What is Machine Learning?\n", "\n", "### [Neil D. Lawrence](http://inverseprobability.com), Amazon Cambridge\n", "\n", - "and University of Sheffield \\#\\#\\# 2019-06-03" - ] + "and University of Sheffield\n", + "\n", + "### 2019-06-03" + ], + "id": "f21d883d-ceda-417f-8556-413a1032834d" }, { "cell_type": "markdown", @@ -21,309 +23,24 @@ "prediction function and the objective function. We don’t so much focus\n", "on the derivation of particular algorithms, but more the general\n", "principles involved to give an idea of the machine learning *landscape*." - ] + ], + "id": "689b2b4c-695b-4145-b2e5-d0ca1953fb7c" }, { "cell_type": "markdown", "metadata": {}, "source": [ "$$\n", - "\\newcommand{\\tk}[1]{}\n", - "\\newcommand{\\Amatrix}{\\mathbf{A}}\n", - "\\newcommand{\\KL}[2]{\\text{KL}\\left( #1\\,\\|\\,#2 \\right)}\n", - "\\newcommand{\\Kaast}{\\kernelMatrix_{\\mathbf{ \\ast}\\mathbf{ \\ast}}}\n", - "\\newcommand{\\Kastu}{\\kernelMatrix_{\\mathbf{ \\ast} \\inducingVector}}\n", - "\\newcommand{\\Kff}{\\kernelMatrix_{\\mappingFunctionVector \\mappingFunctionVector}}\n", - "\\newcommand{\\Kfu}{\\kernelMatrix_{\\mappingFunctionVector \\inducingVector}}\n", - "\\newcommand{\\Kuast}{\\kernelMatrix_{\\inducingVector \\bf\\ast}}\n", - "\\newcommand{\\Kuf}{\\kernelMatrix_{\\inducingVector \\mappingFunctionVector}}\n", - "\\newcommand{\\Kuu}{\\kernelMatrix_{\\inducingVector \\inducingVector}}\n", - "\\newcommand{\\Kuui}{\\Kuu^{-1}}\n", - "\\newcommand{\\Qaast}{\\mathbf{Q}_{\\bf \\ast \\ast}}\n", - "\\newcommand{\\Qastf}{\\mathbf{Q}_{\\ast \\mappingFunction}}\n", - "\\newcommand{\\Qfast}{\\mathbf{Q}_{\\mappingFunctionVector \\bf \\ast}}\n", - "\\newcommand{\\Qff}{\\mathbf{Q}_{\\mappingFunctionVector \\mappingFunctionVector}}\n", - "\\newcommand{\\aMatrix}{\\mathbf{A}}\n", - "\\newcommand{\\aScalar}{a}\n", - "\\newcommand{\\aVector}{\\mathbf{a}}\n", - "\\newcommand{\\acceleration}{a}\n", - "\\newcommand{\\bMatrix}{\\mathbf{B}}\n", - "\\newcommand{\\bScalar}{b}\n", - "\\newcommand{\\bVector}{\\mathbf{b}}\n", - "\\newcommand{\\basisFunc}{\\phi}\n", - "\\newcommand{\\basisFuncVector}{\\boldsymbol{ \\basisFunc}}\n", - "\\newcommand{\\basisFunction}{\\phi}\n", - "\\newcommand{\\basisLocation}{\\mu}\n", - "\\newcommand{\\basisMatrix}{\\boldsymbol{ \\Phi}}\n", - "\\newcommand{\\basisScalar}{\\basisFunction}\n", - "\\newcommand{\\basisVector}{\\boldsymbol{ \\basisFunction}}\n", - "\\newcommand{\\activationFunction}{\\phi}\n", - "\\newcommand{\\activationMatrix}{\\boldsymbol{ \\Phi}}\n", - "\\newcommand{\\activationScalar}{\\basisFunction}\n", - "\\newcommand{\\activationVector}{\\boldsymbol{ \\basisFunction}}\n", - "\\newcommand{\\bigO}{\\mathcal{O}}\n", - "\\newcommand{\\binomProb}{\\pi}\n", - "\\newcommand{\\cMatrix}{\\mathbf{C}}\n", - "\\newcommand{\\cbasisMatrix}{\\hat{\\boldsymbol{ \\Phi}}}\n", - "\\newcommand{\\cdataMatrix}{\\hat{\\dataMatrix}}\n", - "\\newcommand{\\cdataScalar}{\\hat{\\dataScalar}}\n", - "\\newcommand{\\cdataVector}{\\hat{\\dataVector}}\n", - "\\newcommand{\\centeredKernelMatrix}{\\mathbf{ \\MakeUppercase{\\centeredKernelScalar}}}\n", - "\\newcommand{\\centeredKernelScalar}{b}\n", - 
"\\newcommand{\\centeredKernelVector}{\\centeredKernelScalar}\n", - "\\newcommand{\\centeringMatrix}{\\mathbf{H}}\n", - "\\newcommand{\\chiSquaredDist}[2]{\\chi_{#1}^{2}\\left(#2\\right)}\n", - "\\newcommand{\\chiSquaredSamp}[1]{\\chi_{#1}^{2}}\n", - "\\newcommand{\\conditionalCovariance}{\\boldsymbol{ \\Sigma}}\n", - "\\newcommand{\\coregionalizationMatrix}{\\mathbf{B}}\n", - "\\newcommand{\\coregionalizationScalar}{b}\n", - "\\newcommand{\\coregionalizationVector}{\\mathbf{ \\coregionalizationScalar}}\n", - "\\newcommand{\\covDist}[2]{\\text{cov}_{#2}\\left(#1\\right)}\n", - "\\newcommand{\\covSamp}[1]{\\text{cov}\\left(#1\\right)}\n", - "\\newcommand{\\covarianceScalar}{c}\n", - "\\newcommand{\\covarianceVector}{\\mathbf{ \\covarianceScalar}}\n", - "\\newcommand{\\covarianceMatrix}{\\mathbf{C}}\n", - "\\newcommand{\\covarianceMatrixTwo}{\\boldsymbol{ \\Sigma}}\n", - "\\newcommand{\\croupierScalar}{s}\n", - "\\newcommand{\\croupierVector}{\\mathbf{ \\croupierScalar}}\n", - "\\newcommand{\\croupierMatrix}{\\mathbf{ \\MakeUppercase{\\croupierScalar}}}\n", - "\\newcommand{\\dataDim}{p}\n", - "\\newcommand{\\dataIndex}{i}\n", - "\\newcommand{\\dataIndexTwo}{j}\n", - "\\newcommand{\\dataMatrix}{\\mathbf{Y}}\n", - "\\newcommand{\\dataScalar}{y}\n", - "\\newcommand{\\dataSet}{\\mathcal{D}}\n", - "\\newcommand{\\dataStd}{\\sigma}\n", - "\\newcommand{\\dataVector}{\\mathbf{ \\dataScalar}}\n", - "\\newcommand{\\decayRate}{d}\n", - "\\newcommand{\\degreeMatrix}{\\mathbf{ \\MakeUppercase{\\degreeScalar}}}\n", - "\\newcommand{\\degreeScalar}{d}\n", - "\\newcommand{\\degreeVector}{\\mathbf{ \\degreeScalar}}\n", - "\\newcommand{\\diag}[1]{\\text{diag}\\left(#1\\right)}\n", - "\\newcommand{\\diagonalMatrix}{\\mathbf{D}}\n", - "\\newcommand{\\diff}[2]{\\frac{\\text{d}#1}{\\text{d}#2}}\n", - "\\newcommand{\\diffTwo}[2]{\\frac{\\text{d}^2#1}{\\text{d}#2^2}}\n", - "\\newcommand{\\displacement}{x}\n", - "\\newcommand{\\displacementVector}{\\textbf{\\displacement}}\n", - "\\newcommand{\\distanceMatrix}{\\mathbf{ \\MakeUppercase{\\distanceScalar}}}\n", - "\\newcommand{\\distanceScalar}{d}\n", - "\\newcommand{\\distanceVector}{\\mathbf{ \\distanceScalar}}\n", - "\\newcommand{\\eigenvaltwo}{\\ell}\n", - "\\newcommand{\\eigenvaltwoMatrix}{\\mathbf{L}}\n", - "\\newcommand{\\eigenvaltwoVector}{\\mathbf{l}}\n", - "\\newcommand{\\eigenvalue}{\\lambda}\n", - "\\newcommand{\\eigenvalueMatrix}{\\boldsymbol{ \\Lambda}}\n", - "\\newcommand{\\eigenvalueVector}{\\boldsymbol{ \\lambda}}\n", - "\\newcommand{\\eigenvector}{\\mathbf{ \\eigenvectorScalar}}\n", - "\\newcommand{\\eigenvectorMatrix}{\\mathbf{U}}\n", - "\\newcommand{\\eigenvectorScalar}{u}\n", - "\\newcommand{\\eigenvectwo}{\\mathbf{v}}\n", - "\\newcommand{\\eigenvectwoMatrix}{\\mathbf{V}}\n", - "\\newcommand{\\eigenvectwoScalar}{v}\n", - "\\newcommand{\\entropy}[1]{\\mathcal{H}\\left(#1\\right)}\n", - "\\newcommand{\\errorFunction}{E}\n", - "\\newcommand{\\expDist}[2]{\\left<#1\\right>_{#2}}\n", - "\\newcommand{\\expSamp}[1]{\\left<#1\\right>}\n", - "\\newcommand{\\expectation}[1]{\\left\\langle #1 \\right\\rangle }\n", - "\\newcommand{\\expectationDist}[2]{\\left\\langle #1 \\right\\rangle _{#2}}\n", - "\\newcommand{\\expectedDistanceMatrix}{\\mathcal{D}}\n", - "\\newcommand{\\eye}{\\mathbf{I}}\n", - "\\newcommand{\\fantasyDim}{r}\n", - "\\newcommand{\\fantasyMatrix}{\\mathbf{ \\MakeUppercase{\\fantasyScalar}}}\n", - "\\newcommand{\\fantasyScalar}{z}\n", - "\\newcommand{\\fantasyVector}{\\mathbf{ \\fantasyScalar}}\n", - "\\newcommand{\\featureStd}{\\varsigma}\n", 
- "\\newcommand{\\gammaCdf}[3]{\\mathcal{GAMMA CDF}\\left(#1|#2,#3\\right)}\n", - "\\newcommand{\\gammaDist}[3]{\\mathcal{G}\\left(#1|#2,#3\\right)}\n", - "\\newcommand{\\gammaSamp}[2]{\\mathcal{G}\\left(#1,#2\\right)}\n", - "\\newcommand{\\gaussianDist}[3]{\\mathcal{N}\\left(#1|#2,#3\\right)}\n", - "\\newcommand{\\gaussianSamp}[2]{\\mathcal{N}\\left(#1,#2\\right)}\n", - "\\newcommand{\\given}{|}\n", - "\\newcommand{\\half}{\\frac{1}{2}}\n", - "\\newcommand{\\heaviside}{H}\n", - "\\newcommand{\\hiddenMatrix}{\\mathbf{ \\MakeUppercase{\\hiddenScalar}}}\n", - "\\newcommand{\\hiddenScalar}{h}\n", - "\\newcommand{\\hiddenVector}{\\mathbf{ \\hiddenScalar}}\n", - "\\newcommand{\\identityMatrix}{\\eye}\n", - "\\newcommand{\\inducingInputScalar}{z}\n", - "\\newcommand{\\inducingInputVector}{\\mathbf{ \\inducingInputScalar}}\n", - "\\newcommand{\\inducingInputMatrix}{\\mathbf{Z}}\n", - "\\newcommand{\\inducingScalar}{u}\n", - "\\newcommand{\\inducingVector}{\\mathbf{ \\inducingScalar}}\n", - "\\newcommand{\\inducingMatrix}{\\mathbf{U}}\n", - "\\newcommand{\\inlineDiff}[2]{\\text{d}#1/\\text{d}#2}\n", - "\\newcommand{\\inputDim}{q}\n", - "\\newcommand{\\inputMatrix}{\\mathbf{X}}\n", - "\\newcommand{\\inputScalar}{x}\n", - "\\newcommand{\\inputSpace}{\\mathcal{X}}\n", - "\\newcommand{\\inputVals}{\\inputVector}\n", - "\\newcommand{\\inputVector}{\\mathbf{ \\inputScalar}}\n", - "\\newcommand{\\iterNum}{k}\n", - "\\newcommand{\\kernel}{\\kernelScalar}\n", - "\\newcommand{\\kernelMatrix}{\\mathbf{K}}\n", - "\\newcommand{\\kernelScalar}{k}\n", - "\\newcommand{\\kernelVector}{\\mathbf{ \\kernelScalar}}\n", - "\\newcommand{\\kff}{\\kernelScalar_{\\mappingFunction \\mappingFunction}}\n", - "\\newcommand{\\kfu}{\\kernelVector_{\\mappingFunction \\inducingScalar}}\n", - "\\newcommand{\\kuf}{\\kernelVector_{\\inducingScalar \\mappingFunction}}\n", - "\\newcommand{\\kuu}{\\kernelVector_{\\inducingScalar \\inducingScalar}}\n", - "\\newcommand{\\lagrangeMultiplier}{\\lambda}\n", - "\\newcommand{\\lagrangeMultiplierMatrix}{\\boldsymbol{ \\Lambda}}\n", - "\\newcommand{\\lagrangian}{L}\n", - "\\newcommand{\\laplacianFactor}{\\mathbf{ \\MakeUppercase{\\laplacianFactorScalar}}}\n", - "\\newcommand{\\laplacianFactorScalar}{m}\n", - "\\newcommand{\\laplacianFactorVector}{\\mathbf{ \\laplacianFactorScalar}}\n", - "\\newcommand{\\laplacianMatrix}{\\mathbf{L}}\n", - "\\newcommand{\\laplacianScalar}{\\ell}\n", - "\\newcommand{\\laplacianVector}{\\mathbf{ \\ell}}\n", - "\\newcommand{\\latentDim}{q}\n", - "\\newcommand{\\latentDistanceMatrix}{\\boldsymbol{ \\Delta}}\n", - "\\newcommand{\\latentDistanceScalar}{\\delta}\n", - "\\newcommand{\\latentDistanceVector}{\\boldsymbol{ \\delta}}\n", - "\\newcommand{\\latentForce}{f}\n", - "\\newcommand{\\latentFunction}{u}\n", - "\\newcommand{\\latentFunctionVector}{\\mathbf{ \\latentFunction}}\n", - "\\newcommand{\\latentFunctionMatrix}{\\mathbf{ \\MakeUppercase{\\latentFunction}}}\n", - "\\newcommand{\\latentIndex}{j}\n", - "\\newcommand{\\latentScalar}{z}\n", - "\\newcommand{\\latentVector}{\\mathbf{ \\latentScalar}}\n", - "\\newcommand{\\latentMatrix}{\\mathbf{Z}}\n", - "\\newcommand{\\learnRate}{\\eta}\n", - "\\newcommand{\\lengthScale}{\\ell}\n", - "\\newcommand{\\rbfWidth}{\\ell}\n", - "\\newcommand{\\likelihoodBound}{\\mathcal{L}}\n", - "\\newcommand{\\likelihoodFunction}{L}\n", - "\\newcommand{\\locationScalar}{\\mu}\n", - "\\newcommand{\\locationVector}{\\boldsymbol{ \\locationScalar}}\n", - "\\newcommand{\\locationMatrix}{\\mathbf{M}}\n", - 
"\\newcommand{\\variance}[1]{\\text{var}\\left( #1 \\right)}\n", - "\\newcommand{\\mappingFunction}{f}\n", - "\\newcommand{\\mappingFunctionMatrix}{\\mathbf{F}}\n", - "\\newcommand{\\mappingFunctionTwo}{g}\n", - "\\newcommand{\\mappingFunctionTwoMatrix}{\\mathbf{G}}\n", - "\\newcommand{\\mappingFunctionTwoVector}{\\mathbf{ \\mappingFunctionTwo}}\n", - "\\newcommand{\\mappingFunctionVector}{\\mathbf{ \\mappingFunction}}\n", - "\\newcommand{\\scaleScalar}{s}\n", - "\\newcommand{\\mappingScalar}{w}\n", - "\\newcommand{\\mappingVector}{\\mathbf{ \\mappingScalar}}\n", - "\\newcommand{\\mappingMatrix}{\\mathbf{W}}\n", - "\\newcommand{\\mappingScalarTwo}{v}\n", - "\\newcommand{\\mappingVectorTwo}{\\mathbf{ \\mappingScalarTwo}}\n", - "\\newcommand{\\mappingMatrixTwo}{\\mathbf{V}}\n", - "\\newcommand{\\maxIters}{K}\n", - "\\newcommand{\\meanMatrix}{\\mathbf{M}}\n", - "\\newcommand{\\meanScalar}{\\mu}\n", - "\\newcommand{\\meanTwoMatrix}{\\mathbf{M}}\n", - "\\newcommand{\\meanTwoScalar}{m}\n", - "\\newcommand{\\meanTwoVector}{\\mathbf{ \\meanTwoScalar}}\n", - "\\newcommand{\\meanVector}{\\boldsymbol{ \\meanScalar}}\n", - "\\newcommand{\\mrnaConcentration}{m}\n", - "\\newcommand{\\naturalFrequency}{\\omega}\n", - "\\newcommand{\\neighborhood}[1]{\\mathcal{N}\\left( #1 \\right)}\n", - "\\newcommand{\\neilurl}{http://inverseprobability.com/}\n", - "\\newcommand{\\noiseMatrix}{\\boldsymbol{ E}}\n", - "\\newcommand{\\noiseScalar}{\\epsilon}\n", - "\\newcommand{\\noiseVector}{\\boldsymbol{ \\epsilon}}\n", - "\\newcommand{\\norm}[1]{\\left\\Vert #1 \\right\\Vert}\n", - "\\newcommand{\\normalizedLaplacianMatrix}{\\hat{\\mathbf{L}}}\n", - "\\newcommand{\\normalizedLaplacianScalar}{\\hat{\\ell}}\n", - "\\newcommand{\\normalizedLaplacianVector}{\\hat{\\mathbf{ \\ell}}}\n", - "\\newcommand{\\numActive}{m}\n", - "\\newcommand{\\numBasisFunc}{m}\n", - "\\newcommand{\\numComponents}{m}\n", - "\\newcommand{\\numComps}{K}\n", - "\\newcommand{\\numData}{n}\n", - "\\newcommand{\\numFeatures}{K}\n", - "\\newcommand{\\numHidden}{h}\n", - "\\newcommand{\\numInducing}{m}\n", - "\\newcommand{\\numLayers}{\\ell}\n", - "\\newcommand{\\numNeighbors}{K}\n", - "\\newcommand{\\numSequences}{s}\n", - "\\newcommand{\\numSuccess}{s}\n", - "\\newcommand{\\numTasks}{m}\n", - "\\newcommand{\\numTime}{T}\n", - "\\newcommand{\\numTrials}{S}\n", - "\\newcommand{\\outputIndex}{j}\n", - "\\newcommand{\\paramVector}{\\boldsymbol{ \\theta}}\n", - "\\newcommand{\\parameterMatrix}{\\boldsymbol{ \\Theta}}\n", - "\\newcommand{\\parameterScalar}{\\theta}\n", - "\\newcommand{\\parameterVector}{\\boldsymbol{ \\parameterScalar}}\n", - "\\newcommand{\\partDiff}[2]{\\frac{\\partial#1}{\\partial#2}}\n", - "\\newcommand{\\precisionScalar}{j}\n", - "\\newcommand{\\precisionVector}{\\mathbf{ \\precisionScalar}}\n", - "\\newcommand{\\precisionMatrix}{\\mathbf{J}}\n", - "\\newcommand{\\pseudotargetScalar}{\\widetilde{y}}\n", - "\\newcommand{\\pseudotargetVector}{\\mathbf{ \\pseudotargetScalar}}\n", - "\\newcommand{\\pseudotargetMatrix}{\\mathbf{ \\widetilde{Y}}}\n", - "\\newcommand{\\rank}[1]{\\text{rank}\\left(#1\\right)}\n", - "\\newcommand{\\rayleighDist}[2]{\\mathcal{R}\\left(#1|#2\\right)}\n", - "\\newcommand{\\rayleighSamp}[1]{\\mathcal{R}\\left(#1\\right)}\n", - "\\newcommand{\\responsibility}{r}\n", - "\\newcommand{\\rotationScalar}{r}\n", - "\\newcommand{\\rotationVector}{\\mathbf{ \\rotationScalar}}\n", - "\\newcommand{\\rotationMatrix}{\\mathbf{R}}\n", - "\\newcommand{\\sampleCovScalar}{s}\n", - "\\newcommand{\\sampleCovVector}{\\mathbf{ 
\\sampleCovScalar}}\n", - "\\newcommand{\\sampleCovMatrix}{\\mathbf{s}}\n", - "\\newcommand{\\scalarProduct}[2]{\\left\\langle{#1},{#2}\\right\\rangle}\n", - "\\newcommand{\\sign}[1]{\\text{sign}\\left(#1\\right)}\n", - "\\newcommand{\\sigmoid}[1]{\\sigma\\left(#1\\right)}\n", - "\\newcommand{\\singularvalue}{\\ell}\n", - "\\newcommand{\\singularvalueMatrix}{\\mathbf{L}}\n", - "\\newcommand{\\singularvalueVector}{\\mathbf{l}}\n", - "\\newcommand{\\sorth}{\\mathbf{u}}\n", - "\\newcommand{\\spar}{\\lambda}\n", - "\\newcommand{\\trace}[1]{\\text{tr}\\left(#1\\right)}\n", - "\\newcommand{\\BasalRate}{B}\n", - "\\newcommand{\\DampingCoefficient}{C}\n", - "\\newcommand{\\DecayRate}{D}\n", - "\\newcommand{\\Displacement}{X}\n", - "\\newcommand{\\LatentForce}{F}\n", - "\\newcommand{\\Mass}{M}\n", - "\\newcommand{\\Sensitivity}{S}\n", - "\\newcommand{\\basalRate}{b}\n", - "\\newcommand{\\dampingCoefficient}{c}\n", - "\\newcommand{\\mass}{m}\n", - "\\newcommand{\\sensitivity}{s}\n", - "\\newcommand{\\springScalar}{\\kappa}\n", - "\\newcommand{\\springVector}{\\boldsymbol{ \\kappa}}\n", - "\\newcommand{\\springMatrix}{\\boldsymbol{ \\mathcal{K}}}\n", - "\\newcommand{\\tfConcentration}{p}\n", - "\\newcommand{\\tfDecayRate}{\\delta}\n", - "\\newcommand{\\tfMrnaConcentration}{f}\n", - "\\newcommand{\\tfVector}{\\mathbf{ \\tfConcentration}}\n", - "\\newcommand{\\velocity}{v}\n", - "\\newcommand{\\sufficientStatsScalar}{g}\n", - "\\newcommand{\\sufficientStatsVector}{\\mathbf{ \\sufficientStatsScalar}}\n", - "\\newcommand{\\sufficientStatsMatrix}{\\mathbf{G}}\n", - "\\newcommand{\\switchScalar}{s}\n", - "\\newcommand{\\switchVector}{\\mathbf{ \\switchScalar}}\n", - "\\newcommand{\\switchMatrix}{\\mathbf{S}}\n", - "\\newcommand{\\tr}[1]{\\text{tr}\\left(#1\\right)}\n", - "\\newcommand{\\loneNorm}[1]{\\left\\Vert #1 \\right\\Vert_1}\n", - "\\newcommand{\\ltwoNorm}[1]{\\left\\Vert #1 \\right\\Vert_2}\n", - "\\newcommand{\\onenorm}[1]{\\left\\vert#1\\right\\vert_1}\n", - "\\newcommand{\\twonorm}[1]{\\left\\Vert #1 \\right\\Vert}\n", - "\\newcommand{\\vScalar}{v}\n", - "\\newcommand{\\vVector}{\\mathbf{v}}\n", - "\\newcommand{\\vMatrix}{\\mathbf{V}}\n", - "\\newcommand{\\varianceDist}[2]{\\text{var}_{#2}\\left( #1 \\right)}\n", - "\\newcommand{\\vecb}[1]{\\left(#1\\right):}\n", - "\\newcommand{\\weightScalar}{w}\n", - "\\newcommand{\\weightVector}{\\mathbf{ \\weightScalar}}\n", - "\\newcommand{\\weightMatrix}{\\mathbf{W}}\n", - "\\newcommand{\\weightedAdjacencyMatrix}{\\mathbf{A}}\n", - "\\newcommand{\\weightedAdjacencyScalar}{a}\n", - "\\newcommand{\\weightedAdjacencyVector}{\\mathbf{ \\weightedAdjacencyScalar}}\n", - "\\newcommand{\\onesVector}{\\mathbf{1}}\n", - "\\newcommand{\\zerosVector}{\\mathbf{0}}\n", "$$" - ] + ], + "id": "7c9ef05b-6c24-4761-a09d-77d234bbb531" }, { "cell_type": "markdown", "metadata": {}, "source": [ + "::: {.cell .markdown}\n", + "\n", "\n", "\n", "\n", @@ -333,39 +50,45 @@ "" - ] + ], + "id": "ebdd54d4-aa90-4cd5-bdea-568cdbe8da61" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Introduction\n", - "============" - ] + "# Introduction" + ], + "id": "31e9141c-346b-496e-ab0d-0d18dc494fe7" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Data Science Africa\n", - "-------------------\n", + "## Data Science Africa\n", + "\n", + "\\[edit\\]\n", + "\n", + "\n", "\n", - "\n", + "Figure: Data Science Africa is a\n", + "ground up initiative for capacity building around data science, machine\n", + "learning and artificial intelligence on the African continent.\n", 
"\n", - "Figure: Data Science Africa\n", - "http://datascienceafrica.org\n", - "is a ground up initiative for capacity building around data science,\n", - "machine learning and artificial intelligence on the African\n", - "continent.\n", + "\n", "\n", + "Figure: Data Science Africa meetings held up to October 2021.\n", "Data Science Africa is a bottom up initiative for capacity building in\n", "data science, machine learning and artificial intelligence on the\n", "African continent.\n", "\n", - "As of 2019 there have been five workshops and five schools, located in\n", - "Nyeri, Kenya (twice); Kampala, Uganda; Arusha, Tanzania; Abuja, Nigeria;\n", - "Addis Ababa, Ethiopia and Accra, Ghana. The next event is scheduled for\n", - "June 2020 in Kampala, Uganda.\n", + "As of May 2023 there have been eleven workshops and schools, located in\n", + "seven different countries: Nyeri, Kenya (twice); Kampala, Uganda;\n", + "Arusha, Tanzania; Abuja, Nigeria; Addis Ababa, Ethiopia; Accra, Ghana;\n", + "Kampala, Uganda and Kimberley, South Africa (virtual), and in Kigali,\n", + "Rwanda.\n", "\n", "The main notion is *end-to-end* data science. For example, going from\n", "data collection in the farmer’s field to decision making in the Ministry\n", @@ -383,7 +106,7 @@ "Kenya. The organising board of the meeting is entirely made up of\n", "scientists and academics based on the African continent.\n", "\n", - "\n", + "\n", "\n", "Figure: The lack of existing physical infrastructure on the African\n", "continent makes it a particularly interesting environment for deploying\n", @@ -394,14 +117,18 @@ "\n", "Guardian article on [Data Science\n", "Africa](https://www.theguardian.com/media-network/2015/aug/25/africa-benefit-data-science-information)" - ] + ], + "id": "8c2a9b93-8793-4af7-b531-863aa6dc95d7" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Example: Prediction of Malaria Incidence in Uganda\n", - "--------------------------------------------------\n", + "## Example: Prediction of Malaria Incidence in Uganda\n", + "\n", + "\\[edit\\]\n", "\n", "\n", "\n", @@ -421,7 +148,7 @@ "\n", "\n", "\n", - "\n", + "\n", "\n", "\n", "\n", @@ -438,11 +165,11 @@ "\n", "\n", "\n", - "Ricardo Andrade Pacheco\n", + "Ricardo Andrade Pacecho\n", "\n", "\n", "\n", - "\n", + "\n", "\n", "\n", "\n", @@ -463,7 +190,7 @@ "\n", "\n", "\n", - "\n", + "\n", "\n", "\n", "\n", @@ -478,7 +205,12 @@ "collaboration with John Quinn and Martin Mubangizi (Andrade-Pacheco et\n", "al., 2014; Mubangizi et al., 2014). John and Martin were initally from\n", "the AI-DEV group from the University of Makerere in Kampala and more\n", - "latterly they were based at UN Global Pulse in Kampala.\n", + "latterly they were based at UN Global Pulse in Kampala. You can see the\n", + "work summarized on the UN Global Pulse [disease outbreaks project site\n", + "here](https://diseaseoutbreaks.unglobalpulse.net/uganda/).\n", + "\n", + "- See [UN Global Pulse Disease Outbreaks\n", + " Site](https://diseaseoutbreaks.unglobalpulse.net/uganda/)\n", "\n", "Malaria data is spatial data. Uganda is split into districts, and health\n", "reports can be found for each district. This suggests that models such\n", @@ -488,20 +220,19 @@ "location within a district, such as Nagongera which is a sentinel site\n", "based in the Tororo district.\n", "\n", - "\n", + "\n", "\n", - "Figure: Ugandan districs. Data SRTM/NASA from\n", - "https://dds.cr.usgs.gov/srtm/version2_1.\n", + "Figure: Ugandan districts. 
Data SRTM/NASA from\n", + ".\n", "\n", - "(Andrade-Pacheco et al., 2014; Mubangizi\n", - "et al., 2014)\n", + "(Andrade-Pacheco et al., 2014; Mubangizi et al., 2014)\n", "\n", "The common standard for collecting health data on the African continent\n", "is from the Health management information systems (HMIS). However, this\n", "data suffers from missing values (Gething et al., 2006) and diagnosis of\n", "diseases like typhoid and malaria may be confounded.\n", "\n", - "\n", + "\n", "\n", "Figure: The Tororo district, where the sentinel site, Nagongera, is\n", "located.\n", @@ -513,7 +244,7 @@ "sites give accurate assessment of malaria disease levels in Uganda,\n", "including a site in Nagongera.\n", "\n", - "\n", + "\n", "\n", "Figure: Sentinel and HMIS data along with rainfall and temperature\n", "for the Nagongera sentinel station in the Tororo district.\n", @@ -528,33 +259,33 @@ "and temperature, to improve predictions from HMIS data of levels of\n", "malaria.\n", "\n", - "\n", + "\n", "\n", "Figure: The Mubende District.\n", "\n", - "\n", + "\n", "\n", "Figure: Prediction of malaria incidence in Mubende.\n", "\n", - "\n", + "\n", "\n", "Figure: The project arose out of the Gaussian process summer school\n", "held at Makerere in Kampala in 2013. The school led, in turn, to the\n", "Data Science Africa initiative." - ] + ], + "id": "ffda8944-a56f-4553-aabd-3d3940c5886a" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Early Warning Systems\n", - "---------------------\n", + "## Early Warning Systems\n", "\n", - "\n", + "\n", "\n", "Figure: The Kabarole district in Uganda.\n", "\n", - "\n", + "\n", "\n", "Figure: Estimate of the current disease situation in the Kabarole\n", "district over time. Estimate is constructed with a Gaussian process with\n", @@ -582,7 +313,7 @@ "Finally, there is a gray region which represents when the scale of the\n", "effect is small.\n", "\n", - "\n", + "\n", "\n", "Figure: The map of Ugandan districts with an overview of the Malaria\n", "situation in each district.\n", @@ -590,39 +321,39 @@ "These colors can now be observed directly on a spatial map of the\n", "districts to give an immediate impression of the current status of the\n", "disease across the country." - ] + ], + "id": "698fb2a0-0ac9-416c-b5cc-1d75d5db5be5" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Machine Learning\n", - "----------------\n", + "## Machine Learning\n", "\n", "This talk is a general introduction to machine learning, we will\n", "highlight the technical challenges and the current solutions. We will\n", "give an overview of what is machine learning and why it is important." - ] + ], + "id": "f8166f49-e4e9-4a6b-937e-2775d36dda59" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Rise of Machine Learning\n", - "------------------------\n", + "## Rise of Machine Learning\n", "\n", "Machine learning is the combination of data and models, through\n", "computation, to make predictions. $$\n", "\\text{data} + \\text{model} \\stackrel{\\text{compute}}{\\rightarrow} \\text{prediction}\n", "$$" - ] + ], + "id": "5e370d0b-eaa5-41e2-beb1-4a31b4a0ba95" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Data Revolution\n", - "---------------\n", + "## Data Revolution\n", "\n", "Machine learning has risen in prominence due to the rise in data\n", "availability, and its interconnection with computers. The high bandwidth\n", @@ -630,21 +361,25 @@ "us and data via the computer. 
It is that channel that is being mediated\n", "by machine learning techniques.\n", "\n", - "\n", + "\n", "\n", "Figure: Large amounts of data and high interconnection bandwidth mean\n", "that we receive much of our information about the world around us\n", "through computers." - ] + ], + "id": "919f2f72-7120-42e7-bc55-0ce767c6c85f" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Supply Chain\n", - "------------\n", + "## Supply Chain\n", + "\n", + "\\[edit\\]\n", "\n", - "\n", + "\n", "\n", "Figure: Packhorse Bridge under Burbage Edge. This packhorse route\n", "climbs steeply out of Hathersage and heads towards Sheffield. Packhorses\n", @@ -669,16 +404,20 @@ "The movement of goods from regions of supply to areas of demand is\n", "fundamental to our society. The physical infrastructure of supply chain\n", "has evolved a great deal over the last 300 years." - ] + ], + "id": "f99c5901-b479-4b1c-9988-0811aab910b8" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Cromford\n", - "--------\n", + "## Cromford\n", "\n", - "\n", + "\\[edit\\]\n", + "\n", + "\n", "\n", "Figure: Richard Arkwright is regarded of the founder of the modern\n", "factory system. Factories exploit distribution networks to centralize\n", @@ -716,16 +455,20 @@ "railway built in Britain.\n", "\n", "Cooper (1991)" - ] + ], + "id": "d8bcba69-064a-4b16-960b-3a70d30ea04c" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Containerization\n", - "----------------\n", + "## Containerization\n", + "\n", + "\\[edit\\]\n", "\n", - "\n", + "\n", "\n", "Figure: The container is one of the major drivers of globalization,\n", "and arguably the largest agent of social change in the last 100 years.\n", @@ -742,12 +485,12 @@ "\n", "\n", "\n", - "\n", + "\n", "\n", "\n", "\n", "\n", - "\n", + "\n", "\n", "\n", "\n", @@ -761,9 +504,16 @@ "in China, sold in North America. This is driven by the low cost of\n", "transport for frozen cod vs the higher relative cost of cod processing\n", "in the US versus China. Similarly,\n", - "Scottish\n", + "Scottish\n", "prawns are also processed in China for sale in the UK.\n", "\n", + "\n", + "\n", + "Figure: The transport cost of most foods is a very small portion of\n", + "the total cost. The exception is if foods are air freighted. Source:\n", + " by Hannah\n", + "Ritche CC-BY\n", + "\n", "This effect on cost of transport vs cost of processing is the main\n", "driver of the topology of the modern supply chain and the associated\n", "effect of globalization. If transport is much cheaper than processing,\n", @@ -794,14 +544,18 @@ "This is challenging, because as we introduce more mechanism to the\n", "models we use, it becomes harder to develop efficient algorithms to\n", "match those models to data." - ] + ], + "id": "ffeb8995-28ba-4b96-8488-c61a86901dee" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "For Africa\n", - "----------\n", + "## For Africa\n", + "\n", + "\\[edit\\]\n", "\n", "There is a large opportunity because infrastructures around automation\n", "are moving from physical infrastructure towards information\n", @@ -820,7 +574,7 @@ "these parameters to change the behavior of the function. 
The choice of\n", "mathematical function we use is a vital component of the model.\n", "\n", - "\n", + "\n", "\n", "Figure: The Kapchorwa District, home district of Stephen\n", "Kiprotich.\n", @@ -828,30 +582,33 @@ "Stephen Kiprotich, the 2012 gold medal winner from the London Olympics,\n", "comes from Kapchorwa district, in eastern Uganda, near the border with\n", "Kenya." - ] + ], + "id": "bcf493d8-5ccf-4d23-8399-f6cda5b01780" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Olympic Marathon Data\n", - "---------------------\n", + "## Olympic Marathon Data\n", + "\n", + "\\[edit\\]\n", "\n", "\n", "\n", "\n", "\n", "\n", @@ -859,9 +616,10 @@ "\n", "The first thing we will do is load a standard data set for regression\n", "modelling. The data consists of the pace of Olympic Gold Medal Marathon\n", - "winners for the Olympics from 1896 to present. First we load in the data\n", + "winners for the Olympics from 1896 to present. Let’s load in the data\n", "and plot." - ] + ], + "id": "5100f09e-517b-4034-8cd8-469ea13199fa" }, { "cell_type": "code", @@ -869,8 +627,9 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install --upgrade git+https://github.com/sods/ods" - ] + "%pip install pods" + ], + "id": "2cd033d0-3dc0-4956-a3fe-f13437754d1c" }, { "cell_type": "code", @@ -880,7 +639,8 @@ "source": [ "import numpy as np\n", "import pods" - ] + ], + "id": "c0a49b1b-51a8-42bf-b165-3cbff250a263" }, { "cell_type": "code", @@ -893,8 +653,10 @@ "y = data['Y']\n", "\n", "offset = y.mean()\n", - "scale = np.sqrt(y.var())" - ] + "scale = np.sqrt(y.var())\n", + "yhat = (y - offset)/scale" + ], + "id": "8daafd3f-c44c-4e1b-91e5-be3d3c67c92f" }, { "cell_type": "code", @@ -903,9 +665,10 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", - "import teaching_plots as plot\n", + "import mlai.plot as plot\n", "import mlai" - ] + ], + "id": "48b9d97f-3d5c-49f8-bfe6-34c48f89d009" }, { "cell_type": "code", @@ -913,9 +676,9 @@ "metadata": {}, "outputs": [], "source": [ + "\n", "xlim = (1875,2030)\n", "ylim = (2.5, 6.5)\n", - "yhat = (y-offset)/scale\n", "\n", "fig, ax = plt.subplots(figsize=plot.big_wide_figsize)\n", "_ = ax.plot(x, y, 'r.',markersize=10)\n", @@ -924,36 +687,56 @@ "ax.set_xlim(xlim)\n", "ax.set_ylim(ylim)\n", "\n", - "mlai.write_figure(figure=fig, \n", - " filename='olympic-marathon.svg', \n", - " diagrams='./datasets',\n", - " transparent=True, \n", - " facecolor=(1, 1, 1, 1))" - ] + "mlai.write_figure(filename='olympic-marathon.svg', \n", + " directory='./datasets')" + ], + "id": "754b760a-9b3c-4ee0-ae8d-a3933b1ba2d8" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", - "Figure: Olympic marathon pace times since 1892.\n", + "Figure: Olympic marathon pace times since 1896.\n", "\n", - "Things to notice about the data include the outlier in 1904, in this\n", - "year, the olympics was in St Louis, USA. Organizational problems and\n", + "Things to notice about the data include the outlier in 1904, in that\n", + "year the Olympics was in St Louis, USA. Organizational problems and\n", "challenges with dust kicked up by the cars following the race meant that\n", - "participants got lost, and only very few participants completed.\n", + "participants got lost, and only very few participants completed. More\n", + "recent years see more consistently quick marathons." 
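The 1904 outlier is easy to confirm directly from the arrays loaded above. A minimal sketch, assuming the `x` (year) and `y` (winning pace in min/km) arrays from the earlier cells; the check itself is illustrative:

```python
import numpy as np

# x holds the Olympic year, y the winning pace in min/km (loaded above).
i = int(np.argmax(y))  # row with the slowest winning pace
print(f"Slowest winning pace: {y[i, 0]:.2f} min/km in {int(x[i, 0])}")
# Expected to point at the 1904 St Louis race discussed above.
```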
+ ], + "id": "d7d30bc1-97e7-4f48-98a6-75e5d2f408d6" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Polynomial Fits to Olympic Marthon Data\n", "\n", - "More recent years see more consistently quick marathons." - ] + "\\[edit\\]" + ], + "id": "bbc8c9f8-2b80-4e29-92b9-2aac46905cd6" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ], + "id": "78cf434d-5226-41df-986b-151dbe07d269" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Polynomial Fits to Olympic Data\n", - "-------------------------------" - ] + "Define the polynomial basis function." + ], + "id": "d16019fa-6a99-4ed9-9d45-99a8ea64dd49" }, { "cell_type": "code", @@ -961,11 +744,9 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", - "from matplotlib import pyplot as plt\n", - "import mlai\n", - "import pods" - ] + "import mlai" + ], + "id": "09b0d1c7-09ca-4f5d-839b-b7c632e97fa9" }, { "cell_type": "code", @@ -973,17 +754,142 @@ "metadata": {}, "outputs": [], "source": [ - "basis = mlai.polynomial\n", - "\n", - "data = pods.datasets.olympic_marathon_men()\n", + "%load -n mlai.polynomial" + ], + "id": "73c020ff-0c67-467a-9cc6-d718d32c3744" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def polynomial(x, num_basis=4, data_limits=[-1., 1.]):\n", + " \"Polynomial basis\"\n", + " centre = data_limits[0]/2. + data_limits[1]/2.\n", + " span = data_limits[1] - data_limits[0]\n", + " z = np.asarray(x, dtype=float) - centre\n", + " z = 2*z/span # scale the inputs to be within -1, 1 where polynomials are well behaved\n", + " Phi = np.zeros((x.shape[0], num_basis))\n", + " for i in range(num_basis):\n", + " Phi[:, i:i+1] = z**i\n", + " return Phi" + ], + "id": "4f5d31d1-c630-460f-95d6-d91d68dae39b" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we include the solution for the linear regression through\n", + "QR-decomposition." + ], + "id": "658e00f6-6ba9-413c-9d6d-2cd6aa559a35" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def basis_fit(Phi, y):\n", + " \"Use QR decomposition to fit the basis.\"\"\"\n", + " Q, R = np.linalg.qr(Phi)\n", + " return sp.linalg.solve_triangular(R, Q.T@y) " + ], + "id": "ff5aab90-c585-4556-bf50-1084bf9a72f2" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linear Fit" + ], + "id": "2a00e7da-9ed5-4265-ad67-3fc0e431e4a8" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "poly_args = {'num_basis':2, # two basis functions (1 and x)\n", + " 'data_limits':xlim}\n", + "Phi = polynomial(x, **poly_args)\n", + "w = basis_fit(Phi, y)" + ], + "id": "2c8034cb-6834-4a1c-b1df-bf738c7c029e" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we make some predictions for the fit." 
+ ], + "id": "b1ab132c-3631-426b-8d47-46fa5af856bb" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "x_pred = np.linspace(xlim[0], xlim[1], 400)[:, np.newaxis]\n", + "Phi_pred = polynomial(x_pred, **poly_args)\n", + "f_pred = Phi_pred@w" + ], + "id": "722af050-3c74-4a43-9261-469fa37d868f" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import mlai.plot as plot\n", + "import mlai" + ], + "id": "ebdaa78f-1b3c-4b4f-abf2-8f0f45f2edc9" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(figsize=plot.big_wide_figsize)\n", + "_ = ax.plot(x, y, 'r.',markersize=10)\n", + "ax.set_xlabel('year', fontsize=20)\n", + "ax.set_ylabel('pace min/km', fontsize=20)\n", + "ax.set_xlim(xlim)\n", + "ax.set_ylim(ylim)\n", "\n", - "x = data['X']\n", - "y = data['Y']\n", + "_ = ax.plot(x_pred, f_pred, 'b-', linewidth=2)\n", "\n", - "xlim = [1892, 2020]\n", + "mlai.write_figure(filename='olympic-marathon-polynomial-2.svg', \n", + " directory='./ml')" + ], + "id": "0725803b-8ad1-4f65-8df6-a1feeb502273" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", "\n", - "basis=mlai.Basis(mlai.polynomial, number=1, data_limits=xlim)" - ] + "Figure: Fit of a 1-degree polynomial (a linear model) to the Olympic\n", + "marathon data." + ], + "id": "f8804b10-1093-4c40-8ddd-4137ba9a0827" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cubic Fit" + ], + "id": "807a4981-169c-4f20-9289-dec00ba9a994" }, { "cell_type": "code", @@ -991,8 +897,12 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot" - ] + "poly_args = {'num_basis':4, # four basis: 1, x, x^2, x^3\n", + " 'data_limits':xlim}\n", + "Phi = polynomial(x, **poly_args)\n", + "w = basis_fit(Phi, y)" + ], + "id": "f25ad870-5881-4d7a-b14a-2777fd03a778" }, { "cell_type": "code", @@ -1000,12 +910,10 @@ "metadata": {}, "outputs": [], "source": [ - "plot.rmse_fit(x, y, param_name='number', param_range=(1, 27), \n", - " model=mlai.LM, \n", - " basis=basis,\n", - " xlim=xlim, objective_ylim=[0, 0.8],\n", - " diagrams='./ml')" - ] + "Phi_pred = polynomial(x_pred, **poly_args)\n", + "f_pred = Phi_pred@w" + ], + "id": "7e785cac-6442-4d87-abc2-809be58a2c11" }, { "cell_type": "code", @@ -1013,8 +921,11 @@ "metadata": {}, "outputs": [], "source": [ - "from ipywidgets import IntSlider" - ] + "import matplotlib.pyplot as plt\n", + "import mlai.plot as plot\n", + "import mlai" + ], + "id": "0ba7e21b-22af-4d5d-9b19-056ad2295837" }, { "cell_type": "code", @@ -1022,10 +933,40 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('olympic_LM_polynomial_number{num_basis:0>3}.svg',\n", - " directory='./ml', \n", - " num_basis=IntSlider(1,1,27,1))" - ] + "fig, ax = plt.subplots(figsize=plot.big_wide_figsize)\n", + "_ = ax.plot(x, y, 'r.',markersize=10)\n", + "ax.set_xlabel('year', fontsize=20)\n", + "ax.set_ylabel('pace min/km', fontsize=20)\n", + "ax.set_xlim(xlim)\n", + "ax.set_ylim(ylim)\n", + "\n", + "_ = ax.plot(x_pred, f_pred, 'b-', linewidth=2)\n", + "\n", + "mlai.write_figure(filename='olympic-marathon-polynomial-4.svg', \n", + " directory='./ml')" + ], + "id": "ba2b9fcb-da58-452e-b3e2-0aab81212a69" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "Figure: Fit of a 3-degree polynomial (a cubic model) to the Olympic\n", + 
"marathon data." + ], + "id": "6144b4a4-e311-418f-8e31-6158d29cffc4" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9th Degree Polynomial Fit\n", + "\n", + "Now we’ll try a 9th degree polynomial fit to the data." + ], + "id": "d92c7749-4aee-406a-bbdd-d35e10c1b1d4" }, { "cell_type": "code", @@ -1033,12 +974,23 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", - "from matplotlib import pyplot as plt\n", - "import teaching_plots as plot\n", - "import mlai\n", - "import pods" - ] + "poly_args = {'num_basis':10, # basis up to x^9\n", + " 'data_limits':xlim}\n", + "Phi = polynomial(x, **poly_args)\n", + "w = basis_fit(Phi, y)" + ], + "id": "4a680789-2628-406e-aeee-020b75c24662" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Phi_pred = polynomial(x_pred, **poly_args)\n", + "f_pred = Phi_pred@w" + ], + "id": "708c2e82-6fb1-4cbf-9a36-65e38806be60" }, { "cell_type": "code", @@ -1046,20 +998,52 @@ "metadata": {}, "outputs": [], "source": [ - "basis = mlai.polynomial\n", + "import matplotlib.pyplot as plt\n", + "import mlai.plot as plot\n", + "import mlai" + ], + "id": "e1f379ec-b22a-4d28-a9da-da93266f4574" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(figsize=plot.big_wide_figsize)\n", + "_ = ax.plot(x, y, 'r.',markersize=10)\n", + "ax.set_xlabel('year', fontsize=20)\n", + "ax.set_ylabel('pace min/km', fontsize=20)\n", + "ax.set_xlim(xlim)\n", + "ax.set_ylim(ylim)\n", "\n", - "data = pods.datasets.olympic_marathon_men()\n", + "_ = ax.plot(x_pred, f_pred, 'b-', linewidth=2)\n", "\n", - "x = data['X']\n", - "y = data['Y']\n", + "mlai.write_figure(filename='olympic-marathon-polynomial-10.svg', \n", + " directory='./ml')" + ], + "id": "8eff6d59-2593-4afb-a219-40c1d2411495" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", "\n", - "xlim = [1892, 2020]\n", - "max_basis = 27\n", + "Figure: Fit of a 9-degree polynomial to the Olympic marathon\n", + "data." + ], + "id": "879f1c3a-f536-4313-b237-11eedc467f00" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 16th Degree Polynomial Fit\n", "\n", - "ll = np.array([np.nan]*(max_basis))\n", - "sum_squares = np.array([np.nan]*(max_basis))\n", - "basis=mlai.Basis(mlai.polynomial, number=1, data_limits=xlim)" - ] + "Now we’ll try a 16th degree polynomial fit to the data." 
+ ], + "id": "41040aac-acbe-48ce-a04e-0db0c7b8cffb" }, { "cell_type": "code", @@ -1067,16 +1051,23 @@ "metadata": {}, "outputs": [], "source": [ - "plot.rmse_fit(x, y, param_name='number', param_range=(1, 28), \n", - " model=mlai.LM, basis=basis, \n", - " xlim=xlim, objective_ylim=[0, 0.8],\n", - " diagrams='./ml')" - ] + "poly_args = {'num_basis':17, # basis up to x^16\n", + " 'data_limits':xlim}\n", + "Phi = polynomial(x, **poly_args)\n", + "w = basis_fit(Phi, y)" + ], + "id": "b679b14e-b0d4-4952-b968-e8e081c57313" }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, - "source": [] + "outputs": [], + "source": [ + "Phi_pred = polynomial(x_pred, **poly_args)\n", + "f_pred = Phi_pred@w" + ], + "id": "f3db05ae-03d4-4ca9-b3f3-c748f9eac3c9" }, { "cell_type": "code", @@ -1084,32 +1075,129 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('olympic_LM_polynomial_number{num_basis:0>3}.svg',\n", - " directory='./ml', \n", - " num_basis=IntSlider(1,1,28,1))" - ] + "import matplotlib.pyplot as plt\n", + "import mlai.plot as plot\n", + "import mlai" + ], + "id": "7176f512-26df-4ad1-b870-59ff149156c8" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(figsize=plot.big_wide_figsize)\n", + "_ = ax.plot(x, y, 'r.',markersize=10)\n", + "ax.set_xlabel('year', fontsize=20)\n", + "ax.set_ylabel('pace min/km', fontsize=20)\n", + "ax.set_xlim(xlim)\n", + "ax.set_ylim(ylim)\n", + "\n", + "_ = ax.plot(x_pred, f_pred, 'b-', linewidth=2)\n", + "\n", + "mlai.write_figure(filename='olympic-marathon-polynomial-17.svg', \n", + " directory='./ml')" + ], + "id": "bb2f04a2-a04d-43c5-a05c-ff24ce3774a0" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", - "Figure: Fit of a 1 degree polynomial to the olympic marathon\n", - "data.\n", + "Figure: Fit of a 16-degree polynomial to the Olympic marathon\n", + "data." + ], + "id": "0bd19f38-943f-44e5-b7d2-9d079cfb3f69" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 26th Degree Polynomial Fit\n", "\n", - "\n", + "Now we’ll try a 26th degree polynomial fit to the data." 
+ ], + "id": "8657df6f-fe1f-42d0-9ef3-2b684e8899fb" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "poly_args = {'num_basis':27, # basis up to x^26\n", + " 'data_limits':xlim}\n", + "Phi = polynomial(x, **poly_args)\n", + "w = basis_fit(Phi, y)" + ], + "id": "dd7eea65-45c0-4d70-920a-63677b804c94" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Phi_pred = polynomial(x_pred, **poly_args)\n", + "f_pred = Phi_pred@w" + ], + "id": "9420968b-e3c7-4ccc-99de-b68e09bd3738" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import mlai.plot as plot\n", + "import mlai" + ], + "id": "29e5101e-9ea9-4f46-8122-b2cb26ceabf7" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(figsize=plot.big_wide_figsize)\n", + "_ = ax.plot(x, y, 'r.',markersize=10)\n", + "ax.set_xlabel('year', fontsize=20)\n", + "ax.set_ylabel('pace min/km', fontsize=20)\n", + "ax.set_xlim(xlim)\n", + "ax.set_ylim(ylim)\n", + "\n", + "_ = ax.plot(x_pred, f_pred, 'b-', linewidth=2)\n", "\n", - "Figure: Fit of a 2 degree polynomial to the olympic marathon\n", + "mlai.write_figure(filename='olympic-marathon-polynomial-27.svg', \n", + " directory='./ml')" + ], + "id": "586592df-8d02-4aa4-aef1-fda279a1c020" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "Figure: Fit of a 26-degree polynomial to the Olympic marathon\n", "data." - ] + ], + "id": "1287f735-5c77-4772-8b10-f34ac80b16ce" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "What does Machine Learning do?\n", - "------------------------------\n", + "## What does Machine Learning do?\n", + "\n", + "\\[edit\\]\n", "\n", "Any process of automation allows us to scale what we do by codifying a\n", "process in some way that makes it efficient and repeatable. Machine\n", @@ -1118,7 +1206,7 @@ "learnt by a computer. If we can create these mathematical functions in\n", "ways in which they can interconnect, then we can also build systems.\n", "\n", - "Machine learning works through codifing a prediction of interest into a\n", + "Machine learning works through codifying a prediction of interest into a\n", "mathematical function. For example, we can try and predict the\n", "probability that a customer wants to by a jersey given knowledge of\n", "their age, and the latitude where they live. The technique known as\n", @@ -1127,72 +1215,152 @@ "\n", "$$ \\text{odds} = \\frac{p(\\text{bought})}{p(\\text{not bought})} $$\n", "\n", - "$$ \\log \\text{odds} = \\beta_0 + \\beta_1 \\text{age} + \\beta_2 \\text{latitude}.$$\n", - "Here $\\beta_0$, $\\beta_1$ and $\\beta_2$ are the parameters of the model.\n", - "If $\\beta_1$ and $\\beta_2$ are both positive, then the log-odds that\n", - "someone will buy a jumper increase with increasing latitude and age, so\n", - "the further north you are and the older you are the more likely you are\n", - "to buy a jumper. The parameter $\\beta_0$ is an offset parameter, and\n", - "gives the log-odds of buying a jumper at zero age and on the equator. It\n", - "is likely to be negative[1] indicating that the purchase is\n", - "odds-against. 
This is actually a classical statistical model, and models\n", - "like logistic regression are widely used to estimate probabilities from\n", - "ad-click prediction to risk of disease.\n", + "$$ \\log \\text{odds} = w_0 + w_1 \\text{age} + w_2 \\text{latitude}.$$\n", + "Here $w_0$, $w_1$ and $w_2$ are the parameters of the model. If $w_1$\n", + "and $w_2$ are both positive, then the log-odds that someone will buy a\n", + "jumper increase with increasing latitude and age, so the further north\n", + "you are and the older you are the more likely you are to buy a jumper.\n", + "The parameter $w_0$ is an offset parameter and gives the log-odds of\n", + "buying a jumper at zero age and on the equator. It is likely to be\n", + "negative[1] indicating that the purchase is odds-against. This is also a\n", + "classical statistical model, and models like logistic regression are\n", + "widely used to estimate probabilities from ad-click prediction to\n", + "disease risk.\n", "\n", "This is called a generalized linear model, we can also think of it as\n", "estimating the *probability* of a purchase as a nonlinear function of\n", - "the features (age, lattitude) and the parameters (the $\\beta$ values).\n", - "The function is known as the *sigmoid* or [logistic\n", + "the features (age, latitude) and the parameters (the $w$ values). The\n", + "function is known as the *sigmoid* or [logistic\n", "function](https://en.wikipedia.org/wiki/Logistic_regression), thus the\n", "name *logistic* regression.\n", "\n", - "$$ p(\\text{bought}) = \\sigma\\left(\\beta_0 + \\beta_1 \\text{age} + \\beta_2 \\text{latitude}\\right).$$\n", + "[1] The logarithm of a number less than one is negative, for a number\n", + "greater than one the logarithm is positive. So if odds are greater than\n", + "evens (odds-on) the log-odds are positive, if the odds are less than\n", + "evens (odds-against) the log-odds will be negative." + ], + "id": "08729fab-2909-4467-9807-c9b86407668c" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sigmoid Function\n", + "\n", + "\\[edit\\]" + ], + "id": "3d2b3529-f06e-4251-945b-4ae7ef0094b4" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai.plot as plot" + ], + "id": "7a3c6921-013c-4615-ae56-16aa6129c397" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot.logistic('./ml/logistic.svg')" + ], + "id": "2f906093-50de-472e-ac9b-f02cbd047218" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "Figure: The logistic function.\n", + "\n", + "The function has this characeristic ‘s’-shape (from where the term\n", + "sigmoid, as in sigma, comes from). It also takes the input from the\n", + "entire real line and ‘squashes’ it into an output that is between zero\n", + "and one. 
For this reason it is sometimes also called a ‘squashing\n", + "function’.\n", + "\n", + "The sigmoid comes from the inverting the odds ratio, $$\n", + "\\frac{\\pi}{(1-\\pi)}\n", + "$$ where $\\pi$ is the probability of a positive outcome and $1-\\pi$ is\n", + "the probability of a negative outcome\n", + "\n", + "$$ p(\\text{bought}) = \\sigma\\left(w_0 + w_1 \\text{age} + w_2 \\text{latitude}\\right).$$\n", + "\n", "In the case where we have *features* to help us predict, we sometimes\n", "denote such features as a vector, $\\mathbf{ x}$, and we then use an\n", "inner product between the features and the parameters,\n", - "$\\boldsymbol{\\beta}^\\top \\mathbf{ x}= \\beta_1 x_1 + \\beta_2 x_2 + \\beta_3 x_3 ...$,\n", - "to represent the argument of the sigmoid.\n", + "$\\mathbf{ w}^\\top \\mathbf{ x}= w_1 x_1 + w_2 x_2 + w_3 x_3 ...$, to\n", + "represent the argument of the sigmoid.\n", "\n", - "$$ p(\\text{bought}) = \\sigma\\left(\\boldsymbol{\\beta}^\\top \\mathbf{ x}\\right).$$\n", + "$$ p(\\text{bought}) = \\sigma\\left(\\mathbf{ w}^\\top \\mathbf{ x}\\right).$$\n", "More generally, we aim to predict some aspect of our data, $y$, by\n", "relating it through a mathematical function, $f(\\cdot)$, to the\n", - "parameters, $\\boldsymbol{\\beta}$ and the data, $\\mathbf{ x}$.\n", + "parameters, $\\mathbf{ w}$ and the data, $\\mathbf{ x}$.\n", "\n", - "$$ y= f\\left(\\mathbf{ x}, \\boldsymbol{\\beta}\\right).$$ We call\n", - "$f(\\cdot)$ the *prediction function*.\n", + "$$ y= f\\left(\\mathbf{ x}, \\mathbf{ w}\\right).$$ We call $f(\\cdot)$ the\n", + "*prediction function*.\n", "\n", "To obtain the fit to data, we use a separate function called the\n", "*objective function* that gives us a mathematical representation of the\n", "difference between our predictions and the real data.\n", "\n", - "$$E(\\boldsymbol{\\beta}, \\mathbf{Y}, \\mathbf{X})$$ A commonly used\n", - "examples (for example in a regression problem) is least squares,\n", - "$$E(\\boldsymbol{\\beta}, \\mathbf{Y}, \\mathbf{X}) = \\sum_{i=1}^n\\left(y_i - f(\\mathbf{ x}_i, \\boldsymbol{\\beta})\\right)^2.$$\n", + "$$E(\\mathbf{ w}, \\mathbf{Y}, \\mathbf{X})$$ A commonly used examples (for\n", + "example in a regression problem) is least squares,\n", + "$$E(\\mathbf{ w}, \\mathbf{Y}, \\mathbf{X}) = \\sum_{i=1}^n\\left(y_i - f(\\mathbf{ x}_i, \\mathbf{ w})\\right)^2.$$\n", "\n", "If a linear prediction function is combined with the least squares\n", - "objective function then that gives us a classical *linear regression*,\n", + "objective function, then that gives us a classical *linear regression*,\n", "another classical statistical model. Statistics often focusses on linear\n", "models because it makes interpretation of the model easier.\n", "Interpretation is key in statistics because the aim is normally to\n", "validate questions by analysis of data. Machine learning has typically\n", - "focussed more on the prediction function itself and worried less about\n", - "the interpretation of parameters, which are normally denoted by\n", - "$\\mathbf{w}$ instead of $\\boldsymbol{\\beta}$. As a result *non-linear*\n", - "functions are explored more often as they tend to improve quality of\n", - "predictions but at the expense of interpretability.\n", + "focused more on the prediction function itself and worried less about\n", + "the interpretation of parameters. 
In statistics, where interpretation is\n", + "typically more important than prediction, parameters are normally\n", + "denoted by $\\boldsymbol{\\beta}$ instead of $\\mathbf{ w}$.\n", "\n", - "[1] The logarithm of a number less than one is negative, for a number\n", - "greater than one the logarithm is positive. So if odds are greater than\n", - "evens (odds-on) the log-odds are positive, if the odds are less than\n", - "evens (odds-against) the log-odds will be negative." - ] + "A key difference between statistics and machine learning, is that\n", + "(traditionally) machine learning has focussed on predictive capability\n", + "and statistics has focussed on interpretability. That means that in a\n", + "statistics class far more emphasis will be placed on interpretation of\n", + "the parameters. In machine learning, the parameters, \\$, are just a\n", + "means to an end. But in statistics, when we denote the parameters by\n", + "$\\boldsymbol{\\beta}$, we often use the parameters to tell us something\n", + "about the disease.\n", + "\n", + "So we move between\n", + "$$ p(\\text{bought}) = \\sigma\\left(w_0 + w_1 \\text{age} + w_2 \\text{latitude}\\right).$$\n", + "\n", + "to denote the emphasis is on predictive power to\n", + "\n", + "$$ p(\\text{bought}) = \\sigma\\left(\\beta_0 + \\beta_1 \\text{age} + \\beta_2 \\text{latitude}\\right).$$\n", + "\n", + "to denote the emphasis is on interpretation of the parameters.\n", + "\n", + "Another effect of the focus on prediction in machine learning is that\n", + "*non-linear* approaches, which can be harder to interpret, are more\n", + "widely deployedin machine learning – they tend to improve quality of\n", + "predictions at the expense of interpretability." + ], + "id": "909bd233-e185-4b06-a464-4ec1ee55e3bf" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "What is Machine Learning?\n", - "-------------------------\n", + "## What is Machine Learning?\n", + "\n", + "\\[edit\\]\n", "\n", "Machine learning allows us to extract knowledge from data to form a\n", "prediction.\n", @@ -1219,33 +1387,42 @@ "the increased prominence of machine learning. This prominence is\n", "surfacing in two different but overlapping domains: data science and\n", "artificial intelligence." - ] + ], + "id": "acc73d97-3931-45fd-88ef-0c03aecb53a6" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "From Model to Decision\n", - "----------------------\n", + "## From Model to Decision\n", + "\n", + "\\[edit\\]\n", "\n", "The real challenge, however, is end-to-end decision making. Taking\n", "information from the environment and using it to drive decision making\n", "to achieve goals." - ] + ], + "id": "12ddb0dd-d9a6-4145-bbc9-212c1e5b4613" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Artificial Intelligence and Data Science\n", - "----------------------------------------\n", + "## Artificial Intelligence and Data Science\n", + "\n", + "\\[edit\\]\n", "\n", "Artificial intelligence has the objective of endowing computers with\n", "human-like intelligent capabilities. For example, understanding an image\n", "(computer vision) or the contents of some speech (speech recognition),\n", "the meaning of a sentence (natural language processing) or the\n", "translation of a sentence (machine translation)." 
- ] + ], + "id": "525ec1c2-7076-4507-ac0e-c65e216f8da8" }, { "cell_type": "markdown", @@ -1312,14 +1489,18 @@ "question selection or even answer a question without the expense of a\n", "full randomized control trial (referred to as A/B testing in modern\n", "internet parlance)." - ] + ], + "id": "962e7b57-cbf1-47c1-abef-22dd419de2c6" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Neural Networks and Prediction Functions\n", - "----------------------------------------\n", + "## Neural Networks and Prediction Functions\n", + "\n", + "\\[edit\\]\n", "\n", "Neural networks are adaptive non-linear function models. Originally,\n", "they were studied (by McCulloch and Pitts (McCulloch and Pitts, 1943))\n", @@ -1346,14 +1527,14 @@ "hidden units, or the number of neurons. The elements of this vector\n", "function are known as the *activation* function of the neural network\n", "and $\\mathbf{V}$ are the parameters of the activation functions." - ] + ], + "id": "ff3f147f-897e-44c0-82e6-50c149f0f25d" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Relations with Classical Statistics\n", - "-----------------------------------\n", + "## Relations with Classical Statistics\n", "\n", "In statistics activation functions are traditionally known as *basis\n", "functions*. And we would think of this as a *linear model*. It’s doesn’t\n", @@ -1362,14 +1543,14 @@ "$\\mathbf{V}$. The linear model terminology refers to the fact that the\n", "model is *linear in the parameters*, but it is *not* linear in the data\n", "unless the activation functions are chosen to be linear." - ] + ], + "id": "3143e07f-3ed2-4828-b126-69a7de0102d1" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Adaptive Basis Functions\n", - "------------------------\n", + "## Adaptive Basis Functions\n", "\n", "The first difference in the (early) neural network literature to the\n", "classical statistical literature is the decision to optimize these\n", @@ -1383,14 +1564,14 @@ "normally use $\\boldsymbol{\\beta}$ when I care about the value of these\n", "parameters, and $\\mathbf{ w}$ when I care more about the quality of the\n", "prediction." - ] + ], + "id": "e7d3b706-8fdf-4eb6-b3d9-84d1ef060bad" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Machine Learning\n", - "----------------\n", + "## Machine Learning\n", "\n", "The key idea in machine learning is to observe the system in practice,\n", "and then emulate its behavior with mathematics. That leads to a design\n", @@ -1401,14 +1582,18 @@ "1. Supervised learning\n", "2. Unsupervised learning\n", "3. Reinforcement learning" - ] + ], + "id": "4947410c-37af-482a-ae85-26b6a81e30cd" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Supervised Learning\n", - "===================\n", + "# Supervised Learning\n", + "\n", + "\\[edit\\]\n", "\n", "Supervised learning is one of the most widely deployed machine learning\n", "technologies, and a particular domain of success has been\n", @@ -1417,14 +1602,18 @@ "different classes (e.g. dog or cat). This simple idea underpins a lot of\n", "machine learning. By scanning across the image we can also determine\n", "where the animal is in the image." 
- ] + ], + "id": "73ceb8d7-b706-4edf-8742-239b50edbf7f" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Introduction to Classification\n", - "------------------------------\n", + "## Introduction to Classification\n", + "\n", + "\\[edit\\]\n", "\n", "Classification is perhaps the technique most closely assocated with\n", "machine learning. In the speech based agents, on-device classifiers are\n", @@ -1474,14 +1663,18 @@ "relevant in the prediction, (2) defining the appropriate *class of\n", "function*, $f(\\cdot)$, to use and (3) selecting the right parameters,\n", "$\\mathbf{ w}$." - ] + ], + "id": "f17c33e2-5fc5-4787-83ec-cdbd24af8556" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Classification Examples\n", - "-----------------------\n", + "## Classification Examples\n", + "\n", + "\\[edit\\]\n", "\n", "- Classifiying hand written digits from binary images (automatic zip\n", " code reading)\n", @@ -1491,17 +1684,21 @@ "- Categorization of document types (different types of news article on\n", " the internet)\n", "\n", - "\n", + "\n", "\n", "Figure: The perceptron algorithm." - ] + ], + "id": "47aac3f5-391c-4636-abb7-2af766c1615e" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Logistic Regression\n", - "-------------------\n", + "## Logistic Regression\n", + "\n", + "\\[edit\\]\n", "\n", "A logistic regression is an approach to classification which extends the\n", "linear basis function models we’ve already explored. Rather than\n", @@ -1552,7 +1749,8 @@ "\\pi = g(\\mathbf{ w}^\\top\n", "\\boldsymbol{ \\phi}(\\mathbf{ x})).\n", "$$" - ] + ], + "id": "8beed29c-80ab-4b3c-b9fe-63ce8557a207" }, { "cell_type": "code", @@ -1560,8 +1758,9 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "61ec61b4-a416-4f52-8f17-6a7a53231156" }, { "cell_type": "code", @@ -1570,19 +1769,20 @@ "outputs": [], "source": [ "plot.logistic('./ml/logistic.svg')" - ] + ], + "id": "f6979019-7fb6-422f-9e65-61f30c07da6b" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Basis Function\n", - "--------------\n", + "## Basis Function\n", "\n", "We’ll define our prediction, objective and gradient functions below. But\n", "before we start, we need to define a basis function for our model. Let’s\n", "start with the linear basis." - ] + ], + "id": "c5ab1d85-18b8-42bd-a265-a5042bed2b1b" }, { "cell_type": "code", @@ -1591,7 +1791,18 @@ "outputs": [], "source": [ "import numpy as np" - ] + ], + "id": "29506b89-4333-4e8e-b3f3-ca3958a7a205" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "3005f52c-9095-4ddb-b012-5a3c385c9a86" }, { "cell_type": "code", @@ -1599,18 +1810,19 @@ "metadata": {}, "outputs": [], "source": [ - "%load -s linear mlai.py" - ] + "%load -n mlai.linear" + ], + "id": "17ace8a5-9a5a-4090-bd7f-d92bcba1a930" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Prediction Function\n", - "-------------------\n", + "## Prediction Function\n", "\n", "Now we have the basis function let’s define the prediction function." 
- ] + ], + "id": "676939c1-d499-4037-9399-be032960037e" }, { "cell_type": "code", @@ -1619,7 +1831,8 @@ "outputs": [], "source": [ "import numpy as np" - ] + ], + "id": "8c26c562-b74e-4075-90e9-bbae39ed4328" }, { "cell_type": "code", @@ -1632,7 +1845,8 @@ " Phi = basis(x, **kwargs)\n", " f = np.dot(Phi, w)\n", " return 1./(1+np.exp(-f)), Phi" - ] + ], + "id": "80955320-82af-4373-b032-fd58039f9925" }, { "cell_type": "markdown", @@ -1643,47 +1857,7 @@ "name logistic regression) or sometimes it is called the sigmoid\n", "function. For a particular value of the input to the link function,\n", "$f_i = \\mathbf{ w}^\\top \\boldsymbol{ \\phi}(\\mathbf{ x}_i)$ we can plot\n", - "the value of the inverse link function as below." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sigmoid Function" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import teaching_plots as plot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plot.logistic('./ml/logistic.svg')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "Figure: The logistic function.\n", - "\n", - "The function has this characeristic ‘s’-shape (from where the term\n", - "sigmoid, as in sigma, comes from). It also takes the input from the\n", - "entire real line and ‘squashes’ it into an output that is between zero\n", - "and one. For this reason it is sometimes also called a ‘squashing\n", - "function’.\n", + "the value of the inverse link function as below.\n", "\n", "By replacing the inverse link with the sigmoid we can write $\\pi$ as a\n", "function of the input and the parameter vector as, $$\n", @@ -1716,14 +1890,14 @@ "\n", "but writing it mathematically makes it easier to write our objective\n", "function within a single mathematical equation." 
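As a quick sanity check (assuming the `predict` function and a `linear` basis as defined above), we can evaluate the model at a few toy inputs and confirm the outputs are squashed into the interval $(0, 1)$. The toy weights here are arbitrary.

``` python
import numpy as np

# Quick check (assumes `predict` and the `linear` basis defined above): with a
# bias of 0.5 and a slope of 2 the outputs are probabilities in (0, 1),
# increasing with the input.
x_check = np.linspace(-3, 3, 5)[:, np.newaxis]
w_check = np.array([[0.5], [2.0]])
g_check, Phi_check = predict(w_check, x_check, linear)
print(g_check.flatten())
```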
- ] + ], + "id": "d2f8cea8-f2b0-457e-a907-757c087c6a86" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Maximum Likelihood\n", - "------------------\n", + "## Maximum Likelihood\n", "\n", "To obtain the parameters of the model, we need to maximize the\n", "likelihood, or minimize the objective function, normally taken to be the\n", @@ -1745,7 +1919,8 @@ "\\sum_{i=1}^n(1-y_i)\\log \\left(1-g\\left(\\mathbf{ w}^\\top\n", "\\boldsymbol{ \\phi}(\\mathbf{ x}_i)\\right)\\right).\n", "$$" - ] + ], + "id": "402b0ea0-249e-4bae-a7be-2f9bb70247f0" }, { "cell_type": "code", @@ -1754,7 +1929,8 @@ "outputs": [], "source": [ "import numpy as np" - ] + ], + "id": "e57822af-784b-4f2d-a4d2-d5d3ce84ffa6" }, { "cell_type": "code", @@ -1768,7 +1944,8 @@ " posind = np.where(labs==1)\n", " negind = np.where(labs==0)\n", " return -np.log(g[posind, :]).sum() - np.log(1-g[negind, :]).sum()" - ] + ], + "id": "52130290-c9c2-44da-9f66-83a69295b978" }, { "cell_type": "markdown", @@ -1815,7 +1992,8 @@ "(1-y_i)\\left(g\\left(\\mathbf{ w}^\\top \\boldsymbol{ \\phi}(\\mathbf{ x})\\right)\\right)\n", "\\boldsymbol{ \\phi}(\\mathbf{ x}_i).\n", "$$" - ] + ], + "id": "2e662427-e4aa-4e26-8f70-714d3be8ed6d" }, { "cell_type": "code", @@ -1824,7 +2002,8 @@ "outputs": [], "source": [ "import numpy as np" - ] + ], + "id": "b4d4f581-5c3c-45b9-8ef4-92bc98b9abc5" }, { "cell_type": "code", @@ -1840,42 +2019,191 @@ " negind = np.where(labs==0 )\n", " dw += (Phi[negind]*g[negind]).sum(0)\n", " return dw[:, None]" - ] + ], + "id": "a09360a2-5ad3-4fb4-9668-52a4d13f7531" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Optimization of the Function\n", + "\n", + "Reorganizing the gradient to find a stationary point of the function\n", + "with respect to the parameters $\\mathbf{ w}$ turns out to be impossible.\n", + "Optimization has to proceed by *numerical methods*. Options include the\n", + "multidimensional variant of [Newton’s\n", + "method](http://en.wikipedia.org/wiki/Newton%27s_method) or [gradient\n", + "based optimization\n", + "methods](http://en.wikipedia.org/wiki/Gradient_method) like we used for\n", + "optimizing matrix factorization for the movie recommender system. We\n", + "recall from matrix factorization that, for large data, *stochastic\n", + "gradient descent* or the Robbins Munro (Robbins and Monro, 1951)\n", + "optimization procedure worked best for function minimization." + ], + "id": "7cd71b8e-b53a-484e-95fe-35832a334008" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Nigeria NMIS Data\n", + "\n", + "\\[edit\\]\n", + "\n", + "As an example data set we will use Nigerian Millennium Development Goals\n", + "Information System Health Facility (The Office of the Senior Special\n", + "Assistant to the President on the Millennium Development Goals\n", + "(OSSAP-MDGs) and Columbia University, 2014). It can be found here\n", + ".\n", + "\n", + "Taking from the information on the site,\n", + "\n", + "> The Nigeria MDG (Millennium Development Goals) Information System –\n", + "> NMIS health facility data is collected by the Office of the Senior\n", + "> Special Assistant to the President on the Millennium Development Goals\n", + "> (OSSAP-MDGs) in partner with the Sustainable Engineering Lab at\n", + "> Columbia University. 
A rigorous, geo-referenced baseline facility\n", + "> inventory across Nigeria is created spanning from 2009 to 2011 with an\n", + "> additional survey effort to increase coverage in 2014, to build\n", + "> Nigeria’s first nation-wide inventory of health facility. The database\n", + "> includes 34,139 health facilities info in Nigeria.\n", + ">\n", + "> The goal of this database is to make the data collected available to\n", + "> planners, government officials, and the public, to be used to make\n", + "> strategic decisions for planning relevant interventions.\n", + ">\n", + "> For data inquiry, please contact Ms. Funlola Osinupebi, Performance\n", + "> Monitoring & Communications, Advisory Power Team, Office of the Vice\n", + "> President at funlola.osinupebi@aptovp.org\n", + ">\n", + "> To learn more, please visit\n", + "> \n", + ">\n", + "> Suggested citation: Nigeria NMIS facility database (2014), the Office\n", + "> of the Senior Special Assistant to the President on the Millennium\n", + "> Development Goals (OSSAP-MDGs) & Columbia University\n", + "\n", + "For ease of use we’ve packaged this data set in the `pods` library" + ], + "id": "7d50df52-0d80-418a-a0c7-ff74a2e68c6b" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pods\n", + "\n", + "\\[edit\\]\n", + "\n", + "In Sheffield we created a suite of software tools for ‘Open Data\n", + "Science’. Open data science is an approach to sharing code, models and\n", + "data that should make it easier for companies, health professionals and\n", + "scientists to gain access to data science techniques.\n", + "\n", + "You can also check this blog post on [Open Data\n", + "Science](http://inverseprobability.com/2014/07/01/open-data-science).\n", + "\n", + "The software can be installed using\n", + "\n", + "from the command prompt where you can access your python installation.\n", + "\n", + "The code is also available on GitHub: \n", + "\n", + "Once `pods` is installed, it can be imported in the usual manner." + ], + "id": "18aeae4b-8405-4f33-8585-3d3dd62fb9b7" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pods" + ], + "id": "954ea58f-f28c-40c2-9272-650e30d8d201" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = pods.datasets.nigeria_nmis()['Y']\n", + "data.head()" + ], + "id": "bbc4a83d-a908-496b-80ff-1299cbefbff8" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, you can access the data directly with the following\n", + "commands.\n", + "\n", + "``` python\n", + "import urllib.request\n", + "urllib.request.urlretrieve('https://energydata.info/dataset/f85d1796-e7f2-4630-be84-79420174e3bd/resource/6e640a13-cab4-457b-b9e6-0336051bac27/download/healthmopupandbaselinenmisfacility.csv', 'healthmopupandbaselinenmisfacility.csv')\n", + "\n", + "import pandas as pd\n", + "data = pd.read_csv('healthmopupandbaselinenmisfacility.csv')\n", + "```\n", + "\n", + "Once it is loaded in the data can be summarized using the `describe`\n", + "method in pandas." 
+ ], + "id": "f0db0880-346b-43b5-be2c-375d35f909c4" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.describe()" + ], + "id": "509eb807-4ea8-4fb9-bcf5-e39be7c804d2" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Optimization of the Function\n", - "----------------------------\n", - "\n", - "Reorganizing the gradient to find a stationary point of the function\n", - "with respect to the parameters $\\mathbf{ w}$ turns out to be impossible.\n", - "Optimization has to proceed by *numerical methods*. Options include the\n", - "multidimensional variant of [Newton’s\n", - "method](http://en.wikipedia.org/wiki/Newton%27s_method) or [gradient\n", - "based optimization\n", - "methods](http://en.wikipedia.org/wiki/Gradient_method) like we used for\n", - "optimizing matrix factorization for the movie recommender system. We\n", - "recall from matrix factorization that, for large data, *stochastic\n", - "gradient descent* or the Robbins Munro (Robbins and Monro, 1951)\n", - "optimization procedure worked best for function minimization." - ] + "We can also find out the dimensions of the dataset using the `shape`\n", + "property." + ], + "id": "c710c82b-899b-45e3-9d8e-52d70c451c5e" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.shape" + ], + "id": "0389c8eb-fcde-49ec-b048-5b98703baa50" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Nigerian NMIS Data\n", - "------------------\n", - "\n", - "First we will load in the Nigerian NMIS health data. Our aim will be to\n", - "predict whether a center has maternal health delivery services given the\n", - "attributes in the data. We will predict of the number of nurses, the\n", - "number of doctors, location etc.\n", + "Dataframes have different functions that you can use to explore and\n", + "understand your data. In python and the Jupyter notebook it is possible\n", + "to see a list of all possible functions and attributes by typing the\n", + "name of the object followed by `.` for example in the above case if\n", + "we type `data.` it show the columns available (these are attributes\n", + "in pandas dataframes) such as `num_nurses_fulltime`, and also functions,\n", + "such as `.describe()`.\n", "\n", - "Let’s first remind ourselves of the data." - ] + "For functions we can also see the documentation about the function by\n", + "following the name with a question mark. This will open a box with\n", + "documentation at the bottom which can be closed with the x button." + ], + "id": "704fa8c0-3c76-4664-8898-a8df2a16b8f2" }, { "cell_type": "code", @@ -1883,8 +2211,9 @@ "metadata": {}, "outputs": [], "source": [ - "import urllib.request" - ] + "data.describe?" 
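A few more exploration commands in the same spirit (assuming the NMIS data is loaded into the dataframe `data`; `num_nurses_fulltime` is one of the columns mentioned above):

``` python
# A few more ways of exploring the dataframe.
print(data.columns[:10])                                      # first few attribute names
print(data['num_nurses_fulltime'].describe())                 # summary of one attribute
print(data.isna().sum().sort_values(ascending=False).head())  # most incomplete columns
```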
+ ], + "id": "6c3839e6-c51e-4df9-94e4-3561edd8b34e" }, { "cell_type": "code", @@ -1892,8 +2221,11 @@ "metadata": {}, "outputs": [], "source": [ - "urllib.request.urlretrieve('https://energydata.info/dataset/f85d1796-e7f2-4630-be84-79420174e3bd/resource/6e640a13-cab4-457b-b9e6-0336051bac27/download/healthmopupandbaselinenmisfacility.csv', 'healthmopupandbaselinenmisfacility.csv')" - ] + "import matplotlib.pyplot as plt\n", + "import mlai\n", + "import mlai.plot as plot" + ], + "id": "0cfceef7-07d5-4ea3-ac4d-2f21e2d1bf90" }, { "cell_type": "code", @@ -1901,27 +2233,45 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd" - ] + "fig, ax = plt.subplots(figsize=plot.big_figsize)\n", + "ax.plot(data.longitude, data.latitude, 'ro', alpha=0.01)\n", + "ax.set_xlabel('longitude')\n", + "ax.set_ylabel('latitude')\n", + "\n", + "mlai.write_figure('nigerian-health-facilities.png', directory='./ml')" + ], + "id": "014ae4a0-a536-45f6-bcb9-a2539443da7a" }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "data = pd.read_csv('healthmopupandbaselinenmisfacility.csv')" - ] + "\n", + "\n", + "Figure: Location of the over thirty-four thousand health facilities\n", + "registered in the NMIS data across Nigeria. Each facility plotted\n", + "according to its latitude and longitude." + ], + "id": "9cd1a369-dae5-47b0-8569-3e64fe75d478" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "data.head()}\n", + "## Nigeria NMIS Data Classification\n", + "\n", + "\\[edit\\]\n", + "\n", + "Our aim will be to predict whether a center has maternal health delivery\n", + "services given the attributes in the data. We will predict of the number\n", + "of nurses, the number of doctors, location etc.\n", "\n", "Now we will convert this data into a form which we can use as inputs\n", "`X`, and labels `y`." - ] + ], + "id": "937de532-2807-4fb2-a6b1-206e40000952" }, { "cell_type": "code", @@ -1931,7 +2281,8 @@ "source": [ "import pandas as pd\n", "import numpy as np" - ] + ], + "id": "0430daad-15a2-4ec7-9449-7d236196d0c3" }, { "cell_type": "code", @@ -1971,7 +2322,8 @@ " type_names.append(type_col)\n", " X.loc[:, type_col] = 0.0 \n", " X.loc[index, type_col] = 1.0" - ] + ], + "id": "f98f40df-bedc-412e-b41b-f28f61a931c5" }, { "cell_type": "markdown", @@ -1979,7 +2331,8 @@ "source": [ "This has given us a new data frame `X` which contains the different\n", "facility types in different columns." - ] + ], + "id": "3e988efb-ad6e-495d-aabb-5f7156a63d34" }, { "cell_type": "code", @@ -1988,18 +2341,23 @@ "outputs": [], "source": [ "X.describe()" - ] + ], + "id": "b8abbaa8-895e-46e8-a317-d6172a47c7bc" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Batch Gradient Descent\n", - "----------------------\n", + "## Batch Gradient Descent\n", + "\n", + "\\[edit\\]\n", "\n", "We will need to define some initial random values for our vector and\n", "then minimize the objective by descending the gradient." 
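Before fitting, it can also help to glance at the label balance and the size of the design matrix (a small added sanity check, assuming `X` and `y` as constructed above).

``` python
# A quick sanity check before fitting: how balanced are the labels, and how
# large is the design matrix?
print('proportion offering maternal health delivery services:', (y == True).mean())
print('design matrix shape:', X.shape)
```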
- ] + ], + "id": "44b9c1d3-96d5-43d9-926e-91675c287bfb" }, { "cell_type": "code", @@ -2016,7 +2374,8 @@ "y_train = y.iloc[train_indices]==True\n", "X_test = X.iloc[test_indices]\n", "y_test = y.iloc[test_indices]==True" - ] + ], + "id": "aa26d80d-870a-45d8-8684-eb461e174816" }, { "cell_type": "code", @@ -2025,7 +2384,8 @@ "outputs": [], "source": [ "import numpy as np" - ] + ], + "id": "2575fcd7-a481-4e48-86c9-3f792a9cb22d" }, { "cell_type": "code", @@ -2042,14 +2402,16 @@ " w -= eta*gradient(g, Phi, y_train) + 0.001*w\n", " if not i % 100:\n", " print(\"Iter\", i, \"Objective\", objective(g, y_train))" - ] + ], + "id": "1d1e7f7d-5770-4945-97e7-d21a13a43c26" }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let’s look at the weights and how they relate to the inputs." - ] + ], + "id": "6ead36cd-b058-4074-9497-68466b7a670b" }, { "cell_type": "code", @@ -2058,7 +2420,8 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt" - ] + ], + "id": "870c9b08-9824-4454-90f1-0105c16fcc98" }, { "cell_type": "code", @@ -2067,7 +2430,8 @@ "outputs": [], "source": [ "print(w)" - ] + ], + "id": "6510bfe7-92a4-40a3-8947-4c729b875c41" }, { "cell_type": "markdown", @@ -2076,7 +2440,8 @@ "What does the magnitude of the weight vectors tell you about the\n", "different parameters and their influence on outcome? Are the weights of\n", "roughly the same size, if not, how might you fix this?" - ] + ], + "id": "01db6c1f-5a2c-4898-85b7-ff8621aa157d" }, { "cell_type": "code", @@ -2086,35 +2451,38 @@ "source": [ "g_test, Phi_test = predict(w, X_test, linear)\n", "np.sum(g_test[y_test]>0.5)" - ] + ], + "id": "44d5f14c-13ec-4f62-a3d2-f74715489d98" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Stochastic Gradient Descent\n", - "---------------------------" - ] + "## Stochastic Gradient Descent" + ], + "id": "06a9630a-a16c-4055-a54d-bfcbb6e8f6fa" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Exercise 2\n", + "### Exercise 1\n", "\n", "Now construct a stochastic gradient descent algorithm and run it on the\n", "data. Is it faster or slower than batch gradient descent? What can you\n", "do to improve convergence speed?" - ] + ], + "id": "28632e1a-c44d-4a1c-867b-538223ccd25f" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Exercise 2 Answer\n", + "### Exercise 1 Answer\n", "\n", - "Write your answer to Exercise 2 here" - ] + "Write your answer to Exercise 1 here" + ], + "id": "b7e7675c-3322-4ece-8558-42b9922f7752" }, { "cell_type": "code", @@ -2124,14 +2492,18 @@ "source": [ "# Use this box for any code you need\n", "\n" - ] + ], + "id": "a01df952-0b9d-4ee2-afd6-fa5f8f817f9a" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Regression\n", - "----------\n", + "## Regression\n", + "\n", + "\\[edit\\]\n", "\n", "Classification is the case where our prediction function gives a\n", "discrete valued output, normally associated with a ‘class’. Regression\n", @@ -2143,14 +2515,18 @@ "is the practice of predicting a function value between existing data,\n", "and ‘extrapolation’, which is the practice of predicting a function\n", "value beyond the regime where we have data." - ] + ], + "id": "0ebb2fa8-37fb-4662-9878-b1a45c7a6829" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Regression Examples\n", - "-------------------\n", + "## Regression Examples\n", + "\n", + "\\[edit\\]\n", "\n", "Regression involves predicting a real value, $y_i$, given an input\n", "vector, $\\mathbf{ x}_i$. 
For example, the Tecator data involves\n", @@ -2159,14 +2535,18 @@ "to age measured through a back-trace of tree rings. Regression has also\n", "been used to predict the quality of board game moves given expert rated\n", "training data." - ] + ], + "id": "8f5d6d3d-6c88-46f2-8a97-38c2d62873d8" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Supervised Learning Challenges\n", - "------------------------------\n", + "## Supervised Learning Challenges\n", + "\n", + "\\[edit\\]\n", "\n", "There are three principal challenges in constructing a problem for\n", "supervised learning.\n", @@ -2175,14 +2555,18 @@ " prediction\n", "2. defining the appropriate *class of function*, $f(\\cdot)$.\n", "3. selecting the right parameters, $\\mathbf{ w}$." - ] + ], + "id": "bd3ff95d-3286-4125-b694-39d9d027ca0d" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Feature Selection\n", - "-----------------\n", + "## Feature Selection\n", + "\n", + "\\[edit\\]\n", "\n", "Feature selection is a critical stage in the algorithm design process.\n", "In the Olympic prediction example above we’re only using time to predict\n", @@ -2209,14 +2593,18 @@ "Google’s success. These algorithms are in turn highly dependent on the\n", "feature sets used. Facebook in particular has made heavy investments in\n", "machine learning pipelines for evaluation of the feature utility." - ] + ], + "id": "03b44f56-f39a-44ed-be14-5e3416d6f484" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Class of Function, $f(\\cdot)$\n", - "-----------------------------\n", + "## Class of Function, $f(\\cdot)$\n", + "\n", + "\\[edit\\]\n", "\n", "By class of function we mean, what are the characteristics of the\n", "mapping between $\\mathbf{x}$ and $y$. Often, we might choose it to be a\n", @@ -2224,16 +2612,42 @@ "the prediction is a forecast, for example the demand of a particular\n", "product, then the function would need some periodic components to\n", "reflect seasonal or weekly effects." - ] + ], + "id": "01f25a9e-f921-4558-be81-f2d33b69f34d" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Analysis of US Birth Rates\n", - "--------------------------\n", + "## Analysis of US Birth Rates\n", + "\n", + "\\[edit\\]\n", + "\n", + "\n", + "\n", + " \n", + "\n", + "\n", "\n", - "\n", + " \n", + "\n", + "\n", + "\n", + "Aki Vehtari\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", "\n", "Figure: This is a retrospective analysis of US births by Aki Vehtari.\n", "The challenges of forecasting. Even with seasonal and weekly effects\n", @@ -2248,12 +2662,12 @@ "\n", "\n", "\n", "\n", @@ -2297,14 +2711,18 @@ "data](http://lib.stat.cmu.edu/datasets/tecator), where the fat, water\n", "and protein content of meat samples was predicted as a function of the\n", "absorption of infrared light." - ] + ], + "id": "b8308303-dc26-46ec-b6ed-6899d7844d16" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Class of Function: Neural Networks\n", - "----------------------------------\n", + "## Class of Function: Neural Networks\n", + "\n", + "\\[edit\\]\n", "\n", "One class of function that has become popular recently is neural network\n", "functions, in particular deep neural networks. The ImageNet challenge\n", @@ -2315,14 +2733,18 @@ "improve performance so much, particularly when we know that rotational\n", "invariances and scale invariances are also applicable for object\n", "detection in images." 
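To give a flavour of why building such invariances into the class of function can help, here is a tiny, self-contained numpy illustration (an added sketch, not part of the lecture code): a single shared filter applied by convolution responds to a translated pattern with a correspondingly translated output, so the same parameters are reused at every position.

``` python
import numpy as np

# A small "object" in a 1-d signal, and the same object translated by 6 positions.
signal = np.zeros(20)
signal[5:8] = 1.0
shifted = np.roll(signal, 6)

filt = np.array([0.25, 0.5, 0.25])   # one shared filter

response = np.convolve(signal, filt, mode='same')
response_shifted = np.convolve(shifted, filt, mode='same')

# The response to the shifted signal is just the shifted response: translation
# equivariance comes for free from the convolutional structure.
print(np.allclose(np.roll(response, 6), response_shifted))
```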
- ] + ], + "id": "c0e7ac43-9db2-480b-8050-dad26543a06a" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Deep Learning\n", - "=============\n", + "# Deep Learning\n", + "\n", + "\\[edit\\]\n", "\n", "Classical statistical models and simple machine learning models have a\n", "great deal in common. The main difference between the fields is\n", @@ -2351,28 +2773,33 @@ "prediction) rather than an end in themselves (interpretable).\n", "\n", "" - ] + ], + "id": "5ff7b9b4-23cb-4918-b392-d46374331236" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "DeepFace\n", - "--------\n", + "## DeepFace\n", "\n", - "\n", + "\\[edit\\]\n", + "\n", + "\n", "\n", "Figure: The DeepFace architecture (Taigman et al., 2014), visualized\n", "through colors to represent the functional mappings at each layer. There\n", "are 120 million parameters in the model.\n", "\n", "The DeepFace architecture (Taigman et al., 2014) consists of layers that\n", - "deal with *translation* and *rotational* invariances. These layers are\n", - "followed by three locally-connected layers and two fully-connected\n", - "layers. Color illustrates feature maps produced at each layer. The\n", - "neural network includes more than 120 million parameters, where more\n", - "than 95% come from the local and fully connected layers." - ] + "deal with *translation* invariances, known as convolutional layers.\n", + "These layers are followed by three locally-connected layers and two\n", + "fully-connected layers. Color illustrates feature maps produced at each\n", + "layer. The neural network includes more than 120 million parameters,\n", + "where more than 95% come from the local and fully connected layers." + ], + "id": "0c0296a0-481d-4260-91bc-4e3db4738ba1" }, { "cell_type": "markdown", @@ -2380,7 +2807,11 @@ "source": [ "### Deep Learning as Pinball\n", "\n", - "\n", + "\\[edit\\]\n", + "\n", + "\n", "\n", "Figure: Deep learning models are composition of simple functions. We\n", "can think of a pinball machine as an analogy. Each layer of pins\n", @@ -2408,41 +2839,15 @@ "the decision: a classification of the input object.\n", "\n", "An image has more than one number associated with it, so it is like\n", - "playing pinball in a *hyper-space*." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pods\n", - "from ipywidgets import IntSlider" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "pods.notebook.display_plots('pinball{sample:0>3}.svg', \n", - " directory='.',\n", - " sample=IntSlider(1, 1, 2, 1))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", + "playing pinball in a *hyper-space*.\n", + "\n", + "\n", "\n", "Figure: At initialization, the pins, which represent the parameters\n", "of the function, aren’t in the right place to bring the balls to the\n", "correct decisions.\n", "\n", - "\n", + "\n", "\n", "Figure: After learning the pins are now in the right place to bring\n", "the balls to the correct decisions.\n", @@ -2467,14 +2872,14 @@ "of possible paths for the ball through the machine. This helps to make\n", "them more data efficient and gives some robustness to adversarial\n", "examples." 
- ] + ], + "id": "1ace69b4-8e46-47ba-810b-de1f3ba5a386" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Encoding Knowledge\n", - "------------------\n", + "## Encoding Knowledge\n", "\n", "Knowledge that is not encoded in the prediction function must be learned\n", "through data. So any unspecified invariance (such as rotational or scale\n", @@ -2487,14 +2892,18 @@ "Unfortunately many invariances are non-trivial to incorporate and many\n", "machine learning algorithms focus on simpler concepts such as linearity\n", "or smoothness." - ] + ], + "id": "6a718029-1218-40eb-b07c-c6758d5a2ab7" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Parameter Estimation: Objective Functions\n", - "-----------------------------------------\n", + "## Parameter Estimation: Objective Functions\n", + "\n", + "\\[edit\\]\n", "\n", "Once we have a set of features, and the class of functions we use is\n", "determined, we need to find the parameters of the model.\n", @@ -2543,14 +2952,18 @@ "about the population that we don’t want our models to have. For example,\n", "if we design a face detector using Californians may not perform well\n", "when deployed in Kampala, Uganda." - ] + ], + "id": "e13f7a32-f947-4c3d-9ed6-0f6af02e9bf3" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Generalization and Overfitting\n", - "------------------------------\n", + "## Generalization and Overfitting\n", + "\n", + "\\[edit\\]\n", "\n", "Once a supervised learning system is trained it can be placed in a\n", "sequential pipeline to automate a process that used to be done manually.\n", @@ -2570,15 +2983,20 @@ "systems given only its training data is known as its *generalization*\n", "ability. This is the system’s ability to predict in areas where it\n", "hasn’t previously seen data." 
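The following small synthetic example (separate from the Olympic marathon analysis below) illustrates the generalization point numerically: as the polynomial degree grows, the training error keeps shrinking, while the error on held out points typically stops improving and can get worse.

``` python
import numpy as np

rng = np.random.default_rng(0)
x_toy = np.linspace(0, 1, 30)
y_toy = np.sin(2*np.pi*x_toy) + 0.2*rng.standard_normal(x_toy.shape)

# Interleave the points into a training set and a held-out validation set.
x_tr, y_tr = x_toy[::2], y_toy[::2]
x_va, y_va = x_toy[1::2], y_toy[1::2]

for degree in [1, 3, 6, 9]:
    coeffs = np.polyfit(x_tr, y_tr, degree)
    train_err = np.mean((np.polyval(coeffs, x_tr) - y_tr)**2)
    valid_err = np.mean((np.polyval(coeffs, x_va) - y_va)**2)
    print(f"degree {degree}: train error {train_err:.3f}, validation error {valid_err:.3f}")
```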
- ] + ], + "id": "53de6c61-c8f8-446e-98bc-3e40ed1a7702" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Hold Out Validation on Olympic Marathon Data\n", - "--------------------------------------------" - ] + "## Hold Out Validation on Olympic Marathon Data\n", + "\n", + "\\[edit\\]" + ], + "id": "425f4006-dd14-464d-afd1-9b8efeac2341" }, { "cell_type": "code", @@ -2586,9 +3004,10 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot\n", + "import mlai.plot as plot\n", "import mlai" - ] + ], + "id": "5adf571a-abbf-4354-a3ac-9c3510d94adb" }, { "cell_type": "code", @@ -2599,7 +3018,8 @@ "data_limits=xlim\n", "basis = mlai.Basis(mlai.polynomial, number=1, data_limits=data_limits)\n", "max_basis = 11" - ] + ], + "id": "bde20820-f545-4e18-807c-079c80f6e008" }, { "cell_type": "code", @@ -2613,7 +3033,8 @@ " permute=False, objective_ylim=[0, 0.8], \n", " xlim=data_limits, prefix='olympic_val_extra', \n", " diagrams='./ml')" - ] + ], + "id": "b2748703-fee0-4ea9-a97b-495ba1d87758" }, { "cell_type": "code", @@ -2621,9 +3042,20 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "d32986a9-7b20-43ba-b741-0072a37d1fbb" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import notutils as nu" + ], + "id": "7340c357-53b0-41ac-89c9-ea4c8404b923" }, { "cell_type": "code", @@ -2631,36 +3063,38 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('olympic_val_extra_LM_polynomial_number{num_basis:0>3}.svg', \n", + "nu.display_plots('olympic_val_extra_LM_polynomial_number{num_basis:0>3}.svg', \n", " directory='./ml', \n", " num_basis=IntSlider(1, 1, max_basis, 1))" - ] + ], + "id": "9069af87-0001-493a-bdbd-1ac96b9f2256" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Olympic marathon data with validation error for\n", "extrapolation." 
- ] + ], + "id": "cbfa2012-ff62-4d36-8aa7-2297054fbb99" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Extrapolation\n", - "-------------" - ] + "## Extrapolation" + ], + "id": "0812c04b-6400-40be-8c8b-1b620e154988" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Interpolation\n", - "-------------" - ] + "## Interpolation" + ], + "id": "dfe1915e-64ba-45dd-84c7-04abc69d810d" }, { "cell_type": "code", @@ -2668,8 +3102,9 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "2d83c4fe-9592-45c8-abdd-2f3cb388e864" }, { "cell_type": "code", @@ -2682,7 +3117,8 @@ " xlim=data_limits, prefix='olympic_val_inter', \n", " objective_ylim=[0.1, 0.6], permute=True,\n", " diagrams='./ml')" - ] + ], + "id": "d76c707f-77b0-4685-ae7f-50348350bb1f" }, { "cell_type": "code", @@ -2690,9 +3126,20 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "7addac2a-cfc6-4e3d-ba6a-dddbd47e302a" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import notutils as nu" + ], + "id": "d989ef7b-a43d-42c8-9286-6db56d08c0af" }, { "cell_type": "code", @@ -2700,35 +3147,36 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('olympic_val_inter_LM_polynomial_number{num_basis:0>3}.svg', \n", + "nu.display_plots('olympic_val_inter_LM_polynomial_number{num_basis:0>3}.svg', \n", " directory='./ml', \n", " num_basis=IntSlider(1, 1, max_basis, 1))" - ] + ], + "id": "b59b2697-5e33-4e89-a0d1-3fb11a8ac883" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Olympic marathon data with validation error for\n", "interpolation." - ] + ], + "id": "c0e581bf-2c62-49cd-ab1a-89ebcf70436e" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Choice of Validation Set\n", - "------------------------" - ] + "## Choice of Validation Set" + ], + "id": "da946f13-a766-46c9-948b-2062fd5aa181" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Hold Out Data\n", - "-------------\n", + "## Hold Out Data\n", "\n", "You have a conclusion as to which model fits best under the training\n", "error, but how do the two models perform in terms of validation? In this\n", @@ -2739,7 +3187,8 @@ "end of the time series. This means that we are validating on future\n", "predictions. We will hold out data from after 1980 and fit the model to\n", "the data before 1980." - ] + ], + "id": "b8f80e63-32c2-4212-bd5c-d73ba0162271" }, { "cell_type": "code", @@ -2757,18 +3206,20 @@ "# Create a hold out set\n", "x_valid = np.take(x, indices_hold_out, axis=0)\n", "y_valid = np.take(y, indices_hold_out, axis=0)" - ] + ], + "id": "034011c6-70d8-4108-91d6-faff141a5fc8" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Exercise 3\n", + "### Exercise 2\n", "\n", "For both the linear and quadratic models, fit the model to the data up\n", "until 1980 and then compute the error on the held out data (from 1980\n", "onwards). Which model performs better on the validation data?" 
- ] + ], + "id": "a852ebf7-066a-45b8-bc0c-f8b7714e53e8" }, { "cell_type": "code", @@ -2776,27 +3227,30 @@ "metadata": {}, "outputs": [], "source": [ - "# Write your answer to Exercise 3 here\n", + "# Write your answer to Exercise 2 here\n", + "\n", + "\n", "\n", "\n" - ] + ], + "id": "66865dc3-1b9c-4a21-a201-fe708e79b48b" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Richer Basis Set\n", - "----------------\n", + "## Richer Basis Set\n", "\n", "Now we have an approach for deciding which model to retain, we can\n", "consider the entire family of polynomial bases, with arbitrary degrees." - ] + ], + "id": "584fb21e-4c64-4b3a-b26a-1fdff78874a8" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Exercise 4\n", + "### Exercise 3\n", "\n", "Now we are going to build a more sophisticated form of basis function,\n", "one that can accept arguments to its inputs (similar to those we used in\n", @@ -2824,7 +3278,8 @@ "\n", "Which polynomial has the minimum training error? Which polynomial has\n", "the minimum validation error?" - ] + ], + "id": "a6c156e5-fee4-4ed7-bbe3-081b73e0c27c" }, { "cell_type": "code", @@ -2832,27 +3287,61 @@ "metadata": {}, "outputs": [], "source": [ - "# Write your answer to Exercise 4 here\n", + "# Write your answer to Exercise 3 here\n", + "\n", + "\n", "\n", "\n" - ] + ], + "id": "eba7b4b5-1e20-477d-bdd2-9237efacb665" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Bias Variance Decomposition\n", - "---------------------------\n", - "\n", - "The bias-variance decomposition considers the expected test error for\n", - "different variations of the *training data* sampled from,\n", - "$\\Pr(\\mathbf{ y}, y)$ $$\n", - "\\mathbb{E}\\left[ \\left(y- f^*(\\mathbf{ y})\\right)^2 \\right].\n", - "$$ This can be decomposed into two parts, $$\n", - "\\mathbb{E}\\left[ \\left(y- f(\\mathbf{ y})\\right)^2 \\right] = \\text{bias}\\left[f^*(\\mathbf{ y})\\right]^2 + \\text{variance}\\left[f^*(\\mathbf{ y})\\right] +\\sigma^2,\n", + "## Bias Variance Decomposition\n", + "\n", + "\\[edit\\]\n", + "\n", + "One of Breiman’s ideas for improving predictive performance is known as\n", + "bagging (**Breiman:bagging96?**). The idea is to train a number of\n", + "models on the data such that they overfit (high variance). Then average\n", + "the predictions of these models. The models are trained on different\n", + "bootstrap samples (Efron, 1979) and their predictions are aggregated\n", + "giving us the acronym, Bagging. By combining decision trees with\n", + "bagging, we recover random forests (Breiman, 2001).\n", + "\n", + "Bias and variance can also be estimated through Efron’s bootstrap\n", + "(Efron, 1979), and the traditional view has been that there’s a form of\n", + "Goldilocks effect, where the best predictions are given by the model\n", + "that is ‘just right’ for the amount of data available. Not to simple,\n", + "not too complex. The idea is that bias decreases with increasing model\n", + "complexity and variance increases with increasing model complexity.\n", + "Typically plots begin with the Mummy bear on the left (too much bias)\n", + "end with the Daddy bear on the right (too much variance) and show a dip\n", + "in the middle where the Baby bear (just) right finds themselves.\n", + "\n", + "The Daddy bear is typically positioned at the point where the model can\n", + "exactly interpolate the data. 
For a generalized linear model (McCullagh\n", + "and Nelder, 1989), this is the point at which the number of parameters\n", + "is equal to the number of data[1].\n", + "\n", + "The bias-variance decomposition (**Geman:biasvariance92?**) considers\n", + "the expected test error for different variations of the *training data*\n", + "sampled from, $\\mathbb{P}(\\mathbf{ x}, y)$ $$\\begin{align*}\n", + "R(\\mathbf{ w}) = & \\int \\left(y- f^*(\\mathbf{ x})\\right)^2 \\mathbb{P}(y, \\mathbf{ x}) \\text{d}y\\text{d}\\mathbf{ x}\\\\\n", + "& \\triangleq \\mathbb{E}\\left[ \\left(y- f^*(\\mathbf{ x})\\right)^2 \\right].\n", + "\\end{align*}$$\n", + "\n", + "This can be decomposed into two parts, $$\n", + "\\begin{align*}\n", + "\\mathbb{E}\\left[ \\left(y- f(\\mathbf{ x})\\right)^2 \\right] = & \\text{bias}\\left[f^*(\\mathbf{ x})\\right]^2 + \\text{variance}\\left[f^*(\\mathbf{ x})\\right] +\\sigma^2,\n", + "\\end{align*}\n", "$$ where the bias is given by $$\n", - " \\text{bias}\\left[f^*(\\mathbf{ y})\\right] =\n", - "\\mathbb{E}\\left[f^*(\\mathbf{ y})\\right] * f(\\mathbf{ y})\n", + " \\text{bias}\\left[f^*(\\mathbf{ x})\\right] =\n", + "\\mathbb{E}\\left[f^*(\\mathbf{ x})\\right] - f(\\mathbf{ x})\n", "$$ and it summarizes error that arises from the model’s inability to\n", "represent the underlying complexity of the data. For example, if we were\n", "to model the marathon pace of the winning runner from the Olympics by\n", @@ -2861,9 +3350,9 @@ "changing (typically getting faster).\n", "\n", "The variance term is given by $$\n", - " \\text{variance}\\left[f^*(\\mathbf{ y})\\right] = \\mathbb{E}\\left[\\left(f^*(\\mathbf{ y}) - \\mathbb{E}\\left[f^*(\\mathbf{ y})\\right]\\right)^2\\right].\n", + " \\text{variance}\\left[f^*(\\mathbf{ x})\\right] = \\mathbb{E}\\left[\\left(f^*(\\mathbf{ x}) - \\mathbb{E}\\left[f^*(\\mathbf{ x})\\right]\\right)^2\\right].\n", " $$ The variance term is often described as arising from a model that\n", - "is too complex, but we have to be careful with this idea. Is the model\n", + "is too complex, but we must be careful with this idea. Is the model\n", "really too complex relative to the real world that generates the data?\n", "The real world is a complex place, and it is rare that we are\n", "constructing mathematical models that are more complex than the world\n", @@ -2873,18 +3362,26 @@ "\n", "Models that exhibit high variance are sometimes said to ‘overfit’ the\n", "data whereas models that exhibit high bias are sometimes described as\n", - "‘underfitting’ the data." - ] + "‘underfitting’ the data.\n", + "\n", + "[1] Assuming we are ignoring parameters in the link function and the\n", + "distribution function." + ], + "id": "63624ab4-041a-45f7-b946-1e495c8370f0" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Bias vs Variance Error Plots\n", - "----------------------------\n", + "## Bias vs Variance Error Plots\n", + "\n", + "\\[edit\\]\n", "\n", "Helper function for sampling data from two different classes." 
- ] + ], + "id": "e45d82ef-8887-4e6e-8e9f-f4a71785ca9b" }, { "cell_type": "code", @@ -2893,7 +3390,8 @@ "outputs": [], "source": [ "import numpy as np" - ] + ], + "id": "aab7dd96-6e9b-4ec8-a645-ef3c2a58cb79" }, { "cell_type": "code", @@ -2921,14 +3419,16 @@ " X.append(np.random.multivariate_normal(mean=mean, cov=neg_cov, size=per_class))\n", " y.append(np.zeros((per_class, 1)))\n", " return np.vstack(X), np.vstack(y).flatten()" - ] + ], + "id": "20c656f1-eb1e-4f7a-9e43-3cdd586ca96b" }, { "cell_type": "markdown", "metadata": {}, "source": [ "Helper function for plotting the decision boundary of the SVM." - ] + ], + "id": "f2fb1716-bc03-4b2c-93a6-f53a484ea553" }, { "cell_type": "code", @@ -2956,7 +3456,8 @@ " levels=[Z.min(), 0, Z.max()], \n", " colors=[[0.5, 1.0, 0.5], [1.0, 0.5, 0.5]])\n", " return out" - ] + ], + "id": "2630022b-fc32-4e03-a44b-2d192cb4c5de" }, { "cell_type": "code", @@ -2965,7 +3466,8 @@ "outputs": [], "source": [ "import urllib.request" - ] + ], + "id": "9b9ff14a-80ac-44fa-838c-460f461496f6" }, { "cell_type": "code", @@ -2974,7 +3476,8 @@ "outputs": [], "source": [ "urllib.request.urlretrieve('https://raw.githubusercontent.com/lawrennd/talks/gh-pages/mlai.py','mlai.py')" - ] + ], + "id": "f5b556c7-f847-44b5-a153-c2fbba6f9056" }, { "cell_type": "code", @@ -2984,7 +3487,8 @@ "source": [ "import mlai\n", "import os" - ] + ], + "id": "7a17f3c0-2d9a-49dd-95f6-4982a185f251" }, { "cell_type": "code", @@ -3024,7 +3528,8 @@ " figure=fig,\n", " transparent=True)\n", " return xlim, ylim" - ] + ], + "id": "c2c16735-97ba-4b84-a657-44b94f6eb362" }, { "cell_type": "code", @@ -3039,7 +3544,8 @@ "\n", "matplotlib.rc('font', **font)\n", "import matplotlib.pyplot as plt" - ] + ], + "id": "0a824e91-4588-481f-ba67-91bf714af76c" }, { "cell_type": "code", @@ -3048,7 +3554,8 @@ "outputs": [], "source": [ "from sklearn import svm" - ] + ], + "id": "f52940f2-43d0-48bd-b550-ee32b00c7283" }, { "cell_type": "code", @@ -3082,7 +3589,8 @@ " titles=titles,\n", " xlim=xlim,\n", " ylim=ylim)" - ] + ], + "id": "8f4c89ab-7d19-4aa8-8c7a-b8bd3d7b24f6" }, { "cell_type": "code", @@ -3090,9 +3598,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "02dce558-5a9f-429d-a251-214b9052391e" }, { "cell_type": "code", @@ -3100,10 +3609,21 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('bias-variance{samp:0>3}.svg', \n", + "import notutils as nu" + ], + "id": "7e7c8d35-89df-44ba-9f7e-768b1c272838" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nu.display_plots('bias-variance{samp:0>3}.svg', \n", " directory='./ml', \n", " samp=IntSlider(0,0,10,1))" - ] + ], + "id": "a7592e70-22f2-4d98-a8ac-274382834dbb" }, { "cell_type": "markdown", @@ -3111,22 +3631,23 @@ "source": [ "\n", "\n", - "\n", + "\n", "\n", "Figure: In each figure the simpler model is on the left, and the more\n", "complex model is on the right. Each fit is done to a different version\n", "of the data set. The simpler model is more consistent in its errors\n", "(bias error), whereas the more complex model is varying in its errors\n", "(variance error)." 
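The figures above illustrate the effect with classifiers; the same decomposition can be probed numerically with a small synthetic regression (an added sketch, separate from the SVM-based figures): refit models of low and high complexity on many resampled training sets, then measure the squared bias and the variance of their predictions. Typically the simpler model shows the larger bias and the more complex model the larger variance.

``` python
import numpy as np

rng = np.random.default_rng(1)
def true_function(u):
    return np.sin(2*np.pi*u)

x_grid = np.linspace(0, 1, 50)
for degree in [1, 9]:
    preds = []
    for _ in range(200):
        # Each pass uses a freshly sampled training set from the same distribution.
        x_sample = rng.uniform(0, 1, 25)
        y_sample = true_function(x_sample) + 0.3*rng.standard_normal(25)
        preds.append(np.polyval(np.polyfit(x_sample, y_sample, degree), x_grid))
    preds = np.array(preds)
    bias_squared = np.mean((preds.mean(axis=0) - true_function(x_grid))**2)
    variance = np.mean(preds.var(axis=0))
    print(f"degree {degree}: bias^2 = {bias_squared:.3f}, variance = {variance:.3f}")
```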
- ] + ], + "id": "d7fca466-c7a1-42fc-8791-190d4036feae" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Overfitting\n", - "-----------" - ] + "## Overfitting" + ], + "id": "fc2f32a3-5bd4-4ef3-a93a-af313c2f17d4" }, { "cell_type": "code", @@ -3136,7 +3657,8 @@ "source": [ "from IPython.lib.display import YouTubeVideo\n", "YouTubeVideo('py8QrZPT48s')" - ] + ], + "id": "ac21743e-61f0-49ee-807c-a3982561952f" }, { "cell_type": "markdown", @@ -3180,18 +3702,23 @@ "checking their test performance more times than was permitted by the\n", "challenge rules. This was then reported as “AI’s first doping scandal”.\n", "The team lead was fired by Baidu." - ] + ], + "id": "fdebad1f-4149-4ead-a294-665625e7d83f" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Olympic Data with Bayesian Polynomials\n", - "--------------------------------------\n", + "## Olympic Data with Bayesian Polynomials\n", + "\n", + "\\[edit\\]\n", "\n", "Five fold cross validation tests the ability of the model to\n", "*interpolate*." - ] + ], + "id": "a3b5846d-ab1a-48f9-8970-c8fce9edbea6" }, { "cell_type": "code", @@ -3201,7 +3728,8 @@ "source": [ "import mlai\n", "import pods" - ] + ], + "id": "fbc45598-7eb0-49cd-9989-149f81a5c8c3" }, { "cell_type": "code", @@ -3209,11 +3737,12 @@ "metadata": {}, "outputs": [], "source": [ - "data_limits = [1892, 2020]\n", + "data_limits = (1888, 2020)\n", "basis = mlai.Basis(mlai.polynomial, number=1, data_limits=data_limits)\n", "\n", "max_basis = y.shape[0]" - ] + ], + "id": "85f83c4c-78e8-4db5-9edd-a6e0aa61a732" }, { "cell_type": "code", @@ -3221,8 +3750,9 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "1d1dc50f-5ede-4a6a-8d77-e3315bd9cb9b" }, { "cell_type": "code", @@ -3239,7 +3769,8 @@ " xlim=data_limits, \n", " objective_ylim=[0.5,1.6]\n", " diagrams='./ml')" - ] + ], + "id": "a02f847c-8ea4-4b3e-bb90-972e50c8052c" }, { "cell_type": "code", @@ -3247,9 +3778,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "4539adcf-d410-44f8-b60f-8975c3cfa6c3" }, { "cell_type": "code", @@ -3257,32 +3789,44 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('olympic_BLM_polynomial_number{num_basis:0>3}.svg', \n", + "import notutils as nu" + ], + "id": "a37e9c61-a993-4023-8b94-2287bd4c109a" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nu.display_plots('olympic_BLM_polynomial_number{num_basis:0>3}.svg', \n", " directory='./ml/', \n", " num_basis=IntSlider(1, 1, 27, 1))" - ] + ], + "id": "938f00a0-c3fd-46d7-bdd9-d57a2b666e98" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Bayesian fit with 26th degree polynomial and negative\n", "marginal log likelihood." - ] + ], + "id": "3c6cc03a-18a1-471c-8bad-7509d3d70ee5" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Hold Out Validation\n", - "-------------------\n", + "## Hold Out Validation\n", "\n", "For the polynomial fit, we will now look at *hold out* validation, where\n", "we are holding out some of the most recent points. This tests the abilit\n", "of our model to *extrapolate*." 
- ] + ], + "id": "8e785875-b1af-434a-9a50-36f4d6fcffea" }, { "cell_type": "code", @@ -3290,8 +3834,9 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "c176a7a2-8359-453c-911e-ad2c26d15cb9" }, { "cell_type": "code", @@ -3308,7 +3853,8 @@ " xlim=data_limits, \n", " objective_ylim=[0.1,0.6], \n", " permute=False)" - ] + ], + "id": "97db8789-4bc4-4dc3-a143-3e732632cce6" }, { "cell_type": "code", @@ -3316,9 +3862,20 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "840390ff-1445-4529-99e5-8afd80452fa7" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import notutils as nu" + ], + "id": "7002fad1-540f-4477-92fa-64f6508ba7ba" }, { "cell_type": "code", @@ -3326,31 +3883,33 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('olympic_val_BLM_polynomial_number{num_basis:0>3}.svg', \n", + "nu.display_plots('olympic_val_BLM_polynomial_number{num_basis:0>3}.svg', \n", " directory='./ml', \n", " num_basis=IntSlider(1, 1, 27, 1))" - ] + ], + "id": "914352a9-7a16-4c9d-ab43-f52d7a83c1f4" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Bayesian fit with 26th degree polynomial and hold out\n", "validation scores." - ] + ], + "id": "b74070af-8494-4b11-a75a-b49e65f89d00" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "5-fold Cross Validation\n", - "-----------------------\n", + "## 5-fold Cross Validation\n", "\n", "Five fold cross validation tests the ability of the model to\n", "*interpolate*." - ] + ], + "id": "466bd4dd-92b0-4a08-9f19-895ee1f47de1" }, { "cell_type": "code", @@ -3368,7 +3927,8 @@ " xlim=data_limits, \n", " objective_ylim=[0.2,0.6], \n", " num_parts=num_parts)" - ] + ], + "id": "ebcdaa96-a91d-410f-a668-a916ae79e2b2" }, { "cell_type": "code", @@ -3376,9 +3936,20 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "e6dc3ae3-b5ce-47b8-93df-d5ec525cfd57" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import notutils as nu" + ], + "id": "ee916e8b-2556-4a6d-b0eb-498ec7141185" }, { "cell_type": "code", @@ -3386,30 +3957,52 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('olympic_5cv{part:0>2}_BLM_polynomial_number{num_basis:0>3}.svg', \n", + "nu.display_plots('olympic_5cv{part:0>2}_BLM_polynomial_number{num_basis:0>3}.svg', \n", " directory='./ml', \n", " part=(0, 5), \n", " num_basis=IntSlider(1, 1, 27, 1))" - ] + ], + "id": "337b3cb4-6ed4-4ae3-a506-f6ef3f40ec00" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Bayesian fit with 26th degree polynomial and five fold cross\n", "validation scores.\n", "\n", "" - ] + ], + "id": "9eef1049-191c-427b-868c-3fc49763780b" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Unsupervised Learning\n", + "\n", + "\\[edit\\]" + ], + "id": "db8deb78-cb13-415b-b8d0-0cfaaf1acc6c" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Unsupervised Learning\n", - "=====================\n", + "## Unsupervised Learning\n", + "\n", + "Supervised learning is when your data is provided with labels. Now we\n", + "are going to turn to a different form of learning, commonly known as\n", + "*unsupervised* learning. 
In unsupervised learning our data isn’t\n", + "necessarily labelled in any form, but we want models that give us a\n", + "better understanding of the data. We’ve actually seen an example of this\n", + "already with , which we introduces in the context of *objective\n", + "functions*. Now we will introduce a more probabilistic approach to such\n", + "models, specifically we are interested in *latent variable* modelling.\n", "\n", "In unsupervised learning you have data, $\\mathbf{ x}$, but no labels\n", "$y$. The aim in unsupervised learning is to extract structure from data.\n", @@ -3418,14 +4011,14 @@ "driven by the labels. Supervised learning algorithms try and focus on\n", "the aspects of the data which are relevant to predicting the labels. But\n", "in unsupervised learning there are no labels." - ] + ], + "id": "96f942db-bfc0-4f79-8346-910cf2fcbc7c" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Context\n", - "-------\n", + "## Context\n", "\n", "Humans can easily sort a number of objects into objects that share\n", "similar characteristics. We easily categorize animals or vehicles. But\n", @@ -3438,14 +4031,14 @@ "algorithm that can go through its entire list of products and\n", "automatically sort them into groups such that similar products are\n", "located together." - ] + ], + "id": "018faf47-8e39-415b-b0de-b283afb1a018" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Discrete vs Continuous\n", - "----------------------\n", + "## Discrete vs Continuous\n", "\n", "Supervised learning is broadly divided into classification: i.e. wake\n", "word classification in the Amazon Echo, and regression, e.g. shelf life\n", @@ -3453,15 +4046,239 @@ "broadly split into methods that cluster the data (i.e. provide a\n", "discrete label) and methods that represent the data as a continuous\n", "value." - ] + ], + "id": "2bc98508-b769-42c3-a84c-deab4daf067c" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Clustering\n", - "----------\n", + "## Clustering\n", + "\n", + "\\[edit\\]\n", + "\n", + "- One common approach, not deeply covered in this course.\n", + "\n", + "- Associate each data point, $\\mathbf{ y}_{i, :}$ with one of $k$\n", + " different discrete groups.\n", + "\n", + "- For example:\n", + "\n", + " - Clustering animals into discrete groups. Are animals discrete or\n", + " continuous?\n", + " - Clustering into different different *political* affiliations.\n", + "\n", + "- Humans do seem to like clusters:\n", + "\n", + " - Very useful when interacting with biologists.\n", + "\n", + "- Subtle difference between clustering and *vector quantisation*\n", + "\n", + "- Little anecdote.\n", + "\n", + "- To my mind difference is in clustering there should be a reduction\n", + " in data density between samples.\n", + "\n", + "- This definition is not universally applied.\n", + "\n", + "- For today’s purposes we merge them:\n", + "\n", + " - Determine how to allocate each point to a group and *harder*\n", + " total number of groups.\n", + "\n", + "- Simple algorithm for allocating points to groups.\n", + "\n", + "- *Require*: Set of $k$ cluster centres & assignment of each points to\n", + " a cluster.\n", + "\n", + "1. Initialize cluster centres as randomly selected data points.\n", + " 1. Assign each data point to *nearest* cluster centre.\n", + " 2. Update each cluster centre by setting it to the mean of assigned\n", + " data points.\n", + " 3. 
Repeat 2 and 3 until cluster allocations do not change.\n", + "\n", + "- This minimizes the objective $$\n", + " E=\\sum_{j=1}^K \\sum_{i\\ \\text{allocated to}\\ j} \\left(\\mathbf{ y}_{i, :} - \\boldsymbol{ \\mu}_{j, :}\\right)^\\top\\left(\\mathbf{ y}_{i, :} - \\boldsymbol{ \\mu}_{j, :}\\right)\n", + " $$ *i.e.* it minimizes thesum of Euclidean squared distances betwen\n", + " points and their associated centres.\n", + "- The minimum is *not* guaranteed to be *global* or *unique*.\n", + "- This objective is a non-convex optimization problem." + ], + "id": "bdc56e87-d8f2-484e-8907-ee491e88fed1" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai\n", + "import numpy as np" + ], + "id": "0aac7a9d-fac4-47af-b653-e44903a865ad" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def write_plot(counter, caption):\n", + " directory = \"./ml\"\n", + " filestub = f\"kmeans_clustering_{counter:0>3}\"\n", + " mlai.write_figure(filestub+\".svg\", directory=directory)\n", + " f = open(os.path.join(directory,filestub) + '.md', 'w')\n", + " f.write(caption)\n", + " f.close()" + ], + "id": "2822a82e-2a6f-4f11-b9ce-5239214694a1" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(figsize=(5,5))\n", + "fontsize = 20\n", + "\n", + "num_clust_points = 30\n", + "\n", + "Y = np.vstack([np.random.normal(size=(num_clust_points, 2)) + 2.5,\n", + " np.random.normal(size=(num_clust_points, 2)) - 2.5,\n", + " np.random.normal(size=(num_clust_points, 2)) + np.array([2.5, -2.5])])\n", + "\n", + "centre_inds = np.random.permutation(Y.shape[0])[:3]\n", + "centres = Y[centre_inds, :]\n", + "\n", + "ax.cla()\n", + "\n", + "ax.plot(Y[:, 0], Y[:, 1], '.', color=[0, 0, 0], markersize=10)\n", + "ax.set_xlabel('$y_1$')\n", + "ax.set_ylabel('$y_2$')\n", + "ax.set_title('Data')\n", + "counter=0\n", + "write_plot(counter, 'Data set to be analyzed. 
Initialize cluster centres.')\n", + "ax.plot(centres[:, 0], centres[:, 1], 'o', color=[0,0,0], linewidth=3, markersize=12) \n", + "counter+=1\n", + "write_plot(counter, 'Allocate each point to the cluster with the nearest centre')\n", + "i = 0\n", + "\n", + "for i in range(6):\n", + " dist_mat = ((Y[:, :, None] - centres.T[None, :, :])**2).sum(1)\n", + " ind = dist_mat.argmin(1)\n", + " ax.cla()\n", + " ax.plot(Y[ind==0, 0], Y[ind==0, 1], 'x', color= [1, 0, 0], markersize=10)\n", + " ax.plot(Y[ind==1, 0], Y[ind==1, 1], 'o', color=[0, 1, 0], markersize=10)\n", + " ax.plot(Y[ind==2, 0], Y[ind==2, 1], '+', color=[0, 0, 1], markersize=10)\n", + " c = ax.plot(centres[:, 0], centres[:, 1], 'o', color=[0,0, 0], markersize=12, linewidth=3)\n", + " ax.set_xlabel('$y_1$',fontsize=fontsize)\n", + " ax.set_ylabel('$y_2$',fontsize=fontsize)\n", + " ax.set_title('Iteration ' + str(i))\n", + " counter+=1\n", + " write_plot(counter, 'Update each centre by setting to the mean of the allocated points.')\n", + " for j in range(centres.shape[0]):\n", + " centres[j, :] = np.mean(Y[ind==j, :], 0)\n", + " c[0].set_data(centres[:, 0], centres[:, 1])\n", + " counter+=1\n", + " mlai.write_figure(f\"kmeans_clustering_{counter:0>3}.svg\", directory=\"./ml\")\n", + " write_plot(counter, 'Allocate each data point to the nearest cluster centre.')" + ], + "id": "4132c077-4cab-4f9e-9a30-0749f6a55d20" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import notutils as nu" + ], + "id": "c079f4a5-2742-4dfd-9baf-1f6f377ba5bb" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nu.display_plots(\"kmeans_clustering_{counter:0>3}.svg\", directory=\"./ml\", \n", + " text_top='kmeans_clustering_{counter:0>3}.tex', counter=(0, 13))" + ], + "id": "23ad9cab-4417-47f1-a930-7331f526cf3c" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import mlai" + ], + "id": "7c927c7e-af89-4cd0-b3b5-cfa64dd24401" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(figsize=(5,5))\n", + "\n", + "num_centres = 20\n", + "num_data = 200\n", + "centres = np.random.normal(size=(num_centres, 2))\n", + "w = np.random.normal(size=(num_centres, 2))*0.1\n", + "alloc = np.random.randint(0, num_centres, size=(num_data))\n", + "sigma = np.random.normal(size=(num_centres, 1))*0.05\n", + "epsilon = np.random.normal(size=(num_data,2))*sigma[alloc, :]\n", "\n", + "Y = w[alloc, :]*np.random.normal(size=(num_data, 1)) + centres[alloc, :] + epsilon\n", + "\n", + "ax.plot(Y[:, 0], Y[:, 1], 'rx')\n", + "ax.set_xlabel('$y_1$', fontsize=20)\n", + "ax.set_ylabel('$y_2$', fontsize=20)\n", + "\n", + "mlai.write_figure(\"cluster_data00.svg\", directory=\"./ml/\")\n", + "pi_vals = np.linspace(-np.pi, np.pi, 200)[:, None]\n", + "for i in range(num_centres):\n", + " ax.plot(centres[i, 0], centres[i, 1], 'o', markersize=5, color=[0, 0, 0], linewidth=2)\n", + " x = np.hstack([np.sin(pi_vals), np.cos(pi_vals)])\n", + " L = np.linalg.cholesky(np.outer(w[i, :],w[i, :]) + sigma[i]**2*np.eye(2))\n", + " el = np.dot(x, L.T)\n", + " ax.plot(centres[i, 0] + el[:, 0], centres[i, 1] + el[:, 1], linewidth=2, color=[0,0,0])\n", + "mlai.write_figure(\"cluster_data01.svg\", directory=\"./ml/\")" + ], + "id": "8c36cea7-29fc-4ff4-b251-5834110152de" + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import notutils as nu" + ], + "id": "37ecf7ee-72b1-4c87-92b5-1c90ea68d7ab" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nu.display_plots('cluster_data{counter:0>2}.svg', directory='./ml', counter=(0, 1))" + ], + "id": "a0ebdb56-207f-47ca-9a3f-99be61be6852" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "Clustering methods associate each data point with a different label.\n", "Unlike in classification the label is not provided by a human annotator.\n", "It is allocated by the computer. Clustering is quite intuitive for\n", @@ -3505,7 +4322,8 @@ "by the products they’ve purchased in the past. This could be a binary\n", "vector $\\mathbf{ x}_i$. We can then define a distance between the\n", "cluster center and the customer." - ] + ], + "id": "2d7fa02d-8ddd-4059-b67c-af33e8ad6ae7" }, { "cell_type": "markdown", @@ -3528,7 +4346,8 @@ "$$ where the notation $\\mathbf{i}_j$ represents all the indices of each\n", "data point which has been allocated to the $j$th cluster represented by\n", "the center $\\boldsymbol{ \\mu}_j$." - ] + ], + "id": "42091476-bd3c-48a2-8efa-2e5e22379680" }, { "cell_type": "markdown", @@ -3545,7 +4364,8 @@ "the initial choice of centers. For more technical details on $k$-means\n", "clustering you can watch a video of Alex Ihler introducing the algorithm\n", "here." - ] + ], + "id": "8b9ee964-6c94-4f80-866f-cbf277d97586" }, { "cell_type": "markdown", @@ -3553,10 +4373,11 @@ "source": [ "### $k$-Means Clustering\n", "\n", - "\n", + "\n", "\n", "Figure: Clustering with the $k$-means clustering algorithm." - ] + ], + "id": "6abfb53a-eb34-4a82-ae3c-1d1d9a3e20d1" }, { "cell_type": "code", @@ -3566,14 +4387,16 @@ "source": [ "from IPython.lib.display import YouTubeVideo\n", "YouTubeVideo('mfqmoUN-Cuw')" - ] + ], + "id": "1578ca65-db0e-4c3c-a862-e08c27c93d5a" }, { "cell_type": "markdown", "metadata": {}, "source": [ "Figure: $k$-means clustering by Alex Ihler." - ] + ], + "id": "d4e1962a-a756-4c57-a949-948f3dfddb90" }, { "cell_type": "markdown", @@ -3584,7 +4407,8 @@ "Other approaches to clustering involve forming taxonomies of the cluster\n", "centers, like humans apply to animals, to form trees. You can learn more\n", "about agglomerative clustering in this video from Alex Ihler." - ] + ], + "id": "df900dac-3b70-42db-baa6-b0bff1228aa9" }, { "cell_type": "code", @@ -3594,14 +4418,16 @@ "source": [ "from IPython.lib.display import YouTubeVideo\n", "YouTubeVideo('OcoE7JlbXvY')" - ] + ], + "id": "17793767-587b-4a98-bb19-5e28a4d6451d" }, { "cell_type": "markdown", "metadata": {}, "source": [ "Figure: Hierarchical Clustering by Alex Ihler." - ] + ], + "id": "e24a72d7-f362-4fa1-b8a6-f5a7de1d5a50" }, { "cell_type": "markdown", @@ -3624,7 +4450,8 @@ "center that is then allowed to evolve over time through a mutation rate.\n", "The time of separation between different species is estimated via these\n", "mutation rates." - ] + ], + "id": "d0d96411-64bf-4f86-b4a1-a623bf0a29b2" }, { "cell_type": "markdown", @@ -3641,7 +4468,8 @@ "for example running shoes should be in more than one group, they are\n", "‘sporting goods’ and they are ‘apparel’. A tree structure doesn’t allow\n", "this allocation." - ] + ], + "id": "a71b99e6-73b6-408d-85dc-c3fcd17e8d88" }, { "cell_type": "markdown", @@ -3653,20 +4481,41 @@ "cognitive science. 
Researchers like Josh Tenenbaum have developed\n", "algorithms that decompose data in more complex ways, but they can\n", "normally only be applied to smaller data sets." - ] + ], + "id": "7b2f2e9f-ac0b-407b-8414-2f0f29d528ad" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Other Clustering Approaches\n", + "\n", + "- Spectral clustering (Shi and Malik (2000),Ng et al. (n.d.))\n", + " - Allows clusters which aren’t convex hulls.\n", + "- Dirichlet process\n", + " - A probabilistic formulation for a clustering algorithm that is\n", + " *non-parametric*.\n", + " - Loosely speaking it allows infinite clusters\n", + " - In practice useful for dealing with previously unknown species\n", + " (e.g. a “Black Swan Event”)." + ], + "id": "504c9e5f-a33b-427b-9415-a5ba8316b2fc" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Dimensionality Reduction\n", - "------------------------\n", + "## Dimensionality Reduction\n", + "\n", + "\\[edit\\]\n", "\n", "Dimensionality reduction methods compress the data by replacing the\n", "original data with a reduced number of continuous variables. One way of\n", "thinking of these methods is to imagine a marionette.\n", "\n", - "\n", + "\n", "\n", "Figure: Thinking of dimensionality reduction as a marionette. We\n", "observe the high dimensional pose of the puppet, $\\mathbf{ x}$, but the\n", @@ -3694,7 +4543,8 @@ "It assumes that the data we observe is generated from some lower\n", "dimensional underlying process. It then seeks to recover the values\n", "associated with this low dimensional process." - ] + ], + "id": "77d9b15b-809a-45bd-ab55-b28d09832c5b" }, { "cell_type": "markdown", @@ -3733,8 +4583,9 @@ "[word2vec](https://arxiv.org/abs/1301.3781) algorithm performed a\n", "dimensionality reduction on words, now you can take any word and map it\n", "to a latent space where similar words exhibit similar characteristics. A\n", - "personality space for words." - ] + "‘personality space’ for words." + ], + "id": "798ac42c-aa78-46d2-9a79-c4536edf48ed" }, { "cell_type": "markdown", @@ -3761,7 +4612,7 @@ "all subjects, but the subject’s IQ is assumed to differ leading to\n", "different scores for each subject.\n", "\n", - "\n", + "\n", "\n", "Figure: Visualization of the first two principal components of an\n", "artificial data set. The data was generated by taking an image of a\n", @@ -3770,7 +4621,8 @@ "The underlying circular shape is derived from the rotation of the data.\n", "Each image in the data set is projected on to the location its projected\n", "to in the latent space." - ] + ], + "id": "f0d7bbf8-9680-4dc0-a20f-4166f54ffa89" }, { "cell_type": "markdown", @@ -3795,7 +4647,8 @@ "preference for outdoor job). In factor analysis the parameters\n", "$\\mathbf{W}$ are known as the factor *loadings* and in PCA they are\n", "known as the principal components." - ] + ], + "id": "9b97ba7e-4911-4b06-acfe-9ebb6aeb12a1" }, { "cell_type": "markdown", @@ -3871,14 +4724,18 @@ "matrix (which would be impossible to compute) embeds similarities\n", "between pages according to how far apart they are via a random walk\n", "along the linkage matrix." - ] + ], + "id": "68c2731d-87e8-400d-9749-fbe856707163" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Reinforcement Learning\n", - "======================\n", + "# Reinforcement Learning\n", + "\n", + "\\[edit\\]\n", "\n", "The final domain of learning we will review is known as reinforcement\n", "learning. 
The domain of reinforcement learning is one that many\n", @@ -3894,14 +4751,14 @@ "but the reward is normally delayed. There may have been many actions\n", "that affected the outcome, but which actions had a positive effect and\n", "which a negative effect?" - ] + ], + "id": "4c9e9b24-b47d-4d81-acd7-76364eca301a" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "“Reward”\n", - "--------\n", + "## “Reward”\n", "\n", "- In reinforcement learning some context is given, in the form of a\n", " reward. But it is often *delayed*\n", @@ -3926,14 +4783,14 @@ "agent’s ability to interact with the user and understand intent.\n", "However, they are not yet mature enough to be deployed in this\n", "application." - ] + ], + "id": "220dc80e-1910-4286-94a2-15d8d045110e" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Game Play\n", - "---------\n", + "## Game Play\n", "\n", "An area where reinforcement learning methods have been deployed with\n", "high profile success is game play. In game play the reward is delayed to\n", @@ -3976,14 +4833,14 @@ "as the use of fast compute to generate and process very large quantities\n", "of data. In its standard form it is not seen as a very data-efficient\n", "approach." - ] + ], + "id": "9f5a6d8d-d4a7-47bb-b64a-d3decc2d69a5" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "AlphaGo\n", - "-------\n", + "## AlphaGo\n", "\n", "The ancient Chinese game of Go was considered a challenge for artificial\n", "intelligence for two reasons. Firstly, the game tree has a very high\n", @@ -4025,14 +4882,14 @@ "as the use of fast compute to generate and process very large quantities\n", "of data. In its standard form it is not seen as a very data-efficient\n", "approach." - ] + ], + "id": "590435e3-3e62-49b5-b89e-42d7e1353cc0" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Reinforcement Learning and Classical Control\n", - "--------------------------------------------\n", + "## Reinforcement Learning and Classical Control\n", "\n", "An alternative approach to reinforcement learning is to use a prediction\n", "function to suggest how the world will evolve in response to your\n", @@ -4058,14 +4915,14 @@ "of control and reinforcement learning. Results at this interface could\n", "be very important for improving the quality of robotic and drone\n", "control." - ] + ], + "id": "dcc993f9-d31a-4ff2-83be-09eab6b78903" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Optimization Methods\n", - "--------------------\n", + "## Optimization Methods\n", "\n", "As we implied above, reinforcement learning can also used to improve\n", "user experience. In that case the reward is gained when the user buys a\n", @@ -4089,28 +4946,28 @@ "mathematical function. For example, what is the mathematical function\n", "that relates a user’s experience to the probability that they will buy a\n", "product?" - ] + ], + "id": "e7bc7c5e-49ab-4661-94a3-ed848876da83" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Bayesian Optimization\n", - "---------------------\n", + "## Bayesian Optimization\n", "\n", "One approach to these problems is to use machine learning methods to\n", "develop a *surrogate model* for the optimization task. The surrogate\n", "model is a prediction function that attempts to recreate the process we\n", "are finding hard to model. We try to simultaneously fit the surrogate\n", "model and optimize the process." 
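+ ],
+ "id": "added-surrogate-model-sketch"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To make the surrogate idea a little more concrete, here is a minimal, hypothetical\n",
+ "sketch. It is not the probabilistic approach used in practice (where the surrogate is\n",
+ "usually a Gaussian process and an acquisition function such as expected improvement\n",
+ "trades off exploration and exploitation); it simply fits a cheap polynomial surrogate\n",
+ "to the evaluations made so far and proposes the next evaluation where that surrogate\n",
+ "is smallest. The objective function below is made up for illustration.\n",
+ "\n",
+ "``` python\n",
+ "import numpy as np\n",
+ "\n",
+ "np.random.seed(0)\n",
+ "\n",
+ "def objective(x):\n",
+ "    # Stand-in for an expensive process we want to minimize (hypothetical).\n",
+ "    return np.sin(3*x) + 0.1*x**2\n",
+ "\n",
+ "x_grid = np.linspace(-3, 3, 201)       # candidate inputs\n",
+ "X = list(np.random.uniform(-3, 3, 3))  # a few initial evaluations\n",
+ "F = [objective(x) for x in X]\n",
+ "\n",
+ "for i in range(10):\n",
+ "    # Fit a cheap surrogate (a low-order polynomial) to the data so far.\n",
+ "    coeffs = np.polyfit(X, F, deg=min(3, len(X)-1))\n",
+ "    surrogate = np.polyval(coeffs, x_grid)\n",
+ "    # Evaluate next where the surrogate is lowest, with a little jitter\n",
+ "    # so the search does not collapse onto a single point.\n",
+ "    x_next = x_grid[np.argmin(surrogate)] + 0.05*np.random.randn()\n",
+ "    X.append(x_next)\n",
+ "    F.append(objective(x_next))\n",
+ "\n",
+ "best = int(np.argmin(F))\n",
+ "print('best x', X[best], 'objective value', F[best])\n",
+ "```"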
- ] + ], + "id": "0a1d305c-0d47-4681-a7c3-191f49a475dd" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Surrogate Models\n", - "----------------\n", + "## Surrogate Models\n", "\n", "Bayesian optimization methods use a *surrogate model* (normally a\n", "specific form of regression model). They use this to predict how the\n", @@ -4122,14 +4979,14 @@ "model of the real world. In bandit methods strategies are determined\n", "without turning to a model to motivate them. They are *model free*\n", "methods." - ] + ], + "id": "b862bcf6-dff0-4bec-b76b-f7e6ebab5b8f" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Model-Based and Model Free: Performance\n", - "---------------------------------------\n", + "## Model-Based and Model Free: Performance\n", "\n", "Because of their different philosophies, if a class of prediction\n", "functions is chosen, then a model-based approach might have better\n", @@ -4148,14 +5005,18 @@ "rules. The important characteristic of machine learning is that the form\n", "of these functions, as dictated by their parameters, is determined by\n", "acquiring data from the real world." - ] + ], + "id": "13469678-c455-4668-a1c7-73a43fe3295f" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Deployment\n", - "----------\n", + "## Deployment\n", + "\n", + "\\[edit\\]\n", "\n", "The methods we have introduced are roughly speaking introduced in order\n", "of difficulty of deployment. While supervised learning is more involved\n", @@ -4174,14 +5035,14 @@ "organizations I’ve been proposing “Data Readiness Levels”. More needs to\n", "be done in this area to improve the efficiency of the data science\n", "pipeline." - ] + ], + "id": "72d3285a-3263-47e3-b5db-3ab7083b66a1" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Where to Deploy?\n", - "----------------\n", + "## Where to Deploy?\n", "\n", "In relation to what AI can and can’t do today Andrew Ng is quoted as\n", "saying:\n", @@ -4193,14 +5054,14 @@ "[1] The quote can be found in the Harvard Business Review Article [“What\n", "Artificial Intelligence Can and Can’t Do Right\n", "Now”](https://hbr.org/2016/11/what-artificial-intelligence-can-and-cant-do-right-now)." - ] + ], + "id": "2b4e6e9d-2732-41f6-9cf4-1e08dde93143" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Is this Right?\n", - "--------------\n", + "## Is this Right?\n", "\n", "I would broadly agree with this quote but only in the context of\n", "supervised learning. If a human expert takes around that amount of time,\n", @@ -4261,14 +5122,14 @@ "[1] This trend was very clear at the moment, [I spoke about\n", "it](%7B%7Bsite.baseurl%20%7D%7D/) at a recent Dagstuhl workshop on new\n", "directions for kernel methods and Gaussian processes." - ] + ], + "id": "49e5091c-c50d-495b-a49c-3e4466abf473" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Model Choice\n", - "------------\n", + "## Model Choice\n", "\n", "Common to all machine learning methods is the initial choice of useful\n", "classes of functions. 
The deep learning revolution is associated with a\n", @@ -4282,14 +5143,14 @@ "\n", "\n", "" - ] + ], + "id": "bc6d953c-dcb4-42c2-8f9a-948cfab2834e" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Thanks!\n", - "-------\n", + "## Thanks!\n", "\n", "For more information on these subjects and more you might want to check\n", "the following resources.\n", @@ -4300,15 +5161,16 @@ " Page](http://www.theguardian.com/profile/neil-lawrence)\n", "- blog:\n", " [http://inverseprobability.com](http://inverseprobability.com/blog.html)" - ] + ], + "id": "41041846-5d71-473e-9623-3b38e14b49a7" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "References\n", - "----------" - ] + "## References" + ], + "id": "7d55b180-d558-4509-99a9-0253d9c02257" }, { "cell_type": "markdown", @@ -4319,11 +5181,17 @@ "territory delimitation. Malaria Journal 13.\n", "\n", "\n", + "Breiman, L., 2001. Random forests. Mach. Learn. 45, 5–32.\n", + "\n", + "\n", "Cooper, B., 1991. Transformation of a valley: Derbyshire derwent.\n", "Scarthin Books.\n", "\n", - "Gelman, A., Carlin, J.B., Stern, H.S., Rubin, D.B., 2013. Bayesian data\n", - "analysis, 3rd ed. Chapman; Hall.\n", + "Efron, B., 1979. Bootstrap methods: Another look at the jackkife. Annals\n", + "of Statistics 7, 1–26.\n", + "\n", + "Gelman, A., Carlin, J.B., Stern, H.S., Dunson, D.B., Vehtari, A., Rubin,\n", + "D.B., 2013. Bayesian data analysis, 3rd ed. Chapman; Hall.\n", "\n", "Gething, P.W., Noor, A.M., Gikandi, P.W., Ogara, E.A.A., Hay, S.I.,\n", "Nixon, M.S., Snow, R.W., Atkinson, P.M., 2006. Improving imperfect data\n", @@ -4331,25 +5199,40 @@ "geostatistics. PLoS Medicine 3.\n", "\n", "\n", - "Lawrence, N.D., 2015. How Africa can benefit from the data revolution.\n", + "Lawrence, N.D., 2015. [How Africa can benefit from the data\n", + "revolution](https://www.theguardian.com/media-network/2015/aug/25/africa-benefit-data-science-information).\n", + "\n", + "McCullagh, P., Nelder, J.A., 1989. Generalized linear models, 2nd ed.\n", + "Chapman; Hall.\n", "\n", "McCulloch, W.S., Pitts, W., 1943. A logical calculus of the ideas\n", "immanent in nervous activity. Bulletin of Mathematical Biophysics 5,\n", - "115–133.\n", + "115–133. \n", "\n", "Mubangizi, M., Andrade-Pacheco, R., Smith, M.T., Quinn, J., Lawrence,\n", "N.D., 2014. Malaria surveillance with multiple data sources using\n", "Gaussian process models, in: 1st International Conference on the Use of\n", "Mobile ICT in Africa.\n", "\n", + "Ng, A.Y., Jordan, M.I., Weiss, Y., n.d. On spectral clustering: Analysis\n", + "and an algorithm.\n", + "\n", "Robbins, H., Monro, S., 1951. A stochastic approximation method. Annals\n", "of Mathematical Statistics 22, 400–407.\n", "\n", + "Shi, J., Malik, J., 2000. Normalized cuts and image segmentation. IEEE\n", + "Transactions on Pattern Analysis and Machine Intelligence 22, 888–905.\n", + "\n", "Taigman, Y., Yang, M., Ranzato, M., Wolf, L., 2014. DeepFace: Closing\n", "the gap to human-level performance in face verification, in: Proceedings\n", "of the IEEE Computer Society Conference on Computer Vision and Pattern\n", - "Recognition. " - ] + "Recognition. \n", + "\n", + "The Office of the Senior Special Assistant to the President on the\n", + "Millennium Development Goals (OSSAP-MDGs), Columbia University, 2014.\n", + "Nigeria NMIS facility database." 
+ ], + "id": "86df9150-8e0e-45ea-bc39-d874932f1b10" } ], "nbformat": 4, diff --git a/_notebooks/02-ml-systems.ipynb b/_notebooks/02-ml-systems.ipynb index 852a618..3ef769e 100644 --- a/_notebooks/02-ml-systems.ipynb +++ b/_notebooks/02-ml-systems.ipynb @@ -2,11 +2,9 @@ "cells": [ { "cell_type": "markdown", - "id": "38ac04d8", "metadata": {}, "source": [ - "Introduction to Machine Learning Systems\n", - "========================================\n", + "# Introduction to Machine Learning Systems\n", "\n", "### [Eric Meissner](https://www.linkedin.com/in/meissnereric/)\n", "\n", @@ -15,11 +13,11 @@ "### [Neil D. Lawrence](http://inverseprobability.com)\n", "\n", "### 2020-07-24" - ] + ], + "id": "a54aaab6-a9a8-4bd0-b81e-4ba898a5d5ec" }, { "cell_type": "markdown", - "id": "a8abc322", "metadata": {}, "source": [ "**Abstract**: This notebook introduces some of the challenges of\n", @@ -30,311 +28,24 @@ "concepts, not to authoritatively answer any questions about the state of\n", "Nigerian health facilities or Covid19, but it may give you ideas about\n", "how to try and do that in your own country." - ] + ], + "id": "48accf03-6f37-4508-a971-c9f0f8fbd53d" }, { "cell_type": "markdown", - "id": "53590e4b", "metadata": {}, "source": [ "$$\n", - "\\newcommand{\\tk}[1]{}\n", - "\\newcommand{\\Amatrix}{\\mathbf{A}}\n", - "\\newcommand{\\KL}[2]{\\text{KL}\\left( #1\\,\\|\\,#2 \\right)}\n", - "\\newcommand{\\Kaast}{\\kernelMatrix_{\\mathbf{ \\ast}\\mathbf{ \\ast}}}\n", - "\\newcommand{\\Kastu}{\\kernelMatrix_{\\mathbf{ \\ast} \\inducingVector}}\n", - "\\newcommand{\\Kff}{\\kernelMatrix_{\\mappingFunctionVector \\mappingFunctionVector}}\n", - "\\newcommand{\\Kfu}{\\kernelMatrix_{\\mappingFunctionVector \\inducingVector}}\n", - "\\newcommand{\\Kuast}{\\kernelMatrix_{\\inducingVector \\bf\\ast}}\n", - "\\newcommand{\\Kuf}{\\kernelMatrix_{\\inducingVector \\mappingFunctionVector}}\n", - "\\newcommand{\\Kuu}{\\kernelMatrix_{\\inducingVector \\inducingVector}}\n", - "\\newcommand{\\Kuui}{\\Kuu^{-1}}\n", - "\\newcommand{\\Qaast}{\\mathbf{Q}_{\\bf \\ast \\ast}}\n", - "\\newcommand{\\Qastf}{\\mathbf{Q}_{\\ast \\mappingFunction}}\n", - "\\newcommand{\\Qfast}{\\mathbf{Q}_{\\mappingFunctionVector \\bf \\ast}}\n", - "\\newcommand{\\Qff}{\\mathbf{Q}_{\\mappingFunctionVector \\mappingFunctionVector}}\n", - "\\newcommand{\\aMatrix}{\\mathbf{A}}\n", - "\\newcommand{\\aScalar}{a}\n", - "\\newcommand{\\aVector}{\\mathbf{a}}\n", - "\\newcommand{\\acceleration}{a}\n", - "\\newcommand{\\bMatrix}{\\mathbf{B}}\n", - "\\newcommand{\\bScalar}{b}\n", - "\\newcommand{\\bVector}{\\mathbf{b}}\n", - "\\newcommand{\\basisFunc}{\\phi}\n", - "\\newcommand{\\basisFuncVector}{\\boldsymbol{ \\basisFunc}}\n", - "\\newcommand{\\basisFunction}{\\phi}\n", - "\\newcommand{\\basisLocation}{\\mu}\n", - "\\newcommand{\\basisMatrix}{\\boldsymbol{ \\Phi}}\n", - "\\newcommand{\\basisScalar}{\\basisFunction}\n", - "\\newcommand{\\basisVector}{\\boldsymbol{ \\basisFunction}}\n", - "\\newcommand{\\activationFunction}{\\phi}\n", - "\\newcommand{\\activationMatrix}{\\boldsymbol{ \\Phi}}\n", - "\\newcommand{\\activationScalar}{\\basisFunction}\n", - "\\newcommand{\\activationVector}{\\boldsymbol{ \\basisFunction}}\n", - "\\newcommand{\\bigO}{\\mathcal{O}}\n", - "\\newcommand{\\binomProb}{\\pi}\n", - "\\newcommand{\\cMatrix}{\\mathbf{C}}\n", - "\\newcommand{\\cbasisMatrix}{\\hat{\\boldsymbol{ \\Phi}}}\n", - "\\newcommand{\\cdataMatrix}{\\hat{\\dataMatrix}}\n", - "\\newcommand{\\cdataScalar}{\\hat{\\dataScalar}}\n", - 
"\\newcommand{\\cdataVector}{\\hat{\\dataVector}}\n", - "\\newcommand{\\centeredKernelMatrix}{\\mathbf{ \\MakeUppercase{\\centeredKernelScalar}}}\n", - "\\newcommand{\\centeredKernelScalar}{b}\n", - "\\newcommand{\\centeredKernelVector}{\\centeredKernelScalar}\n", - "\\newcommand{\\centeringMatrix}{\\mathbf{H}}\n", - "\\newcommand{\\chiSquaredDist}[2]{\\chi_{#1}^{2}\\left(#2\\right)}\n", - "\\newcommand{\\chiSquaredSamp}[1]{\\chi_{#1}^{2}}\n", - "\\newcommand{\\conditionalCovariance}{\\boldsymbol{ \\Sigma}}\n", - "\\newcommand{\\coregionalizationMatrix}{\\mathbf{B}}\n", - "\\newcommand{\\coregionalizationScalar}{b}\n", - "\\newcommand{\\coregionalizationVector}{\\mathbf{ \\coregionalizationScalar}}\n", - "\\newcommand{\\covDist}[2]{\\text{cov}_{#2}\\left(#1\\right)}\n", - "\\newcommand{\\covSamp}[1]{\\text{cov}\\left(#1\\right)}\n", - "\\newcommand{\\covarianceScalar}{c}\n", - "\\newcommand{\\covarianceVector}{\\mathbf{ \\covarianceScalar}}\n", - "\\newcommand{\\covarianceMatrix}{\\mathbf{C}}\n", - "\\newcommand{\\covarianceMatrixTwo}{\\boldsymbol{ \\Sigma}}\n", - "\\newcommand{\\croupierScalar}{s}\n", - "\\newcommand{\\croupierVector}{\\mathbf{ \\croupierScalar}}\n", - "\\newcommand{\\croupierMatrix}{\\mathbf{ \\MakeUppercase{\\croupierScalar}}}\n", - "\\newcommand{\\dataDim}{p}\n", - "\\newcommand{\\dataIndex}{i}\n", - "\\newcommand{\\dataIndexTwo}{j}\n", - "\\newcommand{\\dataMatrix}{\\mathbf{Y}}\n", - "\\newcommand{\\dataScalar}{y}\n", - "\\newcommand{\\dataSet}{\\mathcal{D}}\n", - "\\newcommand{\\dataStd}{\\sigma}\n", - "\\newcommand{\\dataVector}{\\mathbf{ \\dataScalar}}\n", - "\\newcommand{\\decayRate}{d}\n", - "\\newcommand{\\degreeMatrix}{\\mathbf{ \\MakeUppercase{\\degreeScalar}}}\n", - "\\newcommand{\\degreeScalar}{d}\n", - "\\newcommand{\\degreeVector}{\\mathbf{ \\degreeScalar}}\n", - "\\newcommand{\\diag}[1]{\\text{diag}\\left(#1\\right)}\n", - "\\newcommand{\\diagonalMatrix}{\\mathbf{D}}\n", - "\\newcommand{\\diff}[2]{\\frac{\\text{d}#1}{\\text{d}#2}}\n", - "\\newcommand{\\diffTwo}[2]{\\frac{\\text{d}^2#1}{\\text{d}#2^2}}\n", - "\\newcommand{\\displacement}{x}\n", - "\\newcommand{\\displacementVector}{\\textbf{\\displacement}}\n", - "\\newcommand{\\distanceMatrix}{\\mathbf{ \\MakeUppercase{\\distanceScalar}}}\n", - "\\newcommand{\\distanceScalar}{d}\n", - "\\newcommand{\\distanceVector}{\\mathbf{ \\distanceScalar}}\n", - "\\newcommand{\\eigenvaltwo}{\\ell}\n", - "\\newcommand{\\eigenvaltwoMatrix}{\\mathbf{L}}\n", - "\\newcommand{\\eigenvaltwoVector}{\\mathbf{l}}\n", - "\\newcommand{\\eigenvalue}{\\lambda}\n", - "\\newcommand{\\eigenvalueMatrix}{\\boldsymbol{ \\Lambda}}\n", - "\\newcommand{\\eigenvalueVector}{\\boldsymbol{ \\lambda}}\n", - "\\newcommand{\\eigenvector}{\\mathbf{ \\eigenvectorScalar}}\n", - "\\newcommand{\\eigenvectorMatrix}{\\mathbf{U}}\n", - "\\newcommand{\\eigenvectorScalar}{u}\n", - "\\newcommand{\\eigenvectwo}{\\mathbf{v}}\n", - "\\newcommand{\\eigenvectwoMatrix}{\\mathbf{V}}\n", - "\\newcommand{\\eigenvectwoScalar}{v}\n", - "\\newcommand{\\entropy}[1]{\\mathcal{H}\\left(#1\\right)}\n", - "\\newcommand{\\errorFunction}{E}\n", - "\\newcommand{\\expDist}[2]{\\left<#1\\right>_{#2}}\n", - "\\newcommand{\\expSamp}[1]{\\left<#1\\right>}\n", - "\\newcommand{\\expectation}[1]{\\left\\langle #1 \\right\\rangle }\n", - "\\newcommand{\\expectationDist}[2]{\\left\\langle #1 \\right\\rangle _{#2}}\n", - "\\newcommand{\\expectedDistanceMatrix}{\\mathcal{D}}\n", - "\\newcommand{\\eye}{\\mathbf{I}}\n", - "\\newcommand{\\fantasyDim}{r}\n", - 
"\\newcommand{\\fantasyMatrix}{\\mathbf{ \\MakeUppercase{\\fantasyScalar}}}\n", - "\\newcommand{\\fantasyScalar}{z}\n", - "\\newcommand{\\fantasyVector}{\\mathbf{ \\fantasyScalar}}\n", - "\\newcommand{\\featureStd}{\\varsigma}\n", - "\\newcommand{\\gammaCdf}[3]{\\mathcal{GAMMA CDF}\\left(#1|#2,#3\\right)}\n", - "\\newcommand{\\gammaDist}[3]{\\mathcal{G}\\left(#1|#2,#3\\right)}\n", - "\\newcommand{\\gammaSamp}[2]{\\mathcal{G}\\left(#1,#2\\right)}\n", - "\\newcommand{\\gaussianDist}[3]{\\mathcal{N}\\left(#1|#2,#3\\right)}\n", - "\\newcommand{\\gaussianSamp}[2]{\\mathcal{N}\\left(#1,#2\\right)}\n", - "\\newcommand{\\given}{|}\n", - "\\newcommand{\\half}{\\frac{1}{2}}\n", - "\\newcommand{\\heaviside}{H}\n", - "\\newcommand{\\hiddenMatrix}{\\mathbf{ \\MakeUppercase{\\hiddenScalar}}}\n", - "\\newcommand{\\hiddenScalar}{h}\n", - "\\newcommand{\\hiddenVector}{\\mathbf{ \\hiddenScalar}}\n", - "\\newcommand{\\identityMatrix}{\\eye}\n", - "\\newcommand{\\inducingInputScalar}{z}\n", - "\\newcommand{\\inducingInputVector}{\\mathbf{ \\inducingInputScalar}}\n", - "\\newcommand{\\inducingInputMatrix}{\\mathbf{Z}}\n", - "\\newcommand{\\inducingScalar}{u}\n", - "\\newcommand{\\inducingVector}{\\mathbf{ \\inducingScalar}}\n", - "\\newcommand{\\inducingMatrix}{\\mathbf{U}}\n", - "\\newcommand{\\inlineDiff}[2]{\\text{d}#1/\\text{d}#2}\n", - "\\newcommand{\\inputDim}{q}\n", - "\\newcommand{\\inputMatrix}{\\mathbf{X}}\n", - "\\newcommand{\\inputScalar}{x}\n", - "\\newcommand{\\inputSpace}{\\mathcal{X}}\n", - "\\newcommand{\\inputVals}{\\inputVector}\n", - "\\newcommand{\\inputVector}{\\mathbf{ \\inputScalar}}\n", - "\\newcommand{\\iterNum}{k}\n", - "\\newcommand{\\kernel}{\\kernelScalar}\n", - "\\newcommand{\\kernelMatrix}{\\mathbf{K}}\n", - "\\newcommand{\\kernelScalar}{k}\n", - "\\newcommand{\\kernelVector}{\\mathbf{ \\kernelScalar}}\n", - "\\newcommand{\\kff}{\\kernelScalar_{\\mappingFunction \\mappingFunction}}\n", - "\\newcommand{\\kfu}{\\kernelVector_{\\mappingFunction \\inducingScalar}}\n", - "\\newcommand{\\kuf}{\\kernelVector_{\\inducingScalar \\mappingFunction}}\n", - "\\newcommand{\\kuu}{\\kernelVector_{\\inducingScalar \\inducingScalar}}\n", - "\\newcommand{\\lagrangeMultiplier}{\\lambda}\n", - "\\newcommand{\\lagrangeMultiplierMatrix}{\\boldsymbol{ \\Lambda}}\n", - "\\newcommand{\\lagrangian}{L}\n", - "\\newcommand{\\laplacianFactor}{\\mathbf{ \\MakeUppercase{\\laplacianFactorScalar}}}\n", - "\\newcommand{\\laplacianFactorScalar}{m}\n", - "\\newcommand{\\laplacianFactorVector}{\\mathbf{ \\laplacianFactorScalar}}\n", - "\\newcommand{\\laplacianMatrix}{\\mathbf{L}}\n", - "\\newcommand{\\laplacianScalar}{\\ell}\n", - "\\newcommand{\\laplacianVector}{\\mathbf{ \\ell}}\n", - "\\newcommand{\\latentDim}{q}\n", - "\\newcommand{\\latentDistanceMatrix}{\\boldsymbol{ \\Delta}}\n", - "\\newcommand{\\latentDistanceScalar}{\\delta}\n", - "\\newcommand{\\latentDistanceVector}{\\boldsymbol{ \\delta}}\n", - "\\newcommand{\\latentForce}{f}\n", - "\\newcommand{\\latentFunction}{u}\n", - "\\newcommand{\\latentFunctionVector}{\\mathbf{ \\latentFunction}}\n", - "\\newcommand{\\latentFunctionMatrix}{\\mathbf{ \\MakeUppercase{\\latentFunction}}}\n", - "\\newcommand{\\latentIndex}{j}\n", - "\\newcommand{\\latentScalar}{z}\n", - "\\newcommand{\\latentVector}{\\mathbf{ \\latentScalar}}\n", - "\\newcommand{\\latentMatrix}{\\mathbf{Z}}\n", - "\\newcommand{\\learnRate}{\\eta}\n", - "\\newcommand{\\lengthScale}{\\ell}\n", - "\\newcommand{\\rbfWidth}{\\ell}\n", - "\\newcommand{\\likelihoodBound}{\\mathcal{L}}\n", - 
"\\newcommand{\\likelihoodFunction}{L}\n", - "\\newcommand{\\locationScalar}{\\mu}\n", - "\\newcommand{\\locationVector}{\\boldsymbol{ \\locationScalar}}\n", - "\\newcommand{\\locationMatrix}{\\mathbf{M}}\n", - "\\newcommand{\\variance}[1]{\\text{var}\\left( #1 \\right)}\n", - "\\newcommand{\\mappingFunction}{f}\n", - "\\newcommand{\\mappingFunctionMatrix}{\\mathbf{F}}\n", - "\\newcommand{\\mappingFunctionTwo}{g}\n", - "\\newcommand{\\mappingFunctionTwoMatrix}{\\mathbf{G}}\n", - "\\newcommand{\\mappingFunctionTwoVector}{\\mathbf{ \\mappingFunctionTwo}}\n", - "\\newcommand{\\mappingFunctionVector}{\\mathbf{ \\mappingFunction}}\n", - "\\newcommand{\\scaleScalar}{s}\n", - "\\newcommand{\\mappingScalar}{w}\n", - "\\newcommand{\\mappingVector}{\\mathbf{ \\mappingScalar}}\n", - "\\newcommand{\\mappingMatrix}{\\mathbf{W}}\n", - "\\newcommand{\\mappingScalarTwo}{v}\n", - "\\newcommand{\\mappingVectorTwo}{\\mathbf{ \\mappingScalarTwo}}\n", - "\\newcommand{\\mappingMatrixTwo}{\\mathbf{V}}\n", - "\\newcommand{\\maxIters}{K}\n", - "\\newcommand{\\meanMatrix}{\\mathbf{M}}\n", - "\\newcommand{\\meanScalar}{\\mu}\n", - "\\newcommand{\\meanTwoMatrix}{\\mathbf{M}}\n", - "\\newcommand{\\meanTwoScalar}{m}\n", - "\\newcommand{\\meanTwoVector}{\\mathbf{ \\meanTwoScalar}}\n", - "\\newcommand{\\meanVector}{\\boldsymbol{ \\meanScalar}}\n", - "\\newcommand{\\mrnaConcentration}{m}\n", - "\\newcommand{\\naturalFrequency}{\\omega}\n", - "\\newcommand{\\neighborhood}[1]{\\mathcal{N}\\left( #1 \\right)}\n", - "\\newcommand{\\neilurl}{http://inverseprobability.com/}\n", - "\\newcommand{\\noiseMatrix}{\\boldsymbol{ E}}\n", - "\\newcommand{\\noiseScalar}{\\epsilon}\n", - "\\newcommand{\\noiseVector}{\\boldsymbol{ \\epsilon}}\n", - "\\newcommand{\\norm}[1]{\\left\\Vert #1 \\right\\Vert}\n", - "\\newcommand{\\normalizedLaplacianMatrix}{\\hat{\\mathbf{L}}}\n", - "\\newcommand{\\normalizedLaplacianScalar}{\\hat{\\ell}}\n", - "\\newcommand{\\normalizedLaplacianVector}{\\hat{\\mathbf{ \\ell}}}\n", - "\\newcommand{\\numActive}{m}\n", - "\\newcommand{\\numBasisFunc}{m}\n", - "\\newcommand{\\numComponents}{m}\n", - "\\newcommand{\\numComps}{K}\n", - "\\newcommand{\\numData}{n}\n", - "\\newcommand{\\numFeatures}{K}\n", - "\\newcommand{\\numHidden}{h}\n", - "\\newcommand{\\numInducing}{m}\n", - "\\newcommand{\\numLayers}{\\ell}\n", - "\\newcommand{\\numNeighbors}{K}\n", - "\\newcommand{\\numSequences}{s}\n", - "\\newcommand{\\numSuccess}{s}\n", - "\\newcommand{\\numTasks}{m}\n", - "\\newcommand{\\numTime}{T}\n", - "\\newcommand{\\numTrials}{S}\n", - "\\newcommand{\\outputIndex}{j}\n", - "\\newcommand{\\paramVector}{\\boldsymbol{ \\theta}}\n", - "\\newcommand{\\parameterMatrix}{\\boldsymbol{ \\Theta}}\n", - "\\newcommand{\\parameterScalar}{\\theta}\n", - "\\newcommand{\\parameterVector}{\\boldsymbol{ \\parameterScalar}}\n", - "\\newcommand{\\partDiff}[2]{\\frac{\\partial#1}{\\partial#2}}\n", - "\\newcommand{\\precisionScalar}{j}\n", - "\\newcommand{\\precisionVector}{\\mathbf{ \\precisionScalar}}\n", - "\\newcommand{\\precisionMatrix}{\\mathbf{J}}\n", - "\\newcommand{\\pseudotargetScalar}{\\widetilde{y}}\n", - "\\newcommand{\\pseudotargetVector}{\\mathbf{ \\pseudotargetScalar}}\n", - "\\newcommand{\\pseudotargetMatrix}{\\mathbf{ \\widetilde{Y}}}\n", - "\\newcommand{\\rank}[1]{\\text{rank}\\left(#1\\right)}\n", - "\\newcommand{\\rayleighDist}[2]{\\mathcal{R}\\left(#1|#2\\right)}\n", - "\\newcommand{\\rayleighSamp}[1]{\\mathcal{R}\\left(#1\\right)}\n", - "\\newcommand{\\responsibility}{r}\n", - "\\newcommand{\\rotationScalar}{r}\n", - 
"\\newcommand{\\rotationVector}{\\mathbf{ \\rotationScalar}}\n", - "\\newcommand{\\rotationMatrix}{\\mathbf{R}}\n", - "\\newcommand{\\sampleCovScalar}{s}\n", - "\\newcommand{\\sampleCovVector}{\\mathbf{ \\sampleCovScalar}}\n", - "\\newcommand{\\sampleCovMatrix}{\\mathbf{s}}\n", - "\\newcommand{\\scalarProduct}[2]{\\left\\langle{#1},{#2}\\right\\rangle}\n", - "\\newcommand{\\sign}[1]{\\text{sign}\\left(#1\\right)}\n", - "\\newcommand{\\sigmoid}[1]{\\sigma\\left(#1\\right)}\n", - "\\newcommand{\\singularvalue}{\\ell}\n", - "\\newcommand{\\singularvalueMatrix}{\\mathbf{L}}\n", - "\\newcommand{\\singularvalueVector}{\\mathbf{l}}\n", - "\\newcommand{\\sorth}{\\mathbf{u}}\n", - "\\newcommand{\\spar}{\\lambda}\n", - "\\newcommand{\\trace}[1]{\\text{tr}\\left(#1\\right)}\n", - "\\newcommand{\\BasalRate}{B}\n", - "\\newcommand{\\DampingCoefficient}{C}\n", - "\\newcommand{\\DecayRate}{D}\n", - "\\newcommand{\\Displacement}{X}\n", - "\\newcommand{\\LatentForce}{F}\n", - "\\newcommand{\\Mass}{M}\n", - "\\newcommand{\\Sensitivity}{S}\n", - "\\newcommand{\\basalRate}{b}\n", - "\\newcommand{\\dampingCoefficient}{c}\n", - "\\newcommand{\\mass}{m}\n", - "\\newcommand{\\sensitivity}{s}\n", - "\\newcommand{\\springScalar}{\\kappa}\n", - "\\newcommand{\\springVector}{\\boldsymbol{ \\kappa}}\n", - "\\newcommand{\\springMatrix}{\\boldsymbol{ \\mathcal{K}}}\n", - "\\newcommand{\\tfConcentration}{p}\n", - "\\newcommand{\\tfDecayRate}{\\delta}\n", - "\\newcommand{\\tfMrnaConcentration}{f}\n", - "\\newcommand{\\tfVector}{\\mathbf{ \\tfConcentration}}\n", - "\\newcommand{\\velocity}{v}\n", - "\\newcommand{\\sufficientStatsScalar}{g}\n", - "\\newcommand{\\sufficientStatsVector}{\\mathbf{ \\sufficientStatsScalar}}\n", - "\\newcommand{\\sufficientStatsMatrix}{\\mathbf{G}}\n", - "\\newcommand{\\switchScalar}{s}\n", - "\\newcommand{\\switchVector}{\\mathbf{ \\switchScalar}}\n", - "\\newcommand{\\switchMatrix}{\\mathbf{S}}\n", - "\\newcommand{\\tr}[1]{\\text{tr}\\left(#1\\right)}\n", - "\\newcommand{\\loneNorm}[1]{\\left\\Vert #1 \\right\\Vert_1}\n", - "\\newcommand{\\ltwoNorm}[1]{\\left\\Vert #1 \\right\\Vert_2}\n", - "\\newcommand{\\onenorm}[1]{\\left\\vert#1\\right\\vert_1}\n", - "\\newcommand{\\twonorm}[1]{\\left\\Vert #1 \\right\\Vert}\n", - "\\newcommand{\\vScalar}{v}\n", - "\\newcommand{\\vVector}{\\mathbf{v}}\n", - "\\newcommand{\\vMatrix}{\\mathbf{V}}\n", - "\\newcommand{\\varianceDist}[2]{\\text{var}_{#2}\\left( #1 \\right)}\n", - "\\newcommand{\\vecb}[1]{\\left(#1\\right):}\n", - "\\newcommand{\\weightScalar}{w}\n", - "\\newcommand{\\weightVector}{\\mathbf{ \\weightScalar}}\n", - "\\newcommand{\\weightMatrix}{\\mathbf{W}}\n", - "\\newcommand{\\weightedAdjacencyMatrix}{\\mathbf{A}}\n", - "\\newcommand{\\weightedAdjacencyScalar}{a}\n", - "\\newcommand{\\weightedAdjacencyVector}{\\mathbf{ \\weightedAdjacencyScalar}}\n", - "\\newcommand{\\onesVector}{\\mathbf{1}}\n", - "\\newcommand{\\zerosVector}{\\mathbf{0}}\n", "$$" - ] + ], + "id": "3a82b391-6034-464f-b372-255e5e377487" }, { "cell_type": "markdown", - "id": "49df968d", "metadata": {}, "source": [ + "::: {.cell .markdown}\n", + "\n", "\n", "\n", "\n", @@ -344,21 +55,20 @@ "" - ] + ], + "id": "4972f8f1-cd6f-491a-a978-9accd63f2474" }, { "cell_type": "markdown", - "id": "04c8e7d8", "metadata": {}, "source": [ - "Question\n", - "--------\n", + "## Nigerian Health Facility Distribution\n", "\n", "In this notebook, we explore the question of health facility\n", "distribution in Nigeria, spatially, and in relation to population\n", "density.\n", "\n", - "We answer and 
visualize the question “How does the number of health\n", + "We explore and visualize the question “How does the number of health\n", "facilities per capita vary across Nigeria?”\n", "\n", "Rather than focussing purely on using tools like `pandas` to manipulate\n", @@ -367,20 +77,19 @@ "Machine learning can be summarized as $$\n", "\\text{model} + \\text{data} \\xrightarrow{\\text{compute}} \\text{prediction}\n", "$$ and many machine learning courses focus a lot on the model part. But\n", - "to build a machine learning system in practice, a lot of work has to be\n", + "to build a machine learning system in practice, a lot of work must be\n", "put into the data part. This notebook gives some pointers on that work\n", "and how to think about your machine learning systems design." - ] + ], + "id": "41ff4764-7886-4289-a2c7-51c2b1df88e0" }, { "cell_type": "markdown", - "id": "bcd0cf4c", "metadata": {}, "source": [ - "Datasets\n", - "--------\n", + "## Datasets\n", "\n", - "In this notebook , we download 4 datasets:\n", + "In this notebook, we download 4 datasets:\n", "\n", "- Nigeria NMIS health facility data\n", "- Population data for Administrative Zone 1 (states) areas in Nigeria\n", @@ -393,110 +102,286 @@ "looking at the health examples, try to imagine how SafeBoda may have had\n", "to design their systems to be scalable and reliable for storing and\n", "sharing data." - ] + ], + "id": "1f6dbeca-f6a2-4c39-84ce-4b51a6213b1a" }, { "cell_type": "markdown", - "id": "49372c0f", "metadata": {}, "source": [ - "Imports, Installs, and Downloads\n", - "--------------------------------\n", + "## Imports, Installs, and Downloads\n", + "\n", + "\\[edit\\]\n", "\n", "First, we’re going to download some particular python libraries for\n", "dealing with geospatial data. We’re dowloading\n", "[`geopandas`](https://geopandas.org) which will help us deal with ‘shape\n", - "files’ that give the geographical lay out of Nigeria. And to get a small\n", - "database set up running quickly, we’re installing\n", - "[`csv-to-sqlite`](https://pypi.org/project/csv-to-sqlite/) which allows\n", - "us to convert CSV data to a simple database." - ] + "files’ that give the geographical lay out of Nigeria. We also need\n", + "`pygeos` for indexing." 
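+ ],
+ "id": "added-geopandas-sketch"
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Once those two packages are installed (the next cells do the installation), working\n",
+ "with a shape file in `geopandas` looks roughly like the sketch below. The file name\n",
+ "here is hypothetical, just to show the basic workflow; the shape files for Nigeria\n",
+ "used in this notebook are obtained later on.\n",
+ "\n",
+ "``` python\n",
+ "import geopandas as gpd\n",
+ "\n",
+ "# Hypothetical file name, purely to illustrate the geopandas workflow.\n",
+ "boundaries = gpd.read_file('nigeria_states.shp')\n",
+ "print(boundaries.head())\n",
+ "boundaries.plot()  # draws the polygons stored in the shape file\n",
+ "```"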
+ ], + "id": "d1bd7bdf-f5f3-4f44-be78-a4f30ebd4fae" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install geopandas" + ], + "id": "5a2d6bb2-5754-4a6e-8688-c361f8f2a257" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install pygeos" + ], + "id": "d0bf7f58-fc1c-4bb6-a51c-df9303cf3bd1" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "\\[edit\\]" + ], + "id": "d547f658-2bc4-43cf-9597-3a3149040109" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "plt.rcParams.update({'font.size': 22})" + ], + "id": "0a222833-387c-48ed-bdd3-d924b0f04638" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ], + "id": "86eaff6b-efe9-43a9-91d3-1b26024d242e" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## notutils\n", + "\n", + "\\[edit\\]\n", + "\n", + "This small package is a helper package for various notebook utilities\n", + "used below.\n", + "\n", + "The software can be installed using" + ], + "id": "34fc07c2-0f78-4597-acf9-4d6e554cd5b1" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install notutils" + ], + "id": "edeb448b-ce55-45a5-b3a4-6578757a961f" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "from the command prompt where you can access your python installation.\n", + "\n", + "The code is also available on GitHub:\n", + "\n", + "\n", + "Once `notutils` is installed, it can be imported in the usual manner." + ], + "id": "38c9e248-4857-4151-b6b9-59f2b9268362" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import notutils" + ], + "id": "74306c1b-74a7-4066-8eaa-8df2c1c3a908" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pods\n", + "\n", + "\\[edit\\]\n", + "\n", + "In Sheffield we created a suite of software tools for ‘Open Data\n", + "Science’. Open data science is an approach to sharing code, models and\n", + "data that should make it easier for companies, health professionals and\n", + "scientists to gain access to data science techniques.\n", + "\n", + "You can also check this blog post on [Open Data\n", + "Science](http://inverseprobability.com/2014/07/01/open-data-science).\n", + "\n", + "The software can be installed using" + ], + "id": "3e4ac54a-ae32-439f-b730-02d2d05d2373" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install pods" + ], + "id": "ad601aa2-774b-40f6-93ba-5269b613af4f" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "from the command prompt where you can access your python installation.\n", + "\n", + "The code is also available on GitHub: \n", + "\n", + "Once `pods` is installed, it can be imported in the usual manner." + ], + "id": "7360e3bf-1705-4569-a9fe-afed9a2b66ea" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pods" + ], + "id": "b0cd2fc9-439e-438f-98f1-a0da84bc8c06" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## mlai\n", + "\n", + "\\[edit\\]\n", + "\n", + "The `mlai` software is a suite of helper functions for teaching and\n", + "demonstrating machine learning algorithms. 
It was first used in the\n", + "Machine Learning and Adaptive Intelligence course in Sheffield in 2013.\n", + "\n", + "The software can be installed using" + ], + "id": "8c14f655-0c71-4ef2-893d-5a8d1f5c6cc1" }, { "cell_type": "code", - "execution_count": 1, - "id": "70a7d7d2", + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: geopandas in /Users/neil/anaconda3/lib/python3.6/site-packages (0.8.1)\n", - "Requirement already satisfied: pandas>=0.23.0 in /Users/neil/anaconda3/lib/python3.6/site-packages (from geopandas) (1.0.2)\n", - "Requirement already satisfied: pyproj>=2.2.0 in /Users/neil/anaconda3/lib/python3.6/site-packages (from geopandas) (3.0.0.post1)\n", - "Requirement already satisfied: shapely in /Users/neil/anaconda3/lib/python3.6/site-packages (from geopandas) (1.7.1)\n", - "Requirement already satisfied: fiona in /Users/neil/anaconda3/lib/python3.6/site-packages (from geopandas) (1.8.17)\n", - "Requirement already satisfied: numpy>=1.13.3 in /Users/neil/anaconda3/lib/python3.6/site-packages (from pandas>=0.23.0->geopandas) (1.18.2)\n", - "Requirement already satisfied: pytz>=2017.2 in /Users/neil/anaconda3/lib/python3.6/site-packages (from pandas>=0.23.0->geopandas) (2019.3)\n", - "Requirement already satisfied: python-dateutil>=2.6.1 in /Users/neil/anaconda3/lib/python3.6/site-packages (from pandas>=0.23.0->geopandas) (2.8.1)\n", - "Requirement already satisfied: certifi in /Users/neil/anaconda3/lib/python3.6/site-packages (from pyproj>=2.2.0->geopandas) (2020.6.20)\n", - "Requirement already satisfied: munch in /Users/neil/anaconda3/lib/python3.6/site-packages (from fiona->geopandas) (2.5.0)\n", - "Requirement already satisfied: attrs>=17 in /Users/neil/anaconda3/lib/python3.6/site-packages (from fiona->geopandas) (19.3.0)\n", - "Requirement already satisfied: six>=1.7 in /Users/neil/anaconda3/lib/python3.6/site-packages (from fiona->geopandas) (1.14.0)\n", - "Requirement already satisfied: click<8,>=4.0 in /Users/neil/anaconda3/lib/python3.6/site-packages (from fiona->geopandas) (7.1.1)\n", - "Requirement already satisfied: click-plugins>=1.0 in /Users/neil/anaconda3/lib/python3.6/site-packages (from fiona->geopandas) (1.1.1)\n", - "Requirement already satisfied: cligj>=0.5 in /Users/neil/anaconda3/lib/python3.6/site-packages (from fiona->geopandas) (0.7.0)\n", - "Note: you may need to restart the kernel to use updated packages.\n", - "\u001b[31mERROR: Could not find a version that satisfies the requirement decarteslabs[complete] (from versions: none)\u001b[0m\n", - "\u001b[31mERROR: No matching distribution found for decarteslabs[complete]\u001b[0m\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } + "outputs": [], + "source": [ + "%pip install mlai" ], + "id": "ae2c92da-63cb-4ae2-a807-1be2dd094193" + }, + { + "cell_type": "markdown", + "metadata": {}, "source": [ - "%pip install geopandas\n", - "%pip install decarteslabs[complete]" - ] + "from the command prompt where you can access your python installation.\n", + "\n", + "The code is also available on GitHub: \n", + "\n", + "Once `mlai` is installed, it can be imported in the usual manner." 
+ ], + "id": "0680bf0b-9ead-42c6-baa0-b145d3010987" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "78e87953-0cfa-452d-a3ab-c67541091ecb" }, { "cell_type": "markdown", - "id": "969895dc", "metadata": {}, "source": [ - "Databases and Joins\n", - "-------------------\n", + "## Databases and Joins\n", + "\n", + "\\[edit\\]\n", "\n", - "The main idea we will be working with today is called the ‘join’. A join\n", - "does exactly what it sounds like, it combines two database tables.\n", + "The main idea we will be working with in this practical is the ‘join’. A\n", + "join does exactly what it sounds like, it combines two database tables.\n", "\n", - "You have already started to look at data structures, in particular you\n", - "have been learning about `pandas` which is a great way of storing and\n", - "structuring your data set to make it easier to plot and manipulate your\n", - "data.\n", + "You may have already started to look at data structures and learning\n", + "about `pandas` which is a great way of storing and structuring your data\n", + "set to make it easier to plot and manipulate your data.\n", "\n", "Pandas is great for the data scientist to analyze data because it makes\n", "many operations easier. But it is not so good for building the machine\n", "learning system. In a machine learning system, you may have to handle a\n", "lot of data. Even if you start with building a system where you only\n", "have a few customers, perhaps you build an online taxi system (like\n", - "SafeBoda) for Kampala. Maybe you will have 50 customers. Then maybe your\n", - "system can be handled with some python scripts and pandas." - ] + "[SafeBoda](https://safeboda.com/ug/)) for Kampala. Maybe you will have\n", + "50 customers. Then maybe your system can be handled with some python\n", + "scripts and `pandas`." + ], + "id": "cf445d57-9757-4c76-8a00-ba4cb02b0d48" }, { "cell_type": "markdown", - "id": "3440c8cc", "metadata": {}, "source": [ - "Scaling ML Systems\n", - "------------------\n", + "## Scaling ML Systems\n", "\n", - "But what if you are succesful? What if everyone in Kampala wants to use\n", + "But what if you are successful? What if everyone in Kampala wants to use\n", "your system? There are 1.5 million people in Kampala and maybe 100,000\n", - "Boda Boda drivers.\n", + "Boda Boda drivers.[1]\n", "\n", "What if you are even more succesful? What if everyone in Lagos wants to\n", "use your system? There are around 20 million people in Lagos … and maybe\n", - "as many Okada drivers as people in Kampala!\n", + "as many Okada\\[^okada\\] drivers as people in Kampala!\n", + "\n", + "\\[^okada\\] In Lagos the Boda Boda is called an Okada.\n", "\n", "We want to build safe and reliable machine learning systems. Building\n", - "them from pandas and python is about as safe and reliable as [taking six\n", - "children to school on a boda\n", + "them from `pandas` and python is about as safe and reliable as [taking\n", + "six children to school on a boda\n", "boda](https://www.monitor.co.ug/News/National/Boda-accidents-kill-10-city-UN-report-Kampala/688334-4324032-15oru2dz/index.html).\n", "\n", "To build a reliable system, we need to turn to *databases*. 
In this\n", - "notebook [we’ll be focussing on SQL\n", + "notebook [we’ll be focusing on SQL\n", "databases](https://en.wikipedia.org/wiki/Join_(SQL)) and how you bring\n", "together different streams of data in a Machine Learning System.\n", "\n", @@ -507,422 +392,216 @@ "with screws.\n", "\n", "But instead of using a welder or screws to join data, we join it using\n", - "particular columns of the data. We can join data together using people’s\n", - "names. One database may contain where people live, another database may\n", - "contain where they go to school. If we join these two databases we can\n", - "have a database which shows where people live and where they got to\n", - "school.\n", - "\n", - "In the notebook, we will join together some data about where the health\n", - "centres are in Nigeria and where the have been cases of Covid19. There\n", - "are other challenges in the ML System Design that are not going to be\n", - "covered here. They include: how to update the data bases, and how to\n", - "control access to the data bases from different users (boda boda\n", - "drivers, riders, administrators etc)." - ] + "columns of the data. We can join data together using people’s names. One\n", + "database may contain where people live, another database may contain\n", + "where they go to school. If we join these two databases, we can have a\n", + "database which shows where people live and where they got to school.\n", + "\n", + "In the notebook, we will join some data about where the health centers\n", + "are in Nigeria with data about where there have been cases of Covid19.\n", + "There are other challenges in the ML System Design that are not going to\n", + "be covered here. They include how to update the databases and how to\n", + "control access to the databases from different users (boda boda drivers,\n", + "riders, administrators etc).\n", + "\n", + "[1] Boda Boda is the name for the motorbike taxis found commonly in\n", + "Kampala." + ], + "id": "e913aa42-201c-4d9b-93e1-f22e966f0083" }, { "cell_type": "markdown", - "id": "b916285a", "metadata": {}, "source": [ - "Hospital Data\n", - "-------------\n", - "\n", - "The first and primary dataset we use is the NMIS health facility\n", - "dataset, which contains data on the location, type, and staffing of\n", - "health facilities across Nigeria." - ] + "# Nigeria NMIS Data\n", + "\n", + "\\[edit\\]\n", + "\n", + "As an example data set we will use Nigerian Millennium Development Goals\n", + "Information System Health Facility (The Office of the Senior Special\n", + "Assistant to the President on the Millennium Development Goals\n", + "(OSSAP-MDGs) and Columbia University, 2014). It can be found here\n", + ".\n", + "\n", + "Taking from the information on the site,\n", + "\n", + "> The Nigeria MDG (Millennium Development Goals) Information System –\n", + "> NMIS health facility data is collected by the Office of the Senior\n", + "> Special Assistant to the President on the Millennium Development Goals\n", + "> (OSSAP-MDGs) in partner with the Sustainable Engineering Lab at\n", + "> Columbia University. A rigorous, geo-referenced baseline facility\n", + "> inventory across Nigeria is created spanning from 2009 to 2011 with an\n", + "> additional survey effort to increase coverage in 2014, to build\n", + "> Nigeria’s first nation-wide inventory of health facility. 
The database\n", + "> includes 34,139 health facilities info in Nigeria.\n", + ">\n", + "> The goal of this database is to make the data collected available to\n", + "> planners, government officials, and the public, to be used to make\n", + "> strategic decisions for planning relevant interventions.\n", + ">\n", + "> For data inquiry, please contact Ms. Funlola Osinupebi, Performance\n", + "> Monitoring & Communications, Advisory Power Team, Office of the Vice\n", + "> President at funlola.osinupebi@aptovp.org\n", + ">\n", + "> To learn more, please visit\n", + "> \n", + ">\n", + "> Suggested citation: Nigeria NMIS facility database (2014), the Office\n", + "> of the Senior Special Assistant to the President on the Millennium\n", + "> Development Goals (OSSAP-MDGs) & Columbia University\n", + "\n", + "For ease of use we’ve packaged this data set in the `pods` library" + ], + "id": "307d3133-e190-4fcc-b068-a33c5f02b5b6" }, { "cell_type": "code", - "execution_count": 2, - "id": "e0cabd7f", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "data = pods.datasets.nigeria_nmis()['Y']\n", + "data.head()" + ], + "id": "9bac110a-267f-4f2c-84d3-31f10fd7b13d" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, you can access the data directly with the following\n", + "commands.\n", + "\n", + "``` python\n", "import urllib.request\n", - "import pandas as pd" - ] + "urllib.request.urlretrieve('https://energydata.info/dataset/f85d1796-e7f2-4630-be84-79420174e3bd/resource/6e640a13-cab4-457b-b9e6-0336051bac27/download/healthmopupandbaselinenmisfacility.csv', 'healthmopupandbaselinenmisfacility.csv')\n", + "\n", + "import pandas as pd\n", + "data = pd.read_csv('healthmopupandbaselinenmisfacility.csv')\n", + "```\n", + "\n", + "Once it is loaded in the data can be summarized using the `describe`\n", + "method in pandas." + ], + "id": "79125ce7-edb4-4d27-a7b0-cc0ae6bfbc0b" }, { "cell_type": "code", - "execution_count": 3, - "id": "493b7e67", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "urllib.request.urlretrieve('https://energydata.info/dataset/f85d1796-e7f2-4630-be84-79420174e3bd/resource/6e640a13-cab4-457b-b9e6-0336051bac27/download/healthmopupandbaselinenmisfacility.csv', 'healthmopupandbaselinenmisfacility.csv')\n", - "hospital_data = pd.read_csv('healthmopupandbaselinenmisfacility.csv')" - ] - }, - { - "cell_type": "markdown", - "id": "020b5f63", - "metadata": {}, - "source": [ - "It’s always a good idea to inspect your data once it’s downloaded to\n", - "check it contains what you expect. In `pandas` you can do this with the\n", - "`.head()` method. That allows us to see the first few entries of the\n", - "`pandas` data structure." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "b79731dc", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "
\n", "\n", "- Gold medal times for Olympic Marathon since 1896.\n", - "- Marathons before 1924 didn’t have a standardised distance.\n", + "- Marathons before 1924 didn’t have a standardized distance.\n", "- Present results using pace per km.\n", - "- In 1904 Marathon was badly organised leading to very slow times.\n", + "- In 1904 Marathon was badly organized leading to very slow times.\n", "\n", "\n", "\n", - "\n", - "Image from Wikimedia Commons\n", - "http://bit.ly/16kMKHQ\n", + "\n", + "Image from Wikimedia Commons \n", "\n", "
\n", "\n", - "\n", + "\n", "\n", "\n", "\n", - "\n", + "\n", "\n", "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
facility_namefacility_type_displaymaternal_health_delivery_servicesemergency_transportskilled_birth_attendantnum_chews_fulltimephcn_electricityc_section_ynchild_health_measles_immun_calcnum_nurses_fulltime...antenatal_care_ynfamily_planning_ynmalaria_treatment_artemisininsectorformhub_photo_idgpssurvey_idunique_lgalatitudelongitude
0HEALTH POST KAGBANGHealth PostTrueTrueFalse1.0TrueFalseTrue0.0...TrueFalseTruehealth1393335750723.jpg6.54340807 9.08470312 218.8000030517578 5.0451a0efb-5fa6-4bad-93cd-7cf19eb50833cross_river_obudu6.5434089.084703
1Alhari ClinicPrimary Health Centre (PHC)TrueTrueTrue4.0TrueFalseTrue1.0...TrueFalseTruehealth1393316873034.jpg9.00720861 7.67780798 432.8999938964844 5.05ddb68d6-02d2-44de-9df3-ebc840a1da42nasarawa_karu9.0072097.677808
2Primary health centre umukuru okehiBasic Health Centre / Primary Health ClinicTrueTrueTrue2.0TrueTrueTrue4.0...TrueTrueTruehealth1393594715772.jpg5.1297 7.1592 73.3 52173b656-14eb-400d-9eef-76830379b065rivers_etche5.1297007.159200
3PHC EHOM CENTRALPrimary Health Centre (PHC)TrueFalseFalse2.0FalseFalseTrue0.0...TrueTrueTruehealth1393330657159.jpg5.4633 8.1464 117.8 5963abf9d-5a72-4b35-811e-9c1830adc88bcross_river_biase5.4633008.146400
4Health postHealth PostTrueFalseFalse0.0FalseFalseTrue0.0...TrueTrueTruehealth1393342042946.jpg5.504 8.0251 52.4 59d09aaac-578c-4a48-a054-dee678a05422cross_river_biase5.5040008.025100
\n", - "

5 rows × 30 columns

\n", - "
" - ], - "text/plain": [ - " facility_name \\\n", - "0 HEALTH POST KAGBANG \n", - "1 Alhari Clinic \n", - "2 Primary health centre umukuru okehi \n", - "3 PHC EHOM CENTRAL \n", - "4 Health post \n", - "\n", - " facility_type_display \\\n", - "0 Health Post \n", - "1 Primary Health Centre (PHC) \n", - "2 Basic Health Centre / Primary Health Clinic \n", - "3 Primary Health Centre (PHC) \n", - "4 Health Post \n", - "\n", - " maternal_health_delivery_services emergency_transport \\\n", - "0 True True \n", - "1 True True \n", - "2 True True \n", - "3 True False \n", - "4 True False \n", - "\n", - " skilled_birth_attendant num_chews_fulltime phcn_electricity c_section_yn \\\n", - "0 False 1.0 True False \n", - "1 True 4.0 True False \n", - "2 True 2.0 True True \n", - "3 False 2.0 False False \n", - "4 False 0.0 False False \n", - "\n", - " child_health_measles_immun_calc num_nurses_fulltime ... \\\n", - "0 True 0.0 ... \n", - "1 True 1.0 ... \n", - "2 True 4.0 ... \n", - "3 True 0.0 ... \n", - "4 True 0.0 ... \n", - "\n", - " antenatal_care_yn family_planning_yn malaria_treatment_artemisinin \\\n", - "0 True False True \n", - "1 True False True \n", - "2 True True True \n", - "3 True True True \n", - "4 True True True \n", - "\n", - " sector formhub_photo_id gps \\\n", - "0 health 1393335750723.jpg 6.54340807 9.08470312 218.8000030517578 5.0 \n", - "1 health 1393316873034.jpg 9.00720861 7.67780798 432.8999938964844 5.0 \n", - "2 health 1393594715772.jpg 5.1297 7.1592 73.3 5 \n", - "3 health 1393330657159.jpg 5.4633 8.1464 117.8 5 \n", - "4 health 1393342042946.jpg 5.504 8.0251 52.4 5 \n", - "\n", - " survey_id unique_lga latitude longitude \n", - "0 451a0efb-5fa6-4bad-93cd-7cf19eb50833 cross_river_obudu 6.543408 9.084703 \n", - "1 5ddb68d6-02d2-44de-9df3-ebc840a1da42 nasarawa_karu 9.007209 7.677808 \n", - "2 2173b656-14eb-400d-9eef-76830379b065 rivers_etche 5.129700 7.159200 \n", - "3 963abf9d-5a72-4b35-811e-9c1830adc88b cross_river_biase 5.463300 8.146400 \n", - "4 9d09aaac-578c-4a48-a054-dee678a05422 cross_river_biase 5.504000 8.025100 \n", - "\n", - "[5 rows x 30 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hospital_data.head()" - ] - }, - { - "cell_type": "markdown", - "id": "dbddc1d4", - "metadata": {}, - "source": [ - "We can also check in `pandas` what the different columns of the data\n", - "frame are to see what it contains." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "62e6f569", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['facility_name', 'facility_type_display',\n", - " 'maternal_health_delivery_services', 'emergency_transport',\n", - " 'skilled_birth_attendant', 'num_chews_fulltime', 'phcn_electricity',\n", - " 'c_section_yn', 'child_health_measles_immun_calc',\n", - " 'num_nurses_fulltime', 'num_nursemidwives_fulltime',\n", - " 'num_doctors_fulltime', 'date_of_survey', 'facility_id', 'community',\n", - " 'ward', 'management', 'improved_water_supply', 'improved_sanitation',\n", - " 'vaccines_fridge_freezer', 'antenatal_care_yn', 'family_planning_yn',\n", - " 'malaria_treatment_artemisinin', 'sector', 'formhub_photo_id', 'gps',\n", - " 'survey_id', 'unique_lga', 'latitude', 'longitude'],\n", - " dtype='object')" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "hospital_data.columns" - ] - }, - { - "cell_type": "markdown", - "id": "570aafdf", - "metadata": {}, - "source": [ - "We can immiediately see that there are facility names, dates, and some\n", - "characteristics of each health center such as number of doctors etc. As\n", - "well as all that, we have two fields, `latitude` and `longitude` that\n", - "likely give us the hospital locaiton. Let’s plot them to have a look." - ] + "data.describe()" + ], + "id": "fd13612b-bd3f-4454-8122-8498d59f3cb8" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also find out the dimensions of the dataset using the `shape`\n", + "property." + ], + "id": "8be7737e-5a9f-422c-97a9-09ade6b85317" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.shape" + ], + "id": "719dc90c-cf18-4a6a-a898-c6dc1795f352" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dataframes have different functions that you can use to explore and\n", + "understand your data. In python and the Jupyter notebook it is possible\n", + "to see a list of all possible functions and attributes by typing the\n", + "name of the object followed by `.` for example in the above case if\n", + "we type `data.` it show the columns available (these are attributes\n", + "in pandas dataframes) such as `num_nurses_fulltime`, and also functions,\n", + "such as `.describe()`.\n", + "\n", + "For functions we can also see the documentation about the function by\n", + "following the name with a question mark. This will open a box with\n", + "documentation at the bottom which can be closed with the x button." + ], + "id": "7c867c2b-a006-48b0-9001-4148a11c6b6c" }, { "cell_type": "code", - "execution_count": 6, - "id": "e4556763", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "import matplotlib.pyplot as plt" - ] + "data.describe?" 
+ ], + "id": "90a047c0-3e0a-46cb-bb06-236a6b6ba481" }, { "cell_type": "code", - "execution_count": 7, - "id": "37b099c5", + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAD4CAYAAAD1jb0+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nOzdya+lZ5Ie9jh3zsw75c05k2SRVdVNVXVJLTXKkgzBdrckDwLc8sKD0Ct5ABr+AwxbghbaaCHDC2+0asCyLMCWbdiAZWvVtgyjBUgCulpS9aDqrpEsMpnTnef5eBH9Q8Q5lSyyWFkkk3VeIJH3nnvOd77hfSOeeOKJeAfD4TAmYzImYzIm4+UbU5/0CUzGZEzGZEzGRxsTAz4ZkzEZk/GSjokBn4zJmIzJeEnHxIBPxmRMxmS8pGNiwCdjMiZjMl7SMfNxftnNmzeHr7/++sf5lZMxGZMxGS/9+K3f+q314XB4a/z1j9WAv/766/G1r33t4/zKyZiMyZiMl34MBoO3n/f6hEKZjMmYjMl4ScfEgE/GZEzGZLyk4wMN+GAw+NuDweDpYDD43ef87b8YDAbDwWBw8ydzepMxGZMxGZPxfuPDIPC/ExH/zviLg8Hg1Yj4NyPi+y/4nCZjMiZjMibjQ4wPNODD4fA3ImLzOX/6byPiv4yISTOVyZiMyZiMT2B8JBXKYDD4ixHxcDgcfn0wGHzQe381In41IuK11177KF83Ge83hsOIy8v8NxxGDAYRU1P57wOey2RMxmS8/ONHNuCDweBqRPy1iPi3Psz7h8Phr0XEr0VEfPWrX52g9Rc1Li8jTk/z//Pzen1mJg343Fz+PxmTMRmf2fFREPgXIuKNiIC+X4mIfzYYDP7kcDh8/CJPbjLaGA7TUJ+eRpyd5f9TU/nzxUUa8tnZiOnpNOKHhxELC/kzoz4ZkzEZn6nxIxvw4XD4OxFx2++DweCtiPjqcDhcf4Hn9dkdaI8f1ocdFRKR7724iNjdjXj8OP+/uEhjPTMTMT8fce1aHu/wMA345WUeY2kp4sqV/Pnq1Xz/hFqZjMn4zIwPNOCDweDvRcQvRsTNwWDwbkT89eFw+N/9pE/sMzPGDfblZRnns7N8HZIeDNIg979dXkZsb0f8wR9EHB0lNbKzE3F8HLG6GnHrVqLx2dmIg4M6ztxcxOZm/p2Bn5vLv11e5ndMT+c/nHk/1wmfPhmT8akfH2jAh8Phr3zA319/YWfzWRjDYSLki4v8eTgsCuP8vAw42oOhnp7O187OkvqAvIfDNNh+3tpKg3p+HvHsWb7vypX8f3o6X7tyJRE3Y333bh53aiqPsbhYxnp2to7HeYyft/dMxmRMxqdqfKy9UD7Tg+E+OSkUDOnOzyf6jSjjzXAPBomcB4NCvT57dpZGdG8v0ffubiLv2dmkS3Z3iyrZ38/jHx4WT760lMd++jSN9vJyHu/4OP+/vMyffd/OTjqBhYUy2NPT6RDm5ydGfDIm41M2Jgb8RQyKkPPzNODQcEQaaQnHLvsbDPJ1Bndmpj5zdJQ/DwZpYHd3E3nv76dBjigEf3xcyc3hMA3t8XGex/l5InFGemYmjTEncnpa9MvlZTkSxn1+vqiZ5eX87IRHn4zJ+NSMiQH/ccdwWEb77KyoCpzy1FT+7eQkUThje3GRhhp1cXiY7716teiUk5M03CcnaUQPDvKze3v593v38pgMr++5uMhj7OzkOV65kp/b3S2KZno6j8Mgo32Ojioa2NvL9y0uFn/OqE+M+GRMxic+Jgb8xx3okG4EIXCom0G9uCjDd3FRycyDg/oZvRGRxn1vL1+bmkrjeXaWBnRpKQ3rYJC/cxwnJ4X0oezFxTzWYJDnMTeXaP7iIo350VEZ76OjurapqYiVlfysBKpjixYmYzIm4xMbEwP+UYcCGhQIg8ZwDgaJfCNG+W1GnSFkFB1jdjZ/HgyKMnGs4TBRt6RmRBr109M03AsL+Y9TmZsrueHsbPHYJyelD6eOmZ7O60G3LCzk346Pi17hlJz3RKEyGR/HeJ466icx7z6u73mBY2LAP8royb+zszTi+GaKDtrrqalC1GgO/PbsbBrTiELgh4dpuKemEvUOh0WfSHweHhaNgcc+Ps7/r19P2eDpaaL069fz+5aWynivruZ7jo/LMJ+cVEKU7tzr167lz6KJhYV8bXa2qJWPc7yEC20yPuKQX4oYTfC/6FyMQjlgy++f8pzPxIB/lOFBQ7qQ9fFxolwcM6M3MzNKr5gQXt/eriTk5WUaaBWW5+dpZLe3y+j7bgnN/f3i0s/O8v+1tfyfxFDxj+pM5wyFHx9XFSdunHMSGVxclKZ8fz9fv3Ytv+cnbUTlGDisHs1E1H1BE33Wq09/kk6sS2Ejai5HfLyO8/KyQIwCNefzUWm897tv1qXr8f+nnC6cGPCPMkyk4bAUHDMz1ZNkdjb/TU2NqlFMEmqRo6OU+G1s5GclLqem0kDirr0+N5fvn5+vxCfVyMxMou2Oik3Ma9dK5TI3V8Z5OEwDf3ychl0pPnqIwadquX9/9NoZ/n4PXqQROT/P86TuiagkMJrJeZyf57m6vunpVM58EhHCT3q4Xj97Ji+i/81wWOChF5yhBXuCHkKNeLGGnQM5Pi7g0XMvnvlHOe77oWzHU2/hOj7K93yMY2LAP8qYmqqwDmetKIYED3qYmyvuGHVyfl6JQYqUZ89K992TmwcHxbNvbpZz2N7O9y4v1/uXl8tJzM0lHQI9zc0VJ47fnp7ORRKRhvz8PKkTP29v59+vXs3jUqU4lu/tapsXgVZw7ru7pfJhsC4u8j5sb+fPipI4I+e1v5/34dVXs5BJS4FP+/gwyJrB7kYV1fDj6vV9d/9eslPzN2I0Ge9cXwT14Dp6tCpx36uFP8o86yj7eZ08z89HHZTv/RSPiQH/KEPCj1ER2lN3kOqhK0w4yHB6Ot9zdBSxvp7GyHEYHob5+DilhOvrVc25uZnveeWVNKrXr6eBunKlJjqlikRqR6cmqJ8XFvI9rocEcXs7/3ZwkEaURHF2NuLmH27CND9fC+DHRSuQ18FBRiXopI2NcjjvvltO5NGjPMelpVTLKFIaDPL3s7OI996LePAg4stfrnzAp3UwGv0+dtqsv6+3Zug/i8i6ARZRqQL+YfSSY/Xv6worCNbxzYcXQT1I5juO40fk/yjHj0qP
[Base64-encoded PNG payload of the removed matplotlib plot output (facility latitude/longitude); binary image data not reproduced here.]
QHjBDzzgjamrV7Xdv3LF7Ctfef3rnjxp9m//rW68SkU377FjtcmbCgRMnRuTn5HdhxJlPDbwlAitO1dW9P+ZjP5/cVGfBbobCk6SIa57Cwva0oeeKDBhYJPcLjBj6uhw2iCDHYACVlcdUoHxMTjojBOahzRxSyUlSDj5t4vdXSllKxVRFPEThyZ4s2BIMAsoGDOsJAZG7xd4t2MqBQwDRx+b3nze+w8MbGZ8GY9rbnYIB4iB16H5zOLI36F8vp7jVqrOJOo66juBt7fr5qLaam/3yd+MNOvqUqJhi55Kmb3//a4i28+YPhx3BrZOpb3fFhFbVgbsghua1VqHAofAKQ69yqEM4uqWTiuZFQqq7HZ2HIfNZPR5eF229ozhgrLW0qJk+OijavbdLO6/3yvMzk6njKE+ZNJRSMOkIQtWz26oVPIFanxcTeU3EphChX4czL/cG6dPewXY3u7e1JyrxUVXue4XWL+2t2thp9EIjxlBUyhvN1NBMDDgCzgDFJqanCIZLvTQFKHfAfchnU+SYxI/pqjvBB6yMGi6kKTPn39zjwNmwX5BEg8NhMLZneEkHhYRBC/r624bikWnmXtFI3RhcUHKjf/KwIDZr/+62S/8wv7H9ulPCwNn5BzsC0QZs7MOUZCYQyUjsBBYv5lTI99o8iZefdWbfYuLOrbBQR0XtrFYI8BNBnIBhopjPRcmzH5x333OG0ewAi0wbErHsXYmYONwt1mMSyWHuDIZh9dQV5rV8ry5VhLcOIkfcxwqgUdRlDOz3zazi2YWm9kvx3H8t3fjwP4uaLJQHdVj0BQKVaBUjMAvUP8QJlWrzltmXibJGW9xmkuhvzivQRMujtWo++pXzX7zN82+9S09/8QJs3/4D6UwLRScelkuO2vHzPnpDHKemnK1I0m2p8fHiKFUfeaZw52zxUUf+gDWevas70y2t/VeXV0+Ao7mLf9mWjtQ1d7I5Zxfjw/62JjYKQiTsI1FtNTUpB3R6qpT/kLvma4uh7dCOTkNbQqMJGkn8SbEYSvw/2RmX4/j+FNRFLWaWfp2T3jbRFi1w56hUgQOCbfcNCtXV7XVpylq5gsDiQ7GB6KTalUY/m/8htmv/qoSIAZT4Oc08sz0WHY21apPCSqXnY9OEkJKDfsFC4Ob7UYOGnEsqieYO0KhVEqJvFLxsXM0qnkewzfM9uf8t7frufPzvvAsL/siNjnpTcr1df9clYoWjRs3tLM5dkzvB47e0+PnHwgLFksSSRxB3HECj6IoY2bvM7P/0cwsjuMtMzsApeBtGM3NzvGmUdne7jQukiVim72WsmFgxL+97WpDDLRgPNAcRGCEPwgVplkt22V21g2sgINOnVLixKfEzD1Y2ttlM3Ar+uJBA3+TwUEdx/S0jjeXU6KFo27mfyO6udVrzsx4TwHLW5g2sIiWltxPhYUzlVKV/sIL4niPjur5fI+hZcGP0zUwiSQOEIepwO8xs6KZ/bcoih40syfN7J/HcVxDKYii6LNm9lkzsxMnThzi7Ro8SOLNze5OiIQ7lKWDlcNgoIlmpuRLExI4BUEOAxNKJYllcAU8e1aQQU+Pz9Ck2megLkm0u9sbp6F7opmzXRAp5fNmn/yk2X//73fn/KCeNHOuOP0C8P7lZe8T4E19s4DvTQOSBZTkTQ8A+TswzNWrbpQ1M+OUytVVn4gEtzuJJI44DkPwTJnZu8zsv8Rx/JCZrZrZv9z7oDiOPxfH8SNxHD/Sz+iqt2uQgAsFJdNMxulq7e1KHPm8K01pZDL0AV4yfhQo6MyU2ObmzP7qr5S829u1UHz726JTzs6qksaFL7Q8XVlRtTs/r9ccHNTvl5eVJJk7CoSDovQznzH7B//g7pyb8XHh78jfzXwAQxzrnLHATEzcmn3CuaZ5DL4PVRMMG+ENPQpEYjgXsngwUYgGLzz8pPpO4ojjMAl80swm4zhmMu2XTAk9iVsFuHJ/v2hyvb1KUkNDPkqts9NxWfw3SObMpwRGQISBAhJvdMZ7dXUpOWJfi8oSZscPf6jKHbrd+Lh+3t11+iCT5eE3Q5NraRGUgunW3YiZGefLY1uAfQAq2IOYQ2H5OzTkdgcDA2r4FgreQB4acg8cM/2NyAnnxrU1p1ci0ArhqCSSOKK4YwgljuPZKIomoig6H8fxi2b2QTN74e4d2ls8mpqcu14qKWFQYcJ4YIhCKqUknst544xGJo046IGdnUpAm5ve6JudFabd0+NOfq2tcvvDqxo1Z2ur7GTvvdcrdRqHoStib6+OKZMRznzlyt07N5OT+uylUu0g4M5OcbTNHNvfG/39Dnf09Un+zvzLd7zDZ2yurmrB29wUY+f6dU/KDJ7G94XFFdoggxmSCjyJI47DslB+zcx+9zUGyjUz+6XDH9LbKIBUwHOrVfcWZ/htOAR4bc3hFWhvzH5E9PTyy0owmDQx5R4IgAYpMvy+PsfB19fdBTGX02NpcILHFwq1fio9PYJc7mYCx6irWNTnPX/eh1X09en9l5d1DvZK6VMpSfaxtU2ndcynTrlJGY1gvHR2d5Xki0XBOA8+WLvAgpmPjByefZNEEncxDpXA4zh+2sweuUvH8vYMGnTh3EEalxhQ0XzDzhYTpxCXbm3VRKFr19x5b2fH517Oz7tJP8MwmprcETCKlCzByItFLQL5vFuO3rihhDYy4vQ9MPO7FeDyiLhgvSCm2dpS5T835+pSVJHAIu3tqtTf9z59JiiB+HOH0vHFRV+Y3vlOfa5r1/R+6bRz0Y8dczZKEknUSdS3EvPtEqFNJ7Ml4RrD9ybpUEXDxAhx4UzG7OMfN/vOd4RtA0W89JIS9eio4+hIwvGVaWtTtd3WJlx7YkLv1dvrVgCMRMNxDy77yZN67Vv5eh80aBRS/XZ0aDewsSF2TSrl7ogoQTc3VXXncqrQ02mz97xH7Bt8VGDVoDA186YwXi9m+nc2q8/U3+89hXw+wbyTqLtIEni9BEkcNzhc/2BbkKzDSfCYW+EumE57wnn+eUnWw/jpn1ZlzfAKMOb5eR9m29qqhL+2pqoTNz4zN9GqVh1rjyK93pkzYr+Eww7uNEjUDz6o956cdAgFP5jeXp9MhFgpl/O/T592lgkDeRE9hRV4KqUdxeysPtvOjip2BFgYqSV4dxJ1GEkCr6dA4GPm9L7OTvfVwBgLehsNTjOnwhWLZl/84uuTt5nZX/6l2WOPKWHNzbnQh8bp4KAqTcapDQ66cIXjGR3V+6yu6nGMlBsaUsV6EGfCg8T2tpqsjz3mPQJUm1iddnbqz6lTqso5V7mcEjwe6qGSFYZOyC9PpZS0mc7e1ubJHypiEknUYSQJvN4Coc3OjhIiWDZ8cKamY+hEVUmjc2Hh1g3FF1/0UWEMq52a8teFK55OC6KggoWh0dmpx5bLjhO3tSmhj47evQROTE4Km2aQQ2jnG1oRMPiZYRqzs/p7aMhxb2iQoTsgFEzseRkFFz4G29kkkqizSEC9eoxwQgw8aHBo+NmLiz7HEiYKg3lvFaWSTwhiGnmx6GwU5Obd3e5KuLKivy9cULNva0tJP5v
1ijef1//dc8/dPReYQx0/7o1UFg88X1ZXtXDduKHHAEPduKFEDqxDM5jANoAFMjQZw+8bo6/EEjaJOowkgddrIJiBkUKiYrAviZwkFjb/bhewSmiGZrN6bX6GU97RocQ+PKym3sCA/r9aVXU+MaG/qYIZQH03Y3tbn7Fc9gEaa2t6r6Eht8kFEmLYg5lglLU1P9YbN/Ra2AWQzIGiGNgLzRCoJvR/TyKJOooEQqnXwIu6qcktaPHxoHlJUxOxi5kajx/6kNnXv77/6x4/roTNazY1CRNnKHMc+6Djs2edG55K+UT6555z/5Ry2X25mWx+N6OjQ7BMuewY9unTtbNFgUXw6oYSiQyeSptztrvrI9bwbDerHcTMQol/TRJJ1GEkCbxeg2RCEscJD0dDM5fXh43NY8fM/uk/VeL67ndrX/PCBVWtjAdLpdyytafH/bHX1kTBM9PrkAyrVbFbGFi8seHV/+ysj7S72YSdNxodHe6kmM/7wGqERwyYwLiKCeo0fSsV95Dh/3p7fap8OMMSkRSNzeZmn2ifRBJ1GkkCr+doanIaXzgKLnQnNHNvDqb/DA+b/bN/plFrzz3nE3fCAQ3AC93dgiBgegwO+hT53V0l+t3d2kHRTMUhwabT+l0UiRHy3veaff/7es5hor1di0VXlydyzK2YNcqItKEh3zkwRCOO3WY39IwhqXd2Oh0Te19w/3ocRJxEEnsiSeCNEBhgbW25DzbJiSRDQxOcvFCQn0l/v5Lg0pJ+39Xl5lmplCsy19Ychunt9Sq2rU3/t7rqjocMdaCqrVb1HJgt+JHPzyvh47c9NOTDkG8XHBNNxrU1r7pxYsQvvbNT/6bpi89JZ6ezc1Br0jsoFNxqFnaKmQ+9SCW3RhL1H8lV2ihBxUuCIint7LhrYHOzknhXlw9fxiMFNSJwxOamKvO1NT2mt1fVON4g+GeTvONYbJWtLWHS09OOOW9va7EAnz97Vv83NaVEevKkIBaGEh8kgfPafHZ2BRsbbgmQzXr1jQFYd7c+38qKuzjCW0fdChYO9g0EBX6eGFUl0SCRJPBGCapwGnBUoQwipkkH/W193afOl8sOG2BXS/MOoyuody0tqtbn5rzq3972qrZYVLJbW3NqYxwreTIjcn3dfUpw8CsUxAa51RCGMLq73dLVzCcMwTCZmtLnQlhk5rL/SsXtdaemdAw9PT5K7cwZp1uSzPFeSUQ7STRQJAm8kQJMnMammcMCsEqYwj456XzytTWfa9nU5LQ5hhaQzFdWHFdeXvYpOfwO2CaOlRTxAcfulUYg1TAVMkOIu7sPbggFpbGz01Wm2AvMzjpX/fJlLQ6PPaZdAP0CuOLj4/o38zBPn1bFjuc3DdDWVv0xc2gliSTqPJIE3miBqhBJORzxUHhCkkXgg6Mf48DCOZpMgI8icaWRrDPUIZ9XZQudETpfW5svBrBlqNTN9O+5OYdYmpvfmE8Ki9Xqqi8Q1aqSt5net1rVsS4uykEQCIfF6vJls698xaGSbNbsgQeU5B99VOdia8unIYUT5pNIogEiSeCNFlTeJO5QTQg+DKuCRNTbq4qa6pRxaTxuZ8dHqpXL+kPFjVCor08VbXOzwzHg6kyyL5edzjgz44tGtSo4hwnwtwuMpGiIrq/rOPYOcGCxSKe14EB1XFrS+3z5y7WPX14WtXJz06f1MHItPF8J/p1Eg0SSwBstqL43N90TJZ1WcoUXTiOOhl04RQbMN5fzxAszBJk6TI/paSVP5PPNzb4YULW3troUfXtbj3vhBQ2WQO7f3e3Hdzupv5lPh4eXncvpefsZdO3uCvqhgcvEoJde8scwgYd49lntNuDXp9M+pILfJZFEA0SSwBsxwoG9KAvBhGlEMgiC8WBg1mGjcXPTEyq/h/8cSvY3NvReAwPOq4bZAdQB1j4+bvbEE7Uc8HL54J8t5KlvbvokeAyrpqddfGPmxw/WjoNjmMDD5G2m152bE8yC/SzwDhYGSSTRAJEk8EaOlhavjLu7ncs9OKifFxedjtfZ6XABU3jAsBlosLnpyZLkzER6Rovt7Pig4K0tVeNLS852+cEP7lzAk836UAaOGcios1MLyfHj+1fiVPn5vHDyvVPrOV4zNTJRdVYqOnazJHkn0XCRJPBGjb1+4AwqYHDByooqWWZrMDa4AAAKtElEQVRAokycm1My3thQwt/cVCXa3a2qF1Vme7s3LplLaeZTeBimsLzsXPFKRbS9NxKdnUrauZzeK5t1G4GeHj0G7LulRXBOtVr7Pv39Lihipujp06rWSyU9huSdyZj9xE/o8ewezFyRmUQSDRRJAm/UwDWPihFDK7Dori53MDRz2GRtzYcYYOi0uyuflNZWeYmvrbm3NhS90VFV9ODrwCtQAxlyXCgcTKhDwBlHxp7Nylyrq8sXlDhWlQ+dMZt19gzTclhEymU9b2hIifrpp33w8cMPmz3yiM/wLBS0SPT1+c4jiSQaKJIE3qgBrIEwBt/sjQ0lc8ydgEcQ2yDOQcWZyahKbWmRj8n2thgkTKkZHtbrl0ouhcf/BBXn6dPO9b527WAJHGUpPPOuLr0XFEDmboK/I3eHiw6NsFLRcWFstbXlIp6+Ps0ITaV03BcuuMCos1PGX5mM+5onkUSDRZLAGzmg9GHERGLGitbMByRXqz4IeXvbkzp+4DBITpyQIGZjwwcoU+2bCa7AexxnxNVVJcZUyuz97xcGPTFx8+PO55VMGdNWLjtThs9RKum129qcD7666jMx29pU/be2Ov97e9ttAtbXHY5pazP7wAfcvyWT0YLBQtHdnUzcSaIhI0ngjRxU4bAyQgc9oBQeAyyysaFEmcv5TEyYHgsL7h+SzaqSBvNmUERLi5JkLif8++pVnzvJCLNf/mWzS5fMxsZ8EANV7vKyHsP4tvZ2LS5bW/7++bz8Uxgth4dJS4uOlSk5KEKhPZq56IjKPp3Waz3wgFf1KFIRNiHkSSKJBoskgTd6hMOOkcZjzIQyk+oWXJypNL29eo2QodHWpuetrroQp73daYZMaW9vV/KDY10s+uT43l6zT3xC73HtmhqOGxv6/+vX9fooQXkvPguS9vl5t4jd2HD/lpUVLR4kcI6jUNBr9Pe7KVdrq4y1RkfdmRDIBCUrdgMJ/p1EA0aSwBs9SKrI6hGiwBEHLgErp2IFWzZzFSUJe2XF8e3WVlXE0BFJsPiHZDKqdnt6XDQUWreOjup3U1OqyqNIIh+8Tkoll/szPBhmDY1SBisA+Zj5WDU48V1dSt7HjimZl8t6veFhHw4NnIKgKYkkGjySBP5WCGASIBMohsyvpApHft/eroSHZ0oUqeFXqbigBVdCRpXh1If4pbPThwS3t7tfCe8fLiqVinDxtjazhx7SgvDyyw7RZDLOeoljHUsu59NzOjs9ycexe7LgX5JOK8mTvAcHfSFj+ASYN7uRJJJ4C0RyJb9VgqRJc9LMxTBmzgUPOeHhhHYqWRqiDDcG3gCWSad98AOvAZyxtlY7n9NMVTozMzkmmodjY8Knl5d9cAQUx1OndOxbW5p0j+3t5q
Z+t7ioXQQccnYWfX3e+DTTz3zmKHJ+exJJvAUiSeBvpYCjzb+rVf8ZCAHcmUnysDfW1517HcdqMLa0CIeGhdLZqYQINBOKYPBkAbLBe8VMiZaGJAySEyccMpmddQokHuMDA/44hEYdHc5bb21VZQ9Wn816EmcgAwsMplsJ1p3EWyySBP5WDZIVgx5IZkAcVKJwqRn8C3MD33Eqe6pckj72rkAn+LGE1T+QSCYjhklzsy8ca2taHDIZDVBGkr+5qap8cFBJGcgjnDxvVouNszjtTdDhgpZEEm/BSBL4WzVI0syVpCrFcS/0/QA+odEJRoxa08wn61DV81rg3vwdJm5+LhQEebS0uMy/pcXsvvuUkNkRNDf7XEuOCUglqZyTSOJ1kSTwt3KAgYewhplP7yEpkphDe9idHWecVKv7UxVpVIa/N6sdjmCmYzh3Tkl8dVV4NlzyJDknkcQdR5LA3+oRGjYhpw8rXjMl3EpFGDP8caa9mzlMEjJbYKSYeYPwZgOBo8gHNCTc6ySSuGuRJPC3UzAYmUoZ3nh3t0vumR+ZTrsHSWen87r3NgTByG+XjBM8Ookk7nocOoFHUdRsZk+Y2VQcxx85/CEl8WMNpORAJWbeCIwiQRuYRSHcSRJvEknUZdyNCvyfm9llM8vchddK4s0Iqub9BC3wtJNIIom6j0MpGqIoOmZm/4OZ/fbdOZwkkkgiiSQOGoeVpP3fZva/mtnuzR4QRdFnoyh6IoqiJ4oY6yeRRBJJJHHouOMEHkXRR8xsPo7jJ2/1uDiOPxfH8SNxHD/S399/p2+XRBJJJJHEnjhMBf5TZvbRKIpeNbP/18w+EEXRF+/KUSWRRBJJJHHbuOMEHsfxv4rj+FgcxyfN7NNm9u04jn/hrh1ZEkkkkUQSt4w3lQf+5JNPLkRRdP3NfM+7HH1mtnDUB3GXIvks9RnJZ6nPOOrPMrrfL6MYUUcSt40oip6I4/iRoz6OuxHJZ6nPSD5LfUa9fpbEGDmJJJJIokEjSeBJJJFEEg0aSQJ/Y/G5oz6AuxjJZ6nPSD5LfUZdfpYEA08iiSSSaNBIKvAkkkgiiQaNJIEnkUQSSTRoJAn8gBFFUXMURU9FUfQnR30sh40oinJRFH0piqIrURRdjqLoJ4/6mO40oij6F1EUXYqi6Pkoin4viqL2oz6mg0YURZ+Pomg+iqLng9/1RFH0zSiKXn7t7/xRHuNB4yaf5f987Rp7NoqiP4iiKHeUx3jQ2O+zBP/3v0RRFEdR1HcUx7Y3kgR+8MA2960Q/8nMvh7H8b1m9qA16OeKomjEzH7dzB6J4/iimTWbVMGNEl8wsw/t+d2/NLM/j+P4rJn9+Ws/N0J8wV7/Wb5pZhfjOH6Hmb1kZv/qzT6oO4wv2Os/i0VRdNzMftbMxt/sA7pZJAn8APFWss2NoihjZu8zs//HzCyO4604jktHe1SHipSZdURRlDKztJlNH/HxHDjiOP5rM1vc8+uPmdnvvPbv3zGzj7+pB3WHsd9nieP4G3Ecb7/24/fN7NibfmB3EDf5XszM/i+T+2rdMD+SBH6wuK1tbgPFPWZWNLP/9hok9NtRFHUe9UHdScRxPGVm/8FUEc2Y2XIcx9842qM6dAzEcTxjZvba34UjPp67Fb9sZl876oO404ii6KOmqWPPHPWxhJEk8NvEQW1zGyhSZvYuM/svcRw/ZGar1jjb9Jp4DR/+mJmdMrNhM+uMoigxVKuziKLoN8xs28x+96iP5U4iiqK0mf2Gmf3vR30seyNJ4LePt5pt7qSZTcZx/IPXfv6SKaE3YvyMmY3FcVyM47hqZl82s/cc8TEdNuaiKBoyM3vt7/kjPp5DRRRFv2hmHzGzz8SNKzo5bSoSnnktDxwzsx9FUTR4pEdlSQK/bbzVbHPjOJ41s4kois6/9qsPmtkLR3hIh4lxM3t3FEXpKIoi02dpyIZsEF8xs1987d+/aGZ/dITHcqiIouhDZva/mdlH4zheO+rjudOI4/i5OI4LcRyffC0PTJrZu167l440kgT+9oxfM7PfjaLoWTN7p5n9H0d8PHcUr+0ivmRmPzKz50zXc11KnveLKIp+z8z+1szOR1E0GUXR/2Rm/97MfjaKopdNjId/f5THeNC4yWf5TTPrNrNvRlH0dBRF//VID/KAcZPPUpeRSOmTSCKJJBo0kgo8iSSSSKJBI0ngSSSRRBINGkkCTyKJJJJo0EgSeBJJJJFEg0aSwJNIIokkGjSSBJ5EEkkk0aCRJPAkkkgiiQaN/x8dcX53obyelQAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import mlai\n", + "import mlai.plot as plot" ], + "id": "f169de61-7368-4f03-b326-6ef75d0755cb" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "plt.plot(hospital_data.longitude, hospital_data.latitude,'ro', alpha=0.01)" - ] + "fig, ax = plt.subplots(figsize=plot.big_figsize)\n", + "ax.plot(data.longitude, data.latitude, 'ro', alpha=0.01)\n", + "ax.set_xlabel('longitude')\n", + "ax.set_ylabel('latitude')\n", + "\n", + "mlai.write_figure('nigerian-health-facilities.png', directory='./ml')" + ], + "id": "6f621a86-5ada-4e65-92f9-48badfd400f6" }, { "cell_type": "markdown", - "id": "1042a712", "metadata": {}, "source": [ - "There we have the location of these different hospitals. We set alpha in\n", - "the plot to 0.01 to make the dots transparent, so we can see the\n", - "locations of each health center." - ] + "\n", + "\n", + "Figure: Location of the over thirty-four thousand health facilities\n", + "registered in the NMIS data across Nigeria. Each facility plotted\n", + "according to its latitude and longitude." + ], + "id": "44788536-f449-491d-a2b0-871e15786e9f" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "hospital_data = data" + ], + "id": "bc157f71-4d3e-41f2-8ebd-08a064d3a42e" }, { "cell_type": "markdown", - "id": "036e8aba", "metadata": {}, "source": [ - "Administrative Zone Geo Data\n", - "----------------------------\n", + "## Administrative Zone Geo Data\n", + "\n", + "\\[edit\\]\n", "\n", "A very common operation is the need to map from locations in a country\n", "to the administrative regions. If we were building a ride sharing app,\n", @@ -930,10 +609,10 @@ "could know how many riders we had in different city areas.\n", "\n", "Administrative regions have various names like cities, counties,\n", - "districts or states. These conversions for the administrative regions\n", + "districts, or states. These conversions for the administrative regions\n", "are important for getting the right information to the right people.\n", "\n", - "Of course, if we had a knowlegdeable Nigerian, we could ask her about\n", + "Of course, if we had a knowledgeable Nigerian, we could ask her about\n", "what the right location for each of these health facilities is, which\n", "state is it in? But given that we have the latitude and longitude, we\n", "should be able to find out automatically what the different states are.\n", @@ -949,34 +628,115 @@ "They have been made available by the [Humanitarian Data\n", "Exchange](https://data.humdata.org/), you can also find other states\n", "data from the same site." 
- ] + ], + "id": "b23c6eeb-223a-4a3c-9edb-8d9717300cc5" }, { - "cell_type": "code", - "execution_count": 8, - "id": "1cd93f7b", + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import zipfile" - ] + "## Nigerian Administrative Zones Data\n", + "\n", + "\\[edit\\]\n", + "\n", + "For ease of use we’ve packaged this data set in the `pods` library" + ], + "id": "c7084525-45f7-4550-91ab-30901e727a60" }, { "cell_type": "code", - "execution_count": 9, - "id": "e90d978c", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "data = pods.datasets.nigerian_administrative_zones()['Y']\n", + "data.set_index(\"admin1Name_en\", inplace=True)\n", + "data.head()" + ], + "id": "a45a41c2-7f71-4b1d-82bd-182533fe46a9" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively you can access the data directly with the following\n", + "commands.\n", + "\n", + "``` python\n", + "import zipfile\n", + "\n", "admin_zones_url = 'https://data.humdata.org/dataset/81ac1d38-f603-4a98-804d-325c658599a3/resource/0bc2f7bb-9ff6-40db-a569-1989b8ffd3bc/download/nga_admbnda_osgof_eha_itos.gdb.zip'\n", "_, msg = urllib.request.urlretrieve(admin_zones_url, 'nga_admbnda_osgof_eha_itos.gdb.zip')\n", - "with zipfile.ZipFile('nga_admbnda_osgof_eha_itos.gdb.zip', 'r') as zip_ref:\n", - " zip_ref.extractall('nga_admbnda_osgof_eha_itos.gdb')" - ] + "with zipfile.ZipFile('/content/nga_admbnda_osgof_eha_itos.gdb.zip', 'r') as zip_ref:\n", + " zip_ref.extractall('/content/nga_admbnda_osgof_eha_itos.gdb')\n", + "\n", + "import geopandas as gpd\n", + "import fiona\n", + "\n", + "states_file = \"./nga_admbnda_osgof_eha_itos.gdb/nga_admbnda_osgof_eha_itos.gdb/nga_admbnda_osgof_eha_itos.gdb/nga_admbnda_osgof_eha_itos.gdb/\"\n", + "\n", + "layers = fiona.listlayers(states_file)\n", + "data = gpd.read_file(states_file, layer=1)\n", + "data.crs = \"EPSG:4326\"\n", + "data = data.set_index('admin1Name_en')\n", + " \n", + "```" + ], + "id": "6c915ec0-5c49-4e74-9e01-f8f16343700f" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import mlai\n", + "import mlai.plot as plot" + ], + "id": "bea028f6-e900-4da3-8ac6-3ff16c462d5e" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(figsize=plot.big_figsize)\n", + "data.plot(ax=ax, color='white', edgecolor='black')\n", + "ax.set_xlabel('longitude')\n", + "ax.set_ylabel('latitude')\n", + "\n", + "mlai.write_figure('nigerian-state-borders.svg', directory='./ml')" + ], + "id": "eb6fe05d-970a-413f-a565-d6b321ca3305" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "Figure: Border locations for the thirty-six different states of\n", + "Nigeria." 
+ ], + "id": "8791c8a9-d107-4a1d-9338-5f24947e6651" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zones_gdf = data\n", + "zones_gdf['admin1Name_en'] = zones_gdf.index" + ], + "id": "70c7efa9-2e00-45b9-890e-fb483e2978bf" }, { "cell_type": "markdown", - "id": "8584ac77", "metadata": {}, "source": [ "Now we have this data of the outlines of the different states in\n", @@ -1003,19 +763,18 @@ "coordinates of individual health facilities (which we already converted\n", "to the appropriate `Point` type when moving the health data to a\n", "GeoDataFrame.)" - ] + ], + "id": "b5834f5e-ea5f-4f00-80ba-59483ebe6ce7" }, { "cell_type": "markdown", - "id": "525e9b77", "metadata": {}, "source": [ - "Joining a GeoDataFrame\n", - "----------------------\n", + "## Joining a GeoDataFrame\n", "\n", "The first database join we’re going to do is a special one, it’s a\n", - "‘spatial join’. We’re going to join together the locations of the\n", - "hospitals with their states.\n", + "‘spatial join’. We’re going to join the locations of the hospitals with\n", + "their states.\n", "\n", "This join is unusual because it requires some mathematics to get right.\n", "The outline files give us the borders of the different states in\n", @@ -1026,137 +785,120 @@ "belongs to. Fortunately, the mathematics you need is already programmed\n", "for you in GeoPandas. That means all we need to do is convert our\n", "`pandas` dataframe of health facilities into a `GeoDataFrame` which\n", - "allows us to do the spatial join." - ] + "allows us to do the spatial join.\n", + "\n", + "First, we convert the hospital data to a `geopandas` data frame." + ], + "id": "b9a66674-b01c-419b-9358-de77fb7e024d" }, { "cell_type": "code", - "execution_count": 10, - "id": "db249d99", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import geopandas as gpd" - ] + ], + "id": "8d002657-a7c9-4743-abd9-788d159e6097" }, { "cell_type": "code", - "execution_count": 11, - "id": "b1c2304d", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "hosp_gdf = gpd.GeoDataFrame(\n", - " hospital_data, geometry=gpd.points_from_xy(hospital_data.longitude, hospital_data.latitude))\n", + "geometry = gpd.points_from_xy(hospital_data.longitude, hospital_data.latitude)\n", + "hosp_gdf = gpd.GeoDataFrame(hospital_data, \n", + " geometry=geometry)\n", "hosp_gdf.crs = \"EPSG:4326\"" - ] + ], + "id": "bed94194-2d0f-42e6-9c1d-396e3ebf035a" }, { "cell_type": "markdown", - "id": "59d9a338", "metadata": {}, "source": [ "There are some technial details here: the `crs` refers to the coordinate\n", "system in use by a particular GeoDataFrame. `EPSG:4326` is the standard\n", "coordinate system of latitude/longitude." - ] + ], + "id": "2cb1dee1-3bad-44ee-a854-9c0d17527f6b" }, { "cell_type": "markdown", - "id": "e7712d27", "metadata": {}, "source": [ - "Your First Join: Converting GPS Coordinates to States\n", - "-----------------------------------------------------\n", + "## Your First Join: Converting GPS Coordinates to States\n", "\n", "Now we have the data in the `GeoPandas` format, we can start converting\n", "into states. We will use the [`fiona`](https://pypi.org/project/Fiona/)\n", "library for reading the right layers from the files. Before we do the\n", "join, lets plot the location of health centers and states on the same\n", "map." 
- ] + ], + "id": "37517ab2-bd79-4384-bca0-07238dab3f17" }, { "cell_type": "code", - "execution_count": 12, - "id": "b75f9523", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "\n", - "import fiona" - ] + "world_gdf = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))\n", + "world_gdf.crs = \"EPSG:4326\"\n", + "nigeria_gdf = world_gdf[(world_gdf['name'] == 'Nigeria')]" + ], + "id": "35e27957-c44c-4cd5-8415-1f3158fdd2bf" }, { "cell_type": "code", - "execution_count": 13, - "id": "340ec265", + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "ImportError", - "evalue": "The descartes package is required for plotting polygons in geopandas. You can install it using 'conda install -c conda-forge descartes' or 'pip install descartes'.", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m~/anaconda3/lib/python3.6/site-packages/geopandas/plotting.py\u001b[0m in \u001b[0;36m_plot_polygon_collection\u001b[0;34m(ax, geoms, values, color, cmap, vmin, vmax, **kwargs)\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 120\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mdescartes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpatch\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mPolygonPatch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 121\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mImportError\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'descartes'", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mworld\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcrs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"EPSG:4326\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mnigeria\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mworld\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mworld\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'name'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'Nigeria'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mbase\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnigeria\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'white'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0medgecolor\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'black'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0malpha\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfigsize\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m11\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m11\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mlayers\u001b[0m \u001b[0;34m=\u001b[0m 
[... remainder of the descartes ImportError traceback and base64 PNG data for the removed map output truncated ...]\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import mlai.plot as plot\n", + "import mlai" ], + "id": "9fa2dd6a-b5b8-4974-b342-3dacf232f14e" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "states_file = \"nga_admbnda_osgof_eha_itos.gdb/nga_admbnda_osgof_eha_itos.gdb/nga_admbnda_osgof_eha_itos.gdb/nga_admbnda_osgof_eha_itos.gdb/\"\n", + "fig, ax = plt.subplots(figsize=plot.big_figsize)\n", + "nigeria_gdf.plot(ax=ax, color='white', edgecolor='black', alpha=0)\n", + "zones_gdf.plot(ax=ax, color='white', edgecolor='black')\n", + "hosp_gdf.plot(ax=ax, color='b', alpha=0.02)\n", + "ax.set_xlabel('longitude')\n", + "ax.set_ylabel('latitude')\n", "\n", - "# geopandas included map, filtered to just Nigeria\n", - "world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))\n", - "world.crs = \"EPSG:4326\"\n", - "nigeria = world[(world['name'] == 'Nigeria')]\n", - "base = nigeria.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))\n", - "\n", - "layers = fiona.listlayers(states_file)\n", - "zones_gdf = gpd.read_file(states_file, layer=1)\n", - "zones_gdf.crs = \"EPSG:4326\"\n", - "zones_gdf = zones_gdf.set_index('admin1Name_en')\n", - "zones_gdf.plot(ax=base, color='white', edgecolor='black')\n", - "\n", - "# We can now plot our ``GeoDataFrame``.\n", - "hosp_gdf.plot(ax=base, color='b', alpha=0.02, )\n", + "mlai.write_figure('nigeria-states-and-health-facilities.svg', directory='./ml')" + ], + "id": "dc4deada-1c37-4ab8-aba9-9e4377823db3" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", "\n", - "plt.show()" - ] + "Figure: The outline of the thirty-six different states of nigeria\n", + "with the location sof the health centers plotted on the map." + ], + "id": "9682fa64-d4aa-4645-95cb-168d6ddb2a31" }, { "cell_type": "markdown", - "id": "97ac1323", "metadata": {}, "source": [ - "Performing the Spatial Join\n", - "---------------------------\n", + "## Performing the Spatial Join\n", "\n", "We’ve now plotted the different health center locations across the\n", "states. You can clearly see that each of the dots falls within a\n", - "different state. For helping the visualisation, we’ve made the dots\n", + "different state. For helping the visualization, we’ve made the dots\n", "somewhat transparent (we set the `alpha` in the plot). This means that\n", "we can see the regions where there are more health centers, you should\n", "be able to spot where the major cities in Nigeria are given the\n", @@ -1167,80 +909,80 @@ "`GeoPandas` provides us with the spatial join. Here we’re going to do a\n", "[`left` or `outer`\n", "join](https://en.wikipedia.org/wiki/Join_(SQL)#Left_outer_join)." - ] + ], + "id": "dc505ed5-c0d0-4141-be37-85db59a6f340" }, { "cell_type": "code", "execution_count": null, - "id": "4ad0c444", "metadata": {}, "outputs": [], "source": [ "from geopandas.tools import sjoin" - ] + ], + "id": "4e2ab850-a067-4fb6-a8b2-c5777ca7982b" }, { "cell_type": "markdown", - "id": "0c5895fa", "metadata": {}, "source": [ "We have two GeoPandas data frames, `hosp_gdf` and `zones_gdf`. Let’s\n", "have a look at the columns the contain." 
- ] + ], + "id": "269a4b98-70f7-4732-a37f-f54889116e88" }, { "cell_type": "code", "execution_count": null, - "id": "bd3f6eae", "metadata": {}, "outputs": [], "source": [ "hosp_gdf.columns" - ] + ], + "id": "eac229a0-98c4-4d6f-915e-7fe1e2c99fb5" }, { "cell_type": "markdown", - "id": "841e856e", "metadata": {}, "source": [ "We can see that this is the GeoDataFrame containing the information\n", "about the hospital. Now let’s have a look at the `zones_gdf` data frame." - ] + ], + "id": "920deb10-9802-4232-8944-a597a5563484" }, { "cell_type": "code", "execution_count": null, - "id": "5c4641b0", "metadata": {}, "outputs": [], "source": [ "zones_gdf.columns" - ] + ], + "id": "af60a622-c0e7-4d2c-8bd5-e0f3ebaecaee" }, { "cell_type": "markdown", - "id": "4efda939", "metadata": {}, "source": [ "You can see that this data frame has a different set of columns. It has\n", "all the different administrative regions. But there is one column name\n", "that overlaps. We can find it by looking for the intersection between\n", "the two sets." - ] + ], + "id": "85ca9607-306a-4f8c-84b8-52cd6299341b" }, { "cell_type": "code", "execution_count": null, - "id": "72114d0b", "metadata": {}, "outputs": [], "source": [ "set(hosp_gdf.columns).intersection(set(zones_gdf.columns))" - ] + ], + "id": "8dadbfeb-2b50-4169-b570-7b0595131722" }, { "cell_type": "markdown", - "id": "3444a292", "metadata": {}, "source": [ "Here we’ve converted the lists of columns into python ‘sets’, and then\n", @@ -1253,21 +995,21 @@ "on geographical locations, if the join was on customer name or some\n", "other discrete variable, we could do the join in pandas or directly in\n", "SQL." - ] + ], + "id": "b45b4314-4bca-4df9-83ed-37c63ac42a8e" }, { "cell_type": "code", "execution_count": null, - "id": "5085bb4d", "metadata": {}, "outputs": [], "source": [ "hosp_state_joined = sjoin(hosp_gdf, zones_gdf, how='left')" - ] + ], + "id": "5cad2909-decd-408c-997c-75a409681882" }, { "cell_type": "markdown", - "id": "3cceea5e", "metadata": {}, "source": [ "The intersection of the two data frames indicates how the two data\n", @@ -1276,24 +1018,24 @@ "together on two pieces of metal. If the holes don’t match, the join\n", "can’t be done. There has to be an intersection.\n", "\n", - "But what will the result look like? Well the join should be the ‘union’\n", + "But what will the result look like? Well, the join should be the ‘union’\n", "of the two data frames. We can have a look at what the union should be\n", "by (again) converting the columns to sets." - ] + ], + "id": "dc3c96c8-b6f9-424a-9aa9-2acb51e0c003" }, { "cell_type": "code", "execution_count": null, - "id": "cde93dd2", "metadata": {}, "outputs": [], "source": [ "set(hosp_gdf.columns).union(set(zones_gdf.columns))" - ] + ], + "id": "97f2ef6e-3de3-46aa-ac97-d2e25c0c594e" }, { "cell_type": "markdown", - "id": "b34ca270", "metadata": {}, "source": [ "That gives a list of all the columns (notice that ‘geometry’ only\n", @@ -1304,21 +1046,21 @@ "new column: `index_right`. The two original data bases had separate\n", "indices. The `index_right` column represents the index from the\n", "`zones_gdf`, which is the Nigerian state." - ] + ], + "id": "6f982542-f6c7-4719-8fb8-d332bb41213c" }, { "cell_type": "code", "execution_count": null, - "id": "ba54160e", "metadata": {}, "outputs": [], "source": [ "set(hosp_state_joined.columns)" - ] + ], + "id": "de5c926a-a69d-4363-9fbc-ea7771017f59" }, { "cell_type": "markdown", - "id": "f8019f71", "metadata": {}, "source": [ "Great! They are all there! 
We have completed our join. We had two\n", @@ -1326,25 +1068,28 @@ "hospitals. But by performing an ‘outer’ or a ‘left’ join, we now have a\n", "single data frame with all the information in the same place! Let’s have\n", "a look at the first frew entries in the new data frame." - ] + ], + "id": "be2217a2-4a44-4a1a-afd6-b57fe4c1f011" }, { "cell_type": "code", "execution_count": null, - "id": "2aa496aa", "metadata": {}, "outputs": [], "source": [ "hosp_state_joined.head()" - ] + ], + "id": "b9289463-9881-4499-947c-671e5bc8df3c" }, { "cell_type": "markdown", - "id": "d6ae432e", "metadata": {}, "source": [ - "SQL Database\n", - "------------\n", + "## SQL Database\n", + "\n", + "\\[edit\\]\n", "\n", "Our first join was a special one, because it involved spatial data. That\n", "meant using the special `gdb` format and the `GeoPandas` tool for\n", @@ -1356,19 +1101,21 @@ "playing with database commands on your local machine. For a real system,\n", "you would need to set up a server to run the database. The server is a\n", "separate machine with the job of answering database queries. SQLite\n", - "pretends to be a proper database, but doesn’t require us to go to the\n", + "pretends to be a proper database but doesn’t require us to go to the\n", "extra work of setting up a server. Popular SQL server software includes\n", - "[`MySQL`](https://www.mysql.com/) which is free or [Microsoft’s SQL\n", - "Server](https://www.microsoft.com/en-gb/sql-server/sql-server-2019).\n", + "[`MariaDB`](https://mariadb.org/) which is open source, or [Microsoft’s\n", + "SQL Server](https://www.microsoft.com/en-gb/sql-server/sql-server-2019).\n", "\n", "A typical machine learning installation might have you running a\n", "database from a cloud service (such as AWS, Azure or Google Cloud\n", - "Platform). That cloud service would host the database for you and you\n", + "Platform). That cloud service would host the database for you, and you\n", "would pay according to the number of queries made.\n", "\n", "Many start-up companies were formed on the back of a `MySQL` server\n", - "hosted on top of AWS. You can [read how to do that\n", - "here](https://aws.amazon.com/getting-started/hands-on/create-mysql-db/).\n", + "hosted on top of AWS. Although since MySQL was sold to Sun, and then\n", + "passed on to Oracle, the open source community has turned its attention\n", + "to `MariaDB`, here’s the [AWS instructions on how to set up\n", + "`MariaDB`](https://aws.amazon.com/getting-started/hands-on/create-mariadb-db/).\n", "\n", "If you were designing your own ride hailing app, or any other major\n", "commercial software you would want to investigate whether you would need\n", @@ -1380,15 +1127,18 @@ "production ML system) we’ll also give the equivalent `pandas` commands,\n", "which would often be what you would use when you’re doing data analysis\n", "in `python` and `Jupyter`." - ] + ], + "id": "156afa6d-b366-49e2-98ea-e58b76bc1e70" }, { "cell_type": "markdown", - "id": "2bdc6802", "metadata": {}, "source": [ - "Create the SQLite Database\n", - "--------------------------\n", + "## Create the SQLite Database\n", + "\n", + "\\[edit\\]\n", "\n", "The beautiful thing about SQLite is that it allows us to play with SQL\n", "without going to the work of setting up a proper SQL server. Creating a\n", @@ -1396,41 +1146,41 @@ "database, we’ll first write our joined data to a CSV file, then we’ll\n", "use a little utility to convert our hospital database into a SQLite\n", "database." 
- ] + ], + "id": "3324a769-c424-414f-8148-98c1fb5d4019" }, { "cell_type": "code", "execution_count": null, - "id": "a7e8889d", "metadata": {}, "outputs": [], "source": [ - "hosp_state_joined.to_csv('facilities.csv')" - ] + "hosp_state_joined.to_csv(\"hospitals_zones_joined.csv\")" + ], + "id": "e0f61f56-94d1-4fdd-ad9d-815aa5b5ab3b" }, { "cell_type": "code", "execution_count": null, - "id": "bf345450", "metadata": {}, "outputs": [], "source": [ "%pip install csv-to-sqlite" - ] + ], + "id": "dfc5f012-9c47-4589-bd9e-86891c09bab8" }, { "cell_type": "code", "execution_count": null, - "id": "5d819b08", "metadata": {}, "outputs": [], "source": [ - "!csv-to-sqlite -f facilities.csv -t full -o db.sqlite" - ] + "!csv-to-sqlite -f hospitals_zones_joined.csv -t full -o db.sqlite" + ], + "id": "c0accaf5-3fb2-41dd-9441-5cc3e2907db2" }, { "cell_type": "markdown", - "id": "5e4febc1", "metadata": {}, "source": [ "Rather than being installed on a separate server, SQLite simply stores\n", @@ -1438,40 +1188,63 @@ "\n", "In the database there can be several ‘tables’. Each table can be thought\n", "of as like a separate dataframe. The table name we’ve just saved is\n", - "‘hospitals\\_zones\\_joined’." - ] + "‘hospitals_zones_joined’." + ], + "id": "590ffdeb-6fb5-403d-8075-2c2c4ac522dd" }, { "cell_type": "markdown", - "id": "27030945", "metadata": {}, "source": [ - "Accessing the SQL Database\n", - "--------------------------\n", + "## Accessing the SQL Database\n", "\n", "Now that we have a SQL database, we can create a connection to it and\n", "query it using SQL commands. Let’s try to simply select the data we\n", - "wrote to it, to make sure its the same.\n", + "wrote to it, to make sure it’s the same.\n", "\n", "Start by making a connection to the database. This will often be done\n", "via remote connections, but for this example we’ll connect locally to\n", - "the database using the filepath directly." - ] + "the database using the filepath directly.\n", + "\n", + "To access a data base, the first thing that is made is a connection.\n", + "Then SQL is used to extract the information required. A typical SQL\n", + "command is `SELECT`. It allows us to extract rows from a given table. It\n", + "operates a bit like the `.head()` method in `pandas`, it will return the\n", + "first `N` rows (by default the `.head()` command returns the first 5\n", + "rows, but you can set `N` to whatever you like. Here we’ve included a\n", + "default value of 5 to make it match the `pandas` command.\n", + "\n", + "We do this using an `execute` command on the connection.\n", + "\n", + "Typically, its good software engineering practice to ‘wrap’ the database\n", + "command in some python code. This allows the commands to be maintained.\n", + "You will also be asked to do this in your final assessment, including\n", + "re-writing some of the code - pay attention to the slight syntax\n", + "differences and multi-statement queries.Below we wrap the SQL command\n", + "\n", + " SELECT * FROM table_name LIMIT N\n", + "\n", + "in python code. This SQL command selects the first `N` entries from a\n", + "given database called `table_name`.\n", + "\n", + "We can pass the `table_name` and number of rows, `n`, to the python\n", + "command." 
+ ], + "id": "a8bfb8f0-ef6a-4560-8e85-4d4509a6dce3" }, { "cell_type": "code", "execution_count": null, - "id": "e6fd2166", "metadata": {}, "outputs": [], "source": [ "import sqlite3" - ] + ], + "id": "048ceaaa-6342-4811-834a-ebb3a8e8d6fd" }, { "cell_type": "code", "execution_count": null, - "id": "4f23c8c6", "metadata": {}, "outputs": [], "source": [ @@ -1488,21 +1261,21 @@ " print(e)\n", "\n", " return conn" - ] + ], + "id": "0012adc1-a954-4e2d-89c5-8ca082f6f7c3" }, { "cell_type": "code", "execution_count": null, - "id": "1150e6af", "metadata": {}, "outputs": [], "source": [ "conn = create_connection(\"db.sqlite\")" - ] + ], + "id": "97234f30-a175-46e1-9af6-dbef5cf9d869" }, { "cell_type": "markdown", - "id": "f0a7f5f4", "metadata": {}, "source": [ "Now that we have a connection, we can write a command and pass it to the\n", @@ -1513,30 +1286,17 @@ "command is `SELECT`. It allows us to extract rows from a given table. It\n", "operates a bit like the `.head()` method in `pandas`, it will return the\n", "first `N` rows (by default the `.head()` command returns the first 5\n", - "rows, but you can set `n` to whatever you like. Here we’ve included a\n", + "rows, but you can set `N` to whatever you like. Here we’ve included a\n", "default value of 5 to make it match the `pandas` command.\n", "\n", "The python library, `sqlite3`, allows us to access the SQL database\n", - "directly from python. We do this using an `execute` command on the\n", - "connection.\n", - "\n", - "Typically, its good software engineering practice to ‘wrap’ the database\n", - "command in some python code. This allows the commands to be maintained.\n", - "Below we wrap the SQL command\n", - "\n", - " SELECT * FROM [table_name] LIMIT : N\n", - "\n", - "in python code. This SQL command selects the first `N` entries from a\n", - "given database called `table_name`.\n", - "\n", - "We can pass the `table_name` and number of rows, `N` to the python\n", - "command." - ] + "directly from python." + ], + "id": "eb595c74-f37f-4793-a1f3-9f02d13a2857" }, { "cell_type": "code", "execution_count": null, - "id": "5003ba78", "metadata": {}, "outputs": [], "source": [ @@ -1548,27 +1308,27 @@ " :param n: Number of rows to query\n", " \"\"\"\n", " cur = conn.cursor()\n", - " cur.execute(f\"SELECT * FROM [{table}] LIMIT :limitNum\", {\"limitNum\": n})\n", + " cur.execute(f'SELECT * FROM {table} LIMIT {n}')\n", "\n", " rows = cur.fetchall()\n", " return rows" - ] + ], + "id": "b2d6aa3b-5666-4f9c-b899-85b941507f86" }, { "cell_type": "markdown", - "id": "5970024f", "metadata": {}, "source": [ "Let’s have a go at calling the command to extract the first three\n", "facilities from our health center database. Let’s try creating a\n", - "function that does the same thing the pandas .head() method does so we\n", + "function that does the same thing the pandas `.head()` method does so we\n", "can inspect our database." 
- ] + ], + "id": "56aecafc-cffe-4348-94f4-cc8a521d207b" }, { "cell_type": "code", "execution_count": null, - "id": "bd2a75b7", "metadata": {}, "outputs": [], "source": [ @@ -1576,134 +1336,221 @@ " rows = select_top(conn, table, n)\n", " for r in rows:\n", " print(r)" - ] + ], + "id": "25cddf48-2cc6-4b51-ab50-08ea3ed23eeb" }, { "cell_type": "code", "execution_count": null, - "id": "a0e262b9", "metadata": {}, "outputs": [], "source": [ - "head(conn, 'facilities')" - ] + "head(conn, \"hospitals_zones_joined\")" + ], + "id": "5b8824aa-2e9f-4c90-8622-13377606b4f1" }, { "cell_type": "markdown", - "id": "c81bef14", "metadata": {}, "source": [ - "Great! We now have the data base in SQLite, and some python functions\n", - "that operate on the data base by wrapping SQL commands.\n", + "Great! We now have the database in and some python functions that\n", + "operate on the data base by wrapping SQL commands.\n", "\n", "We will return to the SQL command style after download and add the other\n", "datasets to the database using a combination of `pandas` and the\n", - "`csv-to-sqlite` utility.\n", + "database utilities.\n", "\n", "Our next task will be to introduce data on COVID19 so that we can join\n", "that to our other data sets." - ] + ], + "id": "ad665ce5-b4ab-4087-b1f2-15c985c1dc08" }, { "cell_type": "markdown", - "id": "948bca93", "metadata": {}, "source": [ - "Covid Data\n", - "----------\n", + "## Covid Data\n", + "\n", + "\\[edit\\]\n", "\n", "Now we have the health data, we’re going to combine it with [data about\n", "COVID-19 cases in Nigeria over\n", "time](https://github.com/dsfsi/covid19africa). This data is kindly\n", - "provided by Africa open COVID-19 data working group, which Elaine\n", - "Nsoesie has been working with. The data is taken from Twitter, and only\n", - "goes up until May 2020.\n", + "provided by Africa open COVID-19 data working group, which [Elaine\n", + "Nsoesie](https://www.bu.edu/sph/profile/elaine-nsoesie/) has been\n", + "working with. The data is taken from Twitter, and only goes up until May\n", + "2020.\n", "\n", - "They provide their data in github. We can access the cases we’re\n", + "They provide their data in GitHub. We can access the cases we’re\n", "interested in from the following URL.\n", "\n", + "\n", + "\n", "For convenience, we’ll load the data into pandas first, but our next\n", "step will be to create a new SQLite table containing the data. Then\n", "we’ll join that table to our existing tables." - ] + ], + "id": "3dedc866-fa97-4421-986f-7647a9b7f925" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Nigerian COVID Data\n", + "\n", + "\\[edit\\]\n", + "\n", + "At the beginning of the COVID-19 outbreak, the Consortium for African\n", + "COVID-19 Data formed to bring together data from across the African\n", + "continent on COVID-19 cases (Marivate et al., 2020). 
These cases are\n", + "recorded in the following GitHub repository:\n", + ".\n", + "\n", + "For ease of use we’ve packaged this data set in the `pods` library" + ], + "id": "3d39c6f3-e03a-492d-978c-8940f2044aa1" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pods" + ], + "id": "cfa6c63c-8495-4cfb-b0d7-979aba06c6ac" }, { "cell_type": "code", "execution_count": null, - "id": "13ad6017", "metadata": {}, "outputs": [], "source": [ - "covid_data_url = 'https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-nigeria.csv'\n", - "covid_data_csv = 'cases.csv'\n", - "urllib.request.urlretrieve(covid_data_url, covid_data_csv)\n", - "covid_data = pd.read_csv(covid_data_csv)" - ] + "data = pods.datasets.nigerian_covid()['Y']\n", + "data.head()" + ], + "id": "87984557-8ccc-4ce5-a39c-ef3378a15dcf" }, { "cell_type": "markdown", - "id": "2b7ac867", "metadata": {}, "source": [ - "As normal, we should inspect our data to check that it contains what we\n", - "expect." - ] + "Alternatively, you can access the data directly with the following\n", + "commands.\n", + "\n", + "``` python\n", + "import urllib.request\n", + "import pandas as pd\n", + "\n", + "urllib.request.urlretrieve('https://raw.githubusercontent.com/dsfsi/covid19africa/master/data/line_lists/line-list-nigeria.csv', 'line-list-nigeria.csv')\n", + "data = pd.read_csv('line-list-nigeria.csv', parse_dates=['date', \n", + " 'date_confirmation', \n", + " 'date_admission_hospital', \n", + " 'date_onset_symptoms',\n", + " 'death_date'])\n", + "```\n", + "\n", + "Once it is loaded in the data can be summarized using the `describe`\n", + "method in pandas." + ], + "id": "8b73decb-8e66-4619-8730-6dde4e6bce88" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.describe()" + ], + "id": "9b0cb616-de45-4e43-877e-a4347dd5a04e" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import mlai\n", + "import mlai.plot as plot" + ], + "id": "dd83bf84-7ffd-4671-992e-6eee49a1f098" }, { "cell_type": "code", "execution_count": null, - "id": "84982973", "metadata": {}, "outputs": [], "source": [ - "covid_data.head()" - ] + "fig, ax = plt.subplots(figsize=plot.big_wide_figsize)\n", + "data['count_column'] = True\n", + "fig.autofmt_xdate(rotation=45)\n", + "ax.plot(data.date, data.count_column.cumsum())\n", + "\n", + "ax.plot()\n", + "ax.set_xlabel('date')\n", + "ax.set_ylabel('case counts')\n", + "\n", + "mlai.write_figure('nigerian-covid-data.svg', directory='./datasets')" + ], + "id": "0088efd9-02a8-4699-a6fd-05c6331eff68" }, { "cell_type": "markdown", - "id": "fc698290", "metadata": {}, "source": [ - "And we can get an idea of all the information in the data from looking\n", - "at the columns." - ] + "\n", + "\n", + "Figure: Evolution of COVID-19 cases in Nigeria." + ], + "id": "8ef62652-45b1-458e-9884-849b171a23d3" }, { "cell_type": "code", "execution_count": null, - "id": "0ce0c5e3", "metadata": {}, "outputs": [], "source": [ - "covid_data.columns" - ] + "covid_data=data\n", + "covid_data.to_csv('cases.csv')" + ], + "id": "5bd2cb75-166e-42c6-97a2-4f43074ae2d7" }, { "cell_type": "markdown", - "id": "034d4d36", "metadata": {}, "source": [ "Now we convert this CSV file we’ve downloaded into a new table in the\n", - "database file. We can do this, again, with the csv-to-sqlite script." 
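The cumulative curve plotted above is built by summing a column of ones; the same information can be computed directly from the line list with a `groupby`. This is a minimal sketch, assuming `data` is the COVID line-list DataFrame loaded above with its `date` column parsed as datetimes.

``` python
# daily case counts, then the running total over time
daily_cases = data.groupby(data['date'].dt.date).size()
cumulative_cases = daily_cases.cumsum()
print(cumulative_cases.tail())
```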
- ] + "database file.\n", + "\n", + "We can do this, again, with the csv-to-sqlite script." + ], + "id": "d53aa2dc-f4fe-4077-bc2d-82dc5f1a42ac" }, { "cell_type": "code", "execution_count": null, - "id": "bf5550df", "metadata": {}, "outputs": [], "source": [ "!csv-to-sqlite -f cases.csv -t full -o db.sqlite" - ] + ], + "id": "1e0ccf80-d4cb-480d-aa63-fe1dbc6e693d" }, { "cell_type": "markdown", - "id": "6ae426ec", "metadata": {}, "source": [ - "Population Data\n", - "---------------\n", + "## Population Data\n", + "\n", + "\\[edit\\]\n", "\n", "Now we have information about COVID cases, and we have information about\n", "how many health centers and how many doctors and nurses there are in\n", @@ -1713,114 +1560,141 @@ "\n", "If we were running our ride hailing service, we would also need\n", "information about how many people there were in different areas, so we\n", - "could understand what the *demand* for the boda boda rides might be.\n", + "could understand what the demand for the boda boda rides might be.\n", "\n", "To access the number of people we can get population statistics from the\n", "[Humanitarian Data Exchange](https://data.humdata.org/).\n", "\n", "We also want to have population data for each state in Nigeria, so that\n", "we can see attributes like whether there are zones of high health\n", - "facility density but low population density." - ] + "facility density but low population density.\n", + "\n", + "``` python\n", + "import urllib\n", + "\n", + "pop_url = \"https://data.humdata.org/dataset/a7c3de5e-ff27-4746-99cd-05f2ad9b1066/resource/d9fc551a-b5e4-4bed-9d0d-b047b6961817/download/nga_admpop_adm1_2020.csv\"\n", + "_, msg = urllib.request.urlretrieve(pop_url,\"nga_admpop_adm1_2020.csv\")\n", + "data = pd.read_csv(\"nga_admpop_adm1_2020.csv\")\n", + "```\n", + "\n", + "To do joins with this data, we must first make sure that the columns\n", + "have the right names. The name should match the same name of the column\n", + "in our existing data. 
So we reset the column names, and the name of the\n", + "index, as follows.\n", + "\n", + "``` python\n", + "data.dropna(axis=0, how=\"all\", inplace=True)\n", + "data.dropna(axis=1, how=\"all\", inplace=True)\n", + "data.rename(columns = {\"ADM0_NAME\" : \"admin0Name_en\", \n", + " \"ADM0_PCODE\" : \"admin0Pcode\", \n", + " \"ADM1_NAME\" : \"admin1Name_en\", \n", + " \"ADM1_PCODE\" : \"admin1Pcode\", \n", + " \"T_TL\" : \"population\"},\n", + " inplace=True)\n", + "data[\"admin0Name_en\"] = data[\"admin0Name_en\"].str.title()\n", + "data[\"admin1Name_en\"] = data[\"admin1Name_en\"].str.title()\n", + " \n", + "data = data.set_index(\"admin1Name_en\")\n", + "```" + ], + "id": "bbfd2dec-ddd4-480e-b8ce-c719c27ea6bd" }, { "cell_type": "code", "execution_count": null, - "id": "a525bf66", "metadata": {}, "outputs": [], "source": [ - "pop_url = 'https://data.humdata.org/dataset/a7c3de5e-ff27-4746-99cd-05f2ad9b1066/resource/d9fc551a-b5e4-4bed-9d0d-b047b6961817/download/nga_pop_adm1_2016.csv'\n", - "_, msg = urllib.request.urlretrieve(pop_url,'nga_pop_adm1_2016.csv')\n", - "pop_data = pd.read_csv('nga_pop_adm1_2016.csv')" - ] + "data = pods.datasets.nigerian_population()[\"Y\"]" + ], + "id": "242f0a36-3cf1-43bd-bfd0-466b21467721" }, { "cell_type": "code", "execution_count": null, - "id": "14ada4a5", "metadata": {}, "outputs": [], "source": [ - "pop_data.head()" - ] - }, - { - "cell_type": "markdown", - "id": "69d3ead5", - "metadata": {}, - "source": [ - "To do joins with this data, we must first make sure that the columns\n", - "have the right names. The name should match the same name of the column\n", - "in our existing data. So we reset the column names, and the name of the\n", - "index, as follows." - ] + "data.head()" + ], + "id": "22e6228d-5b49-4629-8764-859a8dc96142" }, { "cell_type": "code", "execution_count": null, - "id": "7e21595f", "metadata": {}, "outputs": [], "source": [ - "pop_data.columns = ['admin1Name_en', 'admin1Pcode', 'admin0Name_en', 'admin0Pcode', 'population']\n", - "pop_data = pop_data.set_index('admin1Name_en')" - ] + "pop_data=data" + ], + "id": "bf49baf7-c0ca-443d-ba96-0f98a143d0e3" }, { "cell_type": "markdown", - "id": "b6d16737", "metadata": {}, "source": [ "When doing this for real world data, you should also make sure that the\n", "names used in the rows are the same across the different data bases. For\n", "example, has someone decided to use an abbreviation for ‘Federal Capital\n", "Territory’ and set it as ‘FCT’. The computer won’t understand these are\n", - "the same states, and if you do a join with such data you can get\n", + "the same states, and if you do a join with such data, you can get\n", "duplicate entries or missing entries. This sort of thing happens a lot\n", "in real world data and takes a lot of time to sort out. Fortunately, in\n", - "this case, the data is well curated and we don’t have these problems." - ] + "this case, the data is well curated, and we don’t have these problems." + ], + "id": "4d9cdeee-a84c-402e-91de-96fc17b1b1fd" }, { "cell_type": "markdown", - "id": "e4c4c400", "metadata": {}, "source": [ - "Save to database file\n", - "---------------------\n", + "## Save to database file\n", "\n", "The next step is to add this new CSV file as an additional table in our\n", - "SQLite database. This is done using the script as before." - ] + "database." 
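Before saving, it is often worth normalizing the key column so that later joins line up. The snippet below is a toy illustration (hypothetical values, not the NMIS or population tables) of how a mismatched spelling such as 'FCT' versus 'Federal Capital Territory' silently drops a row from an inner join, and how renaming the index fixes it.

``` python
import pandas as pd

left = pd.DataFrame({'population': [100]}, index=['FCT'])
right = pd.DataFrame({'cases': [10]}, index=['Federal Capital Territory'])

# the keys don't match, so the inner join returns an empty frame
print(left.join(right, how='inner'))

# map the abbreviation to the canonical name, then the join succeeds
left = left.rename(index={'FCT': 'Federal Capital Territory'})
print(left.join(right, how='inner'))
```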
+ ], + "id": "1f9fe873-2cb6-4394-b86c-4f42b6522b6f" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading the Population Data into the SQLite Database\n", + "\n", + "\\[edit\\]\n", + "\n", + "We can load the data into the SQLite database using the script as\n", + "before." + ], + "id": "5d09def5-6eab-43a5-aa21-6e83ea395d04" }, { "cell_type": "code", "execution_count": null, - "id": "8ccd03e7", "metadata": {}, "outputs": [], "source": [ "pop_data.to_csv('pop_data.csv')" - ] + ], + "id": "2ea483f0-4f2a-44a6-bed3-dcd701e07539" }, { "cell_type": "code", "execution_count": null, - "id": "79e6964d", "metadata": {}, "outputs": [], "source": [ "!csv-to-sqlite -f pop_data.csv -t full -o db.sqlite" - ] + ], + "id": "7e54c4dc-4699-435a-a60b-55d1bd723907" }, { "cell_type": "markdown", - "id": "62f8f385", "metadata": {}, "source": [ - "Computing per capita hospitals and COVID\n", - "----------------------------------------\n", + "## Computing per capita hospitals and COVID\n", "\n", "The Minister of Health in Abuja may be interested in which states are\n", "most vulnerable to COVID19. We now have all the information in our SQL\n", @@ -1828,14 +1702,20 @@ "and what the COVID19 situation is.\n", "\n", "To do this, we will use the `JOIN` operation from SQL and introduce a\n", - "new operation called `GROUPBY`.\n", - "\n", - "#### Joining in Pandas\n", + "new operation called `GROUPBY`." + ], + "id": "43f37d96-aed7-4cac-9fd5-9cd3e04dc59b" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Joining in Pandas\n", "\n", "As before, these operations can be done in pandas or GeoPandas. Before\n", "we create the SQL commands, we’ll show how you can do that in pandas.\n", "\n", - "In pandas, the equivalent of a database table is a dataframe. So the\n", + "In `pandas`, the equivalent of a database table is a dataframe. So, the\n", "JOIN operation takes two dataframes and joins them based on the key. The\n", "key is that special shared column between the two tables. The place\n", "where the ‘holes align’ so the two databases can be joined together.\n", @@ -1846,31 +1726,31 @@ "\n", "This is sometimes where problems can creep in. If in one table Abuja’s\n", "state is encoded as ‘FCT’ or ‘FCT-Abuja’, and in another table it’s\n", - "encoded as ‘Federal Capital Territory’, they won’t match and that data\n", + "encoded as ‘Federal Capital Territory’, they won’t match, and that data\n", "wouldn’t appear in the joined table.\n", "\n", "In simple terms, a JOIN operation takes two tables (or dataframes) and\n", "combines them based on some key, in this case the index of the Pandas\n", "data frame which is the state name." - ] + ], + "id": "a9dcd40c-85ac-4fb2-bffa-d2e3acd5e3e7" }, { "cell_type": "code", "execution_count": null, - "id": "19e56178", "metadata": {}, "outputs": [], "source": [ + "zones_gdf.set_index(\"admin1Name_en\", inplace=True)\n", "pop_joined = zones_gdf.join(pop_data['population'], how='inner')" - ] + ], + "id": "f6895105-0655-4ceb-bb01-7f82ba44d96a" }, { "cell_type": "markdown", - "id": "3137bff4", "metadata": {}, "source": [ - "GroupBy in Pandas\n", - "-----------------\n", + "## GroupBy in Pandas\n", "\n", "Our COVID19 data is in the form of individual cases. But we are\n", "interested in total case counts for each state. 
There is a special data\n", @@ -1885,153 +1765,162 @@ "such as to count the rows in each group, or to sum or take the mean over\n", "the values in some column (imagine each case row had the age of the\n", "patient, and you were interested in the mean age of patients.)" - ] + ], + "id": "c5747c36-52a8-4c3c-b212-acf10aa62ea4" }, { "cell_type": "code", "execution_count": null, - "id": "32008ffa", "metadata": {}, "outputs": [], "source": [ "covid_cases_by_state = covid_data.groupby(['province/state']).count()['case_id']" - ] + ], + "id": "9615e6f0-033e-409f-9d56-fc9160f63b35" }, { "cell_type": "markdown", - "id": "5138a50b", "metadata": {}, "source": [ "The `.groupby()` method on the dataframe has now given us a new data\n", "series that contains the total number of covid cases in each state. We\n", "can examine it to check we have something sensible." - ] + ], + "id": "c02f95b6-51e8-4693-8719-d25f04707b1e" }, { "cell_type": "code", "execution_count": null, - "id": "3d7f60d8", "metadata": {}, "outputs": [], "source": [ "covid_cases_by_state" - ] + ], + "id": "81afb692-4691-4b1b-8f2f-d629042dab90" }, { "cell_type": "markdown", - "id": "f240cb48", "metadata": {}, "source": [ - "Now we have this new data series, it can be added to the pandas data\n", - "frame as a new column." - ] + "Now we have this new data series, it can be added to the pandas\n", + "dataframe as a new column." + ], + "id": "eb022ae7-4cda-4eae-ba5e-97103a978e01" }, { "cell_type": "code", "execution_count": null, - "id": "e703b61f", "metadata": {}, "outputs": [], "source": [ "pop_joined['covid_cases_by_state'] = covid_cases_by_state" - ] + ], + "id": "7ab1d5be-2fd7-4691-86f7-907b6292324d" }, { "cell_type": "markdown", - "id": "ba33abde", "metadata": {}, "source": [ "The spatial join we did on the original data frame to obtain\n", - "hosp\\_state\\_joined introduced a new column, index\\_right which contains\n", + "hosp_state_joined introduced a new column, `index_right` that contains\n", "the state of each of the hospitals. Let’s have a quick look at it below." - ] + ], + "id": "85287173-538e-496b-83c6-95384841b59b" }, { "cell_type": "code", "execution_count": null, - "id": "1ac1c160", "metadata": {}, "outputs": [], "source": [ "hosp_state_joined['index_right']" - ] + ], + "id": "2d29904d-33f3-44f1-b57f-5786ce62a095" }, { "cell_type": "markdown", - "id": "f071c427", "metadata": {}, "source": [ "To count the hospitals in each of the states, we first create a grouped\n", "series where we’ve grouped on these states." - ] + ], + "id": "1a36f96b-9dbe-4b35-9e4a-0332b25671fa" }, { "cell_type": "code", "execution_count": null, - "id": "99a7f974", "metadata": {}, "outputs": [], "source": [ - "grouped = hosp_state_joined.groupby('index_right')" - ] + "grouped = hosp_state_joined.groupby('admin1Name_en')" + ], + "id": "5e0e1293-c603-4f72-97a3-210f77aa2de9" }, { "cell_type": "markdown", - "id": "04d4631d", "metadata": {}, "source": [ "This python operation now goes through each of the groups and counts how\n", "many hospitals there are in each state. It stores the result in a\n", - "dictionary. If you’re new to Python, then to understand this code you\n", + "dictionary. If you’re new to python, then to understand this code you\n", "need to understand what a ‘dictionary comprehension’ is. In this case\n", "the dictionary comprehension is being used to create a python dictionary\n", "of states and total hospital counts. That’s then being converted into a\n", "`pandas` Data Series and added to the `pop_joined` dataframe." 
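The next cell builds the counts with a dictionary comprehension, as described above. For reference, a grouped object can also report its group sizes directly; the sketch below (assuming `grouped` is the GroupBy object created above) returns the same per-state hospital counts as a pandas Series in one step.

``` python
# .size() counts the rows in each group and returns a pandas Series
hospital_counts = grouped.size()
print(hospital_counts.head())
```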
- ] + ], + "id": "310ef6bd-08a2-4156-a7ba-5af71719ca3a" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ], + "id": "0ba8c126-8305-47e5-9e1d-22bc2f50445d" }, { "cell_type": "code", "execution_count": null, - "id": "5c50a0e7", "metadata": {}, "outputs": [], "source": [ "counted_groups = {k: len(v) for k, v in grouped.groups.items()}\n", "pop_joined['hosp_state'] = pd.Series(counted_groups)" - ] + ], + "id": "a50604aa-56e6-4ecf-b246-51399d4de23d" }, { "cell_type": "markdown", - "id": "42f29c61", "metadata": {}, "source": [ "For convenience, we can now add a new data series to the data frame that\n", "contains the per capita information about hospitals. that makes it easy\n", "to retrieve later." - ] + ], + "id": "bc054bcb-2584-4f18-9285-8499c9a04779" }, { "cell_type": "code", "execution_count": null, - "id": "baec3556", "metadata": {}, "outputs": [], "source": [ "pop_joined['hosp_per_capita_10k'] = (pop_joined['hosp_state'] * 10000 )/ pop_joined['population']" - ] + ], + "id": "dc883ad4-bd2a-404e-b8d8-a0319639f3c2" }, { "cell_type": "markdown", - "id": "1d1df617", "metadata": {}, "source": [ - "SQL-style\n", - "---------\n", + "## SQL-style\n", "\n", "That’s the `pandas` approach to doing it. But `pandas` itself is\n", - "inspired by database language, in particular relational databases such\n", - "as SQL. To do these types of joins at scale, e.g. for our ride hailing\n", - "app, we need to see how to do these joins in a database.\n", + "inspired by database languages, in particular relational databases such\n", + "as SQL. To do these types of joins at scale, e.g., for a ride hailing\n", + "app, we need to do these joins in a database.\n", "\n", "As before, we’ll wrap the underlying SQL commands with a convenient\n", "python command.\n", @@ -2040,13 +1929,13 @@ "command](https://www.w3schools.com/sql/sql_select.asp), which extracts\n", "`FROM` a particular table. It then completes an\n", "[`INNER JOIN`](https://www.w3schools.com/sql/sql_join_inner.asp) using\n", - "particular columns (`provice/state` and `index_right`)" - ] + "particular columns (`province/state` and `admin1Name_en`)" + ], + "id": "b15ca88d-c5fe-482c-b89f-8595d6190638" }, { "cell_type": "code", "execution_count": null, - "id": "132f2e16", "metadata": {}, "outputs": [], "source": [ @@ -2056,159 +1945,236 @@ " \"\"\"\n", " cur = conn.cursor()\n", " cur.execute(\"\"\"\n", - " SELECT ct.[province/state] as [state], ct.[case_count], ft.[facility_count]\n", + " SELECT ct.`province/state` as state, ct.case_count, ft.facility_count\n", " FROM\n", - " (SELECT [province/state], COUNT(*) as [case_count] FROM [cases] GROUP BY [province/state]) ct\n", + " (SELECT `province/state`, COUNT(*) as case_count FROM cases GROUP BY `province/state`) ct\n", " INNER JOIN \n", - " (SELECT [index_right], COUNT(*) as [facility_count] FROM [facilities] GROUP BY [index_right]) ft\n", + " (SELECT admin1Name_en, COUNT(*) as facility_count FROM hospitals_zones_joined GROUP BY admin1Name_en) ft\n", " ON\n", - " ct.[province/state] = ft.[index_right]\n", + " ct.`province/state` = ft.admin1Name_en\n", " \"\"\")\n", "\n", " rows = cur.fetchall()\n", " return rows" - ] + ], + "id": "2d36f94b-9507-46c7-8b33-a8deb290fb6c" }, { "cell_type": "markdown", - "id": "4b61896f", "metadata": {}, "source": [ "Now we’ve created our python wrapper, we can connect to the data base\n", "and run our SQL command on the database using the wrapper." 
- ] + ], + "id": "7d4cf587-5206-4a61-98e7-70dabc18d14d" }, { "cell_type": "code", "execution_count": null, - "id": "fe5f21cd", "metadata": {}, "outputs": [], "source": [ "conn = create_connection(\"db.sqlite\")" - ] + ], + "id": "8d4ede0f-3eb1-47bc-a1e8-70b2c0ee3b80" }, { "cell_type": "code", "execution_count": null, - "id": "5c4edd2a", "metadata": {}, "outputs": [], "source": [ "state_cases_hosps = join_counts(conn)" - ] + ], + "id": "5d9815c5-b1e7-45a8-b9f2-ea4ffa5e36d4" }, { "cell_type": "code", "execution_count": null, - "id": "1a03858b", "metadata": {}, "outputs": [], "source": [ "for row in state_cases_hosps:\n", " print(\"State {} \\t\\t Covid Cases {} \\t\\t Health Facilities {}\".format(row[0], row[1], row[2]))" - ] + ], + "id": "4dd5953e-b6e2-4415-9029-72bf72e34fe7" }, { "cell_type": "code", "execution_count": null, - "id": "c421edda", "metadata": {}, "outputs": [], "source": [ - "base = nigeria.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))\n", + "base = nigeria_gdf.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))\n", "pop_joined.plot(ax=base, column='population', edgecolor='black', legend=True)\n", "base.set_title(\"Population of Nigerian States\")" - ] + ], + "id": "9c7a58f6-0bcf-46b8-8ad0-0b33ab4136eb" }, { "cell_type": "code", "execution_count": null, - "id": "be53c352", "metadata": {}, "outputs": [], "source": [ - "base = nigeria.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))\n", + "base = nigeria_gdf.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))\n", "pop_joined.plot(ax=base, column='hosp_per_capita_10k', edgecolor='black', legend=True)\n", "base.set_title(\"Hospitals Per Capita (10k) of Nigerian States\")" - ] + ], + "id": "738d90e0-82d8-4f59-b100-ca1155349cf3" }, { "cell_type": "markdown", - "id": "0ff80566", "metadata": {}, "source": [ - "::: {.cell .markdown}\n", + "### Exercise 1\n", + "\n", + "Add a new column the dataframe for covid cases per 10,000 population, in\n", + "the same way we computed health facilities per 10k capita." + ], + "id": "84052d05-e19c-4bcd-87d0-bbb64386b0f3" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your answer to Exercise 1 here\n", + "\n", + "\n", + "\n", + "\n" + ], + "id": "700c002b-2838-4bb6-aa63-b0fb8233293a" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exercise 2\n", + "\n", + "Add a new column for covid cases per health facility." + ], + "id": "66d4670a-9842-4f74-9753-10d9aa1b72d3" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your answer to Exercise 2 here\n", "\n", - "Exercise\n", - "--------\n", "\n", - "1. Add a new column the dataframe for covid cases per 10,000\n", - " population, in the same way we computed health facilities per 10k\n", - " capita.\n", "\n", - "2. Add a new column for covid cases per health facility.\n", + "\n" + ], + "id": "fb3861ab-b49a-43a4-815f-9dbefcd10cf9" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exercise 3\n", "\n", "Do this in both the SQL and the Pandas styles to get a feel for how they\n", - "differ.\n", + "differ." 
+ ], + "id": "753812a9-c2f8-4b00-b722-1c80d3580260" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your answer to Exercise 3 here\n", + "\n", + "\n", + "\n", + "\n" + ], + "id": "f2f258e2-c935-4e55-a122-d1e32d793f05" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exercise 4\n", + "\n", + "Perform an inner join using SQL on your databases and convert the result\n", + "into a `pandas` DataFrame." + ], + "id": "2e4f36b4-7e44-4c4c-979b-22cd7390a0e5" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Write your answer to Exercise 4 here\n", + "\n", + "\n", "\n", - "{:::" - ] + "\n" + ], + "id": "3e279494-79f6-4269-8168-ac0d65c2f19c" }, { "cell_type": "code", "execution_count": null, - "id": "7bf114dc", "metadata": {}, "outputs": [], "source": [ + "\n", "# pop_joined['cases_per_capita_10k'] = ???\n", "# pop_joined['cases_per_facility'] = ???" - ] + ], + "id": "aac06e79-50c0-4206-9815-4a8f8a96fbfb" }, { "cell_type": "code", "execution_count": null, - "id": "cce92a95", "metadata": {}, "outputs": [], "source": [ - "base = nigeria.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))\n", + "base = nigeria_gdf.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))\n", "pop_joined.plot(ax=base, column='cases_per_capita_10k', edgecolor='black', legend=True)\n", "base.set_title(\"Covid Cases Per Capita (10k) of Nigerian States\")" - ] + ], + "id": "0f464b0f-dadd-4b2f-8dd4-7296ea437410" }, { "cell_type": "code", "execution_count": null, - "id": "702c54c1", "metadata": {}, "outputs": [], "source": [ - "base = nigeria.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))\n", + "base = nigeria_gdf.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))\n", "pop_joined.plot(ax=base, column='covid_cases_by_state', edgecolor='black', legend=True)\n", "base.set_title(\"Covid Cases by State\")" - ] + ], + "id": "f31401f4-5bae-43a6-abff-01d98761dd4a" }, { "cell_type": "code", "execution_count": null, - "id": "68940fd8", "metadata": {}, "outputs": [], "source": [ - "base = nigeria.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))\n", + "base = nigeria_gdf.plot(color='white', edgecolor='black', alpha=0, figsize=(11, 11))\n", "pop_joined.plot(ax=base, column='cases_per_facility', edgecolor='black', legend=True)\n", "base.set_title(\"Covid Cases per Health Facility\")" - ] + ], + "id": "30b4a5cc-0454-457a-b3d2-46bc623bdb36" }, { "cell_type": "markdown", - "id": "31d8c1c3", "metadata": {}, "source": [ - "Thanks!\n", - "-------\n", + "## Thanks!\n", "\n", "For more information on these subjects and more you might want to check\n", "the following resources.\n", @@ -2219,37 +2185,33 @@ " Page](http://www.theguardian.com/profile/neil-lawrence)\n", "- blog:\n", " [http://inverseprobability.com](http://inverseprobability.com/blog.html)" - ] + ], + "id": "98c2e522-fc18-48d6-bf88-04a23963d302" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References" + ], + "id": "c75788f5-00bb-4e10-a4f4-da8d005243d4" }, { "cell_type": "markdown", - "id": "aa93382e", "metadata": {}, "source": [ - "References\n", - "----------" - ] + "Marivate, V., Nsoesie, E., Bekele, E., Africa open COVID-19 data working\n", + "group, 2020. Coronavirus COVID-19 (2019-nCoV) Data\n", + "Repository for Africa. 
\n", + "\n", + "The Office of the Senior Special Assistant to the President on the\n", + "Millennium Development Goals (OSSAP-MDGs), Columbia University, 2014.\n", + "Nigeria NMIS facility database." + ], + "id": "6ae372af-4e54-4156-86bd-ea85bc5731c2" } ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" - } - }, "nbformat": 4, - "nbformat_minor": 5 + "nbformat_minor": 5, + "metadata": {} } diff --git a/_notebooks/03-bayesian-methods-abuja.ipynb b/_notebooks/03-bayesian-methods-abuja.ipynb index f7a2167..564f3d4 100644 --- a/_notebooks/03-bayesian-methods-abuja.ipynb +++ b/_notebooks/03-bayesian-methods-abuja.ipynb @@ -4,15 +4,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Bayesian Methods\n", - "================\n", + "# Bayesian Methods\n", "\n", "### [Neil D. Lawrence](http://inverseprobability.com), Amazon Cambridge\n", "\n", - "and University of Sheffield \\#\\#\\# [Oluwasanmi\n", - "Koyejo](https://sanmi.cs.illinois.edu/), Google and University of\n", - "Illinois \\#\\#\\# 2018-11-14" - ] + "and University of Sheffield\n", + "\n", + "### [Oluwasanmi Koyejo](https://sanmi.cs.illinois.edu/), Google and\n", + "\n", + "University of Illinois\n", + "\n", + "### 2018-11-14" + ], + "id": "635dac69-8cd8-4b4b-b9ac-2e9106cd1213" }, { "cell_type": "markdown", @@ -24,309 +28,24 @@ "practice to Naive Bayesian classification. In this session we review the\n", "probabilistic formulation of a classification model, reviewing initially\n", "maximum likelihood and the naive Bayes model." 
- ] + ], + "id": "efc361ac-eff9-4c78-9b90-32328f379948" }, { "cell_type": "markdown", "metadata": {}, "source": [ "$$\n", - "\\newcommand{\\tk}[1]{}\n", - "\\newcommand{\\Amatrix}{\\mathbf{A}}\n", - "\\newcommand{\\KL}[2]{\\text{KL}\\left( #1\\,\\|\\,#2 \\right)}\n", - "\\newcommand{\\Kaast}{\\kernelMatrix_{\\mathbf{ \\ast}\\mathbf{ \\ast}}}\n", - "\\newcommand{\\Kastu}{\\kernelMatrix_{\\mathbf{ \\ast} \\inducingVector}}\n", - "\\newcommand{\\Kff}{\\kernelMatrix_{\\mappingFunctionVector \\mappingFunctionVector}}\n", - "\\newcommand{\\Kfu}{\\kernelMatrix_{\\mappingFunctionVector \\inducingVector}}\n", - "\\newcommand{\\Kuast}{\\kernelMatrix_{\\inducingVector \\bf\\ast}}\n", - "\\newcommand{\\Kuf}{\\kernelMatrix_{\\inducingVector \\mappingFunctionVector}}\n", - "\\newcommand{\\Kuu}{\\kernelMatrix_{\\inducingVector \\inducingVector}}\n", - "\\newcommand{\\Kuui}{\\Kuu^{-1}}\n", - "\\newcommand{\\Qaast}{\\mathbf{Q}_{\\bf \\ast \\ast}}\n", - "\\newcommand{\\Qastf}{\\mathbf{Q}_{\\ast \\mappingFunction}}\n", - "\\newcommand{\\Qfast}{\\mathbf{Q}_{\\mappingFunctionVector \\bf \\ast}}\n", - "\\newcommand{\\Qff}{\\mathbf{Q}_{\\mappingFunctionVector \\mappingFunctionVector}}\n", - "\\newcommand{\\aMatrix}{\\mathbf{A}}\n", - "\\newcommand{\\aScalar}{a}\n", - "\\newcommand{\\aVector}{\\mathbf{a}}\n", - "\\newcommand{\\acceleration}{a}\n", - "\\newcommand{\\bMatrix}{\\mathbf{B}}\n", - "\\newcommand{\\bScalar}{b}\n", - "\\newcommand{\\bVector}{\\mathbf{b}}\n", - "\\newcommand{\\basisFunc}{\\phi}\n", - "\\newcommand{\\basisFuncVector}{\\boldsymbol{ \\basisFunc}}\n", - "\\newcommand{\\basisFunction}{\\phi}\n", - "\\newcommand{\\basisLocation}{\\mu}\n", - "\\newcommand{\\basisMatrix}{\\boldsymbol{ \\Phi}}\n", - "\\newcommand{\\basisScalar}{\\basisFunction}\n", - "\\newcommand{\\basisVector}{\\boldsymbol{ \\basisFunction}}\n", - "\\newcommand{\\activationFunction}{\\phi}\n", - "\\newcommand{\\activationMatrix}{\\boldsymbol{ \\Phi}}\n", - "\\newcommand{\\activationScalar}{\\basisFunction}\n", - "\\newcommand{\\activationVector}{\\boldsymbol{ \\basisFunction}}\n", - "\\newcommand{\\bigO}{\\mathcal{O}}\n", - "\\newcommand{\\binomProb}{\\pi}\n", - "\\newcommand{\\cMatrix}{\\mathbf{C}}\n", - "\\newcommand{\\cbasisMatrix}{\\hat{\\boldsymbol{ \\Phi}}}\n", - "\\newcommand{\\cdataMatrix}{\\hat{\\dataMatrix}}\n", - "\\newcommand{\\cdataScalar}{\\hat{\\dataScalar}}\n", - "\\newcommand{\\cdataVector}{\\hat{\\dataVector}}\n", - "\\newcommand{\\centeredKernelMatrix}{\\mathbf{ \\MakeUppercase{\\centeredKernelScalar}}}\n", - "\\newcommand{\\centeredKernelScalar}{b}\n", - "\\newcommand{\\centeredKernelVector}{\\centeredKernelScalar}\n", - "\\newcommand{\\centeringMatrix}{\\mathbf{H}}\n", - "\\newcommand{\\chiSquaredDist}[2]{\\chi_{#1}^{2}\\left(#2\\right)}\n", - "\\newcommand{\\chiSquaredSamp}[1]{\\chi_{#1}^{2}}\n", - "\\newcommand{\\conditionalCovariance}{\\boldsymbol{ \\Sigma}}\n", - "\\newcommand{\\coregionalizationMatrix}{\\mathbf{B}}\n", - "\\newcommand{\\coregionalizationScalar}{b}\n", - "\\newcommand{\\coregionalizationVector}{\\mathbf{ \\coregionalizationScalar}}\n", - "\\newcommand{\\covDist}[2]{\\text{cov}_{#2}\\left(#1\\right)}\n", - "\\newcommand{\\covSamp}[1]{\\text{cov}\\left(#1\\right)}\n", - "\\newcommand{\\covarianceScalar}{c}\n", - "\\newcommand{\\covarianceVector}{\\mathbf{ \\covarianceScalar}}\n", - "\\newcommand{\\covarianceMatrix}{\\mathbf{C}}\n", - "\\newcommand{\\covarianceMatrixTwo}{\\boldsymbol{ \\Sigma}}\n", - "\\newcommand{\\croupierScalar}{s}\n", - "\\newcommand{\\croupierVector}{\\mathbf{ 
\\croupierScalar}}\n", - "\\newcommand{\\croupierMatrix}{\\mathbf{ \\MakeUppercase{\\croupierScalar}}}\n", - "\\newcommand{\\dataDim}{p}\n", - "\\newcommand{\\dataIndex}{i}\n", - "\\newcommand{\\dataIndexTwo}{j}\n", - "\\newcommand{\\dataMatrix}{\\mathbf{Y}}\n", - "\\newcommand{\\dataScalar}{y}\n", - "\\newcommand{\\dataSet}{\\mathcal{D}}\n", - "\\newcommand{\\dataStd}{\\sigma}\n", - "\\newcommand{\\dataVector}{\\mathbf{ \\dataScalar}}\n", - "\\newcommand{\\decayRate}{d}\n", - "\\newcommand{\\degreeMatrix}{\\mathbf{ \\MakeUppercase{\\degreeScalar}}}\n", - "\\newcommand{\\degreeScalar}{d}\n", - "\\newcommand{\\degreeVector}{\\mathbf{ \\degreeScalar}}\n", - "\\newcommand{\\diag}[1]{\\text{diag}\\left(#1\\right)}\n", - "\\newcommand{\\diagonalMatrix}{\\mathbf{D}}\n", - "\\newcommand{\\diff}[2]{\\frac{\\text{d}#1}{\\text{d}#2}}\n", - "\\newcommand{\\diffTwo}[2]{\\frac{\\text{d}^2#1}{\\text{d}#2^2}}\n", - "\\newcommand{\\displacement}{x}\n", - "\\newcommand{\\displacementVector}{\\textbf{\\displacement}}\n", - "\\newcommand{\\distanceMatrix}{\\mathbf{ \\MakeUppercase{\\distanceScalar}}}\n", - "\\newcommand{\\distanceScalar}{d}\n", - "\\newcommand{\\distanceVector}{\\mathbf{ \\distanceScalar}}\n", - "\\newcommand{\\eigenvaltwo}{\\ell}\n", - "\\newcommand{\\eigenvaltwoMatrix}{\\mathbf{L}}\n", - "\\newcommand{\\eigenvaltwoVector}{\\mathbf{l}}\n", - "\\newcommand{\\eigenvalue}{\\lambda}\n", - "\\newcommand{\\eigenvalueMatrix}{\\boldsymbol{ \\Lambda}}\n", - "\\newcommand{\\eigenvalueVector}{\\boldsymbol{ \\lambda}}\n", - "\\newcommand{\\eigenvector}{\\mathbf{ \\eigenvectorScalar}}\n", - "\\newcommand{\\eigenvectorMatrix}{\\mathbf{U}}\n", - "\\newcommand{\\eigenvectorScalar}{u}\n", - "\\newcommand{\\eigenvectwo}{\\mathbf{v}}\n", - "\\newcommand{\\eigenvectwoMatrix}{\\mathbf{V}}\n", - "\\newcommand{\\eigenvectwoScalar}{v}\n", - "\\newcommand{\\entropy}[1]{\\mathcal{H}\\left(#1\\right)}\n", - "\\newcommand{\\errorFunction}{E}\n", - "\\newcommand{\\expDist}[2]{\\left<#1\\right>_{#2}}\n", - "\\newcommand{\\expSamp}[1]{\\left<#1\\right>}\n", - "\\newcommand{\\expectation}[1]{\\left\\langle #1 \\right\\rangle }\n", - "\\newcommand{\\expectationDist}[2]{\\left\\langle #1 \\right\\rangle _{#2}}\n", - "\\newcommand{\\expectedDistanceMatrix}{\\mathcal{D}}\n", - "\\newcommand{\\eye}{\\mathbf{I}}\n", - "\\newcommand{\\fantasyDim}{r}\n", - "\\newcommand{\\fantasyMatrix}{\\mathbf{ \\MakeUppercase{\\fantasyScalar}}}\n", - "\\newcommand{\\fantasyScalar}{z}\n", - "\\newcommand{\\fantasyVector}{\\mathbf{ \\fantasyScalar}}\n", - "\\newcommand{\\featureStd}{\\varsigma}\n", - "\\newcommand{\\gammaCdf}[3]{\\mathcal{GAMMA CDF}\\left(#1|#2,#3\\right)}\n", - "\\newcommand{\\gammaDist}[3]{\\mathcal{G}\\left(#1|#2,#3\\right)}\n", - "\\newcommand{\\gammaSamp}[2]{\\mathcal{G}\\left(#1,#2\\right)}\n", - "\\newcommand{\\gaussianDist}[3]{\\mathcal{N}\\left(#1|#2,#3\\right)}\n", - "\\newcommand{\\gaussianSamp}[2]{\\mathcal{N}\\left(#1,#2\\right)}\n", - "\\newcommand{\\given}{|}\n", - "\\newcommand{\\half}{\\frac{1}{2}}\n", - "\\newcommand{\\heaviside}{H}\n", - "\\newcommand{\\hiddenMatrix}{\\mathbf{ \\MakeUppercase{\\hiddenScalar}}}\n", - "\\newcommand{\\hiddenScalar}{h}\n", - "\\newcommand{\\hiddenVector}{\\mathbf{ \\hiddenScalar}}\n", - "\\newcommand{\\identityMatrix}{\\eye}\n", - "\\newcommand{\\inducingInputScalar}{z}\n", - "\\newcommand{\\inducingInputVector}{\\mathbf{ \\inducingInputScalar}}\n", - "\\newcommand{\\inducingInputMatrix}{\\mathbf{Z}}\n", - "\\newcommand{\\inducingScalar}{u}\n", - 
"\\newcommand{\\inducingVector}{\\mathbf{ \\inducingScalar}}\n", - "\\newcommand{\\inducingMatrix}{\\mathbf{U}}\n", - "\\newcommand{\\inlineDiff}[2]{\\text{d}#1/\\text{d}#2}\n", - "\\newcommand{\\inputDim}{q}\n", - "\\newcommand{\\inputMatrix}{\\mathbf{X}}\n", - "\\newcommand{\\inputScalar}{x}\n", - "\\newcommand{\\inputSpace}{\\mathcal{X}}\n", - "\\newcommand{\\inputVals}{\\inputVector}\n", - "\\newcommand{\\inputVector}{\\mathbf{ \\inputScalar}}\n", - "\\newcommand{\\iterNum}{k}\n", - "\\newcommand{\\kernel}{\\kernelScalar}\n", - "\\newcommand{\\kernelMatrix}{\\mathbf{K}}\n", - "\\newcommand{\\kernelScalar}{k}\n", - "\\newcommand{\\kernelVector}{\\mathbf{ \\kernelScalar}}\n", - "\\newcommand{\\kff}{\\kernelScalar_{\\mappingFunction \\mappingFunction}}\n", - "\\newcommand{\\kfu}{\\kernelVector_{\\mappingFunction \\inducingScalar}}\n", - "\\newcommand{\\kuf}{\\kernelVector_{\\inducingScalar \\mappingFunction}}\n", - "\\newcommand{\\kuu}{\\kernelVector_{\\inducingScalar \\inducingScalar}}\n", - "\\newcommand{\\lagrangeMultiplier}{\\lambda}\n", - "\\newcommand{\\lagrangeMultiplierMatrix}{\\boldsymbol{ \\Lambda}}\n", - "\\newcommand{\\lagrangian}{L}\n", - "\\newcommand{\\laplacianFactor}{\\mathbf{ \\MakeUppercase{\\laplacianFactorScalar}}}\n", - "\\newcommand{\\laplacianFactorScalar}{m}\n", - "\\newcommand{\\laplacianFactorVector}{\\mathbf{ \\laplacianFactorScalar}}\n", - "\\newcommand{\\laplacianMatrix}{\\mathbf{L}}\n", - "\\newcommand{\\laplacianScalar}{\\ell}\n", - "\\newcommand{\\laplacianVector}{\\mathbf{ \\ell}}\n", - "\\newcommand{\\latentDim}{q}\n", - "\\newcommand{\\latentDistanceMatrix}{\\boldsymbol{ \\Delta}}\n", - "\\newcommand{\\latentDistanceScalar}{\\delta}\n", - "\\newcommand{\\latentDistanceVector}{\\boldsymbol{ \\delta}}\n", - "\\newcommand{\\latentForce}{f}\n", - "\\newcommand{\\latentFunction}{u}\n", - "\\newcommand{\\latentFunctionVector}{\\mathbf{ \\latentFunction}}\n", - "\\newcommand{\\latentFunctionMatrix}{\\mathbf{ \\MakeUppercase{\\latentFunction}}}\n", - "\\newcommand{\\latentIndex}{j}\n", - "\\newcommand{\\latentScalar}{z}\n", - "\\newcommand{\\latentVector}{\\mathbf{ \\latentScalar}}\n", - "\\newcommand{\\latentMatrix}{\\mathbf{Z}}\n", - "\\newcommand{\\learnRate}{\\eta}\n", - "\\newcommand{\\lengthScale}{\\ell}\n", - "\\newcommand{\\rbfWidth}{\\ell}\n", - "\\newcommand{\\likelihoodBound}{\\mathcal{L}}\n", - "\\newcommand{\\likelihoodFunction}{L}\n", - "\\newcommand{\\locationScalar}{\\mu}\n", - "\\newcommand{\\locationVector}{\\boldsymbol{ \\locationScalar}}\n", - "\\newcommand{\\locationMatrix}{\\mathbf{M}}\n", - "\\newcommand{\\variance}[1]{\\text{var}\\left( #1 \\right)}\n", - "\\newcommand{\\mappingFunction}{f}\n", - "\\newcommand{\\mappingFunctionMatrix}{\\mathbf{F}}\n", - "\\newcommand{\\mappingFunctionTwo}{g}\n", - "\\newcommand{\\mappingFunctionTwoMatrix}{\\mathbf{G}}\n", - "\\newcommand{\\mappingFunctionTwoVector}{\\mathbf{ \\mappingFunctionTwo}}\n", - "\\newcommand{\\mappingFunctionVector}{\\mathbf{ \\mappingFunction}}\n", - "\\newcommand{\\scaleScalar}{s}\n", - "\\newcommand{\\mappingScalar}{w}\n", - "\\newcommand{\\mappingVector}{\\mathbf{ \\mappingScalar}}\n", - "\\newcommand{\\mappingMatrix}{\\mathbf{W}}\n", - "\\newcommand{\\mappingScalarTwo}{v}\n", - "\\newcommand{\\mappingVectorTwo}{\\mathbf{ \\mappingScalarTwo}}\n", - "\\newcommand{\\mappingMatrixTwo}{\\mathbf{V}}\n", - "\\newcommand{\\maxIters}{K}\n", - "\\newcommand{\\meanMatrix}{\\mathbf{M}}\n", - "\\newcommand{\\meanScalar}{\\mu}\n", - "\\newcommand{\\meanTwoMatrix}{\\mathbf{M}}\n", - 
"\\newcommand{\\meanTwoScalar}{m}\n", - "\\newcommand{\\meanTwoVector}{\\mathbf{ \\meanTwoScalar}}\n", - "\\newcommand{\\meanVector}{\\boldsymbol{ \\meanScalar}}\n", - "\\newcommand{\\mrnaConcentration}{m}\n", - "\\newcommand{\\naturalFrequency}{\\omega}\n", - "\\newcommand{\\neighborhood}[1]{\\mathcal{N}\\left( #1 \\right)}\n", - "\\newcommand{\\neilurl}{http://inverseprobability.com/}\n", - "\\newcommand{\\noiseMatrix}{\\boldsymbol{ E}}\n", - "\\newcommand{\\noiseScalar}{\\epsilon}\n", - "\\newcommand{\\noiseVector}{\\boldsymbol{ \\epsilon}}\n", - "\\newcommand{\\norm}[1]{\\left\\Vert #1 \\right\\Vert}\n", - "\\newcommand{\\normalizedLaplacianMatrix}{\\hat{\\mathbf{L}}}\n", - "\\newcommand{\\normalizedLaplacianScalar}{\\hat{\\ell}}\n", - "\\newcommand{\\normalizedLaplacianVector}{\\hat{\\mathbf{ \\ell}}}\n", - "\\newcommand{\\numActive}{m}\n", - "\\newcommand{\\numBasisFunc}{m}\n", - "\\newcommand{\\numComponents}{m}\n", - "\\newcommand{\\numComps}{K}\n", - "\\newcommand{\\numData}{n}\n", - "\\newcommand{\\numFeatures}{K}\n", - "\\newcommand{\\numHidden}{h}\n", - "\\newcommand{\\numInducing}{m}\n", - "\\newcommand{\\numLayers}{\\ell}\n", - "\\newcommand{\\numNeighbors}{K}\n", - "\\newcommand{\\numSequences}{s}\n", - "\\newcommand{\\numSuccess}{s}\n", - "\\newcommand{\\numTasks}{m}\n", - "\\newcommand{\\numTime}{T}\n", - "\\newcommand{\\numTrials}{S}\n", - "\\newcommand{\\outputIndex}{j}\n", - "\\newcommand{\\paramVector}{\\boldsymbol{ \\theta}}\n", - "\\newcommand{\\parameterMatrix}{\\boldsymbol{ \\Theta}}\n", - "\\newcommand{\\parameterScalar}{\\theta}\n", - "\\newcommand{\\parameterVector}{\\boldsymbol{ \\parameterScalar}}\n", - "\\newcommand{\\partDiff}[2]{\\frac{\\partial#1}{\\partial#2}}\n", - "\\newcommand{\\precisionScalar}{j}\n", - "\\newcommand{\\precisionVector}{\\mathbf{ \\precisionScalar}}\n", - "\\newcommand{\\precisionMatrix}{\\mathbf{J}}\n", - "\\newcommand{\\pseudotargetScalar}{\\widetilde{y}}\n", - "\\newcommand{\\pseudotargetVector}{\\mathbf{ \\pseudotargetScalar}}\n", - "\\newcommand{\\pseudotargetMatrix}{\\mathbf{ \\widetilde{Y}}}\n", - "\\newcommand{\\rank}[1]{\\text{rank}\\left(#1\\right)}\n", - "\\newcommand{\\rayleighDist}[2]{\\mathcal{R}\\left(#1|#2\\right)}\n", - "\\newcommand{\\rayleighSamp}[1]{\\mathcal{R}\\left(#1\\right)}\n", - "\\newcommand{\\responsibility}{r}\n", - "\\newcommand{\\rotationScalar}{r}\n", - "\\newcommand{\\rotationVector}{\\mathbf{ \\rotationScalar}}\n", - "\\newcommand{\\rotationMatrix}{\\mathbf{R}}\n", - "\\newcommand{\\sampleCovScalar}{s}\n", - "\\newcommand{\\sampleCovVector}{\\mathbf{ \\sampleCovScalar}}\n", - "\\newcommand{\\sampleCovMatrix}{\\mathbf{s}}\n", - "\\newcommand{\\scalarProduct}[2]{\\left\\langle{#1},{#2}\\right\\rangle}\n", - "\\newcommand{\\sign}[1]{\\text{sign}\\left(#1\\right)}\n", - "\\newcommand{\\sigmoid}[1]{\\sigma\\left(#1\\right)}\n", - "\\newcommand{\\singularvalue}{\\ell}\n", - "\\newcommand{\\singularvalueMatrix}{\\mathbf{L}}\n", - "\\newcommand{\\singularvalueVector}{\\mathbf{l}}\n", - "\\newcommand{\\sorth}{\\mathbf{u}}\n", - "\\newcommand{\\spar}{\\lambda}\n", - "\\newcommand{\\trace}[1]{\\text{tr}\\left(#1\\right)}\n", - "\\newcommand{\\BasalRate}{B}\n", - "\\newcommand{\\DampingCoefficient}{C}\n", - "\\newcommand{\\DecayRate}{D}\n", - "\\newcommand{\\Displacement}{X}\n", - "\\newcommand{\\LatentForce}{F}\n", - "\\newcommand{\\Mass}{M}\n", - "\\newcommand{\\Sensitivity}{S}\n", - "\\newcommand{\\basalRate}{b}\n", - "\\newcommand{\\dampingCoefficient}{c}\n", - "\\newcommand{\\mass}{m}\n", - 
"\\newcommand{\\sensitivity}{s}\n", - "\\newcommand{\\springScalar}{\\kappa}\n", - "\\newcommand{\\springVector}{\\boldsymbol{ \\kappa}}\n", - "\\newcommand{\\springMatrix}{\\boldsymbol{ \\mathcal{K}}}\n", - "\\newcommand{\\tfConcentration}{p}\n", - "\\newcommand{\\tfDecayRate}{\\delta}\n", - "\\newcommand{\\tfMrnaConcentration}{f}\n", - "\\newcommand{\\tfVector}{\\mathbf{ \\tfConcentration}}\n", - "\\newcommand{\\velocity}{v}\n", - "\\newcommand{\\sufficientStatsScalar}{g}\n", - "\\newcommand{\\sufficientStatsVector}{\\mathbf{ \\sufficientStatsScalar}}\n", - "\\newcommand{\\sufficientStatsMatrix}{\\mathbf{G}}\n", - "\\newcommand{\\switchScalar}{s}\n", - "\\newcommand{\\switchVector}{\\mathbf{ \\switchScalar}}\n", - "\\newcommand{\\switchMatrix}{\\mathbf{S}}\n", - "\\newcommand{\\tr}[1]{\\text{tr}\\left(#1\\right)}\n", - "\\newcommand{\\loneNorm}[1]{\\left\\Vert #1 \\right\\Vert_1}\n", - "\\newcommand{\\ltwoNorm}[1]{\\left\\Vert #1 \\right\\Vert_2}\n", - "\\newcommand{\\onenorm}[1]{\\left\\vert#1\\right\\vert_1}\n", - "\\newcommand{\\twonorm}[1]{\\left\\Vert #1 \\right\\Vert}\n", - "\\newcommand{\\vScalar}{v}\n", - "\\newcommand{\\vVector}{\\mathbf{v}}\n", - "\\newcommand{\\vMatrix}{\\mathbf{V}}\n", - "\\newcommand{\\varianceDist}[2]{\\text{var}_{#2}\\left( #1 \\right)}\n", - "\\newcommand{\\vecb}[1]{\\left(#1\\right):}\n", - "\\newcommand{\\weightScalar}{w}\n", - "\\newcommand{\\weightVector}{\\mathbf{ \\weightScalar}}\n", - "\\newcommand{\\weightMatrix}{\\mathbf{W}}\n", - "\\newcommand{\\weightedAdjacencyMatrix}{\\mathbf{A}}\n", - "\\newcommand{\\weightedAdjacencyScalar}{a}\n", - "\\newcommand{\\weightedAdjacencyVector}{\\mathbf{ \\weightedAdjacencyScalar}}\n", - "\\newcommand{\\onesVector}{\\mathbf{1}}\n", - "\\newcommand{\\zerosVector}{\\mathbf{0}}\n", "$$" - ] + ], + "id": "f84ad120-166e-4626-9c93-1065425036c8" }, { "cell_type": "markdown", "metadata": {}, "source": [ + "::: {.cell .markdown}\n", + "\n", "\n", "\n", "\n", @@ -336,14 +55,18 @@ "" - ] + ], + "id": "b9e351e3-271e-4bcd-9320-afe094ff1d37" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "What is Machine Learning?\n", - "=========================\n", + "# What is Machine Learning?\n", + "\n", + "\\[edit\\]\n", "\n", "What is machine learning? At its most basic level machine learning is a\n", "combination of\n", @@ -364,22 +87,22 @@ "In practice we normally perform machine learning using two functions. To\n", "combine data with a model we typically make use of:\n", "\n", - "**a prediction function** a function which is used to make the\n", - "predictions. It includes our beliefs about the regularities of the\n", - "universe, our assumptions about how the world works, e.g. smoothness,\n", - "spatial similarities, temporal similarities.\n", + "**a prediction function** it is used to make the predictions. It\n", + "includes our beliefs about the regularities of the universe, our\n", + "assumptions about how the world works, e.g., smoothness, spatial\n", + "similarities, temporal similarities.\n", "\n", - "**an objective function** a function which defines the cost of\n", - "misprediction. 
Typically it includes knowledge about the world’s\n", - "generating processes (probabilistic objectives) or the costs we pay for\n", - "mispredictions (empiricial risk minimization).\n", + "**an objective function** it defines the ‘cost’ of misprediction.\n", + "Typically, it includes knowledge about the world’s generating processes\n", + "(probabilistic objectives) or the costs we pay for mispredictions\n", + "(empirical risk minimization).\n", "\n", "The combination of data and model through the prediction function and\n", "the objective function leads to a *learning algorithm*. The class of\n", "prediction functions and objective functions we can make use of is\n", "restricted by the algorithms they lead to. If the prediction function or\n", "the objective function are too complex, then it can be difficult to find\n", - "an appropriate learning algorithm. Much of the acdemic field of machine\n", + "an appropriate learning algorithm. Much of the academic field of machine\n", "learning is the quest for new learning algorithms that allow us to bring\n", "different types of models and data together.\n", "\n", @@ -389,19 +112,25 @@ "Example](https://royalsociety.org/~/media/policy/projects/machine-learning/publications/machine-learning-report.pdf).\n", "\n", "You can also check my post blog post on [What is Machine\n", - "Learning?](http://inverseprobability.com/2017/07/17/what-is-machine-learning).." - ] + "Learning?](http://inverseprobability.com/2017/07/17/what-is-machine-learning)." + ], + "id": "459396c4-9250-472e-8a24-acc7b3dddca1" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Nigerian NMIS Data\n", - "==================\n", + "# Nigeria NMIS Data\n", + "\n", + "\\[edit\\]\n", "\n", - "As an example data set we will use Nigerian NMIS Health Facility data\n", - "from openAFRICA. It can be found here\n", - "https://africaopendata.org/dataset/nigeria-nmis-health-facility-data-2014\n", + "As an example data set we will use Nigerian Millennium Development Goals\n", + "Information System Health Facility (The Office of the Senior Special\n", + "Assistant to the President on the Millennium Development Goals\n", + "(OSSAP-MDGs) and Columbia University, 2014). It can be found here\n", + ".\n", "\n", "Taking from the information on the site,\n", "\n", @@ -424,21 +153,37 @@ "> President at funlola.osinupebi@aptovp.org\n", ">\n", "> To learn more, please visit\n", - "> http://csd.columbia.edu/2014/03/10/the-nigeria-mdg-information-system-nmis-takes-open-data-further/\n", + "> \n", ">\n", "> Suggested citation: Nigeria NMIS facility database (2014), the Office\n", "> of the Senior Special Assistant to the President on the Millennium\n", - "> Development Goals (OSSAP-MDGs) & Columbia University" - ] + "> Development Goals (OSSAP-MDGs) & Columbia University\n", + "\n", + "For ease of use we’ve packaged this data set in the `pods` library" + ], + "id": "ae5e993a-d5d2-4b95-9bf7-07f28ad73997" }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "import urllib.request" - ] + "## pods\n", + "\n", + "\\[edit\\]\n", + "\n", + "In Sheffield we created a suite of software tools for ‘Open Data\n", + "Science’. 
Open data science is an approach to sharing code, models and\n", + "data that should make it easier for companies, health professionals and\n", + "scientists to gain access to data science techniques.\n", + "\n", + "You can also check this blog post on [Open Data\n", + "Science](http://inverseprobability.com/2014/07/01/open-data-science).\n", + "\n", + "The software can be installed using" + ], + "id": "98d4ce23-e3ad-4bb4-9249-4c0f06ed0dc3" }, { "cell_type": "code", @@ -446,8 +191,21 @@ "metadata": {}, "outputs": [], "source": [ - "urllib.request.urlretrieve('https://energydata.info/dataset/f85d1796-e7f2-4630-be84-79420174e3bd/resource/6e640a13-cab4-457b-b9e6-0336051bac27/download/healthmopupandbaselinenmisfacility.csv', 'healthmopupandbaselinenmisfacility.csv')" - ] + "%pip install pods" + ], + "id": "7ae7c2da-669c-4b99-b067-1168b3741792" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "from the command prompt where you can access your python installation.\n", + "\n", + "The code is also available on GitHub: \n", + "\n", + "Once `pods` is installed, it can be imported in the usual manner." + ], + "id": "2f9c0b88-4a83-46f4-b838-d301067ddad9" }, { "cell_type": "code", @@ -455,8 +213,9 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd" - ] + "import pods" + ], + "id": "29e12ec7-66d0-45cf-888a-acf6fff59958" }, { "cell_type": "code", @@ -464,16 +223,30 @@ "metadata": {}, "outputs": [], "source": [ - "data = pd.read_csv('healthmopupandbaselinenmisfacility.csv')" - ] + "data = pods.datasets.nigeria_nmis()['Y']\n", + "data.head()" + ], + "id": "71d554c7-32a6-400d-a9b0-2edf5648233d" }, { "cell_type": "markdown", "metadata": {}, "source": [ + "Alternatively, you can access the data directly with the following\n", + "commands.\n", + "\n", + "``` python\n", + "import urllib.request\n", + "urllib.request.urlretrieve('https://energydata.info/dataset/f85d1796-e7f2-4630-be84-79420174e3bd/resource/6e640a13-cab4-457b-b9e6-0336051bac27/download/healthmopupandbaselinenmisfacility.csv', 'healthmopupandbaselinenmisfacility.csv')\n", + "\n", + "import pandas as pd\n", + "data = pd.read_csv('healthmopupandbaselinenmisfacility.csv')\n", + "```\n", + "\n", "Once it is loaded in the data can be summarized using the `describe`\n", "method in pandas." - ] + ], + "id": "1d3c1ac8-a582-40d6-ae92-d5fe54c1f84f" }, { "cell_type": "code", @@ -482,23 +255,45 @@ "outputs": [], "source": [ "data.describe()" - ] + ], + "id": "0809ee8c-cadc-41b4-8edd-8e33af59e419" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In python and jupyter notebook it is possible to see a list of all\n", - "possible functions and attributes by typing the name of the object\n", - "followed by `.` for example in the above case if we type\n", - "`data.` it show the columns available (these are attributes in\n", - "pandas dataframes) such as `num_nurses_fulltime`, and also functions,\n", + "We can also find out the dimensions of the dataset using the `shape`\n", + "property." + ], + "id": "c30fc32a-ea71-4a5b-acc3-591fb66f2a93" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.shape" + ], + "id": "53b20f89-a6cb-4927-b18f-cad4ad7fc62f" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dataframes have different functions that you can use to explore and\n", + "understand your data. 
In python and the Jupyter notebook it is possible\n", + "to see a list of all possible functions and attributes by typing the\n", + "name of the object followed by `.` for example in the above case if\n", + "we type `data.` it show the columns available (these are attributes\n", + "in pandas dataframes) such as `num_nurses_fulltime`, and also functions,\n", "such as `.describe()`.\n", "\n", "For functions we can also see the documentation about the function by\n", "following the name with a question mark. This will open a box with\n", "documentation at the bottom which can be closed with the x button." - ] + ], + "id": "947507dc-d0ec-449f-8dda-97a1e98aa645" }, { "cell_type": "code", @@ -507,12 +302,97 @@ "outputs": [], "source": [ "data.describe?" - ] + ], + "id": "47e49c2e-8caf-46e0-b585-375ab38c7f06" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import mlai\n", + "import mlai.plot as plot" + ], + "id": "3e958953-7b25-41c6-b2d8-a816a5199f41" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(figsize=plot.big_figsize)\n", + "ax.plot(data.longitude, data.latitude, 'ro', alpha=0.01)\n", + "ax.set_xlabel('longitude')\n", + "ax.set_ylabel('latitude')\n", + "\n", + "mlai.write_figure('nigerian-health-facilities.png', directory='./ml')" + ], + "id": "d4c02825-863b-4fd3-91b0-a99787810204" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "Figure: Location of the over thirty-four thousand health facilities\n", + "registered in the NMIS data across Nigeria. Each facility plotted\n", + "according to its latitude and longitude." + ], + "id": "18f2e7d0-dbde-496b-bf88-f8d25b0576be" }, { "cell_type": "markdown", "metadata": {}, "source": [ + "# Probabilities\n", + "\n", + "\\[edit\\]\n", + "\n", + "We are now going to do some simple review of probabilities and use this\n", + "review to explore some aspects of our data.\n", + "\n", + "A probability distribution expresses uncertainty about the outcome of an\n", + "event. We often encode this uncertainty in a variable. So if we are\n", + "considering the outcome of an event, $Y$, to be a coin toss, then we\n", + "might consider $Y=1$ to be heads and $Y=0$ to be tails. We represent the\n", + "probability of a given outcome with the notation: $$\n", + "P(Y=1) = 0.5\n", + "$$ The first rule of probability is that the probability must normalize.\n", + "The sum of the probability of all events must equal 1. So if the\n", + "probability of heads ($Y=1$) is 0.5, then the probability of tails (the\n", + "only other possible outcome) is given by $$\n", + "P(Y=0) = 1-P(Y=1) = 0.5\n", + "$$\n", + "\n", + "Probabilities are often defined as the limit of the ratio between the\n", + "number of positive outcomes (e.g. 
*heads*) given the number of trials.\n", + "If the number of positive outcomes for event $y$ is denoted by $n$ and\n", + "the number of trials is denoted by $N$ then this gives the ratio $$\n", + "P(Y=y) = \\lim_{N\\rightarrow\n", + "\\infty}\\frac{n_y}{N}.\n", + "$$ In practice we never get to observe an event infinite times, so\n", + "rather than considering this we often use the following estimate $$\n", + "P(Y=y) \\approx \\frac{n_y}{N}.\n", + "$$" + ], + "id": "0aaa36b6-cd8d-4e7b-acf3-6e2452308cca" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exploring the NMIS Data\n", + "\n", + "\\[edit\\]\n", + "\n", "The NMIS facility data is stored in an object known as a ‘data frame’.\n", "Data frames come from the statistical family of programming languages\n", "based on `S`, the most widely used of which is\n", @@ -521,7 +401,8 @@ "method summarizes which columns there are in the data frame and gives us\n", "counts, means, standard deviations and percentiles for the values in\n", "those columns. To access a column directly we can write" - ] + ], + "id": "aa518683-d702-4507-b9e1-039e7163c253" }, { "cell_type": "code", @@ -531,7 +412,8 @@ "source": [ "print(data['num_doctors_fulltime'])\n", "#print(data['num_nurses_fulltime'])" - ] + ], + "id": "c1e360ea-6f5d-4408-8cf3-e0521b85f605" }, { "cell_type": "markdown", @@ -540,7 +422,8 @@ "This shows the number of doctors per facility, number of nurses and\n", "number of community health workers (CHEWS). We can plot the number of\n", "doctors against the number of nurses as follows." - ] + ], + "id": "40882eac-844d-4089-9c1d-573af8be83d6" }, { "cell_type": "code", @@ -548,10 +431,9 @@ "metadata": {}, "outputs": [], "source": [ - "# this ensures the plot appears in the web browser\n", - "%matplotlib inline \n", "import matplotlib.pyplot as plt # this imports the plotting library in python" - ] + ], + "id": "a8ae57eb-776a-4c6c-8546-41a234a305e8" }, { "cell_type": "code", @@ -560,7 +442,8 @@ "outputs": [], "source": [ "_ = plt.plot(data['num_doctors_fulltime'], data['num_nurses_fulltime'], 'rx')" - ] + ], + "id": "fcf23955-2c14-4f76-b17d-ac27901eef11" }, { "cell_type": "markdown", @@ -568,7 +451,8 @@ "source": [ "You may be curious what the arguments we give to `plt.plot` are for, now\n", "is the perfect time to look at the documentation" - ] + ], + "id": "b221a310-5099-480e-9113-083f5fee877e" }, { "cell_type": "code", @@ -577,7 +461,8 @@ "outputs": [], "source": [ "plt.plot?" - ] + ], + "id": "2ff0eda9-bc9f-405c-ba25-f5929df11ef3" }, { "cell_type": "markdown", @@ -586,7 +471,8 @@ "We immediately note that some facilities have a lot of nurses, which\n", "prevent’s us seeing the detail of the main number of facilities. First\n", "lets identify the facilities with the most nurses." - ] + ], + "id": "745b51a5-de56-4a0f-895f-e0ebb34631b4" }, { "cell_type": "code", @@ -595,7 +481,8 @@ "outputs": [], "source": [ "data[data['num_nurses_fulltime']>100]" - ] + ], + "id": "de2047f4-c32a-40ce-9fbc-af8a60201ea8" }, { "cell_type": "markdown", @@ -610,7 +497,8 @@ "`True`. We can also sort the result. To sort the result by the values in\n", "the `num_nurses_fulltime` column in *descending* order we use the\n", "following command." 
- ] + ], + "id": "ff714701-cde0-44b6-a5fa-90a49ba974f8" }, { "cell_type": "code", @@ -619,7 +507,8 @@ "outputs": [], "source": [ "data[data['num_nurses_fulltime']>100].sort_values(by='num_nurses_fulltime', ascending=False)" - ] + ], + "id": "075d06d4-2c24-4f5f-8c91-3c44b0a95094" }, { "cell_type": "markdown", @@ -628,15 +517,16 @@ "We now see that the ‘University of Calabar Teaching Hospital’ is a large\n", "outlier with 513 nurses. We can try and determine how much of an outlier\n", "by histograming the data." - ] + ], + "id": "504c8a04-282a-4455-b0f3-f254daf34f96" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Plotting the Data\n", - "-----------------" - ] + "## Plotting the Data" + ], + "id": "5ecf51b2-b1b2-44a9-ad2b-21158445cde3" }, { "cell_type": "code", @@ -646,7 +536,8 @@ "source": [ "data['num_nurses_fulltime'].hist(bins=20) # histogram the data with 20 bins.\n", "plt.title('Histogram of Number of Nurses')" - ] + ], + "id": "cebea1df-6ee9-4514-9fdf-37e656ce5650" }, { "cell_type": "markdown", @@ -656,7 +547,8 @@ "facilities with zero or one nurse that we don’t see the histogram for\n", "hospitals with many nurses. We can try more bins and using a *log* scale\n", "on the $y$-axis." - ] + ], + "id": "caf6f225-949f-417e-ae79-7e74a17eee38" }, { "cell_type": "code", @@ -668,32 +560,17 @@ "plt.title('Histogram of Number of Nurses')\n", "ax = plt.gca()\n", "ax.set_yscale('log')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Exercise 1\n", - "\n", - "Read on the internet about the following python libraries: `numpy`,\n", - "`matplotlib`, `scipy` and `pandas`. What functionality does each provide\n", - "python?" - ] + ], + "id": "6d2e6f5e-b5b3-4973-8286-5fc2a20d3ca7" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "::: {.cell .markdown}\n", - "\n", - "### Exercise 1 Answer\n", - "\n", - "Write your answer to Exercise 1 here\n", - "\n", "Let’s try and see how the number of nurses relates to the number of\n", "doctors." - ] + ], + "id": "bab7ce3e-2b80-48cc-b85d-fd0491d1d429" }, { "cell_type": "code", @@ -709,7 +586,8 @@ "plt.title('Number of Nurses against Number of Doctors')\n", "plt.ylabel('number of nurses')\n", "plt.xlabel('number of doctors')" - ] + ], + "id": "3c974e61-adaf-4e12-9e47-384957a6c469" }, { "cell_type": "markdown", @@ -730,53 +608,23 @@ "console window. We can move up and down the notebook and run each part\n", "in a different order. The *state* of the program is always as we left it\n", "after running the previous part." - ] + ], + "id": "a6ae1605-035b-44fb-b439-bcf63d0275f3" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Probabilities\n", - "=============\n", - "\n", - "We are now going to do some simple review of probabilities and use this\n", - "review to explore some aspects of our data.\n", + "## Probability and the NMIS Data\n", "\n", - "A probability distribution expresses uncertainty about the outcome of an\n", - "event. We often encode this uncertainty in a variable. So if we are\n", - "considering the outcome of an event, $Y$, to be a coin toss, then we\n", - "might consider $Y=1$ to be heads and $Y=0$ to be tails. We represent the\n", - "probability of a given outcome with the notation: $$\n", - "P(Y=1) = 0.5\n", - "$$ The first rule of probability is that the probability must normalize.\n", - "The sum of the probability of all events must equal 1. 
So if the\n", - "probability of heads ($Y=1$) is 0.5, then the probability of tails (the\n", - "only other possible outcome) is given by $$\n", - "P(Y=0) = 1-P(Y=1) = 0.5\n", - "$$\n", - "\n", - "Probabilities are often defined as the limit of the ratio between the\n", - "number of positive outcomes (e.g. *heads*) given the number of trials.\n", - "If the number of positive outcomes for event $y$ is denoted by $n$ and\n", - "the number of trials is denoted by $N$ then this gives the ratio $$\n", - "P(Y=y) = \\lim_{N\\rightarrow\n", - "\\infty}\\frac{n_y}{N}.\n", - "$$ In practice we never get to observe an event infinite times, so\n", - "rather than considering this we often use the following estimate $$\n", - "P(Y=y) \\approx \\frac{n_y}{N}.\n", - "$$" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Probability and the NMIS Data\n", - "-----------------------------\n", + "\\[edit\\]\n", "\n", "Let’s use the sum rule to compute the estimate the probability that a\n", "facility has more than two nurses." - ] + ], + "id": "e4c72e3a-ea67-4372-a779-9ac538ac98c2" }, { "cell_type": "code", @@ -789,14 +637,14 @@ "\n", "prob_large = float(large)/float(total_facilities)\n", "print(\"Probability of number of nurses being greather than 2 is:\", prob_large)" - ] + ], + "id": "3c8f753c-e0a7-4e91-9e2a-e0e78b7da9c5" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Conditioning\n", - "============\n", + "# Conditioning\n", "\n", "When predicting whether a coin turns up head or tails, we might think\n", "that this event is *independent* of the year or time of day. If we\n", @@ -811,7 +659,8 @@ "the number of doctors. For this we can try estimating $P(Y>2 | X>1)$ and\n", "compare the result, for example to $P(Y>2|X\\leq 1)$ using our empirical\n", "estimate of the probability." - ] + ], + "id": "2d56ae4b-dc02-4ee3-ad4e-361777cacc0b" }, { "cell_type": "code", @@ -823,17 +672,19 @@ "total_large_doctors = (data.num_doctors_fulltime>1).sum()\n", "prob_both_large = large/total_large_doctors\n", "print(\"Probability of number of nurses being greater than 2 given number of doctors is greater than 1 is:\", prob_both_large)" - ] + ], + "id": "eaba9bfe-f9b7-4983-b77d-c111b0846dba" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Exercise 2\n", + "### Exercise 1\n", "\n", "Write code that prints out the probability of nurses being greater than\n", "2 for different numbers of doctors." 
- ] + ], + "id": "9026afa3-370d-4a4e-9f0c-cc2aabf60bdb" }, { "cell_type": "code", @@ -841,10 +692,13 @@ "metadata": {}, "outputs": [], "source": [ - "# Write your answer to Exercise 2 here\n", + "# Write your answer to Exercise 1 here\n", + "\n", + "\n", "\n", "\n" - ] + ], + "id": "0824097f-1494-49e0-92eb-f73a21f5d6da" }, { "cell_type": "markdown", @@ -865,33 +719,20 @@ "The different basic probability distributions.\n", "\n", "" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import teaching_plots as plot" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plot.prob_diagram(diagrams='./mlai')" - ] + ], + "id": "69d6f854-b47c-4f3a-8ad1-04d55d95de2f" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "A Pictorial Definition of Probability\n", - "-------------------------------------" - ] + "## A Pictorial Definition of Probability\n", + "\n", + "\\[edit\\]" + ], + "id": "dbed4771-9566-45b8-b8a1-927315ccf714" }, { "cell_type": "code", @@ -899,8 +740,9 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot" - ] + "from mlai import plot" + ], + "id": "c9308c64-60c3-4e4f-acdb-e9a392eeef09" }, { "cell_type": "code", @@ -908,43 +750,43 @@ "metadata": {}, "outputs": [], "source": [ - "plot.prob_diagram(diagrams='../slides/diagrams')" - ] + "plot.prob_diagram(diagrams='./mlai')" + ], + "id": "d318ed86-2bbb-4adb-9125-9ecda8dd0ba5" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Diagram representing the different probabilities, joint,\n", "marginal and conditional. This diagram was inspired by lectures given by\n", "Christopher Bishop.\n", "\n", - "Inspired by lectures from Christopher\n", - "Bishop" - ] + "Inspired by lectures from Christopher Bishop" + ], + "id": "2750d35e-29a3-4a08-88ec-7e7da2b8150c" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Definition of probability distributions\n", - "---------------------------------------\n", + "## Definition of probability distributions\n", "\n", "| Terminology | Definition | Probability Notation |\n", - "|:------------------------|:-------------------------------------------------------|:-----------------------------|\n", + "|:----------|:-------------------------------------|:---------------------|\n", "| Joint Probability | $\\lim_{N\\rightarrow\\infty}\\frac{n_{X=3,Y=4}}{N}$ | $P\\left(X=3,Y=4\\right)$ |\n", "| Marginal Probability | $\\lim_{N\\rightarrow\\infty}\\frac{n_{X=5}}{N}$ | $P\\left(X=5\\right)$ |\n", "| Conditional Probability | $\\lim_{N\\rightarrow\\infty}\\frac{n_{X=3,Y=4}}{n_{Y=4}}$ | $P\\left(X=3\\vert Y=4\\right)$ |" - ] + ], + "id": "2ea4f118-43e7-4658-a5de-953e45d17956" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Notational Details\n", - "------------------\n", + "## Notational Details\n", "\n", "Typically we should write out $P\\left(X=x,Y=y\\right)$, but in practice\n", "we often shorten this to $P\\left(x,y\\right)$. This looks very much like\n", @@ -969,7 +811,8 @@ "simultaneous questions, what’s the probability that the number of nurses\n", "was over 2 and the number of doctors was 1? Or any other question that\n", "may occur to us. Again we can easily use pandas to ask such questions." 
- ] + ], + "id": "0f515cae-6851-466c-915a-09d273d36126" }, { "cell_type": "code", @@ -982,14 +825,14 @@ "total_facilities = data.num_nurses_fulltime.count() # this is total number of films\n", "prob_large = float(large)/float(total_facilities)\n", "print(\"Probability of nurses being greater than 2 and number of doctors being\", num_doctors, \"is:\", prob_large)" - ] + ], + "id": "88b63db9-445f-48cc-879d-58a54c08bbf6" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The Product Rule\n", - "----------------\n", + "## The Product Rule\n", "\n", "This number is the joint probability, $P(Y, X)$ which is much *smaller*\n", "than the conditional probability. The number can never be bigger than\n", @@ -1010,7 +853,8 @@ "p(y, x) = p(y|x)p(x)\n", "$$ We can see the relation working in practice for our data above by\n", "computing the different values for $x=1$." - ] + ], + "id": "22a399d6-b7b1-46eb-828e-b347139a1a20" }, { "cell_type": "code", @@ -1020,21 +864,21 @@ "source": [ "num_doctors=1\n", "num_nurses=2\n", - "p_x = float((data.num_doctors_fulltime==num_doctors).sum())/float(data.num_nurses_fulltime.count())\n", + "p_x = float((data.num_doctors_fulltime==num_doctors).sum())/float(data.num_doctors_fulltime.count())\n", "p_y_given_x = float((data.num_nurses_fulltime[data.num_doctors_fulltime==num_doctors]>num_nurses).sum())/float((data.num_doctors_fulltime==num_doctors).sum())\n", "p_y_and_x = float((data.num_nurses_fulltime[data.num_doctors_fulltime==num_doctors]>num_nurses).sum())/float(data.num_nurses_fulltime.count())\n", "\n", "print(\"P(x) is\", p_x)\n", "print(\"P(y|x) is\", p_y_given_x)\n", "print(\"P(y,x) is\", p_y_and_x)" - ] + ], + "id": "fe063a3b-a9f0-4f5b-bf66-1c7f53157519" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The Sum Rule\n", - "------------\n", + "## The Sum Rule\n", "\n", "The other *fundamental rule* of probability is the *sum rule* this tells\n", "us how to get a *marginal* distribution from the joint distribution.\n", @@ -1044,17 +888,19 @@ "$$ Or in our shortened notation $$\n", "P(y) = \\sum_{x} P(y, x)\n", "$$" - ] + ], + "id": "e726a451-d109-48cb-93df-908457d6d064" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Exercise 3\n", + "### Exercise 2\n", "\n", "Write code that computes $P(y)$ by adding $P(y, x)$ for all values of\n", "$x$." - ] + ], + "id": "8e5d1449-7680-42b8-ab60-85e420595755" }, { "cell_type": "code", @@ -1062,17 +908,19 @@ "metadata": {}, "outputs": [], "source": [ - "# Write your answer to Exercise 3 here\n", + "# Write your answer to Exercise 2 here\n", + "\n", + "\n", "\n", "\n" - ] + ], + "id": "0a08847f-3284-4f8f-8354-202b6b06094a" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Bayes’ Rule\n", - "-----------\n", + "## Bayes’ Rule\n", "\n", "Bayes’ rule is a very simple rule, it’s hardly worth the name of a rule\n", "at all. It follows directly from the product rule of probability.\n", @@ -1087,39 +935,44 @@ "Each of these probability distributions represents the answer to a\n", "question we have about the world. Bayes rule (via the product rule)\n", "tells us how to *invert* the probability." 
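To make this concrete, here is a small sketch (added as an illustration, with made-up numbers rather than values taken from the NMIS data) showing how the product rule, the sum rule and Bayes' rule fit together to invert a conditional probability.

```python
# A small sketch of Bayes' rule with made-up numbers (not from the NMIS data).
# Suppose we know p(y=1|x) for each value of a binary variable x, and p(x).
p_x = {0: 0.7, 1: 0.3}            # marginal p(x), assumed values
p_y1_given_x = {0: 0.2, 1: 0.6}   # conditional p(y=1|x), assumed values

# Product rule: joint p(y=1, x) = p(y=1|x) p(x)
p_y1_and_x = {x: p_y1_given_x[x] * p_x[x] for x in p_x}

# Sum rule: marginal p(y=1) = sum over x of p(y=1, x)
p_y1 = sum(p_y1_and_x.values())

# Bayes' rule: invert to get p(x|y=1) = p(y=1|x) p(x) / p(y=1)
p_x_given_y1 = {x: p_y1_and_x[x] / p_y1 for x in p_x}

print("p(y=1) =", p_y1)
print("p(x|y=1) =", p_x_given_y1)
```

Whatever numbers we start from, the recovered conditional $p(x|y=1)$ normalizes over $x$, which is a useful sanity check when implementing these rules on real data.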
- ] + ], + "id": "07b6d44d-2351-4fc4-96d6-d264893d48f8" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Further Reading\n", - "---------------\n", + "## Further Reading\n", "\n", "- Probability distributions: page 12–17 (Section 1.2) of Bishop (2006)" - ] + ], + "id": "b92021d4-ac11-4c5a-9dee-44d42be89a08" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Exercises\n", - "---------\n", + "## Exercises\n", "\n", "- Exercise 1.3 of Bishop (2006)" - ] + ], + "id": "80aa9579-3db6-4931-9157-3bbcd4294c1e" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Probabilities for Extracting Information from Data\n", - "--------------------------------------------------\n", + "## Probabilities for Extracting Information from Data\n", + "\n", + "\\[edit\\]\n", "\n", "What use is all this probability in data science? Let’s think about how\n", "we might use the probabilities to do some decision making. Let’s look at\n", "the information data." - ] + ], + "id": "41cbc108-a169-4033-84bc-deebff83dd82" }, { "cell_type": "code", @@ -1128,13 +981,14 @@ "outputs": [], "source": [ "data.columns" - ] + ], + "id": "39fd24a9-7611-41aa-b277-c5f4c73cab99" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Exercise 1\n", + "### Exercise 3\n", "\n", "Now we see we have several additional features. Let’s assume we want to\n", "predict `maternal_health_delivery_services`. How would we go about doing\n", @@ -1146,25 +1000,28 @@ "Should you be using a joint or a conditional distribution? If it’s\n", "conditional, what should the distribution be over, and what should it be\n", "conditioned on?" - ] + ], + "id": "dd6e3dab-4553-422e-8648-dc73a3b029ac" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "::: {.cell .markdown}\n", + "### Exercise 3 Answer\n", "\n", - "### Exercise 1 Answer\n", - "\n", - "Write your answer to Exercise 1 here" - ] + "Write your answer to Exercise 3 here" + ], + "id": "e2b2df91-e6ab-4797-8c9c-da769307e9ff" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Probabilistic Modelling\n", - "-----------------------\n", + "## Probabilistic Modelling\n", + "\n", + "\\[edit\\]\n", "\n", "This Bayesian approach is designed to deal with uncertainty arising from\n", "fitting our prediction function to the data we have, a reduced data set.\n", @@ -1220,14 +1077,18 @@ "p(\\mathbf{ y}_*|\\mathbf{ y}) = \\int p(\\mathbf{ y}_*|\\mathbf{X}_*, \\boldsymbol{ \\theta}) p(\\boldsymbol{ \\theta}| \\mathbf{ y}, \\mathbf{X}) p(\\mathbf{X}) p(\\mathbf{X}_*) \\text{d} \\boldsymbol{ \\theta}\\text{d} \\mathbf{X}\\text{d}\\mathbf{X}_*\n", "$$ and we have *unsupervised learning* (from where we can get deep\n", "generative models)." - ] + ], + "id": "d66df8c0-1679-44b6-9af4-152527c046b1" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Graphical Models\n", - "----------------\n", + "## Graphical Models\n", + "\n", + "\\[edit\\]\n", "\n", "One way of representing a joint distribution is to consider conditional\n", "dependencies between data. Conditional dependencies allow us to\n", @@ -1236,7 +1097,8 @@ "conditional relationships between points that are neighboring, often in\n", "time or space. 
It can be decomposed in the following form.\n", "$$p(\\mathbf{ y}) = p(y_n| y_{n-1}) p(y_{n-1}|y_{n-2}) \\dots p(y_{2} | y_{1})$$" - ] + ], + "id": "b9f736a6-ce5f-4b08-93f4-027c0e76fff4" }, { "cell_type": "code", @@ -1249,7 +1111,8 @@ "\n", "rc(\"font\", **{'family':'sans-serif','sans-serif':['Helvetica']}, size=30)\n", "rc(\"text\", usetex=True)" - ] + ], + "id": "786061bf-2ca6-4778-96fb-7bbebbbed1b9" }, { "cell_type": "code", @@ -1272,13 +1135,14 @@ "pgm.add_edge(\"y_2\", \"y_3\")\n", "\n", "pgm.render().figure.savefig(\"./ml/markov.svg\", transparent=True)" - ] + ], + "id": "bed4a782-61f5-4e31-a89c-cd1d0db61cd5" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: A Markov chain is a simple form of probabilistic graphical\n", "model providing a particular decomposition of the joint density.\n", @@ -1298,7 +1162,7 @@ "of *C Difficile* infection following colon surgery (Steele et al.,\n", "2012).\n", "\n", - "\n", + "\n", "\n", "Figure: A probabilistic directed graph used to predict the\n", "perioperative risk of *C Difficile* infection following colon surgery.\n", @@ -1308,14 +1172,18 @@ "\n", "To capture the complexity in the interelationship between the data, the\n", "graph itself becomes more complex, and less interpretable." - ] + ], + "id": "2ec95e98-fdb6-403e-b16c-dc42c965dea9" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Introduction to Classification\n", - "------------------------------\n", + "## Introduction to Classification\n", + "\n", + "\\[edit\\]\n", "\n", "Classification is perhaps the technique most closely assocated with\n", "machine learning. In the speech based agents, on-device classifiers are\n", @@ -1365,14 +1233,18 @@ "relevant in the prediction, (2) defining the appropriate *class of\n", "function*, $f(\\cdot)$, to use and (3) selecting the right parameters,\n", "$\\mathbf{ w}$." - ] + ], + "id": "bba2ea18-c655-4ac0-b9de-3c6bad0fd322" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Classification Examples\n", - "-----------------------\n", + "## Classification Examples\n", + "\n", + "\\[edit\\]\n", "\n", "- Classifiying hand written digits from binary images (automatic zip\n", " code reading)\n", @@ -1381,14 +1253,18 @@ "- Classifying type of cancer given gene expression data.\n", "- Categorization of document types (different types of news article on\n", " the internet)" - ] + ], + "id": "bda292e2-0ffc-4184-a7fe-9af7759b03f8" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Bernoulli Distribution\n", - "----------------------\n", + "## Bernoulli Distribution\n", + "\n", + "\\[edit\\]\n", "\n", "Our focus has been on models where the objective function is inspired by\n", "a probabilistic analysis of the problem. In particular we’ve argued that\n", @@ -1404,7 +1280,7 @@ "$\\pi$ to be a variable) then we can specify the probability distribution\n", "through a table.\n", "\n", - "| $y$ | 0 | 1 |\n", + "| $y$ | 0 | 1 |\n", "|:------:|:---------:|:-----:|\n", "| $P(y)$ | $(1-\\pi)$ | $\\pi$ |\n", "\n", @@ -1438,7 +1314,8 @@ "where he considers Pascal’s triangle in forming combinations of the\n", "Bernoulli distribution to realise the binomial distribution for the\n", "outcome of positive trials." 
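We can check this connection between the Bernoulli and the binomial numerically. The sketch below is only an illustration; the probability `pi`, the number of trials and the random seed are arbitrary choices, not quantities from the lecture.

```python
# A minimal sketch (arbitrary numbers): summing independent Bernoulli
# trials gives a binomial distribution over the number of positive outcomes.
import numpy as np
from math import comb

pi = 0.3            # assumed probability of a positive outcome
n_trials = 10       # Bernoulli trials per experiment
n_experiments = 100000

rng = np.random.default_rng(42)
samples = rng.binomial(1, pi, size=(n_experiments, n_trials)).sum(axis=1)

# Compare the empirical frequency of k successes with the binomial pmf,
# which combines Bernoulli outcomes using the combination counts from
# Pascal's triangle.
for k in range(n_trials + 1):
    empirical = (samples == k).mean()
    binomial_pmf = comb(n_trials, k) * pi**k * (1 - pi)**(n_trials - k)
    print(f"k={k}: empirical {empirical:.4f}, binomial pmf {binomial_pmf:.4f}")
```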
- ] + ], + "id": "bafe09f0-bc3e-42dc-a20b-993d005cc10f" }, { "cell_type": "code", @@ -1446,9 +1323,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", - "pods.notebook.display_google_book(id='CF4UAAAAQAAJ', page='PA87')" - ] + "import notutils as nu\n", + "nu.display_google_book(id='CF4UAAAAQAAJ', page='PA87')" + ], + "id": "2ebf86b9-3995-4f57-9cca-5fe0c2ba92b7" }, { "cell_type": "code", @@ -1457,8 +1335,9 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "af1b17b2-6be7-48bd-93b7-04681fe7638f" }, { "cell_type": "code", @@ -1468,13 +1347,14 @@ "source": [ "fig, ax = plt.subplots(figsize=plot.one_figsize)\n", "plot.bernoulli_urn(ax, diagrams='./ml/')" - ] + ], + "id": "306a4bb4-7315-4067-9f1c-6f9d151608d1" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Jacob Bernoulli described the Bernoulli distribution through\n", "an urn in which there are black and red balls.\n", @@ -1496,7 +1376,8 @@ "\n", "For this reason in Bayes’s distribution there is considered to be\n", "*aleatoric* uncertainty about the distribution parameter." - ] + ], + "id": "9287a5c6-1979-4bd9-bba2-51f40c72e6b4" }, { "cell_type": "code", @@ -1505,8 +1386,9 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "a8a061dd-c360-4da0-bbca-2fbc8c33f617" }, { "cell_type": "code", @@ -1516,13 +1398,14 @@ "source": [ "fig, ax = plt.subplots(figsize=plot.one_figsize)\n", "plot.bayes_billiard(ax, diagrams='./ml/')" - ] + ], + "id": "674ab06c-b1d2-4cfc-810a-ecd431644803" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Thomas Bayes described the Bernoulli distribution\n", "independently of Jacob Bernoulli. He used the analogy of a billiard\n", @@ -1532,7 +1415,8 @@ "ball (in the figure) gives the outcome as either left or right (relative\n", "to the first ball). This is the origin of the term Bayesian because the\n", "parameter of the distribution is drawn from a probsbility." - ] + ], + "id": "64663c60-a90c-483f-8421-5d526371b031" }, { "cell_type": "code", @@ -1540,9 +1424,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "56c8516a-7317-4658-847f-17ab6f533390" }, { "cell_type": "code", @@ -1550,17 +1435,31 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('bayes-billiard{counter:0>3}.svg', \n", + "import notutils as nu" + ], + "id": "f32fa897-fddc-43b4-b205-a70a76a0055b" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nu.display_plots('bayes-billiard{counter:0>3}.svg', \n", " directory='./ml', \n", " counter=IntSlider(0,0,9,1))" - ] + ], + "id": "50d84159-84c0-4c23-a2d4-dde76da3f41d" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Maximum Likelihood in the Bernoulli\n", - "-----------------------------------\n", + "## Maximum Likelihood in the Bernoulli\n", + "\n", + "\\[edit\\]\n", "\n", "Maximum likelihood in the Bernoulli distribution is straightforward.\n", "Let’s assume we have data, $\\mathbf{ y}$ which consists of a vector of\n", @@ -1595,7 +1494,8 @@ "estimate the probability of a coin being heads, and you tossed the coin\n", "100 times, and recovered 47 heads, then the estimate of the probability\n", "of heads should be $\\frac{47}{100}$." 
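A quick numerical check of this result is given below. It is only a sketch with simulated coin tosses (the `true_pi` value, sample size and seed are arbitrary), but it confirms that the fraction of positive outcomes also maximizes the Bernoulli log likelihood.

```python
# A small check with simulated data (assumed values, not from the notebook):
# the maximum likelihood estimate of pi is the fraction of positive outcomes.
import numpy as np

rng = np.random.default_rng(0)
true_pi = 0.47
y = rng.binomial(1, true_pi, size=100)    # 100 simulated coin tosses

pi_ml = y.mean()                          # fraction of heads
print("Maximum likelihood estimate of pi:", pi_ml)

# The same value maximizes the Bernoulli log likelihood,
# log L(pi) = sum_i [ y_i log(pi) + (1 - y_i) log(1 - pi) ].
pi_grid = np.linspace(0.01, 0.99, 99)
log_lik = (y.sum() * np.log(pi_grid)
           + (len(y) - y.sum()) * np.log(1 - pi_grid))
print("Grid maximiser of the log likelihood:", pi_grid[np.argmax(log_lik)])
```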
- ] + ], + "id": "43f4a9d5-94b3-442a-8ba4-5c434cd659ef" }, { "cell_type": "markdown", @@ -1605,7 +1505,8 @@ "\n", "Show that the maximum likelihood solution we have found is a *minimum*\n", "for our objective." - ] + ], + "id": "681a5f5e-ce48-49f0-9de7-e0fdd81faaaf" }, { "cell_type": "markdown", @@ -1614,7 +1515,8 @@ "### Exercise 4 Answer\n", "\n", "Write your answer to Exercise 4 here" - ] + ], + "id": "efa2418d-88bf-411a-a538-96b4c726aab4" }, { "cell_type": "code", @@ -1624,7 +1526,8 @@ "source": [ "# Use this box for any code you need\n", "\n" - ] + ], + "id": "028ec197-091d-4376-8d58-f6b0bcfc5db0" }, { "cell_type": "markdown", @@ -1641,14 +1544,18 @@ "2. Likelihood\n", "3. Posterior distribution\n", "4. Marginal likelihood" - ] + ], + "id": "427ed3f3-6825-4f87-9a5a-db6b4538cdfa" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Naive Bayes Classifiers\n", - "-----------------------\n", + "## Naive Bayes Classifiers\n", + "\n", + "\\[edit\\]\n", "\n", "*Note*: Everything we do below is possible using standard packages like\n", "`scikit-learn`, our purpose in this session is to help you understand\n", @@ -1686,14 +1593,14 @@ "\n", "In naive Bayes we make certain simplifying assumptions that allow us to\n", "perform all of the above in practice." - ] + ], + "id": "5cac2b6b-a77e-4e0c-89b0-750ce01dbeaa" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Data Conditional Independence\n", - "-----------------------------\n", + "## Data Conditional Independence\n", "\n", "If we are given model parameters $\\boldsymbol{ \\theta}$ we assume that\n", "conditioned on all these parameters that all data points in the model\n", @@ -1713,14 +1620,14 @@ "\n", "Computing posterior distribution in this case becomes easier, this is\n", "known as the ‘Bayes classifier’." - ] + ], + "id": "66751653-054a-4863-a188-189ec908e36c" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Feature Conditional Independence\n", - "--------------------------------\n", + "## Feature Conditional Independence\n", "\n", "$$\n", "p(\\mathbf{ x}_i | y_i, \\boldsymbol{ \\theta}) = \\prod_{j=1}^{p} p(x_{i,j}|y_i, \\boldsymbol{ \\theta})\n", @@ -1732,14 +1639,14 @@ "parameters *and* the label. So for each data point we have\n", "$$p(\\mathbf{ x}_i | y_i, \\boldsymbol{ \\theta}) = \\prod_{j=1}^{p} p(x_{i,j}|y_i,\\boldsymbol{ \\theta})$$\n", "where $p$ is the dimensionality of our inputs." - ] + ], + "id": "c6bdbb40-f0cf-4bf6-9e9d-67a76a59769a" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Marginal Density for $y_i$\n", - "--------------------------\n", + "## Marginal Density for $y_i$\n", "\n", "$$\n", "p(x_{i,j},y_i| \\boldsymbol{ \\theta}) = p(x_{i,j}|y_i, \\boldsymbol{ \\theta})p(y_i).\n", @@ -1759,14 +1666,14 @@ "for our prior over $y_i$, $$p(y_i|\\pi) = \\pi^{y_i} (1-\\pi)^{1-y_i}$$\n", "where $\\pi$ now has the interpretation as being the *prior* probability\n", "that the classification should be positive." 
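Before fitting this model to data, the toy sketch below (with invented parameter values, not quantities estimated from the NMIS data) shows how the feature conditional independence assumption and the Bernoulli prior combine for a single data point with binary features.

```python
# A toy sketch of the naive Bayes factorisation with made-up parameters.
import numpy as np

pi = 0.6                                # assumed prior P(y=1)
theta_1 = np.array([0.8, 0.3, 0.5])     # assumed P(x_j=1 | y=1) per feature
theta_0 = np.array([0.2, 0.4, 0.1])     # assumed P(x_j=1 | y=0) per feature
x = np.array([1, 0, 1])                 # one observed feature vector

# Feature conditional independence: p(x|y) is a product over features.
p_x_given_y1 = np.prod(theta_1**x * (1 - theta_1)**(1 - x))
p_x_given_y0 = np.prod(theta_0**x * (1 - theta_0)**(1 - x))

# Combine with the prior over y to give the joint p(x, y) for this point.
print("p(x, y=1) =", p_x_given_y1 * pi)
print("p(x, y=0) =", p_x_given_y0 * (1 - pi))
```

The next section assembles the same ingredients into the joint density of the full training set.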
- ] + ], + "id": "86d41a8f-c6b2-4723-99a7-f4305ec3c230" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Joint Density for Naive Bayes\n", - "-----------------------------\n", + "## Joint Density for Naive Bayes\n", "\n", "This allows us to write down the full joint density of the training\n", "data, $$\n", @@ -1815,39 +1722,27 @@ "$\\boldsymbol{ \\theta}$ alone so we have, $$\n", "E(\\pi, \\boldsymbol{ \\theta}) = E(\\boldsymbol{ \\theta}) + E(\\pi).\n", "$$" - ] + ], + "id": "9701196b-8ae7-401b-a40a-2efaba270034" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Nigerian NMIS Data\n", - "------------------\n", + "## Nigeria NMIS Data Classification\n", "\n", - "First we will load in the Nigerian NMIS health data. Our aim will be to\n", - "predict whether a center has maternal health delivery services given the\n", - "attributes in the data. We will predict of the number of nurses, the\n", - "number of doctors, location etc.\n", + "\\[edit\\]\n", + "\n", + "Our aim will be to predict whether a center has maternal health delivery\n", + "services given the attributes in the data. We will predict of the number\n", + "of nurses, the number of doctors, location etc.\n", "\n", - "Let’s first remind ourselves of the data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ "Now we will convert this data into a form which we can use as inputs\n", "`X`, and labels `y`." - ] + ], + "id": "4cc866d0-d400-4c13-a87a-758e4a0e507d" }, { "cell_type": "code", @@ -1857,7 +1752,8 @@ "source": [ "import pandas as pd\n", "import numpy as np" - ] + ], + "id": "54a93596-6354-4130-935a-3723d7c8d628" }, { "cell_type": "code", @@ -1897,7 +1793,8 @@ " type_names.append(type_col)\n", " X.loc[:, type_col] = 0.0 \n", " X.loc[index, type_col] = 1.0" - ] + ], + "id": "c6206f8d-d977-43b5-b7a8-ec7681d31923" }, { "cell_type": "markdown", @@ -1905,7 +1802,8 @@ "source": [ "This has given us a new data frame `X` which contains the different\n", "facility types in different columns." - ] + ], + "id": "6872db9b-3dca-4ad0-aca7-8b6bddd5ad4f" }, { "cell_type": "code", @@ -1914,20 +1812,25 @@ "outputs": [], "source": [ "X.describe()" - ] + ], + "id": "6bdc4fd3-b21b-4bb0-bb6f-97b1feb0aebf" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Naive Bayes NMIS\n", - "----------------\n", + "## Naive Bayes NMIS\n", + "\n", + "\\[edit\\]\n", "\n", "We can now specify the naive Bayes model. For the genres we want to\n", "model the data as Bernoulli distributed, and for the year and body count\n", "we want to model the data as Gaussian distributed. We set up two data\n", "frames to contain the parameters for the rows and the columns below." - ] + ], + "id": "c04cbf9e-9d06-4711-ab97-2fb95c348b73" }, { "cell_type": "code", @@ -1952,7 +1855,8 @@ " 'longitude']\n", "Bernoulli = pd.DataFrame(data=np.zeros((2,len(binary_columns))), columns=binary_columns, index=['theta_0', 'theta_1'])\n", "Gaussian = pd.DataFrame(data=np.zeros((4,len(real_columns))), columns=real_columns, index=['mu_0', 'sigma2_0', 'mu_1', 'sigma2_1'])" - ] + ], + "id": "f4fdda5f-1a49-488e-83d0-463678626db2" }, { "cell_type": "markdown", @@ -1960,7 +1864,8 @@ "source": [ "Now we have the data in a form ready for analysis, let’s construct our\n", "data matrix." 
- ] + ], + "id": "d001db1b-ea24-4f99-a757-832ccd0cb490" }, { "cell_type": "code", @@ -1976,7 +1881,8 @@ "y_train = y.iloc[train_indices]==True\n", "X_test = X.iloc[test_indices]\n", "y_test = y.iloc[test_indices]==True" - ] + ], + "id": "f7554a83-b758-41fb-808a-5815ceb89ece" }, { "cell_type": "markdown", @@ -1988,7 +1894,8 @@ "solution for the Bernoulli. Or by computing the empirical mean and\n", "variance of the data for the Gaussian, which also gives us the maximum\n", "likelihood solution." - ] + ], + "id": "4250fac6-bf65-4de8-8f2c-8d2b4e08667f" }, { "cell_type": "code", @@ -2005,7 +1912,8 @@ " if column in Bernoulli:\n", " Bernoulli[column]['theta_0'] = X_train[column][~y_train].sum()/(~y_train).sum()\n", " Bernoulli[column]['theta_1'] = X_train[column][y_train].sum()/(y_train).sum()" - ] + ], + "id": "8760b1aa-a00d-486f-b91c-9b9993a90376" }, { "cell_type": "markdown", @@ -2013,7 +1921,8 @@ "source": [ "We can examine the nature of the distributions we’ve fitted to the model\n", "by looking at the entries in these data frames." - ] + ], + "id": "2f615c45-4ba0-4319-8aba-f233dede119a" }, { "cell_type": "code", @@ -2022,7 +1931,8 @@ "outputs": [], "source": [ "Bernoulli" - ] + ], + "id": "7494c4ce-c912-42b4-bf16-4c7780491484" }, { "cell_type": "markdown", @@ -2039,7 +1949,8 @@ "The naive Bayes assumption says that the joint probability for these\n", "services is given by the product of each of these Bernoulli\n", "distributions." - ] + ], + "id": "fae89b19-2626-48d8-b4a3-0b4f3ae7527f" }, { "cell_type": "code", @@ -2048,7 +1959,8 @@ "outputs": [], "source": [ "Gaussian" - ] + ], + "id": "0deeeab2-e5be-428d-a925-816aa8c07839" }, { "cell_type": "markdown", @@ -2068,7 +1980,8 @@ "\n", "The final model parameter is the prior probability of the positive\n", "class, $\\pi$, which is computed by maximum likelihood." - ] + ], + "id": "471e4729-c44a-48d9-bae4-6211119087ed" }, { "cell_type": "code", @@ -2077,7 +1990,8 @@ "outputs": [], "source": [ "prior = float(y_train.sum())/len(y_train)" - ] + ], + "id": "103c933f-730a-424d-a4b9-25fa32fba500" }, { "cell_type": "markdown", @@ -2085,14 +1999,14 @@ "source": [ "The prior probability tells us that slightly more facilities have\n", "maternity services than those that don’t." - ] + ], + "id": "a64dbed9-a2c1-4016-a581-2149ea3a5250" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Making Predictions\n", - "------------------\n", + "## Making Predictions\n", "\n", "Naive Bayes has given us the class conditional densities:\n", "$p(\\mathbf{ x}_i | y_i, \\boldsymbol{ \\theta})$. To make predictions with\n", @@ -2125,7 +2039,8 @@ "\\boldsymbol{ \\theta})p(y^*|\\pi)}{\\sum_{y^*=0}^1 \\prod_{j=1}^{p} p(x^*_{j}|y^*_i, \\boldsymbol{ \\theta})p(y^*|\\pi)}\n", "$$ This formula is also fairly straightforward to implement. First we\n", "implement the log probabilities for the Gaussian density." - ] + ], + "id": "d2799300-234e-40ff-abac-8087ce64e438" }, { "cell_type": "code", @@ -2135,7 +2050,8 @@ "source": [ "def log_gaussian(x, mu, sigma2):\n", " return -0.5* np.log(2*np.pi*sigma2)-((x-mu)**2)/(2*sigma2)" - ] + ], + "id": "4be2e55c-2c48-4f43-9dcb-df3fb566b9fd" }, { "cell_type": "markdown", @@ -2149,7 +2065,8 @@ "and smaller, and may be difficult to represent accurately (or even\n", "underflow). Working in log space can ameliorate this problem. We can\n", "also compute the log probability for the Bernoulli distribution." 
- ] + ], + "id": "a8c20a0f-48f7-4884-ad99-9e2e02d5e701" }, { "cell_type": "code", @@ -2159,14 +2076,14 @@ "source": [ "def log_bernoulli(x, theta):\n", " return x*np.log(theta) + (1-x)*np.log(1-theta)" - ] + ], + "id": "eb84ad72-ccb3-4327-a7f0-82db7e3b7ef3" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Laplace Smoothing\n", - "-----------------\n", + "## Laplace Smoothing\n", "\n", "Before we proceed, let’s just pause and think for a moment what will\n", "happen if `theta` here is either zero or one. This will result in\n", @@ -2181,7 +2098,8 @@ "wish to predict the sun rise the following day to describe his idea of\n", "smoothing, which can be found at the bottom of following page from\n", "Laplace’s ‘Essai Philosophique …’" - ] + ], + "id": "3718bd49-2da9-4825-b9e3-97d58b5d2ab5" }, { "cell_type": "code", @@ -2189,9 +2107,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", - "pods.notebook.display_google_book(id='1YQPAAAAQAAJ', page='PA16')" - ] + "import notutils as nu\n", + "nu.display_google_book(id='1YQPAAAAQAAJ', page='PA16')" + ], + "id": "ce02c4df-d41e-45c9-badc-a3e723dfa02c" }, { "cell_type": "markdown", @@ -2236,7 +2155,8 @@ "\\pi = \\frac{\\sum_{i=1}^{n} y_i + 1}{n+ 2}\n", "$$ to prevent problems with certainty causing numerical issues and\n", "misclassifications. Let’s refit the Bernoulli features now." - ] + ], + "id": "92e50a8a-e3a2-404e-97bc-386c28c4026c" }, { "cell_type": "code", @@ -2249,14 +2169,16 @@ " if column in Bernoulli:\n", " Bernoulli[column]['theta_0'] = (X_train[column][~y_train].sum() + 1)/((~y_train).sum() + 2)\n", " Bernoulli[column]['theta_1'] = (X_train[column][y_train].sum() + 1)/((y_train).sum() + 2)" - ] + ], + "id": "557af691-9c3c-4b68-b25c-6254e590bca7" }, { "cell_type": "markdown", "metadata": {}, "source": [ "That places us in a position to write the prediction function." - ] + ], + "id": "928a92b1-6747-403e-acd0-248b89d14abf" }, { "cell_type": "code", @@ -2266,7 +2188,8 @@ "source": [ "import numpy as np\n", "import pandas as pd" - ] + ], + "id": "dc4a68cd-2c27-4f60-9bb0-93ce42941841" }, { "cell_type": "code", @@ -2291,14 +2214,16 @@ " + np.exp(log_negative.values[i] + np.log(1-prior)))\n", " return v\n", " #return np.exp(log_positive + np.log(prior))/(np.exp(log_positive + np.log(prior)) + np.exp(log_negative + np.log(1-prior)))" - ] + ], + "id": "e6fb8db7-615c-446f-a50c-cb41cb855186" }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we are in a position to make the predictions for the test data." - ] + ], + "id": "95ea08cb-d622-448a-94c0-9a5d4b99c1e5" }, { "cell_type": "code", @@ -2307,7 +2232,8 @@ "outputs": [], "source": [ "p_y = predict(X_test, Gaussian, Bernoulli, prior)" - ] + ], + "id": "f1724a5d-5128-4dc0-a253-fedda26731bf" }, { "cell_type": "markdown", @@ -2318,7 +2244,8 @@ "with greater than 50% probability of membership of the positive class to\n", "the positive class. We can then compare to the true values, and see how\n", "many of these values we got correct. This is our total number correct." - ] + ], + "id": "a7536f34-f130-46c8-a904-37f0f9118121" }, { "cell_type": "code", @@ -2329,7 +2256,8 @@ "correct = y_test.eq(p_y>0.5)\n", "total_correct = sum(correct)\n", "print(\"Total correct\", total_correct, \" out of \", len(y_test), \"which is\", float(total_correct)/len(y_test), \"%\")" - ] + ], + "id": "8de1b8aa-d613-474f-a7c0-63c13a3ad5ec" }, { "cell_type": "markdown", @@ -2344,7 +2272,8 @@ "contain the false positives and the false negatives. 
Along the rows of\n", "the matrix we place the actual class, and along the columns we place our\n", "predicted class." - ] + ], + "id": "d641a709-1cab-475a-b63c-b0c4b723972e" }, { "cell_type": "code", @@ -2360,7 +2289,8 @@ "confusion_matrix['predicted no maternity']['actual maternity'] = (y_test & ~(p_y>0.5)).sum()\n", "confusion_matrix['predicted no maternity']['actual no maternity'] = (~y_test & ~(p_y>0.5)).sum()\n", "confusion_matrix" - ] + ], + "id": "25e9f749-905e-4c7a-bd5e-76ae2f8b4eb7" }, { "cell_type": "markdown", @@ -2372,7 +2302,8 @@ "valid? Are some features more helpful than others? What happens if you\n", "remove features that appear to be less helpful. How might you select\n", "such features?" - ] + ], + "id": "8159efb1-3fcf-454b-99fe-9f9d265a4558" }, { "cell_type": "markdown", @@ -2381,7 +2312,8 @@ "### Exercise 5 Answer\n", "\n", "Write your answer to Exercise 5 here" - ] + ], + "id": "a167afda-3863-4568-8f46-2e2578241a0c" }, { "cell_type": "code", @@ -2391,7 +2323,8 @@ "source": [ "# Use this box for any code you need\n", "\n" - ] + ], + "id": "94819798-190c-48c9-a931-a5f04d733ecc" }, { "cell_type": "markdown", @@ -2406,7 +2339,8 @@ "test set how low do you have to set the threshold to avoid all the false\n", "negatives (i.e. facilities where you predicted there was no maternity,\n", "but in actuality there were?" - ] + ], + "id": "5c804c94-7d87-4e80-ba87-7bb5d57db940" }, { "cell_type": "markdown", @@ -2415,7 +2349,8 @@ "### Exercise 6 Answer\n", "\n", "Write your answer to Exercise 6 here" - ] + ], + "id": "bf62eb17-bfb7-4b44-814d-22fb58c6b64a" }, { "cell_type": "code", @@ -2425,21 +2360,22 @@ "source": [ "# Use this box for any code you need\n", "\n" - ] + ], + "id": "63f9748d-e24b-4348-a6c2-b95c1e0d3791" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Making Predictions\n", - "------------------\n", + "## Making Predictions\n", "\n", "Naive Bayes has given us the class conditional densities:\n", "$p(\\mathbf{ x}_i | y_i, \\boldsymbol{ \\theta})$. To make predictions with\n", "these densities we need to form the distribution given by $$\n", "P(y^*| \\mathbf{ y}, \\mathbf{X}, \\mathbf{ x}^*, \\boldsymbol{ \\theta})\n", "$$" - ] + ], + "id": "7f1ff4a3-01d6-4bca-981a-f866bd15d7d6" }, { "cell_type": "markdown", @@ -2455,7 +2391,8 @@ "$$ $$\n", "\\sigma^2 = \\frac{\\sum_{i=1}^{n} (x_i - \\mu)^2}{n}\n", "$$" - ] + ], + "id": "27548be0-bd80-4584-ab07-fcf20bc69b07" }, { "cell_type": "markdown", @@ -2464,7 +2401,8 @@ "### Exercise 7 Answer\n", "\n", "Write your answer to Exercise 7 here" - ] + ], + "id": "8cd6d641-70fc-4d8c-8811-6ac3213654a3" }, { "cell_type": "code", @@ -2474,7 +2412,8 @@ "source": [ "# Use this box for any code you need\n", "\n" - ] + ], + "id": "b469980e-0e9e-42f2-bcb4-d154b7a74907" }, { "cell_type": "markdown", @@ -2508,14 +2447,14 @@ "data. To fit the model we consider each feature in turn, we select the\n", "positive class and fit parameters for that class, then we select each\n", "negative class and fit features for that class. We have code below." - ] + ], + "id": "59370eb2-5fb6-4df4-a3a8-a81e247b385a" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Naive Bayes Summary\n", - "-------------------\n", + "## Naive Bayes Summary\n", "\n", "Naive Bayes is making very simple assumptions about the data, in\n", "particular it is modeling the full *joint* probability of the data set,\n", @@ -2536,33 +2475,33 @@ "of the modeling the joint probability density. 
However, the\n", "factorization assumption that allows us to do this efficiently is very\n", "strong and may lead to poor decision boundaries in practice." - ] + ], + "id": "8c77ada3-0dce-4422-9c4b-db046432761c" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Other Reading\n", - "-------------\n", + "## Other Reading\n", "\n", "- Chapter 5 of Rogers and Girolami (2011) up to pg 179 (Section 5.1,\n", " and 5.2 up to 5.2.2)." - ] + ], + "id": "51ea3522-37cf-41f3-9a6e-b074d5fa8ea5" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "References\n", - "----------" - ] + "## References" + ], + "id": "626ecc68-e486-432e-9ea5-5fef4235efc5" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Thanks!\n", - "-------\n", + "## Thanks!\n", "\n", "For more information on these subjects and more you might want to check\n", "the following resources.\n", @@ -2573,7 +2512,8 @@ " Page](http://www.theguardian.com/profile/neil-lawrence)\n", "- blog:\n", " [http://inverseprobability.com](http://inverseprobability.com/blog.html)" - ] + ], + "id": "fd2742da-f26b-4059-a183-97ddc76fac94" }, { "cell_type": "markdown", @@ -2592,8 +2532,13 @@ "E., Avital, I., Stojadinovic, A., 2012. Using machine-learned Bayesian\n", "belief networks to predict perioperative risk of clostridium difficile\n", "infection following colon surgery. Interact J Med Res 1, e6.\n", - "" - ] + "\n", + "\n", + "The Office of the Senior Special Assistant to the President on the\n", + "Millennium Development Goals (OSSAP-MDGs), Columbia University, 2014.\n", + "Nigeria NMIS facility database." + ], + "id": "8b58cc63-ef57-42ad-b92e-962e2ca0bf03" } ], "nbformat": 4, diff --git a/_notebooks/04-gaussian-processes.ipynb b/_notebooks/04-gaussian-processes.ipynb index 15aceae..2eca555 100644 --- a/_notebooks/04-gaussian-processes.ipynb +++ b/_notebooks/04-gaussian-processes.ipynb @@ -4,13 +4,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Gaussian Processes\n", - "==================\n", - "\n", - "### [Neil D. Lawrence](http://inverseprobability.com)\n", + "# Gaussian Processes\n", "\n", "### 2020-11-13" - ] + ], + "id": "c02c1cfa-fa00-4b15-a731-827989b57aae" }, { "cell_type": "markdown", @@ -25,311 +23,24 @@ "fitting Gaussian processes tend to be more complex than parametric\n", "models. In this sessions I will introduce Gaussian processes and explain\n", "why sustaining uncertainty is important." 
- ] + ], + "id": "e2d7b20e-37c0-4364-bf26-c0d9c6fdbdf9" }, { "cell_type": "markdown", "metadata": {}, "source": [ "$$\n", - "\\newcommand{\\tk}[1]{}\n", - "\\newcommand{\\Amatrix}{\\mathbf{A}}\n", - "\\newcommand{\\KL}[2]{\\text{KL}\\left( #1\\,\\|\\,#2 \\right)}\n", - "\\newcommand{\\Kaast}{\\kernelMatrix_{\\mathbf{ \\ast}\\mathbf{ \\ast}}}\n", - "\\newcommand{\\Kastu}{\\kernelMatrix_{\\mathbf{ \\ast} \\inducingVector}}\n", - "\\newcommand{\\Kff}{\\kernelMatrix_{\\mappingFunctionVector \\mappingFunctionVector}}\n", - "\\newcommand{\\Kfu}{\\kernelMatrix_{\\mappingFunctionVector \\inducingVector}}\n", - "\\newcommand{\\Kuast}{\\kernelMatrix_{\\inducingVector \\bf\\ast}}\n", - "\\newcommand{\\Kuf}{\\kernelMatrix_{\\inducingVector \\mappingFunctionVector}}\n", - "\\newcommand{\\Kuu}{\\kernelMatrix_{\\inducingVector \\inducingVector}}\n", - "\\newcommand{\\Kuui}{\\Kuu^{-1}}\n", - "\\newcommand{\\Qaast}{\\mathbf{Q}_{\\bf \\ast \\ast}}\n", - "\\newcommand{\\Qastf}{\\mathbf{Q}_{\\ast \\mappingFunction}}\n", - "\\newcommand{\\Qfast}{\\mathbf{Q}_{\\mappingFunctionVector \\bf \\ast}}\n", - "\\newcommand{\\Qff}{\\mathbf{Q}_{\\mappingFunctionVector \\mappingFunctionVector}}\n", - "\\newcommand{\\aMatrix}{\\mathbf{A}}\n", - "\\newcommand{\\aScalar}{a}\n", - "\\newcommand{\\aVector}{\\mathbf{a}}\n", - "\\newcommand{\\acceleration}{a}\n", - "\\newcommand{\\bMatrix}{\\mathbf{B}}\n", - "\\newcommand{\\bScalar}{b}\n", - "\\newcommand{\\bVector}{\\mathbf{b}}\n", - "\\newcommand{\\basisFunc}{\\phi}\n", - "\\newcommand{\\basisFuncVector}{\\boldsymbol{ \\basisFunc}}\n", - "\\newcommand{\\basisFunction}{\\phi}\n", - "\\newcommand{\\basisLocation}{\\mu}\n", - "\\newcommand{\\basisMatrix}{\\boldsymbol{ \\Phi}}\n", - "\\newcommand{\\basisScalar}{\\basisFunction}\n", - "\\newcommand{\\basisVector}{\\boldsymbol{ \\basisFunction}}\n", - "\\newcommand{\\activationFunction}{\\phi}\n", - "\\newcommand{\\activationMatrix}{\\boldsymbol{ \\Phi}}\n", - "\\newcommand{\\activationScalar}{\\basisFunction}\n", - "\\newcommand{\\activationVector}{\\boldsymbol{ \\basisFunction}}\n", - "\\newcommand{\\bigO}{\\mathcal{O}}\n", - "\\newcommand{\\binomProb}{\\pi}\n", - "\\newcommand{\\cMatrix}{\\mathbf{C}}\n", - "\\newcommand{\\cbasisMatrix}{\\hat{\\boldsymbol{ \\Phi}}}\n", - "\\newcommand{\\cdataMatrix}{\\hat{\\dataMatrix}}\n", - "\\newcommand{\\cdataScalar}{\\hat{\\dataScalar}}\n", - "\\newcommand{\\cdataVector}{\\hat{\\dataVector}}\n", - "\\newcommand{\\centeredKernelMatrix}{\\mathbf{ \\MakeUppercase{\\centeredKernelScalar}}}\n", - "\\newcommand{\\centeredKernelScalar}{b}\n", - "\\newcommand{\\centeredKernelVector}{\\centeredKernelScalar}\n", - "\\newcommand{\\centeringMatrix}{\\mathbf{H}}\n", - "\\newcommand{\\chiSquaredDist}[2]{\\chi_{#1}^{2}\\left(#2\\right)}\n", - "\\newcommand{\\chiSquaredSamp}[1]{\\chi_{#1}^{2}}\n", - "\\newcommand{\\conditionalCovariance}{\\boldsymbol{ \\Sigma}}\n", - "\\newcommand{\\coregionalizationMatrix}{\\mathbf{B}}\n", - "\\newcommand{\\coregionalizationScalar}{b}\n", - "\\newcommand{\\coregionalizationVector}{\\mathbf{ \\coregionalizationScalar}}\n", - "\\newcommand{\\covDist}[2]{\\text{cov}_{#2}\\left(#1\\right)}\n", - "\\newcommand{\\covSamp}[1]{\\text{cov}\\left(#1\\right)}\n", - "\\newcommand{\\covarianceScalar}{c}\n", - "\\newcommand{\\covarianceVector}{\\mathbf{ \\covarianceScalar}}\n", - "\\newcommand{\\covarianceMatrix}{\\mathbf{C}}\n", - "\\newcommand{\\covarianceMatrixTwo}{\\boldsymbol{ \\Sigma}}\n", - "\\newcommand{\\croupierScalar}{s}\n", - "\\newcommand{\\croupierVector}{\\mathbf{ 
\\croupierScalar}}\n", - "\\newcommand{\\croupierMatrix}{\\mathbf{ \\MakeUppercase{\\croupierScalar}}}\n", - "\\newcommand{\\dataDim}{p}\n", - "\\newcommand{\\dataIndex}{i}\n", - "\\newcommand{\\dataIndexTwo}{j}\n", - "\\newcommand{\\dataMatrix}{\\mathbf{Y}}\n", - "\\newcommand{\\dataScalar}{y}\n", - "\\newcommand{\\dataSet}{\\mathcal{D}}\n", - "\\newcommand{\\dataStd}{\\sigma}\n", - "\\newcommand{\\dataVector}{\\mathbf{ \\dataScalar}}\n", - "\\newcommand{\\decayRate}{d}\n", - "\\newcommand{\\degreeMatrix}{\\mathbf{ \\MakeUppercase{\\degreeScalar}}}\n", - "\\newcommand{\\degreeScalar}{d}\n", - "\\newcommand{\\degreeVector}{\\mathbf{ \\degreeScalar}}\n", - "\\newcommand{\\diag}[1]{\\text{diag}\\left(#1\\right)}\n", - "\\newcommand{\\diagonalMatrix}{\\mathbf{D}}\n", - "\\newcommand{\\diff}[2]{\\frac{\\text{d}#1}{\\text{d}#2}}\n", - "\\newcommand{\\diffTwo}[2]{\\frac{\\text{d}^2#1}{\\text{d}#2^2}}\n", - "\\newcommand{\\displacement}{x}\n", - "\\newcommand{\\displacementVector}{\\textbf{\\displacement}}\n", - "\\newcommand{\\distanceMatrix}{\\mathbf{ \\MakeUppercase{\\distanceScalar}}}\n", - "\\newcommand{\\distanceScalar}{d}\n", - "\\newcommand{\\distanceVector}{\\mathbf{ \\distanceScalar}}\n", - "\\newcommand{\\eigenvaltwo}{\\ell}\n", - "\\newcommand{\\eigenvaltwoMatrix}{\\mathbf{L}}\n", - "\\newcommand{\\eigenvaltwoVector}{\\mathbf{l}}\n", - "\\newcommand{\\eigenvalue}{\\lambda}\n", - "\\newcommand{\\eigenvalueMatrix}{\\boldsymbol{ \\Lambda}}\n", - "\\newcommand{\\eigenvalueVector}{\\boldsymbol{ \\lambda}}\n", - "\\newcommand{\\eigenvector}{\\mathbf{ \\eigenvectorScalar}}\n", - "\\newcommand{\\eigenvectorMatrix}{\\mathbf{U}}\n", - "\\newcommand{\\eigenvectorScalar}{u}\n", - "\\newcommand{\\eigenvectwo}{\\mathbf{v}}\n", - "\\newcommand{\\eigenvectwoMatrix}{\\mathbf{V}}\n", - "\\newcommand{\\eigenvectwoScalar}{v}\n", - "\\newcommand{\\entropy}[1]{\\mathcal{H}\\left(#1\\right)}\n", - "\\newcommand{\\errorFunction}{E}\n", - "\\newcommand{\\expDist}[2]{\\left<#1\\right>_{#2}}\n", - "\\newcommand{\\expSamp}[1]{\\left<#1\\right>}\n", - "\\newcommand{\\expectation}[1]{\\left\\langle #1 \\right\\rangle }\n", - "\\newcommand{\\expectationDist}[2]{\\left\\langle #1 \\right\\rangle _{#2}}\n", - "\\newcommand{\\expectedDistanceMatrix}{\\mathcal{D}}\n", - "\\newcommand{\\eye}{\\mathbf{I}}\n", - "\\newcommand{\\fantasyDim}{r}\n", - "\\newcommand{\\fantasyMatrix}{\\mathbf{ \\MakeUppercase{\\fantasyScalar}}}\n", - "\\newcommand{\\fantasyScalar}{z}\n", - "\\newcommand{\\fantasyVector}{\\mathbf{ \\fantasyScalar}}\n", - "\\newcommand{\\featureStd}{\\varsigma}\n", - "\\newcommand{\\gammaCdf}[3]{\\mathcal{GAMMA CDF}\\left(#1|#2,#3\\right)}\n", - "\\newcommand{\\gammaDist}[3]{\\mathcal{G}\\left(#1|#2,#3\\right)}\n", - "\\newcommand{\\gammaSamp}[2]{\\mathcal{G}\\left(#1,#2\\right)}\n", - "\\newcommand{\\gaussianDist}[3]{\\mathcal{N}\\left(#1|#2,#3\\right)}\n", - "\\newcommand{\\gaussianSamp}[2]{\\mathcal{N}\\left(#1,#2\\right)}\n", - "\\newcommand{\\uniformDist}[3]{\\mathcal{U}\\left(#1|#2,#3\\right)}\n", - "\\newcommand{\\uniformSamp}[2]{\\mathcal{U}\\left(#1,#2\\right)}\n", - "\\newcommand{\\given}{|}\n", - "\\newcommand{\\half}{\\frac{1}{2}}\n", - "\\newcommand{\\heaviside}{H}\n", - "\\newcommand{\\hiddenMatrix}{\\mathbf{ \\MakeUppercase{\\hiddenScalar}}}\n", - "\\newcommand{\\hiddenScalar}{h}\n", - "\\newcommand{\\hiddenVector}{\\mathbf{ \\hiddenScalar}}\n", - "\\newcommand{\\identityMatrix}{\\eye}\n", - "\\newcommand{\\inducingInputScalar}{z}\n", - "\\newcommand{\\inducingInputVector}{\\mathbf{ 
\\inducingInputScalar}}\n", - "\\newcommand{\\inducingInputMatrix}{\\mathbf{Z}}\n", - "\\newcommand{\\inducingScalar}{u}\n", - "\\newcommand{\\inducingVector}{\\mathbf{ \\inducingScalar}}\n", - "\\newcommand{\\inducingMatrix}{\\mathbf{U}}\n", - "\\newcommand{\\inlineDiff}[2]{\\text{d}#1/\\text{d}#2}\n", - "\\newcommand{\\inputDim}{q}\n", - "\\newcommand{\\inputMatrix}{\\mathbf{X}}\n", - "\\newcommand{\\inputScalar}{x}\n", - "\\newcommand{\\inputSpace}{\\mathcal{X}}\n", - "\\newcommand{\\inputVals}{\\inputVector}\n", - "\\newcommand{\\inputVector}{\\mathbf{ \\inputScalar}}\n", - "\\newcommand{\\iterNum}{k}\n", - "\\newcommand{\\kernel}{\\kernelScalar}\n", - "\\newcommand{\\kernelMatrix}{\\mathbf{K}}\n", - "\\newcommand{\\kernelScalar}{k}\n", - "\\newcommand{\\kernelVector}{\\mathbf{ \\kernelScalar}}\n", - "\\newcommand{\\kff}{\\kernelScalar_{\\mappingFunction \\mappingFunction}}\n", - "\\newcommand{\\kfu}{\\kernelVector_{\\mappingFunction \\inducingScalar}}\n", - "\\newcommand{\\kuf}{\\kernelVector_{\\inducingScalar \\mappingFunction}}\n", - "\\newcommand{\\kuu}{\\kernelVector_{\\inducingScalar \\inducingScalar}}\n", - "\\newcommand{\\lagrangeMultiplier}{\\lambda}\n", - "\\newcommand{\\lagrangeMultiplierMatrix}{\\boldsymbol{ \\Lambda}}\n", - "\\newcommand{\\lagrangian}{L}\n", - "\\newcommand{\\laplacianFactor}{\\mathbf{ \\MakeUppercase{\\laplacianFactorScalar}}}\n", - "\\newcommand{\\laplacianFactorScalar}{m}\n", - "\\newcommand{\\laplacianFactorVector}{\\mathbf{ \\laplacianFactorScalar}}\n", - "\\newcommand{\\laplacianMatrix}{\\mathbf{L}}\n", - "\\newcommand{\\laplacianScalar}{\\ell}\n", - "\\newcommand{\\laplacianVector}{\\mathbf{ \\ell}}\n", - "\\newcommand{\\latentDim}{q}\n", - "\\newcommand{\\latentDistanceMatrix}{\\boldsymbol{ \\Delta}}\n", - "\\newcommand{\\latentDistanceScalar}{\\delta}\n", - "\\newcommand{\\latentDistanceVector}{\\boldsymbol{ \\delta}}\n", - "\\newcommand{\\latentForce}{f}\n", - "\\newcommand{\\latentFunction}{u}\n", - "\\newcommand{\\latentFunctionVector}{\\mathbf{ \\latentFunction}}\n", - "\\newcommand{\\latentFunctionMatrix}{\\mathbf{ \\MakeUppercase{\\latentFunction}}}\n", - "\\newcommand{\\latentIndex}{j}\n", - "\\newcommand{\\latentScalar}{z}\n", - "\\newcommand{\\latentVector}{\\mathbf{ \\latentScalar}}\n", - "\\newcommand{\\latentMatrix}{\\mathbf{Z}}\n", - "\\newcommand{\\learnRate}{\\eta}\n", - "\\newcommand{\\lengthScale}{\\ell}\n", - "\\newcommand{\\rbfWidth}{\\ell}\n", - "\\newcommand{\\likelihoodBound}{\\mathcal{L}}\n", - "\\newcommand{\\likelihoodFunction}{L}\n", - "\\newcommand{\\locationScalar}{\\mu}\n", - "\\newcommand{\\locationVector}{\\boldsymbol{ \\locationScalar}}\n", - "\\newcommand{\\locationMatrix}{\\mathbf{M}}\n", - "\\newcommand{\\variance}[1]{\\text{var}\\left( #1 \\right)}\n", - "\\newcommand{\\mappingFunction}{f}\n", - "\\newcommand{\\mappingFunctionMatrix}{\\mathbf{F}}\n", - "\\newcommand{\\mappingFunctionTwo}{g}\n", - "\\newcommand{\\mappingFunctionTwoMatrix}{\\mathbf{G}}\n", - "\\newcommand{\\mappingFunctionTwoVector}{\\mathbf{ \\mappingFunctionTwo}}\n", - "\\newcommand{\\mappingFunctionVector}{\\mathbf{ \\mappingFunction}}\n", - "\\newcommand{\\scaleScalar}{s}\n", - "\\newcommand{\\mappingScalar}{w}\n", - "\\newcommand{\\mappingVector}{\\mathbf{ \\mappingScalar}}\n", - "\\newcommand{\\mappingMatrix}{\\mathbf{W}}\n", - "\\newcommand{\\mappingScalarTwo}{v}\n", - "\\newcommand{\\mappingVectorTwo}{\\mathbf{ \\mappingScalarTwo}}\n", - "\\newcommand{\\mappingMatrixTwo}{\\mathbf{V}}\n", - "\\newcommand{\\maxIters}{K}\n", - 
"\\newcommand{\\meanMatrix}{\\mathbf{M}}\n", - "\\newcommand{\\meanScalar}{\\mu}\n", - "\\newcommand{\\meanTwoMatrix}{\\mathbf{M}}\n", - "\\newcommand{\\meanTwoScalar}{m}\n", - "\\newcommand{\\meanTwoVector}{\\mathbf{ \\meanTwoScalar}}\n", - "\\newcommand{\\meanVector}{\\boldsymbol{ \\meanScalar}}\n", - "\\newcommand{\\mrnaConcentration}{m}\n", - "\\newcommand{\\naturalFrequency}{\\omega}\n", - "\\newcommand{\\neighborhood}[1]{\\mathcal{N}\\left( #1 \\right)}\n", - "\\newcommand{\\neilurl}{http://inverseprobability.com/}\n", - "\\newcommand{\\noiseMatrix}{\\boldsymbol{ E}}\n", - "\\newcommand{\\noiseScalar}{\\epsilon}\n", - "\\newcommand{\\noiseVector}{\\boldsymbol{ \\epsilon}}\n", - "\\newcommand{\\norm}[1]{\\left\\Vert #1 \\right\\Vert}\n", - "\\newcommand{\\normalizedLaplacianMatrix}{\\hat{\\mathbf{L}}}\n", - "\\newcommand{\\normalizedLaplacianScalar}{\\hat{\\ell}}\n", - "\\newcommand{\\normalizedLaplacianVector}{\\hat{\\mathbf{ \\ell}}}\n", - "\\newcommand{\\numActive}{m}\n", - "\\newcommand{\\numBasisFunc}{m}\n", - "\\newcommand{\\numComponents}{m}\n", - "\\newcommand{\\numComps}{K}\n", - "\\newcommand{\\numData}{n}\n", - "\\newcommand{\\numFeatures}{K}\n", - "\\newcommand{\\numHidden}{h}\n", - "\\newcommand{\\numInducing}{m}\n", - "\\newcommand{\\numLayers}{\\ell}\n", - "\\newcommand{\\numNeighbors}{K}\n", - "\\newcommand{\\numSequences}{s}\n", - "\\newcommand{\\numSuccess}{s}\n", - "\\newcommand{\\numTasks}{m}\n", - "\\newcommand{\\numTime}{T}\n", - "\\newcommand{\\numTrials}{S}\n", - "\\newcommand{\\outputIndex}{j}\n", - "\\newcommand{\\paramVector}{\\boldsymbol{ \\theta}}\n", - "\\newcommand{\\parameterMatrix}{\\boldsymbol{ \\Theta}}\n", - "\\newcommand{\\parameterScalar}{\\theta}\n", - "\\newcommand{\\parameterVector}{\\boldsymbol{ \\parameterScalar}}\n", - "\\newcommand{\\partDiff}[2]{\\frac{\\partial#1}{\\partial#2}}\n", - "\\newcommand{\\precisionScalar}{j}\n", - "\\newcommand{\\precisionVector}{\\mathbf{ \\precisionScalar}}\n", - "\\newcommand{\\precisionMatrix}{\\mathbf{J}}\n", - "\\newcommand{\\pseudotargetScalar}{\\widetilde{y}}\n", - "\\newcommand{\\pseudotargetVector}{\\mathbf{ \\pseudotargetScalar}}\n", - "\\newcommand{\\pseudotargetMatrix}{\\mathbf{ \\widetilde{Y}}}\n", - "\\newcommand{\\rank}[1]{\\text{rank}\\left(#1\\right)}\n", - "\\newcommand{\\rayleighDist}[2]{\\mathcal{R}\\left(#1|#2\\right)}\n", - "\\newcommand{\\rayleighSamp}[1]{\\mathcal{R}\\left(#1\\right)}\n", - "\\newcommand{\\responsibility}{r}\n", - "\\newcommand{\\rotationScalar}{r}\n", - "\\newcommand{\\rotationVector}{\\mathbf{ \\rotationScalar}}\n", - "\\newcommand{\\rotationMatrix}{\\mathbf{R}}\n", - "\\newcommand{\\sampleCovScalar}{s}\n", - "\\newcommand{\\sampleCovVector}{\\mathbf{ \\sampleCovScalar}}\n", - "\\newcommand{\\sampleCovMatrix}{\\mathbf{s}}\n", - "\\newcommand{\\scalarProduct}[2]{\\left\\langle{#1},{#2}\\right\\rangle}\n", - "\\newcommand{\\sign}[1]{\\text{sign}\\left(#1\\right)}\n", - "\\newcommand{\\sigmoid}[1]{\\sigma\\left(#1\\right)}\n", - "\\newcommand{\\singularvalue}{\\ell}\n", - "\\newcommand{\\singularvalueMatrix}{\\mathbf{L}}\n", - "\\newcommand{\\singularvalueVector}{\\mathbf{l}}\n", - "\\newcommand{\\sorth}{\\mathbf{u}}\n", - "\\newcommand{\\spar}{\\lambda}\n", - "\\newcommand{\\trace}[1]{\\text{tr}\\left(#1\\right)}\n", - "\\newcommand{\\BasalRate}{B}\n", - "\\newcommand{\\DampingCoefficient}{C}\n", - "\\newcommand{\\DecayRate}{D}\n", - "\\newcommand{\\Displacement}{X}\n", - "\\newcommand{\\LatentForce}{F}\n", - "\\newcommand{\\Mass}{M}\n", - 
"\\newcommand{\\Sensitivity}{S}\n", - "\\newcommand{\\basalRate}{b}\n", - "\\newcommand{\\dampingCoefficient}{c}\n", - "\\newcommand{\\mass}{m}\n", - "\\newcommand{\\sensitivity}{s}\n", - "\\newcommand{\\springScalar}{\\kappa}\n", - "\\newcommand{\\springVector}{\\boldsymbol{ \\kappa}}\n", - "\\newcommand{\\springMatrix}{\\boldsymbol{ \\mathcal{K}}}\n", - "\\newcommand{\\tfConcentration}{p}\n", - "\\newcommand{\\tfDecayRate}{\\delta}\n", - "\\newcommand{\\tfMrnaConcentration}{f}\n", - "\\newcommand{\\tfVector}{\\mathbf{ \\tfConcentration}}\n", - "\\newcommand{\\velocity}{v}\n", - "\\newcommand{\\sufficientStatsScalar}{g}\n", - "\\newcommand{\\sufficientStatsVector}{\\mathbf{ \\sufficientStatsScalar}}\n", - "\\newcommand{\\sufficientStatsMatrix}{\\mathbf{G}}\n", - "\\newcommand{\\switchScalar}{s}\n", - "\\newcommand{\\switchVector}{\\mathbf{ \\switchScalar}}\n", - "\\newcommand{\\switchMatrix}{\\mathbf{S}}\n", - "\\newcommand{\\tr}[1]{\\text{tr}\\left(#1\\right)}\n", - "\\newcommand{\\loneNorm}[1]{\\left\\Vert #1 \\right\\Vert_1}\n", - "\\newcommand{\\ltwoNorm}[1]{\\left\\Vert #1 \\right\\Vert_2}\n", - "\\newcommand{\\onenorm}[1]{\\left\\vert#1\\right\\vert_1}\n", - "\\newcommand{\\twonorm}[1]{\\left\\Vert #1 \\right\\Vert}\n", - "\\newcommand{\\vScalar}{v}\n", - "\\newcommand{\\vVector}{\\mathbf{v}}\n", - "\\newcommand{\\vMatrix}{\\mathbf{V}}\n", - "\\newcommand{\\varianceDist}[2]{\\text{var}_{#2}\\left( #1 \\right)}\n", - "\\newcommand{\\vecb}[1]{\\left(#1\\right):}\n", - "\\newcommand{\\weightScalar}{w}\n", - "\\newcommand{\\weightVector}{\\mathbf{ \\weightScalar}}\n", - "\\newcommand{\\weightMatrix}{\\mathbf{W}}\n", - "\\newcommand{\\weightedAdjacencyMatrix}{\\mathbf{A}}\n", - "\\newcommand{\\weightedAdjacencyScalar}{a}\n", - "\\newcommand{\\weightedAdjacencyVector}{\\mathbf{ \\weightedAdjacencyScalar}}\n", - "\\newcommand{\\onesVector}{\\mathbf{1}}\n", - "\\newcommand{\\zerosVector}{\\mathbf{0}}\n", "$$" - ] + ], + "id": "c4b84963-87a9-4fe4-b0c4-4135141c9e8b" }, { "cell_type": "markdown", "metadata": {}, "source": [ + "::: {.cell .markdown}\n", + "\n", "\n", "\n", "\n", @@ -339,17 +50,20 @@ "" - ] + ], + "id": "66921a82-61bd-4235-92cf-0341f200bfab" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Setup\n", - "-----\n", + "## Setup\n", "\n", - "First we download some libraries and files to support the notebook." 
- ] + "\\[edit\\]" + ], + "id": "0d90c63c-a8bc-437f-81e9-d2f2fbc31cd6" }, { "cell_type": "code", @@ -357,17 +71,38 @@ "metadata": {}, "outputs": [], "source": [ - "import urllib.request" - ] + "import matplotlib.pyplot as plt\n", + "plt.rcParams.update({'font.size': 22})" + ], + "id": "d941bf3c-5c1d-4631-a09e-89e1cffb7a0a" }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "urllib.request.urlretrieve('https://raw.githubusercontent.com/lawrennd/talks/gh-pages/mlai.py','mlai.py')" - ] + "" + ], + "id": "a32c1689-a047-4c86-8a57-b603ca208ad1" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## notutils\n", + "\n", + "\\[edit\\]\n", + "\n", + "This small package is a helper package for various notebook utilities\n", + "used below.\n", + "\n", + "The software can be installed using" + ], + "id": "b627017c-f2fd-4899-8d74-2cd632681d65" }, { "cell_type": "code", @@ -375,8 +110,22 @@ "metadata": {}, "outputs": [], "source": [ - "urllib.request.urlretrieve('https://raw.githubusercontent.com/lawrennd/talks/gh-pages/teaching_plots.py','teaching_plots.py')" - ] + "%pip install notutils" + ], + "id": "cfc320f7-d489-4a2e-ad38-056a0575c7b2" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "from the command prompt where you can access your python installation.\n", + "\n", + "The code is also available on GitHub:\n", + "\n", + "\n", + "Once `notutils` is installed, it can be imported in the usual manner." + ], + "id": "cd94f4d4-9ecd-4e9a-91fb-9f45668a93a9" }, { "cell_type": "code", @@ -384,15 +133,19 @@ "metadata": {}, "outputs": [], "source": [ - "urllib.request.urlretrieve('https://raw.githubusercontent.com/lawrennd/talks/gh-pages/gp_tutorial.py','gp_tutorial.py')" - ] + "import notutils" + ], + "id": "7ce0f3c6-58ad-4f44-ae04-f81025adeff8" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "pods\n", - "----\n", + "## pods\n", + "\n", + "\\[edit\\]\n", "\n", "In Sheffield we created a suite of software tools for ‘Open Data\n", "Science’. Open data science is an approach to sharing code, models and\n", @@ -403,7 +156,8 @@ "Science](http://inverseprobability.com/2014/07/01/open-data-science).\n", "\n", "The software can be installed using" - ] + ], + "id": "8b5c9e7f-1f21-4de5-aff7-3bb57ddc5e8e" }, { "cell_type": "code", @@ -411,8 +165,9 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install --upgrade git+https://github.com/sods/ods" - ] + "%pip install pods" + ], + "id": "0bb5fed6-2b6a-42b3-9875-f53981a40039" }, { "cell_type": "markdown", @@ -420,11 +175,11 @@ "source": [ "from the command prompt where you can access your python installation.\n", "\n", - "The code is also available on github:\n", - "https://github.com/sods/ods\n", + "The code is also available on GitHub: \n", "\n", "Once `pods` is installed, it can be imported in the usual manner." - ] + ], + "id": "8b090fc1-d1f5-4fb3-9934-2e009962a254" }, { "cell_type": "code", @@ -433,46 +188,105 @@ "outputs": [], "source": [ "import pods" - ] + ], + "id": "f6df8cdb-b094-43f8-b4a2-de4ddf3f970d" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## mlai\n", + "\n", + "\\[edit\\]\n", + "\n", + "The `mlai` software is a suite of helper functions for teaching and\n", + "demonstrating machine learning algorithms. 
It was first used in the\n", + "Machine Learning and Adaptive Intelligence course in Sheffield in 2013.\n", + "\n", + "The software can be installed using" + ], + "id": "9c065d71-d568-4053-bfeb-da0441e4ec02" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install mlai" + ], + "id": "b5a1945c-9b25-489b-bae5-6f487eac5dbb" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "from the command prompt where you can access your python installation.\n", + "\n", + "The code is also available on GitHub: \n", + "\n", + "Once `mlai` is installed, it can be imported in the usual manner." + ], + "id": "7e05a0e8-381a-4086-a8da-af9267c371dd" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "8dad9c59-e3c8-461d-ae7f-44879fd5561e" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: A key reference for Gaussian process models remains the\n", "excellent book “Gaussian Processes for Machine Learning” (Rasmussen and\n", "Williams (2006)). The book is also\n", - "freely\n", + "freely\n", "available online.\n", "\n", "Rasmussen and Williams (2006) is still one of the most important\n", "references on Gaussian process models. It is [available freely\n", "online](http://www.gaussianprocess.org/gpml/)." - ] + ], + "id": "90c84595-b4be-44eb-82aa-b3262b413049" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "A First Course in Machine Learning\n", - "----------------------------------\n", + "## A First Course in Machine Learning\n", "\n", - "\n", + "\\[edit\\]\n", + "\n", + "\n", "\n", "Figure: The main course text is “A First Course in Machine Learning”\n", "by Rogers and Girolami (2011).\n", "\n", "" - ] + ], + "id": "afecd9d9-c61d-4395-a152-17dd5e0479a1" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Example: Prediction of Malaria Incidence in Uganda\n", - "--------------------------------------------------\n", + "## Example: Prediction of Malaria Incidence in Uganda\n", + "\n", + "\\[edit\\]\n", "\n", "\n", "\n", @@ -492,7 +306,7 @@ "\n", "\n", "\n", - "\n", + "\n", "\n", "\n", "\n", @@ -509,11 +323,11 @@ "\n", "\n", "\n", - "Ricardo Andrade Pacheco\n", + "Ricardo Andrade Pacecho\n", "\n", "\n", "\n", - "\n", + "\n", "\n", "\n", "\n", @@ -534,7 +348,7 @@ "\n", "\n", "\n", - "\n", + "\n", "\n", "\n", "\n", @@ -549,7 +363,12 @@ "collaboration with John Quinn and Martin Mubangizi (Andrade-Pacheco et\n", "al., 2014; Mubangizi et al., 2014). John and Martin were initally from\n", "the AI-DEV group from the University of Makerere in Kampala and more\n", - "latterly they were based at UN Global Pulse in Kampala.\n", + "latterly they were based at UN Global Pulse in Kampala. You can see the\n", + "work summarized on the UN Global Pulse [disease outbreaks project site\n", + "here](https://diseaseoutbreaks.unglobalpulse.net/uganda/).\n", + "\n", + "- See [UN Global Pulse Disease Outbreaks\n", + " Site](https://diseaseoutbreaks.unglobalpulse.net/uganda/)\n", "\n", "Malaria data is spatial data. Uganda is split into districts, and health\n", "reports can be found for each district. This suggests that models such\n", @@ -559,20 +378,19 @@ "location within a district, such as Nagongera which is a sentinel site\n", "based in the Tororo district.\n", "\n", - "\n", + "\n", "\n", - "Figure: Ugandan districs. 
Data SRTM/NASA from\n", - "https://dds.cr.usgs.gov/srtm/version2_1.\n", + "Figure: Ugandan districts. Data SRTM/NASA from\n", + ".\n", "\n", - "(Andrade-Pacheco et al., 2014; Mubangizi\n", - "et al., 2014)\n", + "(Andrade-Pacheco et al., 2014; Mubangizi et al., 2014)\n", "\n", "The common standard for collecting health data on the African continent\n", "is from the Health management information systems (HMIS). However, this\n", "data suffers from missing values (Gething et al., 2006) and diagnosis of\n", "diseases like typhoid and malaria may be confounded.\n", "\n", - "\n", + "\n", "\n", "Figure: The Tororo district, where the sentinel site, Nagongera, is\n", "located.\n", @@ -584,7 +402,7 @@ "sites give accurate assessment of malaria disease levels in Uganda,\n", "including a site in Nagongera.\n", "\n", - "\n", + "\n", "\n", "Figure: Sentinel and HMIS data along with rainfall and temperature\n", "for the Nagongera sentinel station in the Tororo district.\n", @@ -599,33 +417,33 @@ "and temperature, to improve predictions from HMIS data of levels of\n", "malaria.\n", "\n", - "\n", + "\n", "\n", "Figure: The Mubende District.\n", "\n", - "\n", + "\n", "\n", "Figure: Prediction of malaria incidence in Mubende.\n", "\n", - "\n", + "\n", "\n", "Figure: The project arose out of the Gaussian process summer school\n", "held at Makerere in Kampala in 2013. The school led, in turn, to the\n", "Data Science Africa initiative." - ] + ], + "id": "e2a3abf4-e15c-49b6-8e75-ba59ee7b320e" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Early Warning Systems\n", - "---------------------\n", + "## Early Warning Systems\n", "\n", - "\n", + "\n", "\n", "Figure: The Kabarole district in Uganda.\n", "\n", - "\n", + "\n", "\n", "Figure: Estimate of the current disease situation in the Kabarole\n", "district over time. Estimate is constructed with a Gaussian process with\n", @@ -653,7 +471,7 @@ "Finally, there is a gray region which represents when the scale of the\n", "effect is small.\n", "\n", - "\n", + "\n", "\n", "Figure: The map of Ugandan districts with an overview of the Malaria\n", "situation in each district.\n", @@ -661,14 +479,18 @@ "These colors can now be observed directly on a spatial map of the\n", "districts to give an immediate impression of the current status of the\n", "disease across the country." - ] + ], + "id": "1dac77bd-8a7e-4b01-ae39-c5d1ed0f9200" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "What is Machine Learning?\n", - "=========================\n", + "# What is Machine Learning?\n", + "\n", + "\\[edit\\]\n", "\n", "What is machine learning? At its most basic level machine learning is a\n", "combination of\n", @@ -689,22 +511,22 @@ "In practice we normally perform machine learning using two functions. To\n", "combine data with a model we typically make use of:\n", "\n", - "**a prediction function** a function which is used to make the\n", - "predictions. It includes our beliefs about the regularities of the\n", - "universe, our assumptions about how the world works, e.g. smoothness,\n", - "spatial similarities, temporal similarities.\n", + "**a prediction function** it is used to make the predictions. It\n", + "includes our beliefs about the regularities of the universe, our\n", + "assumptions about how the world works, e.g., smoothness, spatial\n", + "similarities, temporal similarities.\n", "\n", - "**an objective function** a function which defines the cost of\n", - "misprediction. 
Typically it includes knowledge about the world’s\n", - "generating processes (probabilistic objectives) or the costs we pay for\n", - "mispredictions (empiricial risk minimization).\n", + "**an objective function** it defines the ‘cost’ of misprediction.\n", + "Typically, it includes knowledge about the world’s generating processes\n", + "(probabilistic objectives) or the costs we pay for mispredictions\n", + "(empirical risk minimization).\n", "\n", "The combination of data and model through the prediction function and\n", "the objective function leads to a *learning algorithm*. The class of\n", "prediction functions and objective functions we can make use of is\n", "restricted by the algorithms they lead to. If the prediction function or\n", "the objective function are too complex, then it can be difficult to find\n", - "an appropriate learning algorithm. Much of the acdemic field of machine\n", + "an appropriate learning algorithm. Much of the academic field of machine\n", "learning is the quest for new learning algorithms that allow us to bring\n", "different types of models and data together.\n", "\n", @@ -714,15 +536,19 @@ "Example](https://royalsociety.org/~/media/policy/projects/machine-learning/publications/machine-learning-report.pdf).\n", "\n", "You can also check my post blog post on [What is Machine\n", - "Learning?](http://inverseprobability.com/2017/07/17/what-is-machine-learning).." - ] + "Learning?](http://inverseprobability.com/2017/07/17/what-is-machine-learning)." + ], + "id": "0d1bf8f3-8054-4bc3-906f-2d74e2cf95e5" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Overdetermined System\n", - "---------------------\n", + "## Overdetermined System\n", + "\n", + "\\[edit\\]\n", "\n", "The challenge with a linear model is that it has two unknowns, $m$, and\n", "$c$. Observing data allows us to write down a system of simultaneous\n", @@ -731,30 +557,35 @@ "a second data point, $x= 3$, $y=1$, then we can write two simultaneous\n", "linear equations of the form.\n", "\n", - "point 1: $x= 1$, $y=3$ $$3 = m + c$$ point 2: $x= 3$, $y=1$\n", - "$$1 = 3m + c$$\n", + "point 1: $x= 1$, $y=3$ $$\n", + "3 = m + c\n", + "$$ point 2: $x= 3$, $y=1$ $$\n", + "1 = 3m + c\n", + "$$\n", "\n", "The solution to these two simultaneous equations can be represented\n", "graphically as\n", "\n", - "\n", + "\n", "\n", "Figure: The solution of two linear equations represented as the fit\n", "of a straight line through two data\n", "\n", - "The challenge comes when a third data point is observed and it doesn’t\n", - "naturally fit on the straight line.\n", + "The challenge comes when a third data point is observed, and it doesn’t\n", + "fit on the straight line.\n", "\n", - "point 3: $x= 2$, $y=2.5$ $$2.5 = 2m + c$$\n", + "point 3: $x= 2$, $y=2.5$ $$\n", + "2.5 = 2m + c\n", + "$$\n", "\n", - "\n", + "\n", "\n", "Figure: A third observation of data is inconsistent with the solution\n", "dictated by the first two observations\n", "\n", "Now there are three candidate lines, each consistent with our data.\n", "\n", - "\n", + "\n", "\n", "Figure: Three solutions to the problem, each consistent with two\n", "points of the three observations\n", @@ -763,7 +594,8 @@ "than we need to determine our parameters. The problem arises because the\n", "model is a simplification of the real world, and the data we observe is\n", "therefore inconsistent with our model." 
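To make the inconsistency concrete, here is a small sketch (an addition, assuming only `numpy`) that solves the first two equations exactly and then checks the third observation against the resulting line.

```python
import numpy as np

# Solve the two-point system exactly:
#   point 1: 3 = 1*m + c
#   point 2: 1 = 3*m + c
A = np.array([[1., 1.],
              [3., 1.]])
b = np.array([3., 1.])
m, c = np.linalg.solve(A, b)
print(f"m = {m}, c = {c}")        # m = -1.0, c = 4.0

# The third observation does not lie on this line:
x3, y3 = 2., 2.5
print(f"line predicts {m*x3 + c} at x = {x3}, but we observed {y3}")
```

With only two observations the parameters are pinned down exactly; the third point then has no freedom left to be absorbed, which is the inconsistency the figures above illustrate.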
- ] + ], + "id": "a5a068bd-65f7-4f04-9140-baae13da1ebc" }, { "cell_type": "code", @@ -771,8 +603,9 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "5c6d2f26-2d50-4e5f-aee9-6cd216d75e6c" }, { "cell_type": "code", @@ -781,7 +614,8 @@ "outputs": [], "source": [ "plot.over_determined_system(diagrams='./ml')" - ] + ], + "id": "b29e09c9-23a9-4da9-9e40-0756c0380e92" }, { "cell_type": "code", @@ -790,8 +624,9 @@ "outputs": [], "source": [ "from ipywidgets import IntSlider\n", - "import pods" - ] + "import notutils as nu" + ], + "id": "ad30c66e-a39b-41fc-9702-e1a628f4dad7" }, { "cell_type": "code", @@ -799,20 +634,28 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('over_determined_system{samp:0>3}.svg',\n", - " directory='./ml', \n", - " samp=IntSlider(1,1,7,1))" - ] + "nu.display_plots('over_determined_system{samp:0>3}.svg',\n", + " directory='./ml', \n", + " samp=IntSlider(1,1,7,1))" + ], + "id": "ced7e252-facd-4dac-bdfd-cad1eaadb15a" }, { "cell_type": "markdown", "metadata": {}, "source": [ + "## Pierre-Simon Laplace\n", + "\n", + "\\[edit\\]\n", + "\n", "The solution was proposed by Pierre-Simon Laplace. His idea was to\n", "accept that the model was an incomplete representation of the real\n", - "world, and the manner in which it was incomplete is *unknown*. His idea\n", - "was that such unknowns could be dealt with through probability." - ] + "world, and the way it was incomplete is *unknown*. His idea was that\n", + "such unknowns could be dealt with through probability." + ], + "id": "936ea41f-2e24-4609-bfe7-a957e647fecb" }, { "cell_type": "markdown", @@ -820,10 +663,15 @@ "source": [ "### Pierre-Simon Laplace\n", "\n", - "\n", + "\\[edit\\]\n", + "\n", + "\n", "\n", "Figure: Pierre-Simon Laplace 1749-1827." - ] + ], + "id": "f3fc089c-e321-4791-a5dd-b18f9ff40df1" }, { "cell_type": "code", @@ -831,9 +679,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", - "pods.notebook.display_google_book(id='1YQPAAAAQAAJ', page='PR17-IA2')" - ] + "import notutils as nu\n", + "nu.display_google_book(id='1YQPAAAAQAAJ', page='PR17-IA2')" + ], + "id": "851cd53b-1272-4566-a4e0-f13c5ba2d6f4" }, { "cell_type": "markdown", @@ -856,9 +705,21 @@ "\n", "This notion is known as *Laplace’s demon* or *Laplace’s superman*.\n", "\n", - "\n", + "\n", "\n", - "Figure: Laplace’s determinsim in English translation.\n", + "Figure: Laplace’s determinsim in English translation." + ], + "id": "7dd072ba-fb77-428b-ab62-eac1cdcf954c" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Laplace’s Gremlin\n", + "\n", + "\\[edit\\]\n", "\n", "Unfortunately, most analyses of his ideas stop at that point, whereas\n", "his real point is that such a notion is unreachable. Not so much\n", @@ -871,7 +732,8 @@ ">\n", "> Probability is relative, in part to this ignorance, in part to our\n", "> knowledge." - ] + ], + "id": "d5047db3-0cfe-445a-88e6-f3bcd44dc27c" }, { "cell_type": "code", @@ -879,15 +741,16 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", - "pods.notebook.display_google_book(id='1YQPAAAAQAAJ', page='PR17-IA4')" - ] + "import notutils as nu\n", + "nu.display_google_book(id='1YQPAAAAQAAJ', page='PR17-IA4')" + ], + "id": "e686be3b-c6b3-4728-b0aa-d9a02bc201e7" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: To Laplace, determinism is a strawman. 
Ignorance of mechanism\n", "and data leads to uncertainty which should be dealt with through\n", @@ -897,14 +760,28 @@ "Universe due to our ignorance about the world, Laplace’s suggestion, and\n", "focus in this essay is that we turn to probability to deal with this\n", "uncertainty. This is also our inspiration for using probability in\n", - "machine learning.\n", + "machine learning. This is the true message of Laplace’s essay, not\n", + "determinism, but the gremlin of uncertainty that emerges from our\n", + "ignorance.\n", "\n", "The “forces by which nature is animated” is our *model*, the “situation\n", "of beings that compose it” is our *data* and the “intelligence\n", "sufficiently vast enough to submit these data to analysis” is our\n", "compute. The fly in the ointment is our *ignorance* about these aspects.\n", "And *probability* is the tool we use to incorporate this ignorance\n", - "leading to uncertainty or *doubt* in our predictions.\n", + "leading to uncertainty or *doubt* in our predictions." + ], + "id": "14739d2b-c8ad-48ed-8109-f15616ac72a1" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Latent Variables\n", + "\n", + "\\[edit\\]\n", "\n", "Laplace’s concept was that the reason that the data doesn’t match up to\n", "the model is because of unconsidered factors, and that these might be\n", @@ -914,30 +791,25 @@ "But in the context Laplace uses it, the variable is so common that it\n", "has other names such as a “slack” variable or the *noise* in the system.\n", "\n", - "point 1: $x= 1$, $y=3$ $$\n", - "3 = m + c + \\epsilon_1\n", - "$$ point 2: $x= 3$, $y=1$ $$\n", - "1 = 3m + c + \\epsilon_2\n", - "$$ point 3: $x= 2$, $y=2.5$ $$\n", - "2.5 = 2m + c + \\epsilon_3\n", - "$$\n", + "point 1: $x= 1$, $y=3$ \\[ 3 = m + c + \\_1 \\] point 2: $x= 3$, $y=1$ \\[ 1\n", + "= 3m + c + \\_2 \\] point 3: $x= 2$, $y=2.5$ \\[ 2.5 = 2m + c + \\_3 \\]\n", "\n", "Laplace’s trick has converted the *overdetermined* system into an\n", "*underdetermined* system. He has now added three variables,\n", "$\\{\\epsilon_i\\}_{i=1}^3$, which represent the unknown corruptions of the\n", "real world. Laplace’s idea is that we should represent that unknown\n", "corruption with a *probability distribution*." - ] + ], + "id": "e67221b2-a772-4ad2-8767-4fa8182e3d3c" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "A Probabilistic Process\n", - "-----------------------\n", + "## A Probabilistic Process\n", "\n", - "However, it was left to an admirer of Gauss to develop a practical\n", - "probability density for that purpose. It was Carl Friederich Gauss who\n", + "However, it was left to an admirer of Laplace to develop a practical\n", + "probability density for that purpose. It was Carl Friedrich Gauss who\n", "suggested that the *Gaussian* density (which at the time was unnamed!)\n", "should be used to represent this error.\n", "\n", @@ -945,25 +817,29 @@ "part, and a stochastic part. This type of function is sometimes known as\n", "a probabilistic or stochastic process, to distinguish it from a\n", "deterministic process." - ] + ], + "id": "b22425a5-4c8f-4a78-a31e-633627a4f081" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Two Important Gaussian Properties\n", - "---------------------------------\n", + "## Two Important Gaussian Properties\n", + "\n", + "\\[edit\\]\n", "\n", "The Gaussian density has many important properties, but for the moment\n", "we’ll review two of them." 
- ] + ], + "id": "28900d08-c4e0-455c-87be-5a5fad7ce59a" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Sum of Gaussians\n", - "----------------\n", + "## Sum of Gaussians\n", "\n", "If we assume that a variable, $y_i$, is sampled from a Gaussian density,\n", "\n", @@ -986,14 +862,14 @@ "summed together tend to a Gaussian density. That is the [*central limit\n", "theorem*](https://en.wikipedia.org/wiki/Central_limit_theorem) which is\n", "a major justification for the use of a Gaussian density." - ] + ], + "id": "fba05094-086c-4342-8c9b-1441779ad631" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Scaling a Gaussian\n", - "------------------\n", + "## Scaling a Gaussian\n", "\n", "Less unusual is the *scaling* property of a Gaussian density. If a\n", "variable, $y$, is sampled from a Gaussian density,\n", @@ -1040,14 +916,18 @@ "Principal Component Analysis (Tipping and Bishop, 1999), because we\n", "integrated out the inputs (or *latent* variables they would be called in\n", "that case)." - ] + ], + "id": "3e1649e1-f85d-4d2a-964d-f3851a9306e6" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Laplace’s Idea\n", - "--------------\n", + "## Laplace’s Idea\n", + "\n", + "\\[edit\\]\n", "\n", "Laplace had the idea to augment the observations by noise, that is\n", "equivalent to considering a probability density whose mean is given by\n", @@ -1058,18 +938,17 @@ "corrupted by noise. Laplace didn’t suggest the Gaussian density for that\n", "purpose, that was an innovation from Carl Friederich Gauss, which is\n", "what gives the Gaussian density its name." - ] + ], + "id": "6dfce428-858c-4b51-b81e-7a2721c05331" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Height as a Function of Weight\n", - "------------------------------\n", - "\n", - "In the standard Gaussian, parametized by mean and variance.\n", + "## Height as a Function of Weight\n", "\n", - "Make the mean a linear function of an *input*.\n", + "In the standard Gaussian, parameterized by mean and variance, make the\n", + "mean a linear function of an *input*.\n", "\n", "This leads to a regression model. $$\n", "\\begin{align*}\n", @@ -1079,34 +958,44 @@ "$$\n", "\n", "Assume $y_i$ is height and $x_i$ is weight." - ] + ], + "id": "4acbcf5f-e049-47ec-a937-ce2892a81eb7" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Linear Algebra\n", - "==============\n", + "## Olympic Marathon Data\n", "\n", - "Linear algebra provides a very similar role, when we introduce [linear\n", - "algebra](http://en.wikipedia.org/wiki/Linear_algebra), it is because we\n", - "are faced with a large number of addition and multiplication operations.\n", - "These operations need to be done together and would be very tedious to\n", - "write down as a group. So the first reason we reach for linear algebra\n", - "is for a more compact representation of our mathematical formulae." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Running Example: Olympic Marathons\n", - "----------------------------------\n", + "\\[edit\\]\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + "- Gold medal times for Olympic Marathon since 1896.\n", + "- Marathons before 1924 didn’t have a standardized distance.\n", + "- Present results using pace per km.\n", + "- In 1904 Marathon was badly organized leading to very slow times.\n", + "\n", + "\n", + "\n", + "\n", + "Image from Wikimedia Commons \n", + "\n", + "
\n", "\n", - "Now we will load in the Olympic marathon data. This is data of the\n", - "olympic marath times for the men’s marathon from the first olympics in\n", - "1896 up until the London 2012 olympics." - ] + "The first thing we will do is load a standard data set for regression\n", + "modelling. The data consists of the pace of Olympic Gold Medal Marathon\n", + "winners for the Olympics from 1896 to present. Let’s load in the data\n", + "and plot." + ], + "id": "ec70b849-801d-4a4c-a164-a19719947900" }, { "cell_type": "code", @@ -1114,8 +1003,10 @@ "metadata": {}, "outputs": [], "source": [ + "import numpy as np\n", "import pods" - ] + ], + "id": "93012545-e599-4961-b557-10cf12802b7f" }, { "cell_type": "code", @@ -1125,15 +1016,25 @@ "source": [ "data = pods.datasets.olympic_marathon_men()\n", "x = data['X']\n", - "y = data['Y']" - ] + "y = data['Y']\n", + "\n", + "offset = y.mean()\n", + "scale = np.sqrt(y.var())\n", + "yhat = (y - offset)/scale" + ], + "id": "cdd8b1f7-9310-4115-8633-3ec002b290ff" }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "You can see what these values are by typing:" - ] + "import matplotlib.pyplot as plt\n", + "import mlai.plot as plot\n", + "import mlai" + ], + "id": "aee76fd2-17a4-40f0-a8f0-197cbe36f131" }, { "cell_type": "code", @@ -1141,62 +1042,65 @@ "metadata": {}, "outputs": [], "source": [ - "print(x)\n", - "print(y)" - ] + "\n", + "xlim = (1875,2030)\n", + "ylim = (2.5, 6.5)\n", + "\n", + "fig, ax = plt.subplots(figsize=plot.big_wide_figsize)\n", + "_ = ax.plot(x, y, 'r.',markersize=10)\n", + "ax.set_xlabel('year', fontsize=20)\n", + "ax.set_ylabel('pace min/km', fontsize=20)\n", + "ax.set_xlim(xlim)\n", + "ax.set_ylim(ylim)\n", + "\n", + "mlai.write_figure(filename='olympic-marathon.svg', \n", + " directory='./datasets')" + ], + "id": "afc902a7-19f5-4948-b7f2-9dd57b01886e" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Note that they are not `pandas` data frames for this example, they are\n", - "just arrays of dimensionality $n\\times 1$, where $n$ is the number of\n", - "data.\n", + "\n", "\n", - "The aim of this lab is to have you coding linear regression in python.\n", - "We will do it in two ways, once using iterative updates (coordinate\n", - "ascent) and then using linear algebra. The linear algebra approach will\n", - "not only work much better, it is easy to extend to multiple input linear\n", - "regression and *non-linear* regression using basis functions." - ] + "Figure: Olympic marathon pace times since 1896.\n", + "\n", + "Things to notice about the data include the outlier in 1904, in that\n", + "year the Olympics was in St Louis, USA. Organizational problems and\n", + "challenges with dust kicked up by the cars following the race meant that\n", + "participants got lost, and only very few participants completed. More\n", + "recent years see more consistently quick marathons." 
+ ], + "id": "5c5b5436-81ee-4dc0-924c-3eabb4b30547" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Plotting the Data\n", - "-----------------\n", + "## Running Example: Olympic Marathons\n", "\n", - "You can make a plot of $y$ vs $x$ with the following command:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline \n", - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "plt.plot(x, y, 'rx')\n", - "plt.xlabel('year')\n", - "plt.ylabel('pace in min/km')" - ] + "\\[edit\\]\n", + "\n", + "Note that `x` and `y` are not `pandas` data frames for this example,\n", + "they are just arrays of dimensionality $n\\times 1$, where $n$ is the\n", + "number of data.\n", + "\n", + "The aim of this lab is to have you coding linear regression in python.\n", + "We will do it in two ways, once using iterative updates (coordinate\n", + "ascent) and then using linear algebra. The linear algebra approach will\n", + "not only work much better, it is also easy to extend to multiple input\n", + "linear regression and *non-linear* regression using basis functions." + ], + "id": "b122ff52-870a-4cba-a7a4-00939b61f733" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Maximum Likelihood: Iterative Solution\n", - "--------------------------------------\n", + "## Maximum Likelihood: Iterative Solution\n", "\n", "Now we will take the maximum likelihood approach we derived in the\n", "lecture to fit a line, $y_i=mx_i + c$, to the data you’ve plotted. We\n", @@ -1204,7 +1108,8 @@ "E(m, c) = \\sum_{i=1}^n(y_i-mx_i-c)^2\n", "$$ with respect to $m$, $c$ and $\\sigma^2$. We can start with an initial\n", "guess for $m$," - ] + ], + "id": "67cf9fe1-0716-46c1-b1b9-bfca3df3c24e" }, { "cell_type": "code", @@ -1214,7 +1119,8 @@ "source": [ "m = -0.4\n", "c = 80" - ] + ], + "id": "879a23d6-6ba1-4277-8a30-6686cd313191" }, { "cell_type": "markdown", @@ -1222,41 +1128,92 @@ "source": [ "Then we use the maximum likelihood update to find an estimate for the\n", "offset, $c$." - ] + ], + "id": "9b3a95b7-c8d4-42a6-bbc0-e2073288537e" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Log Likelihood for Multivariate Regression\n", + "\n", + "\\[edit\\]" + ], + "id": "33a88913-9eab-4e21-9f14-618be040f396" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quadratic Loss\n", + "\n", + "\\[edit\\]\n", + "\n", + "Now we’ve identified the empirical risk with the loss, we’ll use\n", + "$E(\\mathbf{ w})$ to represent our objective function. $$\n", + "E(\\mathbf{ w}) = \\sum_{i=1}^n\\left(y_i - f(\\mathbf{ x}_i, \\mathbf{ w})\\right)^2\n", + "$$ gives us our objective.\n", + "\n", + "In the case of the linear prediction function, we can substitute\n", + "$f(\\mathbf{ x}_i, \\mathbf{ w}) = \\mathbf{ w}^\\top \\mathbf{ x}_i$. $$\n", + "E(\\mathbf{ w}) = \\sum_{i=1}^n\\left(y_i - \\mathbf{ w}^\\top \\mathbf{ x}_i\\right)^2\n", + "$$ To compute the gradient of the objective, we first expand the\n", + "brackets." 
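As a point of comparison with the linear-algebraic route developed below, here is a minimal sketch (not the notebook's own code) of the iterative maximum likelihood updates described above. It assumes `x`, `y` and the initial guesses `m` and `c` from the earlier cells; each update sets one partial derivative of $E(m, c)$ to zero with the other parameter held fixed.

```python
import numpy as np

# Coordinate-wise updates for y = m*x + c (assumes x, y, m, c defined above).
# Each step minimises E(m, c) over one parameter with the other held fixed.
for iteration in range(10):
    c = (y - m*x).mean()                      # solves dE/dc = 0 for fixed m
    m = ((y - c)*x).sum() / (x*x).sum()       # solves dE/dm = 0 for fixed c
    E = ((y - m*x - c)**2).sum()
    print(f"iteration {iteration}: m = {float(m):.4f}, c = {float(c):.4f}, E = {float(E):.3f}")
```

Each coordinate update can only decrease the error, but convergence is slow, which is part of the motivation for the direct linear-algebra solution that follows.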
+ ], + "id": "83013905-8081-4a4b-9f61-e29d8683d532" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Log Likelihood for Multivariate Regression\n", - "------------------------------------------" - ] + "## Bracket Expansion\n", + "\n", + "$$\n", + "\\begin{align*}\n", + " E(\\mathbf{ w},\\sigma^2) = &\n", + "\\frac{n}{2}\\log \\sigma^2 + \\frac{1}{2\\sigma^2}\\sum\n", + "_{i=1}^{n}y_i^{2}-\\frac{1}{\\sigma^2}\\sum\n", + "_{i=1}^{n}y_i\\mathbf{ w}^{\\top}\\mathbf{ x}_i\\\\&+\\frac{1}{2\\sigma^2}\\sum\n", + "_{i=1}^{n}\\mathbf{ w}^{\\top}\\mathbf{ x}_i\\mathbf{ x}_i^{\\top}\\mathbf{ w}\n", + "+\\text{const}.\\\\\n", + " = & \\frac{n}{2}\\log \\sigma^2 + \\frac{1}{2\\sigma^2}\\sum\n", + "_{i=1}^{n}y_i^{2}-\\frac{1}{\\sigma^2}\n", + "\\mathbf{ w}^\\top\\sum_{i=1}^{n}\\mathbf{ x}_iy_i\\\\&+\\frac{1}{2\\sigma^2}\n", + "\\mathbf{ w}^{\\top}\\left[\\sum\n", + "_{i=1}^{n}\\mathbf{ x}_i\\mathbf{ x}_i^{\\top}\\right]\\mathbf{ w}+\\text{const}.\n", + "\\end{align*}\n", + "$$" + ], + "id": "e6a267d8-6821-45d1-93d1-abb08c52bb63" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Multiple Input Solution with Linear Algebra\n", - "===========================================\n", + "# Solution with Linear Algebra\n", "\n", - "You’ve now seen how slow it can be to perform a coordinate ascent on a\n", - "system. Another approach to solving the system (which is not always\n", - "possible, particularly in *non-linear* systems) is to go direct to the\n", - "minimum. To do this we need to introduce *linear algebra*. We will\n", - "represent all our errors and functions in the form of linear algebra. As\n", - "we mentioned above, linear algebra is just a shorthand for performing\n", - "lots of multiplications and additions simultaneously. What does it have\n", - "to do with our system then? Well the first thing to note is that the\n", - "linear function we were trying to fit has the following form: $$\n", + "In this section we’re going compute the minimum of the quadratic loss\n", + "with respect to the parameters. When we do this, we’ll also review\n", + "*linear algebra*. We will represent all our errors and functions in the\n", + "form of matrices and vectors.\n", + "\n", + "Linear algebra is just a shorthand for performing lots of\n", + "multiplications and additions simultaneously. What does it have to do\n", + "with our system then? Well, the first thing to note is that the classic\n", + "linear function we fit for a one-dimensional regression has the form: $$\n", "f(x) = mx + c\n", "$$ the classical form for a straight line. From a linear algebraic\n", - "perspective we are looking for multiplications and additions. We are\n", + "perspective, we are looking for multiplications and additions. We are\n", "also looking to separate our parameters from our data. The data is the\n", - "*givens* remember, in French the word is données literally translated\n", - "means *givens* that’s great, because we don’t need to change the data,\n", - "what we need to change are the parameters (or variables) of the model.\n", - "In this function the data comes in through $x$, and the parameters are\n", - "$m$ and $c$.\n", + "*givens*. In French the word is données literally translated means\n", + "*givens* that’s great, because we don’t need to change the data, what we\n", + "need to change are the parameters (or variables) of the model. 
In this\n", + "function the data comes in through $x$, and the parameters are $m$ and\n", + "$c$.\n", "\n", "What we’d like to create is a vector of parameters and a vector of data.\n", "Then we could represent the system with vectors that represent the data,\n", @@ -1264,11 +1221,11 @@ "\n", "We look to turn the multiplications and additions into a linear\n", "algebraic form, we have one multiplication ($m\\times c$) and one\n", - "addition ($mx + c$). But we can turn this into a inner product by\n", + "addition ($mx + c$). But we can turn this into an inner product by\n", "writing it in the following way, $$\n", "f(x) = m \\times x +\n", "c \\times 1,\n", - "$$ in other words we’ve extracted the unit value, from the offset, $c$.\n", + "$$ in other words, we’ve extracted the unit value from the offset, $c$.\n", "We can think of this unit value like an extra item of data, because it\n", "is always given to us, and it is always set to 1 (unlike regular data,\n", "which is likely to vary!). We can therefore write each input data\n", @@ -1279,11 +1236,12 @@ "Now we choose to also turn our parameters into a vector. The parameter\n", "vector will be defined to contain $$\n", "\\mathbf{ w}= \\begin{bmatrix} c \\\\ m\\end{bmatrix}\n", - "$$ because if we now take the inner product between these to vectors we\n", + "$$ because if we now take the inner product between these two vectors we\n", "recover $$\n", "\\mathbf{ x}\\cdot\\mathbf{ w}= 1 \\times c + x \\times m = mx + c\n", "$$ In `numpy` we can define this vector as follows" - ] + ], + "id": "0327c2d2-94f8-42a1-a199-2ee183131cc8" }, { "cell_type": "code", @@ -1292,7 +1250,8 @@ "outputs": [], "source": [ "import numpy as np" - ] + ], + "id": "5e1673aa-692d-4e07-9c20-eae9c3b51889" }, { "cell_type": "code", @@ -1304,7 +1263,8 @@ "w = np.zeros(shape=(2, 1))\n", "w[0] = m\n", "w[1] = c" - ] + ], + "id": "f1daadd6-419f-45a4-8bc1-207f348e7799" }, { "cell_type": "markdown", @@ -1313,10 +1273,10 @@ "This gives us the equivalence between original operation and an\n", "operation in vector space. Whilst the notation here isn’t a lot shorter,\n", "the beauty is that we will be able to add as many features as we like\n", - "and still keep the seame representation. In general, we are now moving\n", - "to a system where each of our predictions is given by an inner product.\n", - "When we want to represent a linear product in linear algebra, we tend to\n", - "do it with the transpose operation, so since we have\n", + "and keep the same representation. In general, we are now moving to a\n", + "system where each of our predictions is given by an inner product. 
When\n", + "we want to represent a linear product in linear algebra, we tend to do\n", + "it with the transpose operation, so since we have\n", "$\\mathbf{a}\\cdot\\mathbf{b} = \\mathbf{a}^\\top\\mathbf{b}$ we can write $$\n", "f(\\mathbf{ x}_i) = \\mathbf{ x}_i^\\top\\mathbf{ w}.\n", "$$ Where we’ve assumed that each data point, $\\mathbf{ x}_i$, is now\n", @@ -1326,19 +1286,19 @@ "x_i\n", "\\end{bmatrix}\n", "$$" - ] + ], + "id": "61776281-a38f-4855-9597-028867ef57ae" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Design Matrix\n", - "=============\n", + "# Design Matrix\n", "\n", "We can do this for the entire data set to form a [*design\n", - "matrix*](http://en.wikipedia.org/wiki/Design_matrix) $\\mathbf{X}$,\n", - "\n", - "$$\\mathbf{X}\n", + "matrix*](http://en.wikipedia.org/wiki/Design_matrix)\n", + "$\\boldsymbol{ \\Phi}$, $$\n", + "\\boldsymbol{ \\Phi}\n", "= \\begin{bmatrix} \n", "\\mathbf{ x}_1^\\top \\\\\\ \n", "\\mathbf{ x}_2^\\top \\\\\\ \n", @@ -1350,10 +1310,10 @@ "\\vdots\n", "& \\vdots \\\\\\\n", "1 & x_n\n", - "\\end{bmatrix},$$\n", - "\n", - "which in `numpy` can be done with the following commands:" - ] + "\\end{bmatrix},\n", + "$$ which in `numpy` can be done with the following commands:" + ], + "id": "cbb0c592-7513-4e10-b42a-7106863f3d04" }, { "cell_type": "code", @@ -1362,7 +1322,8 @@ "outputs": [], "source": [ "import numpy as np" - ] + ], + "id": "da72c5ac-a526-4be8-8077-ff95961615ec" }, { "cell_type": "code", @@ -1370,23 +1331,23 @@ "metadata": {}, "outputs": [], "source": [ - "X = np.hstack((np.ones_like(x), x))\n", - "print(X)" - ] + "Phi = np.hstack((np.ones_like(x), x))\n", + "print(Phi)" + ], + "id": "9dd074c1-7a75-4bae-a159-100bc821aa69" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Writing the Objective with Linear Algebra\n", - "-----------------------------------------\n", + "## Writing the Objective with Linear Algebra\n", "\n", "When we think of the objective function, we can think of it as the\n", "errors where the error is defined in a similar way to what it was in\n", "Legendre’s day $y_i - f(\\mathbf{ x}_i)$, in statistics these errors are\n", "also sometimes called\n", "[*residuals*](http://en.wikipedia.org/wiki/Errors_and_residuals_in_statistics).\n", - "So we can think as the objective and the prediction function as two\n", + "So, we can think as the objective and the prediction function as two\n", "separate parts, first we have, $$\n", "E(\\mathbf{ w}) = \\sum_{i=1}^n(y_i - f(\\mathbf{ x}_i; \\mathbf{ w}))^2,\n", "$$ where we’ve made the function $f(\\cdot)$’s dependence on the\n", @@ -1396,43 +1357,44 @@ "$$ Let’s look again at these two equations and see if we can identify\n", "any inner products. The first equation is a sum of squares, which is\n", "promising. Any sum of squares can be represented by an inner product, $$\n", - "a = \\sum_{i=1}^{k} b^2_i = \\mathbf{b}^\\top\\mathbf{b},\n", - "$$ so if we wish to represent $E(\\mathbf{ w})$ in this way, all we need\n", - "to do is convert the sum operator to an inner product. We can get a\n", - "vector from that sum operator by placing both $y_i$ and\n", + "a = \\sum_{i=1}^{k} b^2_i = \\mathbf{b}^\\top\\mathbf{b}.\n", + "$$ If we wish to represent $E(\\mathbf{ w})$ in this way, all we need to\n", + "do is convert the sum operator to an inner product. 
We can get a vector\n", + "from that sum operator by placing both $y_i$ and\n", "$f(\\mathbf{ x}_i; \\mathbf{ w})$ into vectors, which we do by defining $$\n", "\\mathbf{ y}= \\begin{bmatrix}y_1\\\\ y_2\\\\ \\vdots \\\\ y_n\\end{bmatrix}\n", "$$ and defining $$\n", "\\mathbf{ f}(\\mathbf{ x}_1; \\mathbf{ w}) = \\begin{bmatrix}f(\\mathbf{ x}_1; \\mathbf{ w})\\\\ f(\\mathbf{ x}_2; \\mathbf{ w})\\\\ \\vdots \\\\ f(\\mathbf{ x}_n; \\mathbf{ w})\\end{bmatrix}.\n", - "$$ The second of these is actually a vector-valued function. This term\n", - "may appear intimidating, but the idea is straightforward. A vector\n", - "valued function is simply a vector whose elements are themselves defined\n", - "as *functions*, i.e. it is a vector of functions, rather than a vector\n", - "of scalars. The idea is so straightforward, that we are going to ignore\n", - "it for the moment, and barely use it in the derivation. But it will\n", - "reappear later when we introduce *basis functions*. So we will, for the\n", - "moment, ignore the dependence of $\\mathbf{ f}$ on $\\mathbf{ w}$ and\n", - "$\\mathbf{X}$ and simply summarise it by a vector of numbers $$\n", + "$$ The second of these is a vector-valued function. This term may appear\n", + "intimidating, but the idea is straightforward. A vector valued function\n", + "is simply a vector whose elements are themselves defined as *functions*,\n", + "i.e., it is a vector of functions, rather than a vector of scalars. The\n", + "idea is so straightforward, that we are going to ignore it for the\n", + "moment, and barely use it in the derivation. But it will reappear later\n", + "when we introduce *basis functions*. So, we will for the moment ignore\n", + "the dependence of $\\mathbf{ f}$ on $\\mathbf{ w}$ and\n", + "$\\boldsymbol{ \\Phi}$ and simply summarise it by a vector of numbers $$\n", "\\mathbf{ f}= \\begin{bmatrix}f_1\\\\f_2\\\\\n", "\\vdots \\\\ f_n\\end{bmatrix}.\n", "$$ This allows us to write our objective in the folowing, linear\n", "algebraic form, $$\n", "E(\\mathbf{ w}) = (\\mathbf{ y}- \\mathbf{ f})^\\top(\\mathbf{ y}- \\mathbf{ f})\n", - "$$ from the rules of inner products. But what of our matrix $\\mathbf{X}$\n", - "of input data? At this point, we need to dust off [*matrix-vector\n", + "$$ from the rules of inner products. But what of our matrix\n", + "$\\boldsymbol{ \\Phi}$ of input data? At this point, we need to dust off\n", + "[*matrix-vector\n", "multiplication*](http://en.wikipedia.org/wiki/Matrix_multiplication).\n", "Matrix multiplication is simply a convenient way of performing many\n", - "inner products together, and it’s exactly what we need to summarise the\n", + "inner products together, and it’s exactly what we need to summarize the\n", "operation $$\n", "f_i = \\mathbf{ x}_i^\\top\\mathbf{ w}.\n", "$$ This operation tells us that each element of the vector $\\mathbf{ f}$\n", "(our vector valued function) is given by an inner product between\n", - "$\\mathbf{ x}_i$ and $\\mathbf{ w}$. In other words it is a series of\n", + "$\\mathbf{ x}_i$ and $\\mathbf{ w}$. In other words, it is a series of\n", "inner products. 
Let’s look at the definition of matrix multiplication,\n", "it takes the form $$\n", - "\\mathbf{c} = \\mathbf{B}\\mathbf{a}\n", + "\\mathbf{c} = \\mathbf{B}\\mathbf{a},\n", "$$ where $\\mathbf{c}$ might be a $k$ dimensional vector (which we can\n", - "intepret as a $k\\times 1$ dimensional matrix), and $\\mathbf{B}$ is a\n", + "interpret as a $k\\times 1$ dimensional matrix), and $\\mathbf{B}$ is a\n", "$k\\times k$ dimensional matrix and $\\mathbf{a}$ is a $k$ dimensional\n", "vector ($k\\times 1$ dimensional matrix).\n", "\n", @@ -1447,16 +1409,19 @@ "b_{1, k}a_k\\\\\n", "b_{2, 1}a_1 + b_{2, 2}a_2 + \\dots + b_{2, k}a_k \\\\ \n", "\\vdots\\\\\n", - "b_{k, 1}a_1 + b_{k, 2}a_2 + \\dots + b_{k, k}a_k\\end{bmatrix}\n", - "$$ so we see that each element of the result, $\\mathbf{a}$ is simply the\n", + "b_{k, 1}a_1 + b_{k, 2}a_2 + \\dots + b_{k, k}a_k\\end{bmatrix}.\n", + "$$ We see that each element of the result, $\\mathbf{a}$ is simply the\n", "inner product between each *row* of $\\mathbf{B}$ and the vector\n", "$\\mathbf{c}$. Because we have defined each element of $\\mathbf{ f}$ to\n", "be given by the inner product between each *row* of the design matrix\n", "and the vector $\\mathbf{ w}$ we now can write the full operation in one\n", - "matrix multiplication, $$\n", - "\\mathbf{ f}= \\mathbf{X}\\mathbf{ w}.\n", + "matrix multiplication,\n", + "\n", + "$$\n", + "\\mathbf{ f}= \\boldsymbol{ \\Phi}\\mathbf{ w}.\n", "$$" - ] + ], + "id": "d9185cc2-6ffc-4125-a288-e8962f8871d1" }, { "cell_type": "code", @@ -1465,7 +1430,8 @@ "outputs": [], "source": [ "import numpy as np" - ] + ], + "id": "9a587441-9725-48bf-a737-0fd3f797894a" }, { "cell_type": "code", @@ -1473,8 +1439,9 @@ "metadata": {}, "outputs": [], "source": [ - "f = X@w # The @ sign performs matrix multiplication" - ] + "f = Phi@w # The @ sign performs matrix multiplication" + ], + "id": "ad80ebba-60da-4bed-b0b7-564ec30fc82e" }, { "cell_type": "markdown", @@ -1485,7 +1452,8 @@ "$$ we find we have defined the *model* with two equations. One equation\n", "tells us the form of our predictive function and how it depends on its\n", "parameters, the other tells us the form of our objective function." - ] + ], + "id": "413fc864-1f4f-4dda-9c79-271458e880c7" }, { "cell_type": "code", @@ -1496,49 +1464,25 @@ "resid = (y-f)\n", "E = np.dot(resid.T, resid) # matrix multiplication on a single vector is equivalent to a dot product.\n", "print(\"Error function is:\", E)" - ] + ], + "id": "b9d7f1da-7bf4-4a89-baab-3aa861d4b201" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Exercise 0\n", - "\n", - "The prediction for our movie recommender system had the form $$\n", - "f_{i,j} = \\mathbf{u}_i^\\top \\mathbf{v}_j\n", - "$$ and the objective function was then $$\n", - "E = \\sum_{i,j} s_{i,j}(y_{i,j} - f_{i, j})^2\n", - "$$ Try writing this down in matrix and vector form. How many of the\n", - "terms can you do? For each variable and parameter carefully think about\n", - "whether it should be represented as a matrix or vector. Do as many of\n", - "the terms as you can. Use $\\LaTeX$ to give your answers and give the\n", - "*dimensions* of any matrices you create." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "::: {.cell .markdown}\n", - "\n", - "### Exercise 0 Answer\n", + "# Objective Optimization\n", "\n", - "Write your answer to Exercise 0 here" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Objective Optimisation\n", - "======================\n", + "\\[edit\\]\n", "\n", - "Our *model* has now been defined with two equations, the prediction\n", - "function and the objective function. Next we will use multivariate\n", + "Our *model* has now been defined with two equations: the prediction\n", + "function and the objective function. Now we will use multivariate\n", "calculus to define an *algorithm* to fit the model. The separation\n", "between model and algorithm is important and is often overlooked. Our\n", "model contains a function that shows how it will be used for prediction,\n", - "and a function that describes the objective function we need to optimise\n", + "and a function that describes the objective function we need to optimize\n", "to obtain a good set of parameters.\n", "\n", "The model linear regression model we have described is still the same as\n", @@ -1550,17 +1494,17 @@ "algorithm, it just appears to be a single operation (or function).\n", "However, underneath the computer calls an algorithm to find the\n", "solution. Further, the algorithm we obtain is very widely used, and\n", - "because of this it turns out to be highly optimised.\n", + "because of this it turns out to be highly optimized.\n", "\n", - "Once again we are going to try and find the stationary points of our\n", + "Once again, we are going to try and find the stationary points of our\n", "objective by finding the *stationary points*. However, the stationary\n", - "points of a multivariate function, are a little bit more complext to\n", - "find. Once again we need to find the point at which the derivative is\n", - "zero, but now we need to use *multivariate calculus* to find it. This\n", - "involves learning a few additional rules of differentiation (that allow\n", - "you to do the derivatives of a function with respect to vector), but in\n", - "the end it makes things quite a bit easier. We define vectorial\n", - "derivatives as follows, $$\n", + "points of a multivariate function, are a little bit more complex to\n", + "find. As before we need to find the point at which the gradient is zero,\n", + "but now we need to use *multivariate calculus* to find it. This involves\n", + "learning a few additional rules of differentiation (that allow you to do\n", + "the derivatives of a function with respect to vector), but in the end it\n", + "makes things quite a bit easier. We define vectorial derivatives as\n", + "follows, $$\n", "\\frac{\\text{d}E(\\mathbf{ w})}{\\text{d}\\mathbf{ w}} =\n", "\\begin{bmatrix}\\frac{\\text{d}E(\\mathbf{ w})}{\\text{d}w_1}\\\\\\frac{\\text{d}E(\\mathbf{ w})}{\\text{d}w_2}\\end{bmatrix}.\n", "$$ where $\\frac{\\text{d}E(\\mathbf{ w})}{\\text{d}w_1}$ is the [partial\n", @@ -1569,19 +1513,19 @@ "\n", "Differentiation through multiplications and additions is relatively\n", "straightforward, and since linear algebra is just multiplication and\n", - "addition, then its rules of diffentiation are quite straightforward too,\n", - "but slightly more complex than regular derivatives." - ] + "addition, then its rules of differentiation are quite straightforward\n", + "too, but slightly more complex than regular derivatives." 
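Before stating the differentiation rules, a small illustration may help: for the marathon objective the vector derivative is nothing more than the two scalar partial derivatives stacked into a single vector. The sketch below is an addition; it assumes `x`, `y`, `m` and `c` from the earlier cells and orders the entries to match $\mathbf{w} = [c, m]^\top$.

```python
import numpy as np

# Stack the two scalar partial derivatives of E(m, c) into a vector,
# ordered to match w = [c, m]^T (assumes x, y, m, c from the cells above).
resid = y - m*x - c
dE_dc = -2*resid.sum()          # partial derivative with respect to c
dE_dm = -2*(x*resid).sum()      # partial derivative with respect to m
gradient = np.array([[dE_dc], [dE_dm]])
print(gradient)
```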
+ ], + "id": "6656e63d-e6bd-4d58-b150-d52bad3f84bc" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Multivariate Derivatives\n", - "------------------------\n", + "## Multivariate Derivatives\n", "\n", "We will need two rules of multivariate or *matrix* differentiation. The\n", - "first is diffentiation of an inner product. By remembering that the\n", + "first is differentiation of an inner product. By remembering that the\n", "inner product is made up of multiplication and addition, we can hope\n", "that its derivative is quite straightforward, and so it proves to be. We\n", "can start by thinking about the definition of the inner product, $$\n", @@ -1589,10 +1533,10 @@ "z_i,\n", "$$ which if we were to take the derivative with respect to $z_k$ would\n", "simply return the gradient of the one term in the sum for which the\n", - "derivative was non zero, that of $a_k$, so we know that $$\n", + "derivative was non-zero, that of $a_k$, so we know that $$\n", "\\frac{\\text{d}}{\\text{d}z_k} \\mathbf{a}^\\top \\mathbf{z} = a_k\n", - "$$ and by our definition of multivariate derivatives we can simply stack\n", - "all the partial derivatives of this form in a vector to obtain the\n", + "$$ and by our definition for multivariate derivatives, we can simply\n", + "stack all the partial derivatives of this form in a vector to obtain the\n", "result that $$\n", "\\frac{\\text{d}}{\\text{d}\\mathbf{z}}\n", "\\mathbf{a}^\\top \\mathbf{z} = \\mathbf{a}.\n", @@ -1602,7 +1546,8 @@ "$k \\times k$ *matrix* of coefficients then the matrix quadratic form is\n", "written as $\\mathbf{z}^\\top \\mathbf{C}\\mathbf{z}$, which is itself a\n", "*scalar* quantity, but it is a function of a *vector*." - ] + ], + "id": "be145100-1931-4f54-a0b2-a0ed62028ecc" }, { "cell_type": "markdown", @@ -1610,15 +1555,15 @@ "source": [ "### Matching Dimensions in Matrix Multiplications\n", "\n", - "There’s a trick for telling that it’s a scalar result. When you are\n", - "doing maths with matrices, it’s always worth pausing to perform a quick\n", - "sanity check on the dimensions. Matrix multplication only works when the\n", - "dimensions match. To be precise, the ‘inner’ dimension of the matrix\n", - "must match. What is the inner dimension. If we multiply two matrices\n", - "$\\mathbf{A}$ and $\\mathbf{B}$, the first of which has $k$ rows and\n", - "$\\ell$ columns and the second of which has $p$ rows and $q$ columns,\n", - "then we can check whether the multiplication works by writing the\n", - "dimensionalities next to each other, $$\n", + "There’s a trick for telling a multiplication leads to a scalar result.\n", + "When you are doing mathematics with matrices, it’s always worth pausing\n", + "to perform a quick sanity check on the dimensions. Matrix multplication\n", + "only works when the dimensions match. To be precise, the ‘inner’\n", + "dimension of the matrix must match. What is the inner dimension? 
If we\n", + "multiply two matrices $\\mathbf{A}$ and $\\mathbf{B}$, the first of which\n", + "has $k$ rows and $\\ell$ columns and the second of which has $p$ rows and\n", + "$q$ columns, then we can check whether the multiplication works by\n", + "writing the dimensionalities next to each other, $$\n", "\\mathbf{A} \\mathbf{B} \\rightarrow (k \\times\n", "\\underbrace{\\ell)(p}_\\text{inner dimensions} \\times q) \\rightarrow (k\\times q).\n", "$$ The inner dimensions are the two inside dimensions, $\\ell$ and $p$.\n", @@ -1628,20 +1573,20 @@ "not [*commutative*](http://en.wikipedia.org/wiki/Commutative_property).\n", "And if you change the order of the multiplication, $$\n", "\\mathbf{B} \\mathbf{A} \\rightarrow (\\ell \\times \\underbrace{k)(q}_\\text{inner dimensions} \\times p) \\rightarrow (\\ell \\times p).\n", - "$$ firstly it may no longer even work, because now the condition is that\n", - "$k=q$, and secondly the result could be of a different dimensionality.\n", - "An exception is if the matrices are square matrices (e.g. same number of\n", - "rows as columns) and they are both *symmetric*. A symmetric matrix is\n", - "one for which $\\mathbf{A}=\\mathbf{A}^\\top$, or equivalently,\n", - "$a_{i,j} = a_{j,i}$ for all $i$ and $j$.\n", - "\n", - "You will need to get used to working with matrices and vectors applying\n", - "and developing new machine learning techniques. You should have come\n", + "$$ Firstly, it may no longer even work, because now the condition is\n", + "that $k=q$, and secondly the result could be of a different\n", + "dimensionality. An exception is if the matrices are square matrices\n", + "(e.g., same number of rows as columns) and they are both *symmetric*. A\n", + "symmetric matrix is one for which $\\mathbf{A}=\\mathbf{A}^\\top$, or\n", + "equivalently, $a_{i,j} = a_{j,i}$ for all $i$ and $j$.\n", + "\n", + "For applying and developing machine learning algorithms you should get\n", + "familiar with working with matrices and vectors. You should have come\n", "across them before, but you may not have used them as extensively as we\n", - "will now do in this course. You should get used to using this trick to\n", - "check your work and ensure you know what the dimension of an output\n", - "matrix should be. For our matrix quadratic form, it turns out that we\n", - "can see it as a special type of inner product. $$\n", + "are doing now. It’s worth getting used to using this trick to check your\n", + "work and ensure you know what the dimension of an output matrix should\n", + "be. For our matrix quadratic form, it turns out that we can see it as a\n", + "special type of inner product. $$\n", "\\mathbf{z}^\\top\\mathbf{C}\\mathbf{z} \\rightarrow (1\\times\n", "\\underbrace{k) (k}_\\text{inner dimensions}\\times k) (k\\times 1) \\rightarrow\n", "\\mathbf{b}^\\top\\mathbf{z}\n", @@ -1663,21 +1608,19 @@ "\\frac{\\text{d}}{\\text{d}\\mathbf{z}} \\mathbf{z}^\\top\\mathbf{C}\\mathbf{z}=\n", "2\\mathbf{C}\\mathbf{z}.\n", "$$" - ] + ], + "id": "441440cf-f84e-46d8-8e17-59c50024915e" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "::: {.cell .markdown}\n", - "\n", - "Differentiate the Objective\n", - "---------------------------\n", + "## Differentiate the Objective\n", "\n", "First, we need to compute the full objective by substituting our\n", "prediction function into the objective function to obtain the objective\n", "in terms of $\\mathbf{ w}$. 
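Both rules are easy to sanity-check numerically. The following sketch (an addition, not part of the notebook) compares each rule with a central finite-difference approximation for a random symmetric $\mathbf{C}$.

```python
import numpy as np

# Numerical sanity check of the two rules: d/dz a'z = a and d/dz z'Cz = 2Cz,
# compared against central finite differences for a random symmetric C.
rng = np.random.default_rng(0)
k = 3
a = rng.standard_normal(k)
C = rng.standard_normal((k, k))
C = 0.5 * (C + C.T)                 # symmetrise C
z = rng.standard_normal(k)

def finite_difference_grad(f, z, eps=1e-6):
    grad = np.zeros_like(z)
    for i in range(len(z)):
        z_plus, z_minus = z.copy(), z.copy()
        z_plus[i] += eps
        z_minus[i] -= eps
        grad[i] = (f(z_plus) - f(z_minus)) / (2 * eps)
    return grad

print(np.allclose(finite_difference_grad(lambda z: a @ z, z), a))
print(np.allclose(finite_difference_grad(lambda z: z @ C @ z, z), 2 * C @ z, atol=1e-5))
```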
Doing this we obtain $$\n", - "E(\\mathbf{ w})= (\\mathbf{ y}- \\mathbf{X}\\mathbf{ w})^\\top (\\mathbf{ y}- \\mathbf{X}\\mathbf{ w}).\n", + "E(\\mathbf{ w})= (\\mathbf{ y}- \\boldsymbol{ \\Phi}\\mathbf{ w})^\\top (\\mathbf{ y}- \\boldsymbol{ \\Phi}\\mathbf{ w}).\n", "$$ We now need to differentiate this *quadratic form* to find the\n", "minimum. We differentiate with respect to the *vector* $\\mathbf{ w}$.\n", "But before we do that, we’ll expand the brackets in the quadratic form\n", @@ -1687,25 +1630,28 @@ "(\\mathbf{c} - \\mathbf{d}) = \\mathbf{a}^\\top \\mathbf{c} - \\mathbf{a}^\\top\n", "\\mathbf{d} - \\mathbf{b}^\\top \\mathbf{c} + \\mathbf{b}^\\top \\mathbf{d}\n", "$$ which substituting for $\\mathbf{a} = \\mathbf{c} = \\mathbf{ y}$ and\n", - "$\\mathbf{b}=\\mathbf{d} = \\mathbf{X}\\mathbf{ w}$ gives $$\n", + "$\\mathbf{b}=\\mathbf{d} = \\boldsymbol{ \\Phi}\\mathbf{ w}$ gives $$\n", "E(\\mathbf{ w})=\n", - "\\mathbf{ y}^\\top\\mathbf{ y}- 2\\mathbf{ y}^\\top\\mathbf{X}\\mathbf{ w}+\n", - "\\mathbf{ w}^\\top\\mathbf{X}^\\top\\mathbf{X}\\mathbf{ w}\n", + "\\mathbf{ y}^\\top\\mathbf{ y}- 2\\mathbf{ y}^\\top\\boldsymbol{ \\Phi}\\mathbf{ w}+\n", + "\\mathbf{ w}^\\top\\boldsymbol{ \\Phi}^\\top\\boldsymbol{ \\Phi}\\mathbf{ w}\n", "$$ where we used the fact that\n", - "$\\mathbf{ y}^\\top\\mathbf{X}\\mathbf{ w}=\\mathbf{ w}^\\top\\mathbf{X}^\\top\\mathbf{ y}$.\n", + "$\\mathbf{ y}^\\top\\boldsymbol{ \\Phi}\\mathbf{ w}=\\mathbf{ w}^\\top\\boldsymbol{ \\Phi}^\\top\\mathbf{ y}$.\n", + "\n", "Now we can use our rules of differentiation to compute the derivative of\n", "this form, which is, $$\n", - "\\frac{\\text{d}}{\\text{d}\\mathbf{ w}}E(\\mathbf{ w})=- 2\\mathbf{X}^\\top \\mathbf{ y}+\n", - "2\\mathbf{X}^\\top\\mathbf{X}\\mathbf{ w},\n", - "$$ where we have exploited the fact that $\\mathbf{X}^\\top\\mathbf{X}$ is\n", - "symmetric to obtain this result." - ] + "\\frac{\\text{d}}{\\text{d}\\mathbf{ w}}E(\\mathbf{ w})=- 2\\boldsymbol{ \\Phi}^\\top \\mathbf{ y}+\n", + "2\\boldsymbol{ \\Phi}^\\top\\boldsymbol{ \\Phi}\\mathbf{ w},\n", + "$$ where we have exploited the fact that\n", + "$\\boldsymbol{ \\Phi}^\\top\\boldsymbol{ \\Phi}$ is symmetric to obtain this\n", + "result." + ], + "id": "81f80251-83a0-47da-ae07-fa50c74f55b4" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Exercise 0\n", + "### Exercise 1\n", "\n", "Use the equivalence between our vector and our matrix formulations of\n", "linear regression, alongside our definition of vector derivates, to\n", @@ -1713,144 +1659,90 @@ "$\\frac{\\text{d}E(c, m)}{\\text{d}c}$ and\n", "$\\frac{\\text{d}E(c, m)}{\\text{d}m}$ to those for\n", "$\\frac{\\text{d}E(\\mathbf{ w})}{\\text{d}\\mathbf{ w}}$." - ] + ], + "id": "ef9f728d-ff81-4006-9b13-85756c5e4bcd" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Exercise 0 Answer\n", + "### Exercise 1 Answer\n", "\n", - "Write your answer to Exercise 0 here" - ] + "Write your answer to Exercise 1 here" + ], + "id": "94b33929-d077-4b16-87ba-2793abce0179" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Update Equation for Global Optimum\n", - "==================================\n", + "# Update Equation for Global Optimum\n", "\n", - "Once again, we need to find the minimum of our objective function. Using\n", - "our likelihood for multiple input regression we can now minimize for our\n", - "parameter vector $\\mathbf{ w}$. 
Firstly, just as in the single input\n", - "case, we seek stationary points by find parameter vectors that solve for\n", - "when the gradients are zero, $$\n", - "\\mathbf{0}=- 2\\mathbf{X}^\\top\n", - "\\mathbf{ y}+ 2\\mathbf{X}^\\top\\mathbf{X}\\mathbf{ w},\n", - "$$ where $\\mathbf{0}$ is a *vector* of zeros. Rearranging this equation\n", + "We need to find the minimum of our objective function. Using our\n", + "objective function, we can minimize for our parameter vector\n", + "$\\mathbf{ w}$. Firstly, we seek stationary points by find parameter\n", + "vectors that solve for when the gradients are zero, $$\n", + "\\mathbf{0}=- 2\\boldsymbol{ \\Phi}^\\top\n", + "\\mathbf{ y}+ 2\\boldsymbol{ \\Phi}^\\top\\boldsymbol{ \\Phi}\\mathbf{ w},\n", + "$$ where $\\mathbf{0}$ is a *vector* of zeros. Rearranging this equation,\n", "we find the solution to be $$\n", - "\\mathbf{ w}= \\left[\\mathbf{X}^\\top \\mathbf{X}\\right]^{-1} \\mathbf{X}^\\top\n", + "\\boldsymbol{ \\Phi}^\\top \\boldsymbol{ \\Phi}\\mathbf{ w}= \\boldsymbol{ \\Phi}^\\top\n", "\\mathbf{ y}\n", - "$$ where $\\mathbf{A}^{-1}$ denotes [*matrix\n", - "inverse*](http://en.wikipedia.org/wiki/Invertible_matrix)." - ] + "$$ which is a matrix equation of the familiar form\n", + "$\\mathbf{A}\\mathbf{x} = \\mathbf{b}$." + ], + "id": "88260728-0e30-4cb5-b34c-9c4fb77d79df" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Solving the Multivariate System\n", - "-------------------------------\n", + "## Solving the Multivariate System\n", "\n", - "The solution for $\\mathbf{ w}$ is given in terms of a matrix inverse,\n", - "but computation of a matrix inverse requires, in itself, an algorithm to\n", - "resolve it. You’ll know this if you had to invert, by hand, a\n", - "$3\\times 3$ matrix in high school. From a numerical stability\n", - "perspective, it is also best not to compute the matrix inverse directly,\n", - "but rather to ask the computer to *solve* the system of linear equations\n", - "given by\n", - "$$\\mathbf{X}^\\top\\mathbf{X}\\mathbf{ w}= \\mathbf{X}^\\top\\mathbf{ y}$$ for\n", - "$\\mathbf{ w}$. This can be done in `numpy` using the command" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "np.linalg.solve?" 
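As a quick aside, the following sketch (on a synthetic design matrix, not the notebook's variables) shows the difference between asking `numpy` to solve the normal equations and forming the inverse explicitly; the two agree here, but solving the system is the numerically preferable habit.

```python
import numpy as np

# Synthetic example only: a small design matrix and target vector,
# chosen to illustrate solving Phi^T Phi w = Phi^T y without an inverse.
np.random.seed(0)
Phi = np.random.randn(20, 3)   # 20 observations, 3 basis functions
y = np.random.randn(20, 1)

# Preferred: solve the linear system directly.
w_solve = np.linalg.solve(Phi.T@Phi, Phi.T@y)

# For comparison: the explicit inverse gives the same answer here,
# but is less stable when Phi^T Phi is poorly conditioned.
w_inv = np.linalg.inv(Phi.T@Phi) @ Phi.T @ y

print(np.allclose(w_solve, w_inv))
```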
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "so we can obtain the solution using" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "w = np.linalg.solve(X.T@X, X.T@y)\n", - "print(w)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can map it back to the liner regression and plot the fit as follows" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "m = w[1]; c=w[0]\n", - "f_test = m*x_test + c\n", - "print(m)\n", - "print(c)\n", - "plt.plot(x_test, f_test, 'b-')\n", - "plt.plot(x, y, 'rx')" - ] + "The solution for $\\mathbf{ w}$ can be written mathematically in terms of\n", + "a matrix inverse of $\\boldsymbol{ \\Phi}^\\top\\boldsymbol{ \\Phi}$, but\n", + "computation of a matrix inverse requires an algorithm to resolve it.\n", + "You’ll know this if you had to invert, by hand, a $3\\times 3$ matrix in\n", + "high school. From a numerical stability perspective, it is also best not\n", + "to compute the matrix inverse directly, but rather to ask the computer\n", + "to *solve* the system of linear equations given by $$\n", + "\\boldsymbol{ \\Phi}^\\top\\boldsymbol{ \\Phi}\\mathbf{ w}= \\boldsymbol{ \\Phi}^\\top\\mathbf{ y}\n", + "$$ for $\\mathbf{ w}$." + ], + "id": "3b1b0e18-8684-437b-b676-0a4166b55050" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Multivariate Linear Regression\n", - "------------------------------\n", + "## Multivariate Linear Regression\n", "\n", "A major advantage of the new system is that we can build a linear\n", "regression on a multivariate system. The matrix calculus didn’t specify\n", "what the length of the vector $\\mathbf{ x}$ should be, or equivalently\n", "the size of the design matrix." - ] + ], + "id": "58122b14-5ebb-4499-a20c-3dec5f086f93" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Movie Body Count Data\n", - "---------------------\n", + "## Movie Body Count Data\n", "\n", - "Let’s consider the movie body count data." - ] + "\\[edit\\]\n", + "\n", + "This is a data set created by Simon Garnier and Rany Olson for exploring\n", + "the differences between R and Python for data science. The data contains\n", + "information about different movies augmented by estimates about how many\n", + "on-screen deaths are contained in the movie. The data is craped from\n", + ". The data contains the following\n", + "featuers for each movie: `Year`, `Body_Count`, `MPAA_Rating`, `Genre`,\n", + "`Director`, `Actors`, `Length_Minutes`, `IMDB_Rating`." + ], + "id": "cec02248-83cd-4822-97d8-2946dea433e4" }, { "cell_type": "code", @@ -1859,7 +1751,8 @@ "outputs": [], "source": [ "import pods" - ] + ], + "id": "3051e759-0159-4746-80e4-15dc30486708" }, { "cell_type": "code", @@ -1869,14 +1762,18 @@ "source": [ "data = pods.datasets.movie_body_count()\n", "movies = data['Y']" - ] + ], + "id": "d3163d4d-130d-4acd-982e-efb3ab428803" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let’s remind ourselves of the features we’ve been provided with." - ] + "The data is provided to us in the form of a pandas data frame, we can\n", + "see the features we’re provided with by inspecting the columns of the\n", + "data frame." 
+ ], + "id": "94e545bb-1507-4d12-9c0e-f2b468a6b02c" }, { "cell_type": "code", @@ -1885,26 +1782,26 @@ "outputs": [], "source": [ "print(', '.join(movies.columns))" - ] + ], + "id": "4ff32bdd-ee4b-4112-8c8d-4432c9065fe8" }, { "cell_type": "markdown", "metadata": {}, "source": [ + "## Multivariate Regression on Movie Body Count Data\n", + "\n", + "\\[edit\\]\n", + "\n", "Now we will build a design matrix based on the numeric features: year,\n", - "Body\\_Count, Length\\_Minutes in an effort to predict the rating. We\n", - "build the design matrix as follows:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Relation to Single Input System\n", - "-------------------------------\n", + "Body_Count, Length_Minutes in an effort to predict the rating. We build\n", + "the design matrix as follows:\n", "\n", "Bias as an additional feature." - ] + ], + "id": "27631a77-e51a-4a06-b3d5-e6f1f774cf84" }, { "cell_type": "code", @@ -1913,10 +1810,11 @@ "outputs": [], "source": [ "select_features = ['Year', 'Body_Count', 'Length_Minutes']\n", - "X = movies[select_features]\n", - "X['Eins'] = 1 # add a column for the offset\n", + "Phi = movies[select_features]\n", + "Phi['Eins'] = 1 # add a column for the offset\n", "y = movies[['IMDB_Rating']]" - ] + ], + "id": "15cde525-cba1-46d8-ba24-68042d2a55ae" }, { "cell_type": "markdown", @@ -1925,7 +1823,8 @@ "Now let’s perform a linear regression. But this time, we will create a\n", "pandas data frame for the result so we can store it in a form that we\n", "can visualise easily." - ] + ], + "id": "ff420038-9345-4bcd-a306-12957f0afe67" }, { "cell_type": "code", @@ -1934,7 +1833,8 @@ "outputs": [], "source": [ "import pandas as pd" - ] + ], + "id": "57cc55da-c5bd-41db-bec3-f76073e1d78b" }, { "cell_type": "code", @@ -1942,17 +1842,21 @@ "metadata": {}, "outputs": [], "source": [ - "w = pd.DataFrame(data=np.linalg.solve(X.T@X, X.T@y), # solve linear regression here\n", - " index = X.columns, # columns of X become rows of w\n", - " columns=['regression_coefficient']) # the column of X is the value of regression coefficient" - ] + "w = pd.DataFrame(data=np.linalg.solve(Phi.T@Phi, Phi.T@y), # solve linear regression here\n", + " index = Phi.columns, # columns of Phi become rows of w\n", + " columns=['regression_coefficient']) # the column of Phi is the value of regression coefficient" + ], + "id": "490aa0ac-e0d0-4da7-9107-7e05cd72434b" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can check the residuals to see how good our estimates are" - ] + "We can check the residuals to see how good our estimates are. First we\n", + "create a pandas data frame containing the predictions and use it to\n", + "compute the residuals." + ], + "id": "d532154e-f264-4516-bf5a-9dd874bfa171" }, { "cell_type": "code", @@ -1960,17 +1864,22 @@ "metadata": {}, "outputs": [], "source": [ - "(y - X@w).hist()" - ] + "ypred = pd.DataFrame(data=(Phi@w).values, columns=['IMDB_Rating'])\n", + "resid = y-ypred" + ], + "id": "2b4751c9-e57d-4686-897d-0e1acd9ec745" }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Which shows our model *hasn’t* yet done a great job of representation,\n", - "because the spread of values is large. We can check what the rating is\n", - "dominated by in terms of regression coefficients." 
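To put a number on "the spread of values is large", a rough check (a sketch that assumes the `resid` and `y` data frames from the cells above) is to compare the residual standard deviation with the standard deviation of the ratings themselves; if the two are similar, the model is explaining very little of the variation.

```python
# Sketch: compare the typical prediction error with the spread of the target.
# Assumes `resid` and `y` as computed in the cells above.
print(resid.std())   # standard deviation of the residuals
print(y.std())       # standard deviation of the IMDB ratings
```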
- ] + "import matplotlib.pyplot as plt\n", + "import mlai.plot as plot\n", + "import mlai" + ], + "id": "9c94779f-7aa3-45d7-9ea5-85209f47ccad" }, { "cell_type": "code", @@ -1978,29 +1887,63 @@ "metadata": {}, "outputs": [], "source": [ - "w" - ] + "fig, ax = plt.subplots(figsize=plot.big_wide_figsize)\n", + "resid.hist(ax=ax)\n", + "mlai.write_figure(filename='movie-body-count-rating-residuals.svg', \n", + " directory='./ml')" + ], + "id": "33dcd9a6-7f86-4adf-bba0-5ac7c322390d" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Although we have to be a little careful about interpretation because our\n", - "input values live on different scales, however it looks like we are\n", - "dominated by the bias, with a small negative effect for later films (but\n", + "\n", + "\n", + "Figure: Residual values for the ratings from the prediction of the\n", + "movie rating given the data from the film.\n", + "\n", + "Which shows our model *hasn’t* yet done a great job of representation,\n", + "because the spread of values is large. We can check what the rating is\n", + "dominated by in terms of regression coefficients." + ], + "id": "c7d7847a-6269-40a2-8c3d-cbe74bcb1b39" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "w" + ], + "id": "bdd15839-444f-4e94-9b4c-4cd5c7e15f09" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Although we have to be a little careful about interpretation because our\n", + "input values live on different scales, however it looks like we are\n", + "dominated by the bias, with a small negative effect for later films (but\n", "bear in mind the years are large, so this effect is probably larger than\n", "it looks) and a positive effect for length. So it looks like long\n", "earlier films generally do better, but the residuals are so high that we\n", "probably haven’t modelled the system very well." - ] + ], + "id": "ef8d73a8-e48e-4466-be10-7473cf7f9be2" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Underdetermined System\n", - "======================" - ] + "# Underdetermined System\n", + "\n", + "\\[edit\\]" + ], + "id": "9ed20728-2192-48e1-8407-73a4d915bc11" }, { "cell_type": "code", @@ -2008,8 +1951,9 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "f745ba5e-5b58-4e54-9c6f-74646dc33616" }, { "cell_type": "code", @@ -2018,7 +1962,8 @@ "outputs": [], "source": [ "plot.under_determined_system(diagrams='./ml')" - ] + ], + "id": "08518f5f-827c-4138-b324-eca9f7eca669" }, { "cell_type": "markdown", @@ -2026,7 +1971,7 @@ "source": [ "What about the situation where you have more parameters than data in\n", "your simultaneous equation? This is known as an *underdetermined*\n", - "system. In fact this set up is in some sense *easier* to solve, because\n", + "system. In fact, this set up is in some sense *easier* to solve, because\n", "we don’t need to think about introducing a slack variable (although it\n", "might make a lot of sense from a *modelling* perspective to do so).\n", "\n", @@ -2034,31 +1979,32 @@ "introduce slack variables, $\\epsilon_i$, which needed to be estimated\n", "for each point. The slack variable represented the difference between\n", "our actual prediction and the true observation. This is known as the\n", - "*residual*. By introducing the slack variable we now have an additional\n", + "*residual*. 
By introducing the slack variable, we now have an additional\n", "$n$ variables to estimate, one for each data point, $\\{\\epsilon_i\\}$.\n", - "This actually turns the overdetermined system into an underdetermined\n", - "system. Introduction of $n$ variables, plus the original $m$ and $c$\n", - "gives us $n+2$ parameters to be estimated from $n$ observations, which\n", - "actually makes the system *underdetermined*. However, we then made a\n", - "probabilistic assumption about the slack variables, we assumed that the\n", - "slack variables were distributed according to a probability density. And\n", - "for the moment we have been assuming that density was the Gaussian,\n", + "This turns the overdetermined system into an underdetermined system.\n", + "Introduction of $n$ variables, plus the original $m$ and $c$ gives us\n", + "$n+2$ parameters to be estimated from $n$ observations, which makes the\n", + "system *underdetermined*. However, we then made a probabilistic\n", + "assumption about the slack variables, we assumed that the slack\n", + "variables were distributed according to a probability density. And for\n", + "the moment we have been assuming that density was the Gaussian,\n", "$$\\epsilon_i \\sim \\mathcal{N}\\left(0,\\sigma^2\\right),$$ with zero mean\n", "and variance $\\sigma^2$.\n", "\n", "The follow up question is whether we can do the same thing with the\n", - "parameters. If we have two parameters and only one unknown can we place\n", - "a probability distribution over the parameters, as we did with the slack\n", + "parameters. If we have two parameters and only one unknown, can we place\n", + "a probability distribution over the parameters as we did with the slack\n", "variables? The answer is yes." - ] + ], + "id": "903c59ef-3d29-4158-9da8-edc050347c7b" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Underdetermined System\n", - "----------------------" - ] + "## Underdetermined System" + ], + "id": "055ed30c-4047-48b2-b422-a088570e308d" }, { "cell_type": "code", @@ -2066,9 +2012,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "6989236a-7e2d-4ce3-9b1b-54990c2a1dd0" }, { "cell_type": "code", @@ -2076,27 +2023,32 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('under_determined_system{samp:0>3}.svg', \n", - " directory='./ml', samp=IntSlider(0, 0, 10, 1))" - ] + "nu.display_plots('under_determined_system{samp:0>3}.svg', \n", + " directory='./ml', samp=IntSlider(0, 0, 9, 1))" + ], + "id": "6fe8e747-555f-4fa1-9153-ce8e9faf1c16" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: An underdetermined system can be fit by considering\n", "uncertainty. Multiple solutions are consistent with one specified\n", "point." - ] + ], + "id": "57da1e3b-71be-4b8e-919d-6d36821608f9" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Two Dimensional Gaussian\n", - "------------------------\n", + "## Two Dimensional Gaussian\n", + "\n", + "\\[edit\\]\n", "\n", "Consider the distribution of height (in meters) of an adult male human\n", "population. 
We will approximate the marginal density of heights as a\n", @@ -2109,7 +2061,8 @@ "deviation of $6 kg$ (implying a variance of 36), $$\n", " p(w) \\sim \\mathcal{N}\\left(75,36\\right).\n", " $$" - ] + ], + "id": "30739355-71b0-48e0-b24c-d60372b30fb8" }, { "cell_type": "code", @@ -2117,8 +2070,9 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "483179fe-0b8e-4b79-aae1-2268e98172b2" }, { "cell_type": "code", @@ -2127,23 +2081,24 @@ "outputs": [], "source": [ "plot.height_weight(diagrams='./ml')" - ] + ], + "id": "ab732e7c-75f7-47f9-9348-715ce42432ab" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Gaussian distributions for height and weight." - ] + ], + "id": "360df30c-cf42-4cf4-a549-fca59a1c5d89" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Independence Assumption\n", - "-----------------------\n", + "## Independence Assumption\n", "\n", "First of all, we make an independence assumption, we assume that height\n", "and weight are independent. The definition of probabilistic independence\n", @@ -2152,7 +2107,8 @@ " p(w, h) = p(w)p(h).\n", " $$ Given this assumption we can sample from the joint distribution by\n", "independently sampling weights and heights." - ] + ], + "id": "6cfefbde-6143-40fb-8bfa-24a202d59d67" }, { "cell_type": "code", @@ -2160,8 +2116,9 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "0ef697bf-b1cf-43b7-ac51-47b451bf33ff" }, { "cell_type": "code", @@ -2171,7 +2128,8 @@ "source": [ "plot.independent_height_weight(num_samps=8, \n", " diagrams='./ml')" - ] + ], + "id": "65e3a055-8cad-46db-89e3-51f546cd0969" }, { "cell_type": "code", @@ -2179,9 +2137,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "d10f64a6-742b-414f-bd98-5ffa4fc3ea5e" }, { "cell_type": "code", @@ -2189,16 +2148,27 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('independent_height_weight{fig:0>3}.svg', \n", + "import notutils as nu" + ], + "id": "4b12582a-235f-426b-bb70-7d3d55bd15fa" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nu.display_plots('independent_height_weight{fig:0>3}.svg', \n", " directory='./ml', \n", " fig=IntSlider(0, 0, 7, 1))" - ] + ], + "id": "b29ec920-013c-4533-aa64-5443f9c83891" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Samples from independent Gaussian variables that might\n", "represent heights and weights.\n", @@ -2212,15 +2182,20 @@ "\\text{BMI} = \\frac{w}{h^2}\n", "$$To deal with this dependence we now introduce the notion of\n", "*correlation* to the multivariate Gaussian density." 
- ] + ], + "id": "000e0f54-9fea-4d98-8805-2d88fee8e3e4" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Sampling Two Dimensional Variables\n", - "----------------------------------" - ] + "## Sampling Two Dimensional Variables\n", + "\n", + "\\[edit\\]" + ], + "id": "dbce5533-e31a-4968-9c76-e652681ef434" }, { "cell_type": "code", @@ -2228,8 +2203,9 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "30d9e57b-e601-41ed-9c05-14bc84557b2c" }, { "cell_type": "code", @@ -2239,7 +2215,8 @@ "source": [ "plot.correlated_height_weight(num_samps=8, \n", " diagrams='./ml')" - ] + ], + "id": "b49c3250-3745-4283-bbf3-8b94604596c2" }, { "cell_type": "code", @@ -2247,9 +2224,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "d9f49880-33e3-4641-b77f-8200e6803788" }, { "cell_type": "code", @@ -2257,27 +2235,32 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('correlated_height_weight{fig:0>3}.svg', \n", + "nu.display_plots('correlated_height_weight{fig:0>3}.svg', \n", " directory='./ml', \n", " fig=IntSlider(0, 0, 7, 1))" - ] + ], + "id": "f31aada5-a0b0-4fdd-9bb7-c847f91ae2c5" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Samples from *correlated* Gaussian variables that might\n", "represent heights and weights." - ] + ], + "id": "3d7c463b-6498-4b4d-8b53-5c03d2227689" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Independent Gaussians\n", - "---------------------\n", + "## Independent Gaussians\n", + "\n", + "\\[edit\\]\n", "\n", "$$\n", "p(w, h) = p(w)p(h)\n", @@ -2294,14 +2277,14 @@ "$$\n", "p(\\mathbf{ y}) = \\frac{1}{\\det{2\\pi \\mathbf{D}}^{\\frac{1}{2}}} \\exp\\left(-\\frac{1}{2}(\\mathbf{ y}- \\boldsymbol{ \\mu})^\\top\\mathbf{D}^{-1}(\\mathbf{ y}- \\boldsymbol{ \\mu})\\right)\n", "$$" - ] + ], + "id": "e592a8de-d2d3-41da-af57-3747010debae" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Correlated Gaussian\n", - "-------------------\n", + "## Correlated Gaussian\n", "\n", "Form correlated from original by rotating the data space using matrix\n", "$\\mathbf{R}$.\n", @@ -2325,27 +2308,31 @@ "$$ this gives a covariance matrix: $$\n", "\\mathbf{C}= \\mathbf{R}\\mathbf{D} \\mathbf{R}^\\top\n", "$$" - ] + ], + "id": "a653e114-2fab-4e33-904a-d991d284d2d9" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Basis Functions\n", - "---------------\n", + "## Basis Functions\n", + "\n", + "\\[edit\\]\n", "\n", "Here’s the idea, instead of working directly on the original input\n", "space, $\\mathbf{ x}$, we build models in a new space,\n", "$\\boldsymbol{ \\phi}(\\mathbf{ x})$ where $\\boldsymbol{ \\phi}(\\cdot)$ is a\n", "*vector-valued* function that is defined on the space $\\mathbf{ x}$." - ] + ], + "id": "bdebfc1a-2cdf-4c89-9748-5c8c28c8ee2b" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Quadratic Basis\n", - "---------------\n", + "## Quadratic Basis\n", "\n", "Remember, that a *vector-valued function* is just a vector that contains\n", "functions instead of values. Here’s an example for a one dimensional\n", @@ -2378,7 +2365,8 @@ "\n", "Let’s try constructing such a matrix for a set of inputs. First of all,\n", "we create a function that returns the matrix valued function." 
- ] + ], + "id": "249616e5-70f3-4632-a02c-784a84497bed" }, { "cell_type": "code", @@ -2387,7 +2375,8 @@ "outputs": [], "source": [ "import numpy as np" - ] + ], + "id": "e1000dad-ce2e-41ec-bf4e-17a16494b5b7" }, { "cell_type": "code", @@ -2399,19 +2388,20 @@ " \"\"\"Take in a vector of input values and return the design matrix associated \n", " with the basis functions.\"\"\"\n", " return np.hstack([np.ones((x.shape[0], 1)), x, x**2])" - ] + ], + "id": "1a7242ac-5c69-4887-9554-1fd6de2b6da2" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Functions Derived from Quadratic Basis\n", - "--------------------------------------\n", + "## Functions Derived from Quadratic Basis\n", "\n", "$$\n", "f(x) = {\\color{red}{w_0}} + {\\color{magenta}{w_1 x}} + {\\color{blue}{w_2 x^2}}\n", "$$" - ] + ], + "id": "7b023f1e-4067-47bd-ba8f-814767907249" }, { "cell_type": "code", @@ -2420,8 +2410,9 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "8496c280-0bd8-4ee6-a8b6-590d4eab95e3" }, { "cell_type": "code", @@ -2440,17 +2431,19 @@ "plot.basis(quadratic, x_min=-1.3, x_max=1.3, \n", " fig=f, ax=ax, loc=loc, text=text,\n", " diagrams='./ml')\n" - ] + ], + "id": "17972117-2b60-4d1a-89c3-aa394f7307cc" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: The set of functions which are combined to form a *quadratic*\n", "basis." - ] + ], + "id": "592f7ad2-f8c4-4384-92a0-15489de85980" }, { "cell_type": "code", @@ -2458,9 +2451,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "90262aad-dfab-4bef-8398-5c6a46701a29" }, { "cell_type": "code", @@ -2468,10 +2462,21 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('quadratic_basis{num_basis:0>3}.svg', \n", + "import notutils as nu" + ], + "id": "8aa85ee8-8421-499e-869b-a8d812aad60f" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nu.display_plots('quadratic_basis{num_basis:0>3}.svg', \n", " directory='./ml', \n", " num_basis=IntSlider(0,0,2,1))" - ] + ], + "id": "af15c5b4-c1c0-4b79-9897-b98c60dbc3bd" }, { "cell_type": "markdown", @@ -2480,7 +2485,8 @@ "This function takes in an $n\\times 1$ dimensional vector and returns an\n", "$n\\times 3$ dimensional *design matrix* containing the basis functions.\n", "We can plot those basis functions against there input as follows." - ] + ], + "id": "99092d91-311c-401b-9be1-995f8afe222c" }, { "cell_type": "code", @@ -2502,7 +2508,8 @@ "ax.plot(x[:,0], Phi[:, 2], 'b-', label = '$\\phi=x^2$', linewidth=3)\n", "ax.legend(loc='lower right')\n", "_ = ax.set_title('Quadratic Basis Functions')" - ] + ], + "id": "e9571da9-1ade-408b-8573-aa6eaf953af5" }, { "cell_type": "markdown", @@ -2514,20 +2521,21 @@ "process’, and in this context they form the underlying support for our\n", "prediction function. Our prediction function can only be composed of a\n", "weighted linear sum of our basis functions." - ] + ], + "id": "121679d0-af95-4b0f-8f9d-b0f3c29443bb" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Quadratic Functions\n", - "-------------------\n", + "## Quadratic Functions\n", "\n", - "\n", + "\n", "\n", "Figure: Functions constructed by weighted sum of the components of a\n", "quadratic basis." 
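To make the "weighted linear sum of basis functions" concrete, here is a small sketch that forms one such function from the `quadratic` basis defined above; the weight values are arbitrary choices for illustration.

```python
import numpy as np

# Evaluate f(x) = w_0 + w_1 x + w_2 x^2 as a weighted sum of the columns
# of the quadratic design matrix. The weights are illustrative only.
x = np.linspace(-1.3, 1.3, 5)[:, np.newaxis]
Phi = quadratic(x)                       # columns: 1, x, x**2
w = np.asarray([[0.5], [-1.0], [2.0]])   # w_0, w_1, w_2
f = Phi@w
print(np.hstack([x, f]))
```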
- ] + ], + "id": "61285a7b-043c-4390-94f6-413d241806e3" }, { "cell_type": "code", @@ -2535,9 +2543,20 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "2a6a8843-9be1-43d4-adc4-dcec93a4dbe8" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import notutils as nu" + ], + "id": "34fd435d-7c08-43f6-8fa6-2744bca64985" }, { "cell_type": "code", @@ -2545,17 +2564,21 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('quadratic_function{num_function:0>3}.svg', \n", + "nu.display_plots('quadratic_function{num_function:0>3}.svg', \n", " directory='./ml', \n", " num_function=IntSlider(0,0,2,1))" - ] + ], + "id": "549a2c01-d7f8-4fb6-ad02-5eab60a28b13" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Rectified Linear Units\n", - "----------------------\n", + "## Rectified Linear Units\n", + "\n", + "\\[edit\\]\n", "\n", "The rectified linear unit is a basis function that emerged out of the\n", "deep learning community. Rectified linear units are popular in the\n", @@ -2564,7 +2587,8 @@ "certain threshold. $$\n", "\\phi_j(x) = xH(v_j x+ v_0)\n", "$$" - ] + ], + "id": "50256dd5-faf5-45d0-a0f0-af64149d32ae" }, { "cell_type": "code", @@ -2573,7 +2597,18 @@ "outputs": [], "source": [ "import numpy as np" - ] + ], + "id": "feab5bd5-89ec-40dc-b0fc-7587a86fcb45" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "bc7c6870-efe8-4147-9c45-232ab8088e47" }, { "cell_type": "code", @@ -2581,8 +2616,9 @@ "metadata": {}, "outputs": [], "source": [ - "%load -s relu mlai.py" - ] + "%load -n mlai.relu" + ], + "id": "6c36c241-e2c7-4724-8513-d0451c3e1d5e" }, { "cell_type": "code", @@ -2591,9 +2627,10 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", - "import teaching_plots as plot\n", + "import mlai.plot as plot\n", "import mlai" - ] + ], + "id": "5e708974-3822-4007-9c9b-82c275901521" }, { "cell_type": "code", @@ -2616,17 +2653,19 @@ " fig=f, ax=ax, loc=loc, text=text,\n", " diagrams='./ml',\n", " num_basis=5)" - ] + ], + "id": "ed2b3ce2-1f61-450f-b6f4-75a158c19442" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: The set of functions which are combined to form a rectified\n", "linear unit basis." 
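As a complement to the plots, the following sketch builds a small rectified linear unit design matrix directly from the formula above; it is not the `mlai` implementation, and the thresholds are illustrative choices.

```python
import numpy as np

# Sketch of a ReLU basis: a constant column plus phi_j(x) = x * H(x + c_j),
# where H is the Heaviside step function. The thresholds c_j are illustrative.
def relu_basis_sketch(x, thresholds=(1.0, 0.33, -0.33, -1.0)):
    columns = [np.ones((x.shape[0], 1))]
    for c in thresholds:
        columns.append(x*np.heaviside(x + c, 0.0))
    return np.hstack(columns)

x = np.linspace(-2, 2, 5)[:, np.newaxis]
print(relu_basis_sketch(x))
```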
- ] + ], + "id": "372a998b-9f3c-437e-a818-1350e6120fb9" }, { "cell_type": "code", @@ -2634,9 +2673,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "974f0ed7-b5bd-4069-ac3d-0f69085fb086" }, { "cell_type": "code", @@ -2644,10 +2684,31 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('fourier_basis{num_basis:0>3}.svg', \n", + "import notutils as nu" + ], + "id": "95a00cd1-b978-44a9-bf8d-b9aaf7d00ec8" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nu.display_plots('relu_basis{num_basis:0>3}.svg', \n", " directory='./ml', \n", " num_basis=IntSlider(0,0,4,1))" - ] + ], + "id": "ce21ed6e-ccc5-4b4b-abba-fde4207e6459" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import notutils as nu" + ], + "id": "de8fb220-2e39-4428-96aa-8c9977c81648" }, { "cell_type": "code", @@ -2655,25 +2716,26 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_prediction(basis=mlai.relu, num_basis=5)" - ] + "nu.display_prediction(basis=mlai.relu, num_basis=5)" + ], + "id": "ea44023a-03a2-4434-a4aa-c3926e64c9cc" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Functions Derived from Relu Basis\n", - "---------------------------------\n", + "## Functions Derived from Relu Basis\n", "\n", "$$\n", "f(x) = \\color{red}{w_0} + \\color{magenta}{w_1 xH(x+1.0) } + \\color{blue}{w_2 xH(x+0.33) } + \\color{green}{w_3 xH(x-0.33)} + \\color{cyan}{w_4 xH(x-1.0)}\n", "$$\n", "\n", - "\n", + "\n", "\n", "Figure: A rectified linear unit basis is made up of different\n", "rectified linear unit functions centered at different points." - ] + ], + "id": "c16a171d-5388-40ee-bc6f-52733f786b61" }, { "cell_type": "code", @@ -2681,9 +2743,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "772fd1e0-7970-410d-a308-f3b1028199d6" }, { "cell_type": "code", @@ -2691,17 +2754,27 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('relu_function{func_num:0>3}.svg', \n", + "import notutils as nu" + ], + "id": "7db4cc79-600f-485b-b459-58475f4d763b" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nu.display_plots('relu_function{func_num:0>3}.svg', \n", " directory='./ml', \n", " func_num=IntSlider(0,0,2,1))" - ] + ], + "id": "6c7b1ade-bbc5-4cd6-8fd0-ae853df29cc5" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Gaussian Processes\n", - "------------------\n", + "## Gaussian Processes\n", "\n", "Models where we model the entire joint distribution of our training\n", "data, $p(\\mathbf{ y}, \\mathbf{X})$ are sometimes described as\n", @@ -2729,14 +2802,18 @@ "$$ where the conditioning is on the inputs $\\mathbf{X}$ which are used\n", "for computing the mean and covariance. For this reason they are known as\n", "mean and covariance functions." - ] + ], + "id": "e3d047e8-2976-4ae0-8a76-8ad19cf4703a" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Linear Model Overview\n", - "---------------------\n", + "## Linear Model Overview\n", + "\n", + "\\[edit\\]\n", "\n", "However, we are focussing on what happens in models which are non-linear\n", "in the inputs, whereas the above would be *linear* in the inputs. 
To\n", @@ -2780,14 +2857,14 @@ "k_f\\left(\\mathbf{ x}_i, \\mathbf{ x}_j\\right) = \\alpha \\boldsymbol{ \\phi}\\left(\\mathbf{W}_1, \\mathbf{ x}_i\\right)^\\top \\boldsymbol{ \\phi}\\left(\\mathbf{W}_1, \\mathbf{ x}_j\\right)\n", "$$ so the elements of the covariance or *kernel* matrix are formed by\n", "inner products of the rows of the *design matrix*." - ] + ], + "id": "6a7047fc-f2b6-4bcc-879c-afe26a0d21d8" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Gaussian Process\n", - "----------------\n", + "## Gaussian Process\n", "\n", "This is the essence of a Gaussian process. Instead of making assumptions\n", "about our density over each data point, $y_i$ as i.i.d. we make a joint\n", @@ -2795,14 +2872,14 @@ "function of both the parameters of the activation function,\n", "$\\mathbf{V}$, and the input variables, $\\mathbf{X}$. This comes about\n", "through integrating out the parameters of the model, $\\mathbf{ w}$." - ] + ], + "id": "e099576c-9a9b-4094-8156-13a59199ad60" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Basis Functions\n", - "---------------\n", + "## Basis Functions\n", "\n", "We can basically put anything inside the basis functions, and many\n", "people do. These can be deep kernels (Cho and Saul, 2009) or we can\n", @@ -2810,14 +2887,18 @@ "\n", "Viewing a neural network in this way is also what allows us to beform\n", "sensible *batch* normalizations (Ioffe and Szegedy, 2015)." - ] + ], + "id": "e96df1ea-bd0c-424f-a0bb-84f1ea1eb200" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Radial Basis Functions\n", - "----------------------\n", + "## Radial Basis Functions\n", + "\n", + "\\[edit\\]\n", "\n", "Another type of basis is sometimes known as a ‘radial basis’ because the\n", "effect basis functions are constructed on ‘centres’ and the effect of\n", @@ -2827,7 +2908,8 @@ "$$\n", "\\phi_j(x) = \\exp\\left(-\\frac{(x-\\mu_j)^2}{\\ell^2}\\right)\n", "$$" - ] + ], + "id": "175ed08c-d82f-435d-b667-a935eece7011" }, { "cell_type": "code", @@ -2835,8 +2917,19 @@ "metadata": {}, "outputs": [], "source": [ - "%load -s radial mlai.py" - ] + "import mlai" + ], + "id": "41e0691a-99df-4e49-a28c-34d2ba84e9a7" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load -n mlai.radial" + ], + "id": "7c8504e7-079f-49d1-bb40-b113971ad27f" }, { "cell_type": "code", @@ -2845,9 +2938,10 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", - "import mlai\n", - "import teaching_plots as plot" - ] + "import mlai.plot as plot\n", + "import mlai" + ], + "id": "a0650d05-b893-4a3a-bd68-a463b6daaa0e" }, { "cell_type": "code", @@ -2866,17 +2960,19 @@ "plot.basis(mlai.radial, x_min=-2, x_max=2, \n", " fig=f, ax=ax, loc=loc, text=text,\n", " diagrams='./ml')" - ] + ], + "id": "4db35504-5a63-4ea1-a7c5-7cb579dda244" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: The set of functions which are combined to form the radial\n", "basis." 
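Again as a complement to the `mlai` version, here is a minimal sketch of the radial basis formula above, with centres and lengthscale chosen purely for illustration.

```python
import numpy as np

# Sketch of a radial basis: phi_j(x) = exp(-(x - mu_j)**2 / lengthscale**2).
# Centres and lengthscale are illustrative choices, not mlai defaults.
def radial_basis_sketch(x, centres=(-1.0, 0.0, 1.0), lengthscale=0.5):
    return np.hstack([np.exp(-((x - mu)**2)/lengthscale**2) for mu in centres])

x = np.linspace(-2, 2, 5)[:, np.newaxis]
print(radial_basis_sketch(x))
```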
- ] + ], + "id": "0ff3a905-e830-4ccb-9d8c-9d5bd2409925" }, { "cell_type": "code", @@ -2884,9 +2980,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "0b1f7e4a-2aeb-4b99-afe0-77c73788819c" }, { "cell_type": "code", @@ -2894,10 +2991,11 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('radial_basis{num_basis:0>3}.svg', \n", + "nu.display_plots('radial_basis{num_basis:0>3}.svg', \n", " directory='./ml', \n", " num_basis=IntSlider(0,0,2,1))" - ] + ], + "id": "f5023d54-1bb8-4267-911e-631d990c813b" }, { "cell_type": "code", @@ -2905,25 +3003,26 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_prediction(basis=mlai.radial, num_basis=3)" - ] + "nu.display_prediction(basis=mlai.radial, num_basis=3)" + ], + "id": "82864bcb-4652-411f-b78e-8fa468f4e91f" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Functions Derived from Radial Basis\n", - "-----------------------------------\n", + "## Functions Derived from Radial Basis\n", "\n", "$$\n", "f(x) = \\color{red}{w_1 e^{-2(x+1)^2}} + \\color{magenta}{w_2e^{-2x^2}} + \\color{blue}{w_3 e^{-2(x-1)^2}}\n", "$$\n", "\n", - "\n", + "\n", "\n", "Figure: A radial basis is made up of different locally effective\n", "functions centered at different points." - ] + ], + "id": "305d49ef-0245-43a3-adf7-1fe8c8844c6c" }, { "cell_type": "code", @@ -2931,9 +3030,10 @@ "metadata": {}, "outputs": [], "source": [ - "from ipywidgets import IntSlider\n", - "import pods" - ] + "import notutils as nu\n", + "from ipywidgets import IntSlider" + ], + "id": "c6da9895-2b5b-4ab1-9ece-abd19bc2b665" }, { "cell_type": "code", @@ -2941,17 +3041,21 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('radial_function{func_num:0>3}.svg', \n", + "nu.display_plots('radial_function{func_num:0>3}.svg', \n", " directory='./ml', \n", " func_num=IntSlider(0,0,2,1))" - ] + ], + "id": "793ec318-77af-4850-9625-ba04eab0e086" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Marginal Likelihood\n", - "-------------------\n", + "## Marginal Likelihood\n", + "\n", + "\\[edit\\]\n", "\n", "To understand the Gaussian process we’re going to build on our\n", "understanding of the marginal likelihood for Bayesian regression. In the\n", @@ -2964,19 +3068,20 @@ "of basis function models, where the parameters are sampled from a prior,\n", "but move to thinking about sampling from the marginal likelihood\n", "directly." - ] + ], + "id": "064c5735-ea6e-4da2-97c1-1b636d8a6887" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Sampling from the Prior\n", - "-----------------------\n", + "## Sampling from the Prior\n", "\n", "The first thing we’ll do is to set up the parameters of the model, these\n", "include the parameters of the prior, the parameters of the basis\n", "functions and the noise level." 
- ] + ], + "id": "91794807-3b19-4c4a-9a89-150b11494e2a" }, { "cell_type": "code", @@ -2990,7 +3095,8 @@ "degree = 5\n", "# set the noise variance\n", "sigma2 = 0.01" - ] + ], + "id": "21bdb0f8-2028-47a5-b7c7-ef5c5d999ee3" }, { "cell_type": "markdown", @@ -3001,7 +3107,8 @@ "\n", "Let’s now compute a range of values to make predictions at, spanning the\n", "*new* space of inputs," - ] + ], + "id": "b9d97e0b-7a58-4cd6-8f6f-ce740084934a" }, { "cell_type": "code", @@ -3010,7 +3117,8 @@ "outputs": [], "source": [ "import numpy as np" - ] + ], + "id": "da357676-c3a2-4f8c-bff9-488f34dafb8d" }, { "cell_type": "code", @@ -3021,14 +3129,16 @@ "def polynomial(x, degree, loc, scale):\n", " degrees = np.arange(degree+1)\n", " return ((x-loc)/scale)**degrees" - ] + ], + "id": "5b36fe1a-70e8-4906-8f5c-cc78484013a3" }, { "cell_type": "markdown", "metadata": {}, "source": [ "now let’s build the basis matrices. First we load in the data" - ] + ], + "id": "2b908960-96ae-4b2e-bfb9-d28e44710f63" }, { "cell_type": "code", @@ -3037,7 +3147,8 @@ "outputs": [], "source": [ "import pods" - ] + ], + "id": "b9ea310f-d5fc-49d4-9365-e790675350c8" }, { "cell_type": "code", @@ -3048,7 +3159,8 @@ "data = pods.datasets.olympic_marathon_men()\n", "x = data['X']\n", "y = data['Y']" - ] + ], + "id": "fde53976-318f-4784-a97e-9a5f2e22f52f" }, { "cell_type": "code", @@ -3063,14 +3175,14 @@ "x_pred = np.linspace(1880, 2030, num_pred_data)[:, np.newaxis] # input locations for predictions\n", "Phi_pred = polynomial(x_pred, degree=degree, loc=loc, scale=scale)\n", "Phi = polynomial(x, degree=degree, loc=loc, scale=scale)" - ] + ], + "id": "4a7a2559-0c8d-4fcd-8573-bafce7f04a78" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Weight Space View\n", - "-----------------\n", + "## Weight Space View\n", "\n", "To generate typical functional predictions from the model, we need a set\n", "of model parameters. 
We assume that the parameters are drawn\n", @@ -3083,7 +3195,8 @@ "$\\mathbf{ w}$ using the function `np.random.normal` and combine these\n", "parameters with our basis to create some samples of what\n", "$f(\\mathbf{ x})$ looks like," - ] + ], + "id": "6734e0de-ac1c-4752-ac2d-7ef5c66e4a85" }, { "cell_type": "code", @@ -3092,7 +3205,8 @@ "outputs": [], "source": [ "import matplotlib.pyplot as plt" - ] + ], + "id": "1f6d777b-8d61-40ed-a050-f1a35bc5f671" }, { "cell_type": "code", @@ -3107,14 +3221,14 @@ " w_sample = z_vec*np.sqrt(alpha)\n", " f_sample = Phi_pred@w_sample\n", " plt.plot(x_pred, f_sample)" - ] + ], + "id": "e7309360-f018-4831-983b-94d4ac878dc3" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Function Space View\n", - "-------------------\n", + "## Function Space View\n", "\n", "The process we have used to generate the samples is a two stage process.\n", "To obtain each function, we first generated a sample from the prior, $$\n", @@ -3149,7 +3263,8 @@ "\\mathbf{K}= \\alpha\n", "\\boldsymbol{ \\Phi}\\boldsymbol{ \\Phi}^\\top.\n", "$$" - ] + ], + "id": "f574f765-73bd-4197-9299-21ef469de904" }, { "cell_type": "code", @@ -3158,7 +3273,8 @@ "outputs": [], "source": [ "K = alpha*Phi_pred@Phi_pred.T" - ] + ], + "id": "1550b7e0-4076-4b40-a540-fd5027e58160" }, { "cell_type": "markdown", @@ -3167,7 +3283,8 @@ "Now we can use the `np.random.multivariate_normal` command for sampling\n", "from a multivariate normal with covariance given by $\\mathbf{K}$ and\n", "zero mean," - ] + ], + "id": "56ae9b60-f9c9-40be-9844-430873ad2778" }, { "cell_type": "code", @@ -3181,13 +3298,14 @@ " ax.plot(x_pred.flatten(), f_sample.flatten(), linewidth=2)\n", " \n", "mlai.write_figure('gp-sample-basis-function.svg', directory='./kern')" - ] + ], + "id": "3545028e-5ecb-445f-a37f-33c651acf2e1" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Samples directly from the covariance function implied by the\n", "basis function based covariance,\n", @@ -3199,7 +3317,8 @@ "directly we created the covariance for $\\mathbf{ f}$. We can visualise\n", "the form of this covaraince in an image in python with a colorbar to\n", "show scale." 
- ] + ], + "id": "caad7d07-8cc9-4469-a181-ceb47b8844be" }, { "cell_type": "code", @@ -3207,9 +3326,10 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot\n", + "import mlai.plot as plot\n", "import mlai" - ] + ], + "id": "6e4b8149-ead1-4e3f-8d40-e2b1007db157" }, { "cell_type": "code", @@ -3222,13 +3342,14 @@ "fig.colorbar(im)\n", "\n", "mlai.write_figure('basis-covariance-function.svg', directory='./kern')" - ] + ], + "id": "b57b6f77-9cc9-4f83-8a91-97961a5c5591" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Covariance of the function implied by the basis set\n", "$\\alpha\\boldsymbol{ \\Phi}\\boldsymbol{ \\Phi}^\\top$.\n", @@ -3247,7 +3368,8 @@ "\\mathbf{ y}\\sim \\mathcal{N}\\left(\\mathbf{0},\\boldsymbol{ \\Phi}\\boldsymbol{ \\Phi}^\\top +\\sigma^2\\mathbf{I}\\right).\n", "$$ Sampling directly from this density gives us the noise corrupted\n", "functions," - ] + ], + "id": "4af4bab1-b66b-49dd-ae22-a605bfc5fec7" }, { "cell_type": "code", @@ -3255,8 +3377,9 @@ "metadata": {}, "outputs": [], "source": [ - "fig, ax = plt.subplots(figsize=plot.big_wide_figsize)" - ] + "import mlai" + ], + "id": "f7b9b696-b216-4377-8937-ae59c9065af7" }, { "cell_type": "code", @@ -3264,20 +3387,22 @@ "metadata": {}, "outputs": [], "source": [ + "fig, ax = plt.subplots(figsize=plot.big_wide_figsize)\n", "K = alpha*Phi_pred@Phi_pred.T + sigma2*np.eye(x_pred.size)\n", "for i in range(10):\n", " y_sample = np.random.multivariate_normal(mean=np.zeros(x_pred.size), cov=K)\n", " ax.plot(x_pred.flatten(), y_sample.flatten())\n", " \n", "mlai.write_figure('gp-sample-basis-function-plus-noise.svg', \n", - " './kern')" - ] + " directory='./kern')" + ], + "id": "16560c95-1970-4dd4-894f-3e121a734896" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Samples directly from the covariance function implied by the\n", "noise corrupted basis function based covariance,\n", @@ -3286,7 +3411,8 @@ "where the effect of our noise term is to roughen the sampled functions,\n", "we can also increase the variance of the noise to see a different\n", "effect," - ] + ], + "id": "0c97406b-d72a-41a6-881d-28b72205c739" }, { "cell_type": "code", @@ -3296,7 +3422,8 @@ "source": [ "sigma2 = 1.\n", "K = alpha*Phi_pred@Phi_pred.T + sigma2*np.eye(x_pred.size)" - ] + ], + "id": "6fdd94a4-6c78-4c2e-950c-caceda54ff92" }, { "cell_type": "code", @@ -3310,26 +3437,31 @@ " plt.plot(x_pred.flatten(), y_sample.flatten())\n", " \n", "mlai.write_figure('gp-sample-basis-function-plus-large-noise.svg', \n", - " './kern')" - ] + " directory='./kern')" + ], + "id": "91426d23-74e1-4083-a441-c9f4c17b7c29" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Samples directly from the covariance function implied by the\n", "noise corrupted basis function based covariance,\n", "$\\alpha \\boldsymbol{ \\Phi}\\boldsymbol{ \\Phi}^\\top + \\mathbf{I}$." - ] + ], + "id": "a5d2958b-1174-44cf-8df4-4056fb16fe22" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Non-degenerate Gaussian Processes\n", - "---------------------------------\n", + "## Non-degenerate Gaussian Processes\n", + "\n", + "\\[edit\\]\n", "\n", "The process described above is degenerate. 
The covariance function is of\n", "rank at most $h$ and since the theoretical amount of data could always\n", @@ -3363,7 +3495,7 @@ "\n", "\n", "\n", - "\n", + "\n", "\n", "\n", "\n", @@ -3373,7 +3505,7 @@ "and in considered what would happen if you took the number of hidden\n", "nodes, or neurons, to infinity, i.e. $h\\rightarrow \\infty$.\n", "\n", - "\n", + "\n", "\n", "Figure: Page 37 of [Radford Neal’s 1994\n", "thesis](http://www.cs.toronto.edu/~radford/ftp/thesis.pdf)\n", @@ -3401,14 +3533,14 @@ " \\end{align*}\n", " $$ has finite variance, then the result of taking the number of hidden\n", "units to infinity, with appropriate scaling, is also a Gaussian process." - ] + ], + "id": "9bb7f4c3-99c8-436d-92c1-1634b5ff3f77" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Further Reading\n", - "---------------\n", + "## Further Reading\n", "\n", "To understand this argument in more detail, I highly recommend reading\n", "chapter 2 of Neal’s thesis (Neal, 1994), which remains easy to read and\n", @@ -3419,14 +3551,18 @@ "business of machine learning in the 1990s. Radford and David were also\n", "pioneers in making their software widely available and publishing\n", "material on the web." - ] + ], + "id": "04644992-0802-422d-8f4c-a438d2064cd4" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Gaussian Process\n", - "----------------\n", + "## Gaussian Process\n", + "\n", + "\\[edit\\]\n", "\n", "In our we sampled from the prior over paraemters. Through the properties\n", "of multivariate Gaussian densities this prior over parameters implies a\n", @@ -3447,7 +3583,18 @@ "\\left\\Vert\\mathbf{ x}- \\mathbf{ x}^\\prime\\right\\Vert^2 = (\\mathbf{ x}- \\mathbf{ x}^\\prime)^\\top (\\mathbf{ x}- \\mathbf{ x}^\\prime) \n", "$$ Let’s build a covariance matrix based on this function. First we\n", "define the form of the covariance function," - ] + ], + "id": "c313df03-8bb8-49ae-85d1-979178deed1d" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "522ab10a-940f-48cd-97a6-0108bdfd9bec" }, { "cell_type": "code", @@ -3455,8 +3602,9 @@ "metadata": {}, "outputs": [], "source": [ - "%load -s eq_cov mlai.py" - ] + "%load -n mlai.eq_cov" + ], + "id": "fe079494-9218-423e-98b9-228a621e64a9" }, { "cell_type": "markdown", @@ -3465,7 +3613,8 @@ "We can use this to compute *directly* the covariance for $\\mathbf{ f}$\n", "at the points given by `x_pred`. Let’s define a new function `K()` which\n", "does this," - ] + ], + "id": "92f826fc-0801-4876-a1a3-e96a330a78a3" }, { "cell_type": "code", @@ -3473,15 +3622,27 @@ "metadata": {}, "outputs": [], "source": [ - "%load -s Kernel mlai.py" - ] + "import mlai" + ], + "id": "9c226529-1be7-417b-b0a9-d113eda16333" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load -n mlai.Kernel" + ], + "id": "fbf206f2-915a-4f9c-864e-e3229b6767e1" }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we can image the resulting covariance," - ] + ], + "id": "21e59da4-8b2c-4b23-a861-ce063c081afc" }, { "cell_type": "code", @@ -3491,7 +3652,8 @@ "source": [ "kernel = Kernel(function=eq_cov, variance=1., lengthscale=10.)\n", "K = kernel.K(x_pred, x_pred)" - ] + ], + "id": "36821f69-6b10-4699-be6b-a56e50170de4" }, { "cell_type": "markdown", @@ -3499,7 +3661,8 @@ "source": [ "To visualise the covariance between the points we can use the `imshow`\n", "function in matplotlib." 
- ] + ], + "id": "f23a8b59-1e5a-43aa-af4d-e1ca0c8ad1a3" }, { "cell_type": "code", @@ -3510,14 +3673,16 @@ "fig, ax = plt.subplots(figsize=(8,8))\n", "im = ax.imshow(K, interpolation='none')\n", "fig.colorbar(im)" - ] + ], + "id": "784dc75f-30ce-47ce-855c-bc208b1110c3" }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally, we can sample functions from the marginal likelihood." - ] + ], + "id": "cf2fbd3c-749c-47e3-bd69-3234fcd83928" }, { "cell_type": "code", @@ -3525,31 +3690,34 @@ "metadata": {}, "outputs": [], "source": [ - "fig, ax = plt.subplots(figsize(8, 5))\n", + "fig, ax = plt.subplots(figsize=(8, 5))\n", "for i in range(10):\n", " y_sample = np.random.multivariate_normal(mean=np.zeros(x_pred.size), cov=K)\n", " ax.plot(x_pred.flatten(), y_sample.flatten())" - ] + ], + "id": "237c14ac-8db6-43c1-aba9-f6d0bdb96cbd" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Exercise 1\n", + "### Exercise 2\n", "\n", "**Moving Parameters** Have a play with the parameters for this\n", "covariance function (the lengthscale and the variance) and see what\n", "effects the parameters have on the types of functions you observe." - ] + ], + "id": "4d027376-4790-4411-acff-b9a5376b7816" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Exercise 1 Answer\n", + "### Exercise 2 Answer\n", "\n", - "Write your answer to Exercise 1 here" - ] + "Write your answer to Exercise 2 here" + ], + "id": "ee0a1e09-c608-4803-ad59-74f947169058" }, { "cell_type": "code", @@ -3559,29 +3727,32 @@ "source": [ "# Use this box for any code you need\n", "\n" - ] + ], + "id": "0ffde6e8-e4eb-47d0-9a5c-0860a9e69c6d" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Bayesian Inference by Rejection Sampling\n", - "----------------------------------------\n", + "## Bayesian Inference by Rejection Sampling\n", + "\n", + "\\[edit\\]\n", "\n", "One view of Bayesian inference is to assume we are given a mechanism for\n", - "generating samples, where we assume that mechanism is representing on\n", + "generating samples, where we assume that mechanism is representing an\n", "accurate view on the way we believe the world works.\n", "\n", "This mechanism is known as our *prior* belief.\n", "\n", "We combine our prior belief with our observations of the real world by\n", - "discarding all those samples that are inconsistent with our prior. The\n", - "*likelihood* defines mathematically what we mean by inconsistent with\n", - "the prior. The higher the noise level in the likelihood, the looser the\n", - "notion of consistent.\n", + "discarding all those prior samples that are inconsistent with our\n", + "observations. The *likelihood* defines mathematically what we mean by\n", + "inconsistent with the observations. The higher the noise level in the\n", + "likelihood, the looser the notion of consistent.\n", "\n", - "The samples that remain are considered to be samples from the\n", - "*posterior*.\n", + "The samples that remain are samples from the *posterior*.\n", "\n", "This approach to Bayesian inference is closely related to two sampling\n", "techniques known as *rejection sampling* and *importance sampling*. 
It\n", @@ -3589,8 +3760,8 @@ "computation* (ABC) or likelihood-free inference.\n", "\n", "In practice, the algorithm is often too slow to be practical, because\n", - "most samples will be inconsistent with the data and as a result the\n", - "mechanism has to be operated many times to obtain a few posterior\n", + "most samples will be inconsistent with the observations and as a result\n", + "the mechanism must be operated many times to obtain a few posterior\n", "samples.\n", "\n", "However, in the Gaussian process case, when the likelihood also assumes\n", @@ -3598,9 +3769,85 @@ "the posterior density *analytically*. This is the benefit of Gaussian\n", "processes.\n", "\n", - "First we will load in two python functions for computing the covariance\n", + "First, we will load in two python functions for computing the covariance\n", "function." - ] + ], + "id": "d6ea6441-8154-4ee7-b020-47c4630dbe52" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "60ebcb7c-5b74-42a3-baeb-891a8dd11f11" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load -n mlai.Kernel" + ], + "id": "bd19b06f-8839-4fde-80e0-f29971d40bff" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# %load -n mlai.Kernel\n", + "class Kernel():\n", + " \"\"\"Covariance function\n", + " :param function: covariance function\n", + " :type function: function\n", + " :param name: name of covariance function\n", + " :type name: string\n", + " :param shortname: abbreviated name of covariance function\n", + " :type shortname: string\n", + " :param formula: latex formula of covariance function\n", + " :type formula: string\n", + " :param function: covariance function\n", + " :type function: function\n", + " :param \\**kwargs:\n", + " See below\n", + "\n", + " :Keyword Arguments:\n", + " * \"\"\"\n", + "\n", + " def __init__(self, function, name=None, shortname=None, formula=None, **kwargs): \n", + " self.function=function\n", + " self.formula = formula\n", + " self.name = name\n", + " self.shortname = shortname\n", + " self.parameters=kwargs\n", + " \n", + " def K(self, X, X2=None):\n", + " \"\"\"Compute the full covariance function given a kernel function for two data points.\"\"\"\n", + " if X2 is None:\n", + " X2 = X\n", + " K = np.zeros((X.shape[0], X2.shape[0]))\n", + " for i in np.arange(X.shape[0]):\n", + " for j in np.arange(X2.shape[0]):\n", + " K[i, j] = self.function(X[i, :], X2[j, :], **self.parameters)\n", + "\n", + " return K\n", + "\n", + " def diag(self, X):\n", + " \"\"\"Compute the diagonal of the covariance function\"\"\"\n", + " diagK = np.zeros((X.shape[0], 1))\n", + " for i in range(X.shape[0]): \n", + " diagK[i] = self.function(X[i, :], X[i, :], **self.parameters)\n", + " return diagK\n", + "\n", + " def _repr_html_(self):\n", + " raise NotImplementedError" + ], + "id": "e1051008-92ac-4d12-99c9-f78d12ae0045" }, { "cell_type": "code", @@ -3608,8 +3855,9 @@ "metadata": {}, "outputs": [], "source": [ - "%load -s Kernel mlai.py" - ] + "import mlai" + ], + "id": "cff9d5fd-888d-458f-8b36-8b9cc5ce5020" }, { "cell_type": "code", @@ -3617,8 +3865,23 @@ "metadata": {}, "outputs": [], "source": [ - "%load -s eq_cov mlai.py" - ] + "%load -n mlai.eq_cov" + ], + "id": "6ca4aa43-ba18-46a3-aa24-7aeb889ace4e" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# %load -n 
mlai.eq_cov\n", + "def eq_cov(x, x_prime, variance=1., lengthscale=1.):\n", + " \"\"\"Exponentiated quadratic covariance function.\"\"\"\n", + " diffx = x - x_prime\n", + " return variance*np.exp(-0.5*np.dot(diffx, diffx)/lengthscale**2)" + ], + "id": "66122e8c-2e72-4acf-b408-4894073e345e" }, { "cell_type": "code", @@ -3630,15 +3893,17 @@ " name='Exponentiated Quadratic',\n", " shortname='eq', \n", " lengthscale=0.25)" - ] + ], + "id": "86159041-4311-4626-884e-b2f4fc70c241" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Next we sample from a multivariate normal density (a multivariate\n", + "Next, we sample from a multivariate normal density (a multivariate\n", "Gaussian), using the covariance function as the covariance matrix." - ] + ], + "id": "d2626cb4-ca61-4c62-8430-06ce49536dcb" }, { "cell_type": "code", @@ -3648,8 +3913,9 @@ "source": [ "import numpy as np\n", "np.random.seed(10)\n", - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "c8593253-cbe1-4d3e-9a16-7a721b464e33" }, { "cell_type": "code", @@ -3659,7 +3925,8 @@ "source": [ "plot.rejection_samples(kernel=kernel, \n", " diagrams='./gp')" - ] + ], + "id": "5ef941ef-3c1c-4ba8-88dd-2e8a7485aeb9" }, { "cell_type": "code", @@ -3667,9 +3934,10 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "a6242abe-307d-42bc-886e-77fbd8dcd4c1" }, { "cell_type": "code", @@ -3677,18 +3945,19 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('gp_rejection_sample{sample:0>3}.png', \n", - " directory='./gp', \n", - " sample=IntSlider(1,1,5,1))" - ] + "nu.display_plots('gp_rejection_sample{sample:0>3}.png', \n", + " directory='./gp', \n", + " sample=IntSlider(1,1,5,1))" + ], + "id": "8d6ce516-d6ac-43b3-8478-f5427458267e" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", "\n", "Figure: One view of Bayesian inference is we have a machine for\n", "generating samples (the *prior*), and we discard all samples\n", @@ -3696,14 +3965,14 @@ "*posterior*). This is a rejection sampling view of Bayesian inference.\n", "The Gaussian process allows us to do this analytically by multiplying\n", "the *prior* by the *likelihood*." - ] + ], + "id": "5008b936-0c06-4a1d-8b3a-d8a286e18eae" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Gaussian Process\n", - "----------------\n", + "## Gaussian Process\n", "\n", "The Gaussian process perspective takes the marginal likelihood of the\n", "data to be a joint Gaussian density with a covariance given by\n", @@ -3724,8 +3993,306 @@ "$$ where the *parameters* of the model are also embedded in the\n", "covariance function, they include the parameters of the kernel (such as\n", "lengthscale and variance), and the noise variance, $\\sigma^2$. Let’s\n", - "create a class in python for storing these variables." - ] + "create a set of classes in python for storing these variables." 
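For readers without the `mlai` classes to hand, here is a deliberately minimal stand-in for the kind of container being loaded: it only stores the data, the kernel and the noise variance, and evaluates the Gaussian marginal likelihood with covariance $\mathbf{K} + \sigma^2 \mathbf{I}$. It is a sketch, not the `mlai.GP` class that the notebook actually uses.

```python
import numpy as np

# Minimal stand-in (not mlai.GP): store data, kernel and noise variance,
# and evaluate the log marginal likelihood of y ~ N(0, K + sigma^2 I).
class SimpleGP:
    def __init__(self, X, y, kernel, sigma2):
        self.X = X
        self.y = y
        self.kernel = kernel
        self.sigma2 = sigma2
        self.K = kernel.K(X, X) + sigma2*np.eye(X.shape[0])

    def log_likelihood(self):
        n = self.y.shape[0]
        sign, logdet = np.linalg.slogdet(self.K)          # log determinant term
        quad = (self.y.T @ np.linalg.solve(self.K, self.y)).item()  # quadratic term
        return -0.5*(n*np.log(2*np.pi) + logdet + quad)
```

With the `Kernel` and `eq_cov` definitions above this could be instantiated as, for example, `SimpleGP(x, y, kernel, sigma2)`; the notebook itself continues with the `mlai` classes.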
+ ], + "id": "2e8caddd-f6ba-45cf-b66f-ede002918ebc" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "166b2d4a-2a12-4c64-82f8-bf8af60fb036" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load -n mlai.Model" + ], + "id": "7a8cff25-ffdf-4615-bd38-c22a5a1caf89" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "c3dcbe99-5b9b-4426-ad51-38c3367d48f0" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load -n mlai.MapModel" + ], + "id": "fa593473-6261-4b35-a7fb-f2ce796fda03" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "c02b16e4-8b5a-4339-a22d-836ef1fff404" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load -n mlai.ProbModel" + ], + "id": "76575cb6-80bc-4ccb-9ec5-79e8ca909125" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "950fdcc4-18b5-4c2f-8375-56ee64b82f39" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load -n mlai.ProbMapModel" + ], + "id": "6b60a783-bcae-436c-9e26-c9b3f1342884" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "12f0a510-a3a3-4799-98b6-82b6654f1009" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load -n mlai.GP" + ], + "id": "1b3490e4-ba5b-4a8a-81c9-d6d276db9699" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Making Predictions\n", + "\n", + "We now have a probability density that represents functions. How do we\n", + "make predictions with this density? The density is known as a process\n", + "because it is *consistent*. By consistency, here, we mean that the model\n", + "makes predictions for $\\mathbf{ f}$ that are unaffected by future values\n", + "of $\\mathbf{ f}^*$ that are currently unobserved (such as test points).\n", + "If we think of $\\mathbf{ f}^*$ as test points, we can still write down a\n", + "joint probability density over the training observations, $\\mathbf{ f}$\n", + "and the test observations, $\\mathbf{ f}^*$. This joint probability\n", + "density will be Gaussian, with a covariance matrix given by our\n", + "covariance function, $k(\\mathbf{ x}_i, \\mathbf{ x}_j)$. $$\n", + "\\begin{bmatrix}\\mathbf{ f}\\\\ \\mathbf{ f}^*\\end{bmatrix} \\sim \\mathcal{N}\\left(\\mathbf{0},\\begin{bmatrix} \\mathbf{K}& \\mathbf{K}_\\ast \\\\\n", + "\\mathbf{K}_\\ast^\\top & \\mathbf{K}_{\\ast,\\ast}\\end{bmatrix}\\right)\n", + "$$ where here $\\mathbf{K}$ is the covariance computed between all the\n", + "training points, $\\mathbf{K}_\\ast$ is the covariance matrix computed\n", + "between the training points and the test points and\n", + "$\\mathbf{K}_{\\ast,\\ast}$ is the covariance matrix computed betwen all\n", + "the tests points and themselves. To be clear, let’s compute these now\n", + "for our example, using `x` and `y` for the training data (although `y`\n", + "doesn’t enter the covariance) and `x_pred` as the test locations." 
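Once the covariance blocks are computed (as in the cell that follows), the posterior over the test values follows from standard Gaussian conditioning. A minimal sketch, assuming `K`, `K_star`, `K_starstar`, `y` and `sigma2` are defined as in the neighbouring cells; this is an illustration, not the `mlai.GP` implementation itself.

```python
import numpy as np

def gp_posterior(K, K_star, K_starstar, y, sigma2):
    """Posterior mean and covariance of f* given noisy observations y.

    Assumes K = k(x, x), K_star = k(x, x_pred) and
    K_starstar = k(x_pred, x_pred), matching the shapes used nearby.
    """
    A = K + sigma2 * np.eye(K.shape[0])   # covariance of the noisy observations
    alpha = np.linalg.solve(A, y)         # (K + sigma^2 I)^{-1} y, no explicit inverse
    mean = K_star.T @ alpha               # posterior mean at the test locations
    cov = K_starstar - K_star.T @ np.linalg.solve(A, K_star)
    return mean, cov

# Example usage (variable names assumed from the notebook):
# mu_pred, C_pred = gp_posterior(K, K_star, K_starstar, y, sigma2)
```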
+ ], + "id": "1f85d452-7ddf-44fa-8afd-a9b0b770dafd" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# set covariance function parameters\n", + "variance = 16.0\n", + "lengthscale = 8\n", + "# set noise variance\n", + "sigma2 = 0.05\n", + "\n", + "kernel = Kernel(eq_cov, variance=variance, lengthscale=lengthscale)\n", + "K = kernel.K(x, x)\n", + "K_star = kernel.K(x, x_pred)\n", + "K_starstar = kernel.K(x_pred, x_pred)" + ], + "id": "5a2c1b18-d5b5-48b0-b7c6-a9a5f798c3dd" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we use this structure to visualise the covariance between test data\n", + "and training data. This structure is how information is passed between\n", + "test and training data. Unlike the maximum likelihood formalisms we’ve\n", + "been considering so far, the structure expresses *correlation* between\n", + "our different data points. However, just like the we now have a *joint\n", + "density* between some variables of interest. In particular we have the\n", + "joint density over $p(\\mathbf{ f}, \\mathbf{ f}^*)$. The joint density is\n", + "*Gaussian* and *zero mean*. It is specified entirely by the *covariance\n", + "matrix*, $\\mathbf{K}$. That covariance matrix is, in turn, defined by a\n", + "covariance function. Now we will visualise the form of that covariance\n", + "in the form of the matrix, $$\n", + "\\begin{bmatrix} \\mathbf{K}& \\mathbf{K}_\\ast \\\\ \\mathbf{K}_\\ast^\\top\n", + "& \\mathbf{K}_{\\ast,\\ast}\\end{bmatrix}\n", + "$$" + ], + "id": "b6a4cd35-bac1-4488-8a10-eb9067e48774" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "dd96c366-8765-4944-99a7-0f7e7148bf57" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(figsize=(8,8))\n", + "im = ax.imshow(np.vstack([np.hstack([K, K_star]), np.hstack([K_star.T, K_starstar])]), interpolation='none')\n", + "# Add lines for separating training and test data\n", + "ax.axvline(x.shape[0]-1, color='w')\n", + "ax.axhline(x.shape[0]-1, color='w')\n", + "fig.colorbar(im)\n", + "\n", + "mlai.write_figure('block-predictive-covariance.svg', diagrams='./gp')" + ], + "id": "bf767349-49ba-4fbb-9d15-03926329f2dc" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "Figure: Different blocks of the covariance function. The upper left\n", + "block is the covariance of the training data with itself, $\\mathbf{K}$.\n", + "The top right is the cross covariance between training data (rows) and\n", + "prediction locations (columns). The lower left is the same matrix\n", + "transposed. The bottom right is the covariance matrix of the test data\n", + "with itself.\n", + "\n", + "There are four blocks to this plot. The upper left block is the\n", + "covariance of the training data with itself, $\\mathbf{K}$. We see some\n", + "structure here due to the missing data from the first and second world\n", + "wars. Alongside this covariance (to the right and below) we see the\n", + "cross covariance between the training and the test data ($\\mathbf{K}_*$\n", + "and $\\mathbf{K}_*^\\top$). This is giving us the covariation between our\n", + "training and our test data. Finally the lower right block The banded\n", + "structure we now observe is because some of the training points are near\n", + "to some of the test points. 
This is how we obtain ‘communication’\n", + "between our training data and our test data. If there is no structure in\n", + "$\\mathbf{K}_*$ then our belief about the test data simply matches our\n", + "prior." + ], + "id": "497b74dd-9fe5-4f85-9ffa-23d222f51b9b" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prediction Across Two Points with GPs\n", + "\n", + "\\[edit\\]" + ], + "id": "d08a1884-8687-490c-b743-e78643295cfc" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "np.random.seed(4949)" + ], + "id": "95d87cfd-1a20-4cbf-9a52-db92c364a6a5" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai.plot as plot\n", + "import pods" + ], + "id": "daec981c-de8a-42f3-ad68-d3f19f376958" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "np.random.seed(4949)" + ], + "id": "5e3116cf-aa55-47bc-a545-e384553afac7" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sampling a Function\n", + "\n", + "\\[edit\\]\n", + "\n", + "We will consider a Gaussian distribution with a particular structure of\n", + "covariance matrix. We will generate *one* sample from a 25-dimensional\n", + "Gaussian density. $$\n", + "\\mathbf{ f}=\\left[f_{1},f_{2}\\dots f_{25}\\right].\n", + "$$ in the figure below we plot these data on the $y$-axis against their\n", + "*indices* on the $x$-axis." + ], + "id": "42398f0a-6904-4e4b-a333-7dca89305fb6" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "d2fc5f51-5998-4d3f-9aa1-7d4cc3481acf" }, { "cell_type": "code", @@ -3733,36 +4300,19 @@ "metadata": {}, "outputs": [], "source": [ - "%load -s GP mlai.py" - ] + "%load -n mlai.Kernel" + ], + "id": "0b421395-4eb7-41ea-9f56-fa9e83c4d1da" }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Making Predictions\n", - "------------------\n", - "\n", - "We now have a probability density that represents functions. How do we\n", - "make predictions with this density? The density is known as a process\n", - "because it is *consistent*. By consistency, here, we mean that the model\n", - "makes predictions for $\\mathbf{ f}$ that are unaffected by future values\n", - "of $\\mathbf{ f}^*$ that are currently unobserved (such as test points).\n", - "If we think of $\\mathbf{ f}^*$ as test points, we can still write down a\n", - "joint probability density over the training observations, $\\mathbf{ f}$\n", - "and the test observations, $\\mathbf{ f}^*$. This joint probability\n", - "density will be Gaussian, with a covariance matrix given by our\n", - "covariance function, $k(\\mathbf{ x}_i, \\mathbf{ x}_j)$. $$\n", - "\\begin{bmatrix}\\mathbf{ f}\\\\ \\mathbf{ f}^*\\end{bmatrix} \\sim \\mathcal{N}\\left(\\mathbf{0},\\begin{bmatrix} \\mathbf{K}& \\mathbf{K}_\\ast \\\\\n", - "\\mathbf{K}_\\ast^\\top & \\mathbf{K}_{\\ast,\\ast}\\end{bmatrix}\\right)\n", - "$$ where here $\\mathbf{K}$ is the covariance computed between all the\n", - "training points, $\\mathbf{K}_\\ast$ is the covariance matrix computed\n", - "between the training points and the test points and\n", - "$\\mathbf{K}_{\\ast,\\ast}$ is the covariance matrix computed betwen all\n", - "the tests points and themselves. 
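A self-contained numpy version of that 25-dimensional sample is sketched below; the exponentiated quadratic covariance over the index, and the lengthscale of 5, are assumptions made here for illustration.

```python
import numpy as np
import matplotlib.pyplot as plt

# Covariance between f_i and f_j decays with the distance between the indices.
n = 25
index = np.arange(1, n + 1)
lengthscale = 5.0
K = np.exp(-0.5 * (index[:, None] - index[None, :])**2 / lengthscale**2)

# One sample from the 25-dimensional Gaussian N(0, K).
rng = np.random.default_rng(4949)
f = rng.multivariate_normal(np.zeros(n), K + 1e-8 * np.eye(n))

plt.plot(index, f, 'o-')
plt.xlabel('index $i$')
plt.ylabel('$f_i$')
plt.title('One sample of a 25-dimensional correlated Gaussian')
plt.show()
```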
To be clear, let’s compute these now\n", - "for our example, using `x` and `y` for the training data (although `y`\n", - "doesn’t enter the covariance) and `x_pred` as the test locations." - ] + "import mlai" + ], + "id": "830642ba-a19b-4f4e-90b0-8525653b6460" }, { "cell_type": "code", @@ -3770,37 +4320,19 @@ "metadata": {}, "outputs": [], "source": [ - "# set covariance function parameters\n", - "variance = 16.0\n", - "lengthscale = 8\n", - "# set noise variance\n", - "sigma2 = 0.05\n", - "\n", - "kernel = Kernel(eq_cov, variance=variance, lengthscale=lengthscale)\n", - "K = kernel.K(x, x)\n", - "K_star = kernel.K(x, x_pred)\n", - "K_starstar = kernel.K(x_pred, x_pred)" - ] + "%load -n mlai.polynomial_cov" + ], + "id": "c5b4f961-0fcf-4308-8c68-e8f3c0d6a837" }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Now we use this structure to visualise the covariance between test data\n", - "and training data. This structure is how information is passed between\n", - "test and training data. Unlike the maximum likelihood formalisms we’ve\n", - "been considering so far, the structure expresses *correlation* between\n", - "our different data points. However, just like the we now have a *joint\n", - "density* between some variables of interest. In particular we have the\n", - "joint density over $p(\\mathbf{ f}, \\mathbf{ f}^*)$. The joint density is\n", - "*Gaussian* and *zero mean*. It is specified entirely by the *covariance\n", - "matrix*, $\\mathbf{K}$. That covariance matrix is, in turn, defined by a\n", - "covariance function. Now we will visualise the form of that covariance\n", - "in the form of the matrix, $$\n", - "\\begin{bmatrix} \\mathbf{K}& \\mathbf{K}_\\ast \\\\ \\mathbf{K}_\\ast^\\top\n", - "& \\mathbf{K}_{\\ast,\\ast}\\end{bmatrix}\n", - "$$" - ] + "import mlai" + ], + "id": "623aef44-3e02-44cc-9835-3f60f24983e9" }, { "cell_type": "code", @@ -3808,39 +4340,31 @@ "metadata": {}, "outputs": [], "source": [ - "fig, ax = plt.subplots(figsize=(8,8))\n", - "im = ax.imshow(np.vstack([np.hstack([K, K_star]), np.hstack([K_star.T, K_starstar])]), interpolation='none')\n", - "# Add lines for separating training and test data\n", - "ax.axvline(x.shape[0]-1, color='w')\n", - "ax.axhline(x.shape[0]-1, color='w')\n", - "fig.colorbar(im)" - ] + "%load -n mlai.exponentiated_quadratic" + ], + "id": "835a8656-454f-4f86-b2ba-35c594caf813" }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "There are four blocks to this color plot. The upper left block is the\n", - "covariance of the training data with itself, $\\mathbf{K}$. We see some\n", - "structure here due to the missing data from the first and second world\n", - "wars. Alongside this covariance (to the right and below) we see the\n", - "cross covariance between the training and the test data ($\\mathbf{K}_*$\n", - "and $\\mathbf{K}_*^\\top$). This is giving us the covariation between our\n", - "training and our test data. Finally the lower right block The banded\n", - "structure we now observe is because some of the training points are near\n", - "to some of the test points. This is how we obtain ‘communication’\n", - "between our training data and our test data. If there is no structure in\n", - "$\\mathbf{K}_*$ then our belief about the test data simply matches our\n", - "prior." 
- ] + "import mlai.plot as plot\n", + "from mlai import Kernel, exponentiated_quadratic" + ], + "id": "c24a4c7f-b1ac-4448-b61b-1635a57b64a9" }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Prediction Across Two Points with GPs\n", - "-------------------------------------" - ] + "kernel=Kernel(function=exponentiated_quadratic, lengthscale=0.5)\n", + "plot.two_point_sample(kernel.K, diagrams='./gp')" + ], + "id": "f28381e5-fb7d-4d50-8a57-a5dd1d48cf22" }, { "cell_type": "code", @@ -3848,9 +4372,10 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", - "np.random.seed(4949)" - ] + "import notutils as nu\n", + "from ipywidgets import IntSlider" + ], + "id": "b62ab1e2-cadf-4dc8-88c3-1c86b935cff0" }, { "cell_type": "code", @@ -3858,16 +4383,42 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot\n", - "import pods" - ] + "import notutils as nu" + ], + "id": "c8abd0c4-76f3-4976-9c34-34541251deba" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nu.display_plots('two_point_sample{sample:0>3}.svg', './gp', sample=IntSlider(0, 0, 8, 1))" + ], + "id": "804faf46-d7ba-4e5a-b1f7-e4f94fad3407" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Sampling a Function from a Gaussian" - ] + "\n", + "\n", + "Figure: A 25 dimensional correlated random variable (values ploted\n", + "against index)" + ], + "id": "5d09dbf2-a8aa-4055-b3d3-0b8230b605dc" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sampling a Function from a Gaussian\n", + "\n", + "\\[edit\\]" + ], + "id": "1d7d8c51-8dba-4ff4-af07-a31540562799" }, { "cell_type": "code", @@ -3875,9 +4426,20 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "92a804ff-5f53-4896-9d47-24e8ba96233c" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import notutils as nu" + ], + "id": "7f7644dd-afa8-4e44-bbae-e225b76b08fc" }, { "cell_type": "code", @@ -3885,27 +4447,30 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('two_point_sample{sample:0>3}.svg', \n", + "nu.display_plots('two_point_sample{sample:0>3}.svg', \n", " './gp', \n", " sample=IntSlider(0, 0, 8, 1))" - ] + ], + "id": "180438e6-86bd-4079-bbd2-e77701f5ed70" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: The joint Gaussian over $f_1$ and $f_2$ along with the\n", "conditional distribution of $f_2$ given $f_1$" - ] + ], + "id": "b444945e-5697-4c80-83c3-e63df5fc0b6f" }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Joint Density of $f_1$ and $f_2$" - ] + ], + "id": "62e15bb9-3640-412e-9859-4934db1c77f9" }, { "cell_type": "code", @@ -3913,9 +4478,20 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "07f0e633-fcad-45b8-bdda-c50da4bd580c" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import notutils as nu" + ], + "id": "6aa1f9d0-82a3-4e2c-ace5-2832ec67a824" }, { "cell_type": "code", @@ -3923,46 +4499,91 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('two_point_sample{sample:0>3}.svg', \n", + "nu.display_plots('two_point_sample{sample:0>3}.svg', \n", " './gp', \n", " 
sample=IntSlider(9, 9, 12, 1))" - ] + ], + "id": "c4c2744c-fa03-4d35-bead-c98b36434f26" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: The joint Gaussian over $f_1$ and $f_2$ along with the\n", - "conditional distribution of $f_2$ given $f_1$\n", + "conditional distribution of $f_2$ given $f_1$" + ], + "id": "b25633e9-68b3-4f55-be5d-b00907ab0a60" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Uluru\n", "\n", - "- The single contour of the Gaussian density represents the\n", - " joint distribution, $p(f_1, f_2)$\n", + "\n", "\n", - ". . .\n", + "Figure: Uluru, the sacred rock in Australia. If we think of it as a\n", + "probability density, viewing it from this side gives us one *marginal*\n", + "from the density. Figuratively speaking, slicing through the rock would\n", + "give a conditional density.\n", "\n", - "- We observe that $f_1=?$\n", + "When viewing these contour plots, I sometimes find it helpful to think\n", + "of Uluru, the prominent rock formation in Australia. The rock rises\n", + "above the surface of the plane, just like a probability density rising\n", + "above the zero line. The rock is three dimensional, but when we view\n", + "Uluru from the classical position, we are looking at one side of it.\n", + "This is equivalent to viewing the marginal density.\n", "\n", - ". . .\n", + "The joint density can be viewed from above, using contours. The\n", + "conditional density is equivalent to *slicing* the rock. Uluru is a holy\n", + "rock, so this has to be an imaginary slice. Imagine we cut down a\n", + "vertical plane orthogonal to our view point (e.g. coming across our view\n", + "point). This would give a profile of the rock, which when renormalized,\n", + "would give us the conditional distribution, the value of conditioning\n", + "would be the location of the slice in the direction we are facing." + ], + "id": "c4af1e14-07ba-4cd6-83b4-9cb20f2a516f" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prediction with Correlated Gaussians\n", "\n", - "- Conditional density: $p(f_2|f_1=?)$\n", + "Of course in practice, rather than manipulating mountains physically,\n", + "the advantage of the Gaussian density is that we can perform these\n", + "manipulations mathematically.\n", "\n", - "- Prediction of $f_2$ from $f_1$ requires *conditional density*.\n", + "Prediction of $f_2$ given $f_1$ requires the *conditional density*,\n", + "$p(f_2|f_1)$.Another remarkable property of the Gaussian density is that\n", + "this conditional distribution is *also* guaranteed to be a Gaussian\n", + "density. It has the form, $$\n", + "p(f_2|f_1) = \\mathcal{N}\\left(f_2|\\frac{k_{1, 2}}{k_{1, 1}}f_1, k_{2, 2} - \\frac{k_{1,2}^2}{k_{1,1}}\\right)\n", + "$$where we have assumed that the covariance of the original joint\n", + "density was given by $$\n", + "\\mathbf{K}= \\begin{bmatrix} k_{1, 1} & k_{1, 2}\\\\ k_{2, 1} & k_{2, 2}.\\end{bmatrix}\n", + "$$\n", "\n", - "- Conditional density is *also* Gaussian. $$\n", - " p(f_2|f_1) = {\\mathcal{N}\\left(f_2|\\frac{k_{1, 2}}{k_{1, 1}}f_1,k_{2, 2} - \\frac{k_{1,2}^2}{k_{1,1}}\\right)}\n", - " $$ where covariance of joint density is given by $$\n", - " \\mathbf{K}= \\begin{bmatrix} k_{1, 1} & k_{1, 2}\\\\ k_{2, 1} & k_{2, 2}\\end{bmatrix}\n", - " $$" - ] + "Using these formulae we can determine the conditional density for any of\n", + "the elements of our vector $\\mathbf{ f}$. For example, the variable\n", + "$f_8$ is less correlated with $f_1$ than $f_2$. 
If we consider this\n", + "variable we see the conditional density is more diffuse." + ], + "id": "7c52b7a2-8f94-4b5b-81a2-486878edb080" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Joint Density of $f_1$ and $f_8$" - ] + "### Joint Density of $f_1$ and $f_8$\n", + "\n", + "\\[edit\\]" + ], + "id": "1b28600d-182b-435e-8d7f-99e7d8390b34" }, { "cell_type": "code", @@ -3970,9 +4591,20 @@ "metadata": {}, "outputs": [], "source": [ - "import pods\n", + "import notutils as nu\n", "from ipywidgets import IntSlider" - ] + ], + "id": "5b507bf9-e0b9-427c-8fc2-3da44d9e306e" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import notutils as nu" + ], + "id": "3c84e2c9-722e-411c-b0cb-aede98bd492b" }, { "cell_type": "code", @@ -3980,20 +4612,22 @@ "metadata": {}, "outputs": [], "source": [ - "pods.notebook.display_plots('two_point_sample{sample:0>3}.svg', \n", + "nu.display_plots('two_point_sample{sample:0>3}.svg', \n", " './gp', \n", " sample=IntSlider(13, 13, 17, 1))" - ] + ], + "id": "61ac85c7-92b3-4515-a63d-5c850bf6422a" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Sample from the joint Gaussian model, points indexed by 1 and\n", "8 highlighted." - ] + ], + "id": "43d7b1e6-a0bd-4e6a-abcd-e8397dad3341" }, { "cell_type": "markdown", @@ -4001,7 +4635,7 @@ "source": [ "### Prediction of $f_{8}$ from $f_{1}$\n", "\n", - "\n", + "\n", "\n", "Figure: The joint Gaussian over $f_1$ and $f_8$ along with the\n", "conditional distribution of $f_8$ given $f_1$\n", @@ -4015,7 +4649,7 @@ "\n", ". . .\n", "\n", - "- Conditional density: $p(f_5|f_1=?)$.\n", + "- Conditional density: $p(f_8|f_1=?)$.\n", "\n", "- Prediction of $\\mathbf{ f}_*$ from $\\mathbf{ f}$ requires\n", " multivariate *conditional density*.\n", @@ -4042,14 +4676,18 @@ "- Here covariance of joint density is given by $$\n", " \\mathbf{K}= \\begin{bmatrix} \\mathbf{K}_{\\mathbf{ f}, \\mathbf{ f}} & \\mathbf{K}_{*, \\mathbf{ f}}\\\\ \\mathbf{K}_{\\mathbf{ f}, *} & \\mathbf{K}_{*, *}\\end{bmatrix}\n", " $$" - ] + ], + "id": "416c5359-c2a5-439c-a34f-62d117cfa00c" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The Importance of the Covariance Function\n", - "-----------------------------------------\n", + "## The Importance of the Covariance Function\n", + "\n", + "\\[edit\\]\n", "\n", "The covariance function encapsulates our assumptions about the data. The\n", "equations for the distribution of the prediction function, given the\n", @@ -4068,7 +4706,8 @@ "process. It has a computational burden which is $O(n^3)$ and a storage\n", "burden which is $O(n^2)$. This makes working with Gaussian processes\n", "computationally intensive for the situation where $n>10,000$." - ] + ], + "id": "89df4d0c-0a38-4306-841e-b70996ba3440" }, { "cell_type": "code", @@ -4078,7 +4717,8 @@ "source": [ "from IPython.lib.display import YouTubeVideo\n", "YouTubeVideo('ewJ3AxKclOg')" - ] + ], + "id": "8ef48b7a-089e-45a8-8178-c92a732b5e75" }, { "cell_type": "markdown", @@ -4087,20 +4727,35 @@ "Figure: Introduction to Gaussian processes given by Neil Lawrence at\n", "the 2014 Gaussian process Winter School at the University of\n", "Sheffield." 
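As a numerical illustration of the conditional densities discussed above, the sketch below evaluates p(f_2 | f_1) and p(f_8 | f_1) using an exponentiated quadratic covariance over the index (the lengthscale of 5 is an assumption for illustration). The conditional for f_8 comes out more diffuse because k_{1,8} is smaller than k_{1,2}.

```python
import numpy as np

def eq_cov(i, j, lengthscale=5.0):
    # Exponentiated quadratic covariance over the index of the vector f.
    return np.exp(-0.5 * (i - j)**2 / lengthscale**2)

f1 = 1.0  # hypothetical observed value of f_1

for other in (2, 8):
    k11 = eq_cov(1, 1)
    k1o = eq_cov(1, other)
    koo = eq_cov(other, other)
    cond_mean = (k1o / k11) * f1       # (k_{1,j}/k_{1,1}) f_1
    cond_var = koo - k1o**2 / k11      # k_{j,j} - k_{1,j}^2 / k_{1,1}
    print(f"p(f_{other} | f_1={f1}): mean={cond_mean:.3f}, variance={cond_var:.3f}")
```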
- ] + ], + "id": "98f96a6f-0bf5-491d-bd64-5c6d30eb31aa" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Improving the Numerics\n", - "----------------------\n", + "## Improving the Numerics\n", + "\n", + "\\[edit\\]\n", "\n", "In practice we shouldn’t be using matrix inverse directly to solve the\n", "GP system. One more stable way is to compute the *Cholesky\n", "decomposition* of the kernel matrix. The log determinant of the\n", "covariance can also be derived from the Cholesky decomposition." - ] + ], + "id": "02c9ed7d-61a5-46ed-bb76-d2e8bc2b5b2a" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "ff45a144-502e-4f3f-9a5a-510908cd4c3b" }, { "cell_type": "code", @@ -4108,8 +4763,9 @@ "metadata": {}, "outputs": [], "source": [ - "%load -s update_inverse mlai.py" - ] + "%load -n mlai.update_inverse" + ], + "id": "03368356-7098-4a79-a7ca-6ad545ebee81" }, { "cell_type": "code", @@ -4118,14 +4774,14 @@ "outputs": [], "source": [ "GP.update_inverse = update_inverse" - ] + ], + "id": "db2cc71d-e41c-422d-99a0-aff51491bdff" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Capacity Control\n", - "----------------\n", + "## Capacity Control\n", "\n", "Gaussian processes are sometimes seen as part of a wider family of\n", "methods known as kernel methods. Kernel methods are also based around\n", @@ -4150,46 +4806,47 @@ "parameter estimation (in the simplest case proceeds) by maximum\n", "likelihood. This involves taking gradients of the likelihood with\n", "respect to the parameters of the covariance function." - ] + ], + "id": "71022a58-f314-43b6-94ef-dba48b694fb3" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Gradients of the Likelihood\n", - "---------------------------\n", + "## Gradients of the Likelihood\n", "\n", "The easiest conceptual way to obtain the gradients is a two step\n", "process. The first step involves taking the gradient of the likelihood\n", "with respect to the covariance function, the second step involves\n", "considering the gradient of the covariance function with respect to its\n", "parameters." - ] + ], + "id": "face961f-ebdc-4ee1-89c8-aedce8856441" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Overall Process Scale\n", - "---------------------\n", + "## Overall Process Scale\n", "\n", "In general we won’t be able to find parameters of the covariance\n", "function through fixed point equations, we will need to do gradient\n", "based optimization." - ] + ], + "id": "d9a56244-5622-4ca5-a9d3-9445c3bea660" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Capacity Control and Data Fit\n", - "-----------------------------\n", + "## Capacity Control and Data Fit\n", "\n", "The objective function can be decomposed into two terms, a capacity\n", "control term, and a data fit term. The capacity control term is the log\n", "determinant of the covariance. The data fit term is the matrix inner\n", "product between the data and the inverse covariance." 
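Both terms can be computed stably from the Cholesky decomposition, as suggested under "Improving the Numerics" above. The following is an illustrative standalone function, not the `mlai.update_inverse` implementation.

```python
import numpy as np
from scipy.linalg import cho_factor, cho_solve

def gp_objective(K, y, sigma2, jitter=1e-8):
    """Capacity and data-fit terms of the GP objective, via Cholesky."""
    A = K + (sigma2 + jitter) * np.eye(K.shape[0])
    L, lower = cho_factor(A, lower=True)
    # log|A| is twice the sum of the log-diagonal of the Cholesky factor.
    capacity = 0.5 * 2.0 * np.sum(np.log(np.diag(L)))
    # y^T A^{-1} y computed with triangular solves, no explicit inverse.
    alpha = cho_solve((L, lower), y)
    data_fit = 0.5 * np.sum(y * alpha)
    # Their sum is the objective E(theta) (up to the constant n/2 log 2 pi).
    return capacity, data_fit, capacity + data_fit
```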
- ] + ], + "id": "c983ce2e-0b75-48f7-9230-4786dc7d79bf" }, { "cell_type": "code", @@ -4210,14 +4867,18 @@ " new = rotationMatrix*[xd(:)'; yd(:)'];\n", " set(handle(i), 'xdata', new(1, :));\n", " set(handle(i), 'ydata', new(2, :));" - ] + ], + "id": "ca5b109c-8e6c-496f-b56c-b202a2b61f2e" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Learning Covariance Parameters\n", - "------------------------------\n", + "## Learning Covariance Parameters\n", + "\n", + "\\[edit\\]\n", "\n", "Can we determine covariance parameters from the data?\n", "\n", @@ -4240,67 +4901,29 @@ "$$\n", "E(\\boldsymbol{ \\theta}) = \\color{blue}{\\frac{1}{2}\\log\\det{\\mathbf{K}}} + \\color{red}{\\frac{\\mathbf{ y}^{\\top}\\mathbf{K}^{-1}\\mathbf{ y}}{2}}\n", "$$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - " clf\n", - " lambda1 = 3;\n", - " lambda2 = 1;\n", - " t = linspace(-pi, pi, 200);\n", - " R = [sqrt(2)/2 -sqrt(2)/2; sqrt(2)/2 sqrt(2)/2];\n", - " xy = R*[lambda1*sin(t); lambda2*cos(t)];\n", - " line(xy(1, :), xy(2, :), 'linewidth', 3, 'color', blackColor);\n", - " axis off, axis equal\n", - " a = arrow([0 lambda1*R(1, 1)], [0 lambda1*R(2, 1)]);\n", - " set(a, 'linewidth', 3, 'color', blueColor);\n", - " a = arrow([0 lambda2*R(1, 2)], [0 lambda2*R(2, 2)]);\n", - " set(a, 'linewidth', 3, 'color', blueColor);\n", - " xlim = get(gca, 'xlim');\n", - " xspan = xlim(2) - xlim(1);\n", - " ylim = get(gca, 'ylim');\n", - " yspan = ylim(2) - ylim(1);\n", - " text(lambda1*0.5*R(1, 1)-0.05*xspan, lambda1*0.5*R(2, 1)-yspan*0.05, '$\\eigenvalue_1$')\n", - " text(lambda2*0.5*R(1, 2)-0.05*xspan, lambda2*0.5*R(2, 2)-yspan*0.05, '$\\eigenvalue_2$')\n", - " fileName = 'gpOptimiseEigen';\n", - " printLatexPlot(fileName, directory, 0.45*textWidth)" - ] + ], + "id": "92fbe5fe-9776-4e12-af52-ae1181d5e4c1" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Capacity Control through the Determinant\n", - "----------------------------------------\n", + "## Capacity Control through the Determinant\n", + "\n", + "\\[edit\\]\n", "\n", "The parameters are *inside* the covariance function (matrix).\n", "$$k_{i, j} = k(\\mathbf{ x}_i, \\mathbf{ x}_j; \\boldsymbol{ \\theta})$$\n", "\n", - "$$\\mathbf{K}= \\mathbf{R}\\boldsymbol{ \\Lambda}^2 \\mathbf{R}^\\top$$" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "gpoptimizePlot1" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ + "$$\\mathbf{K}= \\mathbf{R}\\boldsymbol{ \\Lambda}^2 \\mathbf{R}^\\top$$\n", + "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", @@ -4646,12 +5304,12 @@ "\n", "\n", "\n", "\n", @@ -4659,15 +5317,40 @@ "\n", "Figure: Variation in the data fit term, the capacity term and the\n", "negative log likelihood for different lengthscales." 
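To see the trade-off that the figure above describes, the following sketch evaluates the capacity term, the data-fit term and their sum E(θ) on toy data for a range of lengthscales; the data and settings are invented for illustration.

```python
import numpy as np

# Toy one-dimensional data (illustrative only).
rng = np.random.default_rng(0)
x = np.linspace(-4.0, 4.0, 30)[:, None]
y = np.sin(x) + 0.1 * rng.standard_normal(x.shape)

def eq_K(x, lengthscale, variance=1.0, noise=0.01):
    # Exponentiated quadratic covariance plus a small noise term.
    d2 = (x - x.T)**2
    return variance * np.exp(-0.5 * d2 / lengthscale**2) + noise * np.eye(len(x))

for lengthscale in (0.1, 0.5, 1.0, 2.0, 4.0):
    K = eq_K(x, lengthscale)
    _, logdetK = np.linalg.slogdet(K)
    capacity = 0.5 * logdetK                              # log-determinant term
    data_fit = 0.5 * np.sum(y * np.linalg.solve(K, y))    # quadratic data-fit term
    print(f"lengthscale={lengthscale:4.1f}  capacity={capacity:9.2f}  "
          f"data fit={data_fit:9.2f}  E={capacity + data_fit:9.2f}")
```

Short lengthscales give a small (very negative) capacity term but fit the data too easily; long lengthscales pay heavily in the data-fit term, and the sum is minimised somewhere in between.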
- ] + ], + "id": "26c6e925-8425-414d-8ccf-04435e6b1757" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Exponentiated Quadratic Covariance\n", - "----------------------------------" - ] + "## Exponentiated Quadratic Covariance\n", + "\n", + "\\[edit\\]" + ], + "id": "bf10aca3-f991-44c1-84b5-ed309971b643" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import mlai" + ], + "id": "66db095e-15f2-4a85-adce-2d990c74b386" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load -n mlai.Kernel" + ], + "id": "b4ea3b1b-b5ee-4f7e-b66b-2501f27bca40" }, { "cell_type": "code", @@ -4675,8 +5358,9 @@ "metadata": {}, "outputs": [], "source": [ - "%load -s Kernel mlai.py" - ] + "import mlai" + ], + "id": "d60fe34b-05c5-4029-ae7c-10174847f300" }, { "cell_type": "code", @@ -4684,8 +5368,9 @@ "metadata": {}, "outputs": [], "source": [ - "%load -s eq_cov mlai.py" - ] + "%load -n mlai.eq_cov" + ], + "id": "8c1dedb6-e18f-4892-aa82-563f229fcae8" }, { "cell_type": "code", @@ -4698,7 +5383,8 @@ " shortname='eq', \n", " formula='\\kernelScalar(\\inputVector, \\inputVector^\\prime) = \\alpha \\exp\\left(-\\frac{\\ltwoNorm{\\inputVector-\\inputVector^\\prime}^2}{2\\lengthScale^2}\\right)',\n", " lengthscale=0.2)" - ] + ], + "id": "5a895009-5987-4ccf-8ec5-fa559fd56f41" }, { "cell_type": "code", @@ -4706,8 +5392,9 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "e6fd782c-2024-4725-8794-3bde63502027" }, { "cell_type": "code", @@ -4716,7 +5403,8 @@ "outputs": [], "source": [ "plot.covariance_func(kernel=kernel, diagrams='./kern/')" - ] + ], + "id": "19fb2343-b8f1-4f35-8772-6b2d1c6c6bfb" }, { "cell_type": "markdown", @@ -4745,53 +5433,69 @@ "\n", "\n", "\n", "\n", "
\n", "\n", - "\n", + "\n", "\n", "\n", @@ -4316,7 +4939,8 @@ " $\\mathbf{R}^\\top\\mathbf{R}= \\mathbf{I}$.\n", "- Useful representation since\n", " $\\det{\\mathbf{K}} = \\det{\\boldsymbol{ \\Lambda}^2} = \\det{\\boldsymbol{ \\Lambda}}^2$." - ] + ], + "id": "9685b3ba-a262-474b-bfd1-81c9379549e7" }, { "cell_type": "code", @@ -4327,8 +4951,9 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import mlai\n", - "import teaching_plots as plot" - ] + "import mlai.plot as plot" + ], + "id": "a8fdcddb-bbb3-4622-a240-f6cd53551ec5" }, { "cell_type": "code", @@ -4336,8 +4961,9 @@ "metadata": {}, "outputs": [], "source": [ - "diagrams = './gp/'" - ] + "plot.covariance_capacity(rotate_angle=np.pi/4, lambda1 = 0.5, lambda2 = 0.3, diagrams = './gp/')" + ], + "id": "45ed7b2d-6b76-4c2e-a68b-e4301b6dc29e" }, { "cell_type": "code", @@ -4345,18 +4971,45 @@ "metadata": {}, "outputs": [], "source": [ - "plot.covariance_capacity(rotate_angle=np.pi/4, lambda1 = 0.5, lambda2 = 0.3, diagrams = './gp/')" - ] + "import notutils as nu\n", + "from ipywidgets import IntSlider" + ], + "id": "f6ce0136-7863-4661-8da1-baaa45e86fbf" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nu.display_plots('gp-optimise-determinant{sample:0>3}.svg', \n", + " directory='./gp', \n", + " sample=IntSlider(0, 0, 9, 1))" + ], + "id": "407e98c6-153a-4893-b6cb-174703219dbb" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: The determinant of the covariance is dependent only on the\n", "eigenvalues. It represents the ‘footprint’ of the Gaussian." - ] + ], + "id": "deb2fc1f-7d86-4d53-9efb-25afbefec960" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Quadratic Data Fit\n", + "\n", + "\\[edit\\]" + ], + "id": "73e3302f-2b45-4f62-8e06-d36f8cad139d" }, { "cell_type": "code", @@ -4424,34 +5077,32 @@ " counter = counter + 1;\n", " \n", " printLatexText(includeText, 'gpOptimiseQuadraticIncludeText.tex', directory)" - ] + ], + "id": "c9eb9b41-e13a-45a8-8ca9-4cdb096f6a47" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: The data fit term of the Gaussian process is a quadratic loss\n", "centered around zero. This has eliptical contours, the principal axes of\n", "which are given by the covariance matrix." 
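A quick numerical check of the eigendecomposition identities used above, K = RΛ²Rᵀ with RᵀR = I, on a small illustrative covariance matrix:

```python
import numpy as np

# Small example covariance matrix (illustrative values).
x = np.linspace(0.0, 1.0, 5)[:, None]
K = np.exp(-0.5 * (x - x.T)**2 / 0.25**2) + 1e-6 * np.eye(5)

eigenvalues, R = np.linalg.eigh(K)   # eigenvalues play the role of the diagonal of Lambda^2
assert np.allclose(R @ np.diag(eigenvalues) @ R.T, K)   # K = R Lambda^2 R^T
assert np.allclose(R.T @ R, np.eye(5))                  # R is a rotation: R^T R = I

# The determinant, and hence the capacity term, depends only on the eigenvalues.
assert np.isclose(np.linalg.det(K), np.prod(eigenvalues))
print("0.5 log det K =", 0.5 * np.sum(np.log(eigenvalues)))
```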
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Quadratic Data Fit\n", - "------------------" - ] + ], + "id": "904b4574-347e-4627-b8b6-3e6f8d5e132a" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Data Fit Term\n", - "-------------" - ] + "## Data Fit Term\n", + "\n", + "\\[edit\\]" + ], + "id": "0b953695-1a2c-47a5-8184-a8584db2e554" }, { "cell_type": "code", @@ -4462,7 +5113,8 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import os" - ] + ], + "id": "2d2eca6e-096d-4afd-ab87-8a949671ad9e" }, { "cell_type": "code", @@ -4471,10 +5123,11 @@ "outputs": [], "source": [ "import GPy\n", - "import teaching_plots as plot\n", + "import mlai.plot as plot\n", "import mlai\n", "import gp_tutorial" - ] + ], + "id": "891ebfaf-4737-4720-b9f9-7998c9a13731" }, { "cell_type": "code", @@ -4490,7 +5143,8 @@ "blue_color=[0., 0., 1.]\n", "magenta_color=[1., 0., 1.]\n", "fontsize=18" - ] + ], + "id": "011a76fc-1364-406a-88e0-13d12bbc669a" }, { "cell_type": "code", @@ -4507,7 +5161,8 @@ "linewidth=3\n", "markersize=15\n", "markertype='.'" - ] + ], + "id": "8e9392d1-8f85-4984-91c4-95352931636f" }, { "cell_type": "code", @@ -4524,7 +5179,8 @@ "true_kern.white.variance = 0.01\n", "K = true_kern.K(x) \n", "y = np.random.multivariate_normal(np.zeros((6,)), K, 1).T" - ] + ], + "id": "7bbe1b3c-0e3e-4e7f-a1a5-613425e395ea" }, { "cell_type": "code", @@ -4532,6 +5188,7 @@ "metadata": {}, "outputs": [], "source": [ + "\n", "# Fitted model\n", "kern = GPy.kern.RBF(1) + GPy.kern.White(1)\n", "kern.rbf.lengthscale = 1.0\n", @@ -4622,7 +5279,8 @@ " xlim = ax2.get_xlim()\n", " ax2.plot([xlim[0], xlim[0]], err_y_lim, color=black_color)\n", " ax2.plot(xlim, [err_y_lim[0], err_y_lim[0]], color=black_color)" - ] + ], + "id": "656830bc-4777-46d4-b97a-77143c70d253" }, { "cell_type": "markdown", @@ -4632,12 +5290,12 @@ "
\n", "\n", "Figure: The exponentiated quadratic covariance function." - ] + ], + "id": "6e5a0205-dfa4-478a-a242-41e0ad2706a9" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "GPSS: Gaussian Process Summer School\n", - "------------------------------------\n", + "## GPSS: Gaussian Process Summer School\n", + "\n", + "\\[edit\\]\n", "\n", "If you’re interested in finding out more about Gaussian processes, you\n", "can attend the Gaussian process summer school, or view the lectures and\n", "material on line. Details of the school, future events and past events\n", - "can be found at the website\n", - "http://gpss.cc." - ] + "can be found at the website ." + ], + "id": "188df5f1-e093-44a0-82cc-71aff5c3b08c" + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install gpy" + ], + "id": "706f59f0-060a-4f15-bd00-d11766a5ca60" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "GPy: A Gaussian Process Framework in Python\n", - "-------------------------------------------\n", + "## GPy: A Gaussian Process Framework in Python\n", + "\n", + "\\[edit\\]\n", "\n", "Gaussian processes are a flexible tool for non-parametric analysis with\n", "uncertainty. The GPy software was started in Sheffield to provide a easy\n", "to use interface to GPs. One which allowed the user to focus on the\n", "modelling rather than the mathematics.\n", "\n", - "\n", + "\n", "\n", "Figure: GPy is a BSD licensed software code base for implementing\n", "Gaussian process models in Python. It is designed for teaching and\n", - "modelling. We welcome contributions which can be made through the Github\n", - "repository\n", - "https://github.com/SheffieldML/GPy\n", + "modelling. We welcome contributions which can be made through the GitHub\n", + "repository
\n", "\n", "GPy is a BSD licensed software code base for implementing Gaussian\n", "process models in python. This allows GPs to be combined with a wide\n", @@ -4802,21 +5506,25 @@ "contributions.\n", "\n", "The aim for GPy is to be a probabilistic-style programming language,\n", - "i.e. you specify the model rather than the algorithm. As well as a large\n", - "range of covariance functions the software allows for non-Gaussian\n", + "i.e., you specify the model rather than the algorithm. As well as a\n", + "large range of covariance functions the software allows for non-Gaussian\n", "likelihoods, multivariate outputs, dimensionality reduction and\n", "approximations for larger data sets.\n", "\n", "The documentation for GPy can be found\n", "[here](https://gpy.readthedocs.io/en/latest/)." - ] + ], + "id": "2b56d58e-4a11-4282-ae1d-e47de62a8a58" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "GPy Tutorial\n", - "------------\n", + "## GPy Tutorial\n", + "\n", + "\\[edit\\]\n", "\n", "\n", "\n", @@ -4836,7 +5544,7 @@ "\n", "\n", "\n", - "\n", + "\n", "\n", "\n", "\n", @@ -4857,56 +5565,20 @@ "\n", "\n", "\n", - "\n", + "\n", "\n", "\n", "\n", "This GPy tutorial is based on material we share in the Gaussian process\n", - "summer school for teaching these models\n", - "https://gpss.cc. It contains\n", + "summer school for teaching these models . It contains\n", "material from various members and former members of the Sheffield\n", "machine learning group, but particular mention should be made of\n", "[Nicolas\n", "Durrande](https://sites.google.com/site/nicolasdurrandehomepage/) and\n", "[James Hensman](https://jameshensman.github.io/), see\n", - "http://gpss.cc/gpss17/labs/GPSS_Lab1_2017.ipynb." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install gpy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "urllib.request.urlretrieve('https://raw.githubusercontent.com/lawrennd/talks/gh-pages/mlai.py','mlai.py')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "urllib.request.urlretrieve('https://raw.githubusercontent.com/lawrennd/talks/gh-pages/teaching_plots.py','teaching_plots.py')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "urllib.request.urlretrieve('https://raw.githubusercontent.com/lawrennd/talks/gh-pages/gp_tutorial.py','gp_tutorial.py')" - ] + "." + ], + "id": "18fc2fb5-f556-4050-9472-69f1866ac945" }, { "cell_type": "code", @@ -4916,7 +5588,8 @@ "source": [ "import numpy as np\n", "import GPy" - ] + ], + "id": "95f8a306-5d0f-472f-9c3c-193b1250f2b7" }, { "cell_type": "code", @@ -4925,19 +5598,21 @@ "outputs": [], "source": [ "from matplotlib import pyplot as plt" - ] + ], + "id": "f4199b72-b30f-452b-9d00-315f8d75cbb8" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To give a feel for the sofware we’ll start by creating an exponentiated\n", + "To give a feel for the software we’ll start by creating an exponentiated\n", "quadratic covariance function, $$\n", "k(\\mathbf{ x}, \\mathbf{ x}^\\prime) = \\alpha \\exp\\left(-\\frac{\\left\\Vert \\mathbf{ x}- \\mathbf{ x}^\\prime \\right\\Vert_2^2}{2\\ell^2}\\right),\n", "$$ where the length scale is $\\ell$ and the variance is $\\alpha$.\n", "\n", "To set this up in GPy we create a kernel in the following manner." 
- ] + ], + "id": "509cd373-088e-4f10-97ed-ce904bc4b01d" }, { "cell_type": "code", @@ -4949,14 +5624,16 @@ "alpha = 1.0\n", "lengthscale = 2.0\n", "kern = GPy.kern.RBF(input_dim=input_dim, variance=alpha, lengthscale=lengthscale)" - ] + ], + "id": "9cb58963-a578-4f1a-9cf8-f5bf360d24f9" }, { "cell_type": "markdown", "metadata": {}, "source": [ "That builds a kernel object for us. The kernel can be displayed." - ] + ], + "id": "b086ad8d-5017-4c52-a715-378967439fb2" }, { "cell_type": "code", @@ -4965,7 +5642,8 @@ "outputs": [], "source": [ "display(kern)" - ] + ], + "id": "6ff83cfa-3513-4d4a-b8c3-de61f4f76dac" }, { "cell_type": "markdown", @@ -4973,7 +5651,8 @@ "source": [ "Or because it’s one dimensional, you can also plot the kernel as a\n", "function of its inputs (while the other is fixed)." - ] + ], + "id": "ffb3026a-8847-4f3c-8f34-3f5f0955777f" }, { "cell_type": "code", @@ -4981,9 +5660,10 @@ "metadata": {}, "outputs": [], "source": [ - "import teaching_plots as plot\n", - "import mlai" - ] + "import mlai\n", + "import mlai.plot as plot" + ], + "id": "74a0113b-f9db-4a34-801d-df923dd4f837" }, { "cell_type": "code", @@ -4994,20 +5674,22 @@ "fig, ax = plt.subplots(figsize=plot.big_wide_figsize)\n", "kern.plot(ax=ax)\n", "mlai.write_figure('gpy-eq-covariance.svg', directory='./kern')" - ] + ], + "id": "712e324f-a612-4c90-a0b6-2181f88d03d5" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: The exponentiated quadratic covariance function as plotted by\n", "the `GPy.kern.plot` command.\n", "\n", - "You can set the lengthscale of the covariance to different values and\n", + "You can set the length scale of the covariance to different values and\n", "plot the result." - ] + ], + "id": "58f6ce49-0e5b-4bc4-9ec6-0e9294f71864" }, { "cell_type": "code", @@ -5017,7 +5699,8 @@ "source": [ "kern = GPy.kern.RBF(input_dim=input_dim) # By default, the parameters are set to 1.\n", "lengthscales = np.asarray([0.2,0.5,1.,2.,4.])" - ] + ], + "id": "8e285179-159c-4422-8802-a6a98b3c8f2a" }, { "cell_type": "code", @@ -5033,36 +5716,38 @@ "\n", "ax.legend(lengthscales)\n", "mlai.write_figure('gpy-eq-covariance-lengthscales.svg', directory='./kern')" - ] + ], + "id": "ca4f95e4-6319-46fc-8404-2ae10ed0fd07" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: The exponentiated quadratic covariance function plotted for\n", - "different lengthscales by `GPy.kern.plot` command." - ] + "different length scales by `GPy.kern.plot` command." + ], + "id": "bfb4b220-6cca-4295-85f8-f882137b7425" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Covariance Functions in GPy\n", - "---------------------------\n", + "## Covariance Functions in GPy\n", "\n", "Many covariance functions are already implemented in GPy. Instead of\n", "rbf, try constructing and plotting the following covariance functions:\n", "`exponential`, `Matern32`, `Matern52`, `Brownian`, `linear`, `bias`,\n", "`rbfcos`, `periodic_Matern32`, etc. Some of these covariance functions,\n", - "such as `rbfcos`, are not parametrized by a variance and a lengthscale.\n", - "Furthermore, not all kernels are stationary (i.e., they can’t all be\n", - "written as\n", + "such as `rbfcos`, are not parametrized by a variance and a length scale.\n", + "Further, not all kernels are stationary (i.e., they can’t all be written\n", + "as\n", "$k(\\mathbf{ x}, \\mathbf{ x}^\\prime) = f(\\mathbf{ x}-\\mathbf{ x}^\\prime)$,\n", - "see for example the Brownian covariance function). 
For plotting so it\n", + "see for example the Brownian covariance function). So for plotting it\n", "may be interesting to change the value of the fixed input." - ] + ], + "id": "3ad72463-dffa-4b79-87e7-446a59a90ebb" }, { "cell_type": "code", @@ -5080,20 +5765,21 @@ "ax.set_ylim(-0.1,5.1)\n", "\n", "mlai.write_figure('gpy-brownian-covariance-lengthscales.svg', directory='./kern')" - ] + ], + "id": "a5c929f9-6ddf-48fe-84e8-17a7fd4fb8a1" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Combining Covariance Functions in GPy\n", - "-------------------------------------\n", + "## Combining Covariance Functions in GPy\n", "\n", "In GPy you can easily combine covariance functions you have created\n", "using the sum and product operators, `+` and `*`. So, for example, if we\n", "wish to combine an exponentiated quadratic covariance with a Matern 5/2\n", "then we can write" - ] + ], + "id": "09302adb-1cdf-42c1-b73c-7a476d1f103f" }, { "cell_type": "code", @@ -5105,7 +5791,8 @@ "kern2 = GPy.kern.Matern52(1, variance=2., lengthscale=4.)\n", "kern = kern1 + kern2\n", "display(kern)" - ] + ], + "id": "38f962cc-f3dd-455d-8b0d-bdd56f77c355" }, { "cell_type": "code", @@ -5118,19 +5805,21 @@ "kern.plot(ax=ax)\n", "\n", "mlai.write_figure('gpy-eq-plus-matern52-covariance.svg', directory='./kern')" - ] + ], + "id": "5e37179a-dc24-4810-9a52-8eb3c2cc6e59" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: A combination of the exponentiated quadratic covariance plus\n", "the Matern $5/2$ covariance.\n", "\n", - "Or if we wanted to multiply them we can write" - ] + "Or if we wanted to multiply them, we can write" + ], + "id": "730f7620-054c-4df8-84ed-dec71eddd4a6" }, { "cell_type": "code", @@ -5142,7 +5831,8 @@ "kern2 = GPy.kern.Matern52(1, variance=2., lengthscale=4.)\n", "kern = kern1 * kern2\n", "display(kern)" - ] + ], + "id": "955ad67a-c4ef-4eb6-9b98-337d795dc52f" }, { "cell_type": "code", @@ -5155,20 +5845,22 @@ "kern.plot(ax=ax)\n", "\n", "mlai.write_figure('gpy-eq-times-matern52-covariance.svg', directory='./kern')" - ] + ], + "id": "2293c39a-03fa-4940-bdc8-fc12edd86caa" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: A combination of the exponentiated quadratic covariance\n", "multiplied by the Matern $5/2$ covariance.\n", "\n", "You can learn about how to implement [new kernel objects in GPy\n", "here](https://gpy.readthedocs.io/en/latest/tuto_creating_new_kernels.html)." - ] + ], + "id": "8e2aed5f-59eb-4053-b656-19a1271be6a2" }, { "cell_type": "code", @@ -5178,7 +5870,8 @@ "source": [ "from IPython.lib.display import YouTubeVideo\n", "YouTubeVideo('-sY8zW3Om1Y')" - ] + ], + "id": "6db61b54-a73a-4a3f-be12-21ea6706b4ed" }, { "cell_type": "markdown", @@ -5188,14 +5881,14 @@ "is a key place in which you introduce your understanding of the data\n", "problem. To learn more about the design of covariance functions, see\n", "this talk from Nicolas Durrande at GPSS in 2016." - ] + ], + "id": "5a647089-a657-45c3-b632-d9e7651ab59a" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "A Gaussian Process Regression Model\n", - "-----------------------------------\n", + "## A Gaussian Process Regression Model\n", "\n", "We will now combine the Gaussian process prior with some data to form a\n", "GP regression model with GPy. 
We will generate data from the function $$\n", @@ -5204,7 +5897,8 @@ "y(x) = f(x) + \\epsilon,\n", "$$ with the noise being Gaussian distributed,\n", "$\\epsilon\\sim \\mathcal{N}\\left(0,0.01\\right)$." - ] + ], + "id": "e8c0276e-33c3-4273-a9f1-957262f11933" }, { "cell_type": "code", @@ -5214,7 +5908,8 @@ "source": [ "X = np.linspace(0.05,0.95,10)[:,np.newaxis]\n", "Y = -np.cos(np.pi*X) + np.sin(4*np.pi*X) + np.random.normal(loc=0.0, scale=0.1, size=(10,1))" - ] + ], + "id": "4db259da-5a79-4c44-9322-f5f78d830314" }, { "cell_type": "code", @@ -5226,20 +5921,22 @@ "ax.plot(X,Y,'kx',mew=1.5, linewidth=2)\n", "\n", "mlai.write_figure('noisy-sine.svg', directory='./gp')" - ] + ], + "id": "6ce1d82f-d9ee-472b-a5e2-8ba098d5a90c" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: Data from the noisy sine wave for fitting with a GPy\n", "model.\n", "\n", "A GP regression model based on an exponentiated quadratic covariance\n", "function can be defined by first defining a covariance function." - ] + ], + "id": "2530a600-05e1-4aa9-87fd-6434c82231ff" }, { "cell_type": "code", @@ -5248,14 +5945,16 @@ "outputs": [], "source": [ "kern = GPy.kern.RBF(input_dim=1, variance=1., lengthscale=1.)" - ] + ], + "id": "2c9ea062-8f28-40e0-b14e-1e2c239ed811" }, { "cell_type": "markdown", "metadata": {}, "source": [ "And then combining it with the data to form a Gaussian process model." - ] + ], + "id": "1e44d548-a057-42ed-aab8-05907315d2f4" }, { "cell_type": "code", @@ -5264,7 +5963,8 @@ "outputs": [], "source": [ "model = GPy.models.GPRegression(X,Y,kern)" - ] + ], + "id": "9e8c777b-bb08-40d5-a0df-ec968ae932f8" }, { "cell_type": "markdown", @@ -5272,7 +5972,8 @@ "source": [ "Just as for the covariance function object, we can find out about the\n", "model using the command `display(model)`." - ] + ], + "id": "f4e16e4b-62e3-4edd-b1fb-5465d8def64e" }, { "cell_type": "code", @@ -5281,7 +5982,8 @@ "outputs": [], "source": [ "display(model)" - ] + ], + "id": "9861f7b6-df78-47f0-8cb1-a8a59f82d1ec" }, { "cell_type": "markdown", @@ -5290,7 +5992,8 @@ "Note that by default the model includes some observation noise with\n", "variance 1. We can see the posterior mean prediction and visualize the\n", "marginal posterior variances using `model.plot()`." - ] + ], + "id": "ff3ab136-afc8-4c53-a9ba-e6b2be022b5a" }, { "cell_type": "code", @@ -5302,20 +6005,22 @@ "model.plot(ax=ax)\n", "\n", "mlai.write_figure('noisy-sine-gp-fit.svg', directory='./gp')" - ] + ], + "id": "c8d53cec-1bda-4184-ac3c-a43fcf90d506" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: A Gaussian process fit to the noisy sine data. Here the\n", "parameters of the process and the covariance function haven’t yet been\n", "optimized.\n", "\n", "You can also look directly at the predictions for the model using." - ] + ], + "id": "ff3a6b7b-65be-4c68-b1d1-5ab2af038398" }, { "cell_type": "code", @@ -5325,7 +6030,8 @@ "source": [ "Xstar = np.linspace(0, 10, 100)[:, np.newaxis]\n", "Ystar, Vstar = model.predict(Xstar)" - ] + ], + "id": "c3b864a9-c307-4270-98e7-8edf136d2ba5" }, { "cell_type": "markdown", @@ -5333,21 +6039,22 @@ "source": [ "Which gives you the mean (`Ystar`), the variance (`Vstar`) at the\n", "locations given by `Xstar`." 
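The returned arrays can also be plotted by hand, for example as a mean curve with a band of two standard deviations. A sketch, assuming `X`, `Y`, `Xstar`, `Ystar` and `Vstar` are as defined in the cells above:

```python
import numpy as np
import matplotlib.pyplot as plt

std = np.sqrt(Vstar)                     # predictive standard deviation at Xstar
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(X, Y, 'kx', mew=1.5)             # training data
ax.plot(Xstar, Ystar, 'b-', label='posterior mean')
ax.fill_between(Xstar[:, 0],
                (Ystar - 2 * std)[:, 0],
                (Ystar + 2 * std)[:, 0],
                alpha=0.3, label=r'mean $\pm$ 2 std')
ax.legend()
plt.show()
```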
- ] + ], + "id": "01af8003-d7a7-4ab3-98ce-3cdebcb8a8f2" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Covariance Function Parameter Estimation\n", - "----------------------------------------\n", + "## Covariance Function Parameter Estimation\n", "\n", "As we have seen during the lectures, the parameters values can be\n", "estimated by maximizing the likelihood of the observations. Since we\n", - "don’t want one of the variance to become negative during the\n", + "don’t want any of the variances to become negative during the\n", "optimization, we can constrain all parameters to be positive before\n", - "running the optimisation." - ] + "running the optimization." + ], + "id": "edd056ac-3b8a-4960-bf97-c27d9b6f8e7d" }, { "cell_type": "code", @@ -5356,7 +6063,8 @@ "outputs": [], "source": [ "model.constrain_positive()" - ] + ], + "id": "d6f7b3b7-21aa-4432-b2f6-90714a5c9d8e" }, { "cell_type": "markdown", @@ -5366,9 +6074,10 @@ "default, the software is warning us that they are being reconstrained.\n", "\n", "Now we can optimize the model using the `model.optimize()` method. Here\n", - "we switch messages on, which allows us to see the progession of the\n", + "we switch messages on, which allows us to see the progression of the\n", "optimization." - ] + ], + "id": "062cbf43-9589-4660-b454-5768f969a50c" }, { "cell_type": "code", @@ -5377,18 +6086,20 @@ "outputs": [], "source": [ "model.optimize(messages=True)" - ] + ], + "id": "262d8496-2d82-4f52-a21f-6edd1bd24638" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "By default the optimization is using a limited memory BFGS optimizer\n", + "By default, the optimization is using a limited memory BFGS optimizer\n", "(Byrd et al., 1995).\n", "\n", - "Once again we can display the model, now to see how the parameters have\n", + "Once again, we can display the model, now to see how the parameters have\n", "changed." - ] + ], + "id": "2710e78e-54bc-44f6-814b-8844b7ab25f1" }, { "cell_type": "code", @@ -5397,15 +6108,17 @@ "outputs": [], "source": [ "display(model)" - ] + ], + "id": "9954521a-35b3-4fb1-a7eb-e59942bf4138" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The lengthscale is much smaller, as well as the noise level. The\n", + "The length scale is much smaller, as well as the noise level. The\n", "variance of the exponentiated quadratic has also reduced." - ] + ], + "id": "c67102d8-b67a-49b5-8ea6-41d5fc2b2d69" }, { "cell_type": "code", @@ -5417,32 +6130,37 @@ "model.plot(ax=ax)\n", "\n", "mlai.write_figure('noisy-sine-gp-optimized-fit.svg', directory='./gp')" - ] + ], + "id": "162eca98-4bce-474f-8847-4dad4e52e87b" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\n", + "\n", "\n", "Figure: A Gaussian process fit to the noisy sine data with parameters\n", "optimized." - ] + ], + "id": "03a71ddf-e58d-456e-ad55-4f279fe6e71e" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Review\n", - "------" - ] + "## Review" + ], + "id": "04436eb3-082b-4485-a574-02fcb65f0eec" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Other Software\n", - "--------------\n", + "## Other Software\n", + "\n", + "\\[edit\\]\n", "\n", "GPy has inspired other software solutions, first of all\n", "[GPflow](https://github.com/GPflow/GPflow), which uses Tensor Flow’s\n", @@ -5453,28 +6171,28 @@ "\n", "The Probabilistic programming language [pyro](https://pyro.ai/) also has\n", "GP support." 
- ] + ], + "id": "07c463ae-6d0f-4a44-a456-a76de6b4ba4e" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Further Reading\n", - "---------------\n", + "## Further Reading\n", "\n", "- Chapter 2 of Neal (1994)\n", "\n", "- Rest of Neal (1994)\n", "\n", "- All of MacKay (1992)" - ] + ], + "id": "c27bf95d-a480-4130-8600-70d94f305aa8" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Thanks!\n", - "-------\n", + "## Thanks!\n", "\n", "For more information on these subjects and more you might want to check\n", "the following resources.\n", @@ -5485,15 +6203,16 @@ " Page](http://www.theguardian.com/profile/neil-lawrence)\n", "- blog:\n", " [http://inverseprobability.com](http://inverseprobability.com/blog.html)" - ] + ], + "id": "76c3eb82-6609-4010-a909-bd9e0b0fb28d" }, { "cell_type": "markdown", "metadata": {}, "source": [ - "References\n", - "----------" - ] + "## References" + ], + "id": "0c3a5da3-84d5-44ae-9b35-d25f85639795" }, { "cell_type": "markdown", @@ -5508,10 +6227,11 @@ "bound constrained optimization. SIAM Journal on Scientific and\n", "Statistical Computing 16, 1190–1208.\n", "\n", - "Cho, Y., Saul, L.K., 2009. Kernel methods for deep learning, in: Bengio,\n", - "Y., Schuurmans, D., Lafferty, J.D., Williams, C.K.I., Culotta, A.\n", - "(Eds.), Advances in Neural Information Processing Systems 22. Curran\n", - "Associates, Inc., pp. 342–350.\n", + "Cho, Y., Saul, L.K., 2009. [Kernel methods for deep\n", + "learning](http://papers.nips.cc/paper/3628-kernel-methods-for-deep-learning.pdf),\n", + "in: Bengio, Y., Schuurmans, D., Lafferty, J.D., Williams, C.K.I.,\n", + "Culotta, A. (Eds.), Advances in Neural Information Processing Systems\n", + "22. Curran Associates, Inc., pp. 342–350.\n", "\n", "Gething, P.W., Noor, A.M., Gikandi, P.W., Ogara, E.A.A., Hay, S.I.,\n", "Nixon, M.S., Snow, R.W., Atkinson, P.M., 2006. Improving imperfect data\n", @@ -5519,8 +6239,9 @@ "geostatistics. PLoS Medicine 3.\n", "\n", "\n", - "Ioffe, S., Szegedy, C., 2015. Batch normalization: Accelerating deep\n", - "network training by reducing internal covariate shift, in: Bach, F.,\n", + "Ioffe, S., Szegedy, C., 2015. [Batch normalization: Accelerating deep\n", + "network training by reducing internal covariate\n", + "shift](http://proceedings.mlr.press/v37/ioffe15.html), in: Bach, F.,\n", "Blei, D. (Eds.), Proceedings of the 32nd International Conference on\n", "Machine Learning, Proceedings of Machine Learning Research. PMLR, Lille,\n", "France, pp. 448–456.\n", @@ -5548,7 +6269,8 @@ "Tipping, M.E., Bishop, C.M., 1999. Probabilistic principal component\n", "analysis. 
Journal of the Royal Statistical Society, B 6, 611–622.\n", "" - ] + ], + "id": "c44b0836-5102-4b02-ac1c-107b9e83c56e" } ], "nbformat": 4, diff --git a/slides/01-what-is-machine-learning.slides.html b/slides/01-what-is-machine-learning.slides.html index 9a74c27..8ddb65e 100644 --- a/slides/01-what-is-machine-learning.slides.html +++ b/slides/01-what-is-machine-learning.slides.html @@ -8,7 +8,7 @@ - + - - + + - + -\[\newcommand{\tk}[1]{} -\newcommand{\Amatrix}{\mathbf{A}} -\newcommand{\KL}[2]{\text{KL}\left( #1\,\|\,#2 \right)} -\newcommand{\Kaast}{\kernelMatrix_{\mathbf{ \ast}\mathbf{ \ast}}} -\newcommand{\Kastu}{\kernelMatrix_{\mathbf{ \ast} \inducingVector}} -\newcommand{\Kff}{\kernelMatrix_{\mappingFunctionVector \mappingFunctionVector}} -\newcommand{\Kfu}{\kernelMatrix_{\mappingFunctionVector \inducingVector}} -\newcommand{\Kuast}{\kernelMatrix_{\inducingVector \bf\ast}} -\newcommand{\Kuf}{\kernelMatrix_{\inducingVector \mappingFunctionVector}} -\newcommand{\Kuu}{\kernelMatrix_{\inducingVector \inducingVector}} -\newcommand{\Kuui}{\Kuu^{-1}} -\newcommand{\Qaast}{\mathbf{Q}_{\bf \ast \ast}} -\newcommand{\Qastf}{\mathbf{Q}_{\ast \mappingFunction}} -\newcommand{\Qfast}{\mathbf{Q}_{\mappingFunctionVector \bf \ast}} -\newcommand{\Qff}{\mathbf{Q}_{\mappingFunctionVector \mappingFunctionVector}} -\newcommand{\aMatrix}{\mathbf{A}} -\newcommand{\aScalar}{a} -\newcommand{\aVector}{\mathbf{a}} -\newcommand{\acceleration}{a} -\newcommand{\bMatrix}{\mathbf{B}} -\newcommand{\bScalar}{b} -\newcommand{\bVector}{\mathbf{b}} -\newcommand{\basisFunc}{\phi} -\newcommand{\basisFuncVector}{\boldsymbol{ \basisFunc}} -\newcommand{\basisFunction}{\phi} -\newcommand{\basisLocation}{\mu} -\newcommand{\basisMatrix}{\boldsymbol{ \Phi}} -\newcommand{\basisScalar}{\basisFunction} -\newcommand{\basisVector}{\boldsymbol{ \basisFunction}} -\newcommand{\activationFunction}{\phi} -\newcommand{\activationMatrix}{\boldsymbol{ \Phi}} -\newcommand{\activationScalar}{\basisFunction} -\newcommand{\activationVector}{\boldsymbol{ \basisFunction}} -\newcommand{\bigO}{\mathcal{O}} -\newcommand{\binomProb}{\pi} -\newcommand{\cMatrix}{\mathbf{C}} -\newcommand{\cbasisMatrix}{\hat{\boldsymbol{ \Phi}}} -\newcommand{\cdataMatrix}{\hat{\dataMatrix}} -\newcommand{\cdataScalar}{\hat{\dataScalar}} -\newcommand{\cdataVector}{\hat{\dataVector}} -\newcommand{\centeredKernelMatrix}{\mathbf{ \MakeUppercase{\centeredKernelScalar}}} -\newcommand{\centeredKernelScalar}{b} -\newcommand{\centeredKernelVector}{\centeredKernelScalar} -\newcommand{\centeringMatrix}{\mathbf{H}} -\newcommand{\chiSquaredDist}[2]{\chi_{#1}^{2}\left(#2\right)} -\newcommand{\chiSquaredSamp}[1]{\chi_{#1}^{2}} -\newcommand{\conditionalCovariance}{\boldsymbol{ \Sigma}} -\newcommand{\coregionalizationMatrix}{\mathbf{B}} -\newcommand{\coregionalizationScalar}{b} -\newcommand{\coregionalizationVector}{\mathbf{ \coregionalizationScalar}} -\newcommand{\covDist}[2]{\text{cov}_{#2}\left(#1\right)} -\newcommand{\covSamp}[1]{\text{cov}\left(#1\right)} -\newcommand{\covarianceScalar}{c} -\newcommand{\covarianceVector}{\mathbf{ \covarianceScalar}} -\newcommand{\covarianceMatrix}{\mathbf{C}} -\newcommand{\covarianceMatrixTwo}{\boldsymbol{ \Sigma}} -\newcommand{\croupierScalar}{s} -\newcommand{\croupierVector}{\mathbf{ \croupierScalar}} -\newcommand{\croupierMatrix}{\mathbf{ \MakeUppercase{\croupierScalar}}} -\newcommand{\dataDim}{p} -\newcommand{\dataIndex}{i} -\newcommand{\dataIndexTwo}{j} -\newcommand{\dataMatrix}{\mathbf{Y}} -\newcommand{\dataScalar}{y} 
-\newcommand{\dataSet}{\mathcal{D}} -\newcommand{\dataStd}{\sigma} -\newcommand{\dataVector}{\mathbf{ \dataScalar}} -\newcommand{\decayRate}{d} -\newcommand{\degreeMatrix}{\mathbf{ \MakeUppercase{\degreeScalar}}} -\newcommand{\degreeScalar}{d} -\newcommand{\degreeVector}{\mathbf{ \degreeScalar}} -\newcommand{\diag}[1]{\text{diag}\left(#1\right)} -\newcommand{\diagonalMatrix}{\mathbf{D}} -\newcommand{\diff}[2]{\frac{\text{d}#1}{\text{d}#2}} -\newcommand{\diffTwo}[2]{\frac{\text{d}^2#1}{\text{d}#2^2}} -\newcommand{\displacement}{x} -\newcommand{\displacementVector}{\textbf{\displacement}} -\newcommand{\distanceMatrix}{\mathbf{ \MakeUppercase{\distanceScalar}}} -\newcommand{\distanceScalar}{d} -\newcommand{\distanceVector}{\mathbf{ \distanceScalar}} -\newcommand{\eigenvaltwo}{\ell} -\newcommand{\eigenvaltwoMatrix}{\mathbf{L}} -\newcommand{\eigenvaltwoVector}{\mathbf{l}} -\newcommand{\eigenvalue}{\lambda} -\newcommand{\eigenvalueMatrix}{\boldsymbol{ \Lambda}} -\newcommand{\eigenvalueVector}{\boldsymbol{ \lambda}} -\newcommand{\eigenvector}{\mathbf{ \eigenvectorScalar}} -\newcommand{\eigenvectorMatrix}{\mathbf{U}} -\newcommand{\eigenvectorScalar}{u} -\newcommand{\eigenvectwo}{\mathbf{v}} -\newcommand{\eigenvectwoMatrix}{\mathbf{V}} -\newcommand{\eigenvectwoScalar}{v} -\newcommand{\entropy}[1]{\mathcal{H}\left(#1\right)} -\newcommand{\errorFunction}{E} -\newcommand{\expDist}[2]{\left<#1\right>_{#2}} -\newcommand{\expSamp}[1]{\left<#1\right>} -\newcommand{\expectation}[1]{\left\langle #1 \right\rangle } -\newcommand{\expectationDist}[2]{\left\langle #1 \right\rangle _{#2}} -\newcommand{\expectedDistanceMatrix}{\mathcal{D}} -\newcommand{\eye}{\mathbf{I}} -\newcommand{\fantasyDim}{r} -\newcommand{\fantasyMatrix}{\mathbf{ \MakeUppercase{\fantasyScalar}}} -\newcommand{\fantasyScalar}{z} -\newcommand{\fantasyVector}{\mathbf{ \fantasyScalar}} -\newcommand{\featureStd}{\varsigma} -\newcommand{\gammaCdf}[3]{\mathcal{GAMMA CDF}\left(#1|#2,#3\right)} -\newcommand{\gammaDist}[3]{\mathcal{G}\left(#1|#2,#3\right)} -\newcommand{\gammaSamp}[2]{\mathcal{G}\left(#1,#2\right)} -\newcommand{\gaussianDist}[3]{\mathcal{N}\left(#1|#2,#3\right)} -\newcommand{\gaussianSamp}[2]{\mathcal{N}\left(#1,#2\right)} -\newcommand{\given}{|} -\newcommand{\half}{\frac{1}{2}} -\newcommand{\heaviside}{H} -\newcommand{\hiddenMatrix}{\mathbf{ \MakeUppercase{\hiddenScalar}}} -\newcommand{\hiddenScalar}{h} -\newcommand{\hiddenVector}{\mathbf{ \hiddenScalar}} -\newcommand{\identityMatrix}{\eye} -\newcommand{\inducingInputScalar}{z} -\newcommand{\inducingInputVector}{\mathbf{ \inducingInputScalar}} -\newcommand{\inducingInputMatrix}{\mathbf{Z}} -\newcommand{\inducingScalar}{u} -\newcommand{\inducingVector}{\mathbf{ \inducingScalar}} -\newcommand{\inducingMatrix}{\mathbf{U}} -\newcommand{\inlineDiff}[2]{\text{d}#1/\text{d}#2} -\newcommand{\inputDim}{q} -\newcommand{\inputMatrix}{\mathbf{X}} -\newcommand{\inputScalar}{x} -\newcommand{\inputSpace}{\mathcal{X}} -\newcommand{\inputVals}{\inputVector} -\newcommand{\inputVector}{\mathbf{ \inputScalar}} -\newcommand{\iterNum}{k} -\newcommand{\kernel}{\kernelScalar} -\newcommand{\kernelMatrix}{\mathbf{K}} -\newcommand{\kernelScalar}{k} -\newcommand{\kernelVector}{\mathbf{ \kernelScalar}} -\newcommand{\kff}{\kernelScalar_{\mappingFunction \mappingFunction}} -\newcommand{\kfu}{\kernelVector_{\mappingFunction \inducingScalar}} -\newcommand{\kuf}{\kernelVector_{\inducingScalar \mappingFunction}} -\newcommand{\kuu}{\kernelVector_{\inducingScalar \inducingScalar}} 
-\newcommand{\lagrangeMultiplier}{\lambda} -\newcommand{\lagrangeMultiplierMatrix}{\boldsymbol{ \Lambda}} -\newcommand{\lagrangian}{L} -\newcommand{\laplacianFactor}{\mathbf{ \MakeUppercase{\laplacianFactorScalar}}} -\newcommand{\laplacianFactorScalar}{m} -\newcommand{\laplacianFactorVector}{\mathbf{ \laplacianFactorScalar}} -\newcommand{\laplacianMatrix}{\mathbf{L}} -\newcommand{\laplacianScalar}{\ell} -\newcommand{\laplacianVector}{\mathbf{ \ell}} -\newcommand{\latentDim}{q} -\newcommand{\latentDistanceMatrix}{\boldsymbol{ \Delta}} -\newcommand{\latentDistanceScalar}{\delta} -\newcommand{\latentDistanceVector}{\boldsymbol{ \delta}} -\newcommand{\latentForce}{f} -\newcommand{\latentFunction}{u} -\newcommand{\latentFunctionVector}{\mathbf{ \latentFunction}} -\newcommand{\latentFunctionMatrix}{\mathbf{ \MakeUppercase{\latentFunction}}} -\newcommand{\latentIndex}{j} -\newcommand{\latentScalar}{z} -\newcommand{\latentVector}{\mathbf{ \latentScalar}} -\newcommand{\latentMatrix}{\mathbf{Z}} -\newcommand{\learnRate}{\eta} -\newcommand{\lengthScale}{\ell} -\newcommand{\rbfWidth}{\ell} -\newcommand{\likelihoodBound}{\mathcal{L}} -\newcommand{\likelihoodFunction}{L} -\newcommand{\locationScalar}{\mu} -\newcommand{\locationVector}{\boldsymbol{ \locationScalar}} -\newcommand{\locationMatrix}{\mathbf{M}} -\newcommand{\variance}[1]{\text{var}\left( #1 \right)} -\newcommand{\mappingFunction}{f} -\newcommand{\mappingFunctionMatrix}{\mathbf{F}} -\newcommand{\mappingFunctionTwo}{g} -\newcommand{\mappingFunctionTwoMatrix}{\mathbf{G}} -\newcommand{\mappingFunctionTwoVector}{\mathbf{ \mappingFunctionTwo}} -\newcommand{\mappingFunctionVector}{\mathbf{ \mappingFunction}} -\newcommand{\scaleScalar}{s} -\newcommand{\mappingScalar}{w} -\newcommand{\mappingVector}{\mathbf{ \mappingScalar}} -\newcommand{\mappingMatrix}{\mathbf{W}} -\newcommand{\mappingScalarTwo}{v} -\newcommand{\mappingVectorTwo}{\mathbf{ \mappingScalarTwo}} -\newcommand{\mappingMatrixTwo}{\mathbf{V}} -\newcommand{\maxIters}{K} -\newcommand{\meanMatrix}{\mathbf{M}} -\newcommand{\meanScalar}{\mu} -\newcommand{\meanTwoMatrix}{\mathbf{M}} -\newcommand{\meanTwoScalar}{m} -\newcommand{\meanTwoVector}{\mathbf{ \meanTwoScalar}} -\newcommand{\meanVector}{\boldsymbol{ \meanScalar}} -\newcommand{\mrnaConcentration}{m} -\newcommand{\naturalFrequency}{\omega} -\newcommand{\neighborhood}[1]{\mathcal{N}\left( #1 \right)} -\newcommand{\neilurl}{http://inverseprobability.com/} -\newcommand{\noiseMatrix}{\boldsymbol{ E}} -\newcommand{\noiseScalar}{\epsilon} -\newcommand{\noiseVector}{\boldsymbol{ \epsilon}} -\newcommand{\norm}[1]{\left\Vert #1 \right\Vert} -\newcommand{\normalizedLaplacianMatrix}{\hat{\mathbf{L}}} -\newcommand{\normalizedLaplacianScalar}{\hat{\ell}} -\newcommand{\normalizedLaplacianVector}{\hat{\mathbf{ \ell}}} -\newcommand{\numActive}{m} -\newcommand{\numBasisFunc}{m} -\newcommand{\numComponents}{m} -\newcommand{\numComps}{K} -\newcommand{\numData}{n} -\newcommand{\numFeatures}{K} -\newcommand{\numHidden}{h} -\newcommand{\numInducing}{m} -\newcommand{\numLayers}{\ell} -\newcommand{\numNeighbors}{K} -\newcommand{\numSequences}{s} -\newcommand{\numSuccess}{s} -\newcommand{\numTasks}{m} -\newcommand{\numTime}{T} -\newcommand{\numTrials}{S} -\newcommand{\outputIndex}{j} -\newcommand{\paramVector}{\boldsymbol{ \theta}} -\newcommand{\parameterMatrix}{\boldsymbol{ \Theta}} -\newcommand{\parameterScalar}{\theta} -\newcommand{\parameterVector}{\boldsymbol{ \parameterScalar}} -\newcommand{\partDiff}[2]{\frac{\partial#1}{\partial#2}} 
-\newcommand{\precisionScalar}{j} -\newcommand{\precisionVector}{\mathbf{ \precisionScalar}} -\newcommand{\precisionMatrix}{\mathbf{J}} -\newcommand{\pseudotargetScalar}{\widetilde{y}} -\newcommand{\pseudotargetVector}{\mathbf{ \pseudotargetScalar}} -\newcommand{\pseudotargetMatrix}{\mathbf{ \widetilde{Y}}} -\newcommand{\rank}[1]{\text{rank}\left(#1\right)} -\newcommand{\rayleighDist}[2]{\mathcal{R}\left(#1|#2\right)} -\newcommand{\rayleighSamp}[1]{\mathcal{R}\left(#1\right)} -\newcommand{\responsibility}{r} -\newcommand{\rotationScalar}{r} -\newcommand{\rotationVector}{\mathbf{ \rotationScalar}} -\newcommand{\rotationMatrix}{\mathbf{R}} -\newcommand{\sampleCovScalar}{s} -\newcommand{\sampleCovVector}{\mathbf{ \sampleCovScalar}} -\newcommand{\sampleCovMatrix}{\mathbf{s}} -\newcommand{\scalarProduct}[2]{\left\langle{#1},{#2}\right\rangle} -\newcommand{\sign}[1]{\text{sign}\left(#1\right)} -\newcommand{\sigmoid}[1]{\sigma\left(#1\right)} -\newcommand{\singularvalue}{\ell} -\newcommand{\singularvalueMatrix}{\mathbf{L}} -\newcommand{\singularvalueVector}{\mathbf{l}} -\newcommand{\sorth}{\mathbf{u}} -\newcommand{\spar}{\lambda} -\newcommand{\trace}[1]{\text{tr}\left(#1\right)} -\newcommand{\BasalRate}{B} -\newcommand{\DampingCoefficient}{C} -\newcommand{\DecayRate}{D} -\newcommand{\Displacement}{X} -\newcommand{\LatentForce}{F} -\newcommand{\Mass}{M} -\newcommand{\Sensitivity}{S} -\newcommand{\basalRate}{b} -\newcommand{\dampingCoefficient}{c} -\newcommand{\mass}{m} -\newcommand{\sensitivity}{s} -\newcommand{\springScalar}{\kappa} -\newcommand{\springVector}{\boldsymbol{ \kappa}} -\newcommand{\springMatrix}{\boldsymbol{ \mathcal{K}}} -\newcommand{\tfConcentration}{p} -\newcommand{\tfDecayRate}{\delta} -\newcommand{\tfMrnaConcentration}{f} -\newcommand{\tfVector}{\mathbf{ \tfConcentration}} -\newcommand{\velocity}{v} -\newcommand{\sufficientStatsScalar}{g} -\newcommand{\sufficientStatsVector}{\mathbf{ \sufficientStatsScalar}} -\newcommand{\sufficientStatsMatrix}{\mathbf{G}} -\newcommand{\switchScalar}{s} -\newcommand{\switchVector}{\mathbf{ \switchScalar}} -\newcommand{\switchMatrix}{\mathbf{S}} -\newcommand{\tr}[1]{\text{tr}\left(#1\right)} -\newcommand{\loneNorm}[1]{\left\Vert #1 \right\Vert_1} -\newcommand{\ltwoNorm}[1]{\left\Vert #1 \right\Vert_2} -\newcommand{\onenorm}[1]{\left\vert#1\right\vert_1} -\newcommand{\twonorm}[1]{\left\Vert #1 \right\Vert} -\newcommand{\vScalar}{v} -\newcommand{\vVector}{\mathbf{v}} -\newcommand{\vMatrix}{\mathbf{V}} -\newcommand{\varianceDist}[2]{\text{var}_{#2}\left( #1 \right)} -\newcommand{\vecb}[1]{\left(#1\right):} -\newcommand{\weightScalar}{w} -\newcommand{\weightVector}{\mathbf{ \weightScalar}} -\newcommand{\weightMatrix}{\mathbf{W}} -\newcommand{\weightedAdjacencyMatrix}{\mathbf{A}} -\newcommand{\weightedAdjacencyScalar}{a} -\newcommand{\weightedAdjacencyVector}{\mathbf{ \weightedAdjacencyScalar}} -\newcommand{\onesVector}{\mathbf{1}} -\newcommand{\zerosVector}{\mathbf{0}} -\]

What is Machine Learning?

-

Neil D. Lawrence

+

Neil +D. Lawrence

-

Data Science Africa Summer School, Addis Ababa, Ethiopia

+

Data Science Africa Summer +School, Addis Ababa, Ethiopia

@@ -409,9 +122,6 @@

What is Machine Learning?

-
@@ -422,29 +132,49 @@

- +
-

Data Science Africa is a bottom up initiative for capacity building in data science, machine learning and AI on the African continent

+

Data Science Africa is a bottom up initiative for capacity building +in data science, machine learning and AI on the African continent

+
+ + +
+
+ +
+
+

+
- +
-
+

Example: Prediction of Malaria Incidence in Uganda

@@ -458,7 +188,7 @@

Example: Prediction of Malaria Incidence in Uganda

Martin Mubangizi - + @@ -469,9 +199,9 @@

Example: Prediction of Malaria Incidence in Uganda

-Ricardo Andrade Pacheco +Ricardo Andrade Pacheco - + @@ -484,12 +214,15 @@

Example: Prediction of Malaria Incidence in Uganda

John Quinn - +
    -
  • Work with Ricardo Andrade Pacheco, John Quinn and Martin Mubaganzi (Makerere University, Uganda)
  • +
  • Work with Ricardo Andrade Pacheco, John Quinn and Martin Mubangizi +(Makerere University, Uganda)
  • See AI-DEV Group.
  • +
  • See UN +Global Pulse Disease Outbreaks Site
@@ -497,20 +230,26 @@

Malaria Prediction in Uganda

- +
-

(Andrade-Pacheco et al., 2014; Mubangizi et al., 2014)

+
+(Andrade-Pacheco +et al., 2014; Mubangizi et al., 2014) +

Tororo District

- + @@ -518,24 +257,26 @@

Tororo District

The Tororo district, where the sentinel site, Nagongera, is located. -
+

Malaria Prediction in Nagongera (Sentinel Site)

- +

Mubende District

- + @@ -548,7 +289,7 @@

Malaria Prediction in Uganda

- +
@@ -560,20 +301,22 @@

Malaria Prediction in Uganda

GP School at Makerere

-
- +
+

Kabarole District

- + @@ -586,12 +329,14 @@

Early Warning System

- +
@@ -599,12 +344,13 @@

Early Warning Systems

- +
@@ -617,66 +363,87 @@

Rise of Machine Learning

  • Fundamentally dependent on models
  • \[ -\text{data} + \text{model} \stackrel{\text{compute}}{\rightarrow} \text{prediction} +\text{data} + \text{model} \stackrel{\text{compute}}{\rightarrow} +\text{prediction} \]

    Data Revolution

    - +

    Supply Chain

    -
    - +
    +

    Cromford

    -
    - +
    +
    -
    +

    -
    - +
    +

    Deep Freeze

    -
    - +
    +
    @@ -688,13 +455,33 @@

    Deep Freeze

    Deep Freeze

    -
    - +
    + +
    +
    +
    + +
    +
    +

    +
    +
    +
    +
    @@ -717,25 +504,30 @@

    For Africa

    Data Driven

      -
    • Machine Learning: Replicate Processes through direct use of data.
    • +
    • Machine Learning: Replicate Processes through direct use of +data.
    • Aim to emulate cognitive processes through the use of data.
    • -
    • Use data to provide new approaches in control and optimization that should allow for emulation of human motor skills.
    • +
    • Use data to provide new approaches in control and optimization that +should allow for emulation of human motor skills.

    Process Emulation

    • Key idea: emulate the process as a mathematical function.
    • -
    • Each function has a set of parameters which control its behaviour.
    • -
    • Learning is the process of changing these parameters to change the shape of the function
    • -
    • Choice of which class of mathematical functions we use is a vital component of our model.
    • +
    • Each function has a set of parameters which control its +behaviour.
    • +
    • Learning is the process of changing these parameters to +change the shape of the function
    • +
    • Choice of which class of mathematical functions we use is a vital +component of our model.

    Kapchorwa District

    - + @@ -750,16 +542,18 @@

    Olympic Marathon Data

    • Gold medal times for Olympic Marathon since 1896.
    • -
    • Marathons before 1924 didn’t have a standardised distance.
    • +
    • Marathons before 1924 didn’t have a standardized distance.
    • Present results using pace per km.
    • -
    • In 1904 Marathon was badly organised leading to very slow times.
    • +
    • In 1904 Marathon was badly organized leading to very slow +times.
    -
    - +
    +
    -Image from Wikimedia Commons http://bit.ly/16kMKHQ +Image from Wikimedia Commons http://bit.ly/16kMKHQ @@ -768,202 +562,95 @@

    Olympic Marathon Data

    Olympic Marathon Data

    - + -
    -

    Polynomial Fits to Olympic Data

    - -

    +
    +

Polynomial Fits to Olympic Marathon Data

    +
      +
    • Fit linear model with polynomial basis to marathon data.
    • +
• Try different numbers of basis functions (different degrees of +polynomial).
    • +
    • Check the quality of fit.
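A minimal sketch of that procedure, assuming the marathon data are already loaded as 1-D numpy arrays x (year) and y (pace, min/km); the rescaling by 1956 and 120 and the helper name polynomial_basis are illustrative choices, not part of the original material.

import numpy as np

def polynomial_basis(x, degree):
    # Columns 1, x, x**2, ..., x**degree.
    return np.vander(x, degree + 1, increasing=True)

Phi = polynomial_basis((x - 1956.) / 120., degree=2)  # rescale years to keep the fit well conditioned
w, *_ = np.linalg.lstsq(Phi, y, rcond=None)           # least-squares estimate of the weights
sum_squares = ((y - Phi @ w)**2).sum()                # quality of fit on the training data

Increasing the degree adds columns to the basis matrix, which is how the higher-order fits on the following slides are obtained.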
    • +
    -
    -

    -
    - +
    +

    Linear Fit

    +

    \[f(x, \mathbf{ w}) = w_0 + +w_1x\]

    +
    +
    +
    -
    -
    -

    -
    - -
    +
    -
    -

    -
    - +
    +

    Cubic Fit

    +

    \[f(x, \mathbf{ w}) = w_0 + w_1 x+ w_2 x^2 ++ w_3 x^3\]

    +
    +
    +
    -
    -
    -

    -
    - -
    +
    -
    -

    -
    - +
    +

    9th Degree Polynomial Fit

    +

    \[f(x, \mathbf{ w}) = w_0 + w_1 x+ w_2 x^2 ++ \dots + w_9 x^9\]

    +
    +
    +
    -
    -
    -

    -
    - -
    +
    -
    -

    -
    - +
    +

    16th Degree Polynomial Fit

    +

    \[f(x, \mathbf{ w}) = w_0 + w_1 x+ w_2 x^2 ++ \dots + w_{16} x^{16}\]

    +
    +
    +
    -
    -
    -

    -
    - -
    +
    -
    -

    -
    - +
    +

    26th Degree Polynomial Fit

    +

    \[f(x, \mathbf{ w}) = w_0 + w_1 x+ w_2 x^2 ++ \dots + w_{26} x^{26}\]

    +
    +
    +
    -
    -
    -

    -
    - -
    +

    What does Machine Learning do?

    @@ -980,40 +667,62 @@

    What does Machine Learning do?

    -
    +

    Codify Through Mathematical Functions

    • How does machine learning work?
    • Jumper (jersey/sweater) purchase with logistic regression
    -

    \[ \text{odds} = \frac{p(\text{bought})}{p(\text{not bought})} \]

    -

    \[ \log \text{odds} = \beta_0 + \beta_1 \text{age} + \beta_2 \text{latitude}.\]

    +

    \[ \text{odds} = +\frac{p(\text{bought})}{p(\text{not bought})} \]

    +

    \[ \log \text{odds} = w_0 + w_1 +\text{age} + w_2 \text{latitude}.\]

    +
    +
    +

    Sigmoid Function

    +
    +
    + + +
    +
    +
    -
    +

    Codify Through Mathematical Functions

    • How does machine learning work?
    • Jumper (jersey/sweater) purchase with logistic regression
    -

    \[ p(\text{bought}) = \sigma\left(\beta_0 + \beta_1 \text{age} + \beta_2 \text{latitude}\right).\]

    +

    \[ p(\text{bought}) = \sigma\left(w_0 + +w_1 \text{age} + w_2 \text{latitude}\right).\]

    -
    +

    Codify Through Mathematical Functions

    • How does machine learning work?
    • Jumper (jersey/sweater) purchase with logistic regression
    -

    \[ p(\text{bought}) = \sigma\left(\boldsymbol{\beta}^\top \mathbf{ x}\right).\]

    +

    \[ p(\text{bought}) += \sigma\left(\mathbf{ w}^\top \mathbf{ x}\right).\]

    -
    +

    Codify Through Mathematical Functions

    • How does machine learning work?
    • Jumper (jersey/sweater) purchase with logistic regression
    -

    \[ y= f\left(\mathbf{ x}, \boldsymbol{\beta}\right).\]

    +

    \[ y= f\left(\mathbf{ x}, \mathbf{ +w}\right).\]

    -

    We call \(f(\cdot)\) the prediction function.

    +

    We call \(f(\cdot)\) the +prediction function.
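As a concrete, purely illustrative instance of such a prediction function, the logistic regression above can be written directly in numpy; the weight and feature values below are invented for the example.

import numpy as np

def sigmoid(f):
    return 1.0 / (1.0 + np.exp(-f))

def predict(x, w):
    # Prediction function: p(bought) = sigma(w^T x).
    return sigmoid(w @ x)

w = np.array([0.1, -0.05, 0.02])  # w_0 (bias), w_1 (age), w_2 (latitude): made-up values
x = np.array([1.0, 35.0, 52.2])   # 1 for the bias term, then age and latitude
p_bought = predict(x, w)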

    @@ -1021,47 +730,62 @@

    Fit to Data

    • Use an objective function
    -

    \[E(\boldsymbol{\beta}, \mathbf{Y}, \mathbf{X})\]

    +

    \[E(\mathbf{ w}, \mathbf{Y}, +\mathbf{X})\]

      -
    • E.g. least squares \[E(\boldsymbol{\beta}, \mathbf{Y}, \mathbf{X}) = \sum_{i=1}^n\left(y_i - f(\mathbf{ x}_i, \boldsymbol{\beta})\right)^2.\]
    • +
    • E.g. least squares \[E(\mathbf{ w}, +\mathbf{Y}, \mathbf{X}) = \sum_{i=1}^n\left(y_i - f(\mathbf{ x}_i, +\mathbf{ w})\right)^2.\]
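A sketch of how such an objective could look in code, assuming f(x_i, w) is a prediction function like the one sketched above (the name f and the data layout are assumptions for the example).

def objective(f, w, X, y):
    # Least squares: sum of squared differences between labels and predictions.
    return sum((y_i - f(x_i, w))**2 for x_i, y_i in zip(X, y))

Learning then amounts to choosing w to make this number small.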

    Two Components

      -
    • Prediction function, \(f(\cdot)\)
    • -
    • Objective function, \(E(\cdot)\)
    • +
    • Prediction function, \(f(\cdot)\)
    • +
    • Objective function, \(E(\cdot)\)
    -
    +
    +

    Prediction vs Interpretation

    +

    \[ p(\text{bought}) = \sigma\left(w_0 + +w_1 \text{age} + w_2 \text{latitude}\right).\]

    +

    \[ p(\text{bought}) = \sigma\left(\beta_0 ++ \beta_1 \text{age} + \beta_2 \text{latitude}\right).\]

    +
    +

    -

    \[\text{data} + \text{model} \stackrel{\text{compute}}{\rightarrow} \text{prediction}\]

    +

    \[\text{data} + \text{model} +\stackrel{\text{compute}}{\rightarrow} \text{prediction}\]

    From Model to Decision

    -
    +

    - +
    -\[\text{data} + \text{model} \stackrel{\text{compute}}{\rightarrow} \text{prediction}\] +\[\text{data} + \text{model} +\stackrel{\text{compute}}{\rightarrow} \text{prediction}\] - +
    -
    +

    Artificial Intelligence and Data Science

    • AI aims to equip computers with human capabilities @@ -1082,7 +806,9 @@

      Supervised Learning for AI

    • Generate large labelled data set from humans.
    • Use supervised learning to emulate that data.
        -
      • E.g. ImageNet Russakovsky et al. (2015)
      • +
      • E.g. ImageNet Russakovsky et al. +(2015)
  • Significant advances due to deep learning @@ -1095,43 +821,62 @@

    Supervised Learning for AI

    Data Science

    • Arises from happenstance data.
    • -
    • Differs from statistics in that the question comes after data collection.
    • +
    • Differs from statistics in that the question comes after +data collection.
  • -
    +

    Neural Networks and Prediction Functions

      -
    • adaptive non-linear function models inspired by simple neuron models (McCulloch and Pitts, 1943)
    • +
    • adaptive non-linear function models inspired by simple neuron models +(McCulloch and +Pitts, 1943)
    • have become popular because of their ability to model data.
    • can be composed to form highly complex functions
    • start by focussing on one hidden layer
    -
    +

    Prediction Function of One Hidden Layer

    \[ -f(\mathbf{ x}) = \left.\mathbf{ w}^{(2)}\right.^\top \boldsymbol{ \phi}(\mathbf{W}_{1}, \mathbf{ x}) +f(\mathbf{ x}) = \left.\mathbf{ w}^{(2)}\right.^\top \boldsymbol{ +\phi}(\mathbf{W}_{1}, \mathbf{ x}) \]

    -

    \(f(\cdot)\) is a scalar function with vector inputs,

    -

    \(\boldsymbol{ \phi}(\cdot)\) is a vector function with vector inputs.

    +

    \(f(\cdot)\) is a scalar function +with vector inputs,

    +

    \(\boldsymbol{ \phi}(\cdot)\) is a +vector function with vector inputs.

      -
    • dimensionality of the vector function is known as the number of hidden units, or the number of neurons.

    • -
    • elements of \(\boldsymbol{ \phi}(\cdot)\) are the activation function of the neural network

    • -
    • elements of \(\mathbf{W}_{1}\) are the parameters of the activation functions.

    • +
    • dimensionality of the vector function is known as the number of +hidden units, or the number of neurons.

    • +
    • elements of \(\boldsymbol{ +\phi}(\cdot)\) are the activation function of the neural +network

    • +
    • elements of \(\mathbf{W}_{1}\) +are the parameters of the activation functions.
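A minimal numpy sketch of this one-hidden-layer prediction function, assuming tanh as the activation; the dimensions and random parameter values are illustrative only.

import numpy as np

def prediction(x, W1, w2):
    phi = np.tanh(W1.T @ x)  # vector of hidden unit activations, phi(W1, x)
    return w2 @ phi          # f(x) = w2^T phi(W1, x), a scalar

q, h = 4, 10                # input dimension and number of hidden units (assumed)
W1 = np.random.randn(q, h)  # parameters of the activation functions
w2 = np.random.randn(h)     # output weights
f_x = prediction(np.random.randn(q), W1, w2)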

    Relations with Classical Statistics

      -
    • In statistics activation functions are known as basis functions.

    • -
    • would think of this as a linear model: not linear predictions, linear in the parameters

    • -
    • \(\mathbf{ w}_{1}\) are static parameters.

    • +
    • In statistics activation functions are known as basis +functions.

    • +
    • would think of this as a linear model: not linear +predictions, linear in the parameters

    • +
    • \(\mathbf{ w}_{1}\) are +static parameters.

    Adaptive Basis Functions

      -
    • In machine learning we optimize \(\mathbf{W}_{1}\) as well as \(\mathbf{W}_{2}\) (which would normally be denoted in statistics by \(\boldsymbol{\beta}\)).
    • +
    • In machine learning we optimize \(\mathbf{W}_{1}\) as well as \(\mathbf{W}_{2}\) (which would normally be +denoted in statistics by \(\boldsymbol{\beta}\)).
    @@ -1182,39 +927,64 @@

    Introduction to Classification

    Classification

      -
    • Wake word classification (Global Pulse Project).

    • -
    • Breakthrough in 2012 with ImageNet result of Alex Krizhevsky, Ilya Sutskever and Geoff Hinton

    • -
    • We are given a data set containing ‘inputs’, \(\mathbf{X}\) and ‘targets’, \(\mathbf{ y}\).

    • -
    • Each data point consists of an input vector \(\mathbf{ x}_i\) and a class label, \(y_i\).

    • -
    • For binary classification assume \(y_i\) should be either \(1\) (yes) or \(-1\) (no).

    • +
    • Wake word classification (Global Pulse +Project).

    • +
    • Breakthrough in 2012 with ImageNet result of Alex +Krizhevsky, Ilya Sutskever and Geoff Hinton

    • +
    • We are given a data set containing ‘inputs’, \(\mathbf{X}\) and ‘targets’, \(\mathbf{ y}\).

    • +
    • Each data point consists of an input vector \(\mathbf{ x}_i\) and a class label, \(y_i\).

    • +
    • For binary classification assume \(y_i\) should be either \(1\) (yes) or \(-1\) (no).

    • Input vector can be thought of as features.

    Discrete Probability

      -
    • Algorithms based on prediction function and objective function.
    • -
    • For regression the codomain of the functions, \(f(\mathbf{X})\) was the real numbers or sometimes real vectors.
    • -
    • In classification we are given an input vector, \(\mathbf{ x}\), and an associated label, \(y\) which either takes the value \(-1\) or \(1\).
    • +
    • Algorithms based on prediction function and +objective function.
    • +
    • For regression the codomain of the functions, \(f(\mathbf{X})\) was the real numbers or +sometimes real vectors.
    • +
    • In classification we are given an input vector, \(\mathbf{ x}\), and an associated label, +\(y\) which either takes the value +\(-1\) or \(1\).

    Classification

      -
    • Inputs, \(\mathbf{ x}\), mapped to a label, \(y\), through a function \(f(\cdot)\) dependent on parameters, \(\mathbf{ w}\), \[ +
    • Inputs, \(\mathbf{ x}\), mapped to +a label, \(y\), through a function +\(f(\cdot)\) dependent on parameters, +\(\mathbf{ w}\), \[ y= f(\mathbf{ x}; \mathbf{ w}). \]
    • -
    • \(f(\cdot)\) is known as the prediction function.
    • +
    • \(f(\cdot)\) is known as the +prediction function.

    Classification Examples

      -
    • Classifiying hand written digits from binary images (automatic zip code reading)
    • +
• Classifying hand written digits from binary images (automatic zip +code reading)
    • Detecting faces in images (e.g. digital cameras).
    • Who a detected face belongs to (e.g. Facebook, DeepFace)
    • Classifying type of cancer given gene expression data.
    • -
    • Categorization of document types (different types of news article on the internet)
    • +
    • Categorization of document types (different types of news article on +the internet)
    @@ -1222,313 +992,184 @@

    Perceptron

    -

    -
    -
    -

    +

    + + +


    - +

    Simple classification with the perceptron algorithm.

    @@ -1536,11 +1177,15 @@

    Logistic Regression and GLMs

      -
    • Modelling entire density allows any question to be answered (also missing data).
    • -
    • Comes at the possible expense of strong assumptions about data generation distribution.
    • -
    • In regression we model probability of \(y_i |\mathbf{ x}_i\) directly. +
    • Modelling entire density allows any question to be answered (also +missing data).
    • +
    • Comes at the possible expense of strong assumptions about +data generation distribution.
    • +
    • In regression we model probability of \(y_i |\mathbf{ x}_i\) directly.
        -
      • Allows less flexibility in the question, but more flexibility in the model assumptions.
      • +
      • Allows less flexibility in the question, but more +flexibility in the model assumptions.
    • Can do this not just for regression, but classification.
    • Framework is known as generalized linear models.
    • @@ -1550,32 +1195,55 @@

      Logistic Regression and GLMs

      Log Odds

      • model the log-odds with the basis functions.
      • -
      • odds are defined as the ratio of the probability of a positive outcome, to the probability of a negative outcome.
      • -
      • Probability is between zero and one, odds are: \[ \frac{\pi}{1-\pi} \]
      • -
      • Odds are between \(0\) and \(\infty\).
      • -
      • Logarithm of odds maps them to \(-\infty\) to \(\infty\).
      • +
      • odds are defined as +the ratio of the probability of a positive outcome, to the probability +of a negative outcome.
      • +
      • Probability is between zero and one, odds are: \[ \frac{\pi}{1-\pi} \]
      • +
      • Odds are between \(0\) and \(\infty\).
      • +
      • Logarithm of odds maps them to \(-\infty\) to \(\infty\).

    Logistic function

      -
    • Logistic (or sigmoid) squashes real line to between 0 & 1. Sometimes also called a ‘squashing function’.
    • +
    • Logistic (or +sigmoid) squashes real line to between 0 & 1. Sometimes also called +a ‘squashing function’. +
    @@ -1584,29 +1252,41 @@

    Basis Function

    Prediction Function

      -
    • Can now write \(\pi\) as a function of the input and the parameter vector as, \[\pi(\mathbf{ x},\mathbf{ w}) = \frac{1}{1+ -\exp\left(-\mathbf{ w}^\top \boldsymbol{ \phi}(\mathbf{ x})\right)}.\]
    • -
    • Compute the output of a standard linear basis function composition (\(\mathbf{ w}^\top \boldsymbol{ \phi}(\mathbf{ x})\), as we did for linear regression)
    • -
    • Apply the inverse link function, \(g(\mathbf{ w}^\top \boldsymbol{ \phi}(\mathbf{ x}))\).
    • -
    • Use this value in a Bernoulli distribution to form the likelihood.
    • +
    • Can now write \(\pi\) as a function +of the input and the parameter vector as, \[\pi(\mathbf{ x},\mathbf{ w}) = \frac{1}{1+ +\exp\left(-\mathbf{ w}^\top \boldsymbol{ \phi}(\mathbf{ +x})\right)}.\]
    • +
    • Compute the output of a standard linear basis function composition +(\(\mathbf{ w}^\top \boldsymbol{ +\phi}(\mathbf{ x})\), as we did for linear regression)
    • +
    • Apply the inverse link function, \(g(\mathbf{ w}^\top \boldsymbol{ \phi}(\mathbf{ +x}))\).
    • +
    • Use this value in a Bernoulli distribution to form the +likelihood.
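Those three steps can be written out directly; a sketch assuming phi(x) returns the basis vector for an input and y_i takes values 0 or 1 (all names here are illustrative).

import numpy as np

def pi(x, w, phi):
    f = w @ phi(x)                   # linear combination of basis functions
    return 1.0 / (1.0 + np.exp(-f))  # inverse link (logistic) function

def log_likelihood(w, X, y, phi):
    # Sum of Bernoulli log probabilities over the data.
    return sum(y_i * np.log(pi(x_i, w, phi)) + (1 - y_i) * np.log(1 - pi(x_i, w, phi))
               for x_i, y_i in zip(X, y))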

    Bernoulli Reminder

      -
    • From last time \[P(y_i|\mathbf{ w}, \mathbf{ x}) = \pi_i^{y_i} (1-\pi_i)^{1-y_i}\]

    • +
    • From last time \[P(y_i|\mathbf{ w}, +\mathbf{ x}) = \pi_i^{y_i} (1-\pi_i)^{1-y_i}\]

• Trick for switching between probabilities

    -
    def bernoulli(y, pi):
    -    if y == 1:
    -        return pi
    -    else:
    -return 1-pi
    +
    def bernoulli(y, pi):
    +    if y == 1:
    +        return pi
    +    else:
+        return 1-pi

    Maximum Likelihood

      -
    • Conditional independence of data: \[P(\mathbf{ y}|\mathbf{ w}, \mathbf{X}) = \prod_{i=1}^nP(y_i|\mathbf{ w}, +
    • Conditional independence of data: \[P(\mathbf{ y}|\mathbf{ w}, \mathbf{X}) = +\prod_{i=1}^nP(y_i|\mathbf{ w}, \mathbf{ x}_i). \]
    @@ -1614,32 +1294,43 @@

    Maximum Likelihood

    Log Likelihood

    \[\begin{align*} \log P(\mathbf{ y}|\mathbf{ w}, \mathbf{X}) = & - \sum_{i=1}^n\log P(y_i|\mathbf{ w}, \mathbf{ x}_i) \\ = &\sum_{i=1}^ny_i \log + \sum_{i=1}^n\log P(y_i|\mathbf{ w}, \mathbf{ x}_i) \\ = +&\sum_{i=1}^ny_i \log \pi_i \\ & + \sum_{i=1}^n(1-y_i)\log (1-\pi_i) \end{align*}\]

    Objective Function

      -
    • Probability of positive outcome for the \(i\)th data point \[\pi_i = g\left(\mathbf{ w}^\top \boldsymbol{ \phi}(\mathbf{ x}_i)\right),\] where \(g(\cdot)\) is the inverse link function
    • -
    • Objective function of the form \[\begin{align*} +
    • Probability of positive outcome for the \(i\)th data point \[\pi_i = g\left(\mathbf{ w}^\top \boldsymbol{ +\phi}(\mathbf{ x}_i)\right),\] where \(g(\cdot)\) is the inverse link +function
    • +
    • Objective function of the form \[\begin{align*} E(\mathbf{ w}) = & - \sum_{i=1}^ny_i \log - g\left(\mathbf{ w}^\top \boldsymbol{ \phi}(\mathbf{ x}_i)\right) \\& - + g\left(\mathbf{ w}^\top \boldsymbol{ \phi}(\mathbf{ x}_i)\right) +\\& - \sum_{i=1}^n(1-y_i)\log \left(1-g\left(\mathbf{ w}^\top \boldsymbol{ \phi}(\mathbf{ x}_i)\right)\right). - \end{align*}\]
    • +\end{align*}\]

    Minimize Objective

      -
    • Grdient wrt \(\pi(\mathbf{ x};\mathbf{ w})\) \[\begin{align*} +
    • Grdient wrt \(\pi(\mathbf{ x};\mathbf{ +w})\) \[\begin{align*} \frac{\text{d}E(\mathbf{ w})}{\text{d}\mathbf{ w}} = & -\sum_{i=1}^n\frac{y_i}{g\left(\mathbf{ w}^\top -\boldsymbol{ \phi}(\mathbf{ x})\right)}\frac{\text{d}g(f_i)}{\text{d}f_i} +\boldsymbol{ \phi}(\mathbf{ +x})\right)}\frac{\text{d}g(f_i)}{\text{d}f_i} \boldsymbol{ \phi}(\mathbf{ x}_i) \\ & + \sum_{i=1}^n \frac{1-y_i}{1-g\left(\mathbf{ w}^\top -\boldsymbol{ \phi}(\mathbf{ x})\right)}\frac{\text{d}g(f_i)}{\text{d}f_i} +\boldsymbol{ \phi}(\mathbf{ +x})\right)}\frac{\text{d}g(f_i)}{\text{d}f_i} \boldsymbol{ \phi}(\mathbf{ x}_i) \end{align*}\]
    @@ -1647,10 +1338,12 @@

    Minimize Objective

    Optimization of the Function

      -
    • Can’t find a stationary point of the objective function analytically.
    • +
    • Can’t find a stationary point of the objective function +analytically.
    • Optimization has to proceed by numerical methods.
    • -
    • Similarly to matrix factorization, for large data stochastic gradient descent (Robbins Munro (Robbins and Monro, 1951) optimization procedure) works well.
    • +
    • Similarly to matrix factorization, for large data stochastic +gradient descent (Robbins Munro (Robbins and Monro, 1951) +optimization procedure) works well.
    -

data.head()

    +
    -
    +
    +

    Nigeria NMIS Data

    +
    +
    +

    Nigeria NMIS Data: Notebook

    +
    +
    +
    + +
    +
    +
    + +
    +
    +

    Nigeria NMIS Data Classification

    +
    +

    Batch Gradient Descent

    Stochastic Gradient Descent

    -
    -

    Exercise 2

    -

    Now construct a stochastic gradient descent algorithm and run it on the data. Is it faster or slower than batch gradient descent? What can you do to improve convergence speed?
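A minimal sketch of what such a stochastic gradient descent loop might look like for the logistic regression objective above, assuming Phi is the basis matrix, y holds 0/1 labels, and the learning rate and epoch count are picked by hand; it is a starting point, not the official solution.

import numpy as np

def sigmoid(f):
    return 1.0 / (1.0 + np.exp(-f))

def sgd(Phi, y, learn_rate=0.01, epochs=10):
    n, m = Phi.shape
    w = np.zeros(m)
    for epoch in range(epochs):
        for i in np.random.permutation(n):
            pi_i = sigmoid(Phi[i] @ w)
            grad = (pi_i - y[i]) * Phi[i]  # gradient of the negative log likelihood for one point
            w -= learn_rate * grad
    return w

Batch gradient descent would instead average the gradient over all n points before each update, which is the comparison the exercise asks for.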

    -

    Regression

      @@ -1702,18 +1420,25 @@

      Regression

      Regression Examples

        -
      • Predict a real value, \(y_i\) given some inputs \(\mathbf{ x}_i\).
      • -
      • Predict quality of meat given spectral measurements (Tecator data).
      • -
      • Radiocarbon dating, the C14 calibration curve: predict age given quantity of C14 isotope.
      • -
      • Predict quality of different Go or Backgammon moves given expert rated training data.
      • +
      • Predict a real value, \(y_i\) given +some inputs \(\mathbf{ x}_i\).
      • +
      • Predict quality of meat given spectral measurements (Tecator +data).
      • +
      • Radiocarbon dating, the C14 calibration curve: predict age given +quantity of C14 isotope.
      • +
      • Predict quality of different Go or Backgammon moves given expert +rated training data.

      Supervised Learning Challenges

        -
      1. choosing which features, \(\mathbf{ x}\), are relevant in the prediction
      2. -
      3. defining the appropriate class of function, \(f(\cdot)\).
      4. -
      5. selecting the right parameters, \(\mathbf{ w}\).
      6. +
      7. choosing which features, \(\mathbf{ +x}\), are relevant in the prediction
      8. +
      9. defining the appropriate class of function, \(f(\cdot)\).
      10. +
      11. selecting the right parameters, \(\mathbf{ +w}\).
      @@ -1728,13 +1453,16 @@

      Feature Selection

      Applications

      • rank search results, what adverts to show, newsfeed ranking
      • -
      • Features: number of likes, image present, friendship relationship
      • +
      • Features: number of likes, image present, friendship +relationship
      -

      Class of Function, \(f(\cdot)\)

      +

      Class of Function, \(f(\cdot)\)

        -
      • Mapping characteristic between \(\mathbf{ x}\) and \(y\)? +
      • Mapping characteristic between \(\mathbf{ +x}\) and \(y\)?
        • smooth (similar inputs lead to similar outputs).
        • linear function.
        • @@ -1742,17 +1470,34 @@

          Class of Function, \(f(\cdot)\)

      -
      +

      +
      + + + + + +Aki Vehtari + + + +
      - +
      @@ -1763,12 +1508,12 @@

      Gelman Book

      - +
      - +
      @@ -1776,9 +1521,13 @@

      Gelman Book

      -

      Gelman et al. (2013)

      +
      +Gelman et al. +(2013) +

      Class of Function: Neural Networks

      @@ -1792,7 +1541,8 @@

      Class of Function: Invariances

      • An invariance is a transformation of the input
          -
        • e.g. a cat remains a cat regardless of location (translation), size (scale) or upside-down (rotation and reflection).
        • +
        • e.g. a cat remains a cat regardless of location (translation), size +(scale) or upside-down (rotation and reflection).
      @@ -1803,62 +1553,83 @@

      Deep Learning

      Deep Learning

        -
      • These are interpretable models: vital for disease modeling etc.

      • +
      • These are interpretable models: vital for disease modeling +etc.

      • Modern machine learning methods are less interpretable

      • Example: face recognition

      -
      -

      DeepFace

      -

      Outline of the DeepFace architecture. A front-end of a single convolution-pooling-convolution filtering on the rectified input, followed by three locally-connected layers and two fully-connected layers. Color illustrates feature maps produced at each layer. The net includes more than 120 million parameters, where more than 95% come from the local and fully connected.

      +
      +

      +

      Outline of the DeepFace +architecture. A front-end of a single convolution-pooling-convolution +filtering on the rectified input, followed by three locally-connected +layers and two fully-connected layers. Color illustrates feature maps +produced at each layer. The net includes more than 120 million +parameters, where more than 95% come from the local and fully +connected.

      - +
      -

      Source: DeepFace (Taigman et al., 2014)

      +
      +Source: DeepFace (Taigman et al., 2014) +
      -
      -

      Deep Learning as Pinball

      +
      +

      -
      - +
      +
      -
      +

      - + -
      +

      - +
      @@ -1873,16 +1644,20 @@

      Encoding Knowledge

      Choosing Prediction Function

        -
      • Any function e.g. polynomials for olympic data \[ +
      • Any function e.g. polynomials for olympic data \[ f(x) = w_0 + w_1 x+ w_2 x^2 + w_3 x^3 + w_4 x^4. \]
      -
      +

      Parameter Estimation: Objective Functions

        -
      • After choosing features and function class we need parameters.
      • -
      • Estimate \(\mathbf{ w}\) by specifying an objective function.
      • +
      • After choosing features and function class we need +parameters.
      • +
      • Estimate \(\mathbf{ w}\) by +specifying an objective function.
      @@ -1897,8 +1672,17 @@

      Labels and Squared Error

      Data Provision

        -
      • Given \(n\) inputs, \(\mathbf{ x}_1\), \(\mathbf{ x}_2\), \(\mathbf{ x}_3\), \(\dots\), \(\mathbf{ x}_n\)
      • -
      • And labels \(y_1\), \(y_2\), \(y_3\), \(\dots\), \(y_n\).
      • +
      • Given \(n\) inputs, \(\mathbf{ x}_1\), \(\mathbf{ x}_2\), \(\mathbf{ x}_3\), \(\dots\), \(\mathbf{ x}_n\)
      • +
      • And labels \(y_1\), \(y_2\), \(y_3\), \(\dots\), \(y_n\).
      • Sometimes label is cheap e.g. Newsfeed ranking
      • Often it is very expensive.
          @@ -1911,7 +1695,8 @@

          Annotation

          • Human annotators
              -
            • E.g. in ImageNet annotated using Amazon’s Mechanical Turk. (AI?)
            • +
            • E.g. in ImageNet annotated using Amazon’s Mechanical Turk. +(AI?)
          • Without humans no AI.
          • Not real intelligence, emulated
          • @@ -1964,13 +1749,15 @@

            Difficult Trap

          • Validation data is different from test data.
      -
      +

      Hold Out Validation on Olympic Marathon Data

      Overfitting

        -
      • Increase number of basis functions we obtain a better ‘fit’ to the data.
      • +
      • Increase number of basis functions we obtain a better ‘fit’ to the +data.
      • How will the model perform on previously unseen data?
      • Let’s consider predicting the future.
      @@ -1980,82 +1767,63 @@

      Future Prediction: Extrapolation

      -

      -
      -
      -

      -
      - +

      + + +

      +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      @@ -2067,17 +1835,21 @@

      Extrapolation

    • Extrapolation is predicting into the future here, but could be:
      • Predicting back to the unseen past (pre 1892)
      • -
      • Spatial prediction (e.g. Cholera rates outside Manchester given rates inside Manchester).
      • +
      • Spatial prediction (e.g. Cholera rates outside Manchester given +rates inside Manchester).
    • Interpolation

        -
      • Predicting the wining time for 1946 Olympics is interpolation.
      • +
• Predicting the winning time for 1946 Olympics is +interpolation.
      • This is because we have times from 1936 and 1948.
      • -
      • If we want a model for interpolation how can we test it?
      • -
      • One trick is to sample the validation set from throughout the data set.
      • +
      • If we want a model for interpolation how can we test +it?
      • +
      • One trick is to sample the validation set from throughout the data +set.
      @@ -2085,122 +1857,103 @@

      Future Prediction: Interpolation

      -

      -
      -
      -

      -
      - +

      + + +

      +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +
      - -
      -

      -
      - +
      +

      Choice of Validation Set

        -
      • The choice of validation set should reflect how you will use the model in practice.
      • -
      • For extrapolation into the future we tried validating with data from the future.
      • +
      • The choice of validation set should reflect how you will use the +model in practice.
      • +
      • For extrapolation into the future we tried validating with data from +the future.
      • For interpolation we chose validation set from data.
      • For different validation sets we could get different results.
      -
      -

      Exercise 3

      -

      For both the linear and quadratic models, fit the model to the data up until 1980 and then compute the error on the held out data (from 1980 onwards). Which model performs better on the validation data?
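One possible shape for that comparison, assuming x (years) and y (pace) are 1-D numpy arrays; the rescaling of the years and the variable names are choices made for this sketch rather than part of the lab.

import numpy as np

train = x < 1980         # fit on data before 1980
valid = ~train           # validate on 1980 onwards
xs = (x - 1900.) / 100.  # rescale years to keep the polynomial well conditioned

for degree in (1, 2):    # linear and quadratic models
    Phi_train = np.vander(xs[train], degree + 1, increasing=True)
    Phi_valid = np.vander(xs[valid], degree + 1, increasing=True)
    w, *_ = np.linalg.lstsq(Phi_train, y[train], rcond=None)
    print(degree, ((y[valid] - Phi_valid @ w)**2).mean())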

      -
      -
      -

      Exercise 4

      -

      Now we are going to build a more sophisticated form of basis function, one that can accept arguments to its inputs (similar to those we used in this lab). Here we will start with a polynomial basis.

      -
      def polynomial(x, degree, loc, scale):
      -    degrees =np.arange(degree+1)
      -    return ((x-loc)/scale)**degrees
      -

      The basis as we’ve defined it has three arguments as well as the input. The degree of the polynomial, the scale of the polynomial and the offset. These arguments need to be passed to the basis functions whenever they are called. Modify your code to pass these additional arguments to the python function for creating the basis. Do this for each of your functions predict, fit and objective. You will find *args (or **kwargs) useful.

      -

Write code that tries to fit different models to the data with polynomial basis. Use a maximum degree for your basis from 0 to 17. For each polynomial store the hold out validation error and the training error. When you have finished the computation plot the hold out error for your models and the training error for your polynomials. When computing your polynomial basis use offset=1956. and scale=120. to ensure that the data is mapped (roughly) to the -1, 1 range.

      -

      Which polynomial has the minimum training error? Which polynomial has the minimum validation error?
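A sketch of the loop this exercise describes, reusing the polynomial basis defined above with the suggested offset and scale; it assumes x is an n x 1 array of years (so the broadcasting inside polynomial produces a design matrix) and y the matching targets, and it should be treated as a starting point rather than a full solution.

import numpy as np

offset, scale = 1956., 120.
train = x[:, 0] < 1980
valid = ~train
train_errors, valid_errors = [], []
for degree in range(0, 18):
    Phi_train = polynomial(x[train], degree, offset, scale)
    Phi_valid = polynomial(x[valid], degree, offset, scale)
    w, *_ = np.linalg.lstsq(Phi_train, y[train], rcond=None)
    train_errors.append(((y[train] - Phi_train @ w)**2).mean())
    valid_errors.append(((y[valid] - Phi_valid @ w)**2).mean())

Plotting train_errors and valid_errors against degree then shows which polynomial minimizes each.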

      -

      Bias Variance Decomposition

      -

      Generalisation error \[ -\mathbb{E}\left[ \left(y- f^*(\mathbf{ y})\right)^2 \right]. -\] Decompose as \[ -\mathbb{E}\left[ \left(y- f(\mathbf{ y})\right)^2 \right] = \text{bias}\left[f^*(\mathbf{ y})\right]^2 + \text{variance}\left[f^*(\mathbf{ y})\right] +\sigma^2, +

      Generalisation error \[\begin{align*} +R(\mathbf{ w}) = & \int \left(y- f^*(\mathbf{ x})\right)^2 +\mathbb{P}(y, \mathbf{ x}) \text{d}y\text{d}\mathbf{ x}\\ +& \triangleq \mathbb{E}\left[ \left(y- f^*(\mathbf{ x})\right)^2 +\right]. +\end{align*}\]

      +
      +
      +

      Decompose

      +

      Decompose as \[ +\begin{align*} +\mathbb{E}\left[ \left(y- f(\mathbf{ x})\right)^2 \right] = & +\text{bias}\left[f^*(\mathbf{ x})\right]^2 \\ +& + \text{variance}\left[f^*(\mathbf{ x})\right] \\ \\ +&+\sigma^2, +\end{align*} \]

      Bias

      • Given by \[ -\text{bias}\left[f^*(\mathbf{ y})\right] = -\mathbb{E}\left[f^*(\mathbf{ y})\right] * f(\mathbf{ y}) +\text{bias}\left[f^*(\mathbf{ x})\right] = +\mathbb{E}\left[f^*(\mathbf{ x})\right] - f(\mathbf{ x}) \]

      • Error due to bias comes from a model that’s too simple.

      @@ -2209,13 +1962,16 @@

      Bias

      Variance

      • Given by \[ -\text{variance}\left[f^*(\mathbf{ y})\right] = \mathbb{E}\left[\left(f^*(\mathbf{ y}) - \mathbb{E}\left[f^*(\mathbf{ y})\right]\right)^2\right]. +\text{variance}\left[f^*(\mathbf{ x})\right] = +\mathbb{E}\left[\left(f^*(\mathbf{ x}) - \mathbb{E}\left[f^*(\mathbf{ +x})\right]\right)^2\right]. \]

      • -
      • Slight variations in the training set cause changes in the prediction. Error due to variance is error in the model due to an overly complex model.

      • +
      • Slight variations in the training set cause changes in the +prediction. Error due to variance is error in the model due to an overly +complex model.
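These two quantities can be estimated numerically; the sketch below refits a fifth-degree polynomial to many freshly sampled training sets drawn from an assumed true function, so every name and number in it (true_function, the noise level, the sizes) is invented for illustration.

import numpy as np

def true_function(x):
    return np.sin(2 * np.pi * x)  # stand-in for the unknown f

rng = np.random.default_rng(0)
x_test = np.linspace(0, 1, 50)
preds = []
for trial in range(200):
    # A fresh training set each trial: the 'slight variations' above.
    x_train = rng.uniform(0, 1, 20)
    y_train = true_function(x_train) + rng.normal(0, 0.2, 20)
    coeffs = np.polyfit(x_train, y_train, 5)
    preds.append(np.polyval(coeffs, x_test))

preds = np.array(preds)
bias = preds.mean(axis=0) - true_function(x_test)  # E[f*(x)] - f(x)
variance = preds.var(axis=0)                       # E[(f*(x) - E[f*(x)])^2]

Raising or lowering the polynomial degree trades one term off against the other.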

      -

      .

      -
      +

      -
      +

      - +
      -
      +

      - +
      -
      +

      - +
      -
      +

      - +
      -
      +

      - +
      -
      +

      - +
      -
      +

      - +
      -
      +

      - +
      -
      +

      - +
      -
      +

      - +
      -
      +

      - +
      @@ -2456,7 +2135,7 @@

      Overfitting

      -
      @@ -2465,192 +2144,118 @@

      Overfitting

      Alex Ihler on Polynomials and Overfitting

      -
      +

      Olympic Data with Bayesian Polynomials

      -

      -
      -
      -

      +

      + + +

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + - -
      -

      - + @@ -2659,187 +2264,138 @@

      Hold Out Validation

      -

      +
      @@ -2848,187 +2404,138 @@

      5-fold Cross Validation

      -

      - +
      +
      @@ -3036,54 +2543,113 @@

      Thanks!

      References

      -
      -
      -

      Andrade-Pacheco, R., Mubangizi, M., Quinn, J., Lawrence, N.D., 2014. Consistent mapping of government malaria records across a changing territory delimitation. Malaria Journal 13. https://doi.org/10.1186/1475-2875-13-S1-P5

      -
      -
      -

      Gelman, A., Carlin, J.B., Stern, H.S., Rubin, D.B., 2013. Bayesian data analysis, 3rd ed. Chapman; Hall.

      -
      -
      -

      McCulloch, W.S., Pitts, W., 1943. A logical calculus of the ideas immanent in nervous activity. Bulletin of Mathematical Biophysics 5, 115–133.

      -
      -
      -

      Mubangizi, M., Andrade-Pacheco, R., Smith, M.T., Quinn, J., Lawrence, N.D., 2014. Malaria surveillance with multiple data sources using Gaussian process models, in: 1st International Conference on the Use of Mobile ICT in Africa.

      -
      -
      -

      Robbins, H., Monro, S., 1951. A stochastic approximation method. Annals of Mathematical Statistics 22, 400–407.

      -
      -
      -

      Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., Berg, A.C., Fei-Fei, L., 2015. ImageNet Large Scale Visual Recognition Challenge. International Journal of Computer Vision (IJCV) 115, 211–252. https://doi.org/10.1007/s11263-015-0816-y

      -
      -
      -

      Taigman, Y., Yang, M., Ranzato, M., Wolf, L., 2014. DeepFace: Closing the gap to human-level performance in face verification, in: Proceedings of the IEEE Computer Society Conference on Computer Vision and Pattern Recognition. https://doi.org/10.1109/CVPR.2014.220

      +
      +
      +Andrade-Pacheco, R., Mubangizi, M., Quinn, J., Lawrence, N.D., 2014. +Consistent mapping of government malaria records across a changing +territory delimitation. Malaria Journal 13. https://doi.org/10.1186/1475-2875-13-S1-P5 +
      +
      +Gelman, A., Carlin, J.B., Stern, H.S., Dunson, D.B., Vehtari, A., Rubin, +D.B., 2013. Bayesian data analysis, 3rd ed. Chapman; Hall. +
      +
      +McCulloch, W.S., Pitts, W., 1943. A logical calculus of the ideas +immanent in nervous activity. Bulletin of Mathematical Biophysics 5, +115–133. https://doi.org/10.1007/BF02478259 +
      +
      +Mubangizi, M., Andrade-Pacheco, R., Smith, M.T., Quinn, J., Lawrence, +N.D., 2014. Malaria surveillance with multiple data sources using +Gaussian process models, in: 1st International Conference +on the Use of Mobile ICT in Africa. +
      +
      +Robbins, H., Monro, S., 1951. A stochastic approximation method. Annals +of Mathematical Statistics 22, 400–407. +
      +
      +Russakovsky, O., Deng, J., Su, H., Krause, J., Satheesh, S., Ma, S., +Huang, Z., Karpathy, A., Khosla, A., Bernstein, M., Berg, A.C., Fei-Fei, +L., 2015. ImageNet Large Scale Visual Recognition +Challenge. International Journal of Computer Vision (IJCV) 115, +211–252. https://doi.org/10.1007/s11263-015-0816-y +
      +
      +Taigman, Y., Yang, M., Ranzato, M., Wolf, L., 2014. +DeepFace: Closing the gap to human-level performance in +face verification, in: Proceedings of the IEEE Computer +Society Conference on Computer Vision and Pattern Recognition. https://doi.org/10.1109/CVPR.2014.220
diff --git a/slides/02-ml-systems.slides.html b/slides/02-ml-systems.slides.html
index c760ad8..5434690 100644
--- a/slides/02-ml-systems.slides.html
+++ b/slides/02-ml-systems.slides.html
@@ -8,25 +8,25 @@
-\[\newcommand{\tk}[1]{} \newcommand{\Amatrix}{\mathbf{A}} … \newcommand{\onesVector}{\mathbf{1}} \newcommand{\zerosVector}{\mathbf{0}} \]  (full block of inline MathJax \newcommand macros removed from the slide header)
      @@ -336,7 +46,8 @@

      Introduction to Machine Learning Systems

      Eric Meissner

      Andrei Paleyes

-Neil D. Lawrence
+Neil
+D. Lawrence

      Virtual DSA

      @@ -347,9 +58,6 @@

      Introduction to Machine Learning Systems

@@ -369,7 +77,7 @@

Supply Chain Optimization

Llew Mason

@@ -382,11 +90,18 @@

Supply Chain Optimization

Devesh Mishra

Supply Chain Optimization

@@ -402,7 +117,7 @@

Supply Chain Optimization

Llew Mason

@@ -415,14 +130,22 @@

Supply Chain Optimization

Devesh Mishra

      Forecasting

Forecasting

Jenny Freshwater

@@ -447,7 +170,7 @@

Forecasting

Ping Xu

@@ -460,8 +183,18 @@

Forecasting

Dean Foster

      Inventory and Buying

@@ -477,7 +210,7 @@

Inventory and Buying

Deepak Bhatia

@@ -490,7 +223,7 @@

Inventory and Buying

Piyush Saraogi

@@ -501,9 +234,9 @@

Inventory and Buying

-Salal Humair
+Raman Iyer

@@ -514,17 +247,38 @@

Inventory and Buying

+Salal Humair

Narayan Venkatasubramanyan

• Automated buying based on:
  • Supplier lead times.
  • Demand Forecast.
  • Cost basis of the product.

      Service Oriented Architecture

      +
      +

Monolithic System

Service Oriented Architecture

Charlie Bell

Service Oriented Architecture

Peter Vosshall

@@ -560,11 +314,11 @@

Service Oriented Architecture

A potential path of models in a machine learning system.

Service Oriented Architecture

Service Oriented Architecture

Charlie Bell

Service Oriented Architecture

Peter Vosshall

Intellectual Debt

@@ -609,34 +377,52 @@

Data Science Africa is a bottom up initiative for capacity building in data science, machine learning and AI on the African continent

2015/aug/25/africa-benefit-data-science-information

      Crop Monitoring

Crop Monitoring

Ernest Mwebaze

@@ -664,7 +450,7 @@

Crop Monitoring

Biosurveillance

Biosurveillance

Martin Mubangizi

@@ -692,7 +479,7 @@

Biosurveillance

Community Radio

Community Radio

Morine Amutorine

@@ -721,7 +508,7 @@

Kudu Project

@@ -734,7 +521,7 @@

Safe Boda

@@ -745,11 +532,18 @@

      Safe Boda

      Thanks!

      @@ -758,18 +552,47 @@

      References

diff --git a/slides/03-bayesian-methods-abuja.slides.html b/slides/03-bayesian-methods-abuja.slides.html
index 7471eb6..922a8a1 100644
--- a/slides/03-bayesian-methods-abuja.slides.html
+++ b/slides/03-bayesian-methods-abuja.slides.html
@@ -8,7 +8,7 @@
-\[\newcommand{\tk}[1]{} \newcommand{\Amatrix}{\mathbf{A}} … \newcommand{\onesVector}{\mathbf{1}} \newcommand{\zerosVector}{\mathbf{0}} \]  (full block of inline MathJax \newcommand macros removed from the slide header)

      Bayesian Methods

      -

-Probabilistic Machine Learning

-Neil D. Lawrence

+Probabilistic Machine
+Learning

+Neil
+D. Lawrence

      Oluwasanmi Koyejo

      DSA, Abuja

      @@ -411,9 +124,6 @@

      Bayesian Methods

      -
      @@ -422,53 +132,76 @@

      What is Machine Learning?

      What is Machine Learning?

      -

\[ \text{data} + \text{model} \stackrel{\text{compute}}{\rightarrow} \text{prediction}\]

• data : observations, could be actively or passively acquired (meta-data).
• model : assumptions, based on previous experience (other data! transfer learning etc), or beliefs about the regularities of the universe. Inductive bias.
• prediction : an action to be taken or a categorization or a quality score.

      What is Machine Learning?

      -

\[\text{data} + \text{model} \stackrel{\text{compute}}{\rightarrow} \text{prediction}\]

• To combine data with a model we need:
  • a prediction function \(f(\cdot)\) that includes our beliefs about the regularities of the universe
  • an objective function \(E(\cdot)\) that defines the cost of misprediction.

      Nigerian NMIS Data

      +
      -
      -

      Nigerian NMIS Data: Notebook

      +
      +

      Nigeria NMIS Data

      -
      -

      Exercise 1

      -

Read on the internet about the following Python libraries: numpy, matplotlib, scipy and pandas. What functionality does each provide to Python?

      +
      +

      Nigeria NMIS Data: Notebook

      +
      +
      +
      + +
      +
      +
      +

      Probabilities

      +
      +

      Exploring the NMIS Data

      +

      Probability and the NMIS Data

      @@ -476,15 +209,16 @@

      Probability and the NMIS Data

      Conditioning

      -
      -

      Exercise 2

      -

      Write code that prints out the probability of nurses being greater than 2 for different numbers of doctors.
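One possible sketch of this exercise in Python, assuming the NMIS facility table has already been loaded into a pandas DataFrame called data, and that the doctor and nurse counts live in columns named num_doctors_fulltime and num_nurses_fulltime (both names are illustrative assumptions):

# data is assumed to hold the NMIS facility records loaded earlier in the notebook.
# Column names below are hypothetical stand-ins for the doctor and nurse counts.
for doctors in range(6):
    subset = data[data['num_doctors_fulltime'] == doctors]
    if len(subset) == 0:
        continue
    # Empirical estimate of P(nurses > 2 | doctors = d).
    p = (subset['num_nurses_fulltime'] > 2).mean()
    print(f"P(nurses > 2 | doctors = {doctors}) = {p:.3f}")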

      -

      Probability Review

        -
• We are interested in trials which result in two random variables, \(X\) and \(Y\), each of which has an ‘outcome’ denoted by \(x\) or \(y\).
• We summarise the notation and terminology for these distributions in the following table.
      @@ -519,20 +253,26 @@

      The different basic probability distributions.
      -
      +

      A Pictorial Definition of Probability

Inspired by lectures from Christopher Bishop

      Definition of probability distributions

      @@ -550,17 +290,20 @@

      Definition of probability distributions

@@ -569,27 +312,43 @@

      Definition of probability distributions

      Notational Details

        -
• Typically we should write out \(P\left(X=x,Y=y\right)\).
• In practice, we often use \(P\left(x,y\right)\).
• This looks very much like we might write a multivariate function, e.g. \(f\left(x,y\right)=\frac{x}{y}\).
  • For a multivariate function though, \(f\left(x,y\right)\neq f\left(y,x\right)\).
  • However \(P\left(x,y\right)=P\left(y,x\right)\) because \(P\left(X=x,Y=y\right)=P\left(Y=y,X=x\right)\).
        • We now quickly review the ‘rules of probability’.

      Normalization

      -

All distributions are normalized. This is clear from the fact that \(\sum_{x}n_{x}=N\), which gives \[\sum_{x}P\left(x\right)={\lim_{N\rightarrow\infty}}\frac{\sum_{x}n_{x}}{N}={\lim_{N\rightarrow\infty}}\frac{N}{N}=1.\] A similar result can be derived for the marginal and conditional distributions.

      The Product Rule

        -
• \(P\left(x|y\right)\) is \[ {\lim_{N\rightarrow\infty}}\frac{n_{x,y}}{n_{y}}. \]
• \(P\left(x,y\right)\) is \[ {\lim_{N\rightarrow\infty}}\frac{n_{x,y}}{N}={\lim_{N\rightarrow\infty}}\frac{n_{x,y}}{n_{y}}\frac{n_{y}}{N} \] or in other words \[ P\left(x,y\right)=P\left(x|y\right)P\left(y\right). \]

@@ -600,24 +359,27 @@

        The Product Rule

        The Sum Rule

        Ignoring the limit in our definitions:

          -
• The marginal probability \(P\left(y\right)\) is \({\lim_{N\rightarrow\infty}}\frac{n_{y}}{N}\).
• The joint distribution \(P\left(x,y\right)\) is \({\lim_{N\rightarrow\infty}}\frac{n_{x,y}}{N}\).
• \(n_{y}=\sum_{x}n_{x,y}\) so \[ {\lim_{N\rightarrow\infty}}\frac{n_{y}}{N}={\lim_{N\rightarrow\infty}}\sum_{x}\frac{n_{x,y}}{N}, \] in other words \[ P\left(y\right)=\sum_{x}P\left(x,y\right). \] This is known as the sum rule of probability.

      -
      -

      Exercise 3

      -

      Write code that computes \(P(y)\) by adding \(P(y, x)\) for all values of \(x\).
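A minimal sketch of the sum rule in code, using a small made-up joint table rather than the NMIS data:

import numpy as np

# joint[i, j] plays the role of P(y=i, x=j); the numbers are invented for illustration.
joint = np.array([[0.10, 0.20],
                  [0.15, 0.05],
                  [0.25, 0.25]])

# Sum rule: P(y) = sum_x P(y, x), so sum across the x axis.
p_y = joint.sum(axis=1)
print(p_y)        # [0.3, 0.2, 0.5]
print(p_y.sum())  # 1.0, since the joint is normalized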

      -

      Bayes’ Rule

• From the product rule, \[ P\left(y,x\right)=P\left(x,y\right)=P\left(x|y\right)P\left(y\right),\] so \[ P\left(y|x\right)P\left(x\right)=P\left(x|y\right)P\left(y\right) \] which leads to Bayes’ rule, \[ P\left(y|x\right)=\frac{P\left(x|y\right)P\left(y\right)}{P\left(x\right)}. \]

@@ -627,7 +389,10 @@

        Bayes’ Rule

        Bayes’ Theorem Example

          -
• There are two barrels in front of you. Barrel One contains 20 apples and 4 oranges. Barrel Two contains 4 apples and 8 oranges. You choose a barrel randomly and select a fruit. It is an apple. What is the probability that the barrel was Barrel One?
        @@ -644,12 +409,17 @@

        Bayes’ Rule Example: Answer I

        Bayes’ Rule Example: Answer II

          -
• We use the sum rule to compute: \[\begin{aligned} P(\text{F}=\text{A}) = & P(\text{F}=\text{A}|\text{B}=1)P(\text{B}=1) \\& + P(\text{F}=\text{A}|\text{B}=2)P(\text{B}=2) \\ = & 20/24\times 0.5 + 4/12 \times 0.5 = 7/12 \end{aligned}\]
• And Bayes’ rule tells us that: \[\begin{aligned} P(\text{B}=1|\text{F}=\text{A}) = & \frac{P(\text{F} = \text{A}|\text{B}=1)P(\text{B}=1)}{P(\text{F}=\text{A})}\\ = & \frac{20/24 \times 0.5}{7/12} = 5/7 \end{aligned}\]
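The same calculation written as a short Python sketch, using the numbers from the example above:

# Prior over barrels and likelihood of drawing an apple from each barrel.
p_barrel = {1: 0.5, 2: 0.5}
p_apple_given_barrel = {1: 20 / 24, 2: 4 / 12}

# Sum rule for the evidence P(F=A).
p_apple = sum(p_apple_given_barrel[b] * p_barrel[b] for b in p_barrel)

# Bayes' rule for P(B=1 | F=A).
p_barrel_one = p_apple_given_barrel[1] * p_barrel[1] / p_apple
print(p_apple)       # 7/12 ≈ 0.583
print(p_barrel_one)  # 5/7 ≈ 0.714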
        @@ -657,13 +427,15 @@

        Bayes’ Rule Example: Answer II

        Further Reading

          -
• Probability distributions: page 12–17 (Section 1.2) of Bishop (2006)

Exercises

• Exercise 1.3 of Bishop (2006)
        @@ -694,8 +466,10 @@

        Computing Expectations Example

        • What is the mean of the distribution?
        • What is the standard deviation of the distribution?
        • -
• Are the mean and standard deviation representative of the distribution form?
• What is the expected value of \(-\log P(y)\)?
        @@ -738,10 +512,14 @@

        Expectations Example: Answer

Joint Probability:        \(\lim_{N\rightarrow\infty}\frac{n_{X=3,Y=4}}{N}\)        \(P\left(X=3,Y=4\right)\)
Marginal Probability:     \(\lim_{N\rightarrow\infty}\frac{n_{X=5}}{N}\)            \(P\left(X=5\right)\)
Conditional Probability:  \(\lim_{N\rightarrow\infty}\frac{n_{X=3,Y=4}}{n_{Y=4}}\)  \(P\left(X=3\vert Y=4\right)\)

• Mean: \(1\times 0.3 + 2\times 0.2 + 3 \times 0.1 + 4 \times 0.4 = 2.6\)
• Second moment: \(1 \times 0.3 + 4 \times 0.2 + 9 \times 0.1 + 16 \times 0.4 = 8.4\)
• Variance: \(8.4 - 2.6\times 2.6 = 1.64\)
• Standard deviation: \(\sqrt{1.64} = 1.2806\)
      @@ -784,13 +562,16 @@

      Expectations Example: Answer II

        -
• Expectation of \(-\log(P(y))\): \(0.3\times 1.204 + 0.2\times 1.609 + 0.1\times 2.302 + 0.4\times 0.916 = 1.280\)
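A quick numerical check of these answers (the probabilities are the ones used in the example):

import numpy as np

y = np.array([1, 2, 3, 4])
p = np.array([0.3, 0.2, 0.1, 0.4])

mean = (y * p).sum()                       # 2.6
second_moment = (y**2 * p).sum()           # 8.4
variance = second_moment - mean**2         # 1.64
std = np.sqrt(variance)                    # 1.2806...
expected_neg_log = (-np.log(p) * p).sum()  # ≈ 1.280
print(mean, variance, std, expected_neg_log)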

      Sample Based Approximation Example

        -
• You are given the following sampled values of heights of students,

        @@ -817,10 +598,12 @@

        Sample Based Approximation Example

      • What is the sample mean?

      • What is the sample variance?

      • -
• Can you compute the sample-based approximation to the expected value of \(-\log P(y)\)?

      -
      +

      Sample Based Approximation Example: Answer

      • We can compute:
      • @@ -859,17 +642,21 @@

        Sample Based Approximation Example: Answer

          -
• Mean: \(\frac{1.76 + 1.73 + 1.79 + 1.81 + 1.85 + 1.80}{6} = 1.79\)
• Second moment: \(\frac{1.76^2 + 1.73^2 + 1.79^2 + 1.81^2 + 1.85^2 + 1.80^2}{6} = 3.2055\)
• Variance: \(3.2055 - 1.79\times 1.79 = 1.43\times 10^{-3}\)
• Standard deviation: \(0.0379\)
• No, you can’t compute it. You don’t have access to \(P(y)\) directly.
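The sample-based answers can be checked with a short numpy sketch:

import numpy as np

heights = np.array([1.76, 1.73, 1.79, 1.81, 1.85, 1.80])

sample_mean = heights.mean()                       # 1.79
second_moment = (heights**2).mean()                # ≈ 3.2055
sample_variance = second_moment - sample_mean**2   # ≈ 1.43e-3
sample_std = np.sqrt(sample_variance)              # ≈ 0.0379
print(sample_mean, sample_variance, sample_std)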

      Sample Based Approximation Example

        -
• You are given the following sampled values of heights of students,

        @@ -894,51 +681,65 @@

        Sample Based Approximation Example

      • -
• Actually these “data” were sampled from a Gaussian with mean 1.7 and standard deviation 0.15. Are your estimates close to the real values? If not, why not?

      -
      -

      Exercise 1

      -

      Now we see we have several additional features. Let’s assume we want to predict maternal_health_delivery_services. How would we go about doing it?

      -

      Using what you’ve learnt about joint, conditional and marginal probabilities, as well as the sum and product rule, how would you formulate the question you want to answer in terms of probabilities? Should you be using a joint or a conditional distribution? If it’s conditional, what should the distribution be over, and what should it be conditioned on?

      -

      Probabilistic Modelling

• Probabilistically we want, \[ p(y_*|\mathbf{ y}, \mathbf{X}, \mathbf{ x}_*), \] where \(y_*\) is a test output, \(\mathbf{ x}_*\) is a test input, \(\mathbf{X}\) is a training input matrix and \(\mathbf{ y}\) is the training outputs.

      Joint Model of World

\[ p(y_*|\mathbf{ y}, \mathbf{X}, \mathbf{ x}_*) = \int p(y_*|\mathbf{ x}_*, \mathbf{W}) p(\mathbf{W}| \mathbf{ y}, \mathbf{X}) \text{d} \mathbf{W} \]

\(\mathbf{W}\) contains \(\mathbf{W}_1\) and \(\mathbf{W}_2\)

\(p(\mathbf{W}| \mathbf{ y}, \mathbf{X})\) is posterior density

      Likelihood

      -

\(p(y|\mathbf{ x}, \mathbf{W})\) is the likelihood of data point

Normally assume independence: \[ p(\mathbf{ y}|\mathbf{X}, \mathbf{W}) = \prod_{i=1}^np(y_i|\mathbf{ x}_i, \mathbf{W}),\]

      Likelihood and Prediction Function

\[ p(y_i | f(\mathbf{ x}_i)) = \frac{1}{\sqrt{2\pi \sigma^2}} \exp\left(-\frac{\left(y_i - f(\mathbf{ x}_i)\right)^2}{2\sigma^2}\right) \]
      Unsupervised Learning

        -
• Can also consider priors over latents \[ p(\mathbf{ y}_*|\mathbf{ y}) = \int p(\mathbf{ y}_*|\mathbf{X}_*, \mathbf{W}) p(\mathbf{W}| \mathbf{ y}, \mathbf{X}) p(\mathbf{X}) p(\mathbf{X}_*) \text{d} \mathbf{W}\text{d} \mathbf{X}\text{d}\mathbf{X}_* \]

      • This gives unsupervised learning.

      @@ -947,39 +748,50 @@

      Unsupervised Learning

      Probabilistic Inference

      • Data: \(\mathbf{ y}\)

      • -
• Model: \(p(\mathbf{ y}, \mathbf{ y}^*)\)
• Prediction: \(p(\mathbf{ y}^*| \mathbf{ y})\)

      Graphical Models

        -
• Represent joint distribution through conditional dependencies.
• E.g. Markov chain

\[p(\mathbf{ y}) = p(y_n| y_{n-1}) p(y_{n-1}|y_{n-2}) \dots p(y_{2} | y_{1})\]
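A sketch of how this factorisation could be evaluated in code, assuming a hypothetical function transition(current, previous) that returns \(p(y_n|y_{n-1})\):

import numpy as np

def markov_chain_log_prob(y, transition):
    # log p(y) = sum over n of log p(y_n | y_{n-1}), matching the factorisation above.
    return sum(np.log(transition(y[n], y[n - 1])) for n in range(1, len(y)))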

      - +

      -

Predict Perioperative Risk of Clostridium Difficile Infection Following Colon Surgery (Steele et al., 2012)
      @@ -988,39 +800,64 @@

      Introduction to Classification

      Classification

        -
      • Wake word classification (Global Pulse Project).

      • -
      • Breakthrough in 2012 with ImageNet result of Alex Krizhevsky, Ilya Sutskever and Geoff Hinton

      • -
      • We are given a data set containing ‘inputs’, \(\mathbf{X}\) and ‘targets’, \(\mathbf{ y}\).

      • -
      • Each data point consists of an input vector \(\mathbf{ x}_i\) and a class label, \(y_i\).

      • -
      • For binary classification assume \(y_i\) should be either \(1\) (yes) or \(-1\) (no).

      • +
      • Wake word classification (Global Pulse +Project).

      • +
      • Breakthrough in 2012 with ImageNet result of Alex +Krizhevsky, Ilya Sutskever and Geoff Hinton

      • +
      • We are given a data set containing ‘inputs’, \(\mathbf{X}\) and ‘targets’, \(\mathbf{ y}\).

      • +
      • Each data point consists of an input vector \(\mathbf{ x}_i\) and a class label, \(y_i\).

      • +
      • For binary classification assume \(y_i\) should be either \(1\) (yes) or \(-1\) (no).

      • Input vector can be thought of as features.

      Discrete Probability

        -
      • Algorithms based on prediction function and objective function.
      • -
      • For regression the codomain of the functions, \(f(\mathbf{X})\) was the real numbers or sometimes real vectors.
      • -
      • In classification we are given an input vector, \(\mathbf{ x}\), and an associated label, \(y\) which either takes the value \(-1\) or \(1\).
      • +
      • Algorithms based on prediction function and +objective function.
      • +
      • For regression the codomain of the functions, \(f(\mathbf{X})\) was the real numbers or +sometimes real vectors.
      • +
      • In classification we are given an input vector, \(\mathbf{ x}\), and an associated label, +\(y\) which either takes the value +\(-1\) or \(1\).

      Classification

        -
      • Inputs, \(\mathbf{ x}\), mapped to a label, \(y\), through a function \(f(\cdot)\) dependent on parameters, \(\mathbf{ w}\), \[ +
      • Inputs, \(\mathbf{ x}\), mapped to +a label, \(y\), through a function +\(f(\cdot)\) dependent on parameters, +\(\mathbf{ w}\), \[ y= f(\mathbf{ x}; \mathbf{ w}). \]
      • -
      • \(f(\cdot)\) is known as the prediction function.
      • +
      • \(f(\cdot)\) is known as the +prediction function.

      Classification Examples

        -
      • Classifiying hand written digits from binary images (automatic zip code reading)
      • +
      • Classifiying hand written digits from binary images (automatic zip +code reading)
      • Detecting faces in images (e.g. digital cameras).
      • Who a detected face belongs to (e.g. Facebook, DeepFace)
      • Classifying type of cancer given gene expression data.
      • -
      • Categorization of document types (different types of news article on the internet)
      • +
      • Categorization of document types (different types of news article on +the internet)
      @@ -1029,16 +866,22 @@

      Reminder on the Term “Bayesian”

    • We use Bayes’ rule to invert probabilities in the Bayesian approach.
      • Bayesian is not named after Bayes’ rule (v. common confusion).
      • -
      • The term Bayesian refers to the treatment of the parameters as stochastic variables.
      • -
      • Proposed by Laplace (1774) and Bayes (1763) independently.
      • -
      • For early statisticians this was very controversial (Fisher et al).
      • +
      • The term Bayesian refers to the treatment of the parameters as +stochastic variables.
      • +
      • Proposed by Laplace (1774) and Bayes (1763) +independently.
      • +
      • For early statisticians this was very controversial (Fisher et +al).
    • Reminder on the Term “Bayesian”

        -
      • The use of Bayes’ rule does not imply you are being Bayesian. +
      • The use of Bayes’ rule does not imply you are being +Bayesian.
        • It is just an application of the product rule of probability.
      • @@ -1047,27 +890,37 @@

        Reminder on the Term “Bayesian”

        Bernoulli Distribution

          -
        • Binary classification: need a probability distribution for discrete variables.
        • -
        • Discrete probability is in some ways easier: \(P(y=1) = \pi\) & specify distribution as a table.
        • -
        • Instead of \(y=-1\) for negative class we take \(y=0\).
        • +
        • Binary classification: need a probability distribution for discrete +variables.
        • +
        • Discrete probability is in some ways easier: \(P(y=1) = \pi\) & specify distribution +as a table.
        • +
        • Instead of \(y=-1\) for negative +class we take \(y=0\).
        - + - - - + + +
\(y\)       0            1
\(P(y)\)    \((1-\pi)\)  \(\pi\)
        -

        This is the Bernoulli distribution.

        +

        This is the Bernoulli +distribution.

        Mathematical Switch

        @@ -1075,47 +928,63 @@

        Mathematical Switch

      • The Bernoulli distribution \[ P(y) = \pi^y(1-\pi)^{(1-y)} \]

      • -
      • Is a clever trick for switching probabilities, as code it would be

      • +
      • Is a clever trick for switching probabilities, as code it would +be

      -
def bernoulli(y_i, pi):
    # Probability of a single Bernoulli observation: pi if y_i is 1, 1 - pi if y_i is 0.
    if y_i == 1:
        return pi
    else:
        return 1-pi

      Jacob Bernoulli’s Bernoulli

        -
• Bernoulli described the Bernoulli distribution in terms of an ‘urn’ filled with balls.
• There are red and black balls. There is a fixed number of balls in the urn.
• The portion of red balls is given by \(\pi\).
• For this reason in Bernoulli’s distribution there is epistemic uncertainty about the distribution parameter.

      -

      +
      + +

      Jacob Bernoulli’s Bernoulli

      - +

      Thomas Bayes’s Bernoulli

        -
• Bayes described the Bernoulli distribution (he didn’t call it that!) in terms of a table and two balls.
• Each ball is rolled so it comes to rest at a uniform distribution across the table.
• The first ball comes to rest at a position that is \(\pi\) times the width of the table.
• After placing the first ball you consider whether a second would land to the left or the right.
• For this reason in Bayes’s distribution there is considered to be aleatoric uncertainty about the distribution parameter.
      @@ -1123,83 +992,59 @@

      Thomas Bayes’ Bernoulli

      -

      - +

      Maximum Likelihood in the Bernoulli

        -
• Assume data, \(\mathbf{ y}\), is a binary vector of length \(n\).
• Assume each value was sampled independently from the Bernoulli distribution, given probability \(\pi\) \[ p(\mathbf{ y}|\pi) = \prod_{i=1}^{n} \pi^{y_i} (1-\pi)^{1-y_i}. \]
      @@ -1207,34 +1052,48 @@

      Maximum Likelihood in the Bernoulli

      Negative Log Likelihood

        -
• Minimize the negative log likelihood \[\begin{align*} E(\pi)& = -\log p(\mathbf{ y}|\pi)\\ & = -\sum_{i=1}^{n} y_i \log \pi - \sum_{i=1}^{n} (1-y_i) \log(1-\pi), \end{align*}\]
• Take gradient with respect to the parameter \(\pi\). \[\frac{\text{d}E(\pi)}{\text{d}\pi} = -\frac{\sum_{i=1}^{n} y_i}{\pi} + \frac{\sum_{i=1}^{n} (1-y_i)}{1-\pi},\]
      Fixed Point

        -
      • Stationary point: set derivative to zero \[0 = -\frac{\sum_{i=1}^{n} y_i}{\pi} + \frac{\sum_{i=1}^{n} (1-y_i)}{1-\pi},\]

      • -
      • Rearrange to form \[(1-\pi)\sum_{i=1}^{n} y_i = \pi\sum_{i=1}^{n} (1-y_i),\]

      • -
      • Giving \[\sum_{i=1}^{n} y_i = \pi\left(\sum_{i=1}^{n} (1-y_i) + \sum_{i=1}^{n} y_i\right),\]

      • +
      • Stationary point: set derivative to zero \[0 = -\frac{\sum_{i=1}^{n} y_i}{\pi} + +\frac{\sum_{i=1}^{n} (1-y_i)}{1-\pi},\]

      • +
      • Rearrange to form \[(1-\pi)\sum_{i=1}^{n} y_i = \pi\sum_{i=1}^{n} +(1-y_i),\]

      • +
      • Giving \[\sum_{i=1}^{n} y_i += \pi\left(\sum_{i=1}^{n} (1-y_i) + \sum_{i=1}^{n} +y_i\right),\]

      Solution

      • Recognise that \(\sum_{i=1}^{n} (1-y_i) + \sum_{i=1}^{n} y_i = n\) so we have \[\pi = \frac{\sum_{i=1}^{n} y_i}{n}\]
      • Estimate the probability associated with the Bernoulli by setting it to the number of observed positives, divided by the total length of \(y\).
      • Makes intuitive sense.
      • What’s your best guess of the probability that a coin toss comes up heads when you get 47 heads from 100 tosses? (See the sketch below.)
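      A minimal numpy sketch of this estimate (not part of the original slides); it uses the 47-heads-in-100-tosses example quoted above.

      import numpy as np

      # Toy data: 47 heads in 100 tosses, encoded as a binary vector.
      y = np.zeros(100)
      y[:47] = 1

      # Maximum likelihood estimate: fraction of positive outcomes.
      pi_ml = y.sum() / y.size
      print(pi_ml)  # 0.47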

      Exercise 4


      Show that the maximum likelihood solution we have found is a minimum for our objective.


      Bayes’ Rule Reminder

      \[
      p(\boldsymbol{ \theta}|\mathbf{ y}) = \frac{p(\mathbf{ y}|\boldsymbol{ \theta})p(\boldsymbol{ \theta})}{p(\mathbf{ y})}
      \]

      Bayes’ Rule Reminder

      Naive Bayes Classifiers

      • Probabilistic Machine Learning: place probability distributions (or densities) over all the variables of interest.
      • In naive Bayes this is exactly what we do.
      • Form a classification algorithm by modelling the joint density of our observations.
      • Need to make an assumption about the joint density.

      Assumptions about Density

      • Make assumptions to reduce the number of parameters we need to optimise.
      • Given label data \(\mathbf{ y}\) and the inputs \(\mathbf{X}\) could specify joint density of all potential values of \(\mathbf{ y}\) and \(\mathbf{X}\), \(p(\mathbf{ y}, \mathbf{X})\).
      • If \(\mathbf{X}\) and \(\mathbf{ y}\) are training data.
      • If \(\mathbf{ x}^*\) is a test input and \(y^*\) a test location we want \[ p(y^*|\mathbf{X}, \mathbf{ y}, \mathbf{ x}^*), \]

      Assumptions about Density

      Answer from Rules of Probability

      • Compute this distribution using the product and sum rules.
      • Need the probability associated with all possible combinations of \(\mathbf{ y}\) and \(\mathbf{X}\).
      • There are \(2^{n}\) possible combinations for the vector \(\mathbf{ y}\).
      • Probability for each of these combinations must be jointly specified along with the joint density of the matrix \(\mathbf{X}\).
      • Also need to extend the density for any chosen test location \(\mathbf{ x}^*\).

      Naive Bayes Assumptions

      • In naive Bayes we make certain simplifying assumptions that allow us to perform all of the above in practice.
      1. Data Conditional Independence
      2. Feature Conditional Independence

        Naive Bayes Assumptions

        Data Conditional Independence

        • Given model parameters \(\boldsymbol{ \theta}\) we assume that all data points in the model are independent. \[ p(y^*, \mathbf{ x}^*, \mathbf{ y}, \mathbf{X}|\boldsymbol{ \theta}) = p(y^*, \mathbf{ x}^*|\boldsymbol{ \theta})\prod_{i=1}^{n} p(y_i, \mathbf{ x}_i | \boldsymbol{ \theta}). \]
        • This is a conditional independence assumption.
        • We also make similar assumptions for regression (where \(\boldsymbol{ \theta}= \left\{\mathbf{ w},\sigma^2\right\}\)).
        • Here we assume the joint density of \(\mathbf{ y}\) and \(\mathbf{X}\) is independent across the data given the parameters.

        Bayes Classifier

        Computing the posterior distribution in this case becomes easier; this is known as the ‘Bayes classifier’.

        Feature Conditional Independence

        • Particular to naive Bayes: assume features are also conditionally independent, given the parameters and the label. \[p(\mathbf{ x}_i | y_i, \boldsymbol{ \theta}) = \prod_{j=1}^{p} p(x_{i,j}|y_i,\boldsymbol{ \theta})\] where \(p\) is the dimensionality of our inputs.
        • This is known as the naive Bayes assumption.
        • Bayes classifier + feature conditional independence.

        Feature Conditional Independence

        Marginal Density for \(y_i\)

        • To specify the joint distribution we also need the marginal for \(p(y_i)\) \[p(x_{i,j},y_i| \boldsymbol{ \theta}) = p(x_{i,j}|y_i, \boldsymbol{ \theta})p(y_i).\]
        • Because \(y_i\) is binary the Bernoulli density makes a suitable choice for our prior over \(y_i\), \[p(y_i|\pi) = \pi^{y_i} (1-\pi)^{1-y_i}\] where \(\pi\) now has the interpretation as being the prior probability that the classification should be positive.

        Joint Density for Naive Bayes

        • This allows us to write down the full joint density of the training data, \[ p(\mathbf{ y}, \mathbf{X}|\boldsymbol{ \theta}, \pi) = \prod_{i=1}^{n} \prod_{j=1}^{p} p(x_{i,j}|y_i, \boldsymbol{ \theta})p(y_i|\pi) \] which can now be fit by maximum likelihood.

        Objective Function

        \[\begin{align*}
        E(\boldsymbol{ \theta}, \pi)& = -\log p(\mathbf{ y}, \mathbf{X}|\boldsymbol{ \theta}, \pi) \\
        &= -\sum_{i=1}^{n} \sum_{j=1}^{p} \log p(x_{i, j}|y_i, \boldsymbol{ \theta}) - \sum_{i=1}^{n} \log p(y_i|\pi),
        \end{align*}\]


        Maximum Likelihood

        Fit Prior

        • We can minimize the prior term. For the Bernoulli likelihood over the labels we have, \[\begin{align*} E(\pi) & = - \sum_{i=1}^{n}\log p(y_i|\pi)\\ & = -\sum_{i=1}^{n} y_i \log \pi - \sum_{i=1}^{n} (1-y_i) \log (1-\pi) \end{align*}\]
        • Solution from above is \[ \pi = \frac{\sum_{i=1}^{n} y_i}{n}. \]

          Fit Prior

          Fit Conditional

        • Minimize conditional distribution: \[ E(\boldsymbol{ \theta}) = -\sum_{i=1}^{n} \sum_{j=1}^{p} \log p(x_{i, j} |y_i, \boldsymbol{ \theta}), \]
        • Implies making an assumption about its form.
        • The right assumption will depend on the data.
        • E.g. for real valued data, use a Gaussian \[ p(x_{i, j} | y_i,\boldsymbol{ \theta}) = \frac{1}{\sqrt{2\pi \sigma_{y_i,j}^2}} \exp \left(-\frac{(x_{i,j} - \mu_{y_i, j})^2}{2\sigma_{y_i,j}^2}\right), \]

        Nigeria NMIS Data Classification

        The distributions show the parameters of the independent class conditional probabilities for no maternity services. It is a Bernoulli distribution with the parameter, \(\pi\), given by (theta_0) for the facilities without maternity services and theta_1 for the facilities with maternity services. The parameters show that facilities with maternity services are also more likely to have other services such as grid electricity, emergency transport, immunization programs etc.

        The naive Bayes assumption says that the joint probability for these services is given by the product of each of these Bernoulli distributions.

        We have modelled the numbers in our table with a Gaussian density. Since several of these numbers are counts, a more appropriate distribution might be the Poisson distribution. But here we can see that the average number of nurses, health workers and doctors is higher in the facilities with maternal services (mu_1) than those without maternal services (mu_0). There is also a small difference between the mean latitude and longitudes. However, the standard deviation, which would be given by the square root of the variance parameters (sigma_0 and sigma_1), is large, implying that a difference in latitude and longitude may be due to sampling error. To be sure, more analysis would be required.
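        The following sketch (added for illustration, using synthetic binary data rather than the NMIS table) shows one way to fit the naive Bayes parameters \(\pi\) and the per-class Bernoulli probabilities by maximum likelihood; the function and variable names are illustrative only.

        import numpy as np

        def fit_naive_bayes_bernoulli(X, y, eps=1e-9):
            """Fit naive Bayes with Bernoulli class-conditionals.

            X: (n, p) binary feature matrix, y: (n,) binary labels.
            Returns prior pi and per-class feature probabilities theta[c, j].
            """
            pi = y.mean()                      # prior probability of the positive class
            theta = np.zeros((2, X.shape[1]))
            for c in (0, 1):
                theta[c] = X[y == c].mean(axis=0)   # MLE of each Bernoulli parameter
            return pi, np.clip(theta, eps, 1 - eps)

        # Tiny synthetic example with three binary features.
        rng = np.random.default_rng(0)
        y = rng.integers(0, 2, size=200)
        X = (rng.random((200, 3)) < np.where(y[:, None] == 1, 0.8, 0.3)).astype(float)
        pi, theta = fit_naive_bayes_bernoulli(X, y)
        print(pi, theta)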

        Compute Posterior for Test Point Label

        • We know that \[ P(y^*| \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*, \boldsymbol{ \theta})p(\mathbf{ y},\mathbf{X}, \mathbf{ x}^*|\boldsymbol{ \theta}) = p(y^*, \mathbf{ y}, \mathbf{X},\mathbf{ x}^*| \boldsymbol{ \theta}) \]
        • This implies \[ P(y^*| \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*, \boldsymbol{ \theta}) = \frac{p(y^*, \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*| \boldsymbol{ \theta})}{p(\mathbf{ y}, \mathbf{X}, \mathbf{ x}^*|\boldsymbol{ \theta})} \]

        Compute Posterior for Test Point Label

        • From conditional independence assumptions \[ p(y^*, \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*| \boldsymbol{ \theta}) = \prod_{j=1}^{p} p(x^*_{j}|y^*, \boldsymbol{ \theta})p(y^*|\pi)\prod_{i=1}^{n} \prod_{j=1}^{p} p(x_{i,j}|y_i, \boldsymbol{ \theta})p(y_i|\pi) \]
        • We also need \[ p(\mathbf{ y}, \mathbf{X}, \mathbf{ x}^*|\boldsymbol{ \theta}) \] which can be found from \[ p(y^*, \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*| \boldsymbol{ \theta}) \]
        • Using the sum rule of probability, \[ p(\mathbf{ y}, \mathbf{X}, \mathbf{ x}^*|\boldsymbol{ \theta}) = \sum_{y^*=0}^1 p(y^*, \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*| \boldsymbol{ \theta}). \]

        Compute Posterior for Test Point Label

        Independence Assumptions

        • From independence assumptions \[ p(\mathbf{ y}, \mathbf{X}, \mathbf{ x}^*| \boldsymbol{ \theta}) = \sum_{y^*=0}^1 \prod_{j=1}^{p} p(x^*_{j}|y^*, \boldsymbol{ \theta})p(y^*|\pi)\prod_{i=1}^{n} \prod_{j=1}^{p} p(x_{i,j}|y_i, \boldsymbol{ \theta})p(y_i|\pi). \]
        • Substitute both forms to recover, \[ P(y^*| \mathbf{ y}, \mathbf{X}, \mathbf{ x}^*, \boldsymbol{ \theta}) = \frac{\prod_{j=1}^{p} p(x^*_{j}|y^*, \boldsymbol{ \theta})p(y^*|\pi)\prod_{i=1}^{n} \prod_{j=1}^{p} p(x_{i,j}|y_i, \boldsymbol{ \theta})p(y_i|\pi)}{\sum_{y^*=0}^1 \prod_{j=1}^{p} p(x^*_{j}|y^*, \boldsymbol{ \theta})p(y^*|\pi)\prod_{i=1}^{n} \prod_{j=1}^{p} p(x_{i,j}|y_i, \boldsymbol{ \theta})p(y_i|\pi)} \]

        Independence Assumptions

        Cancelation

        • Note training data terms cancel. \[ p(y^*| \mathbf{ x}^*, \boldsymbol{ \theta}) = \frac{\prod_{j=1}^{p} p(x^*_{j}|y^*, \boldsymbol{ \theta})p(y^*|\pi)}{\sum_{y^*=0}^1 \prod_{j=1}^{p} p(x^*_{j}|y^*, \boldsymbol{ \theta})p(y^*|\pi)} \]
        • This formula is also fairly straightforward to implement for different class conditional distributions (see the sketch below).
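        An illustrative sketch of that prediction formula, assuming the Bernoulli class-conditionals fitted above; the parameter values here are hypothetical.

        import numpy as np

        def predict_naive_bayes_bernoulli(x_star, pi, theta):
            """Posterior p(y*=1 | x*): products of p(x*_j | y*) times p(y*), normalised over y*."""
            priors = np.array([1 - pi, pi])
            likelihoods = np.array([
                np.prod(theta[c] ** x_star * (1 - theta[c]) ** (1 - x_star))
                for c in (0, 1)
            ])
            joint = likelihoods * priors
            return joint[1] / joint.sum()

        # Hypothetical fitted parameters for three binary features.
        pi, theta = 0.4, np.array([[0.2, 0.3, 0.1], [0.7, 0.8, 0.6]])
        print(predict_naive_bayes_bernoulli(np.array([1.0, 1.0, 0.0]), pi, theta))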

        Laplace Smoothing


        Pseudo Counts


        Pseudo Counts

        \[ \pi = \frac{\sum_{i=1}^{n} y_i + 1}{n+ 2} \]

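        A small sketch (added for illustration) contrasting the maximum likelihood estimate with the pseudo-count estimate above.

        import numpy as np

        y = np.zeros(100)
        y[:47] = 1

        pi_ml = y.sum() / y.size                   # plain maximum likelihood
        pi_pseudo = (y.sum() + 1) / (y.size + 2)   # pseudo counts: one extra of each class
        print(pi_ml, pi_pseudo)                    # 0.47 vs approximately 0.4706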

        Exercise 5


        How can you improve your classification? Are all the features equally valid? Are some features more helpful than others? What happens if you remove features that appear to be less helpful? How might you select such features?


        Exercise 6


        We have decided to classify positive if the probability of maternity is greater than 0.5. This has led us to accidentally classify some facilities as not having facilities for maternity when in fact they do. Imagine you wish to ensure that a facility handles maternity. With your test set, how low do you have to set the threshold to avoid all the false negatives (i.e. facilities where you predicted there was no maternity, but in actuality there was)?


        Naive Bayes Summary

        • Model full joint distribution of data, \(p(\mathbf{ y}, \mathbf{X}| \boldsymbol{ \theta}, \pi)\)
        • Make conditional independence assumptions about the data.
          • feature conditional independence
          • data conditional independence

            Naive Bayes Summary

            Other Reading

            • Chapter 5 of Rogers and Girolami (2011) up to pg 179 (Section 5.1, and 5.2 up to 5.2.2).

            References

            Thanks!

            Bayes, T., 1763. An essay towards solving a problem in the doctrine of chances. Philosophical Transactions of the Royal Society 53, 370–418. https://doi.org/10.1098/rstl.1763.0053

            Bishop, C.M., 2006. Pattern recognition and machine learning. Springer.

            Laplace, P.S., 1774. Mémoire sur la probabilité des causes par les évènemens, in: Mémoires de Mathèmatique et de Physique, Presentés à l’Académie Royale Des Sciences, Par Divers Savans, & Lù Dans Ses Assemblées 6. pp. 621–656.

            Rogers, S., Girolami, M., 2011. A first course in machine learning. CRC Press.

            Steele, S., Bilchik, A., Eberhardt, J., Kalina, P., Nissan, A., Johnson, E., Avital, I., Stojadinovic, A., 2012. Using machine-learned Bayesian belief networks to predict perioperative risk of clostridium difficile infection following colon surgery. Interact J Med Res 1, e6. https://doi.org/10.2196/ijmr.2131

diff --git a/slides/04-gaussian-processes.slides.html b/slides/04-gaussian-processes.slides.html
index 407ff64..bbd48d8 100644
--- a/slides/04-gaussian-processes.slides.html
+++ b/slides/04-gaussian-processes.slides.html

            \(y= mx+ c\)

            point 1: \(x= 1\), \(y=3\) \[ 3 = m + c \]

            point 2: \(x= 3\), \(y=1\) \[ 1 = 3m + c \]

            point 3: \(x= 2\), \(y=2.5\) \[ 2.5 = 2m + c \]
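            A short numerical check (not part of the original slides): the first two points determine \(m\) and \(c\) exactly, and the third point is then inconsistent, which motivates the noise term introduced below.

            import numpy as np

            # Points 1 and 2 give two equations in the two unknowns m and c.
            A = np.array([[1.0, 1.0],    # 3 = 1*m + c
                          [3.0, 1.0]])   # 1 = 3*m + c
            b = np.array([3.0, 1.0])
            m, c = np.linalg.solve(A, b)
            print(m, c)                  # m = -1, c = 4

            # Point 3 does not fit that solution: 2*m + c = 2, not 2.5.
            print(2 * m + c)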

            Pierre-Simon Laplace


            Pierre-Simon Laplace 1749-1827.

            Laplace’s determinism in English translation.

            Laplace’s Gremlin


            Latent Variables

            \(y= mx+ c + \epsilon\)

            point 1: \(x= 1\), \(y=3\) \[ 3 = m + c + \epsilon_1 \]

            point 2: \(x= 3\), \(y=1\) \[ 1 = 3m + c + \epsilon_2 \]

            point 3: \(x= 2\), \(y=2.5\) \[ 2.5 = 2m + c + \epsilon_3 \]

            A Probabilistic Process

            Set the mean of the Gaussian to be a function. \[ p\left(y_i|x_i\right)=\frac{1}{\sqrt{2\pi\sigma^2}}\exp \left(-\frac{\left(y_i-f\left(x_i\right)\right)^{2}}{2\sigma^2}\right). \]


            Two Important Gaussian Properties

            Sum of Gaussians

            Sum of Gaussian variables is also Gaussian.

            \[y_i \sim \mathcal{N}\left(\mu_i,\sigma_i^2\right)\]

            And the sum is distributed as

            \[
            \sum_{i=1}^{n} y_i \sim \mathcal{N}\left(\sum_{i=1}^n\mu_i,\sum_{i=1}^n\sigma_i^2\right)
            \]

            (Aside: As sum increases, sum of non-Gaussian, finite variance variables is also Gaussian because of central limit theorem.)

            Scaling a Gaussian

            Scaling a Gaussian leads to a Gaussian.

            \[y\sim \mathcal{N}\left(\mu,\sigma^2\right)\]

            And the scaled variable is distributed as

            \[wy\sim \mathcal{N}\left(w\mu,w^2 \sigma^2\right).\]
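            A quick empirical check of these two properties with numpy (illustrative only; the parameter values are arbitrary).

            import numpy as np

            rng = np.random.default_rng(0)
            mu, sigma2, w, n = 1.0, 4.0, 3.0, 5

            # Sum of independent Gaussians: mean n*mu, variance n*sigma2.
            samples = rng.normal(mu, np.sqrt(sigma2), size=(100000, n)).sum(axis=1)
            print(samples.mean(), samples.var())   # close to 5 and 20

            # Scaling: w*y has mean w*mu and variance w^2*sigma2.
            scaled = w * rng.normal(mu, np.sqrt(sigma2), size=100000)
            print(scaled.mean(), scaled.var())     # close to 3 and 36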


            Multivariate Gaussian Properties

          • If \[ \mathbf{ y}= \mathbf{W}\mathbf{ x}+ \boldsymbol{ \epsilon} \]
          • Assume \[ \begin{align} \mathbf{ x}& \sim \mathcal{N}\left(\boldsymbol{ \mu},\mathbf{C}\right)\\ \boldsymbol{ \epsilon}& \sim \mathcal{N}\left(\mathbf{0},\boldsymbol{ \Sigma}\right) \end{align} \]
          • Then \[ \mathbf{ y}\sim \mathcal{N}\left(\mathbf{W}\boldsymbol{ \mu},\mathbf{W}\mathbf{C}\mathbf{W}^\top + \boldsymbol{ \Sigma}\right). \] If \(\boldsymbol{ \Sigma}=\sigma^2\mathbf{I}\), this is Probabilistic PCA (Tipping and Bishop, 1999).

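            A sketch verifying the property numerically; the matrices below are arbitrary choices, not from the original slides.

            import numpy as np

            rng = np.random.default_rng(1)
            W = np.array([[1.0, 2.0], [0.5, -1.0], [2.0, 0.0]])   # arbitrary 3x2 linear map
            C = np.array([[1.0, 0.3], [0.3, 0.5]])                # covariance of x
            Sigma = 0.1 * np.eye(3)                               # noise covariance

            x = rng.multivariate_normal(np.zeros(2), C, size=100000)
            eps = rng.multivariate_normal(np.zeros(3), Sigma, size=100000)
            y = x @ W.T + eps

            print(np.cov(y.T))             # empirical covariance of y
            print(W @ C @ W.T + Sigma)     # matches W C W^T + Sigma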

            Objective Optimization


            Multivariate Derivatives

            • We will need some multivariate calculus.
            • For now some simple multivariate differentiation: \[\frac{\text{d}{\mathbf{a}^{\top}}{\mathbf{ w}}}{\text{d}\mathbf{ w}}=\mathbf{a}\] and \[\frac{\text{d}\mathbf{ w}^{\top}\mathbf{A}\mathbf{ w}}{\text{d}\mathbf{ w}}=\left(\mathbf{A}+\mathbf{A}^{\top}\right)\mathbf{ w}\] or if \(\mathbf{A}\) is symmetric (i.e. \(\mathbf{A}=\mathbf{A}^{\top}\)) \[\frac{\text{d}\mathbf{ w}^{\top}\mathbf{A}\mathbf{ w}}{\text{d}\mathbf{ w}}=2\mathbf{A}\mathbf{ w}.\]

            Differentiate the Objective

            Differentiating with respect to the vector \(\mathbf{ w}\) we obtain

            \[
            \frac{\partial L\left(\mathbf{ w},\sigma^2 \right)}{\partial \mathbf{ w}}=\frac{1}{\sigma^2} \sum _{i=1}^{n}\mathbf{ x}_i y_i-\frac{1}{\sigma^2} \left[\sum _{i=1}^{n}\mathbf{ x}_i\mathbf{ x}_i^{\top}\right]\mathbf{ w}
            \] Leading to \[
            \mathbf{ w}^{*}=\left[\sum _{i=1}^{n}\mathbf{ x}_i\mathbf{ x}_i^{\top}\right]^{-1}\sum _{i=1}^{n}\mathbf{ x}_iy_i,
            \]


            Differentiate the Objective

            Rewrite in matrix notation: \[ \sum_{i=1}^{n}\mathbf{ x}_i\mathbf{ x}_i^\top = \mathbf{X}^\top \mathbf{X} \] \[ \sum_{i=1}^{n}\mathbf{ x}_iy_i = \mathbf{X}^\top \mathbf{ y} \]


            Update Equation for Global Optimum


            Update Equations

            • Solve the matrix equation for \(\mathbf{ w}\) (see the sketch below). \[\mathbf{X}^\top \mathbf{X}\mathbf{ w}= \mathbf{X}^\top \mathbf{ y}\]
            • The equation for \(\left.\sigma^2\right.^{*}\) may also be found \[\left.\sigma^2\right.^{{*}}=\frac{\sum_{i=1}^{n}\left(y_i-\left.\mathbf{ w}^{*}\right.^{\top}\mathbf{ x}_i\right)^{2}}{n}.\]
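            A minimal sketch of these update equations on synthetic data; solving the linear system is preferred to forming the inverse explicitly.

            import numpy as np

            rng = np.random.default_rng(2)
            n, d = 200, 3
            X = rng.normal(size=(n, d))
            w_true = np.array([0.5, -1.0, 2.0])
            y = X @ w_true + rng.normal(scale=0.1, size=n)

            # Solve X^T X w = X^T y for the global optimum.
            w_star = np.linalg.solve(X.T @ X, X.T @ y)
            sigma2_star = np.mean((y - X @ w_star) ** 2)
            print(w_star, sigma2_star)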

            Movie Body Count Data

            • Data containing movie information (year, length, rating, genre, IMDB Rating).

            Multivariate Regression on Movie Body Count Data

            • Regress from features Year, Body_Count, Length_Minutes to IMDB_Rating (see the sketch below).
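            A sketch of that regression. The table below is a synthetic stand-in with the column names quoted on the slide; the real movie data is not reproduced here.

            import numpy as np
            import pandas as pd

            # Synthetic placeholder rows, same column names as the slide.
            rng = np.random.default_rng(4)
            movies = pd.DataFrame({
                "Year": rng.integers(1970, 2010, size=50),
                "Body_Count": rng.poisson(40, size=50),
                "Length_Minutes": rng.integers(80, 180, size=50),
                "IMDB_Rating": rng.uniform(4, 9, size=50),
            })

            features = ["Year", "Body_Count", "Length_Minutes"]
            X = np.hstack([np.ones((len(movies), 1)), movies[features].to_numpy(dtype=float)])
            y = movies["IMDB_Rating"].to_numpy(dtype=float)

            w = np.linalg.solve(X.T @ X, X.T @ y)
            print(dict(zip(["bias"] + features, w.round(4))))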

            Residuals


            Underdetermined System

            Underdetermined System

            • What about two unknowns and one observation? \[y_1 = mx_1 + c\]

            Can compute \(m\) given \(c\). \[m = \frac{y_1 - c}{x_1}\]


            Underdetermined System


            Two Dimensional Gaussian

              • Consider height, \(h/m\) and weight, \(w/kg\).
              • Could sample height from a distribution: \[ p(h) \sim \mathcal{N}\left(1.7,0.0225\right). \]
              • And similarly weight: \[ p(w) \sim \mathcal{N}\left(75,36\right). \]

              Two Dimensional Gaussian

              Height and Weight Models


              Sampling Two Dimensional Variables


              Body Mass Index

              • In reality they are dependent (body mass index) \(= \frac{w}{h^2}\).
              • To deal with this dependence we introduce correlated multivariate Gaussians.

              Sampling Two Dimensional Variables


              Independent Gaussians

              Independent Gaussians

              \[
              p(w, h) = \frac{1}{\sqrt{2\pi \sigma_1^2}\sqrt{2\pi\sigma_2^2}} \exp\left(-\frac{1}{2}\left(\frac{(w-\mu_1)^2}{\sigma_1^2} + \frac{(h-\mu_2)^2}{\sigma_2^2}\right)\right)
              \]

              Independent Gaussians

              \[
              p(w, h) = \frac{1}{\sqrt{2\pi\sigma_1^22\pi\sigma_2^2}} \exp\left(-\frac{1}{2}\left(\begin{bmatrix}w \\ h\end{bmatrix} - \begin{bmatrix}\mu_1 \\ \mu_2\end{bmatrix}\right)^\top\begin{bmatrix}\sigma_1^2& 0\\0&\sigma_2^2\end{bmatrix}^{-1}\left(\begin{bmatrix}w \\ h\end{bmatrix} - \begin{bmatrix}\mu_1 \\ \mu_2\end{bmatrix}\right)\right)
              \]

              Independent Gaussians

              \[
              p(\mathbf{ y}) = \frac{1}{\det{2\pi \mathbf{D}}^{\frac{1}{2}}} \exp\left(-\frac{1}{2}(\mathbf{ y}- \boldsymbol{ \mu})^\top\mathbf{D}^{-1}(\mathbf{ y}- \boldsymbol{ \mu})\right)
              \]

              Correlated Gaussian

              Form correlated from original by rotating the data space using matrix \(\mathbf{R}\).

              \[
              p(\mathbf{ y}) = \frac{1}{\det{2\pi\mathbf{D}}^{\frac{1}{2}}} \exp\left(-\frac{1}{2}(\mathbf{ y}- \boldsymbol{ \mu})^\top\mathbf{D}^{-1}(\mathbf{ y}- \boldsymbol{ \mu})\right)
              \]

              Correlated Gaussian

              Form correlated from original by rotating the data space using matrix \(\mathbf{R}\).

              \[
              p(\mathbf{ y}) = \frac{1}{\det{2\pi\mathbf{D}}^{\frac{1}{2}}} \exp\left(-\frac{1}{2}(\mathbf{R}^\top\mathbf{ y}- \mathbf{R}^\top\boldsymbol{ \mu})^\top\mathbf{D}^{-1}(\mathbf{R}^\top\mathbf{ y}- \mathbf{R}^\top\boldsymbol{ \mu})\right)
              \]

              Correlated Gaussian

              Form correlated from original by rotating the data space using matrix \(\mathbf{R}\).

              \[
              p(\mathbf{ y}) = \frac{1}{\det{2\pi\mathbf{D}}^{\frac{1}{2}}} \exp\left(-\frac{1}{2}(\mathbf{ y}- \boldsymbol{ \mu})^\top\mathbf{R}\mathbf{D}^{-1}\mathbf{R}^\top(\mathbf{ y}- \boldsymbol{ \mu})\right)
              \] this gives a covariance matrix: \[
              \mathbf{C}^{-1} = \mathbf{R}\mathbf{D}^{-1} \mathbf{R}^\top
              \]

              Correlated Gaussian

              Form correlated from original by rotating the data space using matrix \(\mathbf{R}\).

              \[
              p(\mathbf{ y}) = \frac{1}{\det{2\pi\mathbf{C}}^{\frac{1}{2}}} \exp\left(-\frac{1}{2}(\mathbf{ y}- \boldsymbol{ \mu})^\top\mathbf{C}^{-1} (\mathbf{ y}- \boldsymbol{ \mu})\right)
              \] this gives a covariance matrix: \[
              \mathbf{C}= \mathbf{R}\mathbf{D} \mathbf{R}^\top
              \]
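              A sketch of constructing and sampling such a correlated Gaussian from a rotation \(\mathbf{R}\) and a diagonal \(\mathbf{D}\); the particular angle and variances are arbitrary choices.

              import numpy as np

              rng = np.random.default_rng(3)
              angle = np.pi / 4
              R = np.array([[np.cos(angle), -np.sin(angle)],
                            [np.sin(angle),  np.cos(angle)]])   # rotation of the data space
              D = np.diag([1.0, 0.1])                            # independent (diagonal) variances

              C = R @ D @ R.T                                    # covariance of the correlated Gaussian
              samples = rng.multivariate_normal(np.zeros(2), C, size=100000)
              print(C)
              print(np.cov(samples.T))                           # empirical check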


              Basis Functions

              Quadratic Basis

              • Basis functions can be global. E.g. quadratic basis: \[ \boldsymbol{ \phi}= [1, x, x^2] \]

              Quadratic Basis

              Matrix Valued Function

              \[
              \boldsymbol{ \Phi}(\mathbf{ x}) =
              \begin{bmatrix}
              1 & x_1 & x_1^2 \\
              1 & x_2 & x_2^2 \\
              \vdots & \vdots & \vdots \\
              1 & x_n & x_n^2
              \end{bmatrix}.
              \]

              Functions Derived from Quadratic Basis

              \[
              f(x) = {\color{cyan}{w_0}} + {\color{green}{w_1 x}} + {\color{yellow}{w_2 x^2}}
              \]


              Quadratic Functions

              \[
              f(x) = {\color{cyan}{w_0}} + {\color{green}{w_1 x}} + {\color{yellow}{w_2 x^2}}
              \]


              Rectified Linear Units


              Functions Derived from Relu Basis

              \[
              f(x) = \color{cyan}{w_0} + \color{green}{w_1 xH(x+1.0) } + \color{yellow}{w_2 xH(x+0.33) } + \color{magenta}{w_3 xH(x-0.33)} + \color{red}{w_4 xH(x-1.0)}
              \]


              Gaussian Processes

              • Basis function models give non-linear predictions.
              • Need to choose number and location of basis functions.
              • Gaussian processes are a general framework (basis functions a special case)
              • Within the framework you can consider models with infinite basis functions.

              \[ p(\mathbf{ y}|\mathbf{X}, \mathbf{ w}) = \prod_{i=1}^{n} p(y_i | \mathbf{ x}_i, \mathbf{ w}) \]

              \[
              \mathbf{ y}|\mathbf{X}\sim \mathcal{N}\left(\mathbf{m}(\mathbf{X}),\mathbf{K}(\mathbf{X})\right),
              \]


              Linear Model Overview

              \[ \phi_{i,j} = \phi(\mathbf{ w}^{(1)}_{j}, \mathbf{ x}_{i}) \] Define design matrix \[
              \boldsymbol{ \Phi}=
              \begin{bmatrix}
              \phi_{1, 1} & \phi_{1, 2} & \dots & \phi_{1, h} \\
              \phi_{2, 1} & \phi_{2, 2} & \dots & \phi_{2, h} \\
              \vdots & \vdots & \ddots & \vdots \\
              \phi_{n, 1} & \phi_{n, 2} & \dots & \phi_{n, h}
              \end{bmatrix}.
              \]


              Matrix Representation of a Neural Network

              -

              \[y\left(\mathbf{ x}\right) = \boldsymbol{ \phi}\left(\mathbf{ x}\right)^\top \mathbf{ w}+ \epsilon\]

              +

              \[y\left(\mathbf{ x}\right) = \boldsymbol{ +\phi}\left(\mathbf{ x}\right)^\top \mathbf{ w}+ \epsilon\]

              -

              \[\mathbf{ y}= \boldsymbol{ \Phi}\mathbf{ w}+ \boldsymbol{ \epsilon}\]

              +

              \[\mathbf{ y}= \boldsymbol{ \Phi}\mathbf{ +w}+ \boldsymbol{ \epsilon}\]

              -

              \[\boldsymbol{ \epsilon}\sim \mathcal{N}\left(\mathbf{0},\sigma^2\mathbf{I}\right)\]

              +

              \[\boldsymbol{ \epsilon}\sim +\mathcal{N}\left(\mathbf{0},\sigma^2\mathbf{I}\right)\]


              Multivariate Gaussian Properties

            • If \[ \mathbf{ y}= \mathbf{W}\mathbf{ x}+ \boldsymbol{ \epsilon} \]
            • Assume \[ \begin{align} \mathbf{ x}& \sim \mathcal{N}\left(\boldsymbol{ \mu},\mathbf{C}\right)\\ \boldsymbol{ \epsilon}& \sim \mathcal{N}\left(\mathbf{0},\boldsymbol{ \Sigma}\right) \end{align} \]
            • Then \[ \mathbf{ y}\sim \mathcal{N}\left(\mathbf{W}\boldsymbol{ \mu},\mathbf{W}\mathbf{C}\mathbf{W}^\top + \boldsymbol{ \Sigma}\right). \] If \(\boldsymbol{ \Sigma}=\sigma^2\mathbf{I}\), this is Probabilistic PCA (Tipping and Bishop, 1999).


              Prior Density

            • Define \[ \mathbf{ w}\sim \mathcal{N}\left(\mathbf{0},\alpha\mathbf{I}\right), \]
            • -
            • Rules of multivariate Gaussians to see that, \[ -\mathbf{ y}\sim \mathcal{N}\left(\mathbf{0},\alpha \boldsymbol{ \Phi}\boldsymbol{ \Phi}^\top + \sigma^2 \mathbf{I}\right). +
            • Rules of multivariate Gaussians to see that, \[ +\mathbf{ y}\sim \mathcal{N}\left(\mathbf{0},\alpha \boldsymbol{ +\Phi}\boldsymbol{ \Phi}^\top + \sigma^2 \mathbf{I}\right). \]
            • \[ -\mathbf{K}= \alpha \boldsymbol{ \Phi}\boldsymbol{ \Phi}^\top + \sigma^2 \mathbf{I}. +\mathbf{K}= \alpha \boldsymbol{ \Phi}\boldsymbol{ \Phi}^\top + \sigma^2 +\mathbf{I}. \]

              Joint Gaussian Density

                -
              • Elements are a function \(k_{i,j} = k\left(\mathbf{ x}_i, \mathbf{ x}_j\right)\)
              • +
              • Elements are a function \(k_{i,j} = +k\left(\mathbf{ x}_i, \mathbf{ x}_j\right)\)

              \[ -\mathbf{K}= \alpha \boldsymbol{ \Phi}\boldsymbol{ \Phi}^\top + \sigma^2 \mathbf{I}. +\mathbf{K}= \alpha \boldsymbol{ \Phi}\boldsymbol{ \Phi}^\top + \sigma^2 +\mathbf{I}. \]

              Covariance Function

              \[ -k_f\left(\mathbf{ x}_i, \mathbf{ x}_j\right) = \alpha \boldsymbol{ \phi}\left(\mathbf{W}_1, \mathbf{ x}_i\right)^\top \boldsymbol{ \phi}\left(\mathbf{W}_1, \mathbf{ x}_j\right) +k_f\left(\mathbf{ x}_i, \mathbf{ x}_j\right) = \alpha \boldsymbol{ +\phi}\left(\mathbf{W}_1, \mathbf{ x}_i\right)^\top \boldsymbol{ +\phi}\left(\mathbf{W}_1, \mathbf{ x}_j\right) \]

                -
              • formed by inner products of the rows of the design matrix.
              • +
              • formed by inner products of the rows of the design +matrix.

              Gaussian Process

                -
              • Instead of making assumptions about our density over each data point, \(y_i\) as i.i.d.

              • +
              • Instead of making assumptions about our density over each data +point, \(y_i\) as i.i.d.

              • make a joint Gaussian assumption over our data.

              • -
              • covariance matrix is now a function of both the parameters of the activation function, \(\mathbf{W}_1\), and the input variables, \(\mathbf{X}\).

              • -
              • Arises from integrating out \(\mathbf{ w}^{(2)}\).

              • +
              • covariance matrix is now a function of both the parameters of the +activation function, \(\mathbf{W}_1\), +and the input variables, \(\mathbf{X}\).

              • +
              • Arises from integrating out \(\mathbf{ +w}^{(2)}\).

              Basis Functions

              • Can be very complex, such as deep kernels (Cho and Saul, 2009), or could even put a convolutional neural network inside.
              • Viewing a neural network in this way is also what allows us to perform sensible batch normalizations (Ioffe and Szegedy, 2015).

              Radial Basis Functions

              • Basis functions can be local, e.g. radial (or Gaussian) basis (see the sketch below): \[ \phi_j(x) = \exp\left(-\frac{(x-\mu_j)^2}{\ell^2}\right) \]
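              A sketch of building a design matrix from such radial basis functions; the centres and lengthscale below are arbitrary choices.

              import numpy as np

              def radial_basis(x, centres, lengthscale=1.0):
                  """Design matrix of Gaussian bumps exp(-(x - mu_j)^2 / ell^2), matching the slide's formula."""
                  x = np.asarray(x, dtype=float).reshape(-1, 1)
                  centres = np.asarray(centres, dtype=float).reshape(1, -1)
                  return np.exp(-((x - centres) ** 2) / lengthscale ** 2)

              x = np.linspace(-2, 2, 7)
              Phi = radial_basis(x, centres=[-1.0, 0.0, 1.0], lengthscale=0.5)
              print(Phi.shape)   # (7, 3)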
              -

              +

              + + +

              - +
              - +
              - +

              Functions Derived from Radial Basis

              \[
              f(x) = \color{cyan}{w_1 e^{-2(x+1)^2}} + \color{green}{w_2e^{-2x^2}} + \color{yellow}{w_3 e^{-2(x-1)^2}}
              \]

              -

              +

              + + +

              - +
              - +
              - + @@ -1116,7 +1396,8 @@

              Function Space View

              \mathbf{ w}\sim \mathcal{N}\left(\mathbf{0},\alpha \mathbf{I}\right) \]

              \[ -\boldsymbol{ \Phi}= \begin{bmatrix}\boldsymbol{ \phi}(\mathbf{ x}_1) \\ \vdots \\ +\boldsymbol{ \Phi}= \begin{bmatrix}\boldsymbol{ \phi}(\mathbf{ x}_1) \\ +\vdots \\ \boldsymbol{ \phi}(\mathbf{ x}_n)\end{bmatrix} \]

              \[ @@ -1127,44 +1408,54 @@

              Function Space View

              \mathbf{ f}= \boldsymbol{ \Phi}\mathbf{ w}. \]

              -
              +

              \[ -\mathbf{ f}\sim \mathcal{N}\left(\mathbf{0},\alpha \boldsymbol{ \Phi}\boldsymbol{ \Phi}^\top\right). +\mathbf{ f}\sim \mathcal{N}\left(\mathbf{0},\alpha \boldsymbol{ +\Phi}\boldsymbol{ \Phi}^\top\right). \]

              -
              +

              \[ \mathbf{K}= \alpha \boldsymbol{ \Phi}\boldsymbol{ \Phi}^\top. \]

              -
              K = alpha*Phi_pred@Phi_pred.T
              +
              K = alpha*Phi_pred@Phi_pred.T
              -
              +

              -
              K = alpha*Phi_pred@Phi_pred.T
              -f_sample = np.random.multivariate_normal(mean=np.zeros(x_pred.size), cov=K)
              +
              K = alpha*Phi_pred@Phi_pred.T
              +f_sample = np.random.multivariate_normal(mean=np.zeros(x_pred.size), cov=K)
              - + +
              +

              +

              - +
              @@ -1176,37 +1467,49 @@

              \epsilon \sim \mathcal{N}\left(\mathbf{0},\sigma^2\mathbf{I}\right). \]

              \[ -\mathbf{ y}\sim \mathcal{N}\left(\mathbf{0},\boldsymbol{ \Phi}\boldsymbol{ \Phi}^\top +\sigma^2\mathbf{I}\right). +\mathbf{ y}\sim \mathcal{N}\left(\mathbf{0},\boldsymbol{ +\Phi}\boldsymbol{ \Phi}^\top +\sigma^2\mathbf{I}\right). \]

              -
              - +
              +
              -
              - +
              +

              Non-degenerate Gaussian Processes

              • This process is degenerate.
              • -
              • Covariance function is of rank at most \(h\).
              • -
              • As \(n\rightarrow \infty\), covariance matrix is not full rank.
              • -
              • Leading to \(\det{\mathbf{K}} = 0\)
              • +
              • Covariance function is of rank at most \(h\).
              • +
              • As \(n\rightarrow \infty\), +covariance matrix is not full rank.
              • +
              • Leading to \(\det{\mathbf{K}} = +0\)
              @@ -1223,21 +1526,26 @@

              Infinite Networks

              Radford Neal - +
                -
              • In ML Radford Neal (Neal, 1994) asked “what would happen if you took \(h\rightarrow \infty\)?”
              • +
              • In ML Radford Neal (Neal, 1994) asked “what would +happen if you took \(h\rightarrow +\infty\)?”
              - +
              @@ -1245,91 +1553,109 @@

              Roughly Speaking

              • Instead of \[ \begin{align*} -k_f\left(\mathbf{ x}_i, \mathbf{ x}_j\right) & = \alpha \boldsymbol{ \phi}\left(\mathbf{W}_1, \mathbf{ x}_i\right)^\top \boldsymbol{ \phi}\left(\mathbf{W}_1, \mathbf{ x}_j\right)\\ -& = \alpha \sum_k \phi\left(\mathbf{ w}^{(1)}_k, \mathbf{ x}_i\right) \phi\left(\mathbf{ w}^{(1)}_k, \mathbf{ x}_j\right) +k_f\left(\mathbf{ x}_i, \mathbf{ x}_j\right) & = \alpha \boldsymbol{ +\phi}\left(\mathbf{W}_1, \mathbf{ x}_i\right)^\top \boldsymbol{ +\phi}\left(\mathbf{W}_1, \mathbf{ x}_j\right)\\ +& = \alpha \sum_k \phi\left(\mathbf{ w}^{(1)}_k, \mathbf{ +x}_i\right) \phi\left(\mathbf{ w}^{(1)}_k, \mathbf{ x}_j\right) \end{align*} \]
              • -
              • Sample infinitely many from a prior density, \(p(\mathbf{ w}^{(1)})\), \[ -k_f\left(\mathbf{ x}_i, \mathbf{ x}_j\right) = \alpha \int \phi\left(\mathbf{ w}^{(1)}, \mathbf{ x}_i\right) \phi\left(\mathbf{ w}^{(1)}, \mathbf{ x}_j\right) p(\mathbf{ w}^{(1)}) \text{d}\mathbf{ w}^{(1)} +
              • Sample infinitely many from a prior density, \(p(\mathbf{ w}^{(1)})\), \[ +k_f\left(\mathbf{ x}_i, \mathbf{ x}_j\right) = \alpha \int +\phi\left(\mathbf{ w}^{(1)}, \mathbf{ x}_i\right) \phi\left(\mathbf{ +w}^{(1)}, \mathbf{ x}_j\right) p(\mathbf{ w}^{(1)}) \text{d}\mathbf{ +w}^{(1)} \]
              • -
              • Also applies for non-Gaussian \(p(\mathbf{ w}^{(1)})\) because of the central limit theorem.
              • +
              • Also applies for non-Gaussian \(p(\mathbf{ +w}^{(1)})\) because of the central limit theorem.

              Simple Probabilistic Program

              • If \[ -\begin{align*} -\mathbf{ w}^{(1)} & \sim p(\cdot)\\ \phi_i & = \phi\left(\mathbf{ w}^{(1)}, \mathbf{ x}_i\right), +\begin{align*} +\mathbf{ w}^{(1)} & \sim p(\cdot)\\ \phi_i & = +\phi\left(\mathbf{ w}^{(1)}, \mathbf{ x}_i\right), \end{align*} \] has finite variance.

              • -
              • Then taking number of hidden units to infinity, is also a Gaussian process.

              • +
              • Then taking number of hidden units to infinity, is also a +Gaussian process.

              Further Reading

                -
              • Chapter 2 of Neal’s thesis (Neal, 1994)

              • -
              • Rest of Neal’s thesis. (Neal, 1994)

              • -
              • David MacKay’s PhD thesis (MacKay, 1992)

              • +
              • Chapter 2 of Neal’s thesis (Neal, 1994)

              • +
              • Rest of Neal’s thesis. (Neal, 1994)

              • +
              • David MacKay’s PhD thesis (MacKay, 1992)

              Gaussian Process

              \[
              k(\mathbf{ x}, \mathbf{ x}^\prime) = \alpha \exp\left( -\frac{\left\Vert \mathbf{ x}-\mathbf{ x}^\prime\right\Vert^2}{2\ell^2}\right),
              \]

              \[
              \left\Vert\mathbf{ x}- \mathbf{ x}^\prime\right\Vert^2 = (\mathbf{ x}- \mathbf{ x}^\prime)^\top (\mathbf{ x}- \mathbf{ x}^\prime)
              \]
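              A sketch of this covariance function in numpy (the function name is illustrative, not from any particular library).

              import numpy as np

              def eq_cov(X, X2=None, alpha=1.0, lengthscale=1.0):
                  """Exponentiated quadratic covariance: alpha * exp(-||x - x'||^2 / (2 ell^2))."""
                  if X2 is None:
                      X2 = X
                  sq_dists = ((X[:, None, :] - X2[None, :, :]) ** 2).sum(-1)
                  return alpha * np.exp(-sq_dists / (2 * lengthscale ** 2))

              X = np.linspace(-1, 1, 5).reshape(-1, 1)
              K = eq_cov(X, alpha=1.0, lengthscale=0.5)
              print(K.round(3))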

              -
              -

              Exercise 1

              -

              Moving Parameters Have a play with the parameters for this covariance function (the lengthscale and the variance) and see what effects the parameters have on the types of functions you observe.

              -

              - +

              - +

              - +

              - +

              - +
              @@ -1349,183 +1675,305 @@

              Gaussian Process

              Making Predictions

              \[ -\begin{bmatrix}\mathbf{ f}\\ \mathbf{ f}^*\end{bmatrix} \sim \mathcal{N}\left(\mathbf{0},\begin{bmatrix} \mathbf{K}& \mathbf{K}_\ast \\ +\begin{bmatrix}\mathbf{ f}\\ \mathbf{ f}^*\end{bmatrix} \sim +\mathcal{N}\left(\mathbf{0},\begin{bmatrix} \mathbf{K}& +\mathbf{K}_\ast \\ \mathbf{K}_\ast^\top & \mathbf{K}_{\ast,\ast}\end{bmatrix}\right) \]

              \[ \begin{bmatrix} \mathbf{K}& \mathbf{K}_\ast \\ \mathbf{K}_\ast^\top & \mathbf{K}_{\ast,\ast}\end{bmatrix} \]

              +
              +
              + + +
              +
              + +
              +
              +

              Sampling a Function

              +

              Multi-variate Gaussians

              +
                +
              • We will consider a Gaussian with a particular structure of +covariance matrix.
              • +
              • Generate a single sample from this 25 dimensional Gaussian density, +\[ +\mathbf{ f}=\left[f_{1},f_{2}\dots f_{25}\right]. +\]
              • +
              • We will plot these points against their index.
              • +
              +
              +
              +

              Gaussian Distribution Sample

              + +

              + + +

              +
              + + +
              +
              + + +
              +
              + + +
              +
              + + +
              +
              + + +
              +
              + + +
              +
              + + +
              +
              + + +
              +
              + + +

              Sampling a Function from a Gaussian

              -

              +

              + + +

              - +
              - +
              - +
              - +
              - +
              - +
              - +
              - +
              - +
              -

              Joint Density of \(f_1\) and \(f_2\)

              +

              Joint Density of \(f_1\) and \(f_2\)

              -

              Prediction of \(f_{2}\) from \(f_{1}\)

              +

              Prediction of \(f_{2}\) from \(f_{1}\)

              -

              +

              + + +

              - +
              - +
              - +
              - + -
              -

              Prediction of \(f_{2}\) from \(f_{1}\)

              -

              * The single contour of the Gaussian density represents the joint distribution, \(p(f_1, f_2)\)

              -
              -
                -
              • We observe that \(f_1=?\)
              • -
              +
              +

              Uluru

              +
              +
              +
              + +
              -
              -
                -
              • Conditional density: \(p(f_2|f_1=?)\)
              • -
              +

              Prediction with Correlated Gaussians

• Prediction of \(f_2\) from \(f_1\) requires conditional density.
• Conditional density is also Gaussian, \[ p(f_2|f_1) = \mathcal{N}\left(f_2|\frac{k_{1, 2}}{k_{1, 1}}f_1, k_{2, 2} - \frac{k_{1,2}^2}{k_{1,1}}\right) \] where the covariance of the joint density is given by \[ \mathbf{K}= \begin{bmatrix} k_{1, 1} & k_{1, 2}\\ k_{2, 1} & k_{2, 2}\end{bmatrix} \] (a numerical sketch follows below).
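A numerical sketch of this conditional, with made-up covariance values and an assumed observation for \(f_1\):

import numpy as np

# Illustrative 2x2 covariance of the joint density p(f_1, f_2); values are assumptions
k11, k12, k22 = 1.0, 0.9, 1.0
K = np.array([[k11, k12], [k12, k22]])

f1 = -0.3  # assumed observed value for f_1

# p(f_2 | f_1) = N(k12/k11 * f1, k22 - k12^2/k11)
mean_f2 = (k12 / k11) * f1
var_f2 = k22 - k12 ** 2 / k11
print(mean_f2, var_f2)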

              Joint Density of \(f_1\) and \(f_8\)


              Prediction of \(f_{8}\) from \(f_{1}\)



              Details

• The single contour of the Gaussian density represents the joint distribution, \(p(f_1, f_8)\)
• We observe a value for \(f_1=-?\)
• Conditional density: \(p(f_8|f_1=?)\).

              Prediction with Correlated Gaussians

• Prediction of \(\mathbf{f}_*\) from \(\mathbf{f}\) requires multivariate conditional density.
• Multivariate conditional density is also Gaussian, \[ p(\mathbf{f}_*|\mathbf{f}) = \mathcal{N}\left(\mathbf{f}_*|\mathbf{K}_{*,\mathbf{f}}\mathbf{K}_{\mathbf{f},\mathbf{f}}^{-1}\mathbf{f},\mathbf{K}_{*,*}-\mathbf{K}_{*,\mathbf{f}} \mathbf{K}_{\mathbf{f},\mathbf{f}}^{-1}\mathbf{K}_{\mathbf{f},*}\right) \]
• Here the covariance of the joint density is given by \[ \mathbf{K}= \begin{bmatrix} \mathbf{K}_{\mathbf{f}, \mathbf{f}} & \mathbf{K}_{*, \mathbf{f}}\\ \mathbf{K}_{\mathbf{f}, *} & \mathbf{K}_{*, *}\end{bmatrix} \]

              Prediction with Correlated Gaussians

• Prediction of \(\mathbf{f}_*\) from \(\mathbf{f}\) requires multivariate conditional density.
• Multivariate conditional density is also Gaussian, \[ p(\mathbf{f}_*|\mathbf{f}) = \mathcal{N}\left(\mathbf{f}_*|\boldsymbol{\mu},\boldsymbol{\Sigma}\right) \] \[ \boldsymbol{\mu}= \mathbf{K}_{*,\mathbf{f}}\mathbf{K}_{\mathbf{f},\mathbf{f}}^{-1}\mathbf{f} \] \[ \boldsymbol{\Sigma}= \mathbf{K}_{*,*}-\mathbf{K}_{*,\mathbf{f}} \mathbf{K}_{\mathbf{f},\mathbf{f}}^{-1}\mathbf{K}_{\mathbf{f},*} \] (see the code sketch below).
• Here the covariance of the joint density is given by \[ \mathbf{K}= \begin{bmatrix} \mathbf{K}_{\mathbf{f}, \mathbf{f}} & \mathbf{K}_{*, \mathbf{f}}\\ \mathbf{K}_{\mathbf{f}, *} & \mathbf{K}_{*, *}\end{bmatrix} \]
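The code sketch below is a direct NumPy transcription of these expressions; it assumes the blocks K_ff (\(\mathbf{K}_{\mathbf{f},\mathbf{f}}\)), K_sf (\(\mathbf{K}_{*,\mathbf{f}}\)) and K_ss (\(\mathbf{K}_{*,*}\)) have already been built from a covariance function, and uses a linear solve rather than an explicit inverse:

import numpy as np

def gp_conditional(K_ff, K_sf, K_ss, f):
    """Mean and covariance of p(f_* | f) for jointly Gaussian (f, f_*)."""
    # mu = K_{*,f} K_{f,f}^{-1} f
    mu = K_sf @ np.linalg.solve(K_ff, f)
    # Sigma = K_{*,*} - K_{*,f} K_{f,f}^{-1} K_{f,*}
    Sigma = K_ss - K_sf @ np.linalg.solve(K_ff, K_sf.T)
    return mu, Sigma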

              The Importance of the Covariance Function

\[ \boldsymbol{\mu}_f = \mathbf{A}^\top \mathbf{y} \]

              The Importance of the Covariance Function

              Improving the Numerics

In practice we shouldn't use the matrix inverse directly to solve the GP system. A more stable approach is to compute the Cholesky decomposition of the kernel matrix. The log determinant of the covariance can also be derived from the Cholesky decomposition.
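A sketch of that approach, assuming a positive definite kernel matrix K (with any noise variance already added to the diagonal) and a target vector y:

import numpy as np
from scipy.linalg import cho_factor, cho_solve

def gp_solve(K, y):
    """Solve K alpha = y and compute log det K via the Cholesky factor."""
    L, lower = cho_factor(K, lower=True)           # K = L L^T
    alpha = cho_solve((L, lower), y)               # K^{-1} y without forming the inverse
    log_det_K = 2.0 * np.sum(np.log(np.diag(L)))   # log det K from the Cholesky diagonal
    return alpha, log_det_K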

              Capacity Control


              Learning Covariance Parameters

\[ \mathcal{N}\left(\mathbf{y}|\mathbf{0},\mathbf{K}\right)=\frac{1}{(2\pi)^\frac{n}{2}\det{\mathbf{K}}^{\frac{1}{2}}}\exp\left(-\frac{\mathbf{y}^{\top}\mathbf{K}^{-1}\mathbf{y}}{2}\right) \]

\[ \begin{aligned} \mathcal{N}\left(\mathbf{y}|\mathbf{0},\mathbf{K}\right)=\frac{1}{(2\pi)^\frac{n}{2}\color{yellow}{\det{\mathbf{K}}^{\frac{1}{2}}}}\color{cyan}{\exp\left(-\frac{\mathbf{y}^{\top}\mathbf{K}^{-1}\mathbf{y}}{2}\right)} \end{aligned} \]


\[ \begin{aligned} \log \mathcal{N}\left(\mathbf{y}|\mathbf{0},\mathbf{K}\right)=&\color{yellow}{-\frac{1}{2}\log\det{\mathbf{K}}}\color{cyan}{-\frac{\mathbf{y}^{\top}\mathbf{K}^{-1}\mathbf{y}}{2}} \\ &-\frac{n}{2}\log 2\pi \end{aligned} \]

\[ E(\boldsymbol{\theta}) = \color{yellow}{\frac{1}{2}\log\det{\mathbf{K}}} + \color{cyan}{\frac{\mathbf{y}^{\top}\mathbf{K}^{-1}\mathbf{y}}{2}} \]

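A hedged sketch of this objective as code, assuming an exponentiated quadratic covariance with a noise term; the parameterisation of theta is my own choice:

import numpy as np
from scipy.linalg import cho_factor, cho_solve

def objective(theta, X, y):
    """E(theta) = 0.5 log det K + 0.5 y^T K^{-1} y (constant n/2 log 2 pi dropped)."""
    alpha, lengthscale, noise = theta              # assumed parameterisation
    sq_dist = np.sum((X[:, None, :] - X[None, :, :]) ** 2, axis=-1)
    K = alpha * np.exp(-0.5 * sq_dist / lengthscale ** 2) + noise * np.eye(len(X))
    L, lower = cho_factor(K, lower=True)
    data_fit = 0.5 * y @ cho_solve((L, lower), y)  # cyan term
    complexity = np.sum(np.log(np.diag(L)))        # yellow term, 0.5 * log det K
    return complexity + data_fit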

              Capacity Control through the Determinant

The parameters are inside the covariance function (matrix). \[k_{i, j} = k(\mathbf{x}_i, \mathbf{x}_j; \boldsymbol{\theta})\]

              Eigendecomposition of Covariance

\[\mathbf{K}= \mathbf{R}\boldsymbol{\Lambda}^2 \mathbf{R}^\top\]

\(\boldsymbol{\Lambda}\) represents distance on axes. \(\mathbf{R}\) gives rotation.

              Eigendecomposition of Covariance


• \(\boldsymbol{\Lambda}\) is diagonal, \(\mathbf{R}^\top\mathbf{R}= \mathbf{I}\).
• Useful representation since \(\det{\mathbf{K}} = \det{\boldsymbol{\Lambda}^2} = \det{\boldsymbol{\Lambda}}^2\) (checked numerically in the sketch below).
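A quick numerical check of the determinant identity (the covariance values below are arbitrary):

import numpy as np

# Any symmetric positive definite covariance will do for the check
K = np.array([[2.0, 0.8],
              [0.8, 1.0]])

eigvals, R = np.linalg.eigh(K)                 # K = R Lambda^2 R^T, Lambda^2 = diag(eigvals)
log_det_from_eigs = np.sum(np.log(eigvals))    # log det K = sum of log eigenvalues
print(np.allclose(log_det_from_eigs, np.linalg.slogdet(K)[1]))  # True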

              Capacity control: \(\color{yellow}{\log \det{\mathbf{K}}}\)



Data Fit: \(\color{cyan}{\frac{\mathbf{y}^\top\mathbf{K}^{-1}\mathbf{y}}{2}}\)

Quadratic Data Fit

\[E(\boldsymbol{\theta}) = \color{yellow}{\frac{1}{2}\log\det{\mathbf{K}}}+\color{cyan}{\frac{\mathbf{y}^{\top}\mathbf{K}^{-1}\mathbf{y}}{2}}\]

              Data Fit Term


              Exponentiated Quadratic Covariance

\[k(\mathbf{x}, \mathbf{x}^\prime) = \alpha \exp\left(-\frac{\left\Vert \mathbf{x}-\mathbf{x}^\prime \right\Vert_2^2}{2\ell^2}\right)\]
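A plain NumPy version of this covariance, building the kernel matrix between two sets of inputs (the function name and argument layout are my own):

import numpy as np

def exponentiated_quadratic(X, X2, alpha=1.0, lengthscale=1.0):
    """k(x, x') = alpha * exp(-||x - x'||_2^2 / (2 l^2)) for the rows of X and X2."""
    sq_dist = np.sum((X[:, None, :] - X2[None, :, :]) ** 2, axis=-1)
    return alpha * np.exp(-0.5 * sq_dist / lengthscale ** 2)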

              GPSS: Gaussian Process Summer School


              GPy: A Gaussian Process Framework in Python

https://github.com/SheffieldML/GPy

              GPy: A Gaussian Process Framework in Python

              • BSD Licensed software base.
              • Wide availability of libraries, ‘modern’ scripting language.
• Allows us to set projects to undergraduates in Comp Sci that use GPs.
• Available through GitHub https://github.com/SheffieldML/GPy
              • Reproducible Research with Jupyter Notebook.

              Features

• Probabilistic-style programming (specify the model, not the algorithm).
              • Non-Gaussian likelihoods.
              • Multivariate outputs.
              • Dimensionality reduction.

                GPy Tutorial

James Hensman

                GPy Tutorial

Nicolas Durrande

              Covariance Functions

\[ k(\mathbf{x}, \mathbf{x}^\prime) = \alpha \exp\left(-\frac{\left\Vert \mathbf{x}- \mathbf{x}^\prime \right\Vert_2^2}{2\ell^2}\right), \]

import GPy

# Exponentiated quadratic (RBF) covariance with variance alpha and lengthscale 2
input_dim = 1
alpha = 1.0
lengthscale = 2.0
kern = GPy.kern.RBF(input_dim=input_dim,
                    variance=alpha,
                    lengthscale=lengthscale)

              Kernel Output


              Covariance Functions in GPy

            • Includes a range of covariance functions

              Combining Covariance Functions in GPy

# Adding covariance functions gives a new covariance function
kern1 = GPy.kern.RBF(1, variance=1., lengthscale=2.)
kern2 = GPy.kern.Matern52(1, variance=2., lengthscale=4.)
kern = kern1 + kern2


              Multiplication

from IPython.display import display  # available by default inside a notebook

# Multiplying covariance functions also gives a valid covariance function
kern1 = GPy.kern.RBF(1, variance=1., lengthscale=2.)
kern2 = GPy.kern.Matern52(1, variance=2., lengthscale=4.)
kern = kern1 * kern2
display(kern)


              A Gaussian Process Regression Model

              Noisy Sine
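The model below is fit to noisy sine data held in X and Y; a sketch of how such data might be generated (sample size and noise level are arbitrary choices):

import numpy as np

# GPy expects 2-d arrays of shape (n, 1) for inputs and targets
X = np.random.uniform(-3.0, 3.0, (20, 1))
Y = np.sin(X) + np.random.randn(20, 1) * 0.05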

# X, Y: inputs and noisy sine observations, each of shape (n, 1)
kern = GPy.kern.RBF(input_dim=1, variance=1., lengthscale=1.)
model = GPy.models.GPRegression(X, Y, kern)

              GP Fit to Noisy Sine


              Covariance Function Parameter Estimation

model.optimize(messages=True)

              Review

              Other Software

              Further Reading

• Chapter 2 of Neal (1994)
• Rest of Neal (1994)
• All of MacKay (1992)

              Thanks!

              References

Andrade-Pacheco, R., Mubangizi, M., Quinn, J., Lawrence, N.D., 2014. Consistent mapping of government malaria records across a changing territory delimitation. Malaria Journal 13. https://doi.org/10.1186/1475-2875-13-S1-P5

Cho, Y., Saul, L.K., 2009. Kernel methods for deep learning, in: Bengio, Y., Schuurmans, D., Lafferty, J.D., Williams, C.K.I., Culotta, A. (Eds.), Advances in Neural Information Processing Systems 22. Curran Associates, Inc., pp. 342–350.

Ioffe, S., Szegedy, C., 2015. Batch normalization: Accelerating deep network training by reducing internal covariate shift, in: Bach, F., Blei, D. (Eds.), Proceedings of the 32nd International Conference on Machine Learning, Proceedings of Machine Learning Research. PMLR, Lille, France, pp. 448–456.

MacKay, D.J.C., 1992. Bayesian methods for adaptive models (PhD thesis). California Institute of Technology.

Mubangizi, M., Andrade-Pacheco, R., Smith, M.T., Quinn, J., Lawrence, N.D., 2014. Malaria surveillance with multiple data sources using Gaussian process models, in: 1st International Conference on the Use of Mobile ICT in Africa.

Neal, R.M., 1994. Bayesian learning for neural networks (PhD thesis). Dept. of Computer Science, University of Toronto.

Rasmussen, C.E., Williams, C.K.I., 2006. Gaussian processes for machine learning. MIT Press, Cambridge, MA.

Rogers, S., Girolami, M., 2011. A first course in machine learning. CRC Press.

Tipping, M.E., Bishop, C.M., 1999. Probabilistic principal component analysis. Journal of the Royal Statistical Society, B 61, 611–622. https://doi.org/10.1111/1467-9868.00196