From d49893d7fb12654a72d01f66c84cd83c3e0c5040 Mon Sep 17 00:00:00 2001 From: Zhigang Song Date: Wed, 28 Feb 2024 01:12:22 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9C=BA=E5=99=A8=E5=AD=A6=E4=B9=A0=E5=9F=BA?= =?UTF-8?q?=E7=9F=B3=20-=20=E4=BD=9C=E4=B8=9A=202?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Gemfile | 1 + _config.yml | 1 + ...e-Learning-Mathematical-Foundations-hw2.md | 15 + _posts/ai/ntu/decision_stump.html | 8439 +++++++++++++++++ _posts/ai/ntu/decision_stump.ipynb | 460 + 5 files changed, 8916 insertions(+) create mode 100644 _posts/ai/ntu/2024-02-27-Machine-Learning-Mathematical-Foundations-hw2.md create mode 100644 _posts/ai/ntu/decision_stump.html create mode 100644 _posts/ai/ntu/decision_stump.ipynb diff --git a/Gemfile b/Gemfile index b174e87..0fb49a2 100644 --- a/Gemfile +++ b/Gemfile @@ -3,6 +3,7 @@ source "https://gems.ruby-china.com/" #gem "rexml", ">= 3.2.5" gem "jekyll-text-theme" gem "tzinfo-data" +# gem "jekyll-jupyter-notebook" group :jekyll_plugins do gem 'jekyll-graphviz', :git => 'https://github.com/sidgwick/jekyll-graphviz.git' diff --git a/_config.yml b/_config.yml index db39e7e..cf852be 100644 --- a/_config.yml +++ b/_config.yml @@ -222,3 +222,4 @@ plugins: - jekyll-paginate - jekyll-sitemap - jemoji + # - jekyll-jupyter-notebook diff --git a/_posts/ai/ntu/2024-02-27-Machine-Learning-Mathematical-Foundations-hw2.md b/_posts/ai/ntu/2024-02-27-Machine-Learning-Mathematical-Foundations-hw2.md new file mode 100644 index 0000000..f016ece --- /dev/null +++ b/_posts/ai/ntu/2024-02-27-Machine-Learning-Mathematical-Foundations-hw2.md @@ -0,0 +1,15 @@ +--- +title: "机器学习基石 - 作业 2" +date: 2024-02-27 11:14:41 +tags: ai ntu machine-learning mathematical-foundations +--- + +# 机器学习基石 - 作业 2 + + + + + +{% include decision_stump.html %} diff --git a/_posts/ai/ntu/decision_stump.html b/_posts/ai/ntu/decision_stump.html new file mode 100644 index 0000000..ab28c36 --- /dev/null +++ b/_posts/ai/ntu/decision_stump.html @@ -0,0 +1,8439 @@ + + + + + + + decision_stamp + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + +
+ + diff --git a/_posts/ai/ntu/decision_stump.ipynb b/_posts/ai/ntu/decision_stump.ipynb new file mode 100644 index 0000000..3878413 --- /dev/null +++ b/_posts/ai/ntu/decision_stump.ipynb @@ -0,0 +1,460 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import math\n", + "from functools import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Question 16](./images/q16.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "在课堂上, 我们讲授了一维数据的'正负射线'(简单来说就是一维感知器)的学习模型. 该模型包含以下形式的假设:\n", + "\n", + "$$\n", + "h_{s,\\theta}(x) = s \\cdot{} sign(x - \\theta)\n", + "$$\n", + "\n", + "该模型通常被称为'决策树桩'模型, 是最简单的学习模型之一. 如课堂所示, 对于一维数据, 决策树桩模型的 VC 维为 2.\n", + "\n", + "事实上, 决策树桩模型是我们可以​通过枚举所有可能的阈值来有效地轻松最小化 $E_{in}$ 的少数模型之一. 特别地, 对于 $N$ 个例子, 最多有 $ 2N $ 个 dichotomy(参见第 5 课幻灯片的第 22 页), 因此最多有 $2N$ 个不同 $E_{in}$ 值. 然后我们可以轻松地选择使得 $E_{in}$ 最小的 dichotomy, 其中可以通过在最小的 $E_{in}$ 中随机选择来消除平局(原文: We can then easily choose the dichotomy that leads to the lowest $E_{in}$, where ties can be broken by randomly choosing among the lowest $E_{in}$ ones). 所选的 dichotomy 表示某些点($\\theta$ 范围)和 $s$ 的组合(这里是说 $\\theta$ 和 $s$ 的组合确定了一个 dichotomy, 对这个 dichotomy 来说, $\\theta$ 的取值本身是一个范围, 在这个范围内, 给定的样本都可以形成同一个 dichotomy), 通常将该范围的中值选作实现 dichotomy 的 $\\theta$.\n", + "\n", + "在本题中, 我们将实现这样的算法, 并在人工数据集上运行程序.\n", + "\n", + "首先, 通过以下过程生成一维数据:\n", + "\n", + "1. 在 $[-1,1]$ 中通过均匀分布生成 $x$\n", + "2. 通过 $f(x) = \\tilde{s}(x) + noise$ 生成 $y$, 其中 $\\tilde{s}(x) = sign(x)$ 并且噪声以 20% 的概率翻转结果." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def h(x, s, theta):\n", + " t = x - theta\n", + " return s * (-1 if t < 0 else 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def example_data(size):\n", + " \"\"\"\n", + " f(x) = sign(x), flips result with 20% probability\n", + " \"\"\"\n", + "\n", + " def sign(data):\n", + " return [1 if x > 0 else -1 for x in data]\n", + "\n", + " def flip(x):\n", + " if np.random.rand() < 0.2:\n", + " return -x\n", + "\n", + " return x\n", + "\n", + " x = np.random.uniform(-1, 1, size)\n", + " y = [flip(a) for a in sign(x)]\n", + "\n", + " res = list(zip(x, y))\n", + " # np.random.shuffle(res)\n", + "\n", + " return res" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Question 16\n", + "\n", + "对于任何决策树桩 $h_{s,\\theta}$, $\\theta \\in [-1,1]$, 将 $E_{out}(h_{s,\\theta})$ 表示为 $\\theta$ 和 $s$ 的函数.\n", + "\n", + "**解:**\n", + "\n", + "考虑作业 2 的第一题, 提到有噪声版的目标函数 $f$ 用下面的形式给出:\n", + "\n", + "$$\n", + "P(y \\mid x) = \\begin{cases}\n", + " \\lambda & y = f(x)\\\\\n", + " 1 - \\lambda & \\text{otherwise}\n", + " \\end{cases} \n", + "$$\n", + "\n", + "\n", + "假设函数 $h$ 犯错误的概率是 $ \\mu $, 那么错误概率可以写成:\n", + "\n", + "$$\n", + "E_{out} = \\lambda \\mu + (1-\\lambda)(1-\\mu)\n", + "$$\n", + "\n", + "> $E_{in}, E_{out}$ 可以用这种概率形式表示吗?\n", + ">\n", + "> 在未来课程中, 逻辑斯蒂回归使用的交叉熵错误, 看上去是一个概率形式的表达.\n", + "\n", + "题目中, 已经告诉我们 $\\lambda = 1-20\\% = 0.8$, 下面试着计算 $\\mu$, 情况比较复杂, 对 $s$ 分类讨论(决策树桩的 $s \\in \\{-1, +1\\} $):\n", + "\n", + "1. $s = +1$, $\\theta > 0$ 时, $x \\in [-1, 1]$ 的点里面, $x \\in [0, \\theta]$ 部分是分类错误(和无噪声的 $f$ 不一样)的, 因此有 $\\mu = \\frac{\\theta}{2}$\n", + "2. $s = +1$, $\\theta < 0$ 时, 参考 1 的分析, 有 $\\mu = \\frac{|\\theta|}{2}$\n", + "3. $s = -1$, $\\theta > 0$ 时, 参考 1 的分析, 有 $\\mu = 1 - \\frac{\\theta}{2}$\n", + "4. $s = -1$, $\\theta < 0$ 时, 参考 1 的分析, 有 $\\mu = 1 - \\frac{|\\theta|}{2}$\n", + "\n", + "把上面四种情况写在一起(注意这不是对概率在求和, 只是将上面四种情况用一个式子表达了出来而已):\n", + "\n", + "$$\n", + "\\begin{align*}\n", + "\\mu &= \\frac{1+s}{2} \\times \\frac{|\\theta|}{2} + \\frac{1-s}{2} \\times (1 - \\frac{|\\theta|}{2}) \\\\\n", + "&= \\frac{1}{2}(s |\\theta| - s + 1)\n", + "\\end{align*}\n", + "$$\n", + "\n", + "$E_{out}$ 里面代入 $\\lambda$ 和 $\\mu$, 可以得到 $E_{out} = 0.5 + 0.3s(|\\theta| - 1)$" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Question 17](./images/q17.jpg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Question 17\n", + "\n", + "按照上述过程生成大小为 20 的数据集, 并在数据集上运行一维决策树桩算法. 记录 $E_{in}$ 并使用上述公式计算 $E_{out}$, 重复该实验(包括数据生成, 运行决策树桩算法以及计算 $E_{in}$ 和 $E_{out}$) 5000 次. $E_{in}$ 的平均值是多少?\n", + "\n", + "**解:**\n", + "\n", + "按照题目的说法, 我们要暴力遍历所有的 dichotomy, 因此先准备好能决定 dichotomy 的 $(\\theta, s)$ 集合. 如果 $x \\in \\{ x_1, x_2, ... x_{20} \\mid x_1 < x_2 < ... < x_{20} \\}$, 那么 $\\theta$ 可以取的值有(使用每个 dichotomy $\\theta$ 范围的中值) $ \\frac{-1 + x_1}{2}, \\frac{x_1 + x_2}{2}, ..., \\frac{x_{19} + x_{20}}{2}, \\frac{x_{20} + 1}{2} $, 共计 21 个点.\n", + "\n", + "至于 $s$, 它仅可以取 $\\{-1, +1\\}$.\n", + "\n", + "> 出于计算后面的 19/20 题通用考虑, 下面的代码里面对第一个和最后一个 $\\theta$ 的处理, 不是用的 $-1$, $+1$ 的边界, 而是用的 $ x_0 - 1$ 和 $ x_20 + 1 $." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def get_theta_list(data):\n", + " result = []\n", + "\n", + " xs = sorted([x[0] for x in data])\n", + "\n", + " prev = xs[0] - 1\n", + " for x in xs:\n", + " result.append((prev + x) / 2)\n", + " prev = x\n", + "\n", + " result.append((prev + prev + 1) / 2)\n", + " \n", + " return result\n", + "\n", + "def get_dichotomy_theta_and_s(data):\n", + " result = []\n", + "\n", + " theta_list = get_theta_list(data)\n", + " \n", + " for theta in theta_list:\n", + " result.append((theta, -1))\n", + " result.append((theta, +1))\n", + "\n", + " return result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "有了 $(\\theta, s)$ 的集合之后, 所要做的事情就是遍历这个集合, 并计算在样本点上的 $E_{in}$, 然后用上面的公式可以算 $E_{out}$ 出来." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "def decision_stump(data):\n", + " dichotomy_list = get_dichotomy_theta_and_s(data)\n", + "\n", + " best = (0, 0)\n", + " best_in = np.inf\n", + " best_out = np.inf\n", + " \n", + " for theta, s in dichotomy_list:\n", + " _h = partial(h, s=s, theta=theta)\n", + " \n", + " # 代入数据运算\n", + " err = 0\n", + " for x, y in data:\n", + " err += _h(x) == y\n", + "\n", + " if err < best_in:\n", + " best_in = err\n", + " best = (theta, s)\n", + "\n", + " err_out = 0.5 + 0.3*s*(np.abs(theta) - 1)\n", + " if err_out < best_out:\n", + " best_out = err_out\n", + " # print(s, theta, err_out)\n", + "\n", + " return best_in / len(data), best_out, best" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "err_in 0.16993999999999992\n", + "err_out 0.21064865779191114\n" + ] + } + ], + "source": [ + "loops = 5000\n", + "\n", + "err_in = 0\n", + "err_out = 0\n", + "\n", + "for i in range(loops):\n", + " data = example_data(size=20)\n", + " e_in, e_out, _ = decision_stump(data)\n", + " \n", + " err_in += e_in\n", + " err_out += e_out\n", + "\n", + "print(\"err_in\", err_in / loops)\n", + "print(\"err_out\", err_out / loops)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Question 18](./images/q18.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Question 18\n", + "\n", + "**解:**\n", + "\n", + "参考 Question 17 里面关于 $E_{out}$ 的计算.\n", + "\n", + "---\n", + "\n", + "我最开始的更新错误部分的写法实际上是:\n", + "\n", + "\n", + "```python\n", + "if err < best_in:\n", + " best_in = err\n", + " best_out = 0.5 + 0.3*s*(np.abs(theta) - 1)\n", + "```\n", + "\n", + "\n", + "也就是只有在遇到更好的 err_in 的时候, 才用它的 $s, \\theta$ 来更新一下 `best_out`, 但是这样算出来 $E_{out}$ 均值达到了 $0.7$\n", + "\n", + "\n", + "> **!!!**\n", + ">\n", + "> 这里为什么要在无论何种场景下都要更新 $E_{out}$ 呢?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Question 19](./images/q19.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Question 19\n", + "\n", + "决策树桩也适用于多维数据. 特别地, 现在每个决策树桩处理一个特定的维度 $i$, 如下:\n", + "\n", + "$$\n", + "h_{s,i,\\theta}(x) = s \\cdot{} sign(x_i - \\theta)\n", + "$$\n", + "\n", + "对多维数据实现以下决策树桩算法:\n", + "\n", + "1. 对于每个维度 $i = 1, 2, ... , d$, 使用你刚刚实现的一维决策桩算法找到最佳决策桩 $h_{s,i, \\theta}$\n", + "2. 返回以 $E_{in}$ 而言 '最佳中的最佳' 决策桩. 如果出现平局, 请从 $E_{in}$ 最小的桩中随机选择一个(原文: return the \"best of best\"' decision stump in terms of $E_{in}$. If there is a tie , please randomly choose among the lowest-$E_{in}$ ones, 原文读的有点稀里糊涂, 这意思是不是说计算好 N 个维度之后, 返回 N 个维度最小的 $E_{in}$ 里面的那个最小的? 从网上别人的代码看, 好像是要这么干)\n", + "\n", + "训练数据 $\\mathcal{D}_{train}$ 可在以下位置获得: [https://www.csie.ntu.edu.tw/~htlin/mooc/datasets/mlfound_math/hw2_train.dat](https://www.csie.ntu.edu.tw/~htlin/mooc/datasets/mlfound_math/hw2_train.dat)\n", + "\n", + "测试数据 $\\mathcal{D}_{test}$ 可在以下位置获得: [https://www.csie.ntu.edu.tw/~htlin/mooc/datasets/mlfound_math/hw2_test.dat](https://www.csie.ntu.edu.tw/~htlin/mooc/datasets/mlfound_math/hw2_test.dat)\n", + "\n", + "对 $\\mathcal{D}_{train}$ 运行该算法, 报告您的程序返回的最佳决策桩的 $E_{in}$.\n", + "\n", + "**解:**\n", + "\n", + "读取并处理原始数据, 然后送到 Question 17 实现的算法里面计算即可." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.25\n" + ] + } + ], + "source": [ + "def read_data(file):\n", + " result = []\n", + "\n", + " fh = open(file)\n", + " for line in fh.readlines():\n", + " parts = line.strip().split(\" \")\n", + "\n", + " x = [float(x) for x in parts[:-2]]\n", + " y = int(parts[-1])\n", + "\n", + " result.append((x, y))\n", + "\n", + " return result\n", + "\n", + "def train(_data):\n", + " dim = len(_data[0][0])\n", + "\n", + " params = []\n", + " best_in = np.inf\n", + " \n", + " for i in range(dim):\n", + " data = [(x[i], y) for x, y in _data]\n", + " err_in, err_out, best = decision_stump(data)\n", + " \n", + " # 记一下这维度上表现最好的 theta 和 s, 以后预测可以用\n", + " params.append(best)\n", + "\n", + " if err_in < best_in:\n", + " best_in = err_in\n", + "\n", + " return best_in, params\n", + "\n", + "train_dat = read_data(\"hw2_train.dat\")\n", + "err_in, params = train(train_dat)\n", + "\n", + "print(err_in)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "![Question 20](./images/q20.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Question 20\n", + "\n", + "用第 19 题求得的 params, 在 $\\mathcal{D}_{test}$ 上面执行预测, 并收集每个维度上的错误情况, 最后报告一个最小的错误." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.355" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def test(params):\n", + " _data = read_data(\"hw2_test.dat\")\n", + "\n", + " err_out = []\n", + " for i, (theta, s) in enumerate(params):\n", + " _h = partial(h, s=s, theta=theta)\n", + " \n", + " # 预测测试集数据\n", + " data = [(x[i], y) for x, y in _data]\n", + "\n", + " err = 0\n", + " for x, y in data:\n", + " err += _h(x) == y\n", + "\n", + " err_out.append(err)\n", + "\n", + " return np.array(err_out) / len(_data)\n", + "\n", + "res = test(params)\n", + "np.min(res)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}