finalized visualizing overfitting script + updated slides-evaluation-overfitting-underfitting.tex #721

Open · wants to merge 4 commits into master
98 changes: 98 additions & 0 deletions slides/evaluation/rsrc/make_overfitting_dataset.R
@@ -0,0 +1,98 @@
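# Generates the data for the overfitting simulation: over a log2-spaced grid of
# training-set sizes n and feature dimensions p, fits several learners on
# regression tasks drawn from mlbench.peak, records training and test MSE on a
# holdout split, averages the results over n_iters repetitions and saves them
# to overfitting_peak.rds.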
library(mlr3verse)
library(mlbench)
library(stats)
library(mlr3measures)
library(dplyr)
library(stringr)

# temporarily suppress warnings; the previous setting is restored at the end of the script
defaultW <- getOption("warn")
options(warn = -1)

set.seed(0)

# log2-spaced grids for the training-set size n and the feature dimension p
min_n <- 100
min_d <- 10
max_n <- 5000
max_d <- 500
len_n <- 10
len_d <- 10
sample_sizes <- ceiling(2^seq(log2(min_n), log2(max_n), length = len_n))
dims <- ceiling(2^seq(log2(min_d), log2(max_d), length = len_d))

# size of the holdout test set and number of repetitions per (n, p) cell
n_test <- 10000
n_iters <- 10

# creates a regression task with d features and n + n_test observations
# drawn from the mlbench "peak" benchmark problem
make_task_data <- function(d = 500, n = 500) {
  peak <- mlbench.peak(n = n + n_test, d = d)
  data <- as.data.frame(peak$x)
  data$y <- peak$y
  as_task_regr(data, target = "y", id = paste0(d, "_", n))
}

# results table: one row per (learner, n, p) cell, averaged over n_iters runs
df <- data.frame(learner_id = character(),
                 sample = integer(),
                 dim = integer(),
                 train_error = double(),
                 test_error = double(),
                 of = double())
measure <- msr("regr.mse")

# hypothesis spaces of varying complexity
learners <- list(
  lrn("regr.svm", id = "SVM (gamma=1/p)", cost = 1, kernel = "radial", type = "eps-regression"),
  lrn("regr.ranger", id = "Random Forest", num.trees = 500, num.threads = 4),
  lrn("regr.xgboost", id = "Gradient Boosting", nthread = 4, nrounds = 100),
  lrn("regr.rpart", id = "Regression Tree", maxdepth = 30, minsplit = 20),
  lrn("regr.svm", id = "SVM (gamma=1)", gamma = 1, cost = 1, kernel = "radial", type = "eps-regression"),
  lrn("regr.kknn", id = "K-Nearest Neighbors", k = 7, distance = 2)
)

for (i in seq_along(dims)) {
  for (j in seq_along(sample_sizes)) {
    d <- dims[i]
    n <- sample_sizes[j]
    print(paste0("starting task w/ dim=", d, " & sample_size=", n))
    of_list <- list()
    train_error_list <- list()
    test_error_list <- list()
    for (iter in 1:n_iters) {
      task <- make_task_data(d = d, n = n)
      # holdout ratio chosen so that the train set has n rows and the test set n_test rows
      holdout <- rsmp("holdout", ratio = as.double(n) / (n_test + n))
      holdout$instantiate(task)
      for (learner in learners) {
        learner$train(task, row_ids = holdout$train_set(1))
        train_error <- learner$predict(task, row_ids = holdout$train_set(1))$score(measures = measure)[[1]]
        test_error <- learner$predict(task, row_ids = holdout$test_set(1))$score(measures = measure)[[1]]
        # overfitting score: gap between test and training error
        of <- test_error - train_error
        # accumulate results per learner across iterations
        if (learner$id %in% names(of_list)) {
          of_list[[learner$id]] <- of_list[[learner$id]] + of
          train_error_list[[learner$id]] <- train_error_list[[learner$id]] + train_error
          test_error_list[[learner$id]] <- test_error_list[[learner$id]] + test_error
        } else {
          of_list[[learner$id]] <- of
          train_error_list[[learner$id]] <- train_error
          test_error_list[[learner$id]] <- test_error
        }
        learner$reset()
      }
    }
    # average over iterations and append one row per learner
    for (learner in learners) {
      df <- rbind(df, list(learner_id = learner$id,
                           sample = n,
                           dim = d,
                           of = of_list[[learner$id]] / n_iters,
                           train_error = train_error_list[[learner$id]] / n_iters,
                           test_error = test_error_list[[learner$id]] / n_iters))
    }
    print(paste0("finished task w/ dim=", d, " & sample_size=", n))
  }
}

saveRDS(df, file = "overfitting_peak.rds")

options(warn = defaultW)



33 changes: 33 additions & 0 deletions slides/evaluation/rsrc/make_overfitting_simulation_plot.R
@@ -0,0 +1,33 @@
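# Visualizes the overfitting simulation: for the first five learners in
# overfitting_peak.rds, draws heatmaps of the averaged overfitting score (OF),
# training error and test error (MSE) over the (p, n) grid, arranges them in a
# 3 x 5 grid and saves the figure as ../figure/overfitting_simulation_plot.png.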
library(ggplot2)
library(viridis)
library(gridExtra)
library(scales)

theme_set(theme_minimal())

df <- readRDS("overfitting_peak.rds")

# heatmap of one score (fill_col) over the (p, n) grid for a single learner
draw_heatmap <- function(learner_id, legend_name = expression(OF(hat(f), L)), fill_col = "of") {
  ggplot(df[df$learner_id == learner_id, ], aes_string("dim", "sample", fill = fill_col)) +
    scale_x_continuous(trans = "log2",
                       breaks = trans_breaks("log2", function(x) 2^x),
                       labels = trans_format("log2", math_format(2^.x))) +
    scale_y_continuous(trans = "log2",
                       breaks = trans_breaks("log2", function(x) 2^x),
                       labels = trans_format("log2", math_format(2^.x))) +
    geom_tile() +
    xlab("p") +
    ylab("n") +
    scale_fill_viridis(end = 0.9, name = legend_name) +
    ggtitle(learner_id)
}

# one column per learner (first five), one row each for OF, train error and test error
plots_of <- lapply(unique(df[["learner_id"]])[1:5], function(learner_id) draw_heatmap(learner_id))
plots_train_error <- lapply(unique(df[["learner_id"]])[1:5],
                            function(learner_id) draw_heatmap(learner_id, legend_name = expression(R[emp](hat(f), L)), fill_col = "train_error"))
plots_test_error <- lapply(unique(df[["learner_id"]])[1:5],
                           function(learner_id) draw_heatmap(learner_id, legend_name = expression(GE(hat(f), L)), fill_col = "test_error"))
p <- grid.arrange(grobs = c(plots_of, plots_train_error, plots_test_error), nrow = 3, ncol = 5)

ggsave(filename = "../figure/overfitting_simulation_plot.png", plot = p, width = 14, height = 6.5)

Binary file added slides/evaluation/rsrc/overfitting_peak.rds
Binary file not shown.
17 changes: 17 additions & 0 deletions slides/evaluation/slides-evaluation-overfitting-underfitting.tex
@@ -130,6 +130,23 @@
\item Tightly connected to the bias-var-noise decomposition of GE
of a learner ($\rightarrow$ which we study elsewhere).
\end{itemize}

\end{vbframe}

\begin{vbframe}{Overfitting simulation}
We simulate overfitting using regression data sets generated from \texttt{mlbench.peak} and several hypothesis spaces of varying complexity. The resulting plots are arranged in a grid where each column corresponds to a hypothesis space and each row to a score: $OF$, $\mathcal{R}_{emp}$ and $GE$, respectively. All scores are measured by the MSE.
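Here, the overfitting score is the gap between the test and the training error; in the accompanying simulation script it is computed as
\[
  OF(\hat{f}, L) = GE(\hat{f}, L) - \mathcal{R}_{emp}(\hat{f}, L),
\]
where $GE(\hat{f}, L)$ is estimated on a large holdout test set ($10\,000$ observations) and $\mathcal{R}_{emp}(\hat{f}, L)$ on the training set.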

\includegraphics[width=1\textwidth]{figure/overfitting_simulation_plot.png}

The columns are ordered from left to right by ascending worst-case (highest) $GE$.

Observations:
\begin{itemize}
\item As the dimensionality of the feature space $p$ increases, $GE$ and $OF$ also increase $\implies$ the higher $p$ is, the easier it is to overfit.
\item As the amount of training data $n$ increases, $GE$ and $OF$ decrease $\implies$ the smaller $n$ is, the easier it is to overfit.
\item As $p$ increases, the hypothesis space needs to be sufficiently complex to minimize $GE$ and $OF$.
\item The SVM with fixed $\gamma = 1$ is more prone to overfitting as $p$ increases than the SVM with $\gamma = 1/p$.
\end{itemize}
\end{vbframe}

