Ames Exercises #377

Open · wants to merge 9 commits into base: devel
108 changes: 108 additions & 0 deletions attic/ames_usecase/ames1.Rmd
@@ -0,0 +1,108 @@
---
output: pdf_document
title: "mlr3 Exercises Day 1 - Ames Housing Dataset"
---

This exercise uses `mlr3` to perform a benchmark analysis on the _Ames Housing Dataset_.

The main objectives of this exercise are as follows:

- To build machine learning models that predict the house price from house features
- To analyze and compare the models' performance in order to choose the best model

## Accessing the dataset

The dataset is available on Kaggle: https://bit.ly/2l0uWoz.
Kaggle is a platform that hosts data science competitions and datasets, which you can use to get familiar with typical machine learning methods.

## Importing the data

```{r}
train_set = read.csv("data/ames_housing_train.csv")
```
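
As a quick optional sanity check, you can inspect the dimensions of the imported data and the distribution of the target column (`SalePrice`, the target used in the task below):

```{r}
# dimensions of the training data and a summary of the target
dim(train_set)
summary(train_set$SalePrice)
```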

1. Load the `mlr3` and `mlr3learners` packages.

```{r}
library(mlr3)
library(mlr3learners)
```

2. Create a regression task object.

```{r}
task = TaskRegr$new(id = "ames_housing", backend = train_set, target = "SalePrice")
task
```
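
The task object offers some convenient fields for a first inspection, for example:

```{r}
# number of observations, number of columns, and the first few feature names
task$nrow
task$ncol
head(task$feature_names)
```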

3. Create a list of learning algorithms which you want to use in the benchmark.

```{r}
# A featureless learner serves as baseline. Additionally, we train a
# regression tree, knn learners for different values of k,
# and a random forest (ranger).
learners = list(
  featureless = lrn("regr.featureless"),
  knn3 = lrn("regr.kknn", id = "regr.knn3", k = 3),
  knn7 = lrn("regr.kknn", id = "regr.knn7", k = 7), # the default value of k
  knn15 = lrn("regr.kknn", id = "regr.knn15", k = 15),
  knn30 = lrn("regr.kknn", id = "regr.knn30", k = 30),
  tree = lrn("regr.rpart"),
  random_forest = lrn("regr.ranger")
)
```
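
If you are unsure which learners are available, you can query the learner dictionary (a quick lookup; this assumes `mlr3learners` is loaded so its learners are registered):

```{r}
# list the keys of all registered regression learners
mlr_learners$keys("^regr")
```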

4. Create a resampling object for your benchmark evaluation.

```{r}
# compare via 10-fold cross validation
resamplings = rsmp("cv", folds = 10)
```
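
To see how the cross-validation splits the data, you can instantiate a separate copy of the resampling on the task (purely illustrative; `benchmark_grid()` takes care of instantiation itself):

```{r}
# instantiate a copy to look at the folds
cv = rsmp("cv", folds = 10)
cv$instantiate(task)
head(cv$train_set(1)) # row ids of the first training fold
```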

5. Create a grid corresponding to the planned benchmark, including the task, all learners, and the resampling strategy.

```{r}
# create the benchmark design: all combinations of task, learners and resampling
design = benchmark_grid(task, learners, resamplings)
print(design)
```

6. Run the benchmark.

```{r}
# execute the benchmark
bmr = benchmark(design)
```

7. Use appropriate regression measures to evaluate the performance of each learner in the benchmark.

```{r}
# get some measures: Mean Squared Error (which we use for the competition)
# and Mean Absolute Error
measures = mlr_measures$mget(c("regr.mse", "regr.mae"))
bmr$aggregate(measures)
```
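
Besides the aggregated values, you can also look at the per-fold scores, e.g. to judge the variability across folds:

```{r}
# per-iteration (per-fold) scores for each learner
scores = bmr$score(measures)
head(scores[, c("learner_id", "iteration", "regr.mse", "regr.mae")])
```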

8. Use an appropriate plot to illustrate the benchmark results.
Have a look at the `mlr3viz` package, for example.

```{r}
# create a nice boxplot
library(mlr3viz)
autoplot(bmr)
```

9. Finally, we choose the random forest (ranger) as the final algorithm, train it on the complete training data, and predict on the test data.

```{r}
test_set = read.csv("data/ames_housing_test.csv")
final_learner = learners$random_forest
final_learner$train(task)
pred = final_learner$predict_newdata(newdata = test_set, task = task)

# convert the predictions to a data.table and export them for Kaggle
library(data.table)
pred = as.data.table(pred)
pred$truth = NULL
write.csv(pred, "data/ames_housing_submission_day1.csv", row.names = FALSE)
```

Binary file added attic/ames_usecase/ames1.pdf
Binary file not shown.
158 changes: 158 additions & 0 deletions attic/ames_usecase/ames2.Rmd
@@ -0,0 +1,158 @@
---
output: pdf_document
title: "mlr3 Use Case Day 2 - Ames Housing Dataset"
---

This exercise uses `mlr3tuning` to improve the results of the day 1 benchmark analysis on the _Ames Housing Dataset_.

The main objective of this exercise is as follows:

- To apply an appropriate tuning technique to a learning algorithm in order to improve its predictive performance.

Keep in mind that you are asked to use the Mean Absolute Error (MAE) as the performance measure to evaluate the tuned algorithms.

## Accessing the dataset

The dataset is available on Kaggle: http://www.kaggle.com/c/ames-day2.
Kaggle is a platform that hosts data science competitions and datasets, which you can use to get familiar with typical machine learning methods.

## Importing the data

```{r}
housing = read.csv("data/ames_housing_train_numeric.csv")
```

1. Load the `mlr3` and `mlr3learners` packages.

```{r}
library(mlr3)
library(mlr3learners)
```

2. Create a regression task object.

```{r}
task = TaskRegr$new(id = "ames_housing", backend = housing, target = "SalePrice")
task
```

3. Create a list of learning algorithms and their parameter sets which we want to tune.

```{r}
# create the two learners we want to tune: a knn learner and an
# xgboost learner with the number of boosting rounds fixed to 100
knn = lrn("regr.kknn")
xgboost = lrn("regr.xgboost", nrounds = 100L)
```

```{r}
library(paradox)
# tune k on a log scale and transform back to an integer k
knn_tune_ps = ParamSet$new(list(
  ParamDbl$new("log_k", lower = log(1), upper = log(100))
))
knn_tune_ps$trafo = function(x, param_set) {
  list(k = round(exp(x$log_k)))
}
knn_tune_ps

# tune eta and gamma on a log2 scale
xgboost_tune_ps = ParamSet$new(list(
  ParamDbl$new("eta", lower = -7, upper = 0),
  ParamDbl$new("gamma", lower = -5, upper = 6)
))
xgboost_tune_ps$trafo = function(x, param_set) {
  list(eta = 2^(x$eta), gamma = 2^(x$gamma))
}
xgboost_tune_ps
```
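
To check that a transformation does what you expect, you can apply it manually to one point of the search space (an optional sanity check):

```{r}
# the trafo maps the log-scale value back to an integer k
knn_tune_ps$trafo(list(log_k = log(25)), knn_tune_ps) # should give k = 25
```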

4. Define all necessary tuning settings.

```{r}
library(mlr3tuning)
# inner 3-fold cross-validation for the tuning
inner_resampling = rsmp("cv", folds = 3)
terminator = term("model_time", secs = 60L) # terminate tuning after 60 seconds of model time
measure = msr("regr.mae") # we want to optimize the MAE
tuner = tnr("grid_search", resolution = 10) # we use grid search here
```
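
Since we use grid search with resolution 10, you can preview the grid of (untransformed) points the tuner will evaluate, e.g. for the knn search space:

```{r}
# the 10 grid points on the log_k scale, before the trafo is applied
generate_design_grid(knn_tune_ps, resolution = 10)
```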

5. Use the `AutoTuner` so that each tuned learner can be used like a regular learner within our previous benchmark.

```{r}
knn_tuned = AutoTuner$new(
  learner = knn,
  resampling = inner_resampling,
  measures = measure,
  tune_ps = knn_tune_ps,
  terminator = terminator,
  tuner = tuner
)
knn_tuned

xgboost_tuned = AutoTuner$new(
  learner = xgboost,
  resampling = inner_resampling,
  measures = measure,
  tune_ps = xgboost_tune_ps,
  terminator = terminator,
  tuner = tuner
)
xgboost_tuned
```

6. Create a grid corresponding to the planned benchmark, including the task, all learners, and the resampling strategy.

```{r}
# create the benchmark design: tuned and untuned learners on the task
learners = list(
  featureless = lrn("regr.featureless"),
  lm = lrn("regr.lm"),
  knn = lrn("regr.kknn"),
  knn_tuned = knn_tuned,
  tree = lrn("regr.rpart"),
  random_forest = lrn("regr.ranger"),
  xgboost = xgboost,
  xgboost_tuned = xgboost_tuned
)
resampling = rsmp("cv", folds = 5)
design = benchmark_grid(task, learners, resampling)
print(design)
```

7. Run the benchmark.

```{r, echo = -1}
lgr::get_logger("mlr3")$set_threshold("error")
# run the benchmark
bmr = benchmark(design)
```

8. Use appropriate regression measures to evaluate the performance of each learner in the benchmark.

```{r}
# regression measures: Mean Absolute Error (MAE) and Mean Squared Error (MSE)
measures = mlr_measures$mget(c("regr.mae", "regr.mse"))
bmr$aggregate(measures)
```

9. Use an appropriate plot to illustrate the benchmark results.
Here, we use the `mlr3viz` package.

```{r}
library(mlr3viz)
autoplot(bmr)
```

10. We take the best performing algorithm, the tuned xgboost (`xgboost_tuned`), as the final learner, train it on the complete training data, and predict on the test data.

```{r}
test_set = read.csv("data/ames_housing_test_numeric.csv")
final_learner = learners$xgboost_tuned
final_learner$train(task)
pred = final_learner$predict_newdata(newdata = test_set, task = task)

# convert the predictions to a data.table and export them for Kaggle
library(data.table)
pred = as.data.table(pred)
pred$truth = NULL
write.csv(pred, "data/ames_housing_submission_day2.csv", row.names = FALSE)
```
Binary file added attic/ames_usecase/ames2.pdf
Binary file not shown.