diff --git a/topic/timeseries/dask-weather-data-import.ipynb b/topic/timeseries/dask-weather-data-import.ipynb index 25d56293..b7951ac1 100644 --- a/topic/timeseries/dask-weather-data-import.ipynb +++ b/topic/timeseries/dask-weather-data-import.ipynb @@ -7,10 +7,19 @@ "source": [ "# How to Build Time Series Applications in CrateDB\n", "\n", - "This notebook guides you through an example of how to import and work with\n", + "This notebook guides you through an example of how to batch import \n", "time series data in CrateDB. It uses Dask to import data into CrateDB.\n", "Dask is a framework to parallelize operations on pandas Dataframes.\n", "\n", + "## Important Note\n", + "If you are running this notebook on a (free) Google Colab environment, you \n", + "might not see the parallelized execution of Dask operations due to constrained\n", + "CPU availability.\n", + "\n", + "We therefore recommend running this notebook either locally or in an environment\n", + "that provides sufficient CPU capacity to demonstrate the parallel execution of\n", + "dataframe operations as well as write operations to CrateDB.\n", + "\n", "## Dataset\n", "This notebook uses a daily weather data set provided on kaggle.com. This dataset\n", "offers a collection of **daily weather readings from major cities around the\n", @@ -57,7 +66,7 @@ }, "outputs": [], "source": [ - "#!pip install dask pandas==2.0.0 'sqlalchemy[crate]'" + "!pip install dask 'pandas==2.0.0' 'crate[sqlalchemy]' 'cratedb-toolkit==0.0.10' 'pueblo>=0.0.7' kaggle" ] }, { @@ -75,6 +84,9 @@ "- Countries (countries.csv)\n", "\n", "The subsequent code cell acquires the dataset directly from kaggle.com.\n", + "In order to import the data automatically, you need to create a (free)\n", + "API key in your kaggle.com user settings. \n", + "\n", "To properly configure the notebook to use corresponding credentials\n", "after signing up on Kaggle, define the `KAGGLE_USERNAME` and\n", "`KAGGLE_KEY` environment variables. 
Alternatively, put them into the\n", @@ -85,6 +97,7 @@ " \"key\": \"2b1dac2af55caaf1f34df76236fada4a\"\n", "}\n", "```\n", + "\n", "Another variant is to acquire the dataset files manually, and extract\n", "them into a folder called `DOWNLOAD`. In this case, you can deactivate\n", "those two lines of code, in order to skip automatic dataset acquisition." @@ -92,37 +105,52 @@ }, { "cell_type": "code", - "execution_count": null, - "outputs": [], + "execution_count": 3, + "id": "8fcc014a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset URL: https://www.kaggle.com/datasets/guillemservera/global-daily-climate-data\n" + ] + } + ], "source": [ + "from pueblo.util.environ import getenvpass\n", "from cratedb_toolkit.datasets import load_dataset\n", "\n", + "# Uncomment and execute the following lines to be prompted for your Kaggle username and API key\n", + "# getenvpass(\"KAGGLE_USERNAME\", prompt=\"Kaggle.com User Name:\")\n", + "# getenvpass(\"KAGGLE_KEY\", prompt=\"Kaggle.com Key:\")\n", + "\n", "dataset = load_dataset(\"kaggle://guillemservera/global-daily-climate-data/daily_weather.parquet\")\n", "dataset.acquire()" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 6, + "id": "d9e2916d", + "metadata": {}, "outputs": [], "source": [ "from dask import dataframe as dd\n", "from dask.diagnostics import ProgressBar\n", "\n", + "# Run Dask tasks in separate processes (multiprocessing scheduler)\n", + "import dask.multiprocessing\n", + "dask.config.set(scheduler=dask.multiprocessing.get)\n", + "\n", "# Show a progress bar for dask activities\n", "pbar = ProgressBar()\n", "pbar.register()" - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 9, "id": "a506f7c9", "metadata": {}, "outputs": [ @@ -130,10 +158,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "[########################################] | 100% 
Completed | 6.26 ss\n", - "[########################################] | 100% Completed | 6.37 s\n", - "[########################################] | 100% Completed | 6.47 s\n", - "[########################################] | 100% Completed | 6.47 s\n", + "[########################################] | 100% Completed | 127.49 s\n", + "[########################################] | 100% Completed | 127.49 s\n", "\n", "Index: 27635763 entries, 0 to 24220\n", "Data columns (total 14 columns):\n", @@ -155,10 +181,8 @@ "13 sunshine_total_min 1021461 non-null float64\n", "dtypes: category(3), datetime64[ns](1), float64(10)\n", "memory usage: 2.6 GB\n", - "[########################################] | 100% Completed | 5.37 ss\n", - "[########################################] | 100% Completed | 5.48 s\n", - "[########################################] | 100% Completed | 5.58 s\n", - "[########################################] | 100% Completed | 5.68 s\n" + "[########################################] | 100% Completed | 4.82 ss\n", + "[########################################] | 100% Completed | 4.89 s\n" ] }, { @@ -311,7 +335,7 @@ "4 NaN NaN NaN " ] }, - "execution_count": 56, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -490,14 +514,13 @@ }, { "cell_type": "markdown", + "id": "ea1dfadc", + "metadata": {}, "source": [ "### Connect to CrateDB\n", "\n", "This code uses SQLAlchemy to connect to CrateDB." - ], - "metadata": { - "collapsed": false - } + ] }, { "cell_type": "code",