alsuren · anusha-ramdarshan · Apr 20, 2021 · Apr 20, 2021 · Apr 20, 2021 · Apr 22, 2021
diff --git a/notebooks/README.md b/notebooks/README.md
@@ -35,3 +35,22 @@ Make a data export from your influxdb, and rename the files to (notebooks/):
 We use `nbstripout` to strip jupyter notebook cell output when committing to git and diffing.
 
 Run `poetry run nbstripout --install --attributes ../.gitattributes` to get that working if it's not already enabled on your system.
+
+## Looking at the data
+
+The data export is split into separate CSV files, one per data type. Depending on your setup, there will be up to 5 files:
+- homie_boolean: contains all metrics stored as booleans (smart lights)
+- homie_enum
+- homie_color: contains rgb values for the smart lights
+- homie_float: contains all metrics stored as floats (temperature)
+- homie_integer: contains all metrics stored as integers (humidity %, battery level %)
+
+Here, we want to focus on the CSVs containing floats and integers, as they contain the temperature/humidity data. Useful columns:
+- time: timestamp expressed as time since the Unix epoch (1970-01-01). pandas handles the conversion for us.
+- device_id
+- device_name: only use data with device containing raspberry pi or cottage pi
+- node_id: mac address of the sensor
+- node_type: filter on `node_type == "Mijia sensor"` to select only the temperature/humidity sensor data
+- node_name: nickname for the sensor (e.g., "living room")
+
+There are between 4 and 10 data points per sensor per minute, depending on how often a sensor gets polled (~10K data points in a 24h period for a given sensor).
diff --git a/notebooks/data_exploration.ipynb b/notebooks/data_exploration.ipynb
@@ -7,7 +7,9 @@
    "outputs": [],
    "source": [
     "import pandas as pd \n",
-    "import plotly.express as px\n"
+    "import plotly.express as px\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.decomposition import PCA"
    ]
   },
   {
@@ -150,12 +152,114 @@
     "\n",
     "plot_temp_variations(dataset)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_boxplots_per_sensor(df):\n",
+    "    remove = [\"Table dangly\", \"Outside chair\", \"Fridge drawer\", \"Fridge door\", \"2AA3D2\", \"392F3E\", \"Tree top\", \"Tree bottom\"]\n",
+    "    data = df[~df['node_name'].isin(remove)].dropna().copy()\n",
+    "    # Separating out the features\n",
+    "    x = data.loc[:, ['temperature', 'humidity']].values\n",
+    "    # Separating out the target\n",
+    "    # y = df.loc[:,['node_name']].values\n",
+    "    # Standardizing the features\n",
+    "    x = StandardScaler().fit_transform(x)\n",
+    "\n",
+    "    pca = PCA(n_components=1)\n",
+    "    principalComponents = pca.fit_transform(x)\n",
+    "    print(len(principalComponents))\n",
+    "    print(data.shape)\n",
+    "    data['PCA']= principalComponents\n",
+    "    fig = px.box(data, y=\"PCA\", x='node_name')\n",
+    "    return fig.show()\n",
+    "\n",
+    "#plot_boxplots_per_sensor(dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_week_month_comparison(df):\n",
+    "    remove = [\"Table dangly\", \"Outside chair\", \"Fridge drawer\", \"Fridge door\", \"2AA3D2\", \"392F3E\", \"Tree top\", \"Tree bottom\"]\n",
+    "    data = df[~df['node_name'].isin(remove)].dropna().copy()\n",
+    "    data['day_name'] = data['time'].dt.day_name()\n",
+    "    data['month_number'] = data['time'].dt.month\n",
+    "    data['time_of_day']= data['time'].dt.time\n",
+    "    \n",
+    "    data = data.set_index('time').groupby(['day_name']).resample('30min')['temperature'].mean().reset_index()\n",
+    "    data['time_of_day']= data['time'].dt.time\n",
+    "    data['month_number'] = data['time'].dt.month\n",
+    "    data = data.groupby(['time_of_day','day_name','month_number'])['temperature'].mean().reset_index()\n",
+    "    data = data.loc[(data['month_number']==1)|(data['month_number']==4)]\n",
+    "    \n",
+    "    fig = px.line(data, x=\"time_of_day\", y='temperature', color='day_name',facet_row=\"month_number\", width=700, height=700,category_orders={\"day_name\": [\"Monday\", \"Tuesday\", \"Wednesday\", \"Thursday\", \"Friday\", \"Saturday\", \"Sunday\"]})\n",
+    "    fig.update_layout(\n",
+    "    xaxis = dict(\n",
+    "        tickmode = 'array',\n",
+    "        tickvals = [f\"{h:02}:00:00\" for h in range(0, 24, 2)],\n",
+    "        ticktext = [f\"{h:02}:00\" for h in range(0, 24, 2)],\n",
+    "))\n",
+    "    fig.update_xaxes(tickangle=45)\n",
+    "    return fig.show()\n",
+    "\n",
+    "plot_week_month_comparison(dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def compare_days_of_the_week(df):\n",
+    "    remove = [\"Table dangly\", \"Outside chair\", \"Fridge drawer\", \"Fridge door\", \"2AA3D2\", \"392F3E\", \"Tree top\", \"Tree bottom\"]\n",
+    "    data = df[~df['node_name'].isin(remove)].dropna().copy()\n",
+    "    data['day_name'] = data['time'].dt.day_name()\n",
+    "    data['month_number'] = data['time'].dt.month\n",
+    "    data['time_of_day']= data['time'].dt.time\n",
+    "    \n",
+    "    data = data.set_index('time').groupby(['day_name']).resample('30min')['temperature'].mean().reset_index()\n",
+    "    data['time_of_day']= data['time'].dt.time\n",
+    "    data = data.groupby(['time_of_day','day_name'])['temperature'].mean().reset_index()\n",
+    "    \n",
+    "    fig = px.line(data, x=\"time_of_day\", y='temperature', color='day_name',labels=dict(time_of_day=\"Time of Day\", temperature=\"Temperature (°C)\", day_name=\"Day of the Week\"),category_orders={\"day_name\": [\"Monday\", \"Tuesday\", \"Wednesday\", \"Thursday\", \"Friday\", \"Saturday\", \"Sunday\"]})\n",
+    "    fig.update_layout(\n",
+    "    xaxis = dict(\n",
+    "        tickmode = 'array',\n",
+    "        tickvals = [f\"{h:02}:00:00\" for h in range(0, 24, 2)],\n",
+    "        ticktext = [f\"{h:02}:00\" for h in range(0, 24, 2)],\n",
+    "    )\n",
+    ")\n",
+    "    fig.update_xaxes(tickangle=45)\n",
+    "    return fig.show()\n",
+    "\n",
+    "compare_days_of_the_week(dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3.9.4 64-bit ('mijia-homie-qZYmZ-v8-py3.9': venv)",
-   "name": "python394jvsc74a57bd034ee638aa14cee10dc00b93073271fa396fbb064582ffa24f14c58036232187c"
+   "display_name": "Python 3.9.2 64-bit",
+   "metadata": {
+    "interpreter": {
+     "hash": "de3140ad81ba08929dc8d47238f6d45138469e1e91652694ab15112290a4cfb7"
+    }
+   },
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -167,7 +271,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.4"
+   "version": "3.9.2-final"
   }
  },
  "nbformat": 4,

diff --git a/notebooks/poetry.lock b/notebooks/poetry.lock
diff --git a/notebooks/pyproject.toml b/notebooks/pyproject.toml
@@ -10,6 +10,7 @@ ipykernel = "^5.5.3"
 pandas = "^1.2.4"
 plotly = "^4.14.3"
 nbstripout = "^0.3.9"
+sklearn = "^0.0"
 
 [tool.poetry.dev-dependencies]