diff --git a/flight_analysis.ipynb b/flight_analysis.ipynb index abbf029..a02daf5 100644 --- a/flight_analysis.ipynb +++ b/flight_analysis.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -11,17 +11,17 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "flights = Scrape(\"CTA\", \"PMO\", \"2023-10-28\")\n", + "flights = Scrape(\"Frosinone\", \"Sydney\", \"2023-10-25\")\n", "flights.run_scrape()" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -60,310 +60,278 @@ " access_date\n", " one_way\n", " has_train\n", + " days_advance\n", " \n", " \n", " \n", " \n", " 0\n", - " 2023-10-28 07:50:00\n", - " 2023-10-28 11:25:00\n", - " [easyJet]\n", - " 0 days 03:35:00\n", - " CTA\n", - " PMO\n", + " 2023-10-25 21:10:00\n", + " 2023-10-27 13:35:00\n", + " [China Eastern]\n", + " 1 days 07:25:00\n", + " FCO\n", + " SYD\n", " 1\n", - " 0 days 01:30:00\n", - " [NAP]\n", - " 58\n", - " typical\n", - " None\n", - " 2023-08-07 20:18:11.150904\n", + " 0 days 09:25:00\n", + " [PVG]\n", + " 694\n", + " low\n", + " 194\n", + " 2023-08-08 18:23:03.469385\n", " True\n", " False\n", + " 78\n", " \n", " \n", " 1\n", - " 2023-10-28 14:00:00\n", - " 2023-10-28 18:20:00\n", - " [ITA]\n", - " 0 days 04:20:00\n", - " CTA\n", - " PMO\n", + " 2023-10-25 21:10:00\n", + " 2023-10-27 10:00:00\n", + " [China Eastern]\n", + " 1 days 03:50:00\n", + " FCO\n", + " SYD\n", " 1\n", - " 0 days 01:50:00\n", - " [FCO]\n", - " 222\n", - " typical\n", - " None\n", - " 2023-08-07 20:18:11.150920\n", + " 0 days 05:55:00\n", + " [PVG]\n", + " 924\n", + " low\n", + " 194\n", + " 2023-08-08 18:23:03.469406\n", " True\n", " False\n", + " 78\n", " \n", " \n", " 2\n", - " 2023-10-28 19:10:00\n", - " 2023-10-28 22:55:00\n", - " [ITA]\n", - " 0 days 03:45:00\n", - " CTA\n", - " PMO\n", - " 1\n", - " 0 days 01:10:00\n", - " [FCO]\n", - " 222\n", - " typical\n", - " None\n", - " 2023-08-07 20:18:11.150922\n", + " 2023-10-25 08:40:00\n", + " 2023-10-26 21:30:00\n", + " [ITA, THAI]\n", + " 1 days 03:50:00\n", + " FCO\n", + " SYD\n", + " 2\n", + " NaT\n", + " [FRA, BKK]\n", + " 961\n", + " low\n", + " 194\n", + " 2023-08-08 18:23:03.469420\n", " True\n", " False\n", + " 77\n", " \n", " \n", " 3\n", - " 2023-10-28 12:50:00\n", - " 2023-10-28 22:55:00\n", - " [ITA]\n", - " 0 days 10:05:00\n", - " CTA\n", - " PMO\n", - " 3\n", - " NaT\n", - " [LIN, NAP, FCO]\n", - " 191\n", - " typical\n", - " None\n", - " 2023-08-07 20:18:11.150924\n", + " 2023-10-25 15:30:00\n", + " 2023-10-26 23:05:00\n", + " [Emirates, Qantas]\n", + " 0 days 22:35:00\n", + " FCO\n", + " SYD\n", + " 1\n", + " 0 days 03:05:00\n", + " [DXB]\n", + " 1312\n", + " low\n", + " 194\n", + " 2023-08-08 18:23:03.469423\n", " True\n", " False\n", + " 77\n", " \n", " \n", " 4\n", - " 2023-10-28 15:25:00\n", - " 2023-10-28 22:10:00\n", - " [ITA]\n", - " 0 days 06:45:00\n", - " CTA\n", - " PMO\n", + " 2023-10-25 10:20:00\n", + " 2023-10-26 19:10:00\n", + " [Etihad]\n", + " 0 days 23:50:00\n", + " FCO\n", + " SYD\n", " 1\n", - " 0 days 04:20:00\n", - " [FCO]\n", - " 222\n", - " typical\n", - " None\n", - " 2023-08-07 20:18:11.150926\n", + " 0 days 04:05:00\n", + " [AUH]\n", + " 1322\n", + " low\n", + " 194\n", + " 2023-08-08 18:23:03.469426\n", " True\n", " False\n", + " 77\n", " \n", " \n", " 5\n", - " 2023-10-28 11:05:00\n", - " 2023-10-28 16:20:00\n", - " [ITA]\n", - " 0 days 05:15:00\n", - " CTA\n", - " PMO\n", - " 1\n", - " 0 days 01:50:00\n", - " [LIN]\n", - " 228\n", - " typical\n", - " None\n", - " 2023-08-07 20:18:11.150927\n", + " 2023-10-25 08:40:00\n", + " 2023-10-27 09:15:00\n", + " [ITA, Vietnam Airlines]\n", + " 1 days 15:35:00\n", + " FCO\n", + " SYD\n", + " 2\n", + " NaT\n", + " [CDG, SGN]\n", + " 877\n", + " low\n", + " 194\n", + " 2023-08-08 18:23:03.469428\n", " True\n", " False\n", + " 77\n", " \n", " \n", " 6\n", - " 2023-10-28 06:00:00\n", - " 2023-10-28 09:25:00\n", - " [ITA]\n", - " 0 days 03:25:00\n", - " CTA\n", - " PMO\n", - " 1\n", - " 0 days 00:55:00\n", - " [FCO]\n", - " 254\n", - " typical\n", - " None\n", - " 2023-08-07 20:18:11.150929\n", + " 2023-10-25 08:40:00\n", + " 2023-10-27 09:15:00\n", + " [ITA, Vietnam Airlines]\n", + " 1 days 15:35:00\n", + " FCO\n", + " SYD\n", + " 2\n", + " NaT\n", + " [FRA, SGN]\n", + " 885\n", + " low\n", + " 194\n", + " 2023-08-08 18:23:03.469430\n", " True\n", " False\n", + " 77\n", " \n", " \n", " 7\n", - " 2023-10-28 10:15:00\n", - " 2023-10-28 14:25:00\n", - " [ITA]\n", - " 0 days 04:10:00\n", - " CTA\n", - " PMO\n", - " 1\n", - " 0 days 01:35:00\n", - " [FCO]\n", - " 265\n", - " typical\n", - " None\n", - " 2023-08-07 20:18:11.150931\n", + " 2023-10-25 15:30:00\n", + " 2023-10-27 11:00:00\n", + " [Turkish Airlines, Malaysia Airlines]\n", + " 1 days 10:30:00\n", + " FCO\n", + " SYD\n", + " 2\n", + " NaT\n", + " [IST, KUL]\n", + " 923\n", + " low\n", + " 194\n", + " 2023-08-08 18:23:03.469433\n", " True\n", " False\n", + " 77\n", " \n", " \n", " 8\n", - " 2023-10-28 07:00:00\n", - " 2023-10-28 11:55:00\n", - " [ITA]\n", - " 0 days 04:55:00\n", - " CTA\n", - " PMO\n", - " 1\n", - " 0 days 02:20:00\n", - " [FCO]\n", - " 280\n", - " typical\n", - " None\n", - " 2023-08-07 20:18:11.150933\n", + " 2023-10-25 10:45:00\n", + " 2023-10-26 21:30:00\n", + " [ITA, THAI]\n", + " 1 days 01:45:00\n", + " FCO\n", + " SYD\n", + " 2\n", + " NaT\n", + " [ZRH, BKK]\n", + " 1004\n", + " low\n", + " 194\n", + " 2023-08-08 18:23:03.469435\n", " True\n", " False\n", + " 77\n", " \n", " \n", " 9\n", - " 2023-10-28 06:00:00\n", - " 2023-10-28 11:05:00\n", - " [Lufthansa]\n", - " 0 days 05:05:00\n", - " CTA\n", - " PMO\n", + " 2023-10-25 10:20:00\n", + " 2023-10-27 07:20:00\n", + " [Etihad]\n", + " 1 days 12:00:00\n", + " FCO\n", + " SYD\n", " 1\n", - " 0 days 01:00:00\n", - " [MUC]\n", - " 408\n", - " typical\n", - " None\n", - " 2023-08-07 20:18:11.150934\n", + " 0 days 16:20:00\n", + " [AUH]\n", + " 1077\n", + " low\n", + " 194\n", + " 2023-08-08 18:23:03.469438\n", " True\n", " False\n", + " 77\n", " \n", " \n", " 10\n", - " 2023-10-28 21:35:00\n", - " 2023-10-29 11:10:00\n", - " [British Airways]\n", - " 0 days 14:35:00\n", - " CTA\n", - " PMO\n", + " 2023-10-25 21:25:00\n", + " 2023-10-27 07:15:00\n", + " [Korean Air]\n", + " 1 days 00:50:00\n", + " FCO\n", + " SYD\n", " 1\n", - " NaT\n", - " None\n", - " 457\n", - " typical\n", - " None\n", - " 2023-08-07 20:18:11.150936\n", - " True\n", - " False\n", - " \n", - " \n", - " 11\n", - " 2023-10-28 19:25:00\n", - " 2023-10-29 17:40:00\n", - " [Turkish Airlines]\n", - " 0 days 23:15:00\n", - " CTA\n", - " PMO\n", - " 1\n", - " 0 days 18:20:00\n", - " [IST]\n", - " 504\n", - " typical\n", - " None\n", - " 2023-08-07 20:18:11.150938\n", - " True\n", - " False\n", - " \n", - " \n", - " 12\n", - " 2023-10-28 07:00:00\n", - " 2023-10-28 13:55:00\n", - " [ITA, SWISS]\n", - " 0 days 06:55:00\n", - " CTA\n", - " PMO\n", - " 2\n", - " NaT\n", - " [FCO, ZRH]\n", - " 617\n", - " typical\n", - " None\n", - " 2023-08-07 20:18:11.150939\n", + " 0 days 03:15:00\n", + " [ICN]\n", + " 1249\n", + " low\n", + " 194\n", + " 2023-08-08 18:23:03.469440\n", " True\n", " False\n", + " 78\n", " \n", " \n", "\n", "" ], "text/plain": [ - " departure_datetime arrival_datetime airlines \\\n", - "0 2023-10-28 07:50:00 2023-10-28 11:25:00 [easyJet] \n", - "1 2023-10-28 14:00:00 2023-10-28 18:20:00 [ITA] \n", - "2 2023-10-28 19:10:00 2023-10-28 22:55:00 [ITA] \n", - "3 2023-10-28 12:50:00 2023-10-28 22:55:00 [ITA] \n", - "4 2023-10-28 15:25:00 2023-10-28 22:10:00 [ITA] \n", - "5 2023-10-28 11:05:00 2023-10-28 16:20:00 [ITA] \n", - "6 2023-10-28 06:00:00 2023-10-28 09:25:00 [ITA] \n", - "7 2023-10-28 10:15:00 2023-10-28 14:25:00 [ITA] \n", - "8 2023-10-28 07:00:00 2023-10-28 11:55:00 [ITA] \n", - "9 2023-10-28 06:00:00 2023-10-28 11:05:00 [Lufthansa] \n", - "10 2023-10-28 21:35:00 2023-10-29 11:10:00 [British Airways] \n", - "11 2023-10-28 19:25:00 2023-10-29 17:40:00 [Turkish Airlines] \n", - "12 2023-10-28 07:00:00 2023-10-28 13:55:00 [ITA, SWISS] \n", + " departure_datetime arrival_datetime \\\n", + "0 2023-10-25 21:10:00 2023-10-27 13:35:00 \n", + "1 2023-10-25 21:10:00 2023-10-27 10:00:00 \n", + "2 2023-10-25 08:40:00 2023-10-26 21:30:00 \n", + "3 2023-10-25 15:30:00 2023-10-26 23:05:00 \n", + "4 2023-10-25 10:20:00 2023-10-26 19:10:00 \n", + "5 2023-10-25 08:40:00 2023-10-27 09:15:00 \n", + "6 2023-10-25 08:40:00 2023-10-27 09:15:00 \n", + "7 2023-10-25 15:30:00 2023-10-27 11:00:00 \n", + "8 2023-10-25 10:45:00 2023-10-26 21:30:00 \n", + "9 2023-10-25 10:20:00 2023-10-27 07:20:00 \n", + "10 2023-10-25 21:25:00 2023-10-27 07:15:00 \n", "\n", - " travel_time origin destination layover_n layover_time \\\n", - "0 0 days 03:35:00 CTA PMO 1 0 days 01:30:00 \n", - "1 0 days 04:20:00 CTA PMO 1 0 days 01:50:00 \n", - "2 0 days 03:45:00 CTA PMO 1 0 days 01:10:00 \n", - "3 0 days 10:05:00 CTA PMO 3 NaT \n", - "4 0 days 06:45:00 CTA PMO 1 0 days 04:20:00 \n", - "5 0 days 05:15:00 CTA PMO 1 0 days 01:50:00 \n", - "6 0 days 03:25:00 CTA PMO 1 0 days 00:55:00 \n", - "7 0 days 04:10:00 CTA PMO 1 0 days 01:35:00 \n", - "8 0 days 04:55:00 CTA PMO 1 0 days 02:20:00 \n", - "9 0 days 05:05:00 CTA PMO 1 0 days 01:00:00 \n", - "10 0 days 14:35:00 CTA PMO 1 NaT \n", - "11 0 days 23:15:00 CTA PMO 1 0 days 18:20:00 \n", - "12 0 days 06:55:00 CTA PMO 2 NaT \n", + " airlines travel_time origin destination \\\n", + "0 [China Eastern] 1 days 07:25:00 FCO SYD \n", + "1 [China Eastern] 1 days 03:50:00 FCO SYD \n", + "2 [ITA, THAI] 1 days 03:50:00 FCO SYD \n", + "3 [Emirates, Qantas] 0 days 22:35:00 FCO SYD \n", + "4 [Etihad] 0 days 23:50:00 FCO SYD \n", + "5 [ITA, Vietnam Airlines] 1 days 15:35:00 FCO SYD \n", + "6 [ITA, Vietnam Airlines] 1 days 15:35:00 FCO SYD \n", + "7 [Turkish Airlines, Malaysia Airlines] 1 days 10:30:00 FCO SYD \n", + "8 [ITA, THAI] 1 days 01:45:00 FCO SYD \n", + "9 [Etihad] 1 days 12:00:00 FCO SYD \n", + "10 [Korean Air] 1 days 00:50:00 FCO SYD \n", "\n", - " layover_location price_eur price_trend price_value \\\n", - "0 [NAP] 58 typical None \n", - "1 [FCO] 222 typical None \n", - "2 [FCO] 222 typical None \n", - "3 [LIN, NAP, FCO] 191 typical None \n", - "4 [FCO] 222 typical None \n", - "5 [LIN] 228 typical None \n", - "6 [FCO] 254 typical None \n", - "7 [FCO] 265 typical None \n", - "8 [FCO] 280 typical None \n", - "9 [MUC] 408 typical None \n", - "10 None 457 typical None \n", - "11 [IST] 504 typical None \n", - "12 [FCO, ZRH] 617 typical None \n", + " layover_n layover_time layover_location price_eur price_trend \\\n", + "0 1 0 days 09:25:00 [PVG] 694 low \n", + "1 1 0 days 05:55:00 [PVG] 924 low \n", + "2 2 NaT [FRA, BKK] 961 low \n", + "3 1 0 days 03:05:00 [DXB] 1312 low \n", + "4 1 0 days 04:05:00 [AUH] 1322 low \n", + "5 2 NaT [CDG, SGN] 877 low \n", + "6 2 NaT [FRA, SGN] 885 low \n", + "7 2 NaT [IST, KUL] 923 low \n", + "8 2 NaT [ZRH, BKK] 1004 low \n", + "9 1 0 days 16:20:00 [AUH] 1077 low \n", + "10 1 0 days 03:15:00 [ICN] 1249 low \n", "\n", - " access_date one_way has_train \n", - "0 2023-08-07 20:18:11.150904 True False \n", - "1 2023-08-07 20:18:11.150920 True False \n", - "2 2023-08-07 20:18:11.150922 True False \n", - "3 2023-08-07 20:18:11.150924 True False \n", - "4 2023-08-07 20:18:11.150926 True False \n", - "5 2023-08-07 20:18:11.150927 True False \n", - "6 2023-08-07 20:18:11.150929 True False \n", - "7 2023-08-07 20:18:11.150931 True False \n", - "8 2023-08-07 20:18:11.150933 True False \n", - "9 2023-08-07 20:18:11.150934 True False \n", - "10 2023-08-07 20:18:11.150936 True False \n", - "11 2023-08-07 20:18:11.150938 True False \n", - "12 2023-08-07 20:18:11.150939 True False " + " price_value access_date one_way has_train days_advance \n", + "0 194 2023-08-08 18:23:03.469385 True False 78 \n", + "1 194 2023-08-08 18:23:03.469406 True False 78 \n", + "2 194 2023-08-08 18:23:03.469420 True False 77 \n", + "3 194 2023-08-08 18:23:03.469423 True False 77 \n", + "4 194 2023-08-08 18:23:03.469426 True False 77 \n", + "5 194 2023-08-08 18:23:03.469428 True False 77 \n", + "6 194 2023-08-08 18:23:03.469430 True False 77 \n", + "7 194 2023-08-08 18:23:03.469433 True False 77 \n", + "8 194 2023-08-08 18:23:03.469435 True False 77 \n", + "9 194 2023-08-08 18:23:03.469438 True False 77 \n", + "10 194 2023-08-08 18:23:03.469440 True False 78 " ] }, - "execution_count": 8, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -374,16 +342,16 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'https://www.google.com/travel/flights?q=Flights%20to%20PMO%20from%20CTA%20on%202023-10-28%20oneway&curr=EUR&gl=IT'" + "'https://www.google.com/travel/flights?q=Flights%20to%20Sydney%20from%20Frosinone%20on%202023-10-25%20oneway&curr=EUR&gl=IT'" ] }, - "execution_count": 9, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -394,7 +362,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -404,7 +372,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/emanuelesalonico/Library/CloudStorage/GoogleDrive-esalonico@gmail.com/My Drive/SYNC/Dev/flight-analysis/flight_analysis.ipynb Cell 5\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mNotImplementedError\u001b[39;00m()\n", + "\u001b[1;32m/Users/emanuelesalonico/Library/CloudStorage/GoogleDrive-esalonico@gmail.com/My Drive/SYNC/Dev/flight-analysis/flight_analysis.ipynb Cell 5\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mNotImplementedError\u001b[39;00m()\n", "\u001b[0;31mNotImplementedError\u001b[0m: " ] } @@ -422,6 +390,7 @@ "import pandas as pd\n", "from src.flight_analysis.database import Database\n", "import private.private as private\n", + "import numpy as np\n", "\n", "db = Database(\n", " db_host=private.DB_HOST,\n", @@ -441,9 +410,117 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
flight_uuidlayover_location
0bc9de66a-e11b-4ded-a230-0eccbc7f3838None
137e1aaeb-80d9-4cd2-b24f-1d197a93671fNone
2f6956f34-ddc4-4c57-80ef-706049058c34None
38b0795c7-2acb-427e-81aa-4181827a483eNone
4f7aa01e7-6fac-48ab-bb37-22f6b39c8213None
.........
11141517893f94-0096-42d0-b71c-c5fedd3be13bBCN
111416a2dc98dd-6268-44ea-af71-598f6a59fad3BCN
111417dd722f79-10c7-4f1d-97fa-fc31b608c182LHR
111418aa93dd96-6ffd-4a69-ac9b-e464b7e24b43AMS
111419d05452e6-e69c-45a1-aa86-6048cc3327c4ATH
\n", + "

111420 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " flight_uuid layover_location\n", + "0 bc9de66a-e11b-4ded-a230-0eccbc7f3838 None\n", + "1 37e1aaeb-80d9-4cd2-b24f-1d197a93671f None\n", + "2 f6956f34-ddc4-4c57-80ef-706049058c34 None\n", + "3 8b0795c7-2acb-427e-81aa-4181827a483e None\n", + "4 f7aa01e7-6fac-48ab-bb37-22f6b39c8213 None\n", + "... ... ...\n", + "111415 17893f94-0096-42d0-b71c-c5fedd3be13b BCN\n", + "111416 a2dc98dd-6268-44ea-af71-598f6a59fad3 BCN\n", + "111417 dd722f79-10c7-4f1d-97fa-fc31b608c182 LHR\n", + "111418 aa93dd96-6ffd-4a69-ac9b-e464b7e24b43 AMS\n", + "111419 d05452e6-e69c-45a1-aa86-6048cc3327c4 ATH\n", + "\n", + "[111420 rows x 2 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df = pd.DataFrame(res, columns=[\"flight_uuid\", \"layover_location\"])\n", "df" @@ -451,9 +528,125 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/_6/c56rsvy15k9_gv99mkdll8pc0000gn/T/ipykernel_94061/3424689233.py:2: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access\n", + " df.lauover_location = df.layover_location.str.strip()\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
flight_uuidlayover_location
5811962d6-e250-4480-bb8a-2d2ae511afe4ATH
8bea04f18-6fc3-4f39-8a57-caeaa25bf35bLGW
9941c0948-ceda-4a4f-bb60-f9fc5a322b95BCN
194a86fa79-dc3d-4e92-9c03-412bdfbf3810DUS
226a341fc6-9761-4cdc-bced-cf3fa636c2a9BCN
.........
11141517893f94-0096-42d0-b71c-c5fedd3be13bBCN
111416a2dc98dd-6268-44ea-af71-598f6a59fad3BCN
111417dd722f79-10c7-4f1d-97fa-fc31b608c182LHR
111418aa93dd96-6ffd-4a69-ac9b-e464b7e24b43AMS
111419d05452e6-e69c-45a1-aa86-6048cc3327c4ATH
\n", + "

50651 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " flight_uuid layover_location\n", + "5 811962d6-e250-4480-bb8a-2d2ae511afe4 ATH\n", + "8 bea04f18-6fc3-4f39-8a57-caeaa25bf35b LGW\n", + "9 941c0948-ceda-4a4f-bb60-f9fc5a322b95 BCN\n", + "19 4a86fa79-dc3d-4e92-9c03-412bdfbf3810 DUS\n", + "22 6a341fc6-9761-4cdc-bced-cf3fa636c2a9 BCN\n", + "... ... ...\n", + "111415 17893f94-0096-42d0-b71c-c5fedd3be13b BCN\n", + "111416 a2dc98dd-6268-44ea-af71-598f6a59fad3 BCN\n", + "111417 dd722f79-10c7-4f1d-97fa-fc31b608c182 LHR\n", + "111418 aa93dd96-6ffd-4a69-ac9b-e464b7e24b43 AMS\n", + "111419 d05452e6-e69c-45a1-aa86-6048cc3327c4 ATH\n", + "\n", + "[50651 rows x 2 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# get rid of \"min\"\n", "df.lauover_location = df.layover_location.str.strip()\n", @@ -467,9 +660,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, "outputs": [], + "source": [ + "db.create_scraped_layovers_table()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 / 11\n", + "2 / 11\n", + "3 / 11\n", + "4 / 11\n", + "5 / 11\n", + "6 / 11\n", + "7 / 11\n", + "8 / 11\n", + "9 / 11\n", + "10 / 11\n", + "11 / 11\n" + ] + } + ], "source": [ "def split_dataframe(df, chunk_size=5000):\n", " chunks = list()\n", @@ -656,21 +876,167 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ - "import re\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iatanamecountrylatlonregionmunicipalitycontinent
40729FCORome–Fiumicino Leonardo da Vinci International...IT41.80453212.251998IT-62RomeEU
\n", + "
" + ], + "text/plain": [ + " iata name country \\\n", + "40729 FCO Rome–Fiumicino Leonardo da Vinci International... IT \n", + "\n", + " lat lon region municipality continent \n", + "40729 41.804532 12.251998 IT-62 Rome EU " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "AIRPORTS_DF_URL = \"https://raw.githubusercontent.com/datasets/airport-codes/master/data/airport-codes.csv\"\n", + "df = pd.read_csv(AIRPORTS_DF_URL, encoding=\"utf-8\")\n", + "\n", + "# filter original data\n", + "# take only aiprort that have a IATA code\n", + "df = df.loc[df.iata_code.notnull()]\n", + "\n", + "# exclude closed, seaplane_base and heliport\n", + "df = df.loc[df.type.isin([\"large_airport\", \"medium_airport\", \"small_airport\"])]\n", + "\n", + "# split coordinates in latitude and longitude\n", + "df[\"lat\"] = df.coordinates.apply(lambda x: x.split(\",\")[0]).astype(float)\n", + "df[\"lon\"] = df.coordinates.apply(lambda x: x.split(\",\")[1]).astype(float)\n", + "\n", + "# take only useful columns\n", + "cols = [\n", + " \"iata_code\",\n", + " \"name\",\n", + " \"iso_country\",\n", + " \"lat\",\n", + " \"lon\",\n", + " \"iso_region\",\n", + " \"municipality\",\n", + " \"continent\",\n", + "]\n", + "df = df[cols]\n", + "\n", + "# rename columns\n", + "df = df.rename(\n", + " columns={\n", + " \"iata_code\": \"iata\",\n", + " \"iso_country\": \"country\",\n", + " \"iso_region\": \"region\",\n", + " }\n", + ")\n", "\n", - "not bool(re.search(\"ITA, \", \"ITA, Singapore\"))" + "# replace nan with None\n", + "df = df.replace({pd.NA: None, np.nan: None})\n", + "\n", + "df.loc[df.iata == \"FCO\"]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pandas as pd\n", + "from src.flight_analysis.database import Database\n", + "import private.private as private\n", + "\n", + "db = Database(\n", + " db_host=private.DB_HOST,\n", + " db_name=private.DB_NAME,\n", + " db_user=private.DB_USER,\n", + " db_pw=private.DB_PW,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['scraped', 'scraped_layovers', 'scraped_airlines']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.list_all_tables()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "db.create_data_airports_table()" + ] } ], "metadata": { diff --git a/flight_analysis.py b/flight_analysis.py index cd31d79..f8d438d 100644 --- a/flight_analysis.py +++ b/flight_analysis.py @@ -108,10 +108,12 @@ def get_routes_df(routes: list): final_df = pd.concat(all_results).reset_index(drop=True) # clean and transform the dataframe - final_df["layover_time"] = final_df["layover_time"].fillna(-1) - final_df["layover_location"] = ( - final_df["layover_location"].fillna(np.nan).replace([np.nan], [None]) - ) + # convert layover time and travel time to minutes + final_df["layover_time"] = final_df["layover_time"].dt.total_seconds() / 60 + final_df["layover_time"] = final_df["layover_time"].fillna(0).astype(int) + final_df["travel_time"] = final_df["travel_time"].dt.total_seconds() / 60 + final_df["travel_time"] = final_df["travel_time"].fillna(0).astype(int) + final_df["price_value"] = ( final_df["price_value"].fillna(np.nan).replace([np.nan], [None]) ) @@ -122,7 +124,7 @@ def get_routes_df(routes: list): return final_df -def generate_airline_df_from_flights(flights_df): +def generate_airlines_df_from_flights(flights_df): """ From a flights dataframe, generate an airline dataframe. Goal: respect good database conditions. @@ -145,6 +147,32 @@ def generate_airline_df_from_flights(flights_df): return airlines_df +def generate_layovers_df_from_flights(flights_df): + """ + From a flights dataframe, generate a layovers dataframe. + Goal: respect good database conditions. + """ + # check if all indices are unique + if not flights_df.index.is_unique: + flights_df = flights_df.reset_index(drop=True) + + # create a dataframe with all the layovers, referencing the index + layovers_df = flights_df.explode("layover_location")[ + ["layover_location"] + ].reset_index() + + # get rid of rows with no layover + layovers_df = layovers_df.dropna().reset_index(drop=True) + + # rename column + layovers_df = layovers_df.rename(columns={"uuid": "flight_uuid"}) + + # rename index to uuid + layovers_df.index.names = ["uuid"] + + return layovers_df + + if __name__ == "__main__": SKIP_SAVE_TO_DB = len(sys.argv) > 1 and sys.argv[1] == "nodb" @@ -154,11 +182,12 @@ def generate_airline_df_from_flights(flights_df): # scrape routes into a dataframe scraped_flights = get_routes_df(routes) - # generate an airline dataframe - scraped_airlines = generate_airline_df_from_flights(scraped_flights) + # generate airline and layovers dataframe + scraped_airlines = generate_airlines_df_from_flights(scraped_flights) + scraped_layovers = generate_layovers_df_from_flights(scraped_flights) - # drop airlines from flights dataframe - scraped_flights = scraped_flights.drop(columns=["airlines"]) + # drop airlines and layovers from flights dataframe + scraped_flights = scraped_flights.drop(columns=["airlines", "layover_location"]) # connect to database db = Database( @@ -175,6 +204,7 @@ def generate_airline_df_from_flights(flights_df): if not SKIP_SAVE_TO_DB: db.add_pandas_df_to_db(scraped_flights, table_name=db.table_scraped) db.add_pandas_df_to_db(scraped_airlines, table_name=db.table_scraped_airlines) + db.add_pandas_df_to_db(scraped_layovers, table_name=db.table_scraped_layovers) # if it's a monday, backup the database if datetime.today().weekday() == 0: diff --git a/src/flight_analysis/database.py b/src/flight_analysis/database.py index 4b0ab6c..8ddfcbf 100644 --- a/src/flight_analysis/database.py +++ b/src/flight_analysis/database.py @@ -6,6 +6,8 @@ import logging from datetime import datetime import subprocess +import pandas as pd +import numpy as np # logging logger_name = os.path.basename(__file__) @@ -96,7 +98,6 @@ def create_scraped_table(self): destination character(3) COLLATE pg_catalog."default" NOT NULL, layover_n smallint NOT NULL, layover_time smallint, - layover_location text COLLATE pg_catalog."default", price_eur smallint NOT NULL, price_trend text COLLATE pg_catalog."default", price_value text COLLATE pg_catalog."default", @@ -110,6 +111,11 @@ def create_scraped_table(self): TABLESPACE pg_default; ALTER TABLE IF EXISTS public.{self.table_scraped} OWNER to postgres; + + CREATE INDEX idx_access_date ON public.{self.table_scraped} USING btree (access_date); + CREATE INDEX idx_origin ON public.{self.table_scraped} USING btree (origin); + CREATE INDEX idx_destination ON public.{self.table_scraped} USING btree (destination); + CREATE INDEX idx_origin_destination ON public.{self.table_scraped} USING btree (origin, destination); """ cursor = self.conn.cursor() @@ -139,6 +145,8 @@ def create_scraped_airlines_table(self): ON public.{self.table_scraped_airlines} USING btree (flight_uuid ASC NULLS LAST) TABLESPACE pg_default; + + CREATE INDEX idx_airline ON public.{self.table_scraped_airlines} USING btree (airline ASC NULLS LAST); """ cursor = self.conn.cursor() @@ -168,13 +176,98 @@ def create_scraped_layovers_table(self): ON public.{self.table_scraped_layovers} USING btree (flight_uuid ASC NULLS LAST) TABLESPACE pg_default; + + CREATE INDEX idx_layover_location ON public.{self.table_scraped_layovers} USING btree (layover_location ASC NULLS LAST); """ cursor = self.conn.cursor() cursor.execute(query) cursor.close() - + def create_data_airports_table(self): + # create empty table + query = """ + CREATE TABLE IF NOT EXISTS public.data_airports + ( + iata character(3) NOT NULL, + name text, + country character(2), + lat numeric, + lon numeric, + region text, + municipality text, + continent character(2), + PRIMARY KEY (iata) + ) + + TABLESPACE pg_default; + + ALTER TABLE IF EXISTS public.data_airports + OWNER to postgres; + + CREATE INDEX idx_iata + ON public.data_airports USING hash + (iata) + """ + + cursor = self.conn.cursor() + cursor.execute(query) + cursor.close() + logger.info("Table [data_airports] created.") + + # download airports data + airports_df = self.download_data_airports() + + # add airports data to table + logger.info("Adding airports data to table [data_airports]...") + self.add_pandas_df_to_db(airports_df, table_name="data_airports") + + def download_data_airports(self): + AIRPORTS_DF_URL = "https://raw.githubusercontent.com/datasets/airport-codes/master/data/airport-codes.csv" + df = pd.read_csv(AIRPORTS_DF_URL, encoding="utf-8") + + # filter original data + # take only aiprort that have a IATA code + df = df.loc[df.iata_code.notnull()] + + # exclude closed, seaplane_base and heliport + df = df.loc[df.type.isin(["large_airport", "medium_airport", "small_airport"])] + + # split coordinates in latitude and longitude + df["lat"] = ( + df.coordinates.apply(lambda x: x.split(",")[0]).astype(float).round(4) + ) + df["lon"] = ( + df.coordinates.apply(lambda x: x.split(",")[1]).astype(float).round(4) + ) + + # take only useful columns + cols = [ + "iata_code", + "name", + "iso_country", + "lat", + "lon", + "iso_region", + "municipality", + "continent", + ] + df = df[cols] + + # rename columns + df = df.rename( + columns={ + "iata_code": "iata", + "iso_country": "country", + "iso_region": "region", + } + ) + + # replace nan with None + df = df.replace({pd.NA: None, np.nan: None}) + + return df.reset_index(drop=True) + def prepare_db_and_tables(self): """ Creates the database and the table if they don't exist. @@ -183,6 +276,10 @@ def prepare_db_and_tables(self): if self.db_name not in self.list_all_databases(): self.create_db() + # create data_airports table + if "data_airports" not in self.list_all_tables(): + self.create_data_airports_table() + # create scraped table if self.table_scraped not in self.list_all_tables(): self.create_scraped_table() @@ -191,13 +288,19 @@ def prepare_db_and_tables(self): if self.table_scraped_airlines not in self.list_all_tables(): self.create_scraped_airlines_table() + # create scraped_layovers table + if self.table_scraped_layovers not in self.list_all_tables(): + self.create_scraped_layovers_table() + def add_pandas_df_to_db(self, df, table_name): extras.register_uuid() # Create a list of tuples from the dataframe values if table_name == self.table_scraped: - df = df.reset_index() # otherwise the index (uuid) is not added to the table - + df = ( + df.reset_index() + ) # otherwise the index (uuid) is not added to the table + tuples = [tuple(x) for x in df.to_numpy()] # Comma-separated dataframe columns @@ -216,20 +319,19 @@ def add_pandas_df_to_db(self, df, table_name): cursor.close() - # fix layover time - # TODO: improve this + # fix layover time manually if table_name == self.table_scraped: - cursor = self.conn.cursor() - query = f""" - UPDATE {self.table_scraped} - SET layover_time = CASE - WHEN layover_time = -1 THEN null ELSE layover_time END; - - ALTER TABLE public.{self.table_scraped} - ALTER COLUMN layover_time TYPE smallint; - """ - cursor.execute(query) - cursor.close() + try: + query = f""" + UPDATE public.{self.table_scraped} + SET layover_time = NULL + WHERE layover_time = 0;""" + cursor = self.conn.cursor() + cursor.execute(query) + cursor.close() + except Exception as e: + logger.error(f"Error while updating layover_time: {e}") + self.conn.rollback() def dump_database_to_file(self): """ diff --git a/src/flight_analysis/flight.py b/src/flight_analysis/flight.py index 552cd11..a7f388a 100644 --- a/src/flight_analysis/flight.py +++ b/src/flight_analysis/flight.py @@ -117,6 +117,13 @@ def _is_arg_orig_dest(self, arg): # example: MUCFCO, BCNMAD if len(arg) == 6 and arg.isupper(): return True + + return False + + def _is_arg_train_service(self, arg): + train_keywords = ["Train service", "Flight + Train"] + if arg in train_keywords: + return True return False # --------------------------------------------------------------- @@ -197,14 +204,14 @@ def _parse_price(self, arg): return int(arg.replace(",", "")) def _parse_orig_dest(self, arg): - # special case: "Flight + Train" - if "Flight + Train" in arg: - self._has_train = True - return (self._queried_orig, self._queried_dest) - # regular case: like MUCFCO, LAXJFK return (arg[:3], arg[3:]) + def _parse_train_service(self, arg): + if "Train service" in arg: + self._has_train = True + return (self._queried_orig, self._queried_dest) + # --------------------------------------------------------------- def _classify_arg(self, arg: str): @@ -212,8 +219,6 @@ def _classify_arg(self, arg: str): Classifies a string (arg) into the correct attribute for a flight, such as price, numer of layover stops, arrival time... """ - parsed = False - # define cases for which to return early arg_empty = arg is None or arg == "" or len(arg) == 0 arg_useless = arg in ["Change of airport", "round trip", "Climate friendly"] @@ -238,6 +243,10 @@ def _classify_arg(self, arg: str): self._airline = arg.split(", ") return + # train service + if self._is_arg_train_service(arg): + self._origin, self._dest = self._parse_train_service(arg) + # departure and arrival times if self._is_arg_departure_arrival_times(arg): ( @@ -330,7 +339,7 @@ def make_dataframe(flights): "access_date": [], "one_way": [], "has_train": [], - # "days_advance": [], + "days_advance": [], } # populate the dictionary @@ -351,9 +360,9 @@ def make_dataframe(flights): data["access_date"] += [datetime.today()] data["one_way"] += [(False if flight._roundtrip else True)] data["has_train"] += [flight._has_train] - # data["days_advance"] += [ - # (flight._time_departure - datetime.today()).days - # ] + data["days_advance"] += [ + (flight._time_departure - datetime.today()).days + ] except Exception as e: print("Error with flight", flight, flight._price) print(e)