diff --git a/.gitignore b/.gitignore index a007c9b..6f53f67 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ backups private scrapes_csv test.py +sql_queries # logging logs/ diff --git a/flight_analysis.ipynb b/flight_analysis.ipynb index 0b589d4..abbf029 100644 --- a/flight_analysis.ipynb +++ b/flight_analysis.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -11,17 +11,17 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "flights = Scrape(\"MUC\", \"JFK\", \"2023-10-28\")\n", + "flights = Scrape(\"CTA\", \"PMO\", \"2023-10-28\")\n", "flights.run_scrape()" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -60,574 +60,330 @@ " access_date\n", " one_way\n", " has_train\n", - " days_advance\n", " \n", " \n", " \n", " \n", " 0\n", - " 2023-10-28 14:05:00\n", - " 2023-10-28 19:10:00\n", - " [Icelandair]\n", - " 665\n", - " MUC\n", - " JFK\n", + " 2023-10-28 07:50:00\n", + " 2023-10-28 11:25:00\n", + " [easyJet]\n", + " 0 days 03:35:00\n", + " CTA\n", + " PMO\n", " 1\n", - " 60.0\n", - " KEF\n", - " 511\n", + " 0 days 01:30:00\n", + " [NAP]\n", + " 58\n", " typical\n", " None\n", - " 2023-08-05 14:15:47.524003\n", + " 2023-08-07 20:18:11.150904\n", " True\n", " False\n", - " 83\n", " \n", " \n", " 1\n", - " 2023-10-28 09:55:00\n", - " 2023-10-28 20:20:00\n", - " [LOT]\n", - " 985\n", - " MUC\n", - " JFK\n", + " 2023-10-28 14:00:00\n", + " 2023-10-28 18:20:00\n", + " [ITA]\n", + " 0 days 04:20:00\n", + " CTA\n", + " PMO\n", " 1\n", - " 320.0\n", - " WAW\n", - " 627\n", + " 0 days 01:50:00\n", + " [FCO]\n", + " 222\n", " typical\n", " None\n", - " 2023-08-05 14:15:47.524028\n", + " 2023-08-07 20:18:11.150920\n", " True\n", " False\n", - " 83\n", " \n", " \n", " 2\n", - " 2023-10-28 11:50:00\n", - " 2023-10-28 16:35:00\n", - " 
[KLMDelta]\n", - " 645\n", - " MUC\n", - " JFK\n", + " 2023-10-28 19:10:00\n", + " 2023-10-28 22:55:00\n", + " [ITA]\n", + " 0 days 03:45:00\n", + " CTA\n", + " PMO\n", " 1\n", - " 70.0\n", - " AMS\n", - " 706\n", + " 0 days 01:10:00\n", + " [FCO]\n", + " 222\n", " typical\n", " None\n", - " 2023-08-05 14:15:47.524034\n", + " 2023-08-07 20:18:11.150922\n", " True\n", " False\n", - " 83\n", " \n", " \n", " 3\n", - " 2023-10-28 11:00:00\n", - " 2023-10-28 20:20:00\n", - " [Lufthansa, LOT]\n", - " 920\n", - " MUC\n", - " JFK\n", - " 1\n", - " 260.0\n", - " WAW\n", - " 658\n", + " 2023-10-28 12:50:00\n", + " 2023-10-28 22:55:00\n", + " [ITA]\n", + " 0 days 10:05:00\n", + " CTA\n", + " PMO\n", + " 3\n", + " NaT\n", + " [LIN, NAP, FCO]\n", + " 191\n", " typical\n", " None\n", - " 2023-08-05 14:15:47.524039\n", + " 2023-08-07 20:18:11.150924\n", " True\n", " False\n", - " 83\n", " \n", " \n", " 4\n", - " 2023-10-28 10:25:00\n", - " 2023-10-28 23:48:00\n", - " [Delta, Virgin Atlantic]\n", - " 1163\n", - " MUC\n", - " JFK\n", + " 2023-10-28 15:25:00\n", + " 2023-10-28 22:10:00\n", + " [ITA]\n", + " 0 days 06:45:00\n", + " CTA\n", + " PMO\n", " 1\n", - " 389.0\n", - " ATL\n", - " 670\n", + " 0 days 04:20:00\n", + " [FCO]\n", + " 222\n", " typical\n", " None\n", - " 2023-08-05 14:15:47.524045\n", + " 2023-08-07 20:18:11.150926\n", " True\n", " False\n", - " 83\n", " \n", " \n", " 5\n", - " 2023-10-28 07:00:00\n", - " 2023-10-28 15:25:00\n", - " [KLMDelta]\n", - " 865\n", - " MUC\n", - " JFK\n", + " 2023-10-28 11:05:00\n", + " 2023-10-28 16:20:00\n", + " [ITA]\n", + " 0 days 05:15:00\n", + " CTA\n", + " PMO\n", " 1\n", - " 290.0\n", - " AMS\n", - " 706\n", + " 0 days 01:50:00\n", + " [LIN]\n", + " 228\n", " typical\n", " None\n", - " 2023-08-05 14:15:47.524050\n", + " 2023-08-07 20:18:11.150927\n", " True\n", " False\n", - " 83\n", " \n", " \n", " 6\n", - " 2023-10-28 11:25:00\n", - " 2023-10-28 19:20:00\n", - " [Aer Lingus]\n", - " 835\n", - " MUC\n", - " JFK\n", + " 
2023-10-28 06:00:00\n", + " 2023-10-28 09:25:00\n", + " [ITA]\n", + " 0 days 03:25:00\n", + " CTA\n", + " PMO\n", " 1\n", - " 220.0\n", - " DUB\n", - " 787\n", + " 0 days 00:55:00\n", + " [FCO]\n", + " 254\n", " typical\n", " None\n", - " 2023-08-05 14:15:47.524055\n", + " 2023-08-07 20:18:11.150929\n", " True\n", " False\n", - " 83\n", " \n", " \n", " 7\n", - " 2023-10-28 14:05:00\n", - " 2023-10-28 19:15:00\n", - " [KLMDelta]\n", - " 670\n", - " MUC\n", - " JFK\n", + " 2023-10-28 10:15:00\n", + " 2023-10-28 14:25:00\n", + " [ITA]\n", + " 0 days 04:10:00\n", + " CTA\n", + " PMO\n", " 1\n", - " 95.0\n", - " AMS\n", - " 812\n", + " 0 days 01:35:00\n", + " [FCO]\n", + " 265\n", " typical\n", " None\n", - " 2023-08-05 14:15:47.524060\n", + " 2023-08-07 20:18:11.150931\n", " True\n", " False\n", - " 83\n", " \n", " \n", " 8\n", - " 2023-10-28 09:00:00\n", - " 2023-10-28 14:25:00\n", - " [Lufthansa, Condor]\n", - " 685\n", - " MUC\n", - " JFK\n", + " 2023-10-28 07:00:00\n", + " 2023-10-28 11:55:00\n", + " [ITA]\n", + " 0 days 04:55:00\n", + " CTA\n", + " PMO\n", " 1\n", - " 105.0\n", - " FRA\n", - " 830\n", + " 0 days 02:20:00\n", + " [FCO]\n", + " 280\n", " typical\n", " None\n", - " 2023-08-05 14:15:47.524066\n", + " 2023-08-07 20:18:11.150933\n", " True\n", " False\n", - " 83\n", " \n", " \n", " 9\n", - " 2023-10-28 12:10:00\n", - " 2023-10-28 15:00:00\n", - " [Lufthansa, United]\n", - " 530\n", - " MUC\n", - " JFK\n", - " 0\n", - " NaN\n", - " None\n", - " 1317\n", - " typical\n", - " None\n", - " 2023-08-05 14:15:47.524070\n", - " True\n", - " False\n", - " 83\n", - " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " departure_datetime arrival_datetime airlines \\\n", - "0 2023-10-28 14:05:00 2023-10-28 19:10:00 [Icelandair] \n", - "1 2023-10-28 09:55:00 2023-10-28 20:20:00 [LOT] \n", - "2 2023-10-28 11:50:00 2023-10-28 16:35:00 [KLMDelta] \n", - "3 2023-10-28 11:00:00 2023-10-28 20:20:00 [Lufthansa, LOT] \n", - "4 2023-10-28 10:25:00 2023-10-28 23:48:00 
[Delta, Virgin Atlantic] \n", - "5 2023-10-28 07:00:00 2023-10-28 15:25:00 [KLMDelta] \n", - "6 2023-10-28 11:25:00 2023-10-28 19:20:00 [Aer Lingus] \n", - "7 2023-10-28 14:05:00 2023-10-28 19:15:00 [KLMDelta] \n", - "8 2023-10-28 09:00:00 2023-10-28 14:25:00 [Lufthansa, Condor] \n", - "9 2023-10-28 12:10:00 2023-10-28 15:00:00 [Lufthansa, United] \n", - "\n", - " travel_time origin destination layover_n layover_time layover_location \\\n", - "0 665 MUC JFK 1 60.0 KEF \n", - "1 985 MUC JFK 1 320.0 WAW \n", - "2 645 MUC JFK 1 70.0 AMS \n", - "3 920 MUC JFK 1 260.0 WAW \n", - "4 1163 MUC JFK 1 389.0 ATL \n", - "5 865 MUC JFK 1 290.0 AMS \n", - "6 835 MUC JFK 1 220.0 DUB \n", - "7 670 MUC JFK 1 95.0 AMS \n", - "8 685 MUC JFK 1 105.0 FRA \n", - "9 530 MUC JFK 0 NaN None \n", - "\n", - " price_eur price_trend price_value access_date one_way \\\n", - "0 511 typical None 2023-08-05 14:15:47.524003 True \n", - "1 627 typical None 2023-08-05 14:15:47.524028 True \n", - "2 706 typical None 2023-08-05 14:15:47.524034 True \n", - "3 658 typical None 2023-08-05 14:15:47.524039 True \n", - "4 670 typical None 2023-08-05 14:15:47.524045 True \n", - "5 706 typical None 2023-08-05 14:15:47.524050 True \n", - "6 787 typical None 2023-08-05 14:15:47.524055 True \n", - "7 812 typical None 2023-08-05 14:15:47.524060 True \n", - "8 830 typical None 2023-08-05 14:15:47.524066 True \n", - "9 1317 typical None 2023-08-05 14:15:47.524070 True \n", - "\n", - " has_train days_advance \n", - "0 False 83 \n", - "1 False 83 \n", - "2 False 83 \n", - "3 False 83 \n", - "4 False 83 \n", - "5 False 83 \n", - "6 False 83 \n", - "7 False 83 \n", - "8 False 83 \n", - "9 False 83 " - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "flights.data" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", " \n", " \n", - " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", "
departure_datetimearrival_datetimeairlinestravel_timeorigindestinationlayover_nlayover_timelayover_locationprice_eurprice_trendprice_valueaccess_dateone_wayhas_traindays_advance
02023-10-28 14:05:002023-10-28 19:10:00[Icelandair]665MUCJFK160.0KEF511typicalNone2023-08-05 14:15:47.524003TrueFalse83
12023-10-28 09:55:002023-10-28 20:20:00[LOT]985MUCJFK2023-10-28 06:00:002023-10-28 11:05:00[Lufthansa]0 days 05:05:00CTAPMO1320.0WAW6270 days 01:00:00[MUC]408typicalNone2023-08-05 14:15:47.5240282023-08-07 20:18:11.150934TrueFalse83
22023-10-28 11:50:002023-10-28 16:35:00[KLMDelta]645MUCJFK102023-10-28 21:35:002023-10-29 11:10:00[British Airways]0 days 14:35:00CTAPMO170.0AMS706typicalNaTNone2023-08-05 14:15:47.524034TrueFalse83
32023-10-28 11:00:002023-10-28 20:20:00[Lufthansa, LOT]920MUCJFK1260.0WAW658457typicalNone2023-08-05 14:15:47.5240392023-08-07 20:18:11.150936TrueFalse83
42023-10-28 10:25:002023-10-28 23:48:00[Delta, Virgin Atlantic]1163MUCJFK112023-10-28 19:25:002023-10-29 17:40:00[Turkish Airlines]0 days 23:15:00CTAPMO1389.0ATL6700 days 18:20:00[IST]504typicalNone2023-08-05 14:15:47.5240452023-08-07 20:18:11.150938TrueFalse83
5122023-10-28 07:00:002023-10-28 15:25:00[KLMDelta]865MUCJFK1290.0AMS7062023-10-28 13:55:00[ITA, SWISS]0 days 06:55:00CTAPMO2NaT[FCO, ZRH]617typicalNone2023-08-05 14:15:47.5240502023-08-07 20:18:11.150939TrueFalse83
62023-10-28 11:25:002023-10-28 19:20:00[Aer Lingus]835MUCJFK1220.0DUB787typicalNone2023-08-05 14:15:47.524055TrueFalse83
72023-10-28 14:05:002023-10-28 19:15:00[KLMDelta]670MUCJFK195.0AMS812typicalNone2023-08-05 14:15:47.524060TrueFalse83
82023-10-28 09:00:002023-10-28 14:25:00[Lufthansa, Condor]685MUCJFK1105.0FRA830typicalNone2023-08-05 14:15:47.524066TrueFalse83
92023-10-28 12:10:002023-10-28 15:00:00[Lufthansa, United]530MUCJFK0NaNNone1317typicalNone2023-08-05 14:15:47.524070TrueFalse83
\n", "
" ], "text/plain": [ - " departure_datetime arrival_datetime airlines \\\n", - "0 2023-10-28 14:05:00 2023-10-28 19:10:00 [Icelandair] \n", - "1 2023-10-28 09:55:00 2023-10-28 20:20:00 [LOT] \n", - "2 2023-10-28 11:50:00 2023-10-28 16:35:00 [KLMDelta] \n", - "3 2023-10-28 11:00:00 2023-10-28 20:20:00 [Lufthansa, LOT] \n", - "4 2023-10-28 10:25:00 2023-10-28 23:48:00 [Delta, Virgin Atlantic] \n", - "5 2023-10-28 07:00:00 2023-10-28 15:25:00 [KLMDelta] \n", - "6 2023-10-28 11:25:00 2023-10-28 19:20:00 [Aer Lingus] \n", - "7 2023-10-28 14:05:00 2023-10-28 19:15:00 [KLMDelta] \n", - "8 2023-10-28 09:00:00 2023-10-28 14:25:00 [Lufthansa, Condor] \n", - "9 2023-10-28 12:10:00 2023-10-28 15:00:00 [Lufthansa, United] \n", + " departure_datetime arrival_datetime airlines \\\n", + "0 2023-10-28 07:50:00 2023-10-28 11:25:00 [easyJet] \n", + "1 2023-10-28 14:00:00 2023-10-28 18:20:00 [ITA] \n", + "2 2023-10-28 19:10:00 2023-10-28 22:55:00 [ITA] \n", + "3 2023-10-28 12:50:00 2023-10-28 22:55:00 [ITA] \n", + "4 2023-10-28 15:25:00 2023-10-28 22:10:00 [ITA] \n", + "5 2023-10-28 11:05:00 2023-10-28 16:20:00 [ITA] \n", + "6 2023-10-28 06:00:00 2023-10-28 09:25:00 [ITA] \n", + "7 2023-10-28 10:15:00 2023-10-28 14:25:00 [ITA] \n", + "8 2023-10-28 07:00:00 2023-10-28 11:55:00 [ITA] \n", + "9 2023-10-28 06:00:00 2023-10-28 11:05:00 [Lufthansa] \n", + "10 2023-10-28 21:35:00 2023-10-29 11:10:00 [British Airways] \n", + "11 2023-10-28 19:25:00 2023-10-29 17:40:00 [Turkish Airlines] \n", + "12 2023-10-28 07:00:00 2023-10-28 13:55:00 [ITA, SWISS] \n", "\n", - " travel_time origin destination layover_n layover_time layover_location \\\n", - "0 665 MUC JFK 1 60.0 KEF \n", - "1 985 MUC JFK 1 320.0 WAW \n", - "2 645 MUC JFK 1 70.0 AMS \n", - "3 920 MUC JFK 1 260.0 WAW \n", - "4 1163 MUC JFK 1 389.0 ATL \n", - "5 865 MUC JFK 1 290.0 AMS \n", - "6 835 MUC JFK 1 220.0 DUB \n", - "7 670 MUC JFK 1 95.0 AMS \n", - "8 685 MUC JFK 1 105.0 FRA \n", - "9 530 MUC JFK 0 NaN None \n", + " travel_time 
origin destination layover_n layover_time \\\n", + "0 0 days 03:35:00 CTA PMO 1 0 days 01:30:00 \n", + "1 0 days 04:20:00 CTA PMO 1 0 days 01:50:00 \n", + "2 0 days 03:45:00 CTA PMO 1 0 days 01:10:00 \n", + "3 0 days 10:05:00 CTA PMO 3 NaT \n", + "4 0 days 06:45:00 CTA PMO 1 0 days 04:20:00 \n", + "5 0 days 05:15:00 CTA PMO 1 0 days 01:50:00 \n", + "6 0 days 03:25:00 CTA PMO 1 0 days 00:55:00 \n", + "7 0 days 04:10:00 CTA PMO 1 0 days 01:35:00 \n", + "8 0 days 04:55:00 CTA PMO 1 0 days 02:20:00 \n", + "9 0 days 05:05:00 CTA PMO 1 0 days 01:00:00 \n", + "10 0 days 14:35:00 CTA PMO 1 NaT \n", + "11 0 days 23:15:00 CTA PMO 1 0 days 18:20:00 \n", + "12 0 days 06:55:00 CTA PMO 2 NaT \n", "\n", - " price_eur price_trend price_value access_date one_way \\\n", - "0 511 typical None 2023-08-05 14:15:47.524003 True \n", - "1 627 typical None 2023-08-05 14:15:47.524028 True \n", - "2 706 typical None 2023-08-05 14:15:47.524034 True \n", - "3 658 typical None 2023-08-05 14:15:47.524039 True \n", - "4 670 typical None 2023-08-05 14:15:47.524045 True \n", - "5 706 typical None 2023-08-05 14:15:47.524050 True \n", - "6 787 typical None 2023-08-05 14:15:47.524055 True \n", - "7 812 typical None 2023-08-05 14:15:47.524060 True \n", - "8 830 typical None 2023-08-05 14:15:47.524066 True \n", - "9 1317 typical None 2023-08-05 14:15:47.524070 True \n", + " layover_location price_eur price_trend price_value \\\n", + "0 [NAP] 58 typical None \n", + "1 [FCO] 222 typical None \n", + "2 [FCO] 222 typical None \n", + "3 [LIN, NAP, FCO] 191 typical None \n", + "4 [FCO] 222 typical None \n", + "5 [LIN] 228 typical None \n", + "6 [FCO] 254 typical None \n", + "7 [FCO] 265 typical None \n", + "8 [FCO] 280 typical None \n", + "9 [MUC] 408 typical None \n", + "10 None 457 typical None \n", + "11 [IST] 504 typical None \n", + "12 [FCO, ZRH] 617 typical None \n", "\n", - " has_train days_advance \n", - "0 False 83 \n", - "1 False 83 \n", - "2 False 83 \n", - "3 False 83 \n", - "4 False 83 \n", - "5 
False 83 \n", - "6 False 83 \n", - "7 False 83 \n", - "8 False 83 \n", - "9 False 83 " + " access_date one_way has_train \n", + "0 2023-08-07 20:18:11.150904 True False \n", + "1 2023-08-07 20:18:11.150920 True False \n", + "2 2023-08-07 20:18:11.150922 True False \n", + "3 2023-08-07 20:18:11.150924 True False \n", + "4 2023-08-07 20:18:11.150926 True False \n", + "5 2023-08-07 20:18:11.150927 True False \n", + "6 2023-08-07 20:18:11.150929 True False \n", + "7 2023-08-07 20:18:11.150931 True False \n", + "8 2023-08-07 20:18:11.150933 True False \n", + "9 2023-08-07 20:18:11.150934 True False \n", + "10 2023-08-07 20:18:11.150936 True False \n", + "11 2023-08-07 20:18:11.150938 True False \n", + "12 2023-08-07 20:18:11.150939 True False " ] }, - "execution_count": 10, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = flights.data\n", - "df" + "flights.data" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'https://www.google.com/travel/flights?q=Flights%20to%20JFK%20from%20MUC%20on%202023-10-28%20oneway&curr=EUR&gl=IT'" + "'https://www.google.com/travel/flights?q=Flights%20to%20PMO%20from%20CTA%20on%202023-10-28%20oneway&curr=EUR&gl=IT'" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -638,20 +394,30 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "Database: flight_analysis" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" + "ename": "NotImplementedError", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", + 
"\u001b[1;32m/Users/emanuelesalonico/Library/CloudStorage/GoogleDrive-esalonico@gmail.com/My Drive/SYNC/Dev/flight-analysis/flight_analysis.ipynb Cell 5\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mNotImplementedError\u001b[39;00m()\n", + "\u001b[0;31mNotImplementedError\u001b[0m: " + ] } ], + "source": [ + "raise NotImplementedError()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import pandas as pd\n", "from src.flight_analysis.database import Database\n", @@ -663,26 +429,98 @@ " db_user=private.DB_USER,\n", " db_pw=private.DB_PW,\n", ")\n", - "db" + "\n", + "query = \"SELECT uuid, layover_location FROM scraped\"\n", + "\n", + "cur = db.conn.cursor()\n", + "cur.execute(query)\n", + "\n", + "res = cur.fetchall()\n", + "res" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "NotImplementedError", - "evalue": "This script is not meant to be run again!!!!!", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m/Users/emanuelesalonico/Library/CloudStorage/GoogleDrive-esalonico@gmail.com/My Drive/SYNC/Dev/flight-analysis/flight_analysis.ipynb Cell 7\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m# this process is to retroactively split the flight airlines into a separate \"scraped_airlines\" table,\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[39m# in order to violate the 1NF and 2NF rules of database normalization.\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mNotImplementedError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mThis script is not meant to be run 
again!!!!!\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mpd\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msrc\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mflight_analysis\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdatabase\u001b[39;00m \u001b[39mimport\u001b[39;00m Database\n", - "\u001b[0;31mNotImplementedError\u001b[0m: This script is not meant to be run again!!!!!" - ] - } - ], + "outputs": [], + "source": [ + "df = pd.DataFrame(res, columns=[\"flight_uuid\", \"layover_location\"])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get rid of \"min\"\n", + "df.lauover_location = df.layover_location.str.strip()\n", + "df.layover_location = df.layover_location.replace(\"\", np.nan)\n", + "df.layover_location = df.layover_location.replace(\"min\", np.nan)\n", + "df.layover_location = df.layover_location.replace(\"Change of airport\", np.nan)\n", + "\n", + "df = df.dropna()\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def split_dataframe(df, chunk_size=5000):\n", + " chunks = list()\n", + " num_chunks = len(df) // chunk_size + 1\n", + " for i in range(num_chunks):\n", + " chunks.append(df[i * chunk_size : (i + 1) * chunk_size])\n", + " return chunks\n", + "\n", + "\n", + "chunks = split_dataframe(df, chunk_size=5000)\n", + "\n", + "i = 1\n", + "for c in chunks:\n", + " print(i, \"/\", len(chunks))\n", + " db.add_pandas_df_to_db(c, table_name=db.table_scraped_layovers)\n", + " i += 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# this process is to retroactively split the flight airlines into a separate \"scraped_airlines\" table,\n", "# in order to comply with the 1NF and 2NF rules of database normalization.\n", @@ -711,17 +549,21 @@ "\n", "df2 = df.explode(\"airline\")\n", "df2[\"airline\"] = df2[\"airline\"].map(lambda x: x.lstrip(\"'\").rstrip(\"'\"))\n", - "df2[\"airline\"] = df2[\"airline\"].replace({\"Separate tickets booked together\": \"multiple\"})\n", + "df2[\"airline\"] = df2[\"airline\"].replace(\n", + " {\"Separate tickets booked together\": \"multiple\"}\n", + ")\n", "df2 = df2.reset_index(drop=True)\n", "\n", - "def split_dataframe(df, chunk_size = 5000): \n", + "\n", + "def split_dataframe(df, chunk_size=5000):\n", " chunks = list()\n", " num_chunks = len(df) // chunk_size + 1\n", " for i in range(num_chunks):\n", - " chunks.append(df[i*chunk_size:(i+1)*chunk_size])\n", + " chunks.append(df[i * chunk_size : (i + 1) * chunk_size])\n", " return chunks\n", "\n", - "chunks = split_dataframe(df2, chunk_size = 5000)\n", + "\n", + "chunks = split_dataframe(df2, chunk_size=5000)\n", "\n", "i = 1\n", "for c in chunks:\n", @@ -729,6 +571,106 @@ " db.add_pandas_df_to_db(c, table_name=db.table_scraped_airlines)\n", " i += 1" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import numpy as np\n", + "\n", + "\n", + "def _is_arg_layover(arg):\n", + " \"\"\" \"\"\"\n", + " # layover location cases\n", + " layover_location_cases = {\n", + " 
# case 1: (xx hr xx min AAA), (xx hr xx min Aaaaa)\n", + " 1: re.search(\"\\d{0,2} hr \\d{0,2} min [A-Z]+\", arg),\n", + " # case 2: (xx hr AAA), (xx hr AAA)\n", + " 2: re.search(\"\\d{0,2} hr [A-Z]+\", arg),\n", + " # case 3: (xx min AAA), (xx min AAA)\n", + " 3: re.search(\"\\d{0,2} min [A-Z]+\", arg),\n", + " # case 4: (AAA, BBB, ...)\n", + " 4: re.search(\"^[A-Z]{3}, ([A-Z]{3}(, )?)?\", arg),\n", + " }\n", + "\n", + " print(layover_location_cases)\n", + "\n", + " return any(layover_location_cases.values())\n", + "\n", + "\n", + "def _parse_layover_times_location(arg):\n", + " layover_time = None\n", + " layover_location = None\n", + "\n", + " # layover time\n", + " if (\" hr\" in arg) or (\" min\" in arg):\n", + " layover_time = (\n", + " re.search(\"^(\\d{1,2} hr){0,1}\\s{0,1}(\\d{1,2} min){0,1}\\s\", arg)\n", + " .group()\n", + " .strip()\n", + " )\n", + " layover_location = arg.split(\" \")[-1]\n", + "\n", + " # layover location\n", + " if \",\" in arg:\n", + " layover_location = arg.split(\", \")\n", + " layover_location = [x.strip() for x in layover_location]\n", + "\n", + " return layover_time, layover_location\n", + "\n", + "\n", + "test_args = [\n", + " \"22 hr 12 min FCO\",\n", + " # \"22 hr 12 min\",\n", + " # \"3 min Ancona\",\n", + " # \"13 min LAX\",\n", + " # \"FCO, JFK\",\n", + " # \"FCO, JFK\",\n", + " # \"21 hr 5 min DOH\",\n", + " # \"hehe\",\n", + " \"SWISS, Singapore AirlinesOperated by Helvetic\",\n", + "]\n", + "\n", + "for arg in test_args:\n", + " if _is_arg_layover(arg):\n", + " x = _parse_layover_times_location(arg)\n", + " print(arg, x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "not bool(re.search(\"ITA, \", \"ITA, Singapore\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/src/flight_analysis/database.py b/src/flight_analysis/database.py 
index aa317e4..4b0ab6c 100644 --- a/src/flight_analysis/database.py +++ b/src/flight_analysis/database.py @@ -24,6 +24,7 @@ def __init__(self, db_host, db_name, db_user, db_pw): # tables self.table_scraped = "scraped" self.table_scraped_airlines = "scraped_airlines" + self.table_scraped_layovers = "scraped_layovers" self.conn = self.connect_to_postgresql() self.conn.autocommit = True @@ -125,7 +126,7 @@ def create_scraped_airlines_table(self): airline text COLLATE pg_catalog."default", CONSTRAINT scraped_airlines_pkey PRIMARY KEY (uuid), CONSTRAINT flight_uuid FOREIGN KEY (flight_uuid) - REFERENCES public.scraped (uuid) MATCH SIMPLE + REFERENCES public.{self.table_scraped} (uuid) MATCH SIMPLE ON UPDATE CASCADE ON DELETE CASCADE ) @@ -135,7 +136,7 @@ def create_scraped_airlines_table(self): ALTER TABLE IF EXISTS public.{self.table_scraped_airlines} OWNER to postgres; CREATE INDEX IF NOT EXISTS fki_flight_uuid - ON public.scraped_airlines USING btree + ON public.{self.table_scraped_airlines} USING btree (flight_uuid ASC NULLS LAST) TABLESPACE pg_default; """ @@ -144,6 +145,36 @@ def create_scraped_airlines_table(self): cursor.execute(query) cursor.close() + def create_scraped_layovers_table(self): + query = "" + query += f""" + CREATE TABLE IF NOT EXISTS public.{self.table_scraped_layovers} + ( + uuid uuid NOT NULL DEFAULT gen_random_uuid(), + flight_uuid uuid NOT NULL, + layover_location text COLLATE pg_catalog."default", + CONSTRAINT scraped_layovers_pkey PRIMARY KEY (uuid), + CONSTRAINT flight_uuid FOREIGN KEY (flight_uuid) + REFERENCES public.{self.table_scraped} (uuid) MATCH SIMPLE + ON UPDATE CASCADE + ON DELETE CASCADE + ) + + TABLESPACE pg_default; + + ALTER TABLE IF EXISTS public.{self.table_scraped_layovers} OWNER to postgres; + + CREATE INDEX IF NOT EXISTS fki_flight_uuid + ON public.{self.table_scraped_layovers} USING btree + (flight_uuid ASC NULLS LAST) + TABLESPACE pg_default; + """ + + cursor = self.conn.cursor() + cursor.execute(query) + 
cursor.close() + + def prepare_db_and_tables(self): """ Creates the database and the table if they don't exist. @@ -164,10 +195,10 @@ def add_pandas_df_to_db(self, df, table_name): extras.register_uuid() # Create a list of tuples from the dataframe values - df = df.reset_index() # otherwise the index (uuid) is not added to the table + if table_name == self.table_scraped: + df = df.reset_index() # otherwise the index (uuid) is not added to the table tuples = [tuple(x) for x in df.to_numpy()] - print(tuples[0]) # Comma-separated dataframe columns cols = ",".join(list(df.columns)) diff --git a/src/flight_analysis/flight.py b/src/flight_analysis/flight.py index 3c2d668..552cd11 100644 --- a/src/flight_analysis/flight.py +++ b/src/flight_analysis/flight.py @@ -18,151 +18,277 @@ def __init__(self, dl, roundtrip, queried_orig, queried_dest, price_trend, *args self._dow = datetime.strptime(dl, "%Y-%m-%d").isoweekday() # day of week self._airline = None self._flight_time = None - self._num_stops = None - self._stops = None - self._stops_locations = None - self._co2 = None - self._emissions = None + self._layover_n = None + self._layover_time = None + self._layover_location = None self._price = None self._price_trend = price_trend - self._times = [] - self._time_leave = None - self._time_arrive = None + self._times_departure_arrival = [] + self._time_departure = None + self._time_arrival = None self._has_train = False self._trash = [] + self._separate_tickets = False # extract the values above from the scraped HTML page source self._parse_args(*args) + def _debug(self): + res = { + "origin": self._origin, + "dest": self._dest, + "airline": self._airline, + "flight_time": self._flight_time, + "layover_n": self._layover_n, + "layover_time": self._layover_time, + "layover_location": self._layover_location, + "price": self._price, + "price_trend": self._price_trend, + "times_departure_arrival": self._times_departure_arrival, + "time_departure": self._time_departure, + 
"time_arrival": self._time_arrival, + "has_train": self._has_train, + "trash": self._trash, + } + return res + def __repr__(self): return f"{self._origin}-{self._dest}-{self._date}" + def _is_arg_layover(self, arg): + """ + Returns True if the argument contains a layover location/time + """ + # case 0: manually exclude "ITA" (it's a company name, not a layover location) + if "ITA, " in arg: + return False + + # layover location cases + layover_location_cases = { + # case 1: (xx hr xx min AAA), (xx hr xx min Aaaaa) + 1: re.search("\d{0,2} hr \d{0,2} min [A-Z]+", arg), + # case 2: (xx hr AAA), (xx hr AAA) + 2: re.search("\d{0,2} hr [A-Z]+", arg), + # case 3: (xx min AAA), (xx min AAA) + 3: re.search("\d{0,2} min [A-Z]+", arg), + # case 4: (AAA, BBB, ...) + 4: re.search("^[A-Z]{3}, ([A-Z]{3}(, )?)?", arg), + } + + return any(layover_location_cases.values()) + + def _is_arg_airline(self, arg): + if self._airline is None: + return True + return False + + def _is_arg_departure_arrival_times(self, arg): + # regex: AM/PM (for example: 10:30AM, 4:11PM, 10:44AM+1) + re_match = re.compile(r"\d{1,2}:\d{2}(?:AM|PM)(?:\+\d{0,1})?") + + if re_match.fullmatch(arg) and len(self._times_departure_arrival) < 2: + return True + + return False + + def _is_arg_flight_time(self, arg): + # regex: 3 hr 35 min, 45 min, 5 hr + re_match = re.compile(r"^(?:\d{1,2} hr){0,1}\s{0,1}(?:\d{1,2} min){0,1}") + + if re_match.fullmatch(arg) and self._flight_time is None: + return True + + return False + + def _is_arg_layover_n(self, arg): + re_match = re.compile(r"\d stops{0,1}") + + if (arg == "Nonstop" or re_match.fullmatch(arg)) and self._layover_n is None: + return True + + return False + + def _is_arg_price(self, arg): + if arg.replace(",", "").isdigit() and self._price is None: + return True + return False + + def _is_arg_orig_dest(self, arg): + # example: MUCFCO, BCNMAD + if len(arg) == 6 and arg.isupper(): + return True + return False + + # 
--------------------------------------------------------------- + + def _parse_departure_arrival_times(self, arg): + dep, arr = (None, None) + + # handle + - days + # extract the optional delta value (in case of +- X days) + delta_days = int(arg[-1]) if arg[-2] == "+" else 0 + delta = timedelta(days=delta_days) + + # remove the delta value from the argument if present + if delta_days: + arg = arg[:-2] # from 10:30PM+1 to 10:30PM + + # Combine date and time using a formatted string + date_time_str = f"{self._date} {arg}" + date_format = "%Y-%m-%d %I:%M%p" + + # Parse the date and time and add to the list + date_ok = datetime.strptime(date_time_str, date_format) + delta + self._times_departure_arrival.append(date_ok) + + if len(self._times_departure_arrival) != 2: + return (None, None) + + return tuple(self._times_departure_arrival) + + def _parse_layover_times_location(self, arg): + """ + From an argument (arg), returns the layover time and location as a tuple + """ + layover_time = None + layover_location = None + + # layover time + if (" hr" in arg) or (" min" in arg): + layover_time = ( + re.search("^(\d{1,2} hr){0,1}\s{0,1}(\d{1,2} min){0,1}\s", arg) + .group() + .strip() + ) + layover_time = Flight.convert_duration_str_to_timedelta(layover_time) + layover_location = [arg.split(" ")[-1]] + + # layover location + if "," in arg: + layover_location = arg.split(", ") + layover_location = [x.strip() for x in layover_location] + + return layover_time, layover_location + + def _parse_airline(self, arg): + airline = None + dont_split = ["easyjet"] + if "Operated" in arg: + airline = arg.split("Operated")[0] + else: + airline = arg + + # split camel case + if airline.lower() not in dont_split: + airline = re.sub("([a-z])([A-Z])", r"\1, \2", airline) + + # make it into an array (list) + airline = airline.split(", ") + + return airline + + def _parse_flight_time(self, arg): + return Flight.convert_duration_str_to_timedelta(arg) + + def _parse_layover_n(self, arg): + return 
0 if arg == "Nonstop" else int(arg.split()[0]) + + def _parse_price(self, arg): + return int(arg.replace(",", "")) + + def _parse_orig_dest(self, arg): + # special case: "Flight + Train" + if "Flight + Train" in arg: + self._has_train = True + return (self._queried_orig, self._queried_dest) + + # regular case: like MUCFCO, LAXJFK + return (arg[:3], arg[3:]) + + # --------------------------------------------------------------- + def _classify_arg(self, arg: str): """ Classifies a string (arg) into the correct attribute for a flight, such as price, numer of layover stops, arrival time... """ + parsed = False # define cases for which to return early - arg_empty = arg is None or arg == "" + arg_empty = arg is None or arg == "" or len(arg) == 0 arg_useless = arg in ["Change of airport", "round trip", "Climate friendly"] arg_delay = arg.startswith("Delayed") - early_return_conditions = [arg_empty, arg_useless, arg_delay] + emissions = arg.endswith("emissions") or arg.endswith("CO2") + + early_return_conditions = [arg_empty, arg_useless, arg_delay, emissions] # return early if any(early_return_conditions): return - # airline: Separate tickets booked together + # airline: takes care of special case of format: + # Separate tickets booked together easyJet, Scoot + # Separate tickets booked together Ryanair, SWISS, ITA if arg == "Separate tickets booked together": - self._airline = ["multiple"] - - # arrival or departure time - # regex: AM/PM (for example: 10:30AM, 4:11PM) - elif bool(re.search("\d{1,2}\:\d{2}(?:AM|PM)\+{0,1}\d{0,1}", arg)) and ( - len(self._times) < 2 - ): - delta = timedelta(days=0) - if arg[-2] == "+": - delta = timedelta(days=int(arg[-1])) - arg = arg[:-2] - - date_format = "%Y-%m-%d %I:%M%p" - self._times += [ - datetime.strptime(self._date + " " + arg, date_format) + delta - ] + self._separate_tickets = True + return + + if self._separate_tickets and "," in arg: + self._separate_tickets = False + self._airline = arg.split(", ") + return + + # departure 
and arrival times + if self._is_arg_departure_arrival_times(arg): + ( + self._time_departure, + self._time_arrival, + ) = self._parse_departure_arrival_times(arg) + return # flight time - # regex: 3 hr 35 min, 45 min, 5 hr - elif bool(re.search("\d{1,2} (?:hr|min)$", arg)) and ( - self._flight_time is None - ): - self._flight_time = arg - - # number of stops - elif ((arg == "Nonstop") or bool(re.search("\d stop", arg))) and ( - self._num_stops is None - ): - self._num_stops = 0 if arg == "Nonstop" else int(arg.split()[0]) - - # co2 - elif arg.endswith("CO2") and (self._co2 is None): - arg = arg.replace(",", "") - self._co2 = int(arg.split()[0]) - - # emissions - elif arg.endswith("emissions") and (self._emissions is None): - emission_val = arg.split()[0] - self._emissions = 0 if emission_val == "Avg" else int(emission_val[:-1]) + if self._is_arg_flight_time(arg): + self._flight_time = self._parse_flight_time(arg) + return - # price - elif arg.replace(",", "").isdigit() and (self._price is None): - self._price = int(arg.replace(",", "")) - - # origin/dest - elif ( - (len(arg) == 6 and arg.isupper() or "Flight + Train" in arg) - and (self._origin is None) - and (self._dest is None) - ): - if "Flight + Train" in arg: - self._origin = self._queried_orig - self._dest = self._queried_dest - self._has_train = True - else: - self._origin = arg[:3] - self._dest = arg[3:] - - # layover - # regex 1: matches "FCO, JFK, ABC, DEF", "5 min Ancona", "3 hr 13 min FCO", "FCO, JFK" - elif ( - bool(re.search("\d{0,2} (?:min|hr) (\d{0,2} (?:min|hr))?\w+", arg)) - and self._stops_locations is None - ): - # get stops locations - if "," in arg: # multiple stops - self._stops_locations = arg - else: # single stop - self._stops_locations = arg.split(" ")[-1] - - # get stops time - if "," in arg: - self._stops = arg.split(", ")[0] - else: - self._stops = ( - re.search("([0-9]+ hr )?([0-9]+ min )?", arg).group().strip() - ) - - # airline - elif len(arg) > 0 and (self._airline is None): - if 
"Operated" in arg: - airline = arg.split("Operated")[0] - else: - airline = arg - - # split camel case - airline = re.sub("([a-z])([A-Z])", r"\1, \2", airline) + # number of stops (layover n) + if self._is_arg_layover_n(arg): + self._layover_n = self._parse_layover_n(arg) + return - # make it into an array (list) - airline = airline.split(", ") + # price + if self._is_arg_price(arg): + self._price = self._parse_price(arg) + return - self._airline = airline + # origin and destination airports + if self._is_arg_orig_dest(arg): + self._origin, self._dest = self._parse_orig_dest(arg) + return - # other (trash) - else: - self._trash += [arg] + # layover time and location(s) + if self._is_arg_layover(arg): + ( + self._layover_time, + self._layover_location, + ) = self._parse_layover_times_location(arg) + return - # if we have both arrival and departure time, set them - if len(self._times) == 2: - self._time_leave = self._times[0] - self._time_arrive = self._times[1] + # airline (always have it at last since it captures everything not captured above) + if self._is_arg_airline(arg): + self._airline = self._parse_airline(arg) + return def _parse_args(self, args): - for arg in args: - self._classify_arg(arg) + # don't process if there are not enough arguments + if len(args) > 5: + for arg in args: + self._classify_arg(arg) + + # print(self._debug()) @staticmethod - def convert_duration_str_to_minutes(s): + def convert_duration_str_to_timedelta(s): """ Returns the duration in minutes from a string of the form: 3 hr 20 min --> 60*3 + 20 = 200 @@ -180,7 +306,7 @@ def convert_duration_str_to_minutes(s): if "min" in s: m = int(re.split("hr|min", s)[-2]) - return 60 * h + m + return timedelta(hours=h, minutes=m) @staticmethod def make_dataframe(flights): @@ -204,27 +330,33 @@ def make_dataframe(flights): "access_date": [], "one_way": [], "has_train": [], - "days_advance": [], + # "days_advance": [], } # populate the dictionary for flight in flights: - data["departure_datetime"] += 
[flight._time_leave] - data["arrival_datetime"] += [flight._time_arrive] - data["airlines"] += [flight._airline] - data["travel_time"] += [Flight.convert_duration_str_to_minutes(flight._flight_time)] - data["origin"] += [flight._origin] - data["destination"] += [flight._dest] - data["layover_n"] += [flight._num_stops] - data["layover_time"] += [Flight.convert_duration_str_to_minutes(flight._stops)] - data["layover_location"] += [flight._stops_locations] - data["price_eur"] += [flight._price] - data["price_trend"] += [flight._price_trend[0]] - data["price_value"] += [flight._price_trend[1]] - data["access_date"] += [datetime.today()] - data["one_way"] += [(False if flight._roundtrip else True)] - data["has_train"] += [flight._has_train] - data["days_advance"] += [(flight._time_leave - datetime.today()).days] + try: + data["departure_datetime"] += [flight._time_departure] + data["arrival_datetime"] += [flight._time_arrival] + data["airlines"] += [flight._airline] + data["travel_time"] += [flight._flight_time] + data["origin"] += [flight._origin] + data["destination"] += [flight._dest] + data["layover_n"] += [flight._layover_n] + data["layover_time"] += [flight._layover_time] + data["layover_location"] += [flight._layover_location] + data["price_eur"] += [flight._price] + data["price_trend"] += [flight._price_trend[0]] + data["price_value"] += [flight._price_trend[1]] + data["access_date"] += [datetime.today()] + data["one_way"] += [(False if flight._roundtrip else True)] + data["has_train"] += [flight._has_train] + # data["days_advance"] += [ + # (flight._time_departure - datetime.today()).days + # ] + except Exception as e: + print("Error with flight", flight, flight._price) + print(e) return pd.DataFrame(data) diff --git a/src/flight_analysis/scrape.py b/src/flight_analysis/scrape.py index 6a93daa..859ef9a 100644 --- a/src/flight_analysis/scrape.py +++ b/src/flight_analysis/scrape.py @@ -29,7 +29,7 @@ def __init__(self, orig, dest, date_leave, date_return=None): 
self._round_trip = True if date_return is not None else False self._data = None self._url = None - + @property def data(self): return self._data @@ -106,7 +106,7 @@ def _get_results(self, driver): logger.error( f"Scrape timeout reached. It could mean that no flights exist for the combination of airports and dates." ) - return -1 + return None flights = self._clean_results(results) return Flight.make_dataframe(flights) @@ -140,7 +140,7 @@ def _clean_results(self, result): matches = [] # Enumerate over the list 'res3' for index, element in enumerate(res3): - # Check if the length of the element is more than 2 + # Check if element is not an empty string if len(element) <= 2: continue @@ -153,20 +153,30 @@ def _clean_results(self, result): if element[-2] != "+" and is_time_format: matches.append(index) - # Keep only every second item in the matches list - matches = matches[::2] - - flights = [ - Flight( - self._date_leave, # date_leave - self._round_trip, # round_trip - self._orig, - self._dest, - price_trend, - res3[matches[i] : matches[i + 1]], - ) - for i in range(len(matches) - 1) - ] + # handles the identification of whole flights, instead of splitting every + # time a time is found + # TODO: document better + matches_ok = [matches[0]] + + for i in range(1, len(matches)): + if matches[i] - matches[i - 1] < 4: + continue + matches_ok.append(matches[i]) + + flights = [] + for i in range(len(matches_ok) - 1): + flight_args = res3[matches_ok[i] : matches_ok[i + 1]] + + if len(flight_args) > 5: + f = Flight( + self._date_leave, # date_leave + self._round_trip, # round_trip + self._orig, + self._dest, + price_trend, + flight_args, + ) + flights.append(f) return flights