diff --git a/.gitignore b/.gitignore
index a007c9b..6f53f67 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ backups
private
scrapes_csv
test.py
+sql_queries
# logging
logs/
diff --git a/flight_analysis.ipynb b/flight_analysis.ipynb
index 0b589d4..abbf029 100644
--- a/flight_analysis.ipynb
+++ b/flight_analysis.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -11,17 +11,17 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
- "flights = Scrape(\"MUC\", \"JFK\", \"2023-10-28\")\n",
+ "flights = Scrape(\"CTA\", \"PMO\", \"2023-10-28\")\n",
"flights.run_scrape()"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -60,574 +60,330 @@
"
access_date | \n",
" one_way | \n",
" has_train | \n",
- " days_advance | \n",
" \n",
" \n",
" \n",
" \n",
" 0 | \n",
- " 2023-10-28 14:05:00 | \n",
- " 2023-10-28 19:10:00 | \n",
- " [Icelandair] | \n",
- " 665 | \n",
- " MUC | \n",
- " JFK | \n",
+ " 2023-10-28 07:50:00 | \n",
+ " 2023-10-28 11:25:00 | \n",
+ " [easyJet] | \n",
+ " 0 days 03:35:00 | \n",
+ " CTA | \n",
+ " PMO | \n",
" 1 | \n",
- " 60.0 | \n",
- " KEF | \n",
- " 511 | \n",
+ " 0 days 01:30:00 | \n",
+ " [NAP] | \n",
+ " 58 | \n",
" typical | \n",
" None | \n",
- " 2023-08-05 14:15:47.524003 | \n",
+ " 2023-08-07 20:18:11.150904 | \n",
" True | \n",
" False | \n",
- " 83 | \n",
"
\n",
" \n",
" 1 | \n",
- " 2023-10-28 09:55:00 | \n",
- " 2023-10-28 20:20:00 | \n",
- " [LOT] | \n",
- " 985 | \n",
- " MUC | \n",
- " JFK | \n",
+ " 2023-10-28 14:00:00 | \n",
+ " 2023-10-28 18:20:00 | \n",
+ " [ITA] | \n",
+ " 0 days 04:20:00 | \n",
+ " CTA | \n",
+ " PMO | \n",
" 1 | \n",
- " 320.0 | \n",
- " WAW | \n",
- " 627 | \n",
+ " 0 days 01:50:00 | \n",
+ " [FCO] | \n",
+ " 222 | \n",
" typical | \n",
" None | \n",
- " 2023-08-05 14:15:47.524028 | \n",
+ " 2023-08-07 20:18:11.150920 | \n",
" True | \n",
" False | \n",
- " 83 | \n",
"
\n",
" \n",
" 2 | \n",
- " 2023-10-28 11:50:00 | \n",
- " 2023-10-28 16:35:00 | \n",
- " [KLMDelta] | \n",
- " 645 | \n",
- " MUC | \n",
- " JFK | \n",
+ " 2023-10-28 19:10:00 | \n",
+ " 2023-10-28 22:55:00 | \n",
+ " [ITA] | \n",
+ " 0 days 03:45:00 | \n",
+ " CTA | \n",
+ " PMO | \n",
" 1 | \n",
- " 70.0 | \n",
- " AMS | \n",
- " 706 | \n",
+ " 0 days 01:10:00 | \n",
+ " [FCO] | \n",
+ " 222 | \n",
" typical | \n",
" None | \n",
- " 2023-08-05 14:15:47.524034 | \n",
+ " 2023-08-07 20:18:11.150922 | \n",
" True | \n",
" False | \n",
- " 83 | \n",
"
\n",
" \n",
" 3 | \n",
- " 2023-10-28 11:00:00 | \n",
- " 2023-10-28 20:20:00 | \n",
- " [Lufthansa, LOT] | \n",
- " 920 | \n",
- " MUC | \n",
- " JFK | \n",
- " 1 | \n",
- " 260.0 | \n",
- " WAW | \n",
- " 658 | \n",
+ " 2023-10-28 12:50:00 | \n",
+ " 2023-10-28 22:55:00 | \n",
+ " [ITA] | \n",
+ " 0 days 10:05:00 | \n",
+ " CTA | \n",
+ " PMO | \n",
+ " 3 | \n",
+ " NaT | \n",
+ " [LIN, NAP, FCO] | \n",
+ " 191 | \n",
" typical | \n",
" None | \n",
- " 2023-08-05 14:15:47.524039 | \n",
+ " 2023-08-07 20:18:11.150924 | \n",
" True | \n",
" False | \n",
- " 83 | \n",
"
\n",
" \n",
" 4 | \n",
- " 2023-10-28 10:25:00 | \n",
- " 2023-10-28 23:48:00 | \n",
- " [Delta, Virgin Atlantic] | \n",
- " 1163 | \n",
- " MUC | \n",
- " JFK | \n",
+ " 2023-10-28 15:25:00 | \n",
+ " 2023-10-28 22:10:00 | \n",
+ " [ITA] | \n",
+ " 0 days 06:45:00 | \n",
+ " CTA | \n",
+ " PMO | \n",
" 1 | \n",
- " 389.0 | \n",
- " ATL | \n",
- " 670 | \n",
+ " 0 days 04:20:00 | \n",
+ " [FCO] | \n",
+ " 222 | \n",
" typical | \n",
" None | \n",
- " 2023-08-05 14:15:47.524045 | \n",
+ " 2023-08-07 20:18:11.150926 | \n",
" True | \n",
" False | \n",
- " 83 | \n",
"
\n",
" \n",
" 5 | \n",
- " 2023-10-28 07:00:00 | \n",
- " 2023-10-28 15:25:00 | \n",
- " [KLMDelta] | \n",
- " 865 | \n",
- " MUC | \n",
- " JFK | \n",
+ " 2023-10-28 11:05:00 | \n",
+ " 2023-10-28 16:20:00 | \n",
+ " [ITA] | \n",
+ " 0 days 05:15:00 | \n",
+ " CTA | \n",
+ " PMO | \n",
" 1 | \n",
- " 290.0 | \n",
- " AMS | \n",
- " 706 | \n",
+ " 0 days 01:50:00 | \n",
+ " [LIN] | \n",
+ " 228 | \n",
" typical | \n",
" None | \n",
- " 2023-08-05 14:15:47.524050 | \n",
+ " 2023-08-07 20:18:11.150927 | \n",
" True | \n",
" False | \n",
- " 83 | \n",
"
\n",
" \n",
" 6 | \n",
- " 2023-10-28 11:25:00 | \n",
- " 2023-10-28 19:20:00 | \n",
- " [Aer Lingus] | \n",
- " 835 | \n",
- " MUC | \n",
- " JFK | \n",
+ " 2023-10-28 06:00:00 | \n",
+ " 2023-10-28 09:25:00 | \n",
+ " [ITA] | \n",
+ " 0 days 03:25:00 | \n",
+ " CTA | \n",
+ " PMO | \n",
" 1 | \n",
- " 220.0 | \n",
- " DUB | \n",
- " 787 | \n",
+ " 0 days 00:55:00 | \n",
+ " [FCO] | \n",
+ " 254 | \n",
" typical | \n",
" None | \n",
- " 2023-08-05 14:15:47.524055 | \n",
+ " 2023-08-07 20:18:11.150929 | \n",
" True | \n",
" False | \n",
- " 83 | \n",
"
\n",
" \n",
" 7 | \n",
- " 2023-10-28 14:05:00 | \n",
- " 2023-10-28 19:15:00 | \n",
- " [KLMDelta] | \n",
- " 670 | \n",
- " MUC | \n",
- " JFK | \n",
+ " 2023-10-28 10:15:00 | \n",
+ " 2023-10-28 14:25:00 | \n",
+ " [ITA] | \n",
+ " 0 days 04:10:00 | \n",
+ " CTA | \n",
+ " PMO | \n",
" 1 | \n",
- " 95.0 | \n",
- " AMS | \n",
- " 812 | \n",
+ " 0 days 01:35:00 | \n",
+ " [FCO] | \n",
+ " 265 | \n",
" typical | \n",
" None | \n",
- " 2023-08-05 14:15:47.524060 | \n",
+ " 2023-08-07 20:18:11.150931 | \n",
" True | \n",
" False | \n",
- " 83 | \n",
"
\n",
" \n",
" 8 | \n",
- " 2023-10-28 09:00:00 | \n",
- " 2023-10-28 14:25:00 | \n",
- " [Lufthansa, Condor] | \n",
- " 685 | \n",
- " MUC | \n",
- " JFK | \n",
+ " 2023-10-28 07:00:00 | \n",
+ " 2023-10-28 11:55:00 | \n",
+ " [ITA] | \n",
+ " 0 days 04:55:00 | \n",
+ " CTA | \n",
+ " PMO | \n",
" 1 | \n",
- " 105.0 | \n",
- " FRA | \n",
- " 830 | \n",
+ " 0 days 02:20:00 | \n",
+ " [FCO] | \n",
+ " 280 | \n",
" typical | \n",
" None | \n",
- " 2023-08-05 14:15:47.524066 | \n",
+ " 2023-08-07 20:18:11.150933 | \n",
" True | \n",
" False | \n",
- " 83 | \n",
"
\n",
" \n",
" 9 | \n",
- " 2023-10-28 12:10:00 | \n",
- " 2023-10-28 15:00:00 | \n",
- " [Lufthansa, United] | \n",
- " 530 | \n",
- " MUC | \n",
- " JFK | \n",
- " 0 | \n",
- " NaN | \n",
- " None | \n",
- " 1317 | \n",
- " typical | \n",
- " None | \n",
- " 2023-08-05 14:15:47.524070 | \n",
- " True | \n",
- " False | \n",
- " 83 | \n",
- "
\n",
- " \n",
- "\n",
- ""
- ],
- "text/plain": [
- " departure_datetime arrival_datetime airlines \\\n",
- "0 2023-10-28 14:05:00 2023-10-28 19:10:00 [Icelandair] \n",
- "1 2023-10-28 09:55:00 2023-10-28 20:20:00 [LOT] \n",
- "2 2023-10-28 11:50:00 2023-10-28 16:35:00 [KLMDelta] \n",
- "3 2023-10-28 11:00:00 2023-10-28 20:20:00 [Lufthansa, LOT] \n",
- "4 2023-10-28 10:25:00 2023-10-28 23:48:00 [Delta, Virgin Atlantic] \n",
- "5 2023-10-28 07:00:00 2023-10-28 15:25:00 [KLMDelta] \n",
- "6 2023-10-28 11:25:00 2023-10-28 19:20:00 [Aer Lingus] \n",
- "7 2023-10-28 14:05:00 2023-10-28 19:15:00 [KLMDelta] \n",
- "8 2023-10-28 09:00:00 2023-10-28 14:25:00 [Lufthansa, Condor] \n",
- "9 2023-10-28 12:10:00 2023-10-28 15:00:00 [Lufthansa, United] \n",
- "\n",
- " travel_time origin destination layover_n layover_time layover_location \\\n",
- "0 665 MUC JFK 1 60.0 KEF \n",
- "1 985 MUC JFK 1 320.0 WAW \n",
- "2 645 MUC JFK 1 70.0 AMS \n",
- "3 920 MUC JFK 1 260.0 WAW \n",
- "4 1163 MUC JFK 1 389.0 ATL \n",
- "5 865 MUC JFK 1 290.0 AMS \n",
- "6 835 MUC JFK 1 220.0 DUB \n",
- "7 670 MUC JFK 1 95.0 AMS \n",
- "8 685 MUC JFK 1 105.0 FRA \n",
- "9 530 MUC JFK 0 NaN None \n",
- "\n",
- " price_eur price_trend price_value access_date one_way \\\n",
- "0 511 typical None 2023-08-05 14:15:47.524003 True \n",
- "1 627 typical None 2023-08-05 14:15:47.524028 True \n",
- "2 706 typical None 2023-08-05 14:15:47.524034 True \n",
- "3 658 typical None 2023-08-05 14:15:47.524039 True \n",
- "4 670 typical None 2023-08-05 14:15:47.524045 True \n",
- "5 706 typical None 2023-08-05 14:15:47.524050 True \n",
- "6 787 typical None 2023-08-05 14:15:47.524055 True \n",
- "7 812 typical None 2023-08-05 14:15:47.524060 True \n",
- "8 830 typical None 2023-08-05 14:15:47.524066 True \n",
- "9 1317 typical None 2023-08-05 14:15:47.524070 True \n",
- "\n",
- " has_train days_advance \n",
- "0 False 83 \n",
- "1 False 83 \n",
- "2 False 83 \n",
- "3 False 83 \n",
- "4 False 83 \n",
- "5 False 83 \n",
- "6 False 83 \n",
- "7 False 83 \n",
- "8 False 83 \n",
- "9 False 83 "
- ]
- },
- "execution_count": 5,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "flights.data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " departure_datetime | \n",
- " arrival_datetime | \n",
- " airlines | \n",
- " travel_time | \n",
- " origin | \n",
- " destination | \n",
- " layover_n | \n",
- " layover_time | \n",
- " layover_location | \n",
- " price_eur | \n",
- " price_trend | \n",
- " price_value | \n",
- " access_date | \n",
- " one_way | \n",
- " has_train | \n",
- " days_advance | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 2023-10-28 14:05:00 | \n",
- " 2023-10-28 19:10:00 | \n",
- " [Icelandair] | \n",
- " 665 | \n",
- " MUC | \n",
- " JFK | \n",
- " 1 | \n",
- " 60.0 | \n",
- " KEF | \n",
- " 511 | \n",
- " typical | \n",
- " None | \n",
- " 2023-08-05 14:15:47.524003 | \n",
- " True | \n",
- " False | \n",
- " 83 | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2023-10-28 09:55:00 | \n",
- " 2023-10-28 20:20:00 | \n",
- " [LOT] | \n",
- " 985 | \n",
- " MUC | \n",
- " JFK | \n",
+ " 2023-10-28 06:00:00 | \n",
+ " 2023-10-28 11:05:00 | \n",
+ " [Lufthansa] | \n",
+ " 0 days 05:05:00 | \n",
+ " CTA | \n",
+ " PMO | \n",
" 1 | \n",
- " 320.0 | \n",
- " WAW | \n",
- " 627 | \n",
+ " 0 days 01:00:00 | \n",
+ " [MUC] | \n",
+ " 408 | \n",
" typical | \n",
" None | \n",
- " 2023-08-05 14:15:47.524028 | \n",
+ " 2023-08-07 20:18:11.150934 | \n",
" True | \n",
" False | \n",
- " 83 | \n",
"
\n",
" \n",
- " 2 | \n",
- " 2023-10-28 11:50:00 | \n",
- " 2023-10-28 16:35:00 | \n",
- " [KLMDelta] | \n",
- " 645 | \n",
- " MUC | \n",
- " JFK | \n",
+ " 10 | \n",
+ " 2023-10-28 21:35:00 | \n",
+ " 2023-10-29 11:10:00 | \n",
+ " [British Airways] | \n",
+ " 0 days 14:35:00 | \n",
+ " CTA | \n",
+ " PMO | \n",
" 1 | \n",
- " 70.0 | \n",
- " AMS | \n",
- " 706 | \n",
- " typical | \n",
+ " NaT | \n",
" None | \n",
- " 2023-08-05 14:15:47.524034 | \n",
- " True | \n",
- " False | \n",
- " 83 | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 2023-10-28 11:00:00 | \n",
- " 2023-10-28 20:20:00 | \n",
- " [Lufthansa, LOT] | \n",
- " 920 | \n",
- " MUC | \n",
- " JFK | \n",
- " 1 | \n",
- " 260.0 | \n",
- " WAW | \n",
- " 658 | \n",
+ " 457 | \n",
" typical | \n",
" None | \n",
- " 2023-08-05 14:15:47.524039 | \n",
+ " 2023-08-07 20:18:11.150936 | \n",
" True | \n",
" False | \n",
- " 83 | \n",
"
\n",
" \n",
- " 4 | \n",
- " 2023-10-28 10:25:00 | \n",
- " 2023-10-28 23:48:00 | \n",
- " [Delta, Virgin Atlantic] | \n",
- " 1163 | \n",
- " MUC | \n",
- " JFK | \n",
+ " 11 | \n",
+ " 2023-10-28 19:25:00 | \n",
+ " 2023-10-29 17:40:00 | \n",
+ " [Turkish Airlines] | \n",
+ " 0 days 23:15:00 | \n",
+ " CTA | \n",
+ " PMO | \n",
" 1 | \n",
- " 389.0 | \n",
- " ATL | \n",
- " 670 | \n",
+ " 0 days 18:20:00 | \n",
+ " [IST] | \n",
+ " 504 | \n",
" typical | \n",
" None | \n",
- " 2023-08-05 14:15:47.524045 | \n",
+ " 2023-08-07 20:18:11.150938 | \n",
" True | \n",
" False | \n",
- " 83 | \n",
"
\n",
" \n",
- " 5 | \n",
+ " 12 | \n",
" 2023-10-28 07:00:00 | \n",
- " 2023-10-28 15:25:00 | \n",
- " [KLMDelta] | \n",
- " 865 | \n",
- " MUC | \n",
- " JFK | \n",
- " 1 | \n",
- " 290.0 | \n",
- " AMS | \n",
- " 706 | \n",
+ " 2023-10-28 13:55:00 | \n",
+ " [ITA, SWISS] | \n",
+ " 0 days 06:55:00 | \n",
+ " CTA | \n",
+ " PMO | \n",
+ " 2 | \n",
+ " NaT | \n",
+ " [FCO, ZRH] | \n",
+ " 617 | \n",
" typical | \n",
" None | \n",
- " 2023-08-05 14:15:47.524050 | \n",
+ " 2023-08-07 20:18:11.150939 | \n",
" True | \n",
" False | \n",
- " 83 | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " 2023-10-28 11:25:00 | \n",
- " 2023-10-28 19:20:00 | \n",
- " [Aer Lingus] | \n",
- " 835 | \n",
- " MUC | \n",
- " JFK | \n",
- " 1 | \n",
- " 220.0 | \n",
- " DUB | \n",
- " 787 | \n",
- " typical | \n",
- " None | \n",
- " 2023-08-05 14:15:47.524055 | \n",
- " True | \n",
- " False | \n",
- " 83 | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " 2023-10-28 14:05:00 | \n",
- " 2023-10-28 19:15:00 | \n",
- " [KLMDelta] | \n",
- " 670 | \n",
- " MUC | \n",
- " JFK | \n",
- " 1 | \n",
- " 95.0 | \n",
- " AMS | \n",
- " 812 | \n",
- " typical | \n",
- " None | \n",
- " 2023-08-05 14:15:47.524060 | \n",
- " True | \n",
- " False | \n",
- " 83 | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " 2023-10-28 09:00:00 | \n",
- " 2023-10-28 14:25:00 | \n",
- " [Lufthansa, Condor] | \n",
- " 685 | \n",
- " MUC | \n",
- " JFK | \n",
- " 1 | \n",
- " 105.0 | \n",
- " FRA | \n",
- " 830 | \n",
- " typical | \n",
- " None | \n",
- " 2023-08-05 14:15:47.524066 | \n",
- " True | \n",
- " False | \n",
- " 83 | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " 2023-10-28 12:10:00 | \n",
- " 2023-10-28 15:00:00 | \n",
- " [Lufthansa, United] | \n",
- " 530 | \n",
- " MUC | \n",
- " JFK | \n",
- " 0 | \n",
- " NaN | \n",
- " None | \n",
- " 1317 | \n",
- " typical | \n",
- " None | \n",
- " 2023-08-05 14:15:47.524070 | \n",
- " True | \n",
- " False | \n",
- " 83 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
- " departure_datetime arrival_datetime airlines \\\n",
- "0 2023-10-28 14:05:00 2023-10-28 19:10:00 [Icelandair] \n",
- "1 2023-10-28 09:55:00 2023-10-28 20:20:00 [LOT] \n",
- "2 2023-10-28 11:50:00 2023-10-28 16:35:00 [KLMDelta] \n",
- "3 2023-10-28 11:00:00 2023-10-28 20:20:00 [Lufthansa, LOT] \n",
- "4 2023-10-28 10:25:00 2023-10-28 23:48:00 [Delta, Virgin Atlantic] \n",
- "5 2023-10-28 07:00:00 2023-10-28 15:25:00 [KLMDelta] \n",
- "6 2023-10-28 11:25:00 2023-10-28 19:20:00 [Aer Lingus] \n",
- "7 2023-10-28 14:05:00 2023-10-28 19:15:00 [KLMDelta] \n",
- "8 2023-10-28 09:00:00 2023-10-28 14:25:00 [Lufthansa, Condor] \n",
- "9 2023-10-28 12:10:00 2023-10-28 15:00:00 [Lufthansa, United] \n",
+ " departure_datetime arrival_datetime airlines \\\n",
+ "0 2023-10-28 07:50:00 2023-10-28 11:25:00 [easyJet] \n",
+ "1 2023-10-28 14:00:00 2023-10-28 18:20:00 [ITA] \n",
+ "2 2023-10-28 19:10:00 2023-10-28 22:55:00 [ITA] \n",
+ "3 2023-10-28 12:50:00 2023-10-28 22:55:00 [ITA] \n",
+ "4 2023-10-28 15:25:00 2023-10-28 22:10:00 [ITA] \n",
+ "5 2023-10-28 11:05:00 2023-10-28 16:20:00 [ITA] \n",
+ "6 2023-10-28 06:00:00 2023-10-28 09:25:00 [ITA] \n",
+ "7 2023-10-28 10:15:00 2023-10-28 14:25:00 [ITA] \n",
+ "8 2023-10-28 07:00:00 2023-10-28 11:55:00 [ITA] \n",
+ "9 2023-10-28 06:00:00 2023-10-28 11:05:00 [Lufthansa] \n",
+ "10 2023-10-28 21:35:00 2023-10-29 11:10:00 [British Airways] \n",
+ "11 2023-10-28 19:25:00 2023-10-29 17:40:00 [Turkish Airlines] \n",
+ "12 2023-10-28 07:00:00 2023-10-28 13:55:00 [ITA, SWISS] \n",
"\n",
- " travel_time origin destination layover_n layover_time layover_location \\\n",
- "0 665 MUC JFK 1 60.0 KEF \n",
- "1 985 MUC JFK 1 320.0 WAW \n",
- "2 645 MUC JFK 1 70.0 AMS \n",
- "3 920 MUC JFK 1 260.0 WAW \n",
- "4 1163 MUC JFK 1 389.0 ATL \n",
- "5 865 MUC JFK 1 290.0 AMS \n",
- "6 835 MUC JFK 1 220.0 DUB \n",
- "7 670 MUC JFK 1 95.0 AMS \n",
- "8 685 MUC JFK 1 105.0 FRA \n",
- "9 530 MUC JFK 0 NaN None \n",
+ " travel_time origin destination layover_n layover_time \\\n",
+ "0 0 days 03:35:00 CTA PMO 1 0 days 01:30:00 \n",
+ "1 0 days 04:20:00 CTA PMO 1 0 days 01:50:00 \n",
+ "2 0 days 03:45:00 CTA PMO 1 0 days 01:10:00 \n",
+ "3 0 days 10:05:00 CTA PMO 3 NaT \n",
+ "4 0 days 06:45:00 CTA PMO 1 0 days 04:20:00 \n",
+ "5 0 days 05:15:00 CTA PMO 1 0 days 01:50:00 \n",
+ "6 0 days 03:25:00 CTA PMO 1 0 days 00:55:00 \n",
+ "7 0 days 04:10:00 CTA PMO 1 0 days 01:35:00 \n",
+ "8 0 days 04:55:00 CTA PMO 1 0 days 02:20:00 \n",
+ "9 0 days 05:05:00 CTA PMO 1 0 days 01:00:00 \n",
+ "10 0 days 14:35:00 CTA PMO 1 NaT \n",
+ "11 0 days 23:15:00 CTA PMO 1 0 days 18:20:00 \n",
+ "12 0 days 06:55:00 CTA PMO 2 NaT \n",
"\n",
- " price_eur price_trend price_value access_date one_way \\\n",
- "0 511 typical None 2023-08-05 14:15:47.524003 True \n",
- "1 627 typical None 2023-08-05 14:15:47.524028 True \n",
- "2 706 typical None 2023-08-05 14:15:47.524034 True \n",
- "3 658 typical None 2023-08-05 14:15:47.524039 True \n",
- "4 670 typical None 2023-08-05 14:15:47.524045 True \n",
- "5 706 typical None 2023-08-05 14:15:47.524050 True \n",
- "6 787 typical None 2023-08-05 14:15:47.524055 True \n",
- "7 812 typical None 2023-08-05 14:15:47.524060 True \n",
- "8 830 typical None 2023-08-05 14:15:47.524066 True \n",
- "9 1317 typical None 2023-08-05 14:15:47.524070 True \n",
+ " layover_location price_eur price_trend price_value \\\n",
+ "0 [NAP] 58 typical None \n",
+ "1 [FCO] 222 typical None \n",
+ "2 [FCO] 222 typical None \n",
+ "3 [LIN, NAP, FCO] 191 typical None \n",
+ "4 [FCO] 222 typical None \n",
+ "5 [LIN] 228 typical None \n",
+ "6 [FCO] 254 typical None \n",
+ "7 [FCO] 265 typical None \n",
+ "8 [FCO] 280 typical None \n",
+ "9 [MUC] 408 typical None \n",
+ "10 None 457 typical None \n",
+ "11 [IST] 504 typical None \n",
+ "12 [FCO, ZRH] 617 typical None \n",
"\n",
- " has_train days_advance \n",
- "0 False 83 \n",
- "1 False 83 \n",
- "2 False 83 \n",
- "3 False 83 \n",
- "4 False 83 \n",
- "5 False 83 \n",
- "6 False 83 \n",
- "7 False 83 \n",
- "8 False 83 \n",
- "9 False 83 "
+ " access_date one_way has_train \n",
+ "0 2023-08-07 20:18:11.150904 True False \n",
+ "1 2023-08-07 20:18:11.150920 True False \n",
+ "2 2023-08-07 20:18:11.150922 True False \n",
+ "3 2023-08-07 20:18:11.150924 True False \n",
+ "4 2023-08-07 20:18:11.150926 True False \n",
+ "5 2023-08-07 20:18:11.150927 True False \n",
+ "6 2023-08-07 20:18:11.150929 True False \n",
+ "7 2023-08-07 20:18:11.150931 True False \n",
+ "8 2023-08-07 20:18:11.150933 True False \n",
+ "9 2023-08-07 20:18:11.150934 True False \n",
+ "10 2023-08-07 20:18:11.150936 True False \n",
+ "11 2023-08-07 20:18:11.150938 True False \n",
+ "12 2023-08-07 20:18:11.150939 True False "
]
},
- "execution_count": 10,
+ "execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df = flights.data\n",
- "df"
+ "flights.data"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "'https://www.google.com/travel/flights?q=Flights%20to%20JFK%20from%20MUC%20on%202023-10-28%20oneway&curr=EUR&gl=IT'"
+ "'https://www.google.com/travel/flights?q=Flights%20to%20PMO%20from%20CTA%20on%202023-10-28%20oneway&curr=EUR&gl=IT'"
]
},
- "execution_count": 7,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -638,20 +394,30 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
- "data": {
- "text/plain": [
- "Database: flight_analysis"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
+ "ename": "NotImplementedError",
+ "evalue": "",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)",
+ "\u001b[1;32m/Users/emanuelesalonico/Library/CloudStorage/GoogleDrive-esalonico@gmail.com/My Drive/SYNC/Dev/flight-analysis/flight_analysis.ipynb Cell 5\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mNotImplementedError\u001b[39;00m()\n",
+ "\u001b[0;31mNotImplementedError\u001b[0m: "
+ ]
}
],
+ "source": [
+ "raise NotImplementedError()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": [
"import pandas as pd\n",
"from src.flight_analysis.database import Database\n",
@@ -663,26 +429,98 @@
" db_user=private.DB_USER,\n",
" db_pw=private.DB_PW,\n",
")\n",
- "db"
+ "\n",
+ "query = \"SELECT uuid, layover_location FROM scraped\"\n",
+ "\n",
+ "cur = db.conn.cursor()\n",
+ "cur.execute(query)\n",
+ "\n",
+ "res = cur.fetchall()\n",
+ "res"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"metadata": {},
- "outputs": [
- {
- "ename": "NotImplementedError",
- "evalue": "This script is not meant to be run again!!!!!",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)",
- "\u001b[1;32m/Users/emanuelesalonico/Library/CloudStorage/GoogleDrive-esalonico@gmail.com/My Drive/SYNC/Dev/flight-analysis/flight_analysis.ipynb Cell 7\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m# this process is to retroactively split the flight airlines into a separate \"scraped_airlines\" table,\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[39m# in order to violate the 1NF and 2NF rules of database normalization.\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mNotImplementedError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39mThis script is not meant to be run again!!!!!\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 6\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpandas\u001b[39;00m \u001b[39mas\u001b[39;00m \u001b[39mpd\u001b[39;00m\n\u001b[1;32m 7\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msrc\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mflight_analysis\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdatabase\u001b[39;00m \u001b[39mimport\u001b[39;00m Database\n",
- "\u001b[0;31mNotImplementedError\u001b[0m: This script is not meant to be run again!!!!!"
- ]
- }
- ],
+ "outputs": [],
+ "source": [
+ "df = pd.DataFrame(res, columns=[\"flight_uuid\", \"layover_location\"])\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# get rid of \"min\"\n",
+    "df.layover_location = df.layover_location.str.strip()\n",
+ "df.layover_location = df.layover_location.replace(\"\", np.nan)\n",
+ "df.layover_location = df.layover_location.replace(\"min\", np.nan)\n",
+ "df.layover_location = df.layover_location.replace(\"Change of airport\", np.nan)\n",
+ "\n",
+ "df = df.dropna()\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def split_dataframe(df, chunk_size=5000):\n",
+ " chunks = list()\n",
+ " num_chunks = len(df) // chunk_size + 1\n",
+ " for i in range(num_chunks):\n",
+ " chunks.append(df[i * chunk_size : (i + 1) * chunk_size])\n",
+ " return chunks\n",
+ "\n",
+ "\n",
+ "chunks = split_dataframe(df, chunk_size=5000)\n",
+ "\n",
+ "i = 1\n",
+ "for c in chunks:\n",
+ " print(i, \"/\", len(chunks))\n",
+ " db.add_pandas_df_to_db(c, table_name=db.table_scraped_layovers)\n",
+ " i += 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": [
"# this process is to retroactively split the flight airlines into a separate \"scraped_airlines\" table,\n",
"# in order to violate the 1NF and 2NF rules of database normalization.\n",
@@ -711,17 +549,21 @@
"\n",
"df2 = df.explode(\"airline\")\n",
"df2[\"airline\"] = df2[\"airline\"].map(lambda x: x.lstrip(\"'\").rstrip(\"'\"))\n",
- "df2[\"airline\"] = df2[\"airline\"].replace({\"Separate tickets booked together\": \"multiple\"})\n",
+ "df2[\"airline\"] = df2[\"airline\"].replace(\n",
+ " {\"Separate tickets booked together\": \"multiple\"}\n",
+ ")\n",
"df2 = df2.reset_index(drop=True)\n",
"\n",
- "def split_dataframe(df, chunk_size = 5000): \n",
+ "\n",
+ "def split_dataframe(df, chunk_size=5000):\n",
" chunks = list()\n",
" num_chunks = len(df) // chunk_size + 1\n",
" for i in range(num_chunks):\n",
- " chunks.append(df[i*chunk_size:(i+1)*chunk_size])\n",
+ " chunks.append(df[i * chunk_size : (i + 1) * chunk_size])\n",
" return chunks\n",
"\n",
- "chunks = split_dataframe(df2, chunk_size = 5000)\n",
+ "\n",
+ "chunks = split_dataframe(df2, chunk_size=5000)\n",
"\n",
"i = 1\n",
"for c in chunks:\n",
@@ -729,6 +571,106 @@
" db.add_pandas_df_to_db(c, table_name=db.table_scraped_airlines)\n",
" i += 1"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "import numpy as np\n",
+ "\n",
+ "\n",
+ "def _is_arg_layover(arg):\n",
+ " \"\"\" \"\"\"\n",
+ " # layover location cases\n",
+ " layover_location_cases = {\n",
+ " # case 1: (xx hr xx min AAA), (xx hr xx min Aaaaa)\n",
+ " 1: re.search(\"\\d{0,2} hr \\d{0,2} min [A-Z]+\", arg),\n",
+ " # case 2: (xx hr AAA), (xx hr AAA)\n",
+ " 2: re.search(\"\\d{0,2} hr [A-Z]+\", arg),\n",
+ " # case 3: (xx min AAA), (xx min AAA)\n",
+ " 3: re.search(\"\\d{0,2} min [A-Z]+\", arg),\n",
+ " # case 4: (AAA, BBB, ...)\n",
+ " 4: re.search(\"^[A-Z]{3}, ([A-Z]{3}(, )?)?\", arg),\n",
+ " }\n",
+ "\n",
+ " print(layover_location_cases)\n",
+ "\n",
+ " return any(layover_location_cases.values())\n",
+ "\n",
+ "\n",
+ "def _parse_layover_times_location(arg):\n",
+ " layover_time = None\n",
+ " layover_location = None\n",
+ "\n",
+ " # layover time\n",
+ " if (\" hr\" in arg) or (\" min\" in arg):\n",
+ " layover_time = (\n",
+ " re.search(\"^(\\d{1,2} hr){0,1}\\s{0,1}(\\d{1,2} min){0,1}\\s\", arg)\n",
+ " .group()\n",
+ " .strip()\n",
+ " )\n",
+ " layover_location = arg.split(\" \")[-1]\n",
+ "\n",
+ " # layover location\n",
+ " if \",\" in arg:\n",
+ " layover_location = arg.split(\", \")\n",
+ " layover_location = [x.strip() for x in layover_location]\n",
+ "\n",
+ " return layover_time, layover_location\n",
+ "\n",
+ "\n",
+ "test_args = [\n",
+ " \"22 hr 12 min FCO\",\n",
+ " # \"22 hr 12 min\",\n",
+ " # \"3 min Ancona\",\n",
+ " # \"13 min LAX\",\n",
+ " # \"FCO, JFK\",\n",
+ " # \"FCO, JFK\",\n",
+ " # \"21 hr 5 min DOH\",\n",
+ " # \"hehe\",\n",
+ " \"SWISS, Singapore AirlinesOperated by Helvetic\",\n",
+ "]\n",
+ "\n",
+ "for arg in test_args:\n",
+ " if _is_arg_layover(arg):\n",
+ " x = _parse_layover_times_location(arg)\n",
+ " print(arg, x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "\n",
+ "not bool(re.search(\"ITA, \", \"ITA, Singapore\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
diff --git a/src/flight_analysis/database.py b/src/flight_analysis/database.py
index aa317e4..4b0ab6c 100644
--- a/src/flight_analysis/database.py
+++ b/src/flight_analysis/database.py
@@ -24,6 +24,7 @@ def __init__(self, db_host, db_name, db_user, db_pw):
# tables
self.table_scraped = "scraped"
self.table_scraped_airlines = "scraped_airlines"
+ self.table_scraped_layovers = "scraped_layovers"
self.conn = self.connect_to_postgresql()
self.conn.autocommit = True
@@ -125,7 +126,7 @@ def create_scraped_airlines_table(self):
airline text COLLATE pg_catalog."default",
CONSTRAINT scraped_airlines_pkey PRIMARY KEY (uuid),
CONSTRAINT flight_uuid FOREIGN KEY (flight_uuid)
- REFERENCES public.scraped (uuid) MATCH SIMPLE
+ REFERENCES public.{self.table_scraped} (uuid) MATCH SIMPLE
ON UPDATE CASCADE
ON DELETE CASCADE
)
@@ -135,7 +136,7 @@ def create_scraped_airlines_table(self):
ALTER TABLE IF EXISTS public.{self.table_scraped_airlines} OWNER to postgres;
CREATE INDEX IF NOT EXISTS fki_flight_uuid
- ON public.scraped_airlines USING btree
+ ON public.{self.table_scraped_airlines} USING btree
(flight_uuid ASC NULLS LAST)
TABLESPACE pg_default;
"""
@@ -144,6 +145,36 @@ def create_scraped_airlines_table(self):
cursor.execute(query)
cursor.close()
+ def create_scraped_layovers_table(self):
+ query = ""
+ query += f"""
+ CREATE TABLE IF NOT EXISTS public.{self.table_scraped_layovers}
+ (
+ uuid uuid NOT NULL DEFAULT gen_random_uuid(),
+ flight_uuid uuid NOT NULL,
+ layover_location text COLLATE pg_catalog."default",
+ CONSTRAINT scraped_layovers_pkey PRIMARY KEY (uuid),
+ CONSTRAINT flight_uuid FOREIGN KEY (flight_uuid)
+ REFERENCES public.{self.table_scraped} (uuid) MATCH SIMPLE
+ ON UPDATE CASCADE
+ ON DELETE CASCADE
+ )
+
+ TABLESPACE pg_default;
+
+ ALTER TABLE IF EXISTS public.{self.table_scraped_layovers} OWNER to postgres;
+
+ CREATE INDEX IF NOT EXISTS fki_flight_uuid
+ ON public.{self.table_scraped_layovers} USING btree
+ (flight_uuid ASC NULLS LAST)
+ TABLESPACE pg_default;
+ """
+
+ cursor = self.conn.cursor()
+ cursor.execute(query)
+ cursor.close()
+
+
def prepare_db_and_tables(self):
"""
Creates the database and the table if they don't exist.
@@ -164,10 +195,10 @@ def add_pandas_df_to_db(self, df, table_name):
extras.register_uuid()
# Create a list of tuples from the dataframe values
- df = df.reset_index() # otherwise the index (uuid) is not added to the table
+ if table_name == self.table_scraped:
+ df = df.reset_index() # otherwise the index (uuid) is not added to the table
tuples = [tuple(x) for x in df.to_numpy()]
- print(tuples[0])
# Comma-separated dataframe columns
cols = ",".join(list(df.columns))
diff --git a/src/flight_analysis/flight.py b/src/flight_analysis/flight.py
index 3c2d668..552cd11 100644
--- a/src/flight_analysis/flight.py
+++ b/src/flight_analysis/flight.py
@@ -18,151 +18,277 @@ def __init__(self, dl, roundtrip, queried_orig, queried_dest, price_trend, *args
self._dow = datetime.strptime(dl, "%Y-%m-%d").isoweekday() # day of week
self._airline = None
self._flight_time = None
- self._num_stops = None
- self._stops = None
- self._stops_locations = None
- self._co2 = None
- self._emissions = None
+ self._layover_n = None
+ self._layover_time = None
+ self._layover_location = None
self._price = None
self._price_trend = price_trend
- self._times = []
- self._time_leave = None
- self._time_arrive = None
+ self._times_departure_arrival = []
+ self._time_departure = None
+ self._time_arrival = None
self._has_train = False
self._trash = []
+ self._separate_tickets = False
# extract the values above from the scraped HTML page source
self._parse_args(*args)
+ def _debug(self):
+ res = {
+ "origin": self._origin,
+ "dest": self._dest,
+ "airline": self._airline,
+ "flight_time": self._flight_time,
+ "layover_n": self._layover_n,
+ "layover_time": self._layover_time,
+ "layover_location": self._layover_location,
+ "price": self._price,
+ "price_trend": self._price_trend,
+ "times_departure_arrival": self._times_departure_arrival,
+ "time_departure": self._time_departure,
+ "time_arrival": self._time_arrival,
+ "has_train": self._has_train,
+ "trash": self._trash,
+ }
+ return res
+
def __repr__(self):
return f"{self._origin}-{self._dest}-{self._date}"
+ def _is_arg_layover(self, arg):
+ """
+ Returns True if the argument contains a layover location/time
+ """
+ # case 0: manually exclude "ITA" (it's a company name, not a layover location)
+ if "ITA, " in arg:
+ return False
+
+ # layover location cases
+ layover_location_cases = {
+ # case 1: (xx hr xx min AAA), (xx hr xx min Aaaaa)
+ 1: re.search("\d{0,2} hr \d{0,2} min [A-Z]+", arg),
+            # case 2: (xx hr AAA)
+ 2: re.search("\d{0,2} hr [A-Z]+", arg),
+            # case 3: (xx min AAA)
+ 3: re.search("\d{0,2} min [A-Z]+", arg),
+ # case 4: (AAA, BBB, ...)
+ 4: re.search("^[A-Z]{3}, ([A-Z]{3}(, )?)?", arg),
+ }
+
+ return any(layover_location_cases.values())
+
+ def _is_arg_airline(self, arg):
+ if self._airline is None:
+ return True
+ return False
+
+ def _is_arg_departure_arrival_times(self, arg):
+ # regex: AM/PM (for example: 10:30AM, 4:11PM, 10:44AM+1)
+ re_match = re.compile(r"\d{1,2}:\d{2}(?:AM|PM)(?:\+\d{0,1})?")
+
+ if re_match.fullmatch(arg) and len(self._times_departure_arrival) < 2:
+ return True
+
+ return False
+
+ def _is_arg_flight_time(self, arg):
+ # regex: 3 hr 35 min, 45 min, 5 hr
+ re_match = re.compile(r"^(?:\d{1,2} hr){0,1}\s{0,1}(?:\d{1,2} min){0,1}")
+
+ if re_match.fullmatch(arg) and self._flight_time is None:
+ return True
+
+ return False
+
+ def _is_arg_layover_n(self, arg):
+ re_match = re.compile(r"\d stops{0,1}")
+
+ if (arg == "Nonstop" or re_match.fullmatch(arg)) and self._layover_n is None:
+ return True
+
+ return False
+
+ def _is_arg_price(self, arg):
+ if arg.replace(",", "").isdigit() and self._price is None:
+ return True
+ return False
+
+ def _is_arg_orig_dest(self, arg):
+ # example: MUCFCO, BCNMAD
+ if len(arg) == 6 and arg.isupper():
+ return True
+ return False
+
+ # ---------------------------------------------------------------
+
+ def _parse_departure_arrival_times(self, arg):
+ dep, arr = (None, None)
+
+ # handle + - days
+ # extract the optional delta value (in case of +- X days)
+ delta_days = int(arg[-1]) if arg[-2] == "+" else 0
+ delta = timedelta(days=delta_days)
+
+ # remove the delta value from the argument if present
+ if delta_days:
+ arg = arg[:-2] # from 10:30PM+1 to 10:30PM
+
+ # Combine date and time using a formatted string
+ date_time_str = f"{self._date} {arg}"
+ date_format = "%Y-%m-%d %I:%M%p"
+
+ # Parse the date and time and add to the list
+ date_ok = datetime.strptime(date_time_str, date_format) + delta
+ self._times_departure_arrival.append(date_ok)
+
+ if len(self._times_departure_arrival) != 2:
+ return (None, None)
+
+ return tuple(self._times_departure_arrival)
+
+ def _parse_layover_times_location(self, arg):
+ """
+ From an argument (arg), returns the layover time and location as a tuple
+ """
+ layover_time = None
+ layover_location = None
+
+ # layover time
+ if (" hr" in arg) or (" min" in arg):
+ layover_time = (
+ re.search("^(\d{1,2} hr){0,1}\s{0,1}(\d{1,2} min){0,1}\s", arg)
+ .group()
+ .strip()
+ )
+ layover_time = Flight.convert_duration_str_to_timedelta(layover_time)
+ layover_location = [arg.split(" ")[-1]]
+
+ # layover location
+ if "," in arg:
+ layover_location = arg.split(", ")
+ layover_location = [x.strip() for x in layover_location]
+
+ return layover_time, layover_location
+
+ def _parse_airline(self, arg):
+ airline = None
+ dont_split = ["easyjet"]
+ if "Operated" in arg:
+ airline = arg.split("Operated")[0]
+ else:
+ airline = arg
+
+ # split camel case
+ if airline.lower() not in dont_split:
+ airline = re.sub("([a-z])([A-Z])", r"\1, \2", airline)
+
+ # make it into an array (list)
+ airline = airline.split(", ")
+
+ return airline
+
+ def _parse_flight_time(self, arg):
+ return Flight.convert_duration_str_to_timedelta(arg)
+
+ def _parse_layover_n(self, arg):
+ return 0 if arg == "Nonstop" else int(arg.split()[0])
+
+ def _parse_price(self, arg):
+ return int(arg.replace(",", ""))
+
+ def _parse_orig_dest(self, arg):
+ # special case: "Flight + Train"
+ if "Flight + Train" in arg:
+ self._has_train = True
+ return (self._queried_orig, self._queried_dest)
+
+ # regular case: like MUCFCO, LAXJFK
+ return (arg[:3], arg[3:])
+
+ # ---------------------------------------------------------------
+
def _classify_arg(self, arg: str):
"""
Classifies a string (arg) into the correct attribute for a flight,
such as price, numer of layover stops, arrival time...
"""
+ parsed = False
# define cases for which to return early
- arg_empty = arg is None or arg == ""
+ arg_empty = arg is None or arg == "" or len(arg) == 0
arg_useless = arg in ["Change of airport", "round trip", "Climate friendly"]
arg_delay = arg.startswith("Delayed")
- early_return_conditions = [arg_empty, arg_useless, arg_delay]
+ emissions = arg.endswith("emissions") or arg.endswith("CO2")
+
+ early_return_conditions = [arg_empty, arg_useless, arg_delay, emissions]
# return early
if any(early_return_conditions):
return
- # airline: Separate tickets booked together
+ # airline: takes care of special case of format:
+ # Separate tickets booked together easyJet, Scoot
+ # Separate tickets booked together Ryanair, SWISS, ITA
if arg == "Separate tickets booked together":
- self._airline = ["multiple"]
-
- # arrival or departure time
- # regex: AM/PM (for example: 10:30AM, 4:11PM)
- elif bool(re.search("\d{1,2}\:\d{2}(?:AM|PM)\+{0,1}\d{0,1}", arg)) and (
- len(self._times) < 2
- ):
- delta = timedelta(days=0)
- if arg[-2] == "+":
- delta = timedelta(days=int(arg[-1]))
- arg = arg[:-2]
-
- date_format = "%Y-%m-%d %I:%M%p"
- self._times += [
- datetime.strptime(self._date + " " + arg, date_format) + delta
- ]
+ self._separate_tickets = True
+ return
+
+ if self._separate_tickets and "," in arg:
+ self._separate_tickets = False
+ self._airline = arg.split(", ")
+ return
+
+ # departure and arrival times
+ if self._is_arg_departure_arrival_times(arg):
+ (
+ self._time_departure,
+ self._time_arrival,
+ ) = self._parse_departure_arrival_times(arg)
+ return
# flight time
- # regex: 3 hr 35 min, 45 min, 5 hr
- elif bool(re.search("\d{1,2} (?:hr|min)$", arg)) and (
- self._flight_time is None
- ):
- self._flight_time = arg
-
- # number of stops
- elif ((arg == "Nonstop") or bool(re.search("\d stop", arg))) and (
- self._num_stops is None
- ):
- self._num_stops = 0 if arg == "Nonstop" else int(arg.split()[0])
-
- # co2
- elif arg.endswith("CO2") and (self._co2 is None):
- arg = arg.replace(",", "")
- self._co2 = int(arg.split()[0])
-
- # emissions
- elif arg.endswith("emissions") and (self._emissions is None):
- emission_val = arg.split()[0]
- self._emissions = 0 if emission_val == "Avg" else int(emission_val[:-1])
+ if self._is_arg_flight_time(arg):
+ self._flight_time = self._parse_flight_time(arg)
+ return
- # price
- elif arg.replace(",", "").isdigit() and (self._price is None):
- self._price = int(arg.replace(",", ""))
-
- # origin/dest
- elif (
- (len(arg) == 6 and arg.isupper() or "Flight + Train" in arg)
- and (self._origin is None)
- and (self._dest is None)
- ):
- if "Flight + Train" in arg:
- self._origin = self._queried_orig
- self._dest = self._queried_dest
- self._has_train = True
- else:
- self._origin = arg[:3]
- self._dest = arg[3:]
-
- # layover
- # regex 1: matches "FCO, JFK, ABC, DEF", "5 min Ancona", "3 hr 13 min FCO", "FCO, JFK"
- elif (
- bool(re.search("\d{0,2} (?:min|hr) (\d{0,2} (?:min|hr))?\w+", arg))
- and self._stops_locations is None
- ):
- # get stops locations
- if "," in arg: # multiple stops
- self._stops_locations = arg
- else: # single stop
- self._stops_locations = arg.split(" ")[-1]
-
- # get stops time
- if "," in arg:
- self._stops = arg.split(", ")[0]
- else:
- self._stops = (
- re.search("([0-9]+ hr )?([0-9]+ min )?", arg).group().strip()
- )
-
- # airline
- elif len(arg) > 0 and (self._airline is None):
- if "Operated" in arg:
- airline = arg.split("Operated")[0]
- else:
- airline = arg
-
- # split camel case
- airline = re.sub("([a-z])([A-Z])", r"\1, \2", airline)
+ # number of stops (layover n)
+ if self._is_arg_layover_n(arg):
+ self._layover_n = self._parse_layover_n(arg)
+ return
- # make it into an array (list)
- airline = airline.split(", ")
+ # price
+ if self._is_arg_price(arg):
+ self._price = self._parse_price(arg)
+ return
- self._airline = airline
+ # origin and destination airports
+ if self._is_arg_orig_dest(arg):
+ self._origin, self._dest = self._parse_orig_dest(arg)
+ return
- # other (trash)
- else:
- self._trash += [arg]
+ # layover time and location(s)
+ if self._is_arg_layover(arg):
+ (
+ self._layover_time,
+ self._layover_location,
+ ) = self._parse_layover_times_location(arg)
+ return
- # if we have both arrival and departure time, set them
- if len(self._times) == 2:
- self._time_leave = self._times[0]
- self._time_arrive = self._times[1]
+        # airline (always keep this check last: it matches anything not captured above)
+ if self._is_arg_airline(arg):
+ self._airline = self._parse_airline(arg)
+ return
def _parse_args(self, args):
- for arg in args:
- self._classify_arg(arg)
+ # don't process if there are not enough arguments
+ if len(args) > 5:
+ for arg in args:
+ self._classify_arg(arg)
+
+ # print(self._debug())
@staticmethod
- def convert_duration_str_to_minutes(s):
+ def convert_duration_str_to_timedelta(s):
"""
Returns the duration in minutes from a string of the form:
3 hr 20 min --> 60*3 + 20 = 200
@@ -180,7 +306,7 @@ def convert_duration_str_to_minutes(s):
if "min" in s:
m = int(re.split("hr|min", s)[-2])
- return 60 * h + m
+ return timedelta(hours=h, minutes=m)
@staticmethod
def make_dataframe(flights):
@@ -204,27 +330,33 @@ def make_dataframe(flights):
"access_date": [],
"one_way": [],
"has_train": [],
- "days_advance": [],
+ # "days_advance": [],
}
# populate the dictionary
for flight in flights:
- data["departure_datetime"] += [flight._time_leave]
- data["arrival_datetime"] += [flight._time_arrive]
- data["airlines"] += [flight._airline]
- data["travel_time"] += [Flight.convert_duration_str_to_minutes(flight._flight_time)]
- data["origin"] += [flight._origin]
- data["destination"] += [flight._dest]
- data["layover_n"] += [flight._num_stops]
- data["layover_time"] += [Flight.convert_duration_str_to_minutes(flight._stops)]
- data["layover_location"] += [flight._stops_locations]
- data["price_eur"] += [flight._price]
- data["price_trend"] += [flight._price_trend[0]]
- data["price_value"] += [flight._price_trend[1]]
- data["access_date"] += [datetime.today()]
- data["one_way"] += [(False if flight._roundtrip else True)]
- data["has_train"] += [flight._has_train]
- data["days_advance"] += [(flight._time_leave - datetime.today()).days]
+ try:
+ data["departure_datetime"] += [flight._time_departure]
+ data["arrival_datetime"] += [flight._time_arrival]
+ data["airlines"] += [flight._airline]
+ data["travel_time"] += [flight._flight_time]
+ data["origin"] += [flight._origin]
+ data["destination"] += [flight._dest]
+ data["layover_n"] += [flight._layover_n]
+ data["layover_time"] += [flight._layover_time]
+ data["layover_location"] += [flight._layover_location]
+ data["price_eur"] += [flight._price]
+ data["price_trend"] += [flight._price_trend[0]]
+ data["price_value"] += [flight._price_trend[1]]
+ data["access_date"] += [datetime.today()]
+ data["one_way"] += [(False if flight._roundtrip else True)]
+ data["has_train"] += [flight._has_train]
+ # data["days_advance"] += [
+ # (flight._time_departure - datetime.today()).days
+ # ]
+ except Exception as e:
+ print("Error with flight", flight, flight._price)
+ print(e)
return pd.DataFrame(data)
diff --git a/src/flight_analysis/scrape.py b/src/flight_analysis/scrape.py
index 6a93daa..859ef9a 100644
--- a/src/flight_analysis/scrape.py
+++ b/src/flight_analysis/scrape.py
@@ -29,7 +29,7 @@ def __init__(self, orig, dest, date_leave, date_return=None):
self._round_trip = True if date_return is not None else False
self._data = None
self._url = None
-
+
@property
def data(self):
return self._data
@@ -106,7 +106,7 @@ def _get_results(self, driver):
logger.error(
f"Scrape timeout reached. It could mean that no flights exist for the combination of airports and dates."
)
- return -1
+ return None
flights = self._clean_results(results)
return Flight.make_dataframe(flights)
@@ -140,7 +140,7 @@ def _clean_results(self, result):
matches = []
# Enumerate over the list 'res3'
for index, element in enumerate(res3):
- # Check if the length of the element is more than 2
+        # Skip elements that are too short to be meaningful (length <= 2)
if len(element) <= 2:
continue
@@ -153,20 +153,30 @@ def _clean_results(self, result):
if element[-2] != "+" and is_time_format:
matches.append(index)
- # Keep only every second item in the matches list
- matches = matches[::2]
-
- flights = [
- Flight(
- self._date_leave, # date_leave
- self._round_trip, # round_trip
- self._orig,
- self._dest,
- price_trend,
- res3[matches[i] : matches[i + 1]],
- )
- for i in range(len(matches) - 1)
- ]
+        # Group time-match indices into whole flights: matches fewer than
+        # 4 indices apart belong to the same flight entry, so only the first
+        # index of each group is kept as a flight boundary.
+ matches_ok = [matches[0]]
+
+ for i in range(1, len(matches)):
+ if matches[i] - matches[i - 1] < 4:
+ continue
+ matches_ok.append(matches[i])
+
+ flights = []
+ for i in range(len(matches_ok) - 1):
+ flight_args = res3[matches_ok[i] : matches_ok[i + 1]]
+
+ if len(flight_args) > 5:
+ f = Flight(
+ self._date_leave, # date_leave
+ self._round_trip, # round_trip
+ self._orig,
+ self._dest,
+ price_trend,
+ flight_args,
+ )
+ flights.append(f)
return flights
| |