Skip to content

Commit

Permalink
add: split table into scraped and scraped_airlines
Browse files Browse the repository at this point in the history
  • Loading branch information
esalonico committed Aug 4, 2023
1 parent 56abd96 commit fa0ad2a
Show file tree
Hide file tree
Showing 4 changed files with 194 additions and 112 deletions.
188 changes: 107 additions & 81 deletions flight_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -78,10 +78,10 @@
" <td>511</td>\n",
" <td>typical</td>\n",
" <td>None</td>\n",
" <td>2023-08-03 19:44:53.852230</td>\n",
" <td>2023-08-04 23:25:00.186922</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>85</td>\n",
" <td>84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
Expand All @@ -97,10 +97,10 @@
" <td>627</td>\n",
" <td>typical</td>\n",
" <td>None</td>\n",
" <td>2023-08-03 19:44:53.852262</td>\n",
" <td>2023-08-04 23:25:00.186958</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>85</td>\n",
" <td>84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
Expand All @@ -116,10 +116,10 @@
" <td>706</td>\n",
" <td>typical</td>\n",
" <td>None</td>\n",
" <td>2023-08-03 19:44:53.852267</td>\n",
" <td>2023-08-04 23:25:00.186963</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>85</td>\n",
" <td>84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
Expand All @@ -135,10 +135,10 @@
" <td>586</td>\n",
" <td>typical</td>\n",
" <td>None</td>\n",
" <td>2023-08-03 19:44:53.852273</td>\n",
" <td>2023-08-04 23:25:00.186968</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>85</td>\n",
" <td>84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
Expand All @@ -154,29 +154,29 @@
" <td>670</td>\n",
" <td>typical</td>\n",
" <td>None</td>\n",
" <td>2023-08-03 19:44:53.852278</td>\n",
" <td>2023-08-04 23:25:00.186973</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>85</td>\n",
" <td>84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>2023-10-28 11:30:00</td>\n",
" <td>2023-10-28 21:40:00</td>\n",
" <td>[Delta, Virgin Atlantic, KLM]</td>\n",
" <td>970</td>\n",
" <td>2023-10-28 07:00:00</td>\n",
" <td>2023-10-28 15:25:00</td>\n",
" <td>[KLMDelta]</td>\n",
" <td>865</td>\n",
" <td>MUC</td>\n",
" <td>JFK</td>\n",
" <td>1</td>\n",
" <td>270.0</td>\n",
" <td>DTW</td>\n",
" <td>723</td>\n",
" <td>290.0</td>\n",
" <td>AMS</td>\n",
" <td>706</td>\n",
" <td>typical</td>\n",
" <td>None</td>\n",
" <td>2023-08-03 19:44:53.852282</td>\n",
" <td>2023-08-04 23:25:00.186978</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>85</td>\n",
" <td>84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
Expand All @@ -192,10 +192,10 @@
" <td>787</td>\n",
" <td>typical</td>\n",
" <td>None</td>\n",
" <td>2023-08-03 19:44:53.852287</td>\n",
" <td>2023-08-04 23:25:00.186982</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>85</td>\n",
" <td>84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
Expand All @@ -211,10 +211,10 @@
" <td>812</td>\n",
" <td>typical</td>\n",
" <td>None</td>\n",
" <td>2023-08-03 19:44:53.852291</td>\n",
" <td>2023-08-04 23:25:00.186987</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>85</td>\n",
" <td>84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
Expand All @@ -230,10 +230,10 @@
" <td>830</td>\n",
" <td>typical</td>\n",
" <td>None</td>\n",
" <td>2023-08-03 19:44:53.852296</td>\n",
" <td>2023-08-04 23:25:00.186992</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>85</td>\n",
" <td>84</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
Expand All @@ -246,66 +246,66 @@
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>None</td>\n",
" <td>1316</td>\n",
" <td>1317</td>\n",
" <td>typical</td>\n",
" <td>None</td>\n",
" <td>2023-08-03 19:44:53.852300</td>\n",
" <td>2023-08-04 23:25:00.186995</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>85</td>\n",
" <td>84</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" departure_datetime arrival_datetime airlines \\\n",
"0 2023-10-28 14:05:00 2023-10-28 19:10:00 [Icelandair] \n",
"1 2023-10-28 09:55:00 2023-10-28 20:20:00 [LOT] \n",
"2 2023-10-28 11:50:00 2023-10-28 16:35:00 [KLMDelta] \n",
"3 2023-10-28 11:00:00 2023-10-28 20:20:00 [Lufthansa, LOT] \n",
"4 2023-10-28 10:25:00 2023-10-28 23:37:00 [Delta, Virgin Atlantic] \n",
"5 2023-10-28 11:30:00 2023-10-28 21:40:00 [Delta, Virgin Atlantic, KLM] \n",
"6 2023-10-28 11:25:00 2023-10-28 19:20:00 [Aer Lingus] \n",
"7 2023-10-28 14:05:00 2023-10-28 19:15:00 [KLMDelta] \n",
"8 2023-10-28 09:00:00 2023-10-28 14:25:00 [Lufthansa, Condor] \n",
"9 2023-10-28 12:10:00 2023-10-28 15:00:00 [Lufthansa, United] \n",
" departure_datetime arrival_datetime airlines \\\n",
"0 2023-10-28 14:05:00 2023-10-28 19:10:00 [Icelandair] \n",
"1 2023-10-28 09:55:00 2023-10-28 20:20:00 [LOT] \n",
"2 2023-10-28 11:50:00 2023-10-28 16:35:00 [KLMDelta] \n",
"3 2023-10-28 11:00:00 2023-10-28 20:20:00 [Lufthansa, LOT] \n",
"4 2023-10-28 10:25:00 2023-10-28 23:37:00 [Delta, Virgin Atlantic] \n",
"5 2023-10-28 07:00:00 2023-10-28 15:25:00 [KLMDelta] \n",
"6 2023-10-28 11:25:00 2023-10-28 19:20:00 [Aer Lingus] \n",
"7 2023-10-28 14:05:00 2023-10-28 19:15:00 [KLMDelta] \n",
"8 2023-10-28 09:00:00 2023-10-28 14:25:00 [Lufthansa, Condor] \n",
"9 2023-10-28 12:10:00 2023-10-28 15:00:00 [Lufthansa, United] \n",
"\n",
" travel_time origin destination layover_n layover_time layover_location \\\n",
"0 665 MUC JFK 1 60.0 KEF \n",
"1 985 MUC JFK 1 320.0 WAW \n",
"2 645 MUC JFK 1 70.0 AMS \n",
"3 920 MUC JFK 1 260.0 WAW \n",
"4 1152 MUC JFK 1 385.0 ATL \n",
"5 970 MUC JFK 1 270.0 DTW \n",
"5 865 MUC JFK 1 290.0 AMS \n",
"6 835 MUC JFK 1 220.0 DUB \n",
"7 670 MUC JFK 1 95.0 AMS \n",
"8 685 MUC JFK 1 105.0 FRA \n",
"9 530 MUC JFK 0 NaN None \n",
"\n",
" price_eur price_trend price_value access_date one_way \\\n",
"0 511 typical None 2023-08-03 19:44:53.852230 True \n",
"1 627 typical None 2023-08-03 19:44:53.852262 True \n",
"2 706 typical None 2023-08-03 19:44:53.852267 True \n",
"3 586 typical None 2023-08-03 19:44:53.852273 True \n",
"4 670 typical None 2023-08-03 19:44:53.852278 True \n",
"5 723 typical None 2023-08-03 19:44:53.852282 True \n",
"6 787 typical None 2023-08-03 19:44:53.852287 True \n",
"7 812 typical None 2023-08-03 19:44:53.852291 True \n",
"8 830 typical None 2023-08-03 19:44:53.852296 True \n",
"9 1316 typical None 2023-08-03 19:44:53.852300 True \n",
"0 511 typical None 2023-08-04 23:25:00.186922 True \n",
"1 627 typical None 2023-08-04 23:25:00.186958 True \n",
"2 706 typical None 2023-08-04 23:25:00.186963 True \n",
"3 586 typical None 2023-08-04 23:25:00.186968 True \n",
"4 670 typical None 2023-08-04 23:25:00.186973 True \n",
"5 706 typical None 2023-08-04 23:25:00.186978 True \n",
"6 787 typical None 2023-08-04 23:25:00.186982 True \n",
"7 812 typical None 2023-08-04 23:25:00.186987 True \n",
"8 830 typical None 2023-08-04 23:25:00.186992 True \n",
"9 1317 typical None 2023-08-04 23:25:00.186995 True \n",
"\n",
" has_train days_advance \n",
"0 False 85 \n",
"1 False 85 \n",
"2 False 85 \n",
"3 False 85 \n",
"4 False 85 \n",
"5 False 85 \n",
"6 False 85 \n",
"7 False 85 \n",
"8 False 85 \n",
"9 False 85 "
"0 False 84 \n",
"1 False 84 \n",
"2 False 84 \n",
"3 False 84 \n",
"4 False 84 \n",
"5 False 84 \n",
"6 False 84 \n",
"7 False 84 \n",
"8 False 84 \n",
"9 False 84 "
]
},
"execution_count": 3,
Expand All @@ -319,7 +319,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"outputs": [
{
Expand All @@ -328,7 +328,7 @@
"'https://www.google.com/travel/flights?q=Flights%20to%20JFK%20from%20MUC%20on%202023-10-28%20oneway&curr=EUR&gl=IT'"
]
},
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -339,29 +339,55 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"'https://www.google.com/travel/flights?q=Flights%20to%20JFK%20from%20%28MUC%2CFCO%29%20on%202023-10-28%20oneway&curr=EUR&gl=IT'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import private.private as private\n",
"from src.flight_analysis.database import Database\n",
"\n",
"db = Database(\n",
" db_host=private.DB_HOST,\n",
" db_name=private.DB_NAME,\n",
" db_user=private.DB_USER,\n",
" db_pw=private.DB_PW,\n",
" db_table=private.DB_TABLE,\n",
")\n",
"\n",
"db\n",
"base = \"https://www.google.com/travel/flights?q=Flights%20to%20JFK%20from%20MUC%20on%202023-10-28%20oneway&curr=EUR&gl=IT\"\n",
"\n",
"cursor = db.conn.cursor()\n",
"cursor.execute(f\"SELECT * FROM {self.db_name}\")\n",
"a = \"https://www.google.com/travel/flights?q=Flights%20to%20JFK%20from%20%28MUC%2CFCO%29%20on%202023-10-28%20oneway&curr=EUR&gl=IT\"\n",
"a"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "AssertionError",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m/Users/emanuelesalonico/Library/CloudStorage/[email protected]/My Drive/SYNC/Dev/flight-analysis/flight_analysis.ipynb Cell 6\u001b[0m in \u001b[0;36m<cell line: 4>\u001b[0;34m()\u001b[0m\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/emanuelesalonico/Library/CloudStorage/GoogleDrive-esalonico%40gmail.com/My%20Drive/SYNC/Dev/flight-analysis/flight_analysis.ipynb#X10sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m parser \u001b[39m=\u001b[39m StandardParser()\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/emanuelesalonico/Library/CloudStorage/GoogleDrive-esalonico%40gmail.com/My%20Drive/SYNC/Dev/flight-analysis/flight_analysis.ipynb#X10sZmlsZQ%3D%3D?line=3'>4</a>\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mtest\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39mrb\u001b[39m\u001b[39m'\u001b[39m) \u001b[39mas\u001b[39;00m fh:\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/emanuelesalonico/Library/CloudStorage/GoogleDrive-esalonico%40gmail.com/My%20Drive/SYNC/Dev/flight-analysis/flight_analysis.ipynb#X10sZmlsZQ%3D%3D?line=4'>5</a>\u001b[0m \u001b[39m# print(fh.read())\u001b[39;00m\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/emanuelesalonico/Library/CloudStorage/GoogleDrive-esalonico%40gmail.com/My%20Drive/SYNC/Dev/flight-analysis/flight_analysis.ipynb#X10sZmlsZQ%3D%3D?line=5'>6</a>\u001b[0m output \u001b[39m=\u001b[39m parser\u001b[39m.\u001b[39;49mparse_message(fh, \u001b[39m\"\u001b[39;49m\u001b[39mmessage\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/emanuelesalonico/Library/CloudStorage/GoogleDrive-esalonico%40gmail.com/My%20Drive/SYNC/Dev/flight-analysis/flight_analysis.ipynb#X10sZmlsZQ%3D%3D?line=6'>7</a>\u001b[0m \u001b[39mprint\u001b[39m(output)\n",
"File \u001b[0;32m~/miniforge3/envs/flight-analysis/lib/python3.10/site-packages/protobuf_inspector/types.py:75\u001b[0m, in \u001b[0;36mStandardParser.parse_message\u001b[0;34m(self, file, gtype, endgroup)\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mtype\u001b[39m \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m: \u001b[39mtype\u001b[39m \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mmessage\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 74\u001b[0m end \u001b[39m=\u001b[39m [\u001b[39mNone\u001b[39;00m]\n\u001b[0;32m---> 75\u001b[0m x \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mparse_message(file, \u001b[39mtype\u001b[39;49m, end)\n\u001b[1;32m 76\u001b[0m x \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mgroup (end \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m) \u001b[39m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m fg4(\u001b[39mstr\u001b[39m(end[\u001b[39m0\u001b[39m])) \u001b[39m+\u001b[39m x\n\u001b[1;32m 77\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mgroups_observed \u001b[39m=\u001b[39m \u001b[39mTrue\u001b[39;00m\n",
"File \u001b[0;32m~/miniforge3/envs/flight-analysis/lib/python3.10/site-packages/protobuf_inspector/types.py:59\u001b[0m, in \u001b[0;36mStandardParser.parse_message\u001b[0;34m(self, file, gtype, endgroup)\u001b[0m\n\u001b[1;32m 56\u001b[0m key, wire_type \u001b[39m=\u001b[39m read_identifier(file)\n\u001b[1;32m 57\u001b[0m \u001b[39mif\u001b[39;00m key \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m: \u001b[39mbreak\u001b[39;00m\n\u001b[0;32m---> 59\u001b[0m x \u001b[39m=\u001b[39m read_value(file, wire_type)\n\u001b[1;32m 60\u001b[0m \u001b[39massert\u001b[39;00m(\u001b[39mnot\u001b[39;00m (x \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m))\n\u001b[1;32m 62\u001b[0m \u001b[39mif\u001b[39;00m wire_type \u001b[39m==\u001b[39m \u001b[39m4\u001b[39m:\n",
"File \u001b[0;32m~/miniforge3/envs/flight-analysis/lib/python3.10/site-packages/protobuf_inspector/core.py:38\u001b[0m, in \u001b[0;36mread_value\u001b[0;34m(file, wire_type)\u001b[0m\n\u001b[1;32m 36\u001b[0m \u001b[39mif\u001b[39;00m length \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m: \u001b[39mreturn\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 37\u001b[0m c \u001b[39m=\u001b[39m file\u001b[39m.\u001b[39mread(length)\n\u001b[0;32m---> 38\u001b[0m \u001b[39massert\u001b[39;00m(\u001b[39mlen\u001b[39m(c) \u001b[39m==\u001b[39m length)\n\u001b[1;32m 39\u001b[0m \u001b[39mreturn\u001b[39;00m io\u001b[39m.\u001b[39mBytesIO(c)\n\u001b[1;32m 40\u001b[0m \u001b[39mif\u001b[39;00m wire_type \u001b[39m==\u001b[39m \u001b[39m3\u001b[39m \u001b[39mor\u001b[39;00m wire_type \u001b[39m==\u001b[39m \u001b[39m4\u001b[39m:\n",
"\u001b[0;31mAssertionError\u001b[0m: "
]
}
],
"source": [
"from protobuf_inspector.types import StandardParser\n",
"\n",
"for row in cursor:\n",
" print(row)\n",
" break"
"parser = StandardParser()\n",
"with open('test', 'rb') as fh:\n",
" # print(fh.read())\n",
" output = parser.parse_message(fh, \"message\")\n",
"print(output)"
]
}
],
Expand Down
38 changes: 34 additions & 4 deletions flight_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import utils
import os
import sys
import uuid

# logging
logger_name = os.path.basename(__file__)
Expand Down Expand Up @@ -103,7 +104,29 @@ def get_routes_df(routes: list):

n_iter += 1

return pd.concat(all_results)
final_df = pd.concat(all_results).reset_index(drop=True)
final_df["uuid"] = [uuid.uuid4() for _ in range(final_df.shape[0])]
final_df = final_df.set_index("uuid")

return final_df


def generate_airline_df_from_flights(flights_df: pd.DataFrame) -> pd.DataFrame:
    """
    Build a one-airline-per-row dataframe from a flights dataframe.

    Each entry of the ``airlines`` column is expanded into one row per
    element, so a flight operated by several airlines yields several rows
    that all share that flight's index label. The resulting single column
    is named ``airline``.

    Args:
        flights_df: Flights dataframe containing an ``airlines`` column.
            The caller's dataframe is not modified.

    Returns:
        A dataframe with a single ``airline`` column, indexed by the index
        label of the flight each airline belongs to.

    Raises:
        KeyError: If ``flights_df`` has no ``airlines`` column.
    """
    # A non-unique index cannot identify a single flight, so fall back to a
    # fresh 0..n-1 index. reset_index returns a new frame; only the local
    # name is rebound.
    # NOTE(review): in this fallback the returned index no longer matches
    # the labels of the caller's flights dataframe -- confirm callers
    # always pass a uniquely indexed frame.
    if not flights_df.index.is_unique:
        flights_df = flights_df.reset_index(drop=True)

    # Keep only the column we need *before* exploding, so the remaining
    # flight columns are not duplicated for every airline just to be
    # discarded right after.
    airlines_df = flights_df[["airlines"]].explode("airlines")

    # Singular name: each row now holds exactly one airline.
    return airlines_df.rename(columns={"airlines": "airline"})


if __name__ == "__main__":
Expand All @@ -115,21 +138,28 @@ def get_routes_df(routes: list):
# scrape routes into a dataframe
scraped_flights = get_routes_df(routes)

# generate an airline dataframe
scraped_airlines = generate_airline_df_from_flights(scraped_flights)

# drop airlines from flights dataframe
# scraped_flights = scraped_flights.drop(columns=["airlines"])

# connect to database
db = Database(
db_host=private.DB_HOST,
db_name=private.DB_NAME,
db_user=private.DB_USER,
db_pw=private.DB_PW,
db_table=private.DB_TABLE,
)

# prepare database and tables
db.prepare_db_and_tables(overwrite_table=False)
db.prepare_db_and_tables()

# add results to database
if not SKIP_SAVE_TO_DB:
db.add_pandas_df_to_db(scraped_flights)
db.add_pandas_df_to_db(scraped_flights, table_name=db.table_scraped)
db.add_pandas_df_to_db(scraped_airlines, table_name=db.table_scraped_airlines)


# if it's a monday, backup the database
if datetime.today().weekday() == 0:
Expand Down
Loading

0 comments on commit fa0ad2a

Please sign in to comment.