diff --git a/README.md b/README.md index a765cdf..48ffe9e 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ The whole project is written has been created with Python 3.11 and is based on t A very simple example of the main scraping functionality could be the following (get all flight from Munich (MUC) to Los Angeles (LAX) on May 28th, 2023): ``` -from google_flight_analysis.scrape import * +from flight_analysis.scrape import * flights = Scrape("MUC", "LAX", "2023-05-28") flights.data diff --git a/flight_analysis.ipynb b/flight_analysis.ipynb index 1dd64c9..4ed10ce 100644 --- a/flight_analysis.ipynb +++ b/flight_analysis.ipynb @@ -2,56 +2,26 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [ - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mRunning cells with '/Users/emanuelesalonico/opt/anaconda3/envs/accenture/bin/python' requires the ipykernel package.\n", - "\u001b[1;31mRun the following command to install 'ipykernel' into the Python environment. \n", - "\u001b[1;31mCommand: 'conda install -p /Users/emanuelesalonico/opt/anaconda3/envs/accenture ipykernel --update-deps --force-reinstall'" - ] - } - ], + "outputs": [], "source": [ - "from src.google_flight_analysis.scrape import Scrape" + "from src.flight_analysis.scrape import Scrape" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, - "outputs": [ - { - "ename": "WebDriverException", - "evalue": "Message: disconnected: not connected to DevTools\n (failed to check if window was closed: disconnected: not connected to DevTools)\n (Session info: headless chrome=114.0.5735.198)\nStacktrace:\n0 chromedriver 0x0000000105333f48 chromedriver + 4226888\n1 chromedriver 0x000000010532c4f4 chromedriver + 4195572\n2 chromedriver 0x0000000104f70d68 chromedriver + 281960\n3 chromedriver 0x0000000104f59db4 chromedriver + 187828\n4 chromedriver 0x0000000104f7ad54 chromedriver + 322900\n5 chromedriver 0x0000000104fe2e30 chromedriver + 749104\n6 chromedriver 0x0000000104f9ff1c chromedriver + 474908\n7 chromedriver 0x0000000104fa0ef4 chromedriver + 478964\n8 chromedriver 0x00000001052f559c chromedriver + 3970460\n9 chromedriver 0x00000001052f96f0 chromedriver + 3987184\n10 chromedriver 0x00000001052ff5b4 chromedriver + 4011444\n11 chromedriver 0x00000001052fa2fc chromedriver + 3990268\n12 chromedriver 0x00000001052d21c0 chromedriver + 3826112\n13 chromedriver 0x0000000105316088 chromedriver + 4104328\n14 chromedriver 0x00000001053161e0 chromedriver + 4104672\n15 chromedriver 0x0000000105325f28 chromedriver + 4169512\n16 libsystem_pthread.dylib 0x00000001a788bfa8 _pthread_start + 148\n17 libsystem_pthread.dylib 0x00000001a7886da0 thread_start + 8\n", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mWebDriverException\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m flights \u001b[39m=\u001b[39m Scrape(\u001b[39m\"\u001b[39m\u001b[39mMUC\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mLAX\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39m2023-10-28\u001b[39m\u001b[39m\"\u001b[39m, export\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m)\n\u001b[0;32m----> 2\u001b[0m flights\u001b[39m.\u001b[39;49mrun_scrape()\n", - "File \u001b[0;32m~/Library/CloudStorage/GoogleDrive-esalonico@gmail.com/My Drive/SYNC/Dev/Flights/flight-analysis/src/google_flight_analysis/scrape.py:39\u001b[0m, in \u001b[0;36mScrape.run_scrape\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mrun_scrape\u001b[39m(\u001b[39mself\u001b[39m):\n\u001b[0;32m---> 39\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_scrape_data()\n\u001b[1;32m 41\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_export:\n\u001b[1;32m 42\u001b[0m Flight\u001b[39m.\u001b[39mexport_to_csv(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_data, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_origin,\n\u001b[1;32m 43\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_dest, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_date_leave, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_date_return)\n", - "File \u001b[0;32m~/Library/CloudStorage/GoogleDrive-esalonico@gmail.com/My Drive/SYNC/Dev/Flights/flight-analysis/src/google_flight_analysis/scrape.py:143\u001b[0m, in \u001b[0;36mScrape._scrape_data\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 141\u001b[0m driver \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcreate_driver()\n\u001b[1;32m 142\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_url \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_make_url()\n\u001b[0;32m--> 143\u001b[0m flight_results \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_get_results(driver)\n\u001b[1;32m 144\u001b[0m driver\u001b[39m.\u001b[39mquit()\n\u001b[1;32m 146\u001b[0m \u001b[39mreturn\u001b[39;00m flight_results\n", - "File \u001b[0;32m~/Library/CloudStorage/GoogleDrive-esalonico@gmail.com/My Drive/SYNC/Dev/Flights/flight-analysis/src/google_flight_analysis/scrape.py:172\u001b[0m, in \u001b[0;36mScrape._get_results\u001b[0;34m(self, driver)\u001b[0m\n\u001b[1;32m 170\u001b[0m results \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 171\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 172\u001b[0m results \u001b[39m=\u001b[39m Scrape\u001b[39m.\u001b[39;49m_make_url_request(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_url, driver)\n\u001b[1;32m 173\u001b[0m \u001b[39mexcept\u001b[39;00m TimeoutException:\n\u001b[1;32m 174\u001b[0m logger\u001b[39m.\u001b[39merror(\n\u001b[1;32m 175\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mScrape timeout reached. It could mean that no flights exist for the combination of airports and dates.\u001b[39m\u001b[39m\"\u001b[39m)\n", - "File \u001b[0;32m~/Library/CloudStorage/GoogleDrive-esalonico@gmail.com/My Drive/SYNC/Dev/Flights/flight-analysis/src/google_flight_analysis/scrape.py:284\u001b[0m, in \u001b[0;36mScrape._make_url_request\u001b[0;34m(url, driver)\u001b[0m\n\u001b[1;32m 281\u001b[0m driver\u001b[39m.\u001b[39mget(url)\n\u001b[1;32m 283\u001b[0m \u001b[39m# detect Google's Terms & Conditions page (not always there, only in EU)\u001b[39;00m\n\u001b[0;32m--> 284\u001b[0m \u001b[39mif\u001b[39;00m Scrape\u001b[39m.\u001b[39m_identify_google_terms_page(driver\u001b[39m.\u001b[39;49mpage_source):\n\u001b[1;32m 285\u001b[0m WebDriverWait(driver, timeout)\u001b[39m.\u001b[39muntil(\n\u001b[1;32m 286\u001b[0m \u001b[39mlambda\u001b[39;00m s: Scrape\u001b[39m.\u001b[39m_identify_google_terms_page(s\u001b[39m.\u001b[39mpage_source))\n\u001b[1;32m 288\u001b[0m \u001b[39m# click on accept terms button\u001b[39;00m\n", - "File \u001b[0;32m~/Library/CloudStorage/GoogleDrive-esalonico@gmail.com/My Drive/SYNC/Dev/Flights/flight-analysis/.venv/lib/python3.11/site-packages/selenium/webdriver/remote/webdriver.py:541\u001b[0m, in \u001b[0;36mWebDriver.page_source\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 532\u001b[0m \u001b[39m@property\u001b[39m\n\u001b[1;32m 533\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mpage_source\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mstr\u001b[39m:\n\u001b[1;32m 534\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Gets the source of the current page.\u001b[39;00m\n\u001b[1;32m 535\u001b[0m \n\u001b[1;32m 536\u001b[0m \u001b[39m :Usage:\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 539\u001b[0m \u001b[39m driver.page_source\u001b[39;00m\n\u001b[1;32m 540\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 541\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mexecute(Command\u001b[39m.\u001b[39;49mGET_PAGE_SOURCE)[\u001b[39m\"\u001b[39m\u001b[39mvalue\u001b[39m\u001b[39m\"\u001b[39m]\n", - "File \u001b[0;32m~/Library/CloudStorage/GoogleDrive-esalonico@gmail.com/My Drive/SYNC/Dev/Flights/flight-analysis/.venv/lib/python3.11/site-packages/selenium/webdriver/remote/webdriver.py:440\u001b[0m, in \u001b[0;36mWebDriver.execute\u001b[0;34m(self, driver_command, params)\u001b[0m\n\u001b[1;32m 438\u001b[0m response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcommand_executor\u001b[39m.\u001b[39mexecute(driver_command, params)\n\u001b[1;32m 439\u001b[0m \u001b[39mif\u001b[39;00m response:\n\u001b[0;32m--> 440\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49merror_handler\u001b[39m.\u001b[39;49mcheck_response(response)\n\u001b[1;32m 441\u001b[0m response[\u001b[39m\"\u001b[39m\u001b[39mvalue\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_unwrap_value(response\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mvalue\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m))\n\u001b[1;32m 442\u001b[0m \u001b[39mreturn\u001b[39;00m response\n", - "File \u001b[0;32m~/Library/CloudStorage/GoogleDrive-esalonico@gmail.com/My Drive/SYNC/Dev/Flights/flight-analysis/.venv/lib/python3.11/site-packages/selenium/webdriver/remote/errorhandler.py:245\u001b[0m, in \u001b[0;36mErrorHandler.check_response\u001b[0;34m(self, response)\u001b[0m\n\u001b[1;32m 243\u001b[0m alert_text \u001b[39m=\u001b[39m value[\u001b[39m\"\u001b[39m\u001b[39malert\u001b[39m\u001b[39m\"\u001b[39m]\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mtext\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 244\u001b[0m \u001b[39mraise\u001b[39;00m exception_class(message, screen, stacktrace, alert_text) \u001b[39m# type: ignore[call-arg] # mypy is not smart enough here\u001b[39;00m\n\u001b[0;32m--> 245\u001b[0m \u001b[39mraise\u001b[39;00m exception_class(message, screen, stacktrace)\n", - "\u001b[0;31mWebDriverException\u001b[0m: Message: disconnected: not connected to DevTools\n (failed to check if window was closed: disconnected: not connected to DevTools)\n (Session info: headless chrome=114.0.5735.198)\nStacktrace:\n0 chromedriver 0x0000000105333f48 chromedriver + 4226888\n1 chromedriver 0x000000010532c4f4 chromedriver + 4195572\n2 chromedriver 0x0000000104f70d68 chromedriver + 281960\n3 chromedriver 0x0000000104f59db4 chromedriver + 187828\n4 chromedriver 0x0000000104f7ad54 chromedriver + 322900\n5 chromedriver 0x0000000104fe2e30 chromedriver + 749104\n6 chromedriver 0x0000000104f9ff1c chromedriver + 474908\n7 chromedriver 0x0000000104fa0ef4 chromedriver + 478964\n8 chromedriver 0x00000001052f559c chromedriver + 3970460\n9 chromedriver 0x00000001052f96f0 chromedriver + 3987184\n10 chromedriver 0x00000001052ff5b4 chromedriver + 4011444\n11 chromedriver 0x00000001052fa2fc chromedriver + 3990268\n12 chromedriver 0x00000001052d21c0 chromedriver + 3826112\n13 chromedriver 0x0000000105316088 chromedriver + 4104328\n14 chromedriver 0x00000001053161e0 chromedriver + 4104672\n15 chromedriver 0x0000000105325f28 chromedriver + 4169512\n16 libsystem_pthread.dylib 0x00000001a788bfa8 _pthread_start + 148\n17 libsystem_pthread.dylib 0x00000001a7886da0 thread_start + 8\n" - ] - } - ], + "outputs": [], "source": [ - "flights = Scrape(\"MUC\", \"LAX\", \"2023-10-28\", export=True)\n", + "flights = Scrape(\"MUC\", \"LAX\", \"2023-10-28\", export=False)\n", "flights.run_scrape()" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -96,226 +66,203 @@ "
\n", "