code refactor
esalonico committed Aug 3, 2023
1 parent daf748f commit b491f6f
Showing 10 changed files with 396 additions and 31 deletions.
7 changes: 1 addition & 6 deletions .gitignore
@@ -12,12 +12,7 @@ __pycache__/
 # custom
 backups
 private
-outputs
-prova.py
-temp.txt
-temp.ipynb
-main.ipynb
-simulate_data.ipynb
+scrapes_csv

 # logging
 logs/
1 change: 0 additions & 1 deletion README.md
@@ -32,7 +32,6 @@ The whole project has been created with Python 3.11 and is based on t
 - numpy
 - matplotlib
 - selenium
-- tqdm
 - pytest

 A very simple example of the main scraping functionality could be the following (get all flights from Munich (MUC) to Los Angeles (LAX) on May 28th, 2023):
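The README's example itself is collapsed in this diff view. Judging from the Scrape API exercised in tests/test_scrape.py further down (a constructor taking origin, destination and date, run_scrape(), and a .data DataFrame), it presumably looks something like this sketch; the MUC/LAX route and date come from the sentence above, everything else is inferred rather than quoted:

```python
from src.google_flight_analysis.scrape import Scrape

# Inferred usage, mirroring tests/test_scrape.py; not the literal README snippet.
scrape_obj = Scrape("MUC", "LAX", "2023-05-28")  # origin, destination, departure date
scrape_obj.run_scrape()                          # runs the Selenium-based scrape
print(scrape_obj.data)                           # results land in a pandas DataFrame
```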
370 changes: 370 additions & 0 deletions flight_analysis.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion flight_analysis.py
@@ -18,7 +18,7 @@

 # config
 config = configparser.ConfigParser()
-config.read(os.path.join(os.path.dirname(__file__), "config.ini"))
+config.read(os.path.join(os.path.dirname(__file__), "routes.ini"))


 if __name__ == "__main__":
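For context on the rename: get_routes_from_config in utils.py (last file below) calls json.loads on every value under a [routes] section, so routes.ini presumably stores one JSON array per route. A hedged sketch, with made-up route values:

```python
import configparser
import json

# Hypothetical routes.ini contents; the [routes] section name comes from
# utils.get_routes_from_config, the values are purely illustrative:
#
#   [routes]
#   route_1 = ["MUC", "LAX", "2023-05-28"]
#   route_2 = ["MUC", "FCO", "2023-06-01"]

config = configparser.ConfigParser()
config.read("routes.ini")  # silently yields an empty config if the file is missing

routes = [json.loads(config["routes"][route]) for route in config["routes"]]
print(routes)  # [['MUC', 'LAX', '2023-05-28'], ['MUC', 'FCO', '2023-06-01']]
```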
2 changes: 0 additions & 2 deletions requirements.txt
@@ -1,9 +1,7 @@
-tqdm
 numpy
 pandas
 selenium
 webdriver_manager
 pytest
 pymongo
 configparser
 psycopg2-binary
File renamed without changes.
8 changes: 5 additions & 3 deletions src/google_flight_analysis/flight.py
@@ -4,7 +4,6 @@
 from datetime import date, datetime, timedelta
 import numpy as np
 import pandas as pd
-from tqdm import tqdm
 import re
 from os import path

@@ -316,13 +315,16 @@ def export_to_csv(df, origin, dest, date_leave, date_return=None):
     Format:
     {access_date_YYMMDD}_{access_time_HHMM}_{orig}_{dest}_{days_advance}_{leave_date_YYMMDD}_{return_date_YYMMDD}
     """
-    folder = "outputs"
+    folder = "scrapes_csv"
+    folder = path.join(path.dirname(__file__), folder)
+    print("folder is", folder)


     # check if output folder exists
     if not path.isdir(folder):
         raise FileNotFoundError(f"Check if folder {folder} exists")

-    access_date = datetime.strptime(df["access_date"][0], "%Y-%m-%d %H:%M:%S").strftime("%y%m%d_%H%M")
+    access_date = df["access_date"][0].to_pydatetime().strftime("%y%m%d_%H%M")
     days_in_advance = df["days_advance"].min()
     leave_date = datetime.strptime(date_leave, "%Y-%m-%d").strftime("%y%m%d")
     return_date = (datetime.strptime(date_return, "%Y-%m-%d").strftime("%y%m%d") if date_return else None)
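The access_date change reflects that df["access_date"] now holds pandas Timestamps rather than strings, so strptime no longer applies. A small sketch of how the documented filename pieces come together (the airport codes, days_advance value, and missing return date are illustrative):

```python
from datetime import datetime
import pandas as pd

# df["access_date"][0] is a pandas Timestamp when the column has dtype datetime64
ts = pd.Timestamp("2023-05-01 09:30:00")
access = ts.to_pydatetime().strftime("%y%m%d_%H%M")                      # '230501_0930'
leave = datetime.strptime("2023-05-28", "%Y-%m-%d").strftime("%y%m%d")   # '230528'

# {access_date}_{access_time}_{orig}_{dest}_{days_advance}_{leave_date}_{return_date}
print(f"{access}_MUC_LAX_27_{leave}_None")  # 230501_0930_MUC_LAX_27_230528_None
```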
4 changes: 2 additions & 2 deletions src/google_flight_analysis/scrape.py
@@ -15,7 +15,6 @@
 import os
 import numpy as np
 import pandas as pd
-from tqdm import tqdm

 from src.google_flight_analysis.flight import Flight

@@ -172,7 +171,8 @@ def _get_results(self, driver):
         try:
             results = Scrape._make_url_request(self._url, driver)
         except TimeoutException:
-            logger.error(f"Scrape timeout reached. It could mean that no flights exist for the combination of airports and dates." )
+            logger.error(
+                f"Scrape timeout reached. It could mean that no flights exist for the combination of airports and dates.")
             return -1

         flights = self._clean_results(results)
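In isolation, the pattern above — mapping a Selenium timeout to a -1 sentinel on the assumption that it usually means an empty result set — looks roughly like this (a minimal sketch; only TimeoutException and the log message are taken from the code above, the rest is illustrative):

```python
import logging
from selenium.common.exceptions import TimeoutException

logger = logging.getLogger(__name__)

def fetch_or_sentinel(fetch, url, driver):
    """Run a page request; turn a timeout into a -1 sentinel instead of raising."""
    try:
        return fetch(url, driver)  # stand-in for Scrape._make_url_request
    except TimeoutException:
        # A timeout often just means no flights matched the query.
        logger.error("Scrape timeout reached. It could mean that no flights "
                     "exist for the combination of airports and dates.")
        return -1
```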
23 changes: 11 additions & 12 deletions tests/test_scrape.py
@@ -11,24 +11,23 @@


 def test_database_connection():
-    db = Database(db_host=private.DB_HOST, db_name=private.DB_NAME, db_user=private.DB_USER, db_pw=private.DB_PW, db_table=private.DB_TABLE)
+    db = Database(db_host=private.DB_HOST, db_name=private.DB_NAME,
+                  db_user=private.DB_USER, db_pw=private.DB_PW, db_table=private.DB_TABLE)
     try:
         conn = db.connect_to_postgresql()
     except ConnectionError as e:
         assert False, e
-def test_dataset_generation():
-    ten_days_ahead = (datetime.today() + timedelta(5)).strftime("%Y-%m-%d")
-    scrape_obj = Scrape("MUC", "FCO", ten_days_ahead)
-    scrape_obj.run_scrape()
-    assert isinstance(scrape_obj.data, pd.DataFrame)


+# def test_dataset_generation():
+#     ten_days_ahead = (datetime.today() + timedelta(5)).strftime("%Y-%m-%d")
+#     scrape_obj = Scrape("MUC", "FCO", ten_days_ahead)
+#     scrape_obj.run_scrape()
+#     assert isinstance(scrape_obj.data, pd.DataFrame)

 def test_config_file():
     try:
         config = configparser.ConfigParser()
-        config.read("config.ini")
+        config.read("routes.ini")
     except Exception as e:
         assert False, e
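test_database_connection expects connect_to_postgresql to raise ConnectionError on failure. With psycopg2-binary in requirements.txt, the method presumably wraps something like the following (a speculative sketch; the real Database class is not shown in this diff):

```python
import psycopg2

def connect_to_postgresql(db_host, db_name, db_user, db_pw):
    """Open a PostgreSQL connection, normalizing failures to ConnectionError."""
    try:
        return psycopg2.connect(host=db_host, dbname=db_name,
                                user=db_user, password=db_pw)
    except psycopg2.OperationalError as exc:
        raise ConnectionError(str(exc)) from exc  # what the test asserts against
```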


10 changes: 6 additions & 4 deletions utils.py
@@ -24,7 +24,7 @@
         'class': 'logging.handlers.TimedRotatingFileHandler',
         'formatter': 'standard',
         'filename': os.path.join(LOGS_PATH, "logs.log"),
-        "when": "W6", # sunday
+        "when": "W6",  # sunday
         'backupCount': 8
     },
 },
@@ -38,10 +38,12 @@
 def create_logs_folder():
     if not os.path.isdir(LOGS_PATH):
         os.mkdir(LOGS_PATH)
+
+

 def setup_logger(logger_name):
     create_logs_folder()
-    logging.getLogger('WDM').setLevel(logging.NOTSET)  # suppress WDM (Webdriver Manager) logs
+    # suppress WDM (Webdriver Manager) logs
+    logging.getLogger('WDM').setLevel(logging.NOTSET)
     logging.config.dictConfig(LOGGING_CONFIG_DICT)
     return logging.getLogger(logger_name)
@@ -54,4 +56,4 @@ def get_routes_from_config(config_obj):
     for route in config_obj["routes"]:
         routes.append(json.loads(config_obj["routes"][route]))

-    return routes
\ No newline at end of file
+    return routes
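The handler configured at the top of utils.py rotates the log file weekly: for TimedRotatingFileHandler, "W0" through "W6" select a weekday, so "W6" means every Sunday, with backupCount=8 old files kept. A minimal self-contained version of that setup (the 'standard' formatter string here is assumed, not copied from the repo):

```python
import logging
import logging.config
import os

LOGS_PATH = "logs"  # created by create_logs_folder() in the repo

config = {
    "version": 1,
    "formatters": {"standard": {"format": "%(asctime)s %(levelname)s %(message)s"}},
    "handlers": {
        "file": {
            "class": "logging.handlers.TimedRotatingFileHandler",
            "formatter": "standard",
            "filename": os.path.join(LOGS_PATH, "logs.log"),
            "when": "W6",      # W0=Monday ... W6=Sunday: rotate weekly on Sunday
            "backupCount": 8,  # keep the last 8 rotated files
        },
    },
    "root": {"handlers": ["file"], "level": "INFO"},
}

os.makedirs(LOGS_PATH, exist_ok=True)
logging.config.dictConfig(config)
logging.getLogger(__name__).info("logger configured")
```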
