From 06611bc42cbf506cde1a5ee2e36fc50c0295e156 Mon Sep 17 00:00:00 2001 From: Ali Moghimi Date: Sat, 21 Oct 2023 12:36:03 +0000 Subject: [PATCH] initial commit --- .env.sample | 0 .flake8 | 3 + .github/workflows/flake8.yml | 24 ++++++ .gitignore | 160 +++++++++++++++++++++++++++++++++++ README.md | 1 + config/__init__.py | 1 + config/logger.py | 10 +++ config/settings.py | 4 + database/__init__.py | 0 main.py | 14 +++ pyproject.toml | 8 ++ requirements.txt | 4 + scraper/__init__.py | 0 scraper/engine.py | 38 +++++++++ transformer/__init__.py | 0 transformer/agent.py | 7 ++ 16 files changed, 274 insertions(+) create mode 100644 .env.sample create mode 100644 .flake8 create mode 100644 .github/workflows/flake8.yml create mode 100644 .gitignore create mode 100644 README.md create mode 100644 config/__init__.py create mode 100644 config/logger.py create mode 100644 config/settings.py create mode 100644 database/__init__.py create mode 100644 main.py create mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100644 scraper/__init__.py create mode 100644 scraper/engine.py create mode 100644 transformer/__init__.py create mode 100644 transformer/agent.py diff --git a/.env.sample b/.env.sample new file mode 100644 index 0000000..e69de29 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..eb3cd4a --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 88 +exclude = .git,__pycache__,venv,__init__.py \ No newline at end of file diff --git a/.github/workflows/flake8.yml b/.github/workflows/flake8.yml new file mode 100644 index 0000000..96406ea --- /dev/null +++ b/.github/workflows/flake8.yml @@ -0,0 +1,24 @@ +name: Lint + +on: [push, pull_request] + +jobs: + flake8-lint: + runs-on: ubuntu-latest + steps: + - name: Check out source repository + uses: actions/checkout@v2 + + - name: Set up Python environment + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install dependencies + run: | + python -m pip 
install --upgrade pip + pip install flake8 + + - name: Lint with flake8 + run: | + flake8 . \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..624a863 --- /dev/null +++ b/.gitignore @@ -0,0 +1,160 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
+# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..0229e55
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+# ice-client
diff --git a/config/__init__.py b/config/__init__.py
new file mode 100644
index 0000000..a48c133
--- /dev/null
+++ b/config/__init__.py
@@ -0,0 +1 @@
+from .logger import logger
diff --git a/config/logger.py b/config/logger.py
new file mode 100644
index 0000000..8dde6a5
--- /dev/null
+++ b/config/logger.py
@@ -0,0 +1,21 @@
+"""Logging configuration shared by the whole application."""
+
+import logging
+
+from config.settings import LOG_LEVEL
+
+# Split over two literals to stay within the 88-column flake8 limit.
+LOG_FORMAT = (
+    "%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] "
+    "[%(funcName)s()] - %(message)s"
+)
+
+logging.basicConfig(
+    # The logging module names its level constants in upper case; normalise
+    # the configured value so that LOG_LEVEL=debug works as well as DEBUG.
+    level=getattr(logging, LOG_LEVEL.upper()),
+    format=LOG_FORMAT,
+)
+
+# Root logger, re-exported by config/__init__.py as ``config.logger``.
+logger = logging.getLogger()
diff --git a/config/settings.py b/config/settings.py
new file mode 100644
index 0000000..1b763e1
--- /dev/null
+++ b/config/settings.py
@@ -0,0 +1,4 @@
+from decouple import config
+
+URL = config("URL")
+LOG_LEVEL = config("LOG_LEVEL", default="INFO")
diff --git a/database/__init__.py b/database/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..d0113d8
--- /dev/null
+++ b/main.py
@@ -0,0 +1,14 @@
+from config import logger
+from scraper.engine import Engine
+
+
+def main() -> None:
+    """Run the scraper once and print the resulting DataFrame."""
+    logger.info("Initializing Application")
+    engine = Engine()
+    df = engine.fetch()
+    print(df)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..2db05cc
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,8 @@
+[tool.black]
+exclude = '''
+(
+    \.git
+    | __pycache__
+    | venv
+)
+'''
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..146cb34
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+pandas==2.0.3
+python-decouple==3.8
+requests==2.31.0
+lxml==4.9.3
diff --git a/scraper/__init__.py b/scraper/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git
a/scraper/engine.py b/scraper/engine.py
new file mode 100644
index 0000000..f90ed98
--- /dev/null
+++ b/scraper/engine.py
@@ -0,0 +1,79 @@
+from io import StringIO
+
+import pandas as pd
+import requests
+
+from config import logger, settings
+
+
+class Engine:
+    """Fetch the page at ``settings.URL`` and parse its first HTML table.
+
+    Attributes:
+        URL: Address that is requested, taken from ``config.settings``.
+        TIMEOUT: Seconds passed to ``requests.get`` as its ``timeout``.
+    """
+
+    URL = settings.URL
+    # requests never times out on its own; bound the wait so that a stalled
+    # server cannot hang the scraper indefinitely.
+    TIMEOUT = 30
+
+    def fetch(self) -> pd.DataFrame:
+        """Download the page and return its first table as a DataFrame.
+
+        Raises:
+            ConnectionError: If the page cannot be downloaded.
+            ValueError: If parsing yields no tables.
+        """
+        logger.debug(f"Attempting to fetch content from {self.URL}.")
+        content = self.get_content()
+        logger.debug(
+            f"Successfully fetched content from {self.URL}. "
+            "Now parsing the content."
+        )
+        df = self.parse_html(content)
+        logger.info(f"Parsed content from {self.URL}. Extracted {len(df)} rows.")
+        return df
+
+    def get_content(self) -> str:
+        """Return the body of a GET request to ``URL`` as text.
+
+        Raises:
+            ConnectionError: If the request fails, times out, or the server
+                responds with an HTTP error status.
+        """
+        try:
+            r = requests.get(self.URL, timeout=self.TIMEOUT)
+            r.raise_for_status()
+        except requests.RequestException as e:
+            logger.error(f"Error fetching content from {self.URL}. Error: {e}")
+            raise ConnectionError(f"Failed to connect to {self.URL}.") from e
+        return r.text
+
+    def parse_html(self, content: str) -> pd.DataFrame:
+        """Parse ``content`` as HTML and return the first table found.
+
+        Raises:
+            ValueError: If the parser returns no tables.
+            Exception: Any error from ``pandas.read_html`` is logged and
+                re-raised unchanged.
+        """
+        try:
+            # pandas accepts a plain string as either a URL or literal markup
+            # and deprecates the literal form from 2.1 on; a file-like object
+            # is unambiguous and works on pandas 2.0 as well.
+            dfs = pd.read_html(StringIO(content))
+        except Exception:
+            logger.exception(f"Error parsing content from {self.URL}.")
+            # Bare raise keeps the original traceback intact.
+            raise
+
+        if not dfs:
+            raise ValueError(f"No data found when parsing content from {self.URL}.")
+
+        logger.debug(
+            f"Successfully parsed content from {self.URL}. "
+            f"Extracted {len(dfs[0])} rows."
+        )
+        return dfs[0]
diff --git a/transformer/__init__.py b/transformer/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/transformer/agent.py b/transformer/agent.py
new file mode 100644
index 0000000..6fa0663
--- /dev/null
+++ b/transformer/agent.py
@@ -0,0 +1,8 @@
+import pandas as pd
+
+
+class Transformer:
+    """Placeholder transformer; no transformation logic exists yet."""
+
+    def transform(self, df: pd.DataFrame):
+        """Accept ``df`` and do nothing; currently returns ``None``."""