From f7fc4a9e24a9325cf1779709b8896beb6369a4ab Mon Sep 17 00:00:00 2001
From: Hisao
Date: Tue, 21 Jul 2020 21:54:56 +0000
Subject: [PATCH 1/3] Added gitignore

---
 .gitignore | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 138 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a81c8ee
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,138 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/

From 7a76009f9a1dc02ab97f63c6a94afe7c076eceaa Mon Sep 17 00:00:00 2001
From: "pull[bot]" <39814207+pull[bot]@users.noreply.github.com>
Date: Tue, 25 Aug 2020 12:56:43 -0500
Subject: [PATCH 2/3] Improve search engine with 3 new factors (#52) (#26)

* Create tfidf method for search engine
* Pylint fixes
* Add testing
* Fix linting
* Add numpy to requirements.txt
* Adapt to articles/keyword object return
* Change naming to agreed db convention
* Disable numpy import error

Co-authored-by: Domene99
---
 services/parser-database/app.py             |  4 +-
 services/parser-database/connector.py       | 30 +++++++++
 services/parser-database/requirements.txt   |  3 +-
 .../parser-database/tests/test_connector.py | 62 +++++++++++++++++++
 4 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/services/parser-database/app.py b/services/parser-database/app.py
index 1535330..ce5243c 100644
--- a/services/parser-database/app.py
+++ b/services/parser-database/app.py
@@ -7,8 +7,8 @@
 from flask import jsonify  # pylint: disable=import-error
 from parser import parse_all_documents
-from connector import get_articles_that_match_keywords
 from connector import get_article_by_number
+from connector import get_articles_by_tfidf_value
 
 app = Flask(__name__)
 app.config['JSON_AS_ASCII'] = False
@@ -32,7 +32,7 @@ def get_keywords():
         logging.error(error)
         return error, 400
     else:
-        return jsonify(get_articles_that_match_keywords(json_request['keywords']))
+        return jsonify(get_articles_by_tfidf_value(json_request['keywords']))
 
 
 @app.route('/articles/<int:id>', methods=['GET'])
diff --git a/services/parser-database/connector.py b/services/parser-database/connector.py
index 6486ceb..1cce100 100644
--- a/services/parser-database/connector.py
+++ b/services/parser-database/connector.py
@@ -2,6 +2,8 @@ services/databases"""
 import requests  # pylint: disable=import-error
 import logging
+import numpy as np  # pylint: disable=import-error
+
 import constants
 import env
@@ -61,6 +63,34 @@ def get_articles_that_match_keywords(keywords_list):
     return matching_articles
 
 
+def get_articles_by_tfidf_value(keywords_list):
+    """
+    Scores every matching article for each keyword in a keyword
+    list; the value is the article's term frequency-inverse
+    document frequency (tf-idf) for that keyword.
+    Args:
+        keywords_list (list): Keyword(s) to look for
+
+    Returns:
+        dict: articles and tf-idf weight for each keyword
+    """
+    matching_articles = {}
+    for keyword in keywords_list:
+        articles_that_match_keyword = {}
+        if keyword in keywords_in_memory:
+            for article in keywords_in_memory[keyword]:
+                # tfidf computation
+                word_count = articles_in_memory[str(article["number"])]["wordCount"]
+                term_density_in_article = article["frequency"] / word_count
+                inverse_doc_ratio = len(articles_in_memory) / len(keywords_in_memory[keyword])  # N / df
+                inverse_doc_freq = np.log(inverse_doc_ratio)
+                weight = term_density_in_article * inverse_doc_freq
+
+                articles_that_match_keyword[str(article["number"])] = {"weight": weight}
+        matching_articles[keyword] = articles_that_match_keyword
+    return matching_articles
+
+
 def save_keywords_in_memory(keywords, article):
     """Saves the keywords from an article in memory
 
diff --git a/services/parser-database/requirements.txt b/services/parser-database/requirements.txt
index a92fbc7..415c0ba 100644
--- a/services/parser-database/requirements.txt
+++ b/services/parser-database/requirements.txt
@@ -2,4 +2,5 @@ Flask
 utils
 https://github.com/timClicks/slate/archive/master.zip
 requests
-pytest-mock
+numpy
+pytest-mock
\ No newline at end of file
diff --git a/services/parser-database/tests/test_connector.py b/services/parser-database/tests/test_connector.py
index 5f25996..e0a23e7 100644
--- a/services/parser-database/tests/test_connector.py
+++ b/services/parser-database/tests/test_connector.py
@@ -35,6 +35,35 @@
 }
 
 
+in_memory_value_mock_no_decimals = {
+    "ciclista": [
+        {"number": 5, "frequency": 3},
+        {"number": 45, "frequency": 6},
+        {"number": 99, "frequency": 9},
+    ],
+    "licencia": [
+        {"number": 89, "frequency": 3},
+        {"number": 45, "frequency": 3},
+        {"number": 125, "frequency": 15},
+    ],
+}
+
+
+articles_in_memory = {'5': {'wordCount': 32}, '45': {'wordCount': 40}, '89': {'wordCount': 16},
+                      '99': {'wordCount': 50}, '125': {'wordCount': 200}}
+
+
+articles_in_memory_no_wordCount = {'5': {}, '45': {}, '89': {}, '99': {}, '125': {}}
+
+
+def logn(num):
+    """
+    Identity stand-in for numpy.log, so the expected
+    tf-idf weights stay simple fractions
+    """
+    return num
+
+
 @mock.patch("connector.keywords_in_memory", in_memory_value_mock)
 def test_get_articles_that_match_keywords_empty_result_one_keyword():
     result_to_assert_1 = {"alcohol": {}}
@@ -71,6 +100,39 @@ def test_get_articles_that_match_keywords_non_empty_result_two_keywords():
     assert result == result_to_assert_4
 
 
+@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
+@mock.patch("connector.articles_in_memory", articles_in_memory)
+def test_get_articles_by_tfidf_value():
+    expected = {
+        "licencia": {"89": {"weight": .3125}, "45": {"weight": .125}, "125": {"weight": .125}},
+        "ciclista": {"5": {"weight": .15625}, "45": {"weight": .25}, "99": {"weight": .3}},
+    }
+    keywords = ["licencia", "ciclista"]
+    with mock.patch("numpy.log", side_effect=logn):
+        assert expected == connector.get_articles_by_tfidf_value(keywords)
+
+
+@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
+@mock.patch("connector.articles_in_memory", articles_in_memory)
+def test_get_articles_by_tfidf_value_empty_result():
+    expected = {
+        "casco": {},
+        "luz": {},
+    }
+    keywords = ["casco", "luz"]
+    with mock.patch("numpy.log", side_effect=logn):
+        assert expected == connector.get_articles_by_tfidf_value(keywords)
+
+
+@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
+@mock.patch("connector.articles_in_memory", articles_in_memory_no_wordCount)
+def test_get_articles_by_tfidf_value_missing_word_count():
+    keywords = ["licencia", "ciclista"]
+    with mock.patch("numpy.log", side_effect=logn):
+        with pytest.raises(KeyError):
+            connector.get_articles_by_tfidf_value(keywords)
+
+
 def test_get_documents():
     assert connector.get_documents_to_parse() == [constants.mty_document]
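
The tf-idf weighting introduced in PATCH 2/3 reduces to weight(keyword, article) = (frequency / wordCount) * log(N / df), where N is the number of articles in memory and df is the number of articles containing the keyword. The following self-contained sketch reproduces that computation with the fixture numbers from tests/test_connector.py; math.log stands in for numpy.log, and the helper name tfidf_weight is illustrative rather than part of the service:

    import math

    # fixture values copied from tests/test_connector.py
    articles = {'5': {'wordCount': 32}, '45': {'wordCount': 40}, '89': {'wordCount': 16},
                '99': {'wordCount': 50}, '125': {'wordCount': 200}}
    keyword_index = {'licencia': [{'number': 89, 'frequency': 3},
                                  {'number': 45, 'frequency': 3},
                                  {'number': 125, 'frequency': 15}]}

    def tfidf_weight(frequency, word_count, total_articles, articles_with_keyword):
        term_density = frequency / word_count                                 # tf
        inverse_doc_freq = math.log(total_articles / articles_with_keyword)   # idf
        return term_density * inverse_doc_freq

    for entry in keyword_index['licencia']:
        word_count = articles[str(entry['number'])]['wordCount']
        weight = tfidf_weight(entry['frequency'], word_count,
                              len(articles), len(keyword_index['licencia']))
        print(entry['number'], round(weight, 5))

With the identity mock the tests substitute for numpy.log, article 89 scores (3/16) * (5/3) = 0.3125, exactly the weight asserted in test_get_articles_by_tfidf_value.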

From 5e8988994e00aaa37ea1d3207d4060aa9f6da2d7 Mon Sep 17 00:00:00 2001
From: Hisao
Date: Fri, 28 Aug 2020 01:29:09 +0000
Subject: [PATCH 3/3] Added DB Functionality

---
 services/parser-database/app.py       | 10 +--
 services/parser-database/connector.py | 94 ++++++++++++++++++++++++++-
 services/parser-database/parser.py    | 13 +++-
 3 files changed, 109 insertions(+), 8 deletions(-)

diff --git a/services/parser-database/app.py b/services/parser-database/app.py
index ce5243c..35c42af 100644
--- a/services/parser-database/app.py
+++ b/services/parser-database/app.py
@@ -7,8 +7,8 @@
 from flask import jsonify  # pylint: disable=import-error
 from parser import parse_all_documents
-from connector import get_article_by_number
-from connector import get_articles_by_tfidf_value
+from connector import get_article_by_id_db
+from connector import get_articles_that_match_keywords_db
 
 app = Flask(__name__)
 app.config['JSON_AS_ASCII'] = False
@@ -32,14 +32,14 @@ def get_keywords():
         logging.error(error)
         return error, 400
     else:
-        return jsonify(get_articles_by_tfidf_value(json_request['keywords']))
+        return jsonify(get_articles_that_match_keywords_db(json_request['keywords']))
 
 
 @app.route('/articles/<int:id>', methods=['GET'])
 def get_article_by_number_in_memory(id):
     """Returns the article that matches the ID value
     according to the apiSpec.yaml file"""
-    article = get_article_by_number(str(id))
+    article = get_article_by_id_db(str(id))
     if article is not None:
         article = copy(article)
         return jsonify(article)
@@ -50,5 +50,5 @@
 
 if __name__ == '__main__':
-    parse_all_documents()
+    #parse_all_documents()
     app.run(debug=True, host='0.0.0.0', port=os.getenv("PORT"))
diff --git a/services/parser-database/connector.py b/services/parser-database/connector.py
index 1cce100..0f8a219 100644
--- a/services/parser-database/connector.py
+++ b/services/parser-database/connector.py
@@ -3,16 +3,55 @@
 import requests  # pylint: disable=import-error
 import logging
 import numpy as np  # pylint: disable=import-error
-
+import firebase_admin
+from firebase_admin import credentials
+from firebase_admin import firestore
 import constants
 import env
 
 logging.basicConfig(level=logging.INFO)
 
 
+class Keywords:
+    """Class for storing a keyword and the articles that contain it."""
+    def __init__(self, keyword):
+        self.keyword = keyword
+        self.articles_that_contain_keyword = {}
+
+    def to_dict(self):
+        keyword_dict = {
+            "keyword": self.keyword,
+            "matching_articles": self.articles_that_contain_keyword,
+        }
+        return keyword_dict
+
+    @staticmethod
+    def from_dict(src):
+        keyword = Keywords(src["keyword"])
+        keyword.articles_that_contain_keyword = src["matching_articles"]
+        return keyword
+
+cred = credentials.ApplicationDefault()
+firebase_admin.initialize_app(cred, {
+    'projectId': 'major-tom-285619',
+})
+
+db = firestore.client()
+
 articles_in_memory = {}
 keywords_in_memory = {}
 
 
+def get_documents_to_parse_db():
+    documents_ref = db.collection(u'documents')
+    docs = list(documents_ref.stream())  # materialize: a consumed stream cannot be re-iterated
+
+    for doc in docs:
+        logging.info('%s => %s', doc.id, doc.to_dict())
+    return docs
+
+
 def get_documents_to_parse():  # When database is integrated, this will go away
     document_list = []
@@ -113,3 +152,56 @@ def store_article(article_dict):
     articles_in_memory[article_dict["id"]] = article_dict
     save_keywords_in_memory(get_keywords(article_dict["content"]), article_dict)
     logging.info('Article ' + article_dict["id"] + ' assigned keywords')
+
+
+def store_article_in_db(article_dict):
+    db.collection(u'articles').document(article_dict["id"]).set(article_dict)
+    save_keywords_in_db(get_keywords(article_dict["content"]), article_dict)
+    logging.info('Article ' + article_dict["id"] + ' assigned keywords')
+
+
+def save_keywords_in_db(keywords, article):
+    """Saves the keywords from an article in the database
+
+    Args:
+        keywords (JSON): contains keywords
+        article (Article): article object
+    """
+    for keyword in keywords:
+        frequency = article["content"].count(keyword)
+
+        doc_ref = db.collection(u'keywords').where('keyword', '==', keyword)
+        docs = doc_ref.get()
+
+        if len(docs) != 0 and docs[0] is not None:
+            from_db = docs[0].to_dict()
+            from_db["matching_articles"][article["id"]] = frequency
+            db.collection(u'keywords').document(docs[0].id).set(from_db)
+        else:
+            to_send = {"keyword": keyword, "matching_articles": {article["id"]: frequency}}
+            db.collection(u'keywords').add(to_send)
+
+
+def get_articles_that_match_keywords_db(keywords_list):
+    matching_articles = {}
+    for keyword in keywords_list:
+        articles_that_match_keyword = {}
+        docs = db.collection(u'keywords').where('keyword', '==', keyword).get()
+        if len(docs) != 0:
+            doc_dict = docs[0].to_dict()
+            for article_id, frequency in doc_dict["matching_articles"].items():
+                articles_that_match_keyword[str(article_id)] = {"weight": frequency}
+        matching_articles[keyword] = articles_that_match_keyword
+    return matching_articles
+
+
+def get_article_by_id_db(art_num):
+    documents_ref = db.collection(u'articles').document(art_num)
+    doc = documents_ref.get()
+    if doc.exists:
+        return doc.to_dict()
+    else:
+        return None
\ No newline at end of file
diff --git a/services/parser-database/parser.py b/services/parser-database/parser.py
index e35f3ce..a506bf2 100644
--- a/services/parser-database/parser.py
+++ b/services/parser-database/parser.py
@@ -38,7 +38,7 @@ class Article:
     def __init__(self, number, content):
         self.number = number
         self.content = content
-        self.id = str(number)
+        self.id = 'monterrey' + str(number)
 
     def to_dict(self):
         article_dict = {
@@ -49,6 +49,13 @@
         }
         return article_dict
 
+    @staticmethod
+    def from_dict(src):
+        article = Article(src["number"], src["content"])
+        article.id = src["id"]
+        article.wordCount = src["wordCount"]
+        return article
+
 
 def identify_articles(pdf_text):
    """Identifies articles and returns a
    list of Article objects.
@@ -113,4 +120,6 @@
 
     for article in articles:
         dictionary = article.to_dict()
-        connector.store_article(dictionary)
+        connector.store_article_in_db(dictionary)
+
+#parse_all_documents()
\ No newline at end of file
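
PATCH 3/3 keeps one Firestore document per keyword, shaped as {"keyword": <str>, "matching_articles": {<article_id>: <frequency>}}, which the /keywords endpoint flattens into {article_id: {"weight": frequency}}. The sketch below shows that round trip in isolation. It assumes firebase_admin has already been initialized as in connector.py; the helper names upsert_keyword and lookup_keyword are illustrative and not part of the service:

    from firebase_admin import firestore

    def upsert_keyword(db, keyword, article_id, frequency):
        """Create or update the single 'keywords' document indexing one keyword."""
        # a Query .get() returns a list of DocumentSnapshot objects
        snapshots = db.collection(u'keywords').where('keyword', '==', keyword).get()
        if len(snapshots) != 0:
            data = snapshots[0].to_dict()
            data["matching_articles"][article_id] = frequency
            db.collection(u'keywords').document(snapshots[0].id).set(data)
        else:
            db.collection(u'keywords').add(
                {"keyword": keyword, "matching_articles": {article_id: frequency}})

    def lookup_keyword(db, keyword):
        """Return {article_id: {"weight": frequency}}, as the endpoint expects."""
        snapshots = db.collection(u'keywords').where('keyword', '==', keyword).get()
        if len(snapshots) == 0:
            return {}
        matches = snapshots[0].to_dict()["matching_articles"]
        return {str(art_id): {"weight": freq} for art_id, freq in matches.items()}

    db = firestore.client()
    upsert_keyword(db, 'ciclista', 'monterrey5', 3)
    assert lookup_keyword(db, 'ciclista')['monterrey5'] == {'weight': 3}

Keeping the document shape identical on the write path (save_keywords_in_db) and the read path (get_articles_that_match_keywords_db) is what lets a single 'matching_articles' map suffice, with no per-article subcollection.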
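
The corrected Article.from_dict is the inverse of to_dict, so a parsed article survives a write/read cycle with its prefixed id intact. A minimal check, assuming to_dict emits the number/id/content/wordCount keys that the rest of the service reads, and noting that importing parser now pulls in connector, which initializes Firestore at import time and therefore needs credentials:

    from parser import Article

    article = Article(12, 'Los ciclistas deberán portar licencia vigente.')
    assert article.id == 'monterrey12'

    restored = Article.from_dict(article.to_dict())
    assert restored.number == article.number
    assert restored.id == article.id
    assert restored.content == article.content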