From f7fc4a9e24a9325cf1779709b8896beb6369a4ab Mon Sep 17 00:00:00 2001
From: Hisao
Date: Tue, 21 Jul 2020 21:54:56 +0000
Subject: [PATCH 1/3] Added gitignore

---
 .gitignore | 138 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 138 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a81c8ee
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,138 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/

From 7a76009f9a1dc02ab97f63c6a94afe7c076eceaa Mon Sep 17 00:00:00 2001
From: "pull[bot]" <39814207+pull[bot]@users.noreply.github.com>
Date: Tue, 25 Aug 2020 12:56:43 -0500
Subject: [PATCH 2/3] Improve search engine with 3 new factors (#52) (#26)

* Create tfidf method for search engine
* Pylint fixes
* Add testing
* Fix linting
* Add numpy to requirements.txt
* Adapt to articles/keyword object return
* Change naming to agreed db convention
* Disable numpy import error

Co-authored-by: Domene99
---
 services/parser-database/app.py             |  4 +-
 services/parser-database/connector.py       | 30 +++++++++
 services/parser-database/requirements.txt   |  3 +-
 .../parser-database/tests/test_connector.py | 62 +++++++++++++++++++
 4 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/services/parser-database/app.py b/services/parser-database/app.py
index 1535330..ce5243c 100644
--- a/services/parser-database/app.py
+++ b/services/parser-database/app.py
@@ -7,8 +7,8 @@
 from flask import jsonify  # pylint: disable=import-error
 from parser import parse_all_documents
-from connector import get_articles_that_match_keywords
 from connector import get_article_by_number
+from connector import get_articles_by_tfidf_value
 
 app = Flask(__name__)
 app.config['JSON_AS_ASCII'] = False
@@ -32,7 +32,7 @@ def get_keywords():
         logging.error(error)
         return error, 400
     else:
-        return jsonify(get_articles_that_match_keywords(json_request['keywords']))
+        return jsonify(get_articles_by_tfidf_value(json_request['keywords']))
 
 
 @app.route('/articles/<int:id>', methods=['GET'])
diff --git a/services/parser-database/connector.py b/services/parser-database/connector.py
index 6486ceb..1cce100 100644
--- a/services/parser-database/connector.py
+++ b/services/parser-database/connector.py
@@ -2,6 +2,8 @@ services/databases"""
 import requests  # pylint: disable=import-error
 import logging
+import numpy as np  # pylint: disable=import-error
+
 import constants
 import env
@@ -61,6 +63,34 @@ def get_articles_that_match_keywords(keywords_list):
     return matching_articles
 
 
+def get_articles_by_tfidf_value(keywords_list):
+    """
+    Scores every matching article for each keyword in a keyword
+    list; the value is the article's term frequency-inverse
+    document frequency (tf-idf) for that keyword.
+    Args:
+        keywords_list (list): Keyword(s) to look for
+
+    Returns:
+        dict: articles and tf-idf weight for each keyword
+    """
+    matching_articles = {}
+    for keyword in keywords_list:
+        articles_that_match_keyword = {}
+        if keyword in keywords_in_memory:
+            for article in keywords_in_memory[keyword]:
+                # tfidf computation
+                word_count = articles_in_memory[str(article["number"])]["wordCount"]
+                term_density_in_article = article["frequency"] / word_count
+                inverse_doc_ratio = len(articles_in_memory) / len(keywords_in_memory[keyword])  # N / df
+                inverse_doc_freq = np.log(inverse_doc_ratio)
+                weight = term_density_in_article * inverse_doc_freq
+
+                articles_that_match_keyword[str(article["number"])] = {"weight": weight}
+        matching_articles[keyword] = articles_that_match_keyword
+    return matching_articles
+
+
 def save_keywords_in_memory(keywords, article):
     """Saves the keywords from an article in memory
 
diff --git a/services/parser-database/requirements.txt b/services/parser-database/requirements.txt
index a92fbc7..415c0ba 100644
--- a/services/parser-database/requirements.txt
+++ b/services/parser-database/requirements.txt
@@ -2,4 +2,5 @@ Flask
 utils
 https://github.com/timClicks/slate/archive/master.zip
 requests
-pytest-mock
+numpy
+pytest-mock
\ No newline at end of file
diff --git a/services/parser-database/tests/test_connector.py b/services/parser-database/tests/test_connector.py
index 5f25996..e0a23e7 100644
--- a/services/parser-database/tests/test_connector.py
+++ b/services/parser-database/tests/test_connector.py
@@ -35,6 +35,35 @@
 }
 
 
+in_memory_value_mock_no_decimals = {
+    "ciclista": [
+        {"number": 5, "frequency": 3},
+        {"number": 45, "frequency": 6},
+        {"number": 99, "frequency": 9},
+    ],
+    "licencia": [
+        {"number": 89, "frequency": 3},
+        {"number": 45, "frequency": 3},
+        {"number": 125, "frequency": 15},
+    ],
+}
+
+
+articles_in_memory = {'5': {'wordCount': 32}, '45': {'wordCount': 40}, '89': {'wordCount': 16},
+                      '99': {'wordCount': 50}, '125': {'wordCount': 200}}
+
+
+articles_in_memory_no_wordCount = {'5': {}, '45': {}, '89': {}, '99': {}, '125': {}}
+
+
+def logn(num):
+    """
+    Identity stand-in for numpy.log, so the expected
+    tf-idf weights stay simple fractions
+    """
+    return num
+
+
 @mock.patch("connector.keywords_in_memory", in_memory_value_mock)
 def test_get_articles_that_match_keywords_empty_result_one_keyword():
     result_to_assert_1 = {"alcohol": {}}
@@ -71,6 +100,39 @@ def test_get_articles_that_match_keywords_non_empty_result_two_keywords():
     assert result == result_to_assert_4
 
 
+@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
+@mock.patch("connector.articles_in_memory", articles_in_memory)
+def test_get_articles_by_tfidf_value():
+    expected = {
+        "licencia": {"89": {"weight": .3125}, "45": {"weight": .125}, "125": {"weight": .125}},
+        "ciclista": {"5": {"weight": .15625}, "45": {"weight": .25}, "99": {"weight": .3}},
+    }
+    keywords = ["licencia", "ciclista"]
+    with mock.patch("numpy.log", side_effect=logn):
+        assert expected == connector.get_articles_by_tfidf_value(keywords)
+
+
+@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
+@mock.patch("connector.articles_in_memory", articles_in_memory)
+def test_get_articles_by_tfidf_value_empty_result():
+    expected = {
+        "casco": {},
+        "luz": {},
+    }
+    keywords = ["casco", "luz"]
+    with mock.patch("numpy.log", side_effect=logn):
+        assert expected == connector.get_articles_by_tfidf_value(keywords)
+
+
+@mock.patch("connector.keywords_in_memory", in_memory_value_mock_no_decimals)
+@mock.patch("connector.articles_in_memory", articles_in_memory_no_wordCount)
+def test_get_articles_by_tfidf_value_missing_word_count():
+    keywords = ["licencia", "ciclista"]
+    with mock.patch("numpy.log", side_effect=logn):
+        with pytest.raises(KeyError):
+            connector.get_articles_by_tfidf_value(keywords)
+
+
 def test_get_documents():
     assert connector.get_documents_to_parse() == [constants.mty_document]
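
The tf-idf weighting introduced in PATCH 2/3 reduces to weight(keyword, article) = (frequency / wordCount) * log(N / df), where N is the number of articles in memory and df is the number of articles containing the keyword. The following self-contained sketch reproduces that computation with the fixture numbers from tests/test_connector.py; math.log stands in for numpy.log, and the helper name tfidf_weight is illustrative rather than part of the service:

    import math

    # fixture values copied from tests/test_connector.py
    articles = {'5': {'wordCount': 32}, '45': {'wordCount': 40}, '89': {'wordCount': 16},
                '99': {'wordCount': 50}, '125': {'wordCount': 200}}
    keyword_index = {'licencia': [{'number': 89, 'frequency': 3},
                                  {'number': 45, 'frequency': 3},
                                  {'number': 125, 'frequency': 15}]}

    def tfidf_weight(frequency, word_count, total_articles, articles_with_keyword):
        term_density = frequency / word_count                                 # tf
        inverse_doc_freq = math.log(total_articles / articles_with_keyword)   # idf
        return term_density * inverse_doc_freq

    for entry in keyword_index['licencia']:
        word_count = articles[str(entry['number'])]['wordCount']
        weight = tfidf_weight(entry['frequency'], word_count,
                              len(articles), len(keyword_index['licencia']))
        print(entry['number'], round(weight, 5))

With the identity mock the tests substitute for numpy.log, article 89 scores (3/16) * (5/3) = 0.3125, exactly the weight asserted in test_get_articles_by_tfidf_value.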

From 5e8988994e00aaa37ea1d3207d4060aa9f6da2d7 Mon Sep 17 00:00:00 2001
From: Hisao
Date: Fri, 28 Aug 2020 01:29:09 +0000
Subject: [PATCH 3/3] Added DB Functionality

---
 services/parser-database/app.py       | 10 +--
 services/parser-database/connector.py | 94 ++++++++++++++++++++++++++-
 services/parser-database/parser.py    | 13 +++-
 3 files changed, 109 insertions(+), 8 deletions(-)

diff --git a/services/parser-database/app.py b/services/parser-database/app.py
index ce5243c..35c42af 100644
--- a/services/parser-database/app.py
+++ b/services/parser-database/app.py
@@ -7,8 +7,8 @@
 from flask import jsonify  # pylint: disable=import-error
 from parser import parse_all_documents
-from connector import get_article_by_number
-from connector import get_articles_by_tfidf_value
+from connector import get_article_by_id_db
+from connector import get_articles_that_match_keywords_db
 
 app = Flask(__name__)
 app.config['JSON_AS_ASCII'] = False
@@ -32,14 +32,14 @@ def get_keywords():
         logging.error(error)
         return error, 400
     else:
-        return jsonify(get_articles_by_tfidf_value(json_request['keywords']))
+        return jsonify(get_articles_that_match_keywords_db(json_request['keywords']))
 
 
 @app.route('/articles/<int:id>', methods=['GET'])
 def get_article_by_number_in_memory(id):
     """Returns the article that matches the ID value
     according to the apiSpec.yaml file"""
-    article = get_article_by_number(str(id))
+    article = get_article_by_id_db(str(id))
     if article is not None:
         article = copy(article)
         return jsonify(article)
@@ -50,5 +50,5 @@
 
 if __name__ == '__main__':
-    parse_all_documents()
+    #parse_all_documents()
     app.run(debug=True, host='0.0.0.0', port=os.getenv("PORT"))
diff --git a/services/parser-database/connector.py b/services/parser-database/connector.py
index 1cce100..0f8a219 100644
--- a/services/parser-database/connector.py
+++ b/services/parser-database/connector.py
@@ -3,16 +3,55 @@
 import requests  # pylint: disable=import-error
 import logging
 import numpy as np  # pylint: disable=import-error
-
+import firebase_admin
+from firebase_admin import credentials
+from firebase_admin import firestore
 import constants
 import env
 
 logging.basicConfig(level=logging.INFO)
 
 
+class Keywords:
+    """Class for storing a keyword and the articles that contain it."""
+    def __init__(self, keyword):
+        self.keyword = keyword
+        self.articles_that_contain_keyword = {}
+
+    def to_dict(self):
+        keyword_dict = {
+            "keyword": self.keyword,
+            "matching_articles": self.articles_that_contain_keyword,
+        }
+        return keyword_dict
+
+    @staticmethod
+    def from_dict(src):
+        keyword = Keywords(src["keyword"])
+        keyword.articles_that_contain_keyword = src["matching_articles"]
+        return keyword
+
+cred = credentials.ApplicationDefault()
+firebase_admin.initialize_app(cred, {
+    'projectId': 'major-tom-285619',
+})
+
+db = firestore.client()
+
 articles_in_memory = {}
 keywords_in_memory = {}
 
 
+def get_documents_to_parse_db():
+    documents_ref = db.collection(u'documents')
+    docs = list(documents_ref.stream())  # materialize: a consumed stream cannot be re-iterated
+
+    for doc in docs:
+        logging.info('%s => %s', doc.id, doc.to_dict())
+    return docs
+
+
 def get_documents_to_parse():  # When database is integrated, this will go away
     document_list = []
@@ -113,3 +152,56 @@ def store_article(article_dict):
     articles_in_memory[article_dict["id"]] = article_dict
     save_keywords_in_memory(get_keywords(article_dict["content"]), article_dict)
     logging.info('Article ' + article_dict["id"] + ' assigned keywords')
+
+
+def store_article_in_db(article_dict):
+    db.collection(u'articles').document(article_dict["id"]).set(article_dict)
+    save_keywords_in_db(get_keywords(article_dict["content"]), article_dict)
+    logging.info('Article ' + article_dict["id"] + ' assigned keywords')
+
+
+def save_keywords_in_db(keywords, article):
+    """Saves the keywords from an article in the database
+
+    Args:
+        keywords (JSON): contains keywords
+        article (Article): article object
+    """
+    for keyword in keywords:
+        frequency = article["content"].count(keyword)
+
+        doc_ref = db.collection(u'keywords').where('keyword', '==', keyword)
+        docs = doc_ref.get()
+
+        if len(docs) != 0 and docs[0] is not None:
+            from_db = docs[0].to_dict()
+            from_db["matching_articles"][article["id"]] = frequency
+            db.collection(u'keywords').document(docs[0].id).set(from_db)
+        else:
+            to_send = {"keyword": keyword, "matching_articles": {article["id"]: frequency}}
+            db.collection(u'keywords').add(to_send)
+
+
+def get_articles_that_match_keywords_db(keywords_list):
+    matching_articles = {}
+    for keyword in keywords_list:
+        articles_that_match_keyword = {}
+        docs = db.collection(u'keywords').where('keyword', '==', keyword).get()
+        if len(docs) != 0:
+            doc_dict = docs[0].to_dict()
+            for article_id, frequency in doc_dict["matching_articles"].items():
+                articles_that_match_keyword[str(article_id)] = {"weight": frequency}
+        matching_articles[keyword] = articles_that_match_keyword
+    return matching_articles
+
+
+def get_article_by_id_db(art_num):
+    documents_ref = db.collection(u'articles').document(art_num)
+    doc = documents_ref.get()
+    if doc.exists:
+        return doc.to_dict()
+    else:
+        return None
\ No newline at end of file
diff --git a/services/parser-database/parser.py b/services/parser-database/parser.py
index e35f3ce..a506bf2 100644
--- a/services/parser-database/parser.py
+++ b/services/parser-database/parser.py
@@ -38,7 +38,7 @@ class Article:
     def __init__(self, number, content):
         self.number = number
         self.content = content
-        self.id = str(number)
+        self.id = 'monterrey' + str(number)
 
     def to_dict(self):
         article_dict = {
@@ -49,6 +49,13 @@
         }
         return article_dict
 
+    @staticmethod
+    def from_dict(src):
+        article = Article(src["number"], src["content"])
+        article.id = src["id"]
+        article.wordCount = src["wordCount"]
+        return article
+
 
 def identify_articles(pdf_text):
    """Identifies articles and returns a
    list of Article objects.
@@ -113,4 +120,6 @@
 
     for article in articles:
         dictionary = article.to_dict()
-        connector.store_article(dictionary)
+        connector.store_article_in_db(dictionary)
+
+#parse_all_documents()
\ No newline at end of file
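
PATCH 3/3 keeps one Firestore document per keyword, shaped as {"keyword": <str>, "matching_articles": {<article_id>: <frequency>}}, which the /keywords endpoint flattens into {article_id: {"weight": frequency}}. The sketch below shows that round trip in isolation. It assumes firebase_admin has already been initialized as in connector.py; the helper names upsert_keyword and lookup_keyword are illustrative and not part of the service:

    from firebase_admin import firestore

    def upsert_keyword(db, keyword, article_id, frequency):
        """Create or update the single 'keywords' document indexing one keyword."""
        # a Query .get() returns a list of DocumentSnapshot objects
        snapshots = db.collection(u'keywords').where('keyword', '==', keyword).get()
        if len(snapshots) != 0:
            data = snapshots[0].to_dict()
            data["matching_articles"][article_id] = frequency
            db.collection(u'keywords').document(snapshots[0].id).set(data)
        else:
            db.collection(u'keywords').add(
                {"keyword": keyword, "matching_articles": {article_id: frequency}})

    def lookup_keyword(db, keyword):
        """Return {article_id: {"weight": frequency}}, as the endpoint expects."""
        snapshots = db.collection(u'keywords').where('keyword', '==', keyword).get()
        if len(snapshots) == 0:
            return {}
        matches = snapshots[0].to_dict()["matching_articles"]
        return {str(art_id): {"weight": freq} for art_id, freq in matches.items()}

    db = firestore.client()
    upsert_keyword(db, 'ciclista', 'monterrey5', 3)
    assert lookup_keyword(db, 'ciclista')['monterrey5'] == {'weight': 3}

Keeping the document shape identical on the write path (save_keywords_in_db) and the read path (get_articles_that_match_keywords_db) is what lets a single 'matching_articles' map suffice, with no per-article subcollection.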
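
The corrected Article.from_dict is the inverse of to_dict, so a parsed article survives a write/read cycle with its prefixed id intact. A minimal check, assuming to_dict emits the number/id/content/wordCount keys that the rest of the service reads, and noting that importing parser now pulls in connector, which initializes Firestore at import time and therefore needs credentials:

    from parser import Article

    article = Article(12, 'Los ciclistas deberán portar licencia vigente.')
    assert article.id == 'monterrey12'

    restored = Article.from_dict(article.to_dict())
    assert restored.number == article.number
    assert restored.id == article.id
    assert restored.content == article.content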