From bf7b5577698fc8061647d9badd11f4166e304f2b Mon Sep 17 00:00:00 2001 From: LockedInTheSkage Date: Thu, 18 Apr 2024 20:03:26 +0200 Subject: [PATCH 1/5] db_interface tests and deleters --- .../flashcards/knowledge_base/db_interface.py | 62 +++++++++++++++---- backend/flashcards/knowledge_base/tests.py | 44 +++++++++++++ 2 files changed, 95 insertions(+), 11 deletions(-) create mode 100644 backend/flashcards/knowledge_base/tests.py diff --git a/backend/flashcards/knowledge_base/db_interface.py b/backend/flashcards/knowledge_base/db_interface.py index 09b88fcb..233df5f6 100644 --- a/backend/flashcards/knowledge_base/db_interface.py +++ b/backend/flashcards/knowledge_base/db_interface.py @@ -12,10 +12,6 @@ class Curriculum: #embedding: list[float] pdf_name: str - - - - class DatabaseInterface(ABC): """ Abstract class for Connecting to a Database @@ -48,20 +44,32 @@ def get_curriculum(self, embedding: list[float]) -> list[Curriculum]: @abstractmethod def post_curriculum( - self, curriculum: str, page_num: int, paragraph_num: int, embedding: list[float] + self, curriculum: str, page_num: int, pdf_name: str, embedding: list[float] ) -> bool: """ Post the curriculum to the database Args: curriculum (str): The curriculum to be posted - embedding (list[float]): The embedding of the question + embedding (list[float]): The embedding of the page Returns: bool: True if the curriculum was posted, False otherwise """ pass + @abstarctmethod + def delete_all_curriculum(self) -> bool: + """ + Delete all curriculum from the database + + Returns: + bool: True if the curriculum was deleted, False otherwise + """ + pass + + + class MongoDB(DatabaseInterface): def __init__(self): self.client = MongoClient(Config().MONGODB_URI) @@ -108,15 +116,15 @@ def get_curriculum(self, embedding: list[float]) -> list[Curriculum]: return results def post_curriculum( - self, curriculum: str, page_num: int, paragraph_num: int, embedding: list[float] + self, curriculum: str, page_num: int, pdf_name: str, embedding: list[float] ) -> bool: if not curriculum: raise ValueError("Curriculum cannot be None") - if not page_num: + if page_num == None: raise ValueError("Page number cannot be None") - if not paragraph_num: + if pdf_name == None: raise ValueError("Paragraph number cannot be None") if not embedding: @@ -128,11 +136,43 @@ def post_curriculum( { "text": curriculum, "pageNum": page_num, - "paragraphNum": paragraph_num, + "pdfName": pdf_name, "embedding": embedding, } ) return True except: return False - + + def delete_all_curriculum(self) -> bool: + """ + Delete all curriculum from the database + + Returns: + bool: True if all curriculum were deleted, False otherwise + """ + try: + # Deleting all documents from MongoDB collection + self.collection.delete_many({}) + return True + except Exception as e: + print("Error deleting curriculum:", e) + return False + + def delete_pdf_pages(self, pdf_name: str) -> bool: + """ + Delete all curriculum entries with a specific PDF name from the database + + Args: + pdf_name (str): The PDF name to match for deletion + + Returns: + bool: True if all matching curriculum entries were deleted, False otherwise + """ + try: + # Deleting documents from MongoDB collection based on a condition + self.collection.delete_many({"pdfName": pdf_name}) + return True + except Exception as e: + print("Error deleting curriculum:", e) + return False \ No newline at end of file diff --git a/backend/flashcards/knowledge_base/tests.py b/backend/flashcards/knowledge_base/tests.py new file mode 100644 index 00000000..ef48e493 --- /dev/null +++ b/backend/flashcards/knowledge_base/tests.py @@ -0,0 +1,44 @@ +from django.test import TestCase +from flashcards.knowledge_base.embeddings import cosine_similarity +from flashcards.knowledge_base.embeddings import OpenAIEmbedding +from db_interface import MongoDB + + +class MongoDBTestCase(TestCase): + def setUp(self): + # Initialize MongoDB connection + self.mongo = MongoDB() + + self.curriculum = {"pdf1":"Antonio López de Santa Anna var en meksikansk politiker og general. Fra slutten av 1820-årene og frem til 1855 dominerte han Mexicos politiske liv, og var president seks ganger. Han var en ytterst fargerik personlighet uten noen politisk filosofi, men meget populær blant folket.", + "pdf2":"I 1829 gjorde spanske tropper et mislykket forsøk på å gjenerobre Mexico. Santa Annas seier mot invasjonsstyrken i Tampico ga ham anerkjennelse som nasjonalist og militærstrateg, et omdømme han nøt godt av de neste 25 årene. Gjennom karrieren var Santa Anna en typisk caudillo som vekslet mellom politisk og militær makt, i en tid da militærmakt var nøkkelen til politisk kontroll.", + "pdf3":"I 1833 kom han til makten som føderalist og motstander av den romersk-katolske kirken; i praksis etablerte han en sentralisert stat. Han forble ved presidentmakten til 1836, da han ledet meksikanske tropper inn i Texas for å dempe Texasrevolusjonen. Her ble han tatt til fange av Sam Houston, og ble tvunget til å anerkjenne den nye Republikken Texas."} + + for key in self.curriculum.keys(): + self.mongo.post_curriculum(self.curriculum[key], 1, key, OpenAIEmbedding().get_embedding(self.curriculum[key])) + + def test_get_curriculum(self): + # Test getting curriculum + curriculum = self.mongo.get_curriculum(OpenAIEmbedding().get_embedding(self.curriculum["pdf1"])) + self.assertEqual(len(curriculum), 1) + self.assertEqual(curriculum[0].text, self.curriculum["pdf1"]) + self.assertEqual(curriculum[0].page_num, 1) + self.assertEqual(curriculum[0].pdf_name, "pdf1") + + # Test getting curriculum using query + curriculum = self.mongo.get_curriculum(OpenAIEmbedding().get_embedding("Den romersk-katolske kirken var stor i Texas under revolusjonen. Sam Houston var personlig en stor tilhenger")) + self.assertEqual(len(curriculum), 1) + self.assertEqual(curriculum[0].text, self.curriculum["pdf3"]) + self.assertEqual(curriculum[0].page_num, 1) + self.assertEqual(curriculum[0].pdf_name, "pdf3") + + def test_delete_pdf_pages(self): + # Test deleting curriculum entries with specific PDF name + self.assertTrue(self.mongo.delete_pdf_pages("pdf1")) + + # Check if curriculum entries with pdfName="pdf1" were deleted + curriculum = self.mongo.get_curriculum(OpenAIEmbedding().get_embedding(self.curriculum["pdf1"])) + self.assertEqual(len(curriculum), 0) + + def tearDown(self): + # Clean up test data + self.mongo.delete_all_curriculum() \ No newline at end of file From ec457a7126527c9ff05fb2fba95d7c475832f1b8 Mon Sep 17 00:00:00 2001 From: LockedInTheSkage Date: Tue, 23 Apr 2024 19:43:49 +0200 Subject: [PATCH 2/5] #Add database tests --- backend/config.py | 1 + backend/flashcards/knowledge_base/db_interface.py | 6 +++--- backend/flashcards/knowledge_base/tests.py | 12 ++++++------ backend/flashcards/tests.py | 4 ++-- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/backend/config.py b/backend/config.py index b9eaef6c..d8b26e20 100644 --- a/backend/config.py +++ b/backend/config.py @@ -15,5 +15,6 @@ def __init__(self, path='.env', gpt_model="gpt-3.5-turbo"): load_dotenv(dotenv_path=path) self.API_KEY = os.getenv('OPENAI_API_KEY') self.MONGODB_URI = os.getenv('MONGODB_URI') + self.MONGODB_TEST_URI = os.getenv('MONGODB_TEST_URI') diff --git a/backend/flashcards/knowledge_base/db_interface.py b/backend/flashcards/knowledge_base/db_interface.py index 233df5f6..a4a2526c 100644 --- a/backend/flashcards/knowledge_base/db_interface.py +++ b/backend/flashcards/knowledge_base/db_interface.py @@ -58,7 +58,7 @@ def post_curriculum( """ pass - @abstarctmethod + @abstractmethod def delete_all_curriculum(self) -> bool: """ Delete all curriculum from the database @@ -71,8 +71,8 @@ def delete_all_curriculum(self) -> bool: class MongoDB(DatabaseInterface): - def __init__(self): - self.client = MongoClient(Config().MONGODB_URI) + def __init__(self, uri=Config().MONGODB_URI): + self.client = MongoClient(uri) self.db = self.client["test-curriculum-database"] self.collection = self.db["test-curriculum-collection"] self.similarity_threshold = 0.83 diff --git a/backend/flashcards/knowledge_base/tests.py b/backend/flashcards/knowledge_base/tests.py index ef48e493..942a93ad 100644 --- a/backend/flashcards/knowledge_base/tests.py +++ b/backend/flashcards/knowledge_base/tests.py @@ -1,13 +1,14 @@ from django.test import TestCase from flashcards.knowledge_base.embeddings import cosine_similarity from flashcards.knowledge_base.embeddings import OpenAIEmbedding -from db_interface import MongoDB +from flashcards.knowledge_base.db_interface import MongoDB +from config import Config -class MongoDBTestCase(TestCase): +class MongoDBTest(TestCase): def setUp(self): # Initialize MongoDB connection - self.mongo = MongoDB() + self.mongo = MongoDB(uri=Config().MONGODB_TEST_URI) self.curriculum = {"pdf1":"Antonio López de Santa Anna var en meksikansk politiker og general. Fra slutten av 1820-årene og frem til 1855 dominerte han Mexicos politiske liv, og var president seks ganger. Han var en ytterst fargerik personlighet uten noen politisk filosofi, men meget populær blant folket.", "pdf2":"I 1829 gjorde spanske tropper et mislykket forsøk på å gjenerobre Mexico. Santa Annas seier mot invasjonsstyrken i Tampico ga ham anerkjennelse som nasjonalist og militærstrateg, et omdømme han nøt godt av de neste 25 årene. Gjennom karrieren var Santa Anna en typisk caudillo som vekslet mellom politisk og militær makt, i en tid da militærmakt var nøkkelen til politisk kontroll.", @@ -25,7 +26,7 @@ def test_get_curriculum(self): self.assertEqual(curriculum[0].pdf_name, "pdf1") # Test getting curriculum using query - curriculum = self.mongo.get_curriculum(OpenAIEmbedding().get_embedding("Den romersk-katolske kirken var stor i Texas under revolusjonen. Sam Houston var personlig en stor tilhenger")) + curriculum = self.mongo.get_curriculum(OpenAIEmbedding().get_embedding("Den romersk-katolske kirken var ikke stor i Texas under revolusjonen. Sam Houston var personlig en protestant")) self.assertEqual(len(curriculum), 1) self.assertEqual(curriculum[0].text, self.curriculum["pdf3"]) self.assertEqual(curriculum[0].page_num, 1) @@ -36,8 +37,7 @@ def test_delete_pdf_pages(self): self.assertTrue(self.mongo.delete_pdf_pages("pdf1")) # Check if curriculum entries with pdfName="pdf1" were deleted - curriculum = self.mongo.get_curriculum(OpenAIEmbedding().get_embedding(self.curriculum["pdf1"])) - self.assertEqual(len(curriculum), 0) + self.assertRaises(ValueError("No documents found"),self.mongo.get_curriculum(OpenAIEmbedding().get_embedding(self.curriculum["pdf1"]))) def tearDown(self): # Clean up test data diff --git a/backend/flashcards/tests.py b/backend/flashcards/tests.py index 398627e3..ed022c81 100644 --- a/backend/flashcards/tests.py +++ b/backend/flashcards/tests.py @@ -12,7 +12,7 @@ class TextToFlashcardTest(TestCase): - def setUp(self) -> None: + def setUp(self): self.context = "Revenge of the Sith is set three years after the onset of the Clone Wars as established in Attack of the Clones. The Jedi are spread across the galaxy in a full-scale war against the Separatists. The Jedi Council dispatches Jedi Master Obi-Wan Kenobi on a mission to defeat General Grievous, the head of the Separatist army and Count Dooku's former apprentice, to put an end to the war. Meanwhile, after having visions of his wife Padmé Amidala dying in childbirth, Jedi Knight Anakin Skywalker is tasked by the Council to spy on Palpatine, the Supreme Chancellor of the Galactic Republic and, secretly, a Sith Lord. Palpatine manipulates Anakin into turning to the dark side of the Force and becoming his apprentice, Darth Vader, with wide-ranging consequences for the galaxy." def test_openai_flashcard_generator(self): @@ -36,6 +36,6 @@ def test_parse_for_anki(self): self.assertIsInstance(anki_format, str) self.assertTrue(re.search("(.*:.*\n)*(.*:.*)", anki_format)) - def process_answer_test(self): + def test_process_answer(self): user_input = "Give me an example of the use of 'the' in a sentence?" self.assertFalse(None, process_answer(user_input)) From 13fd212563692186338ac8ff4c12947cdf5fc63d Mon Sep 17 00:00:00 2001 From: LockedInTheSkage Date: Tue, 23 Apr 2024 21:10:31 +0200 Subject: [PATCH 3/5] Interface test update --- backend/flashcards/knowledge_base/db_interface.py | 4 ++-- backend/flashcards/knowledge_base/tests.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/backend/flashcards/knowledge_base/db_interface.py b/backend/flashcards/knowledge_base/db_interface.py index a4a2526c..0c08746a 100644 --- a/backend/flashcards/knowledge_base/db_interface.py +++ b/backend/flashcards/knowledge_base/db_interface.py @@ -95,7 +95,7 @@ def get_curriculum(self, embedding: list[float]) -> list[Curriculum]: # Execute the query documents = self.collection.aggregate([query]) - + # print(list(documents)) if not documents: raise ValueError("No documents found") @@ -110,7 +110,7 @@ def get_curriculum(self, embedding: list[float]) -> list[Curriculum]: cosine_similarity(embedding, document["embedding"]) > self.similarity_threshold ): - results.append(Curriculum(text = document["text"], page_num = document["page_num"], pdf_name = document["pdf_name"])) + results.append(Curriculum(text = document["text"], page_num = document["pageNum"], pdf_name = document["pdfName"])) # Returns a list of relevant curriculum (can be 0, 1, 2, 3) return results diff --git a/backend/flashcards/knowledge_base/tests.py b/backend/flashcards/knowledge_base/tests.py index 942a93ad..f8d8524d 100644 --- a/backend/flashcards/knowledge_base/tests.py +++ b/backend/flashcards/knowledge_base/tests.py @@ -18,6 +18,7 @@ def setUp(self): self.mongo.post_curriculum(self.curriculum[key], 1, key, OpenAIEmbedding().get_embedding(self.curriculum[key])) def test_get_curriculum(self): + print("Get curriculum") # Test getting curriculum curriculum = self.mongo.get_curriculum(OpenAIEmbedding().get_embedding(self.curriculum["pdf1"])) self.assertEqual(len(curriculum), 1) @@ -33,11 +34,14 @@ def test_get_curriculum(self): self.assertEqual(curriculum[0].pdf_name, "pdf3") def test_delete_pdf_pages(self): + print("Delete stuff") # Test deleting curriculum entries with specific PDF name self.assertTrue(self.mongo.delete_pdf_pages("pdf1")) # Check if curriculum entries with pdfName="pdf1" were deleted - self.assertRaises(ValueError("No documents found"),self.mongo.get_curriculum(OpenAIEmbedding().get_embedding(self.curriculum["pdf1"]))) + curriculum = self.mongo.get_curriculum(OpenAIEmbedding().get_embedding(self.curriculum["pdf1"])) + + self.assertEqual(curriculum[0].pdf_name, "pdf2") def tearDown(self): # Clean up test data From 817e204554960a28dc297b15a1accb6516d56e2e Mon Sep 17 00:00:00 2001 From: LockedInTheSkage Date: Tue, 23 Apr 2024 21:26:08 +0200 Subject: [PATCH 4/5] test: Finish tests and DB interface --- backend/flashcards/knowledge_base/tests.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/backend/flashcards/knowledge_base/tests.py b/backend/flashcards/knowledge_base/tests.py index f8d8524d..ebbb8197 100644 --- a/backend/flashcards/knowledge_base/tests.py +++ b/backend/flashcards/knowledge_base/tests.py @@ -20,15 +20,13 @@ def setUp(self): def test_get_curriculum(self): print("Get curriculum") # Test getting curriculum - curriculum = self.mongo.get_curriculum(OpenAIEmbedding().get_embedding(self.curriculum["pdf1"])) - self.assertEqual(len(curriculum), 1) + curriculum = self.mongo.get_curriculum("pdf1",OpenAIEmbedding().get_embedding(self.curriculum["pdf1"])) self.assertEqual(curriculum[0].text, self.curriculum["pdf1"]) self.assertEqual(curriculum[0].page_num, 1) self.assertEqual(curriculum[0].pdf_name, "pdf1") # Test getting curriculum using query - curriculum = self.mongo.get_curriculum(OpenAIEmbedding().get_embedding("Den romersk-katolske kirken var ikke stor i Texas under revolusjonen. Sam Houston var personlig en protestant")) - self.assertEqual(len(curriculum), 1) + curriculum = self.mongo.get_curriculum("pdf3",OpenAIEmbedding().get_embedding("Den romersk-katolske kirken var ikke stor i Texas under revolusjonen. Sam Houston var personlig en protestant")) self.assertEqual(curriculum[0].text, self.curriculum["pdf3"]) self.assertEqual(curriculum[0].page_num, 1) self.assertEqual(curriculum[0].pdf_name, "pdf3") @@ -39,7 +37,7 @@ def test_delete_pdf_pages(self): self.assertTrue(self.mongo.delete_pdf_pages("pdf1")) # Check if curriculum entries with pdfName="pdf1" were deleted - curriculum = self.mongo.get_curriculum(OpenAIEmbedding().get_embedding(self.curriculum["pdf1"])) + curriculum = self.mongo.get_curriculum("pdf1",OpenAIEmbedding().get_embedding(self.curriculum["pdf1"])) self.assertEqual(curriculum[0].pdf_name, "pdf2") From e59b52585d02f6a0ac8e1ab28e793fae46b4ca5c Mon Sep 17 00:00:00 2001 From: LockedInTheSkage Date: Tue, 23 Apr 2024 21:37:17 +0200 Subject: [PATCH 5/5] refactor: split tests and remove print statements --- backend/flashcards/knowledge_base/db_interface.py | 3 +-- backend/flashcards/knowledge_base/tests.py | 13 ++++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/backend/flashcards/knowledge_base/db_interface.py b/backend/flashcards/knowledge_base/db_interface.py index f26705a0..0aef312b 100644 --- a/backend/flashcards/knowledge_base/db_interface.py +++ b/backend/flashcards/knowledge_base/db_interface.py @@ -79,7 +79,7 @@ def delete_all_curriculum(self) -> bool: pass class MongoDB(DatabaseInterface): - def __init__(self, uri=Config().MONGODB_URI): + def __init__(self, uri:str=Config().MONGODB_URI): self.client = MongoClient(uri) self.db = self.client["test-curriculum-database"] self.collection = self.db["test-curriculum-collection"] @@ -105,7 +105,6 @@ def get_curriculum(self, pdf_name: str, embedding: list[float]) -> list[Page]: # Execute the query documents = self.collection.aggregate([query]) - # print(list(documents)) if not documents: raise ValueError("No documents found") diff --git a/backend/flashcards/knowledge_base/tests.py b/backend/flashcards/knowledge_base/tests.py index ebbb8197..09b85b94 100644 --- a/backend/flashcards/knowledge_base/tests.py +++ b/backend/flashcards/knowledge_base/tests.py @@ -17,22 +17,21 @@ def setUp(self): for key in self.curriculum.keys(): self.mongo.post_curriculum(self.curriculum[key], 1, key, OpenAIEmbedding().get_embedding(self.curriculum[key])) - def test_get_curriculum(self): - print("Get curriculum") - # Test getting curriculum - curriculum = self.mongo.get_curriculum("pdf1",OpenAIEmbedding().get_embedding(self.curriculum["pdf1"])) + def test_get_curriculum_for_embedding_for_the_same_document_content(self): + # Test getting curriculum for from the same text + curriculum = self.mongo.get_curriculum("pdf1", OpenAIEmbedding().get_embedding(self.curriculum["pdf1"])) self.assertEqual(curriculum[0].text, self.curriculum["pdf1"]) self.assertEqual(curriculum[0].page_num, 1) self.assertEqual(curriculum[0].pdf_name, "pdf1") - # Test getting curriculum using query - curriculum = self.mongo.get_curriculum("pdf3",OpenAIEmbedding().get_embedding("Den romersk-katolske kirken var ikke stor i Texas under revolusjonen. Sam Houston var personlig en protestant")) + def test_get_curriculum_for_query_similar_to_pdf(self): + # Test getting curriculum using query similar text + curriculum = self.mongo.get_curriculum("pdf3", OpenAIEmbedding().get_embedding("Den romersk-katolske kirken var ikke stor i Texas under revolusjonen. Sam Houston var personlig en protestant")) self.assertEqual(curriculum[0].text, self.curriculum["pdf3"]) self.assertEqual(curriculum[0].page_num, 1) self.assertEqual(curriculum[0].pdf_name, "pdf3") def test_delete_pdf_pages(self): - print("Delete stuff") # Test deleting curriculum entries with specific PDF name self.assertTrue(self.mongo.delete_pdf_pages("pdf1"))