From c417826d69055e9a8580a0ba29173dc28339c092 Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sat, 15 Jun 2024 17:00:28 +0100 Subject: [PATCH 01/21] Splitting text into sentence for the voice interface output --- wafl/interface/voice_interface.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/wafl/interface/voice_interface.py b/wafl/interface/voice_interface.py index 29190d4c..319f32b2 100644 --- a/wafl/interface/voice_interface.py +++ b/wafl/interface/voice_interface.py @@ -2,6 +2,8 @@ import random import re +import nltk + from wafl.events.utils import remove_text_between_brackets from wafl.interface.base_interface import BaseInterface from wafl.interface.utils import not_good_enough @@ -55,7 +57,8 @@ async def output(self, text: str, silent: bool = False): text = text self._insert_utterance(speaker="bot", text=text) print(COLOR_START + "bot> " + text + COLOR_END) - await self._speaker.speak(text) + for sentence in nltk.sent_tokenize(text): + await self._speaker.speak(sentence) self.bot_has_spoken(True) async def input(self) -> str: From c6ef1bab1f7ec1bba631d438d220534da338a18b Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sat, 15 Jun 2024 17:51:25 +0100 Subject: [PATCH 02/21] added cache file --- requirements.txt | 3 ++- setup.py | 1 + todo.txt | 4 +++- wafl/events/utils.py | 13 ++++++++++++- wafl/knowledge/single_file_knowledge.py | 6 ++++++ wafl/templates/config.json | 1 + 6 files changed, 25 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index da1a90d4..c01039f9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,4 +17,5 @@ sphinx-rtd-theme==1.2.0 bluepy==1.3.0 einops==0.6.1 g2p-en==2.1.0 -pyyaml==6.0.1 \ No newline at end of file +pyyaml==6.0.1 +joblib==1.4.2 diff --git a/setup.py b/setup.py index 18461523..2c9477a3 100644 --- a/setup.py +++ b/setup.py @@ -61,6 +61,7 @@ "einops==0.6.1", "g2p-en==2.1.0", "pyyaml==6.0.1", + "joblib==1.4.2", ], classifiers=[ "License :: OSI Approved :: MIT 
License", diff --git a/todo.txt b/todo.txt index 74dca50c..38b48625 100644 --- a/todo.txt +++ b/todo.txt @@ -1,5 +1,7 @@ wafl: -- create indices +- create indice +- silence output when someone speak +- multiple models in wafl-llm, with selection from frontend training: - retrain phi3 diff --git a/wafl/events/utils.py b/wafl/events/utils.py index 52734ea2..a074a432 100644 --- a/wafl/events/utils.py +++ b/wafl/events/utils.py @@ -1,5 +1,9 @@ +import joblib +import os import re + + from wafl.knowledge.single_file_knowledge import SingleFileKnowledge from wafl.simple_text_processing.normalize import normalized @@ -28,4 +32,11 @@ def load_knowledge(config, logger): else: rules_txt = config.get_value("rules") - return SingleFileKnowledge(config, rules_txt, logger=logger) + if os.path.exists(config.get_value("index_filename")): + knowledge = joblib.load(config.get_value("index_filename")) + if knowledge.hash == hash(rules_txt): + return knowledge + + knowledge = SingleFileKnowledge(config, rules_txt, logger=logger) + joblib.dump(knowledge, config.get_value("index_filename")) + return knowledge diff --git a/wafl/knowledge/single_file_knowledge.py b/wafl/knowledge/single_file_knowledge.py index 747c5a15..a9741b0f 100644 --- a/wafl/knowledge/single_file_knowledge.py +++ b/wafl/knowledge/single_file_knowledge.py @@ -1,5 +1,6 @@ import asyncio import logging +import random from typing import List import nltk @@ -32,6 +33,11 @@ class SingleFileKnowledge(BaseKnowledge): _max_rules_per_type = 3 def __init__(self, config, rules_text=None, logger=None): + if rules_text: + self.hash = hash(rules_text) + else: + self.hash = str(random.randint(0, 1000000)) + self._logger = logger self._facts_dict = {} self._rules_dict = {} diff --git a/wafl/templates/config.json b/wafl/templates/config.json index 0f0eccf3..ebf30b01 100644 --- a/wafl/templates/config.json +++ b/wafl/templates/config.json @@ -4,6 +4,7 @@ "deactivate_sound": true, "rules": "rules.yaml", "functions": "functions.py", + 
"index_filename": "knowledge_cache", "frontend_port": 8090, "llm_model": { "model_host": "localhost", From e7fffb75f75a5a1a9089f26ca7a0e07e7d000ee6 Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sat, 15 Jun 2024 20:30:41 +0100 Subject: [PATCH 03/21] deleted the one liner query when the bot repeats itself --- todo.txt | 4 +++- wafl/answerer/dialogue_answerer.py | 10 ---------- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/todo.txt b/todo.txt index 38b48625..19205660 100644 --- a/todo.txt +++ b/todo.txt @@ -1,5 +1,7 @@ wafl: -- create indice +/- create indices +- allow files/folders to be indexed (modify rules.yaml and then re-index) +- add keywords in retrieval from tfidf - silence output when someone speak - multiple models in wafl-llm, with selection from frontend diff --git a/wafl/answerer/dialogue_answerer.py b/wafl/answerer/dialogue_answerer.py index 37b042b4..ec0961a7 100644 --- a/wafl/answerer/dialogue_answerer.py +++ b/wafl/answerer/dialogue_answerer.py @@ -51,10 +51,6 @@ async def answer(self, query_text: str) -> Answer: rules_text = await self._get_relevant_rules(conversation) if not conversation: conversation = create_one_liner(query_text) - last_bot_utterances = conversation.get_last_speaker_utterances("bot", 3) - last_user_utterance = conversation.get_last_speaker_utterances("user", 1) - if not last_user_utterance: - last_user_utterance = query_text conversational_timestamp = len(conversation) facts = await self._get_relevant_facts( query, @@ -73,12 +69,6 @@ async def answer(self, query_text: str) -> Answer: answer_text, memories = await self._apply_substitutions( original_answer_text ) - if answer_text in last_bot_utterances and not is_executable( - original_answer_text - ): - conversation = create_one_liner(last_user_utterance[-1]) - continue - if self._delete_current_rule in answer_text: self._prior_rules = [] final_answer_text += answer_text From 1e3c4ac20ed4191cfd3d9b7d5f7eed669ae0a986 Mon Sep 17 00:00:00 2001 From: Alberto 
Cetoli Date: Sat, 15 Jun 2024 20:34:21 +0100 Subject: [PATCH 04/21] adding fake memory to increase rule retrieval --- wafl/answerer/dialogue_answerer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/wafl/answerer/dialogue_answerer.py b/wafl/answerer/dialogue_answerer.py index ec0961a7..0f58a2f2 100644 --- a/wafl/answerer/dialogue_answerer.py +++ b/wafl/answerer/dialogue_answerer.py @@ -113,6 +113,9 @@ async def _get_relevant_facts( "The bot can answer the question while informing the user that the answer was not retrieved" ) + if has_prior_rules: + memory += f"\nThe user wants the bot to answer the query using the rules." + return memory async def _get_relevant_rules(self, conversation: Conversation) -> List[str]: From 8cac4e359e977ec9b676cafe05aba25eed991133 Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sat, 15 Jun 2024 20:47:01 +0100 Subject: [PATCH 05/21] not deleting the rules after they are completed --- wafl/answerer/dialogue_answerer.py | 7 +++++-- wafl/answerer/rule_maker.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/wafl/answerer/dialogue_answerer.py b/wafl/answerer/dialogue_answerer.py index 0f58a2f2..a4a6ef8f 100644 --- a/wafl/answerer/dialogue_answerer.py +++ b/wafl/answerer/dialogue_answerer.py @@ -14,14 +14,14 @@ from wafl.answerer.rule_maker import RuleMaker from wafl.connectors.clients.llm_chitchat_answer_client import LLMChitChatAnswerClient from wafl.extractors.dataclasses import Query, Answer -from wafl.interface.conversation import Conversation, Utterance +from wafl.interface.conversation import Conversation from wafl.simple_text_processing.questions import is_question class DialogueAnswerer(BaseAnswerer): def __init__(self, config, knowledge, interface, code_path, logger): self._threshold_for_facts = 0.85 - self._delete_current_rule = "[delete_rule]" + self._delete_current_rule = "" self._client = LLMChitChatAnswerClient(config) self._knowledge = knowledge self._logger = logger @@ -75,6 +75,9 @@ async def 
answer(self, query_text: str) -> Answer: break final_answer_text += answer_text + if final_answer_text.strip() == self._delete_current_rule: + continue + if not memories: break diff --git a/wafl/answerer/rule_maker.py b/wafl/answerer/rule_maker.py index e6fd87df..b246211b 100644 --- a/wafl/answerer/rule_maker.py +++ b/wafl/answerer/rule_maker.py @@ -31,7 +31,7 @@ async def create_from_query(self, conversation: "Conversation") -> List[str]: rules_text = rule.get_string_using_template( "- {effect}:\n{clauses}\n" + rule.indent_str - + f'- After you completed all the steps output "{self._delete_current_rule}".\n' +# + f'- After you completed all the steps output "{self._delete_current_rule}".\n' ) rules_texts.append(rules_text) await self._interface.add_fact(f"The bot remembers the rule:\n{rules_text}") From 90f34a51d9df33008e28a9b22bf00a12d729472f Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sat, 15 Jun 2024 20:49:42 +0100 Subject: [PATCH 06/21] never deleting the rules --- wafl/answerer/dialogue_answerer.py | 11 ----------- wafl/answerer/rule_maker.py | 3 --- 2 files changed, 14 deletions(-) diff --git a/wafl/answerer/dialogue_answerer.py b/wafl/answerer/dialogue_answerer.py index a4a6ef8f..cbaf2451 100644 --- a/wafl/answerer/dialogue_answerer.py +++ b/wafl/answerer/dialogue_answerer.py @@ -21,7 +21,6 @@ class DialogueAnswerer(BaseAnswerer): def __init__(self, config, knowledge, interface, code_path, logger): self._threshold_for_facts = 0.85 - self._delete_current_rule = "" self._client = LLMChitChatAnswerClient(config) self._knowledge = knowledge self._logger = logger @@ -38,7 +37,6 @@ def __init__(self, config, knowledge, interface, code_path, logger): config, interface, max_num_rules=1, - delete_current_rule=self._delete_current_rule, ) async def answer(self, query_text: str) -> Answer: @@ -69,14 +67,8 @@ async def answer(self, query_text: str) -> Answer: answer_text, memories = await self._apply_substitutions( original_answer_text ) - if 
self._delete_current_rule in answer_text: - self._prior_rules = [] - final_answer_text += answer_text - break final_answer_text += answer_text - if final_answer_text.strip() == self._delete_current_rule: - continue if not memories: break @@ -116,9 +108,6 @@ async def _get_relevant_facts( "The bot can answer the question while informing the user that the answer was not retrieved" ) - if has_prior_rules: - memory += f"\nThe user wants the bot to answer the query using the rules." - return memory async def _get_relevant_rules(self, conversation: Conversation) -> List[str]: diff --git a/wafl/answerer/rule_maker.py b/wafl/answerer/rule_maker.py index b246211b..1905ba18 100644 --- a/wafl/answerer/rule_maker.py +++ b/wafl/answerer/rule_maker.py @@ -11,14 +11,12 @@ def __init__( config: "BaseConfig", interface: "BaseInterface", max_num_rules: int, - delete_current_rule: str, max_recursion: int = 3, ): self._knowledge = knowledge self._config = config self._interface = interface self._max_num_rules = max_num_rules - self._delete_current_rule = delete_current_rule if not config.get_value("max_recursion"): self._max_indentation = max_recursion else: @@ -31,7 +29,6 @@ async def create_from_query(self, conversation: "Conversation") -> List[str]: rules_text = rule.get_string_using_template( "- {effect}:\n{clauses}\n" + rule.indent_str -# + f'- After you completed all the steps output "{self._delete_current_rule}".\n' ) rules_texts.append(rules_text) await self._interface.add_fact(f"The bot remembers the rule:\n{rules_text}") From b688e74ab3a6ed93c7b25257cf539df19f0d5594 Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sat, 22 Jun 2024 19:30:53 +0100 Subject: [PATCH 07/21] Added readers for paths to index --- wafl/events/utils.py | 20 +++++++++++++++++--- wafl/readers/base_reader.py | 16 ++++++++++++++++ wafl/readers/pdf_reader.py | 15 +++++++++++++++ wafl/readers/reader_factory.py | 15 +++++++++++++++ wafl/readers/text_reader.py | 18 ++++++++++++++++++ 5 files changed, 81 
insertions(+), 3 deletions(-) create mode 100644 wafl/readers/base_reader.py create mode 100644 wafl/readers/pdf_reader.py create mode 100644 wafl/readers/reader_factory.py create mode 100644 wafl/readers/text_reader.py diff --git a/wafl/events/utils.py b/wafl/events/utils.py index a074a432..96c6302c 100644 --- a/wafl/events/utils.py +++ b/wafl/events/utils.py @@ -1,10 +1,10 @@ import joblib import os import re - - +import yaml from wafl.knowledge.single_file_knowledge import SingleFileKnowledge +from wafl.readers.reader_factory import ReaderFactory from wafl.simple_text_processing.normalize import normalized @@ -22,9 +22,22 @@ def remove_text_between_brackets(text: str) -> str: return re.sub(r"(\[.*?\])", "", text) + +def _add_indices_to_knowledge(knowledge, config, logger): + filename = config.get_value("index") + indices = yaml.safe_load(filename) + for path in indices["paths"]: + for root, _, files in os.walk(path): + for file in files: + with open(os.path.join(root, file)) as f: + reader = ReaderFactory.get_reader(file) + for chunk in reader.get_chunks(f.read()): + knowledge.add(chunk) + + def load_knowledge(config, logger): if ".yaml" in config.get_value("rules") and not any( - item in config.get_value("rules") for item in [" ", "\n"] + item in config.get_value("rules") for item in [" ", "\n"] #### ALLOW INDEXING FROM rules.yaml ): with open(config.get_value("rules")) as file: rules_txt = file.read() @@ -38,5 +51,6 @@ def load_knowledge(config, logger): return knowledge knowledge = SingleFileKnowledge(config, rules_txt, logger=logger) + knowledge = _add_indices_to_knowledge(knowledge, config, logger) joblib.dump(knowledge, config.get_value("index_filename")) return knowledge diff --git a/wafl/readers/base_reader.py b/wafl/readers/base_reader.py new file mode 100644 index 00000000..0d93cbe9 --- /dev/null +++ b/wafl/readers/base_reader.py @@ -0,0 +1,16 @@ +from typing import List + + +class BaseReader: + def read(self, file_path: str) -> str: + raise 
NotImplementedError() + + def get_chunks(self, text: str) -> List[str]: + raise NotImplementedError() + + def _chunk_text(self, text: str, size: int, overlap: int) -> List[str]: + chunks = [] + for i in range(0, len(text), size - overlap): + chunks.append(text[i:i + size]) + return chunks + diff --git a/wafl/readers/pdf_reader.py b/wafl/readers/pdf_reader.py new file mode 100644 index 00000000..03b28a4d --- /dev/null +++ b/wafl/readers/pdf_reader.py @@ -0,0 +1,15 @@ +from logging import getLogger + +from wafl.readers.base_reader import BaseReader + +_logger = getLogger(__name__) + + +class PdfReader(BaseReader): + def __init__(self, chunk_size: int, overlap: int): + self.chunk_size = chunk_size + self.overlap = overlap + + def get_chunks(self, filename): + _logger.info(f"Reading PDF file: {filename}") + diff --git a/wafl/readers/reader_factory.py b/wafl/readers/reader_factory.py new file mode 100644 index 00000000..ee1c61c1 --- /dev/null +++ b/wafl/readers/reader_factory.py @@ -0,0 +1,15 @@ +from wafl.readers.pdf_reader import PdfReader +from wafl.readers.text_reader import TextReader + + +class ReaderFactory: + _chunk_size = 1000 + _overlap = 100 + + @staticmethod + def get_reader(filename): + if ".pdf" in filename.lower(): + return PdfReader(ReaderFactory._chunk_size, ReaderFactory._overlap) + else: + return TextReader(ReaderFactory._chunk_size, ReaderFactory._overlap) + diff --git a/wafl/readers/text_reader.py b/wafl/readers/text_reader.py new file mode 100644 index 00000000..a6924437 --- /dev/null +++ b/wafl/readers/text_reader.py @@ -0,0 +1,18 @@ +from logging import getLogger + +from wafl.readers.base_reader import BaseReader + +_logger = getLogger(__name__) + + +class TextReader(BaseReader): + def __init__(self, chunk_size: int, overlap: int): + self.chunk_size = chunk_size + self.overlap = overlap + + def get_chunks(self, filename): + _logger.info(f"Reading text file: {filename}") + with open(filename, "r") as file: + return self._chunk_text(file.read(), 
self.chunk_size, self.overlap) + + From 02581458c4b348f3b5cd568c8c02e7c11529bbf9 Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sun, 23 Jun 2024 18:50:04 +0100 Subject: [PATCH 08/21] added test for appending an index path --- tests/config.json | 1 + tests/indices.yaml | 2 + tests/test_indexing.py | 29 +++++++++++ wafl/command_line.py | 16 ++++-- wafl/events/conversation_events.py | 3 +- wafl/events/utils.py | 36 -------------- wafl/knowledge/indexing_implementation.py | 60 +++++++++++++++++++++++ wafl/knowledge/utils.py | 6 +-- wafl/readers/reader_factory.py | 15 ++++-- 9 files changed, 117 insertions(+), 51 deletions(-) create mode 100644 tests/indices.yaml create mode 100644 tests/test_indexing.py create mode 100644 wafl/knowledge/indexing_implementation.py diff --git a/tests/config.json b/tests/config.json index 09c727b1..9ad52379 100644 --- a/tests/config.json +++ b/tests/config.json @@ -3,6 +3,7 @@ "waking_up_sound": true, "deactivate_sound": true, "rules": "rules.yaml", + "index": "indices.yaml", "functions": "functions.py", "max_recursion": 2, "llm_model": { diff --git a/tests/indices.yaml b/tests/indices.yaml new file mode 100644 index 00000000..51c91ca2 --- /dev/null +++ b/tests/indices.yaml @@ -0,0 +1,2 @@ +paths: +- ../../tests/files_to_index/ diff --git a/tests/test_indexing.py b/tests/test_indexing.py new file mode 100644 index 00000000..47b63181 --- /dev/null +++ b/tests/test_indexing.py @@ -0,0 +1,29 @@ +import os +import yaml + +from unittest import TestCase +from wafl.knowledge.indexing_implementation import add_to_index + +_path = os.path.dirname(__file__) + + + +class TestIndexing(TestCase): + def test__path_can_be_added_to_index(self): + data = _load_index() + prior_count = len(data["paths"]) + add_to_index("files_to_index") + + data = _load_index() + current_count = len(data["paths"]) + self.assertEqual(current_count, prior_count + 1) + + data["paths"].remove("files_to_index") + with open("indices.yaml", "w") as file: + 
file.write(yaml.dump(data)) + + + +def _load_index(): + with open("indices.yaml", "r") as file: + return yaml.safe_load(file.read()) \ No newline at end of file diff --git a/wafl/command_line.py b/wafl/command_line.py index 94ec8a1c..b0d8c52a 100644 --- a/wafl/command_line.py +++ b/wafl/command_line.py @@ -2,6 +2,7 @@ import sys from wafl.config import create_initial_files +from wafl.knowledge.indexing_implementation import add_to_index from wafl.parsing.preprocess import remove_preprocessed from wafl.run import ( run_from_command_line, @@ -19,9 +20,8 @@ def print_help(): print("\n") print("These are the available commands:") print("> wafl init: Initialize the current folder") - print( - "> wafl run: Starts all the available interfaces of the chatbot at the same time" - ) + print("> wafl add : Add the file or folder at to the index") + print("> wafl run: Starts the chatbot on the web interface and the audio interface.") print("> wafl run-cli: Run a cli version of the chatbot") print("> wafl run-audio: Run a voice-powered version of the chatbot") print("> wafl run-server: Run a webserver version of the chatbot") @@ -71,12 +71,18 @@ def process_cli(): elif command == "run-action": if len(arguments) > 2: action_name = arguments[2] + run_action(action_name) else: print("Please provide the action name as the second argument.") - return - run_action(action_name) + elif command == "add": + if len(arguments) > 2: + path = arguments[2] + add_to_index(path) + + else: + print("Please provide the path as the second argument.") elif command == "help": print_help() diff --git a/wafl/events/conversation_events.py b/wafl/events/conversation_events.py index 0ef99469..b9673558 100644 --- a/wafl/events/conversation_events.py +++ b/wafl/events/conversation_events.py @@ -4,7 +4,8 @@ from wafl.events.answerer_creator import create_answerer from wafl.simple_text_processing.normalize import normalized from wafl.config import Configuration -from wafl.events.utils import input_is_valid, 
load_knowledge +from wafl.events.utils import input_is_valid +from wafl.knowledge.indexing_implementation import load_knowledge from wafl.simple_text_processing.questions import is_question from wafl.exceptions import InterruptTask diff --git a/wafl/events/utils.py b/wafl/events/utils.py index 96c6302c..4ca97f0e 100644 --- a/wafl/events/utils.py +++ b/wafl/events/utils.py @@ -1,10 +1,5 @@ -import joblib -import os import re -import yaml -from wafl.knowledge.single_file_knowledge import SingleFileKnowledge -from wafl.readers.reader_factory import ReaderFactory from wafl.simple_text_processing.normalize import normalized @@ -23,34 +18,3 @@ def remove_text_between_brackets(text: str) -> str: -def _add_indices_to_knowledge(knowledge, config, logger): - filename = config.get_value("index") - indices = yaml.safe_load(filename) - for path in indices["paths"]: - for root, _, files in os.walk(path): - for file in files: - with open(os.path.join(root, file)) as f: - reader = ReaderFactory.get_reader(file) - for chunk in reader.get_chunks(f.read()): - knowledge.add(chunk) - - -def load_knowledge(config, logger): - if ".yaml" in config.get_value("rules") and not any( - item in config.get_value("rules") for item in [" ", "\n"] #### ALLOW INDEXING FROM rules.yaml - ): - with open(config.get_value("rules")) as file: - rules_txt = file.read() - - else: - rules_txt = config.get_value("rules") - - if os.path.exists(config.get_value("index_filename")): - knowledge = joblib.load(config.get_value("index_filename")) - if knowledge.hash == hash(rules_txt): - return knowledge - - knowledge = SingleFileKnowledge(config, rules_txt, logger=logger) - knowledge = _add_indices_to_knowledge(knowledge, config, logger) - joblib.dump(knowledge, config.get_value("index_filename")) - return knowledge diff --git a/wafl/knowledge/indexing_implementation.py b/wafl/knowledge/indexing_implementation.py new file mode 100644 index 00000000..07827541 --- /dev/null +++ 
b/wafl/knowledge/indexing_implementation.py @@ -0,0 +1,60 @@ +import os + +import joblib +import yaml + +from wafl.config import Configuration +from wafl.knowledge.single_file_knowledge import SingleFileKnowledge +from wafl.readers.reader_factory import ReaderFactory + + +def _add_indices_to_knowledge(knowledge, text): + indices = yaml.safe_load(text) + for path in indices["paths"]: + for root, _, files in os.walk(path): + for file in files: + with open(os.path.join(root, file)) as f: + reader = ReaderFactory.get_reader(file) + for chunk in reader.get_chunks(f.read()): + knowledge.add(chunk) + + return knowledge + + +def load_knowledge(config, logger): + if ".yaml" in config.get_value("rules") and not any( + item in config.get_value("rules") for item in [" ", "\n"] + ): + with open(config.get_value("rules")) as file: + rules_txt = file.read() + + else: + rules_txt = config.get_value("rules") + + index_filename = config.get_value("index") + with open(index_filename) as file: + index_txt = file.read() + + if os.path.exists(config.get_value("index_filename")): + knowledge = joblib.load(config.get_value("index_filename")) + if knowledge.hash == hash(rules_txt + index_txt): + return knowledge + + knowledge = SingleFileKnowledge(config, rules_txt, logger=logger) + knowledge = _add_indices_to_knowledge(knowledge, index_txt) + joblib.dump(knowledge, config.get_value("index_filename")) + return knowledge + + +def add_to_index(path): + config = Configuration.load_local_config() + index_filename = config.get_value("index") + with open(index_filename) as file: + indices = yaml.safe_load(file.read()) + if path in indices["paths"]: + return + + indices["paths"].append(path) + + with open(index_filename, "w") as file: + yaml.dump(indices, file) \ No newline at end of file diff --git a/wafl/knowledge/utils.py b/wafl/knowledge/utils.py index 021a2b9e..45768f1d 100644 --- a/wafl/knowledge/utils.py +++ b/wafl/knowledge/utils.py @@ -1,7 +1,3 @@ -import 
wafl.simple_text_processing.questions -from wafl.simple_text_processing.questions import get_sentence_from_yn_question - - def text_is_exact_string(text): return text.strip() and text.strip()[0] == "_" @@ -73,3 +69,5 @@ def needs_substitutions(effect): return True return False + + diff --git a/wafl/readers/reader_factory.py b/wafl/readers/reader_factory.py index ee1c61c1..6a41a783 100644 --- a/wafl/readers/reader_factory.py +++ b/wafl/readers/reader_factory.py @@ -5,11 +5,16 @@ class ReaderFactory: _chunk_size = 1000 _overlap = 100 - + _extension_to_reader_dict = { + ".pdf": PdfReader, + ".txt": TextReader + } + @staticmethod def get_reader(filename): - if ".pdf" in filename.lower(): - return PdfReader(ReaderFactory._chunk_size, ReaderFactory._overlap) - else: - return TextReader(ReaderFactory._chunk_size, ReaderFactory._overlap) + for extension, reader in ReaderFactory._extension_to_reader_dict.items(): + if extension in filename.lower(): + return reader(ReaderFactory._chunk_size, ReaderFactory._overlap) + + return TextReader(ReaderFactory._chunk_size, ReaderFactory._overlap) From 885ed64e50a9509df02c2599382a39a76a3f0a3e Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sun, 23 Jun 2024 19:19:57 +0100 Subject: [PATCH 09/21] Added tests for indexing --- tests/config.json | 1 + tests/indices.yaml | 2 +- tests/test_indexing.py | 17 ++++++++++++++--- wafl/events/conversation_events.py | 13 +++++++++++-- wafl/knowledge/indexing_implementation.py | 13 ++++++------- 5 files changed, 33 insertions(+), 13 deletions(-) diff --git a/tests/config.json b/tests/config.json index 9ad52379..3589a0ff 100644 --- a/tests/config.json +++ b/tests/config.json @@ -4,6 +4,7 @@ "deactivate_sound": true, "rules": "rules.yaml", "index": "indices.yaml", + "index_filename": "knowledge_cache", "functions": "functions.py", "max_recursion": 2, "llm_model": { diff --git a/tests/indices.yaml b/tests/indices.yaml index 51c91ca2..74d41f7c 100644 --- a/tests/indices.yaml +++ b/tests/indices.yaml @@ 
-1,2 +1,2 @@ paths: -- ../../tests/files_to_index/ +- files_to_index/ diff --git a/tests/test_indexing.py b/tests/test_indexing.py index 47b63181..d6a3198c 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -1,8 +1,12 @@ +import asyncio import os import yaml from unittest import TestCase -from wafl.knowledge.indexing_implementation import add_to_index + +from wafl.config import Configuration +from wafl.extractors.dataclasses import Query +from wafl.knowledge.indexing_implementation import add_to_index, load_knowledge _path = os.path.dirname(__file__) @@ -12,16 +16,23 @@ class TestIndexing(TestCase): def test__path_can_be_added_to_index(self): data = _load_index() prior_count = len(data["paths"]) - add_to_index("files_to_index") + add_to_index("files_to_index2") data = _load_index() current_count = len(data["paths"]) self.assertEqual(current_count, prior_count + 1) - data["paths"].remove("files_to_index") + data["paths"].remove("files_to_index2") with open("indices.yaml", "w") as file: file.write(yaml.dump(data)) + def test__indexed_files_can_be_retrieved(self): + config = Configuration.load_local_config() + knowledge = asyncio.run(load_knowledge(config)) + results = asyncio.run(knowledge.ask_for_facts(Query.create_from_text("How do I start WAFL"))) + expected = "WAFL" + self.assertIn(expected, results[0].text) + def _load_index(): diff --git a/wafl/events/conversation_events.py b/wafl/events/conversation_events.py index b9673558..8c4db48f 100644 --- a/wafl/events/conversation_events.py +++ b/wafl/events/conversation_events.py @@ -1,3 +1,4 @@ +import asyncio import os import re @@ -20,7 +21,15 @@ def __init__( logger=None, ): self._config = config - self._knowledge = load_knowledge(config, logger) + try: + loop = asyncio.get_running_loop() + + except RuntimeError: + loop = None + + if not loop or not loop.is_running(): + self._knowledge = asyncio.run(load_knowledge(config, logger)) + self._answerer = create_answerer(config, self._knowledge, 
interface, logger) self._answerer._client._connector._cache = {} self._interface = interface @@ -103,7 +112,7 @@ async def process_next(self, activation_word: str = "") -> bool: return False async def reload_knowledge(self): - self._knowledge = load_knowledge(self._config, self._logger) + self._knowledge = await load_knowledge(self._config, self._logger) await self._knowledge.initialize_retrievers() def is_computing(self): diff --git a/wafl/knowledge/indexing_implementation.py b/wafl/knowledge/indexing_implementation.py index 07827541..71267023 100644 --- a/wafl/knowledge/indexing_implementation.py +++ b/wafl/knowledge/indexing_implementation.py @@ -8,20 +8,19 @@ from wafl.readers.reader_factory import ReaderFactory -def _add_indices_to_knowledge(knowledge, text): +async def _add_indices_to_knowledge(knowledge, text): indices = yaml.safe_load(text) for path in indices["paths"]: for root, _, files in os.walk(path): for file in files: - with open(os.path.join(root, file)) as f: - reader = ReaderFactory.get_reader(file) - for chunk in reader.get_chunks(f.read()): - knowledge.add(chunk) + reader = ReaderFactory.get_reader(file) + for chunk in reader.get_chunks(os.path.join(root, file)): + await knowledge.add(chunk) return knowledge -def load_knowledge(config, logger): +async def load_knowledge(config, logger=None): if ".yaml" in config.get_value("rules") and not any( item in config.get_value("rules") for item in [" ", "\n"] ): @@ -41,7 +40,7 @@ def load_knowledge(config, logger): return knowledge knowledge = SingleFileKnowledge(config, rules_txt, logger=logger) - knowledge = _add_indices_to_knowledge(knowledge, index_txt) + knowledge = await _add_indices_to_knowledge(knowledge, index_txt) joblib.dump(knowledge, config.get_value("index_filename")) return knowledge From 8025d9cd2113d63de91f1827f04f1652b15702fc Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Mon, 24 Jun 2024 16:49:26 +0100 Subject: [PATCH 10/21] initializing the retrievers when loading the knowledge 
--- tests/test_rules.py | 4 ++-- todo.txt | 6 ++++++ wafl/events/conversation_events.py | 1 - wafl/knowledge/indexing_implementation.py | 3 +++ 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/test_rules.py b/tests/test_rules.py index 9a901f51..9c2dbd94 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -39,8 +39,8 @@ def test__rules_can_be_triggered(self): interface=interface, ) asyncio.run(conversation_events.process_next()) - expected = "The horse is tall" - self.assertIn(expected, interface.get_utterances_list()[-1]) + expected = "the horse is tall" + self.assertIn(expected, interface.get_utterances_list()[-1].lower()) def test__rules_are_not_always_triggered(self): interface = DummyInterface( diff --git a/todo.txt b/todo.txt index 19205660..87393549 100644 --- a/todo.txt +++ b/todo.txt @@ -1,3 +1,9 @@ +None of the knowledge is loaded from the web interface. Why? +- you have just changed the load_knowledge function to make it async. + + + + wafl: /- create indices - allow files/folders to be indexed (modify rules.yaml and then re-index) diff --git a/wafl/events/conversation_events.py b/wafl/events/conversation_events.py index 8c4db48f..d83d52ce 100644 --- a/wafl/events/conversation_events.py +++ b/wafl/events/conversation_events.py @@ -113,7 +113,6 @@ async def process_next(self, activation_word: str = "") -> bool: async def reload_knowledge(self): self._knowledge = await load_knowledge(self._config, self._logger) - await self._knowledge.initialize_retrievers() def is_computing(self): return self._is_computing diff --git a/wafl/knowledge/indexing_implementation.py b/wafl/knowledge/indexing_implementation.py index 71267023..56904290 100644 --- a/wafl/knowledge/indexing_implementation.py +++ b/wafl/knowledge/indexing_implementation.py @@ -31,6 +31,8 @@ async def load_knowledge(config, logger=None): rules_txt = config.get_value("rules") index_filename = config.get_value("index") + if not os.path.exists(index_filename): + raise 
RuntimeError(f"Index file {index_filename} does not exist.") with open(index_filename) as file: index_txt = file.read() @@ -42,6 +44,7 @@ async def load_knowledge(config, logger=None): knowledge = SingleFileKnowledge(config, rules_txt, logger=logger) knowledge = await _add_indices_to_knowledge(knowledge, index_txt) joblib.dump(knowledge, config.get_value("index_filename")) + await knowledge.initialize_retrievers() return knowledge From 3984bd066b083e0aa43f12963867cfeb4158346d Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Mon, 24 Jun 2024 17:07:55 +0100 Subject: [PATCH 11/21] adjusted template --- wafl/templates/config.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/wafl/templates/config.json b/wafl/templates/config.json index ebf30b01..a0119c57 100644 --- a/wafl/templates/config.json +++ b/wafl/templates/config.json @@ -3,8 +3,10 @@ "waking_up_sound": true, "deactivate_sound": true, "rules": "rules.yaml", - "functions": "functions.py", + "index": "indices.yaml", "index_filename": "knowledge_cache", + "functions": "functions.py", + "max_recursion": 2, "frontend_port": 8090, "llm_model": { "model_host": "localhost", From c93b5031913925b6f6307430fe7784a9172d9cd8 Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Fri, 5 Jul 2024 16:43:42 +0100 Subject: [PATCH 12/21] added an info icon --- todo.txt | 17 +++++++++++-- wafl/connectors/clients/information_client.py | 17 +++++++++++++ .../connectors/remote/remote_llm_connector.py | 25 +++++++++++++++++++ wafl/frontend/index.html | 9 +++++++ wafl/frontend/wafl.css | 1 + wafl/runners/routes.py | 6 +++++ wafl/runners/run_web_interface.py | 2 +- wafl/scheduler/web_handler.py | 16 ++++++++++++ .../scheduler/web_interface_implementation.py | 2 +- 9 files changed, 91 insertions(+), 4 deletions(-) create mode 100644 wafl/connectors/clients/information_client.py diff --git a/todo.txt b/todo.txt index 87393549..7005bf64 100644 --- a/todo.txt +++ b/todo.txt @@ -1,7 +1,20 @@ -None of the knowledge is 
loaded from the web interface. Why? -- you have just changed the load_knowledge function to make it async. +* add control over which llm to use from the frontend + - add list of models in the backend +* interruptible speech +* dependabot!!! +* use poetry +* add pdf to indexing +* add jason to indexing +* add metadata to indexing items + + +/make backend it run with ollama as well (no too slow) + + +/None of the knowledge is loaded from the web interface. Why? +/- you have just changed the load_knowledge function to make it async. wafl: diff --git a/wafl/connectors/clients/information_client.py b/wafl/connectors/clients/information_client.py new file mode 100644 index 00000000..772afb00 --- /dev/null +++ b/wafl/connectors/clients/information_client.py @@ -0,0 +1,17 @@ +import os +import textwrap +from typing import List + +from wafl.connectors.factories.llm_connector_factory import LLMConnectorFactory +from wafl.connectors.prompt_template import PromptTemplate + +_path = os.path.dirname(__file__) + + +class InformationClient: + def __init__(self, config): + self._connector = LLMConnectorFactory.get_connector(config) + self._config = config + + async def get_information(self) -> str: + return await self._connector.get_information() diff --git a/wafl/connectors/remote/remote_llm_connector.py b/wafl/connectors/remote/remote_llm_connector.py index ae3403af..3ad7f67c 100644 --- a/wafl/connectors/remote/remote_llm_connector.py +++ b/wafl/connectors/remote/remote_llm_connector.py @@ -103,3 +103,28 @@ async def check_connection(self): print() return False + + async def get_information(self): + payload = { + "data": { + "system_prompt": "Hello!", + "conversation": [{"speaker": "user", "text": "Hi!"}], + }, + "temperature": 0.6, + "num_tokens": 1, + "last_strings": self._important_strings, + "num_replicas": self._num_replicas, + } + async with aiohttp.ClientSession( + conn_timeout=6000, + connector=aiohttp.TCPConnector(ssl=False), + ) as session: + async with 
session.post(self._server_url, json=payload) as response: + answer = json.loads(await response.text()) + status = answer["status"] + if status != "success": + raise RuntimeError(f"Error in prediction: {answer}") + return { + "model_name": answer["model"], + "backend_version": answer["version"], + } diff --git a/wafl/frontend/index.html b/wafl/frontend/index.html index f2ba40e9..6be65d64 100644 --- a/wafl/frontend/index.html +++ b/wafl/frontend/index.html @@ -70,6 +70,15 @@ +
  • + + + + + +
  • diff --git a/wafl/frontend/wafl.css b/wafl/frontend/wafl.css index 7c6c4959..a8d4c93a 100644 --- a/wafl/frontend/wafl.css +++ b/wafl/frontend/wafl.css @@ -73,6 +73,7 @@ pre .logs { .dialogue-row-bot { font-family: monospace; font-size: 30px; + font-size: 30px; color: gray; margin-top: 20px; margin-bottom: 15px; diff --git a/wafl/runners/routes.py b/wafl/runners/routes.py index 78b182c0..3b22a0d4 100644 --- a/wafl/runners/routes.py +++ b/wafl/runners/routes.py @@ -78,3 +78,9 @@ def add_new_rules(app: Flask, conversation_id: int, web_server_loop: "WebLoop"): web_server_loop.toggle_logs, methods=["POST"], ) + app.add_url_rule( + f"/{conversation_id}/get_info", + f"get_info_{conversation_id}", + web_server_loop.get_info, + methods=["POST"], + ) diff --git a/wafl/runners/run_web_interface.py b/wafl/runners/run_web_interface.py index bb75ddc9..66191607 100644 --- a/wafl/runners/run_web_interface.py +++ b/wafl/runners/run_web_interface.py @@ -51,7 +51,7 @@ def create_scheduler_and_webserver_loop(conversation_id): deactivate_on_closed_conversation=False, ) asyncio.run(interface.output("Hello. 
How may I help you?")) - web_loop = WebHandler(interface, conversation_id, conversation_events) + web_loop = WebHandler(interface, config, conversation_id, conversation_events) return { "scheduler": Scheduler([conversation_loop, web_loop]), "web_server_loop": web_loop, diff --git a/wafl/scheduler/web_handler.py b/wafl/scheduler/web_handler.py index 064a6d83..e69568d8 100644 --- a/wafl/scheduler/web_handler.py +++ b/wafl/scheduler/web_handler.py @@ -2,6 +2,9 @@ import os from flask import render_template, request, jsonify + +from wafl.config import Configuration +from wafl.connectors.clients.information_client import InformationClient from wafl.interface.base_interface import BaseInterface from wafl.logger.history_logger import HistoryLogger from wafl.scheduler.messages_creator import MessagesCreator @@ -13,6 +16,7 @@ class WebHandler: def __init__( self, interface: BaseInterface, + config: Configuration, conversation_id: int, conversation_events: "ConversationEvents", ): @@ -22,6 +26,7 @@ def __init__( self._conversation_events = conversation_events self._prior_dialogue_items = "" self._messages_creator = MessagesCreator(self._interface) + self._information_client = InformationClient(config) async def index(self): return render_template("index.html", conversation_id=self._conversation_id) @@ -96,6 +101,17 @@ async def toggle_logs(self): self._messages_creator.toggle_logs() return jsonify("") + async def get_info(self): + info = await self._information_client.get_information() + return f""" +
    {info}
    + """ + async def run(self): print(f"New web server instance {self._conversation_id} running!") return diff --git a/wafl/scheduler/web_interface_implementation.py b/wafl/scheduler/web_interface_implementation.py index bec8e520..4d0e2274 100644 --- a/wafl/scheduler/web_interface_implementation.py +++ b/wafl/scheduler/web_interface_implementation.py @@ -7,7 +7,7 @@ def _change_code_wrapper(text): def replace_code(match): code = match.group(1) - return f'' + return f'
    {code}
    ' return re.sub(pattern, replace_code, text, flags=re.DOTALL) From 0bb1133c4f041eeba9851a8828a8294ce477a80d Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Fri, 5 Jul 2024 17:46:44 +0100 Subject: [PATCH 13/21] updated interface (div/input alignment issue) --- wafl/frontend/index.html | 2 +- wafl/frontend/wafl.css | 10 +++++----- wafl/scheduler/web_handler.py | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/wafl/frontend/index.html b/wafl/frontend/index.html index 6be65d64..04d3a5b7 100644 --- a/wafl/frontend/index.html +++ b/wafl/frontend/index.html @@ -86,7 +86,7 @@
         
    Date: Fri, 5 Jul 2024 18:00:13 +0100 Subject: [PATCH 14/21] changed the templates for wafl init --- wafl/templates/rules.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wafl/templates/rules.yaml b/wafl/templates/rules.yaml index fed65b08..10284e69 100644 --- a/wafl/templates/rules.yaml +++ b/wafl/templates/rules.yaml @@ -56,10 +56,10 @@ rules: - say that you are just improvising the answer - say what you think answer the question - - the user thanks the bot: + - the user says "thank you": - The intention of the user is to close the conversation - You must answer the user by writing "close_conversation()" - the user says "end this conversation": - The intention of the user is to close the conversation - - You must answer the user by writing "close_conversation()" + - You must answer the user by writing "close_conversation()" \ No newline at end of file From fc4b72bdffebb053b5548618e76111c0af5e38dc Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Fri, 5 Jul 2024 18:01:32 +0100 Subject: [PATCH 15/21] updated the audio and web runner --- wafl/runners/run_web_and_audio_interface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wafl/runners/run_web_and_audio_interface.py b/wafl/runners/run_web_and_audio_interface.py index ed01a6e8..1c7779b7 100644 --- a/wafl/runners/run_web_and_audio_interface.py +++ b/wafl/runners/run_web_and_audio_interface.py @@ -48,7 +48,7 @@ def create_scheduler_and_webserver_loop(conversation_id): _logger, activation_word=config.get_value("waking_up_word"), ) - web_loop = WebHandler(interface, conversation_id, conversation_events) + web_loop = WebHandler(interface, config, conversation_id, conversation_events) return { "scheduler": Scheduler([conversation_loop, web_loop]), "web_server_loop": web_loop, From 4a306f0f464e37007d899a27035fd793dc9b9f2d Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Fri, 5 Jul 2024 18:24:28 +0100 Subject: [PATCH 16/21] added infobox --- wafl/frontend/index.html | 
3 ++- wafl/scheduler/web_handler.py | 26 ++++++++++++++++++++------ 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/wafl/frontend/index.html b/wafl/frontend/index.html index 04d3a5b7..76b39d7a 100644 --- a/wafl/frontend/index.html +++ b/wafl/frontend/index.html @@ -44,7 +44,6 @@ hx-post="/{{conversation_id}}/toggle_logs" hx-swap="none" class="flex items-center p-2 rounded-lg text-white hover:bg-gray-700 group"> - diff --git a/wafl/scheduler/web_handler.py b/wafl/scheduler/web_handler.py index 043d336d..ca42c1e9 100644 --- a/wafl/scheduler/web_handler.py +++ b/wafl/scheduler/web_handler.py @@ -103,13 +103,27 @@ async def toggle_logs(self): async def get_info(self): info = await self._information_client.get_information() + is_clicked = request.form.get("clicked") + is_clicked = "false" if is_clicked == "true" else "true" + infobox = "" + if is_clicked == "true": + infobox = f""" +
    +
    Model name: {info['model_name']}
    +
    Backend version: {info['backend_version']}
    +
    + """ return f""" -
    {info}
    + + + + + {infobox} + """ async def run(self): From 9302f8e80f01b579f3183531d274d98a9b793f84 Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Fri, 5 Jul 2024 18:26:36 +0100 Subject: [PATCH 17/21] moved handlers to a new folder --- setup.py | 1 + wafl/handlers/__init__.py | 0 wafl/{scheduler => handlers}/conversation_handler.py | 0 wafl/{scheduler => handlers}/generated_event_handler.py | 0 wafl/{scheduler => handlers}/web_handler.py | 0 wafl/{scheduler => handlers}/web_interface_implementation.py | 0 wafl/runners/run_from_audio.py | 2 +- wafl/runners/run_web_and_audio_interface.py | 4 ++-- wafl/runners/run_web_interface.py | 4 ++-- wafl/scheduler/messages_creator.py | 2 +- 10 files changed, 7 insertions(+), 6 deletions(-) create mode 100644 wafl/handlers/__init__.py rename wafl/{scheduler => handlers}/conversation_handler.py (100%) rename wafl/{scheduler => handlers}/generated_event_handler.py (100%) rename wafl/{scheduler => handlers}/web_handler.py (100%) rename wafl/{scheduler => handlers}/web_interface_implementation.py (100%) diff --git a/setup.py b/setup.py index 2c9477a3..7c97183a 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ "wafl.connectors.remote", "wafl.events", "wafl.extractors", + "wafl.handlers", "wafl.inference", "wafl.interface", "wafl.knowledge", diff --git a/wafl/handlers/__init__.py b/wafl/handlers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wafl/scheduler/conversation_handler.py b/wafl/handlers/conversation_handler.py similarity index 100% rename from wafl/scheduler/conversation_handler.py rename to wafl/handlers/conversation_handler.py diff --git a/wafl/scheduler/generated_event_handler.py b/wafl/handlers/generated_event_handler.py similarity index 100% rename from wafl/scheduler/generated_event_handler.py rename to wafl/handlers/generated_event_handler.py diff --git a/wafl/scheduler/web_handler.py b/wafl/handlers/web_handler.py similarity index 100% rename from wafl/scheduler/web_handler.py rename to 
wafl/handlers/web_handler.py diff --git a/wafl/scheduler/web_interface_implementation.py b/wafl/handlers/web_interface_implementation.py similarity index 100% rename from wafl/scheduler/web_interface_implementation.py rename to wafl/handlers/web_interface_implementation.py diff --git a/wafl/runners/run_from_audio.py b/wafl/runners/run_from_audio.py index c8889c51..7b523687 100644 --- a/wafl/runners/run_from_audio.py +++ b/wafl/runners/run_from_audio.py @@ -2,7 +2,7 @@ from wafl.events.conversation_events import ConversationEvents from wafl.interface.voice_interface import VoiceInterface from wafl.logger.local_file_logger import LocalFileLogger -from wafl.scheduler.conversation_handler import ConversationHandler +from wafl.handlers.conversation_handler import ConversationHandler from wafl.scheduler.scheduler import Scheduler _logger = LocalFileLogger() diff --git a/wafl/runners/run_web_and_audio_interface.py b/wafl/runners/run_web_and_audio_interface.py index 1c7779b7..4d0ee5d8 100644 --- a/wafl/runners/run_web_and_audio_interface.py +++ b/wafl/runners/run_web_and_audio_interface.py @@ -7,13 +7,13 @@ from wafl.interface.list_interface import ListInterface from wafl.interface.voice_interface import VoiceInterface from wafl.scheduler.scheduler import Scheduler -from wafl.scheduler.conversation_handler import ConversationHandler +from wafl.handlers.conversation_handler import ConversationHandler from wafl.logger.local_file_logger import LocalFileLogger from wafl.events.conversation_events import ConversationEvents from wafl.interface.queue_interface import QueueInterface from wafl.config import Configuration from wafl.runners.routes import get_app, add_new_rules -from wafl.scheduler.web_handler import WebHandler +from wafl.handlers.web_handler import WebHandler app = get_app() _logger = LocalFileLogger() diff --git a/wafl/runners/run_web_interface.py b/wafl/runners/run_web_interface.py index 66191607..089498b3 100644 --- a/wafl/runners/run_web_interface.py +++ 
b/wafl/runners/run_web_interface.py @@ -6,8 +6,8 @@ from flask import render_template, redirect, url_for from wafl.scheduler.scheduler import Scheduler -from wafl.scheduler.web_handler import WebHandler -from wafl.scheduler.conversation_handler import ConversationHandler +from wafl.handlers.web_handler import WebHandler +from wafl.handlers.conversation_handler import ConversationHandler from wafl.logger.local_file_logger import LocalFileLogger from wafl.events.conversation_events import ConversationEvents from wafl.interface.queue_interface import QueueInterface diff --git a/wafl/scheduler/messages_creator.py b/wafl/scheduler/messages_creator.py index 04e1a6c2..9fa7e217 100644 --- a/wafl/scheduler/messages_creator.py +++ b/wafl/scheduler/messages_creator.py @@ -1,4 +1,4 @@ -from wafl.scheduler.web_interface_implementation import get_html_from_dialogue_item +from wafl.handlers.web_interface_implementation import get_html_from_dialogue_item class MessagesCreator: From 792a7f79cd7f24b0c4639cd0134bd545bf96c230 Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sat, 6 Jul 2024 12:56:54 +0100 Subject: [PATCH 18/21] added metadata to retrieval and pdf files indexing (if there's text inside the texts) --- requirements.txt | 2 ++ setup.py | 1 + tests/test_indexing.py | 10 +++++----- wafl/answerer/answerer_implementation.py | 12 ++++++++++-- wafl/answerer/dialogue_answerer.py | 5 ++--- wafl/answerer/rule_maker.py | 7 +++---- wafl/command_line.py | 4 +++- wafl/connectors/remote/remote_llm_connector.py | 4 ++-- wafl/dataclasses/__init__.py | 0 wafl/{extractors => dataclasses}/dataclasses.py | 0 wafl/{ => dataclasses}/facts.py | 1 + wafl/{ => dataclasses}/rules.py | 0 wafl/events/utils.py | 3 --- wafl/inference/utils.py | 2 +- wafl/knowledge/indexing_implementation.py | 4 ++-- wafl/knowledge/single_file_knowledge.py | 16 ++++++++++------ wafl/knowledge/utils.py | 2 -- wafl/parsing/line_rules_parser.py | 4 ++-- wafl/parsing/rules_parser.py | 4 ++-- wafl/readers/base_reader.py | 
7 ++++--- wafl/readers/pdf_reader.py | 16 +++++++++++++--- wafl/readers/reader_factory.py | 9 ++++----- wafl/readers/text_reader.py | 15 +++++++++++---- 23 files changed, 78 insertions(+), 50 deletions(-) create mode 100644 wafl/dataclasses/__init__.py rename wafl/{extractors => dataclasses}/dataclasses.py (100%) rename wafl/{ => dataclasses}/facts.py (88%) rename wafl/{ => dataclasses}/rules.py (100%) diff --git a/requirements.txt b/requirements.txt index c01039f9..c42a2dc9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,3 +19,5 @@ einops==0.6.1 g2p-en==2.1.0 pyyaml==6.0.1 joblib==1.4.2 +pymupdf==1.24.7 + diff --git a/setup.py b/setup.py index 7c97183a..ac883e04 100644 --- a/setup.py +++ b/setup.py @@ -63,6 +63,7 @@ "g2p-en==2.1.0", "pyyaml==6.0.1", "joblib==1.4.2", + "pymupdf==1.24.7", ], classifiers=[ "License :: OSI Approved :: MIT License", diff --git a/tests/test_indexing.py b/tests/test_indexing.py index d6a3198c..f214ce2c 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -5,13 +5,12 @@ from unittest import TestCase from wafl.config import Configuration -from wafl.extractors.dataclasses import Query +from wafl.dataclasses.dataclasses import Query from wafl.knowledge.indexing_implementation import add_to_index, load_knowledge _path = os.path.dirname(__file__) - class TestIndexing(TestCase): def test__path_can_be_added_to_index(self): data = _load_index() @@ -29,12 +28,13 @@ def test__path_can_be_added_to_index(self): def test__indexed_files_can_be_retrieved(self): config = Configuration.load_local_config() knowledge = asyncio.run(load_knowledge(config)) - results = asyncio.run(knowledge.ask_for_facts(Query.create_from_text("How do I start WAFL"))) + results = asyncio.run( + knowledge.ask_for_facts(Query.create_from_text("How do I start WAFL")) + ) expected = "WAFL" self.assertIn(expected, results[0].text) - def _load_index(): with open("indices.yaml", "r") as file: - return yaml.safe_load(file.read()) \ No newline at end of file 
+ return yaml.safe_load(file.read()) diff --git a/wafl/answerer/answerer_implementation.py b/wafl/answerer/answerer_implementation.py index 3a7b99a0..e83c5306 100644 --- a/wafl/answerer/answerer_implementation.py +++ b/wafl/answerer/answerer_implementation.py @@ -4,7 +4,7 @@ from typing import List, Tuple from wafl.exceptions import CloseConversation -from wafl.facts import Fact +from wafl.dataclasses.facts import Fact from wafl.interface.conversation import Conversation, Utterance @@ -116,7 +116,15 @@ async def _run_code(to_execute: str, module, functions) -> str: def get_text_from_facts_and_thresholds( facts_and_thresholds: List[Tuple[Fact, float]], memory: str ) -> List[str]: - return [item[0].text for item in facts_and_thresholds if item[0].text not in memory] + text_list = [] + for item in facts_and_thresholds: + if item[0].text not in memory: + text = item[0].text + if item[0].metadata: + text = f"Metadata for the following text: {str(item[0].metadata)}" + "\n" + text + text_list.append(text) + + return text_list def add_dummy_utterances_to_continue_generation( diff --git a/wafl/answerer/dialogue_answerer.py b/wafl/answerer/dialogue_answerer.py index cbaf2451..53b8356a 100644 --- a/wafl/answerer/dialogue_answerer.py +++ b/wafl/answerer/dialogue_answerer.py @@ -2,7 +2,6 @@ from inspect import getmembers, isfunction from typing import List, Tuple from wafl.answerer.answerer_implementation import ( - is_executable, substitute_memory_in_answer_and_get_memories_if_present, create_one_liner, get_text_from_facts_and_thresholds, @@ -13,7 +12,7 @@ from wafl.answerer.base_answerer import BaseAnswerer from wafl.answerer.rule_maker import RuleMaker from wafl.connectors.clients.llm_chitchat_answer_client import LLMChitChatAnswerClient -from wafl.extractors.dataclasses import Query, Answer +from wafl.dataclasses.dataclasses import Query, Answer from wafl.interface.conversation import Conversation from wafl.simple_text_processing.questions import is_question @@ -115,7 
+114,7 @@ async def _get_relevant_rules(self, conversation: Conversation) -> List[str]: for rule in rules: if rule not in self._prior_rules: self._prior_rules.insert(0, rule) - self._prior_rules = self._prior_rules[:self._max_num_past_utterances_for_rules] + self._prior_rules = self._prior_rules[: self._max_num_past_utterances_for_rules] return self._prior_rules def _init_python_module(self, module_name): diff --git a/wafl/answerer/rule_maker.py b/wafl/answerer/rule_maker.py index 1905ba18..115dfcfc 100644 --- a/wafl/answerer/rule_maker.py +++ b/wafl/answerer/rule_maker.py @@ -1,7 +1,7 @@ from typing import List -from wafl.extractors.dataclasses import Query -from wafl.rules import Rule +from wafl.dataclasses.dataclasses import Query +from wafl.dataclasses.rules import Rule class RuleMaker: @@ -27,8 +27,7 @@ async def create_from_query(self, conversation: "Conversation") -> List[str]: rules_texts = [] for rule in rules: rules_text = rule.get_string_using_template( - "- {effect}:\n{clauses}\n" - + rule.indent_str + "- {effect}:\n{clauses}\n" + rule.indent_str ) rules_texts.append(rules_text) await self._interface.add_fact(f"The bot remembers the rule:\n{rules_text}") diff --git a/wafl/command_line.py b/wafl/command_line.py index b0d8c52a..f62812f5 100644 --- a/wafl/command_line.py +++ b/wafl/command_line.py @@ -21,7 +21,9 @@ def print_help(): print("These are the available commands:") print("> wafl init: Initialize the current folder") print("> wafl add : Add the file or folder at to the index") - print("> wafl run: Starts the chatbot on the web interface and the audio interface.") + print( + "> wafl run: Starts the chatbot on the web interface and the audio interface." 
+ ) print("> wafl run-cli: Run a cli version of the chatbot") print("> wafl run-audio: Run a voice-powered version of the chatbot") print("> wafl run-server: Run a webserver version of the chatbot") diff --git a/wafl/connectors/remote/remote_llm_connector.py b/wafl/connectors/remote/remote_llm_connector.py index 3ad7f67c..a2b3d7f7 100644 --- a/wafl/connectors/remote/remote_llm_connector.py +++ b/wafl/connectors/remote/remote_llm_connector.py @@ -116,8 +116,8 @@ async def get_information(self): "num_replicas": self._num_replicas, } async with aiohttp.ClientSession( - conn_timeout=6000, - connector=aiohttp.TCPConnector(ssl=False), + conn_timeout=6000, + connector=aiohttp.TCPConnector(ssl=False), ) as session: async with session.post(self._server_url, json=payload) as response: answer = json.loads(await response.text()) diff --git a/wafl/dataclasses/__init__.py b/wafl/dataclasses/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/wafl/extractors/dataclasses.py b/wafl/dataclasses/dataclasses.py similarity index 100% rename from wafl/extractors/dataclasses.py rename to wafl/dataclasses/dataclasses.py diff --git a/wafl/facts.py b/wafl/dataclasses/facts.py similarity index 88% rename from wafl/facts.py rename to wafl/dataclasses/facts.py index c2a3eec2..0445adff 100644 --- a/wafl/facts.py +++ b/wafl/dataclasses/facts.py @@ -10,6 +10,7 @@ class Fact: is_interruption: bool = False source: str = None destination: str = None + metadata: Union[str, dict] = None def toJSON(self): return str(self) diff --git a/wafl/rules.py b/wafl/dataclasses/rules.py similarity index 100% rename from wafl/rules.py rename to wafl/dataclasses/rules.py diff --git a/wafl/events/utils.py b/wafl/events/utils.py index 4ca97f0e..dc99d3bc 100644 --- a/wafl/events/utils.py +++ b/wafl/events/utils.py @@ -15,6 +15,3 @@ def input_is_valid(text): def remove_text_between_brackets(text: str) -> str: return re.sub(r"(\[.*?\])", "", text) - - - diff --git a/wafl/inference/utils.py 
b/wafl/inference/utils.py index 0004ab92..2f25bda1 100644 --- a/wafl/inference/utils.py +++ b/wafl/inference/utils.py @@ -2,7 +2,7 @@ from typing import List, Dict, Tuple, Any from fuzzywuzzy import process -from wafl.extractors.dataclasses import Answer +from wafl.dataclasses.dataclasses import Answer from wafl.simple_text_processing.normalize import normalized from wafl.simple_text_processing.questions import is_question diff --git a/wafl/knowledge/indexing_implementation.py b/wafl/knowledge/indexing_implementation.py index 56904290..c8fe9be1 100644 --- a/wafl/knowledge/indexing_implementation.py +++ b/wafl/knowledge/indexing_implementation.py @@ -15,7 +15,7 @@ async def _add_indices_to_knowledge(knowledge, text): for file in files: reader = ReaderFactory.get_reader(file) for chunk in reader.get_chunks(os.path.join(root, file)): - await knowledge.add(chunk) + await knowledge.add_fact(chunk) return knowledge @@ -59,4 +59,4 @@ def add_to_index(path): indices["paths"].append(path) with open(index_filename, "w") as file: - yaml.dump(indices, file) \ No newline at end of file + yaml.dump(indices, file) diff --git a/wafl/knowledge/single_file_knowledge.py b/wafl/knowledge/single_file_knowledge.py index a9741b0f..8c9c5a25 100644 --- a/wafl/knowledge/single_file_knowledge.py +++ b/wafl/knowledge/single_file_knowledge.py @@ -6,7 +6,7 @@ import nltk from wafl.config import Configuration -from wafl.facts import Fact +from wafl.dataclasses.facts import Fact from wafl.knowledge.base_knowledge import BaseKnowledge from wafl.knowledge.utils import ( text_is_exact_string, @@ -67,14 +67,18 @@ def __init__(self, config, rules_text=None, logger=None): if not loop or not loop.is_running(): asyncio.run(self.initialize_retrievers()) - async def add(self, text): - fact_index = f"F{len(self._facts_dict)}" - self._facts_dict[fact_index] = Fact(text=text) + async def add(self, text: str): + await self.add_fact(Fact(text=text)) + + async def add_fact(self, fact: Fact): + index = 
str(len(self._facts_dict)) + index = f"F{index}" + self._facts_dict[index] = fact await self._facts_retriever.add_text_and_index( - clean_text_for_retrieval(text), fact_index + clean_text_for_retrieval(fact.text), index=index ) await self._facts_retriever_for_questions.add_text_and_index( - clean_text_for_retrieval(text), fact_index + clean_text_for_retrieval(fact.text), index=index ) async def add_rule(self, rule_text): diff --git a/wafl/knowledge/utils.py b/wafl/knowledge/utils.py index 45768f1d..75e52fe7 100644 --- a/wafl/knowledge/utils.py +++ b/wafl/knowledge/utils.py @@ -69,5 +69,3 @@ def needs_substitutions(effect): return True return False - - diff --git a/wafl/parsing/line_rules_parser.py b/wafl/parsing/line_rules_parser.py index 8dbdb77d..73371f3e 100644 --- a/wafl/parsing/line_rules_parser.py +++ b/wafl/parsing/line_rules_parser.py @@ -1,6 +1,6 @@ from wafl.simple_text_processing.questions import is_question -from wafl.facts import Fact -from wafl.rules import Rule +from wafl.dataclasses.facts import Fact +from wafl.dataclasses.rules import Rule def parse_rule_from_single_line(text): diff --git a/wafl/parsing/rules_parser.py b/wafl/parsing/rules_parser.py index ceb9fc57..70d3b5f1 100644 --- a/wafl/parsing/rules_parser.py +++ b/wafl/parsing/rules_parser.py @@ -1,7 +1,7 @@ import yaml -from wafl.facts import Fact -from wafl.rules import Rule +from wafl.dataclasses.facts import Fact +from wafl.dataclasses.rules import Rule from wafl.simple_text_processing.deixis import from_user_to_bot diff --git a/wafl/readers/base_reader.py b/wafl/readers/base_reader.py index 0d93cbe9..ea995601 100644 --- a/wafl/readers/base_reader.py +++ b/wafl/readers/base_reader.py @@ -1,16 +1,17 @@ from typing import List +from wafl.dataclasses.facts import Fact + class BaseReader: def read(self, file_path: str) -> str: raise NotImplementedError() - def get_chunks(self, text: str) -> List[str]: + def get_chunks(self, filename: str) -> List[Fact]: raise NotImplementedError() def 
_chunk_text(self, text: str, size: int, overlap: int) -> List[str]: chunks = [] for i in range(0, len(text), size - overlap): - chunks.append(text[i:i + size]) + chunks.append(text[i : i + size]) return chunks - diff --git a/wafl/readers/pdf_reader.py b/wafl/readers/pdf_reader.py index 03b28a4d..4f610616 100644 --- a/wafl/readers/pdf_reader.py +++ b/wafl/readers/pdf_reader.py @@ -1,5 +1,8 @@ -from logging import getLogger +import pymupdf +from logging import getLogger +from typing import List +from wafl.dataclasses.facts import Fact from wafl.readers.base_reader import BaseReader _logger = getLogger(__name__) @@ -10,6 +13,13 @@ def __init__(self, chunk_size: int, overlap: int): self.chunk_size = chunk_size self.overlap = overlap - def get_chunks(self, filename): + def get_chunks(self, filename: str) -> List[Fact]: _logger.info(f"Reading PDF file: {filename}") - + with pymupdf.open(filename) as doc: + return [ + Fact( + text=page.get_text(), + metadata={"filename": filename, "page_number": i}, + ) + for i, page in enumerate(doc) + ] diff --git a/wafl/readers/reader_factory.py b/wafl/readers/reader_factory.py index 6a41a783..fbbbcd3b 100644 --- a/wafl/readers/reader_factory.py +++ b/wafl/readers/reader_factory.py @@ -5,10 +5,7 @@ class ReaderFactory: _chunk_size = 1000 _overlap = 100 - _extension_to_reader_dict = { - ".pdf": PdfReader, - ".txt": TextReader - } + _extension_to_reader_dict = {".pdf": PdfReader, ".txt": TextReader} @staticmethod def get_reader(filename): @@ -16,5 +13,7 @@ def get_reader(filename): if extension in filename.lower(): return reader(ReaderFactory._chunk_size, ReaderFactory._overlap) - return TextReader(ReaderFactory._chunk_size, ReaderFactory._overlap) + ### add pdf reader + ### add metadata and show in the UI + return TextReader(ReaderFactory._chunk_size, ReaderFactory._overlap) diff --git a/wafl/readers/text_reader.py b/wafl/readers/text_reader.py index a6924437..b22c4ffe 100644 --- a/wafl/readers/text_reader.py +++ 
b/wafl/readers/text_reader.py @@ -1,5 +1,7 @@ from logging import getLogger +from typing import List +from wafl.dataclasses.facts import Fact from wafl.readers.base_reader import BaseReader _logger = getLogger(__name__) @@ -10,9 +12,14 @@ def __init__(self, chunk_size: int, overlap: int): self.chunk_size = chunk_size self.overlap = overlap - def get_chunks(self, filename): + def get_chunks(self, filename: str) -> List[Fact]: _logger.info(f"Reading text file: {filename}") with open(filename, "r") as file: - return self._chunk_text(file.read(), self.chunk_size, self.overlap) - - + chunks = self._chunk_text(file.read(), self.chunk_size, self.overlap) + return [ + Fact( + text=chunk, + metadata={"filename": filename, "chunk_number": i}, + ) + for i, chunk in enumerate(chunks) + ] From 134bcd3e6a594e8e6540624dbf22154881d34342 Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sun, 7 Jul 2024 12:42:39 +0100 Subject: [PATCH 19/21] Added metadata and pdf files indexing. --- tests/test_indexing.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_indexing.py b/tests/test_indexing.py index f214ce2c..1e64b4a4 100644 --- a/tests/test_indexing.py +++ b/tests/test_indexing.py @@ -34,6 +34,15 @@ def test__indexed_files_can_be_retrieved(self): expected = "WAFL" self.assertIn(expected, results[0].text) + def test__pdf_can_be_read(self): + config = Configuration.load_local_config() + knowledge = asyncio.run(load_knowledge(config)) + results = asyncio.run( + knowledge.ask_for_facts(Query.create_from_text("What color is the sky?")) + ) + expected = "green" + self.assertIn(expected, results[0].text) + def _load_index(): with open("indices.yaml", "r") as file: From 908bce1561e932601d39223458a855f9f83c5e7b Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sun, 7 Jul 2024 17:28:58 +0100 Subject: [PATCH 20/21] updated version] --- todo.txt | 12 +++++++----- wafl/variables.py | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/todo.txt b/todo.txt 
index 7005bf64..08cd6cb8 100644 --- a/todo.txt +++ b/todo.txt @@ -1,13 +1,15 @@ -* add control over which llm to use from the frontend - - add list of models in the backend +/* add control over which llm to use from the frontend +/ - add list of models in the backend * interruptible speech +* add option so use llama.cpp from wafl_llm +* add option to have None as a model setting in wafl_llm * dependabot!!! * use poetry -* add pdf to indexing -* add jason to indexing -* add metadata to indexing items +/* add pdf to indexing +* add json to indexing +/* add metadata to indexing items /make backend it run with ollama as well (no too slow) diff --git a/wafl/variables.py b/wafl/variables.py index 27f3aa5c..49accbab 100644 --- a/wafl/variables.py +++ b/wafl/variables.py @@ -1,9 +1,9 @@ def get_variables(): return { - "version": "0.0.90", + "version": "0.0.91", } def is_supported(wafl_llm_version): - supported_versions = ["0.0.90"] + supported_versions = ["0.0.91"] return wafl_llm_version in supported_versions From 47b58935386e060b3af2f21599042ea892e24aa0 Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Wed, 10 Jul 2024 10:47:21 +0100 Subject: [PATCH 21/21] added prompt file + docs --- documentation/source/configuration.rst | 12 +++++++++- documentation/source/index.rst | 1 + documentation/source/modify_the_prompt.rst | 23 +++++++++++++++++++ tests/config.json | 3 ++- tests/main.prompt | 11 +++++++++ todo.txt | 19 +++++++++++---- wafl/answerer/dialogue_answerer.py | 4 ++-- ...at_answer_client.py => llm_chat_client.py} | 19 ++++----------- wafl/knowledge/indexing_implementation.py | 6 ++--- wafl/templates/config.json | 3 ++- wafl/templates/main.prompt | 11 +++++++++ 11 files changed, 84 insertions(+), 28 deletions(-) create mode 100644 documentation/source/modify_the_prompt.rst create mode 100644 tests/main.prompt rename wafl/connectors/clients/{llm_chitchat_answer_client.py => llm_chat_client.py} (60%) create mode 100644 wafl/templates/main.prompt diff --git 
a/documentation/source/configuration.rst b/documentation/source/configuration.rst index a1722f00..f67dfbe7 100644 --- a/documentation/source/configuration.rst +++ b/documentation/source/configuration.rst @@ -12,8 +12,11 @@ A typical configuration file looks like this: "waking_up_sound": true, "deactivate_sound": true, "rules": "rules.yaml", + "index": "indices.yaml", + "cache_filename": "knowledge_cache", + "prompt_filename": "main.prompt", "functions": "functions.py", - "frontend_port": 8081, + "max_recursion": 2, "llm_model": { "model_host": "localhost", "model_port": 8080, @@ -37,6 +40,7 @@ A typical configuration file looks like this: } + These settings regulate the following: * "waking_up_word" is the name of the bot, used to wake up the system in the "run-audio" mode. @@ -45,6 +49,12 @@ These settings regulate the following: * "rules" is the file containing the facts and rules that guide the chatbot. The default is "rules.yaml". + * "index" is the file containing the path to the files to index. The default is "indices.yaml". + + * "cache_filename" is the file where the indexed knowledge is cached. The default is "knowledge_cache". + + * "prompt_filename" is the file containing the main prompt for the chatbot. The default is "main.prompt". + * "functions" is the file containing the functions that can be used in the rules. The default is "functions.py". * "frontend_port" is the port where the web frontend is running. The default is 8090. diff --git a/documentation/source/index.rst b/documentation/source/index.rst index 30c4f872..e4eb8f8f 100644 --- a/documentation/source/index.rst +++ b/documentation/source/index.rst @@ -16,6 +16,7 @@ Welcome to WAFL's 0.0.90 documentation! 
configuration running_WAFL facts_and_rules + modify_the_prompt examples testcases actions diff --git a/documentation/source/modify_the_prompt.rst b/documentation/source/modify_the_prompt.rst new file mode 100644 index 00000000..cba1761d --- /dev/null +++ b/documentation/source/modify_the_prompt.rst @@ -0,0 +1,24 @@ +Modify the original prompt +========================== + +The prompt is stored in the file "main.prompt" in the project's root directory. +The name of the file can be changed in the `config.json` file. +The default is: + + +.. code-block:: text + + A user is chatting with a bot. The chat is happening through a web interface. The user is typing the messages and the bot is replying. + + This is summary of the bot's knowledge: + {facts} + + The rules that *must* be followed are: + {rules} + + Create a plausible dialogue based on the aforementioned summary and rules. + Do not repeat yourself. Be friendly but not too servile. + Follow the rules if present and they apply to the dialogue. Do not improvise if rules are present. + + +The variables `{facts}` and `{rules}` are replaced by the actual facts and rules when the prompt is generated. diff --git a/tests/config.json b/tests/config.json index 3589a0ff..121d77f0 100644 --- a/tests/config.json +++ b/tests/config.json @@ -4,7 +4,8 @@ "deactivate_sound": true, "rules": "rules.yaml", "index": "indices.yaml", - "index_filename": "knowledge_cache", + "cache_filename": "knowledge_cache", + "prompt_filename": "main.prompt", "functions": "functions.py", "max_recursion": 2, "llm_model": { diff --git a/tests/main.prompt b/tests/main.prompt new file mode 100644 index 00000000..07b45290 --- /dev/null +++ b/tests/main.prompt @@ -0,0 +1,11 @@ +A user is chatting with a bot. The chat is happening through a web interface. The user is typing the messages and the bot is replying.
+ +This is summary of the bot's knowledge: +{facts} + +The rules that *must* be followed are: +{rules} + +Create a plausible dialogue based on the aforementioned summary and rules. +Do not repeat yourself. Be friendly but not too servile. +Follow the rules if present and they apply to the dialogue. Do not improvise if rules are present. \ No newline at end of file diff --git a/todo.txt b/todo.txt index 08cd6cb8..020aaea4 100644 --- a/todo.txt +++ b/todo.txt @@ -1,12 +1,21 @@ +* interruptible speech +* dependabot!!! +* use poetry + +PharazonE +* upload to hetzner and make it work for some retrieval tasks +* develop more rules + use-cases for voice and other + + /* add control over which llm to use from the frontend / - add list of models in the backend -* interruptible speech -* add option so use llama.cpp from wafl_llm -* add option to have None as a model setting in wafl_llm +/* add quantization of llm to wafl_llm config +/* write docs about it on wafl + +/* add option so use llama.cpp from wafl_llm +/* add option to have None as a model setting in wafl_llm -* dependabot!!! 
-* use poetry /* add pdf to indexing * add json to indexing /* add metadata to indexing items diff --git a/wafl/answerer/dialogue_answerer.py b/wafl/answerer/dialogue_answerer.py index 53b8356a..f12be579 100644 --- a/wafl/answerer/dialogue_answerer.py +++ b/wafl/answerer/dialogue_answerer.py @@ -11,7 +11,7 @@ ) from wafl.answerer.base_answerer import BaseAnswerer from wafl.answerer.rule_maker import RuleMaker -from wafl.connectors.clients.llm_chitchat_answer_client import LLMChitChatAnswerClient +from wafl.connectors.clients.llm_chat_client import LLMChatClient from wafl.dataclasses.dataclasses import Query, Answer from wafl.interface.conversation import Conversation from wafl.simple_text_processing.questions import is_question @@ -20,7 +20,7 @@ class DialogueAnswerer(BaseAnswerer): def __init__(self, config, knowledge, interface, code_path, logger): self._threshold_for_facts = 0.85 - self._client = LLMChitChatAnswerClient(config) + self._client = LLMChatClient(config) self._knowledge = knowledge self._logger = logger self._interface = interface diff --git a/wafl/connectors/clients/llm_chitchat_answer_client.py b/wafl/connectors/clients/llm_chat_client.py similarity index 60% rename from wafl/connectors/clients/llm_chitchat_answer_client.py rename to wafl/connectors/clients/llm_chat_client.py index 2302dc31..ef0da698 100644 --- a/wafl/connectors/clients/llm_chitchat_answer_client.py +++ b/wafl/connectors/clients/llm_chat_client.py @@ -1,5 +1,4 @@ import os -import textwrap from typing import List from wafl.connectors.factories.llm_connector_factory import LLMConnectorFactory @@ -8,10 +7,12 @@ _path = os.path.dirname(__file__) -class LLMChitChatAnswerClient: +class LLMChatClient: def __init__(self, config): self._connector = LLMConnectorFactory.get_connector(config) self._config = config + with open(self._config.get_value("prompt_filename")) as f: + self.prompt = f.read() async def get_answer(self, text: str, dialogue: str, rules_text: List[str]) -> str: prompt = 
await self._get_answer_prompt(text, dialogue, "\n".join(rules_text)) @@ -26,16 +27,4 @@ async def _get_answer_prompt( ) def _get_system_prompt(self, text, rules_text): - return f""" -A user is chatting with a bot. The chat is happening through a web interface. The user is typing the messages and the bot is replying. - -This is summary of the bot's knowledge: -{text.strip()} - -The rules that *must* be followed are: -{rules_text.strip()} - -Create a plausible dialogue based on the aforementioned summary and rules. -Do not repeat yourself. Be friendly but not too servile. -Follow the rules if present and they apply to the dialogue. Do not improvise if rules are present. - """.strip() + return self.prompt.replace("{facts}", text.strip()).replace("{rules}", rules_text.strip()).strip() diff --git a/wafl/knowledge/indexing_implementation.py b/wafl/knowledge/indexing_implementation.py index c8fe9be1..0d1ee29b 100644 --- a/wafl/knowledge/indexing_implementation.py +++ b/wafl/knowledge/indexing_implementation.py @@ -36,14 +36,14 @@ async def load_knowledge(config, logger=None): with open(index_filename) as file: index_txt = file.read() - if os.path.exists(config.get_value("index_filename")): - knowledge = joblib.load(config.get_value("index_filename")) + if os.path.exists(config.get_value("cache_filename")): + knowledge = joblib.load(config.get_value("cache_filename")) if knowledge.hash == hash(rules_txt + index_txt): return knowledge knowledge = SingleFileKnowledge(config, rules_txt, logger=logger) knowledge = await _add_indices_to_knowledge(knowledge, index_txt) - joblib.dump(knowledge, config.get_value("index_filename")) + joblib.dump(knowledge, config.get_value("cache_filename")) await knowledge.initialize_retrievers() return knowledge diff --git a/wafl/templates/config.json b/wafl/templates/config.json index a0119c57..7af9893e 100644 --- a/wafl/templates/config.json +++ b/wafl/templates/config.json @@ -4,7 +4,8 @@ "deactivate_sound": true, "rules": "rules.yaml", 
"index": "indices.yaml", - "index_filename": "knowledge_cache", + "cache_filename": "knowledge_cache", + "prompt_filename": "main.prompt", "functions": "functions.py", "max_recursion": 2, "frontend_port": 8090, diff --git a/wafl/templates/main.prompt b/wafl/templates/main.prompt new file mode 100644 index 00000000..07b45290 --- /dev/null +++ b/wafl/templates/main.prompt @@ -0,0 +1,11 @@ +A user is chatting with a bot. The chat is happening through a web interface. The user is typing the messages and the bot is replying. + +This is summary of the bot's knowledge: +{facts} + +The rules that *must* be followed are: +{rules} + +Create a plausible dialogue based on the aforementioned summary and rules. +Do not repeat yourself. Be friendly but not too servile. +Follow the rules if present and they apply to the dialogue. Do not improvise if rules are present. \ No newline at end of file