From 97f3f53795dbe50830a82a8cb1c656513cdb24e2 Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Fri, 12 Jan 2024 18:13:43 +0000 Subject: [PATCH 01/10] fixed bug for recursive rules --- wafl/answerer/rule_creator.py | 7 +++++-- wafl/connectors/base_llm_connector.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/wafl/answerer/rule_creator.py b/wafl/answerer/rule_creator.py index a1a49ba0..4ecc3690 100644 --- a/wafl/answerer/rule_creator.py +++ b/wafl/answerer/rule_creator.py @@ -17,7 +17,7 @@ def __init__( self._indent_str = " " async def create_from_query(self, query): - rules = await self._knowledge.ask_for_rule_backward(query) + rules = await self._knowledge.ask_for_rule_backward(query, threshold=0.95) rules = rules[: self._max_num_rules] rules_texts = [] for rule in rules: @@ -34,6 +34,9 @@ async def create_from_query(self, query): return "\n".join(rules_texts) async def recursively_add_rules(self, query, depth=2): + if depth > self._max_indentation: + return "" + rules = await self._knowledge.ask_for_rule_backward(query, threshold=0.95) rules = rules[: self._max_num_rules] rules_texts = [] @@ -42,7 +45,7 @@ async def recursively_add_rules(self, query, depth=2): for cause_index, causes in enumerate(rule.causes): indentation = self._indent_str * depth rules_text += f"{indentation}{cause_index + 1}) {causes.text}\n" - rules_text += await self.recursively_add_rules(causes.text, depth + 1) + rules_text += await self.recursively_add_rules(causes, depth + 1) rules_texts.append(rules_text) diff --git a/wafl/connectors/base_llm_connector.py b/wafl/connectors/base_llm_connector.py index 806164e4..2c800c20 100644 --- a/wafl/connectors/base_llm_connector.py +++ b/wafl/connectors/base_llm_connector.py @@ -59,7 +59,7 @@ async def generate(self, prompt: str) -> str: if end_set: end = min(end_set) - candidate_answer = text[start:end].split("bot: ")[-1].strip() + candidate_answer = text[start:end].strip() candidate_answer = re.sub(r"(.*)<\|.*\|>", r"\1", candidate_answer).strip() if prompt not in self._cache: From e415d2878c6e15e513de80493819e2b60aeae95d Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Fri, 12 Jan 2024 18:28:59 +0000 Subject: [PATCH 02/10] changed threshold for facts --- wafl/answerer/dialogue_answerer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wafl/answerer/dialogue_answerer.py b/wafl/answerer/dialogue_answerer.py index 7fe8c68c..c081a8ea 100644 --- a/wafl/answerer/dialogue_answerer.py +++ b/wafl/answerer/dialogue_answerer.py @@ -126,7 +126,7 @@ async def _get_relevant_facts( > conversational_timestamp - self._max_num_past_utterances_for_facts ] facts_and_thresholds = await self._knowledge.ask_for_facts_with_threshold( - query, is_from_user=True, threshold=0.8 + query, is_from_user=True, threshold=0.95 ) if facts_and_thresholds: facts = [ From 2dd795506c1b9738979cc5a4e6eaf5632923e2cc Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sat, 13 Jan 2024 14:40:25 +0000 Subject: [PATCH 03/10] Added toggle logs icon --- wafl/frontend/index.html | 31 +++++++++++++++++++++++++++++++ wafl/runners/routes.py | 1 - 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/wafl/frontend/index.html b/wafl/frontend/index.html index bc7283f1..ecd15020 100644 --- a/wafl/frontend/index.html +++ b/wafl/frontend/index.html @@ -39,6 +39,37 @@ +
+
  • diff --git a/wafl/runners/routes.py b/wafl/runners/routes.py index 169bfabc..59b8cfaa 100644 --- a/wafl/runners/routes.py +++ b/wafl/runners/routes.py @@ -9,7 +9,6 @@ from wafl.config import Configuration from wafl.events.conversation_events import ConversationEvents from wafl.interface.queue_interface import QueueInterface -from wafl.knowledge.single_file_knowledge import SingleFileKnowledge from wafl.logger.local_file_logger import LocalFileLogger from wafl.scheduler.conversation_loop import ConversationLoop from wafl.scheduler.scheduler import Scheduler From 82a5acc7dc3ccadf9dabc891a628f1654d29e735 Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sun, 14 Jan 2024 17:28:00 +0000 Subject: [PATCH 04/10] added toggle logs backend --- wafl/answerer/dialogue_answerer.py | 2 +- wafl/command_line.py | 2 +- wafl/frontend/index.html | 2 +- wafl/run.py | 1 - wafl/runners/routes.py | 6 +++ wafl/runners/run_from_actions.py | 10 ++--- wafl/scheduler/messages_creator.py | 70 ++++++++++++++++++++++++++++++ wafl/scheduler/web_loop.py | 60 ++++--------------------- 8 files changed, 93 insertions(+), 60 deletions(-) create mode 100644 wafl/scheduler/messages_creator.py diff --git a/wafl/answerer/dialogue_answerer.py b/wafl/answerer/dialogue_answerer.py index c081a8ea..12a52542 100644 --- a/wafl/answerer/dialogue_answerer.py +++ b/wafl/answerer/dialogue_answerer.py @@ -212,7 +212,7 @@ async def _run_code(self, to_execute): break if not result: - result = f'\n"""python\n{to_execute}\n"""' + result = f"\n```python\n{to_execute}\n```" return result diff --git a/wafl/command_line.py b/wafl/command_line.py index 14fbb6cd..fc68db06 100644 --- a/wafl/command_line.py +++ b/wafl/command_line.py @@ -7,7 +7,7 @@ run_from_command_line, run_testcases, print_incipit, - download_models + download_models, ) from wafl.runners.run_from_actions import run_action from wafl.runners.run_from_audio import run_from_audio diff --git a/wafl/frontend/index.html b/wafl/frontend/index.html index ecd15020..8137ca99 100644 --- a/wafl/frontend/index.html +++ b/wafl/frontend/index.html @@ -42,7 +42,7 @@
  • " + ) + conversation += "".join(dialogue_items) + return conversation + + async def _get_logs(self): + choices = self._interface.get_choices_and_timestamp() + choices = [ + ( + item[0], + "
    " + item[1] + "
    ", + ) + for item in choices + ] + facts = self._interface.get_facts_and_timestamp() + facts = [ + ( + item[0], + "
    " + item[1] + "
    ", + ) + for item in facts + ] + + choices_and_facts = choices + facts + choices_and_facts = sorted(choices_and_facts, key=lambda x: x[0])[::-1] + choices_and_facts = [item[1] for item in choices_and_facts] + conversation = "" + conversation += "
    " + conversation += "".join(choices_and_facts) + conversation += "
    " + return conversation diff --git a/wafl/scheduler/web_loop.py b/wafl/scheduler/web_loop.py index f43a5a9e..0d3510d0 100644 --- a/wafl/scheduler/web_loop.py +++ b/wafl/scheduler/web_loop.py @@ -4,9 +4,7 @@ from flask import render_template, request, jsonify from wafl.interface.queue_interface import QueueInterface from wafl.logger.history_logger import HistoryLogger -from wafl.scheduler.web_interface_implementation import ( - get_html_from_dialogue_item, -) +from wafl.scheduler.messages_creator import MessagesCreator _path = os.path.dirname(__file__) @@ -23,6 +21,7 @@ def __init__( self._conversation_id = conversation_id self._conversation_events = conversation_events self._prior_dialogue_items = "" + self._messages_creator = MessagesCreator(self._interface) async def index(self): return render_template("index.html", conversation_id=self._conversation_id) @@ -49,7 +48,7 @@ async def reset_conversation(self): self._conversation_events.reload_knowledge() self._conversation_events.reset_discourse_memory() await self._interface.output("Hello. How may I help you?") - conversation = await self._get_conversation() + conversation = await self._messages_creator.get_messages_window() return conversation async def reload_rules(self): @@ -59,7 +58,7 @@ async def reload_rules(self): return "" async def check_for_new_messages(self): - conversation = await self._get_conversation() + conversation = await self._messages_creator.get_messages_window() if conversation != self._prior_dialogue_items: self._prior_dialogue_items = conversation return f""" @@ -75,7 +74,7 @@ async def check_for_new_messages(self): return "
    " async def load_messages(self): - conversation = await self._get_conversation() + conversation = await self._messages_creator.get_messages_window() return conversation async def handle_output(self): @@ -93,51 +92,10 @@ async def thumbs_down(self): self._history_logger.write("thumbs_down") return jsonify("") + async def toggle_logs(self): + self._messages_creator.toggle_logs() + return jsonify("") + async def run(self): print(f"New web server instance {self._conversation_id} running!") return - - async def _get_conversation(self): - dialogue_items = self._interface.get_utterances_list_with_timestamp() - dialogue = [] - for index, item in enumerate(dialogue_items): - dialogue.append( - ( - item[0], - get_html_from_dialogue_item( - item[1], - ), - ) - ) - - choices = self._interface.get_choices_and_timestamp() - choices = [ - ( - item[0], - "
    " + item[1] + "
    ", - ) - for item in choices - ] - facts = self._interface.get_facts_and_timestamp() - facts = [ - ( - item[0], - "
    " + item[1] + "
    ", - ) - for item in facts - ] - choices_and_facts = choices + facts - choices_and_facts = sorted(choices_and_facts, key=lambda x: x[0])[::-1] - choices_and_facts = [item[1] for item in choices_and_facts] - dialogue_items = dialogue - dialogue_items = sorted(dialogue_items, key=lambda x: x[0])[::-1] - dialogue_items = [item[1] for item in dialogue_items] - conversation = ( - "
    " - ) - conversation += "".join(dialogue_items) - conversation += "
    " - conversation += "
    " - conversation += "".join(choices_and_facts) - conversation += "
    " - return conversation From 4b7eecdabca5e52b96613439d6ee8d82d6051487 Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sun, 21 Jan 2024 17:48:49 +0000 Subject: [PATCH 05/10] better rules nesting recursion --- tests/config.json | 1 + tests/test_rules.py | 27 +++++++++++++++++++++++++++ wafl/answerer/rule_creator.py | 10 +++++++--- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/tests/config.json b/tests/config.json index 4bb4fa90..c78c40b3 100644 --- a/tests/config.json +++ b/tests/config.json @@ -4,6 +4,7 @@ "deactivate_sound": true, "rules": "rules.yaml", "functions": "functions.py", + "max_recursion": 2, "llm_model": { "model_host": "localhost", "model_port": 8080 diff --git a/tests/test_rules.py b/tests/test_rules.py index 619e3cd9..990f5492 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -13,6 +13,16 @@ - the user says their name: - reply casually to the conversation" + + - the user wants to buy coffee: + - the bot says the coffee prices + - ask for which price range + - tell them the right coffee + + - the bot says the coffee prices: + - decaf is 1.50 + - regular is 1.00 + - espresso is 2.00 """ @@ -48,3 +58,20 @@ def test__rules_are_not_always_triggered(self): asyncio.run(conversation_events.process_next()) unexpected = "bot: the horse is tall" self.assertNotEqual(unexpected, interface.get_utterances_list()[-1]) + + def test__rules_can_nest(self): + interface = DummyInterface( + to_utter=[ + "I want to buy coffee", + ] + ) + config = Configuration.load_local_config() + config.set_value("rules", wafl_example) + conversation_events = ConversationEvents( + config=config, + interface=interface, + ) + asyncio.run(conversation_events.process_next()) + self.assertIn("decaf", interface.get_facts_and_timestamp()[0][1]) + self.assertIn("regular", interface.get_facts_and_timestamp()[0][1]) + self.assertIn("espresso", interface.get_facts_and_timestamp()[0][1]) diff --git a/wafl/answerer/rule_creator.py b/wafl/answerer/rule_creator.py index 4ecc3690..78a8ab74 100644 --- a/wafl/answerer/rule_creator.py +++ b/wafl/answerer/rule_creator.py @@ -13,7 +13,11 @@ def __init__( self._interface = interface self._max_num_rules = max_num_rules self._delete_current_rule = delete_current_rule - self._max_indentation = max_recursion + if not config.get_value("max_recursion"): + self._max_indentation = max_recursion + else: + self._max_indentation = config.get_value("max_recursion") + self._indent_str = " " async def create_from_query(self, query): @@ -38,10 +42,10 @@ async def recursively_add_rules(self, query, depth=2): return "" rules = await self._knowledge.ask_for_rule_backward(query, threshold=0.95) - rules = rules[: self._max_num_rules] + rules = rules[:1] rules_texts = [] for rule in rules: - rules_text = f"- If {rule.effect.text} go through the following points:\n" + rules_text = "" for cause_index, causes in enumerate(rule.causes): indentation = self._indent_str * depth rules_text += f"{indentation}{cause_index + 1}) {causes.text}\n" From 1b860e81235aa2a64a30c41127ad4cce916d3de2 Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sat, 27 Jan 2024 10:46:25 +0000 Subject: [PATCH 06/10] changed the name of the rule maker --- wafl/answerer/dialogue_answerer.py | 6 +++--- wafl/answerer/{rule_creator.py => rule_maker.py} | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) rename wafl/answerer/{rule_creator.py => rule_maker.py} (98%) diff --git a/wafl/answerer/dialogue_answerer.py b/wafl/answerer/dialogue_answerer.py index 12a52542..160d9f04 100644 --- 
a/wafl/answerer/dialogue_answerer.py +++ b/wafl/answerer/dialogue_answerer.py @@ -10,7 +10,7 @@ get_last_user_utterance, ) from wafl.answerer.base_answerer import BaseAnswerer -from wafl.answerer.rule_creator import RuleCreator +from wafl.answerer.rule_maker import RuleMaker from wafl.connectors.bridges.llm_chitchat_answer_bridge import LLMChitChatAnswerBridge from wafl.exceptions import CloseConversation from wafl.extractors.dataclasses import Query, Answer @@ -31,7 +31,7 @@ def __init__(self, config, knowledge, interface, code_path, logger): self._init_python_module(code_path.replace(".py", "")) self._prior_rule_with_timestamp = None self._max_predictions = 3 - self._rule_creator = RuleCreator( + self._rule_creator = RuleMaker( knowledge, config, interface, @@ -206,7 +206,7 @@ async def _run_code(self, to_execute): except Exception as e: result = ( - f'Error while executing\n\n"""python\n{to_execute}\n"""\n\n{str(e)}' + f'Error while executing\n\n```python\n{to_execute}\n```\n\n{str(e)}' ) traceback.print_exc() break diff --git a/wafl/answerer/rule_creator.py b/wafl/answerer/rule_maker.py similarity index 98% rename from wafl/answerer/rule_creator.py rename to wafl/answerer/rule_maker.py index 78a8ab74..7819fab5 100644 --- a/wafl/answerer/rule_creator.py +++ b/wafl/answerer/rule_maker.py @@ -1,4 +1,4 @@ -class RuleCreator: +class RuleMaker: def __init__( self, knowledge, @@ -21,7 +21,7 @@ def __init__( self._indent_str = " " async def create_from_query(self, query): - rules = await self._knowledge.ask_for_rule_backward(query, threshold=0.95) + rules = await self._knowledge.ask_for_rule_backward(query, threshold=0.92) rules = rules[: self._max_num_rules] rules_texts = [] for rule in rules: From 69e22c34938918504855c3bbef44c260d140869f Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sat, 27 Jan 2024 15:07:28 +0000 Subject: [PATCH 07/10] changed thresholds --- wafl/answerer/dialogue_answerer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wafl/answerer/dialogue_answerer.py b/wafl/answerer/dialogue_answerer.py index 160d9f04..a115d77c 100644 --- a/wafl/answerer/dialogue_answerer.py +++ b/wafl/answerer/dialogue_answerer.py @@ -126,7 +126,7 @@ async def _get_relevant_facts( > conversational_timestamp - self._max_num_past_utterances_for_facts ] facts_and_thresholds = await self._knowledge.ask_for_facts_with_threshold( - query, is_from_user=True, threshold=0.95 + query, is_from_user=True, threshold=0.85 ) if facts_and_thresholds: facts = [ From 7db2d4ce32e8dc0d8e402e5c50b17a0552e7f07b Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sat, 16 Mar 2024 16:20:19 +0000 Subject: [PATCH 08/10] created datset for functions --- datasets/__item__.py | 0 datasets/create_final_dataset_table.py | 54 ++++++++++++ ...create_negative_examples_from_positives.py | 73 ++++++++++++++++ datasets/create_rules_dataset.py | 34 +++++--- ...reate_rules_dataset_from_ultrachat_200k.py | 50 +++++++++++ ...e_summary_and_memory_from_conversations.py | 71 +++++++++++++++ datasets/data/current_item_index.json | 1 + .../delete_repeating_negative_examples.py | 35 ++++++++ ..._original_prompt_onto_negative_examples.py | 50 +++++++++++ datasets/frontend/index.html | 57 ++++++++++++ datasets/select_dataset_rules.py | 34 ++++++++ datasets/server.py | 86 +++++++++++++++++++ todo.txt | 36 ++++++++ .../connectors/remote/remote_llm_connector.py | 14 +-- 14 files changed, 579 insertions(+), 16 deletions(-) create mode 100644 datasets/__item__.py create mode 100644 datasets/create_final_dataset_table.py create 
mode 100644 datasets/create_negative_examples_from_positives.py create mode 100644 datasets/create_rules_dataset_from_ultrachat_200k.py create mode 100644 datasets/create_summary_and_memory_from_conversations.py create mode 100644 datasets/data/current_item_index.json create mode 100644 datasets/delete_repeating_negative_examples.py create mode 100644 datasets/fit_original_prompt_onto_negative_examples.py create mode 100644 datasets/frontend/index.html create mode 100644 datasets/select_dataset_rules.py create mode 100644 datasets/server.py diff --git a/datasets/__item__.py b/datasets/__item__.py new file mode 100644 index 00000000..e69de29b diff --git a/datasets/create_final_dataset_table.py b/datasets/create_final_dataset_table.py new file mode 100644 index 00000000..d933f224 --- /dev/null +++ b/datasets/create_final_dataset_table.py @@ -0,0 +1,54 @@ +import json + +import pandas as pd + +from datasets import load_dataset + +from wafl.config import Configuration +from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector +from wafl.retriever.dense_retriever import DenseRetriever + + +def get_memory(text): + start_pattern = "This is the summary of the bot's knowledge: \n" + end_pattern = "\nThe rules are as follows:\n" + return text[text.find(start_pattern) + len(start_pattern) : text.find(end_pattern)] + + +def get_rules(text): + start_pattern = "\nThe rules are as follows:\n" + end_pattern = "\nThe conversation goes as follows:\n" + return text[text.find(start_pattern) + len(start_pattern) : text.find(end_pattern)] + + +def get_conversation(text): + pattern = "\nThe conversation goes as follows:\n" + return text.split(pattern)[1] + + +if __name__ == "__main__": + config = Configuration.load_local_config() + retriever = DenseRetriever("text_embedding_model", config) + stopwords = ["", "", "<|assistant|>", ""] + remote_llm_connector = RemoteLLMConnector( + config.get_value("llm_model"), last_strings=stopwords + ) + + data_positive = json.load(open("data/positive_examples.json")) + data_negative = json.load(open("data/to_modify_negative_examples.json")) + output_dict = {"memory": [], "rules": [], "positive_conversation": [], "negative_conversation": []} + for text_positive, text_negative in zip(data_positive, data_negative): + memory = get_memory(text_positive) + rules = get_rules(text_positive) + positive_conversation = get_conversation(text_positive) + negative_conversation = get_conversation(text_negative) + output_dict["memory"].append(memory) + output_dict["rules"].append(rules) + output_dict["positive_conversation"].append(positive_conversation) + output_dict["negative_conversation"].append(negative_conversation) + + df = pd.DataFrame(output_dict) + df.to_parquet("data/wafl_functions.parquet", index=False) + dataset = load_dataset("parquet", data_files="data/wafl_functions.parquet") + dataset.push_to_hub("fractalego/wafl-functions-dataset") + print("Dataset pushed to the hub") \ No newline at end of file diff --git a/datasets/create_negative_examples_from_positives.py b/datasets/create_negative_examples_from_positives.py new file mode 100644 index 00000000..e90af49d --- /dev/null +++ b/datasets/create_negative_examples_from_positives.py @@ -0,0 +1,73 @@ +import asyncio +import json +from fuzzywuzzy import fuzz +from tqdm import tqdm + +from wafl.config import Configuration +from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector +from wafl.retriever.dense_retriever import DenseRetriever + + +def get_new_task(): + return ( + "\n\n" + "\n" + "Write a 
conversation that does not follow the same pattern as the previous one.\n" + "You must create a new conversation starting as the old one.\n" + "In particular, the new conversation between the user and the bot does not follow and contraddicts the rules and information above.\n" + "You can keep the conversation as it is but without any <> tags\n" + "You can also use the wrong tags.\n" + "You can also create a conversation that contradicts the rules above.\n" + "Show the results below, write at most 10 lines and end with the tag :\n" + ) + + +def get_prompt_before_conversation(text): + pattern = "\nThe conversation goes as follows:\n" + return text.split(pattern)[0] + pattern + + +def get_conversation(text): + pattern = "\nThe conversation goes as follows:\n" + return text.split(pattern)[1] + + +if __name__ == "__main__": + config = Configuration.load_local_config() + retriever = DenseRetriever("text_embedding_model", config) + stopwords = ["", "", "<|assistant|>", ""] + remote_llm_connector = RemoteLLMConnector( + config.get_value("llm_model"), last_strings=stopwords + ) + + data = json.load(open("data/positive_examples.json")) + output_data = json.load(open("data/tmp_negative_examples.json")) + tags = ["", "", "", "", "", "", "", "", + "", "", "", "", "", "", "", "" + ] + for text in tqdm(data[len(output_data):]): + text += get_new_task() + to_save = "" + while len(to_save) < 200 or to_save in text or fuzz.ratio(to_save, get_conversation(text)) > 90: + conversation = asyncio.run(remote_llm_connector.predict(text, num_tokens=2048, num_replicas=1)) + to_save = conversation[0] + for item in stopwords: + to_save = to_save.replace(item, "") + to_save = to_save.split("")[0] + + for item in tags: + to_save = to_save.replace(item, "") + + output_data.append(to_save) + json.dump(output_data, open("data/tmp_negative_examples.json", "w"), indent=2) + + data = json.load(open("data/positive_examples.json")) + output_data = json.load(open("data/tmp_negative_examples.json")) + negative_data = [] + for positive, negative in zip(data, output_data): + prompt = get_prompt_before_conversation(positive) + if prompt not in negative: + negative = prompt + negative + negative_data.append(negative) + + json.dump(negative_data, open("data/negative_examples.json", "w"), indent=2) \ No newline at end of file diff --git a/datasets/create_rules_dataset.py b/datasets/create_rules_dataset.py index c711745a..9f4bf0ec 100644 --- a/datasets/create_rules_dataset.py +++ b/datasets/create_rules_dataset.py @@ -3,11 +3,14 @@ from wafl.config import Configuration from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector +from wafl.retriever.dense_retriever import DenseRetriever -def get_prompt(df, theme): +def get_prompt(df, theme, retriever, num_samples): + indices_and_scores = asyncio.run(retriever.get_indices_and_scores_from_text(theme)) + indices = [index for index, _ in indices_and_scores] prompt = "" - for _, row in df.sample(9).iterrows(): + for _, row in df.iloc[indices].sample(num_samples).iterrows(): prompt += ( f""" @@ -23,23 +26,32 @@ def get_prompt(df, theme): + "\n\n" ) + incipit = f"Create plausible dialogue about the theme \"{theme}\" based on the following summary and rules.\n\nThe rules are as follows:\n" + return ( prompt - + f'\nCreate plausible dialogue about the theme "{theme}" based on the following summary and rules.\n\nThe rules are as follows:\n' + + f'\n{incipit}' ) -if __name__ == "__main__": +def generate_conversation(theme, num_samples=2): config = Configuration.load_local_config() + 
retriever = DenseRetriever("text_embedding_model", config) remote_llm_connector = RemoteLLMConnector( - config.get_value("llm_model"), last_strings=[""] + config.get_value("llm_model"), last_strings=[""], num_replicas=1 ) df = pd.read_csv("data/complex_instructions.csv") + for index, row in df.iterrows(): + asyncio.run(retriever.add_text_and_index(row["Theme"], str(index))) + + prompt = get_prompt(df, theme, retriever, num_samples=num_samples) + generated_text = asyncio.run( + remote_llm_connector.predict(prompt, temperature=0.5, num_tokens=2500) + )[0] + return "The rules are as follows:\n" + generated_text + + +if __name__ == "__main__": theme = "playing a song that the user likes" - prompt = get_prompt(df, theme) - print( - asyncio.run( - remote_llm_connector.predict(prompt, temperature=0.5, num_tokens=1500) - ) - ) + print(generate_conversation(theme)) diff --git a/datasets/create_rules_dataset_from_ultrachat_200k.py b/datasets/create_rules_dataset_from_ultrachat_200k.py new file mode 100644 index 00000000..ecd01d7b --- /dev/null +++ b/datasets/create_rules_dataset_from_ultrachat_200k.py @@ -0,0 +1,50 @@ +import json + +import pandas as pd +from tqdm import tqdm + +from create_rules_dataset import generate_conversation +from wafl.config import Configuration +from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector +from wafl.retriever.dense_retriever import DenseRetriever + +_max_rows = 2000 + + +def conversation_is_valid(conversation): + if "\nuser: " not in conversation: + return False + + if "- The user " not in conversation: + return False + + return True + + +def clean_conversation(conversation): + conversation = conversation.replace("", "") + conversation = conversation.replace("", "") + return conversation + + +if __name__ == "__main__": + config = Configuration.load_local_config() + retriever = DenseRetriever("text_embedding_model", config) + remote_llm_connector = RemoteLLMConnector( + config.get_value("llm_model"), last_strings=[""] + ) + + df = pd.read_parquet("data/train_gen-00000-of-00003-a6c9fb894be3e50b.parquet") + dataset_items = json.load(open("data/rules_and_conversations.json")) + for index, row in tqdm(df[len(dataset_items):_max_rows + len(dataset_items)].iterrows(), total=_max_rows): + theme = row["prompt"] + conversation = generate_conversation(theme, num_samples=3) + if not conversation_is_valid(conversation): + continue + + if conversation in dataset_items: + continue + + conversation = clean_conversation(conversation) + dataset_items.append(conversation) + json.dump(dataset_items, open("data/rules_and_conversations.json", "w"), indent=2) diff --git a/datasets/create_summary_and_memory_from_conversations.py b/datasets/create_summary_and_memory_from_conversations.py new file mode 100644 index 00000000..45142d78 --- /dev/null +++ b/datasets/create_summary_and_memory_from_conversations.py @@ -0,0 +1,71 @@ +import asyncio +import json +import re + +from wafl.config import Configuration +from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector +from wafl.retriever.dense_retriever import DenseRetriever + +_max_rows = 2000 +items = json.load(open("data/accepted_rules_and_conversations.json")) + + +def get_prompt(conversation_item): + return f""" +Your task is to read the following conversation and create a summary of the knowledge that is being shared. +Specifically, you must create a summary of the bot's knowledge that is being shared in the conversation. 
+The conversation follow some rules explained at the beginning of the conversation. +Never *under any circumstance* include in the summary the rules that are explained at the beginning of the conversation. +Do not include in the summary the conversation itself, just what the bot seem to know given its answers. +If the bot outputs a text withing the tags and or and include that text in the summary. +The text containing the rules and the conversation is as follows: + +{conversation_item} + + +Summary of the bot's knowledge: + """.strip() + + +def clean_summary(summary): + return summary.replace("", "") + + +def clean_conversation_item(item): + # Change <|USER|>\n into user + item = item.replace("<|USER|>\n", "user: ") + item = item.replace("<|BOT|>\n", "bot: ") + + # Change function() into function() if there is a python function call being made + #item = re.sub("(.*?)\s(.*?\()(.*?)", r"\1 \2\3", item) + #item = re.sub("(.*?)\s(.*?\()(.*?)", r"\1 \2\3", item) + + # some sentences are between [] and should be removed + item = re.sub(r"\[.*?\]", "", item) + + # sometimes at the end of the conversation the bot says "Process finished with exit code 0". Erase this + item = re.sub(r"Process finished with exit code 0", "", item) + + # todo User -> user, or at least be internally consistent + item = item.replace("User: ", "user: ") + + return item + + +if __name__ == "__main__": + config = Configuration.load_local_config() + retriever = DenseRetriever("text_embedding_model", config) + remote_llm_connector = RemoteLLMConnector( + config.get_value("llm_model"), last_strings=[""] + ) + + complete_items = [] + for item in items: + item = clean_conversation_item(item) + summary = asyncio.run( + remote_llm_connector.predict(get_prompt(item), temperature=0.5, num_tokens=2500) + )[0] + summary = clean_summary(summary) + complete_items.append(f"This is the summary of the bot's knowledge: {summary}\n\n{item}") + json.dump(complete_items, open("data/summary_and_memory_from_conversations.json", "w"), indent=2) + diff --git a/datasets/data/current_item_index.json b/datasets/data/current_item_index.json new file mode 100644 index 00000000..5ca234cb --- /dev/null +++ b/datasets/data/current_item_index.json @@ -0,0 +1 @@ +345 \ No newline at end of file diff --git a/datasets/delete_repeating_negative_examples.py b/datasets/delete_repeating_negative_examples.py new file mode 100644 index 00000000..0a5c837b --- /dev/null +++ b/datasets/delete_repeating_negative_examples.py @@ -0,0 +1,35 @@ +import json + +from tqdm import tqdm +from wafl.config import Configuration +from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector +from wafl.retriever.dense_retriever import DenseRetriever + + +def find_all_occurences(sentence, word): + import re + return [match.start() for match in re.finditer(word, sentence)] + + +if __name__ == "__main__": + config = Configuration.load_local_config() + retriever = DenseRetriever("text_embedding_model", config) + stopwords = ["", "", "<|assistant|>", ""] + remote_llm_connector = RemoteLLMConnector( + config.get_value("llm_model"), last_strings=stopwords + ) + + data = json.load(open("data/to_modify_negative_examples.json")) + output_data = [] + for text in tqdm(data): + conversation = text + all_new_lines = find_all_occurences(conversation, "\n") + for new_line in all_new_lines: + tail = conversation[new_line:] + if len(tail) > len(conversation)/10. 
and tail in text[:new_line]: + conversation = conversation[:new_line] + break + + output_data.append(conversation) + + json.dump(output_data, open("data/negative_examples.json", "w"), indent=2) \ No newline at end of file diff --git a/datasets/fit_original_prompt_onto_negative_examples.py b/datasets/fit_original_prompt_onto_negative_examples.py new file mode 100644 index 00000000..17cee59b --- /dev/null +++ b/datasets/fit_original_prompt_onto_negative_examples.py @@ -0,0 +1,50 @@ +import asyncio +import json +from fuzzywuzzy import fuzz +from tqdm import tqdm + +from wafl.config import Configuration +from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector +from wafl.retriever.dense_retriever import DenseRetriever + + +def get_new_task(): + return ( + "\n\n" + "\n" + "Write a conversation exactly like the old one with one exception:\n" + "You *must* insert in the middle of the conversation a function to be executed\n" + "For example, if a conversation is about fruits, you can insert a function that calculates the number of fruits like this number_of_fruits()\n" + "You can use the tags and to insert the function or and or and \n, or and . Choose randomly.\n" + "Show the results below and end with the tag :\n" + ) + + +def get_prompt_before_conversation(text): + pattern = "\nThe conversation goes as follows:\n" + return text.split(pattern)[0] + pattern + + +def get_conversation(text): + pattern = "\nThe conversation goes as follows:\n" + return text.split(pattern)[1] + + +if __name__ == "__main__": + config = Configuration.load_local_config() + retriever = DenseRetriever("text_embedding_model", config) + stopwords = ["", "", "<|assistant|>", ""] + remote_llm_connector = RemoteLLMConnector( + config.get_value("llm_model"), last_strings=stopwords + ) + + data_positive = json.load(open("data/positive_examples.json")) + data_negative = json.load(open("data/to_modify_negative_examples.json")) + + output_data = json.load(open("data/tmp_negative_examples.json")) + for text_positive, text_negative in zip(data_positive, data_negative): + to_save = get_prompt_before_conversation(text_positive) + to_save += get_conversation(text_negative) + output_data.append(to_save) + + json.dump(output_data, open("data/negative_examples.json", "w"), indent=2) \ No newline at end of file diff --git a/datasets/frontend/index.html b/datasets/frontend/index.html new file mode 100644 index 00000000..7fe03f8b --- /dev/null +++ b/datasets/frontend/index.html @@ -0,0 +1,57 @@ + + + + Modify the items + + + + + +
    +
    +
    + +
    +
    + + + + +
    +
    + \ No newline at end of file diff --git a/datasets/select_dataset_rules.py b/datasets/select_dataset_rules.py new file mode 100644 index 00000000..a479338c --- /dev/null +++ b/datasets/select_dataset_rules.py @@ -0,0 +1,34 @@ +import json + +import pandas as pd +from tqdm import tqdm + +from create_rules_dataset import generate_conversation +from wafl.config import Configuration +from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector +from wafl.retriever.dense_retriever import DenseRetriever + + +if __name__ == "__main__": + config = Configuration.load_local_config() + dataset_items = json.load(open("data/rules_and_conversations.json")) + discarded_items = json.load(open("data/discarded_rules_and_conversations.json")) + accepted_items = json.load(open("data/accepted_rules_and_conversations.json")) + + for item in dataset_items: + if item in discarded_items or item in accepted_items: + continue + + print(item) + y_n = input("Do you want to accept this item? (y/n)") + if y_n != "y": + discarded_items.append(item) + json.dump(discarded_items, open("data/discarded_rules_and_conversations.json", "w"), indent=2) + print("\n\n") + continue + + accepted_items.append(item) + json.dump(accepted_items, open("data/accepted_rules_and_conversations.json", "w"), indent=2) + print("\n\n") + + #### CHANGE <|USER|>\n into user: (some of the elements are in the wrong format) diff --git a/datasets/server.py b/datasets/server.py new file mode 100644 index 00000000..24b296b7 --- /dev/null +++ b/datasets/server.py @@ -0,0 +1,86 @@ +import json +import os +from flask import Flask, render_template, request + +_path = os.path.dirname(__file__) + +app = Flask( + __name__, + static_url_path="", + static_folder=os.path.join(_path, "./frontend/"), + template_folder=os.path.join(_path, "./frontend/"), +) +filename = "negative_examples.json" +#filename = "positive_examples.json" +items = json.load(open(f"data/{filename}")) +current_item_index = json.load(open("data/current_item_index.json")) + + +@app.route('/') +def hello_world(): + return render_template('index.html') + + +@app.route('/current_item/') +def current_item(): + return f""" + + """ + + +@app.route('/next_item/') +def next_item(): + global current_item_index + current_item_index += 1 + json.dump(current_item_index, open("data/current_item_index.json", "w")) + return f""" + + """ + + +@app.route('/previous_item/') +def previous_item(): + global current_item_index + current_item_index -= 1 + json.dump(current_item_index, open("data/current_item_index.json", "w")) + return f""" + + """ + + +@app.route('/save_item/', methods=['GET']) +def save_item(): + global current_item_index + items[current_item_index] = request.values['editor'] + json.dump(items, open(f"data/{filename}", "w"), indent=2) + return "" + + +@app.route('/delete_item/') +def delete_item(): + global current_item_index + items.pop(current_item_index) + json.dump(items, open(f"data/{filename}", "w"), indent=2) + return f""" + + """ + + +if __name__ == '__main__': + app.run(debug=True) diff --git a/todo.txt b/todo.txt index d0d39bbb..170da585 100644 --- a/todo.txt +++ b/todo.txt @@ -1,8 +1,44 @@ ### TODO +* script to add wrong when none are needed PharazonE + + +On the to_modify set: +* sometimes the user answers yes (after "do you confirm?") and the dialogue does not have "user: yes" + + +On the accepted set: +* CHANGE <|USER|>\n into user: (some of the elements are in the wrong format) +* Perhaps change function() into function() (the memory should store the results of the 
function) +* Create a first paragraphs with the summary of the conversation: The conversation must always be grounded on the summary (USE LLM TO CREATE THE SUMMARY) +* The LLM wrote text after hallucinating the result of the execution. Think about how to deal with that. +* all the rules that says "two level of retrieval" should have the trigger rewritten to something more specific +* change "bot" into "assistant" some of times +* some sentences are between [] and should be removed +* put the items in so far in the conversation summary. If it is a function then you need to simulaten the relevant output using the LLM +* sometimes at the end of the conversation the bot says "Process finished with exit code 0". Erase this +* add ability to index files and files in entire folders +* if the bot uses a function to retrieve information, you should add . This is symmetrical to with a function call when necessary. +* some tags like should end the training item text +* todo User -> user, or at least be internally consistent + +* find a way to use HuggingFaceH4/ultrachat_200k as a starting point for each item + - each item should be easy to copy into a csv. + - Separate the items with special tokens/lines +* Create a dataset with about 500 elements + - use hugginface chat dataset as a starting point for + - themes + - conversation guide in prompt + - use LLM to create corresponding python code +* retriever in create_prompt +* change num_replicas back to 10 in remote_llm_connector + + /* create actions from command line /* add condition of when to stop to the actions + +Actions: #### Find way to delete cache in remote llm connector #### Put colors in action output (and dummy interface) #### Add green for when an expectation is matched diff --git a/wafl/connectors/remote/remote_llm_connector.py b/wafl/connectors/remote/remote_llm_connector.py index d232df7d..42fa0da7 100644 --- a/wafl/connectors/remote/remote_llm_connector.py +++ b/wafl/connectors/remote/remote_llm_connector.py @@ -9,13 +9,13 @@ class RemoteLLMConnector(BaseLLMConnector): _max_reply_length = 1024 _num_prediction_tokens = 200 _cache = {} - _num_replicas = 10 - def __init__(self, config, last_strings=None): + def __init__(self, config, last_strings=None, num_replicas=3): super().__init__(last_strings) host = config["model_host"] port = config["model_port"] self._server_url = f"https://{host}:{port}/predictions/bot" + self._num_replicas = num_replicas try: loop = asyncio.get_running_loop() @@ -28,24 +28,28 @@ def __init__(self, config, last_strings=None): ): raise RuntimeError("Cannot connect a running LLM.") - async def predict(self, prompt: str, temperature=None, num_tokens=None) -> [str]: + async def predict(self, prompt: str, temperature=None, num_tokens=None, num_replicas=None) -> [str]: if not temperature: temperature = 0.5 if not num_tokens: num_tokens = self._num_prediction_tokens + if not num_replicas: + num_replicas = self._num_replicas + payload = { "data": prompt, "temperature": temperature, "num_tokens": num_tokens, "last_strings": self._last_strings, - "num_replicas": self._num_replicas, + "num_replicas": num_replicas, } for _ in range(self._max_tries): async with aiohttp.ClientSession( - connector=aiohttp.TCPConnector(ssl=False) + conn_timeout=6000, + connector=aiohttp.TCPConnector(ssl=False), ) as session: async with session.post(self._server_url, json=payload) as response: answer = await response.text() From ea872532bd675419937123862f32cafd5e7b84ad Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Wed, 10 Apr 2024 
16:31:00 +0100 Subject: [PATCH 09/10] tinkering with dpo training --- datasets/tinker_with_llm.py | 31 +++++++++ datasets/train_dpo_supervised.py | 72 ++++++++++++++++++++ datasets/train_dpo_with_wafl_dataset.py | 89 +++++++++++++++++++++++++ datasets/train_sft_with_wafl_dataset.py | 79 ++++++++++++++++++++++ datasets/train_supervised.py | 64 ++++++++++++++++++ todo.txt | 16 ++++- 6 files changed, 350 insertions(+), 1 deletion(-) create mode 100644 datasets/tinker_with_llm.py create mode 100644 datasets/train_dpo_supervised.py create mode 100644 datasets/train_dpo_with_wafl_dataset.py create mode 100644 datasets/train_sft_with_wafl_dataset.py create mode 100644 datasets/train_supervised.py diff --git a/datasets/tinker_with_llm.py b/datasets/tinker_with_llm.py new file mode 100644 index 00000000..0c5097ff --- /dev/null +++ b/datasets/tinker_with_llm.py @@ -0,0 +1,31 @@ +import pandas as pd +from datasets import Dataset +from transformers import AutoTokenizer, AutoModelForCausalLM + +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1") +model = AutoModelForCausalLM.from_pretrained("./wafl_functions") + + +def create_train_dataset(df): + prompts = [] + chosen = [] + rejected = [] + for _, row in df.iterrows(): + prompts.append("This is the summary of the bot's knowledge: \n" + row["memory"].strip() \ + + "\n\nThe rules are as follows:\n" + row[ + "rules"].strip() + "\n\nThe conversation goes as follows:\n") + chosen.append(row["positive_conversation"].strip()) + rejected.append(row["negative_conversation"].strip()) + + return Dataset.from_dict({"prompt": prompts, "chosen": chosen, "rejected": rejected}) + + +if __name__ == '__main__': + df = pd.read_parquet("data/wafl_functions.parquet") + train_dataset = create_train_dataset(df) + user_string = "\nuser: write a haiku about cherry blossom.\nbot:" + input_ids = tokenizer(train_dataset[0]["prompt"] + user_string, return_tensors="pt") + #outputs = model.generate(**input_ids, max_length=input_ids.input_ids.shape[1] + 2, use_cache=True) + outputs = model.generate(**input_ids, do_sample=True, temperature=0.4, max_length=1024, + pad_token_id=tokenizer.eos_token_id) + print(tokenizer.decode(outputs[0])) diff --git a/datasets/train_dpo_supervised.py b/datasets/train_dpo_supervised.py new file mode 100644 index 00000000..3bc77ac8 --- /dev/null +++ b/datasets/train_dpo_supervised.py @@ -0,0 +1,72 @@ +import os + +import pandas as pd +import torch +from datasets import Dataset +from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer +from svd_training.svd_model import SVDMistralForCausalLM + +model_name = "mistralai/Mistral-7B-Instruct-v0.1" +tokenizer = AutoTokenizer.from_pretrained(model_name) +tokenizer.pad_token = tokenizer.eos_token +_filename = "mistral_svd_model.psd" +if os.path.exists(_filename): + model = SVDMistralForCausalLM.create_from_state_dict(torch.load(_filename)) + +else: + model = SVDMistralForCausalLM.create_from_model(AutoModelForCausalLM.from_pretrained(model_name), + rank_fraction=0.1) + torch.save(model.state_dict(), _filename) + + +def normalize_conversations(text): + # text = text.replace("\nuser: ", "<|endoftext|>\n<|user|>\n") + # text = text.replace("user: ", "<|user|>\n") + # text = text.replace("\nbot: ", "<|endoftext|>\n<|assistant|>\n") + # text = text + "<|endoftext|>" + return text + + +def normalize_prompt(text): + # text = "<|system|>\n" + text + return text + + +def create_train_dataset(df): + prompts = [] + chosen = [] + rejected = [] + for _, row in 
df.iterrows(): + prompts.append(normalize_prompt("This is the summary of the bot's knowledge: \n" + row["memory"].strip() \ + + "\n\nThe rules are as follows:\n" + row[ + "rules"].strip() + "\n\nThe conversation goes as follows:\n")) + chosen.append(normalize_conversations(row["positive_conversation"])) + rejected.append(normalize_conversations(row["negative_conversation"])) + + return Dataset.from_dict({"prompt": prompts, "chosen": chosen, "rejected": rejected}) + + +def my_loss(logits, original_logits, target): + loss = torch.mean((output - target)**2) + + + return loss + + + +if __name__ == '__main__': + df = pd.read_parquet("data/wafl_functions.parquet") + train_dataset = create_train_dataset(df) + sample_text = train_dataset[0]["prompt"] + train_dataset[0]["chosen"] + optimizer = torch.optim.Adam(model.parameters(), lr=1e-5) + input_ids = tokenizer(sample_text, return_tensors="pt").input_ids + output = model(input_ids) + loss = output.loss + loss.backward() + optimizer.step() + print("Done") + + input_ids = tokenizer(train_dataset[1]["prompt"], return_tensors="pt").input_ids + output = model.generate(input_ids, max_length=input_ids.shape[1] + 2, use_cache=True) + print(tokenizer.decode(output[0], skip_special_tokens=True)) + diff --git a/datasets/train_dpo_with_wafl_dataset.py b/datasets/train_dpo_with_wafl_dataset.py new file mode 100644 index 00000000..619c6cbb --- /dev/null +++ b/datasets/train_dpo_with_wafl_dataset.py @@ -0,0 +1,89 @@ +import os + +import pandas as pd +import torch +from datasets import Dataset +from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments +from svd_training.svd_model import SVDMistralForCausalLM +from trl import DPOTrainer + +model_name = "mistralai/Mistral-7B-Instruct-v0.1" +tokenizer = AutoTokenizer.from_pretrained(model_name) +tokenizer.pad_token = tokenizer.eos_token +original_model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True) +_filename = "mistral_svd_model.psd" +if os.path.exists(_filename): + model = SVDMistralForCausalLM.create_from_state_dict(torch.load(_filename)) + +else: + model = SVDMistralForCausalLM.create_from_model(AutoModelForCausalLM.from_pretrained(model_name), + rank_fraction=0.1) + torch.save(model.state_dict(), _filename) + +#model.half().cuda() + + +def normalize_conversations(text): + # text = text.replace("\nuser: ", "<|endoftext|>\n<|user|>\n") + # text = text.replace("user: ", "<|user|>\n") + # text = text.replace("\nbot: ", "<|endoftext|>\n<|assistant|>\n") + # text = text + "<|endoftext|>" + return text + + +def normalize_prompt(text): + # text = "<|system|>\n" + text + return text + + +def create_train_dataset(df): + prompts = [] + chosen = [] + rejected = [] + for _, row in df.iterrows(): + prompts.append(normalize_prompt("This is the summary of the bot's knowledge: \n" + row["memory"].strip() \ + + "\n\nThe rules are as follows:\n" + row[ + "rules"].strip() + "\n\nThe conversation goes as follows:\n")) + chosen.append(normalize_conversations(row["positive_conversation"])) + rejected.append(normalize_conversations(row["negative_conversation"])) + + max_prompt_length = max(len(tokenizer.encode(prompt)) for prompt in prompts) + max_chosen_length = max(len(tokenizer.encode(chosen_text)) for chosen_text in chosen) + max_rejected_length = max(len(tokenizer.encode(rejected_text)) for rejected_text in rejected) + + return Dataset.from_dict({"prompt": prompts, "chosen": chosen, "rejected": rejected}), max_prompt_length, max_chosen_length, max_rejected_length + + +if __name__ == 
'__main__': + df = pd.read_parquet("data/wafl_functions.parquet") + train_dataset, max_prompt_length, max_chosen_length, max_rejected_length = create_train_dataset(df) + training_args = TrainingArguments(output_dir="./output", + use_cpu=True, + num_train_epochs=0.3, #### Should be 1 + gradient_checkpointing=True, + per_device_train_batch_size=2, #### Try with 4 + learning_rate=1e-7, + logging_steps=10, + ) + trainer = DPOTrainer( + model, + original_model, + args=training_args, + beta=0.5, + train_dataset=train_dataset, + tokenizer=tokenizer, + max_prompt_length=max_prompt_length, + max_length=max(max_prompt_length + max_chosen_length, max_prompt_length + max_rejected_length), + ) + trainer.train() + model.merge_all() + #input_ids = tokenizer("The capital of England is", return_tensors="pt").input_ids + #output = model.generate(input_ids.cuda(), max_length=input_ids.shape[1] + 10, use_cache=True) + #print(tokenizer.decode(output[0], skip_special_tokens=True)) + model.save_pretrained("wafl_functions") + +#### SFT does not have -int +#### DPO introduces -inf after one step with 1e-8 +#### Does SFTTrainer have a similar issue? If so, it is likely a problem with the Trainer class + +#### TRY WIL LONGER CONTEXT WINDOW: DEFAULT IS 512 (LOOK FOR RIGHT ARGUMENT NAMES, THERE ARE TWO AT LEAST) diff --git a/datasets/train_sft_with_wafl_dataset.py b/datasets/train_sft_with_wafl_dataset.py new file mode 100644 index 00000000..01ba28ed --- /dev/null +++ b/datasets/train_sft_with_wafl_dataset.py @@ -0,0 +1,79 @@ +import os + +import pandas as pd +import torch +from datasets import Dataset +from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments +from svd_training.svd_model import SVDMistralForCausalLM +from trl import DPOTrainer, SFTTrainer + +model_name = "mistralai/Mistral-7B-Instruct-v0.1" +tokenizer = AutoTokenizer.from_pretrained(model_name) +tokenizer.pad_token = tokenizer.eos_token +_filename = "mistral_svd_model.psd" +if os.path.exists(_filename): + model = SVDMistralForCausalLM.create_from_state_dict(torch.load(_filename)) +else: + model = SVDMistralForCausalLM.create_from_model(AutoModelForCausalLM.from_pretrained(model_name), + rank_fraction=0.1) + torch.save(model.state_dict(), _filename) + + +def normalize_conversations(text): + # text = text.replace("\nuser: ", "<|endoftext|>\n<|user|>\n") + # text = text.replace("user: ", "<|user|>\n") + # text = text.replace("\nbot: ", "<|endoftext|>\n<|assistant|>\n") + # text = text + "<|endoftext|>" + return text + + +def normalize_prompt(text): + # text = "<|system|>\n" + text + return text + + +def create_train_dataset(df): + texts = [] + for _, row in df.iterrows(): + texts.append(normalize_prompt("This is the summary of the bot's knowledge: \n" + row["memory"].strip() \ + + "\n\nThe rules are as follows:\n" + row[ + "rules"].strip() + "\n\nThe conversation goes as follows:\n") + + normalize_conversations(row["positive_conversation"]) + ) + + return Dataset.from_dict({"text": texts}) + + +if __name__ == '__main__': + df = pd.read_parquet("data/wafl_functions.parquet") + train_dataset = create_train_dataset(df) + training_args = TrainingArguments(output_dir="./output", + #fp16_full_eval=True, + use_cpu=True, + num_train_epochs=0.001, + gradient_checkpointing=True, + per_device_train_batch_size=1, + learning_rate=1e-7, + save_strategy="steps", + logging_steps=10, + ) + trainer = SFTTrainer( + model, + train_dataset=train_dataset, + dataset_text_field="text", + tokenizer=tokenizer, + max_seq_length=512, + args=training_args, + ) 
+ trainer.train() + model.merge_all() + # input_ids = tokenizer("The capital of England is", return_tensors="pt").input_ids + # output = model.generate(input_ids.cuda(), max_length=input_ids.shape[1] + 10, use_cache=True) + # print(tokenizer.decode(output[0], skip_special_tokens=True)) + model.save_pretrained("wafl_functions") + +#### SFT does not have -int +#### DPO introduces -inf after one step with 1e-8 +#### Does SFTTrainer have a similar issue? If so, it is likely a problem with the Trainer class + +#### TRY WIL LONGER CONTEXT WINDOW: DEFAULT IS 512 (LOOK FOR RIGHT ARGUMENT NAMES, THERE ARE TWO AT LEAST) diff --git a/datasets/train_supervised.py b/datasets/train_supervised.py new file mode 100644 index 00000000..206da284 --- /dev/null +++ b/datasets/train_supervised.py @@ -0,0 +1,64 @@ +import os + +import pandas as pd +import torch +from datasets import Dataset +from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer +from svd_training.svd_model import SVDMistralForCausalLM + +model_name = "mistralai/Mistral-7B-Instruct-v0.1" +tokenizer = AutoTokenizer.from_pretrained(model_name) +tokenizer.pad_token = tokenizer.eos_token +_filename = "mistral_svd_model.psd" +if os.path.exists(_filename): + model = SVDMistralForCausalLM.create_from_state_dict(torch.load(_filename)) + +else: + model = SVDMistralForCausalLM.create_from_model(AutoModelForCausalLM.from_pretrained(model_name), + rank_fraction=0.1) + torch.save(model.state_dict(), _filename) + + +def normalize_conversations(text): + # text = text.replace("\nuser: ", "<|endoftext|>\n<|user|>\n") + # text = text.replace("user: ", "<|user|>\n") + # text = text.replace("\nbot: ", "<|endoftext|>\n<|assistant|>\n") + # text = text + "<|endoftext|>" + return text + + +def normalize_prompt(text): + # text = "<|system|>\n" + text + return text + + +def create_train_dataset(df): + prompts = [] + chosen = [] + rejected = [] + for _, row in df.iterrows(): + prompts.append(normalize_prompt("This is the summary of the bot's knowledge: \n" + row["memory"].strip() \ + + "\n\nThe rules are as follows:\n" + row[ + "rules"].strip() + "\n\nThe conversation goes as follows:\n")) + chosen.append(normalize_conversations(row["positive_conversation"])) + rejected.append(normalize_conversations(row["negative_conversation"])) + + return Dataset.from_dict({"prompt": prompts, "chosen": chosen, "rejected": rejected}) + + +if __name__ == '__main__': + df = pd.read_parquet("data/wafl_functions.parquet") + train_dataset = create_train_dataset(df) + sample_text = train_dataset[0]["prompt"] + train_dataset[0]["chosen"] + optimizer = torch.optim.Adam(model.parameters(), lr=1e-5) + input_ids = tokenizer(sample_text, return_tensors="pt").input_ids + output = model(input_ids, labels=input_ids) + loss = output.loss + loss.backward() + optimizer.step() + print("Done") + + input_ids = tokenizer(train_dataset[1]["prompt"], return_tensors="pt").input_ids + output = model.generate(input_ids, max_length=input_ids.shape[1] + 2, use_cache=True) + print(tokenizer.decode(output[0], skip_special_tokens=True)) + diff --git a/todo.txt b/todo.txt index 170da585..3a5275fd 100644 --- a/todo.txt +++ b/todo.txt @@ -1,6 +1,20 @@ +1) train on more steps + a) try 3 epochs, save each + b) use lr=1e-6 + c) use batch_size=4 + d) do not use 4 bit original model, use 16 bit (on the GPU) +2) evaluate result +3) Upload to hf +4) create a test set of 50 elements for paper. Find a way to test it. 
repeat from 1) +5) refactor code +6) maybe change voice model +6) write paper + + + ### TODO -* script to add wrong when none are needed PharazonE +* script to add wrong when none are needed On the to_modify set: From fc0ff5efe9a87fdd70523483f4971b837a0b6e3e Mon Sep 17 00:00:00 2001 From: Alberto Cetoli Date: Sun, 28 Apr 2024 17:21:34 +0100 Subject: [PATCH 10/10] played around with dataset creation --- datasets/__item__.py | 0 datasets/create_chitchat_dataset.py | 325 ------------------ datasets/create_final_dataset_table.py | 54 --- ...create_negative_examples_from_positives.py | 73 ---- datasets/create_rules_dataset.py | 57 --- ...reate_rules_dataset_from_ultrachat_200k.py | 50 --- ...e_summary_and_memory_from_conversations.py | 71 ---- datasets/data/current_item_index.json | 1 - .../delete_repeating_negative_examples.py | 35 -- ..._original_prompt_onto_negative_examples.py | 50 --- datasets/frontend/index.html | 57 --- datasets/select_dataset_rules.py | 34 -- datasets/server.py | 86 ----- datasets/tinker_with_llm.py | 31 -- datasets/train_dpo_supervised.py | 72 ---- datasets/train_dpo_with_wafl_dataset.py | 89 ----- datasets/train_llm_on_rules_dataset.py | 122 ------- datasets/train_sft_with_wafl_dataset.py | 79 ----- datasets/train_supervised.py | 64 ---- todo.txt | 4 + 20 files changed, 4 insertions(+), 1350 deletions(-) delete mode 100644 datasets/__item__.py delete mode 100644 datasets/create_chitchat_dataset.py delete mode 100644 datasets/create_final_dataset_table.py delete mode 100644 datasets/create_negative_examples_from_positives.py delete mode 100644 datasets/create_rules_dataset.py delete mode 100644 datasets/create_rules_dataset_from_ultrachat_200k.py delete mode 100644 datasets/create_summary_and_memory_from_conversations.py delete mode 100644 datasets/data/current_item_index.json delete mode 100644 datasets/delete_repeating_negative_examples.py delete mode 100644 datasets/fit_original_prompt_onto_negative_examples.py delete mode 100644 datasets/frontend/index.html delete mode 100644 datasets/select_dataset_rules.py delete mode 100644 datasets/server.py delete mode 100644 datasets/tinker_with_llm.py delete mode 100644 datasets/train_dpo_supervised.py delete mode 100644 datasets/train_dpo_with_wafl_dataset.py delete mode 100644 datasets/train_llm_on_rules_dataset.py delete mode 100644 datasets/train_sft_with_wafl_dataset.py delete mode 100644 datasets/train_supervised.py diff --git a/datasets/__item__.py b/datasets/__item__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/datasets/create_chitchat_dataset.py b/datasets/create_chitchat_dataset.py deleted file mode 100644 index d5ed426f..00000000 --- a/datasets/create_chitchat_dataset.py +++ /dev/null @@ -1,325 +0,0 @@ -import os -import json -import random -import re - -from tqdm import tqdm - - -_path = os.path.dirname(__file__) -_samsum_train_path = os.path.join(_path, "data/samsum-train.json") -_squad_train_path = os.path.join(_path, "data/squad2-train.json") -_squad_filter_path = os.path.join(_path, "data/squad_items_about_people.json") -_candidate_answers = [ - "unknown", - "I don't know", - "I do not know", - "I have no information about this", -] -_unknown_fraction = 0.1 -_context_fraction = 0.2 - - -def get_speakers(dialogue): - speakers = set() - for line in dialogue.split("\n"): - name = line[: line.find(":")] - speakers.add(name) - - return list(speakers) - - -def select_random_pair_of_speakers(candidates): - random.shuffle(candidates) - return candidates[:2] - - -def 
create_inset_from_unanswerable_question(squad_set, first_speaker, second_speaker): - data = squad_set["data"] - item = random.choice(data) - paragraph = random.choice(item["paragraphs"]) - qas = random.choice(paragraph["qas"]) - question = qas["question"] - answer = random.choice(_candidate_answers) - return f"{first_speaker}: {question}\n{second_speaker}: {answer}\n" - - -def from_name_to_2nd_person(text, name): - text = re.sub(f"{name} doesn't", "you don't", text, flags=re.IGNORECASE) - text = re.sub(f"{name} does not", "you do not", text, flags=re.IGNORECASE) - text = re.sub(f"{name} does", "you do", text, flags=re.IGNORECASE) - text = re.sub(f"{name}'s", "your", text, flags=re.IGNORECASE) - text = re.sub(f"does {name}", "do you", text, flags=re.IGNORECASE) - text = re.sub(f"is {name}", "are you", text, flags=re.IGNORECASE) - text = re.sub(f"was {name}", "were you", text, flags=re.IGNORECASE) - text = re.sub(f"{name} is", "you are", text, flags=re.IGNORECASE) - text = re.sub(f"{name}", "you", text, flags=re.IGNORECASE) - return text - - -def from_name_to_1st_person(text, name): - text = re.sub(f"{name} doesn't", "I don't", text, flags=re.IGNORECASE) - text = re.sub(f"{name} does not", "I do not", text, flags=re.IGNORECASE) - text = re.sub(f"{name} does", "I do", text, flags=re.IGNORECASE) - text = re.sub(f"{name}'s", "my", text, flags=re.IGNORECASE) - text = re.sub(f"does {name}", "do I", text, flags=re.IGNORECASE) - text = re.sub(f"is {name}", "am I", text, flags=re.IGNORECASE) - text = re.sub(f"was {name}", "was I", text, flags=re.IGNORECASE) - text = re.sub(f"{name} is", "I am", text, flags=re.IGNORECASE) - text = re.sub(f"to {name}", "to me", text, flags=re.IGNORECASE) - text = re.sub(f"{name}", "I", text, flags=re.IGNORECASE) - return text - - -def from_2nd_person_to_name(text, name): - text = re.sub("you don't", f"{name} doesn't", text, flags=re.IGNORECASE) - text = re.sub("you do not", f"{name} does not", text, flags=re.IGNORECASE) - text = re.sub("you do", f"{name} does", text, flags=re.IGNORECASE) - text = re.sub("your", f"{name}'s", text, flags=re.IGNORECASE) - text = re.sub("do you", f"does {name}", text, flags=re.IGNORECASE) - text = re.sub("are you", f"is {name}", text, flags=re.IGNORECASE) - text = re.sub("were you", f"was {name}", text, flags=re.IGNORECASE) - text = re.sub("you are", f"{name} is", text, flags=re.IGNORECASE) - text = re.sub("you will", f"{name} will", text, flags=re.IGNORECASE) - text = re.sub("you'll", f"{name} will", text, flags=re.IGNORECASE) - text = re.sub(" you ", f" {name} ", text, flags=re.IGNORECASE) - text = re.sub(" you\.", f" {name}\.", text, flags=re.IGNORECASE) - text = re.sub(" you!", f" {name}!", text, flags=re.IGNORECASE) - text = re.sub(" you\?", f" {name}\?", text, flags=re.IGNORECASE) - return text - - -def from_1st_person_to_name(text, name): - text = re.sub("I don't", f"{name} doesn't", text, flags=re.IGNORECASE) - text = re.sub("I do not", f"{name} does not", text, flags=re.IGNORECASE) - text = re.sub("I do", f"{name} does", text, flags=re.IGNORECASE) - text = re.sub("my ", f"{name}'s ", text, flags=re.IGNORECASE) - text = re.sub("do I", f"does {name}", text, flags=re.IGNORECASE) - text = re.sub("am I", f"is {name}", text, flags=re.IGNORECASE) - text = re.sub("was I", f"was {name}", text, flags=re.IGNORECASE) - text = re.sub("I am", f"{name} is", text, flags=re.IGNORECASE) - text = re.sub("to me", f"to {name}", text, flags=re.IGNORECASE) - text = re.sub("I will", f"{name} will", text, flags=re.IGNORECASE) - text = re.sub("I'll", 
f"{name} will", text, flags=re.IGNORECASE) - text = re.sub("I'm", f"{name} is", text) - text = re.sub("I ", f"{name} ", text) - text = re.sub(" I\?", f" {name}\?", text) - text = re.sub(" me ", f" {name} ", text, flags=re.IGNORECASE) - text = re.sub(" me\.", f" {name}\.", text, flags=re.IGNORECASE) - text = re.sub(" me!", f" {name}!", text, flags=re.IGNORECASE) - text = re.sub(" me\?", f" {name}\?", text, flags=re.IGNORECASE) - return text - - -def replace_names(text, names, replace_function): - names = sorted(names, key=lambda x: -len(x)) - for name in names: - if name in text: - return replace_function(text, name) - - return text - - -def create_inset_with_first_person_answer( - squad_set, squad_people_filter, first_speaker, second_speaker -): - squad_item_number, names = random.sample(squad_people_filter.items(), 1)[0] - squad_item_number = int(squad_item_number) - names = names["names"] - question, answer = "", "" - while ( - "you" not in question.lower() - and "your" not in question.lower() - and "I " not in answer - and "my" not in answer.lower() - ): - paragraph = random.choice(squad_set["data"][squad_item_number]["paragraphs"]) - qas = random.choice(paragraph["qas"]) - if not qas["answers"]: - continue - - question = replace_names(qas["question"], names, from_name_to_2nd_person) - answer = replace_names( - random.choice(qas["answers"])["text"], names, from_name_to_1st_person - ) - - context = replace_names( - paragraph["context"], names, lambda x, y: x.replace(y, second_speaker) - ) - return f"{first_speaker}: {question}\n{second_speaker}: {answer}\n", context - - -def create_inset_with_first_person_query( - squad_set, squad_people_filter, first_speaker, second_speaker -): - squad_item_number, names = random.sample(squad_people_filter.items(), 1)[0] - squad_item_number = int(squad_item_number) - names = names["names"] - question, answer = "", "" - while ( - "I" not in question.lower() - and "my" not in question.lower() - and "you " not in answer.lower() - and "your " not in answer.lower() - ): - paragraph = random.choice(squad_set["data"][squad_item_number]["paragraphs"]) - qas = random.choice(paragraph["qas"]) - if not qas["answers"]: - continue - - question = replace_names(qas["question"], names, from_name_to_1st_person) - answer = replace_names( - random.choice(qas["answers"])["text"], names, from_name_to_2nd_person - ) - - context = replace_names( - paragraph["context"], names, lambda x, y: x.replace(y, second_speaker) - ) - return f"{first_speaker}: {question}\n{second_speaker}: {answer}\n", context - - -def is_question(line): - return "?" 
in line - - -def get_sequence_of_speakers(dialogue_lines): - return [line.split(":")[0] for line in dialogue_lines if ":" in line] - - -def find_next_speaker(speaker_sequence, index, curr_speaker): - for speaker in speaker_sequence[index + 1 :]: - if curr_speaker != speaker: - return speaker - - raise RuntimeWarning("No next speaker in conversation.") - - -def find_prior_speaker(speaker_sequence, index, curr_speaker): - for speaker in speaker_sequence[:index][::-1]: - if curr_speaker != speaker: - return speaker - - raise RuntimeError("No prior speaker in conversation.") - - -def substitute_pronouns_with_speaker_names(dialogue_text): - dialogue_lines = [line for line in dialogue_text.split("\n") if line] - speaker_sequence = get_sequence_of_speakers(dialogue_lines) - new_lines = [] - for index in range(len(dialogue_lines) - 1): - line = dialogue_lines[index] - curr_speaker = speaker_sequence[index] - if "remembers" in curr_speaker: - new_lines.append(line) - continue - - new_line = from_1st_person_to_name(line, curr_speaker) - try: - next_speaker = find_next_speaker(speaker_sequence, index, curr_speaker) - - except RuntimeWarning: - new_lines.append(new_line) - break - - new_line = from_2nd_person_to_name(new_line, next_speaker) - new_lines.append(new_line) - - new_line = from_1st_person_to_name(dialogue_lines[-1], speaker_sequence[-1]) - try: - prior_speaker = find_prior_speaker(speaker_sequence, -1, speaker_sequence[-1]) - - except RuntimeWarning: - new_lines.append(new_line) - return "\n".join(new_lines) - - new_line = from_2nd_person_to_name(new_line, prior_speaker) - new_lines.append(new_line) - - return "\n".join(new_lines) - - -if __name__ == "__main__": - samsum_train = json.load(open(_samsum_train_path)) - squad_train = json.load(open(_squad_train_path)) - squad_people_filter = json.load(open(_squad_filter_path)) - - new_train_set = [] - for item in tqdm(samsum_train[:1000]): - new_item = {} - dialogue = item["dialogue"].replace("\r", "") - if not dialogue: - continue - - speakers = get_speakers(dialogue) - first, second = select_random_pair_of_speakers(speakers) - inset = create_inset_from_unanswerable_question(squad_train, first, second) - first_person_answer, sp_context = create_inset_with_first_person_answer( - squad_train, squad_people_filter, first, second - ) - first_person_query, fp_context = create_inset_with_first_person_query( - squad_train, squad_people_filter, first, second - ) - - new_dialogue = "" - num_lines = len(dialogue.split("\n")) - unknown_inserted_before = False - first_person_answer_inserted_before = False - first_person_query_inserted_before = False - - for line in dialogue.split("\n"): - new_dialogue += line + "\n" - if line and is_question(line): - continue - - threshold = _unknown_fraction / num_lines - context_threshold = _context_fraction / num_lines - if random.uniform(0, 1) < threshold and not unknown_inserted_before: - new_dialogue += inset - unknown_inserted_before = True - - elif ( - random.uniform(0, 1) < context_threshold - and not first_person_answer_inserted_before - ): - if random.choice([1, 0]): - new_dialogue += f"{second} remembers: " + sp_context + "\n" - first_person_answer = first_person_answer.replace( - f"{second}:", f"{second}: [factual]" - ) - - else: - new_dialogue += f"{second}: " + sp_context + "\n" - first_person_answer = first_person_answer.replace( - f"{second}:", f"{second}: [answer in conversation]" - ) - - new_dialogue += first_person_answer - first_person_answer_inserted_before = True - continue - - elif ( - 
random.uniform(0, 1) < context_threshold - and not first_person_query_inserted_before - ): - if random.choice([1, 0]): - new_dialogue += f"{second} remembers: " + fp_context + "\n" - first_person_query = first_person_query.replace( - f"{second}:", f"{second}: [factual]" - ) - - else: - new_dialogue += f"{first}: " + fp_context + "\n" - first_person_query = first_person_query.replace( - f"{second}:", f"{second}: [answer in conversation]" - ) - - new_dialogue += first_person_query - first_person_answer_inserted_before = True - - new_item["dialogue"] = ( - "In the dialogue below some people are talking:\n" - + substitute_pronouns_with_speaker_names(new_dialogue) - ) - new_train_set.append(new_item) - - json.dump(new_train_set, open(os.path.join(_path, "data/dialogues.json"), "w")) diff --git a/datasets/create_final_dataset_table.py b/datasets/create_final_dataset_table.py deleted file mode 100644 index d933f224..00000000 --- a/datasets/create_final_dataset_table.py +++ /dev/null @@ -1,54 +0,0 @@ -import json - -import pandas as pd - -from datasets import load_dataset - -from wafl.config import Configuration -from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector -from wafl.retriever.dense_retriever import DenseRetriever - - -def get_memory(text): - start_pattern = "This is the summary of the bot's knowledge: \n" - end_pattern = "\nThe rules are as follows:\n" - return text[text.find(start_pattern) + len(start_pattern) : text.find(end_pattern)] - - -def get_rules(text): - start_pattern = "\nThe rules are as follows:\n" - end_pattern = "\nThe conversation goes as follows:\n" - return text[text.find(start_pattern) + len(start_pattern) : text.find(end_pattern)] - - -def get_conversation(text): - pattern = "\nThe conversation goes as follows:\n" - return text.split(pattern)[1] - - -if __name__ == "__main__": - config = Configuration.load_local_config() - retriever = DenseRetriever("text_embedding_model", config) - stopwords = ["", "", "<|assistant|>", ""] - remote_llm_connector = RemoteLLMConnector( - config.get_value("llm_model"), last_strings=stopwords - ) - - data_positive = json.load(open("data/positive_examples.json")) - data_negative = json.load(open("data/to_modify_negative_examples.json")) - output_dict = {"memory": [], "rules": [], "positive_conversation": [], "negative_conversation": []} - for text_positive, text_negative in zip(data_positive, data_negative): - memory = get_memory(text_positive) - rules = get_rules(text_positive) - positive_conversation = get_conversation(text_positive) - negative_conversation = get_conversation(text_negative) - output_dict["memory"].append(memory) - output_dict["rules"].append(rules) - output_dict["positive_conversation"].append(positive_conversation) - output_dict["negative_conversation"].append(negative_conversation) - - df = pd.DataFrame(output_dict) - df.to_parquet("data/wafl_functions.parquet", index=False) - dataset = load_dataset("parquet", data_files="data/wafl_functions.parquet") - dataset.push_to_hub("fractalego/wafl-functions-dataset") - print("Dataset pushed to the hub") \ No newline at end of file diff --git a/datasets/create_negative_examples_from_positives.py b/datasets/create_negative_examples_from_positives.py deleted file mode 100644 index e90af49d..00000000 --- a/datasets/create_negative_examples_from_positives.py +++ /dev/null @@ -1,73 +0,0 @@ -import asyncio -import json -from fuzzywuzzy import fuzz -from tqdm import tqdm - -from wafl.config import Configuration -from wafl.connectors.remote.remote_llm_connector 
import RemoteLLMConnector -from wafl.retriever.dense_retriever import DenseRetriever - - -def get_new_task(): - return ( - "\n\n" - "\n" - "Write a conversation that does not follow the same pattern as the previous one.\n" - "You must create a new conversation starting as the old one.\n" - "In particular, the new conversation between the user and the bot does not follow and contraddicts the rules and information above.\n" - "You can keep the conversation as it is but without any <> tags\n" - "You can also use the wrong tags.\n" - "You can also create a conversation that contradicts the rules above.\n" - "Show the results below, write at most 10 lines and end with the tag :\n" - ) - - -def get_prompt_before_conversation(text): - pattern = "\nThe conversation goes as follows:\n" - return text.split(pattern)[0] + pattern - - -def get_conversation(text): - pattern = "\nThe conversation goes as follows:\n" - return text.split(pattern)[1] - - -if __name__ == "__main__": - config = Configuration.load_local_config() - retriever = DenseRetriever("text_embedding_model", config) - stopwords = ["", "", "<|assistant|>", ""] - remote_llm_connector = RemoteLLMConnector( - config.get_value("llm_model"), last_strings=stopwords - ) - - data = json.load(open("data/positive_examples.json")) - output_data = json.load(open("data/tmp_negative_examples.json")) - tags = ["", "", "", "", "", "", "", "", - "", "", "", "", "", "", "", "" - ] - for text in tqdm(data[len(output_data):]): - text += get_new_task() - to_save = "" - while len(to_save) < 200 or to_save in text or fuzz.ratio(to_save, get_conversation(text)) > 90: - conversation = asyncio.run(remote_llm_connector.predict(text, num_tokens=2048, num_replicas=1)) - to_save = conversation[0] - for item in stopwords: - to_save = to_save.replace(item, "") - to_save = to_save.split("")[0] - - for item in tags: - to_save = to_save.replace(item, "") - - output_data.append(to_save) - json.dump(output_data, open("data/tmp_negative_examples.json", "w"), indent=2) - - data = json.load(open("data/positive_examples.json")) - output_data = json.load(open("data/tmp_negative_examples.json")) - negative_data = [] - for positive, negative in zip(data, output_data): - prompt = get_prompt_before_conversation(positive) - if prompt not in negative: - negative = prompt + negative - negative_data.append(negative) - - json.dump(negative_data, open("data/negative_examples.json", "w"), indent=2) \ No newline at end of file diff --git a/datasets/create_rules_dataset.py b/datasets/create_rules_dataset.py deleted file mode 100644 index 9f4bf0ec..00000000 --- a/datasets/create_rules_dataset.py +++ /dev/null @@ -1,57 +0,0 @@ -import asyncio -import pandas as pd - -from wafl.config import Configuration -from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector -from wafl.retriever.dense_retriever import DenseRetriever - - -def get_prompt(df, theme, retriever, num_samples): - indices_and_scores = asyncio.run(retriever.get_indices_and_scores_from_text(theme)) - indices = [index for index, _ in indices_and_scores] - prompt = "" - for _, row in df.iloc[indices].sample(num_samples).iterrows(): - prompt += ( - f""" - -Create a plausible dialogue about the theme \"{row["Theme"]}\" based on the following summary and rules. 
- -The rules are as follows: -{row["Rules"]} - -The conversation goes as follows: -{row["Conversation"]} - - """.strip() - + "\n\n" - ) - - incipit = f"Create plausible dialogue about the theme \"{theme}\" based on the following summary and rules.\n\nThe rules are as follows:\n" - - return ( - prompt - + f'\n{incipit}' - ) - - -def generate_conversation(theme, num_samples=2): - config = Configuration.load_local_config() - retriever = DenseRetriever("text_embedding_model", config) - remote_llm_connector = RemoteLLMConnector( - config.get_value("llm_model"), last_strings=[""], num_replicas=1 - ) - - df = pd.read_csv("data/complex_instructions.csv") - for index, row in df.iterrows(): - asyncio.run(retriever.add_text_and_index(row["Theme"], str(index))) - - prompt = get_prompt(df, theme, retriever, num_samples=num_samples) - generated_text = asyncio.run( - remote_llm_connector.predict(prompt, temperature=0.5, num_tokens=2500) - )[0] - return "The rules are as follows:\n" + generated_text - - -if __name__ == "__main__": - theme = "playing a song that the user likes" - print(generate_conversation(theme)) diff --git a/datasets/create_rules_dataset_from_ultrachat_200k.py b/datasets/create_rules_dataset_from_ultrachat_200k.py deleted file mode 100644 index ecd01d7b..00000000 --- a/datasets/create_rules_dataset_from_ultrachat_200k.py +++ /dev/null @@ -1,50 +0,0 @@ -import json - -import pandas as pd -from tqdm import tqdm - -from create_rules_dataset import generate_conversation -from wafl.config import Configuration -from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector -from wafl.retriever.dense_retriever import DenseRetriever - -_max_rows = 2000 - - -def conversation_is_valid(conversation): - if "\nuser: " not in conversation: - return False - - if "- The user " not in conversation: - return False - - return True - - -def clean_conversation(conversation): - conversation = conversation.replace("", "") - conversation = conversation.replace("", "") - return conversation - - -if __name__ == "__main__": - config = Configuration.load_local_config() - retriever = DenseRetriever("text_embedding_model", config) - remote_llm_connector = RemoteLLMConnector( - config.get_value("llm_model"), last_strings=[""] - ) - - df = pd.read_parquet("data/train_gen-00000-of-00003-a6c9fb894be3e50b.parquet") - dataset_items = json.load(open("data/rules_and_conversations.json")) - for index, row in tqdm(df[len(dataset_items):_max_rows + len(dataset_items)].iterrows(), total=_max_rows): - theme = row["prompt"] - conversation = generate_conversation(theme, num_samples=3) - if not conversation_is_valid(conversation): - continue - - if conversation in dataset_items: - continue - - conversation = clean_conversation(conversation) - dataset_items.append(conversation) - json.dump(dataset_items, open("data/rules_and_conversations.json", "w"), indent=2) diff --git a/datasets/create_summary_and_memory_from_conversations.py b/datasets/create_summary_and_memory_from_conversations.py deleted file mode 100644 index 45142d78..00000000 --- a/datasets/create_summary_and_memory_from_conversations.py +++ /dev/null @@ -1,71 +0,0 @@ -import asyncio -import json -import re - -from wafl.config import Configuration -from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector -from wafl.retriever.dense_retriever import DenseRetriever - -_max_rows = 2000 -items = json.load(open("data/accepted_rules_and_conversations.json")) - - -def get_prompt(conversation_item): - return f""" -Your task is to read the following 
conversation and create a summary of the knowledge that is being shared. -Specifically, you must create a summary of the bot's knowledge that is being shared in the conversation. -The conversation follow some rules explained at the beginning of the conversation. -Never *under any circumstance* include in the summary the rules that are explained at the beginning of the conversation. -Do not include in the summary the conversation itself, just what the bot seem to know given its answers. -If the bot outputs a text withing the tags and or and include that text in the summary. -The text containing the rules and the conversation is as follows: - -{conversation_item} - - -Summary of the bot's knowledge: - """.strip() - - -def clean_summary(summary): - return summary.replace("", "") - - -def clean_conversation_item(item): - # Change <|USER|>\n into user - item = item.replace("<|USER|>\n", "user: ") - item = item.replace("<|BOT|>\n", "bot: ") - - # Change function() into function() if there is a python function call being made - #item = re.sub("(.*?)\s(.*?\()(.*?)", r"\1 \2\3", item) - #item = re.sub("(.*?)\s(.*?\()(.*?)", r"\1 \2\3", item) - - # some sentences are between [] and should be removed - item = re.sub(r"\[.*?\]", "", item) - - # sometimes at the end of the conversation the bot says "Process finished with exit code 0". Erase this - item = re.sub(r"Process finished with exit code 0", "", item) - - # todo User -> user, or at least be internally consistent - item = item.replace("User: ", "user: ") - - return item - - -if __name__ == "__main__": - config = Configuration.load_local_config() - retriever = DenseRetriever("text_embedding_model", config) - remote_llm_connector = RemoteLLMConnector( - config.get_value("llm_model"), last_strings=[""] - ) - - complete_items = [] - for item in items: - item = clean_conversation_item(item) - summary = asyncio.run( - remote_llm_connector.predict(get_prompt(item), temperature=0.5, num_tokens=2500) - )[0] - summary = clean_summary(summary) - complete_items.append(f"This is the summary of the bot's knowledge: {summary}\n\n{item}") - json.dump(complete_items, open("data/summary_and_memory_from_conversations.json", "w"), indent=2) - diff --git a/datasets/data/current_item_index.json b/datasets/data/current_item_index.json deleted file mode 100644 index 5ca234cb..00000000 --- a/datasets/data/current_item_index.json +++ /dev/null @@ -1 +0,0 @@ -345 \ No newline at end of file diff --git a/datasets/delete_repeating_negative_examples.py b/datasets/delete_repeating_negative_examples.py deleted file mode 100644 index 0a5c837b..00000000 --- a/datasets/delete_repeating_negative_examples.py +++ /dev/null @@ -1,35 +0,0 @@ -import json - -from tqdm import tqdm -from wafl.config import Configuration -from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector -from wafl.retriever.dense_retriever import DenseRetriever - - -def find_all_occurences(sentence, word): - import re - return [match.start() for match in re.finditer(word, sentence)] - - -if __name__ == "__main__": - config = Configuration.load_local_config() - retriever = DenseRetriever("text_embedding_model", config) - stopwords = ["", "", "<|assistant|>", ""] - remote_llm_connector = RemoteLLMConnector( - config.get_value("llm_model"), last_strings=stopwords - ) - - data = json.load(open("data/to_modify_negative_examples.json")) - output_data = [] - for text in tqdm(data): - conversation = text - all_new_lines = find_all_occurences(conversation, "\n") - for new_line in all_new_lines: - tail = 
conversation[new_line:] - if len(tail) > len(conversation)/10. and tail in text[:new_line]: - conversation = conversation[:new_line] - break - - output_data.append(conversation) - - json.dump(output_data, open("data/negative_examples.json", "w"), indent=2) \ No newline at end of file diff --git a/datasets/fit_original_prompt_onto_negative_examples.py b/datasets/fit_original_prompt_onto_negative_examples.py deleted file mode 100644 index 17cee59b..00000000 --- a/datasets/fit_original_prompt_onto_negative_examples.py +++ /dev/null @@ -1,50 +0,0 @@ -import asyncio -import json -from fuzzywuzzy import fuzz -from tqdm import tqdm - -from wafl.config import Configuration -from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector -from wafl.retriever.dense_retriever import DenseRetriever - - -def get_new_task(): - return ( - "\n\n" - "\n" - "Write a conversation exactly like the old one with one exception:\n" - "You *must* insert in the middle of the conversation a function to be executed\n" - "For example, if a conversation is about fruits, you can insert a function that calculates the number of fruits like this number_of_fruits()\n" - "You can use the tags and to insert the function or and or and \n, or and . Choose randomly.\n" - "Show the results below and end with the tag :\n" - ) - - -def get_prompt_before_conversation(text): - pattern = "\nThe conversation goes as follows:\n" - return text.split(pattern)[0] + pattern - - -def get_conversation(text): - pattern = "\nThe conversation goes as follows:\n" - return text.split(pattern)[1] - - -if __name__ == "__main__": - config = Configuration.load_local_config() - retriever = DenseRetriever("text_embedding_model", config) - stopwords = ["", "", "<|assistant|>", ""] - remote_llm_connector = RemoteLLMConnector( - config.get_value("llm_model"), last_strings=stopwords - ) - - data_positive = json.load(open("data/positive_examples.json")) - data_negative = json.load(open("data/to_modify_negative_examples.json")) - - output_data = json.load(open("data/tmp_negative_examples.json")) - for text_positive, text_negative in zip(data_positive, data_negative): - to_save = get_prompt_before_conversation(text_positive) - to_save += get_conversation(text_negative) - output_data.append(to_save) - - json.dump(output_data, open("data/negative_examples.json", "w"), indent=2) \ No newline at end of file diff --git a/datasets/frontend/index.html b/datasets/frontend/index.html deleted file mode 100644 index 7fe03f8b..00000000 --- a/datasets/frontend/index.html +++ /dev/null @@ -1,57 +0,0 @@ - - - - Modify the items - - - - - -
    -
    -
    - -
    -
    - - - - -
    -
    - \ No newline at end of file diff --git a/datasets/select_dataset_rules.py b/datasets/select_dataset_rules.py deleted file mode 100644 index a479338c..00000000 --- a/datasets/select_dataset_rules.py +++ /dev/null @@ -1,34 +0,0 @@ -import json - -import pandas as pd -from tqdm import tqdm - -from create_rules_dataset import generate_conversation -from wafl.config import Configuration -from wafl.connectors.remote.remote_llm_connector import RemoteLLMConnector -from wafl.retriever.dense_retriever import DenseRetriever - - -if __name__ == "__main__": - config = Configuration.load_local_config() - dataset_items = json.load(open("data/rules_and_conversations.json")) - discarded_items = json.load(open("data/discarded_rules_and_conversations.json")) - accepted_items = json.load(open("data/accepted_rules_and_conversations.json")) - - for item in dataset_items: - if item in discarded_items or item in accepted_items: - continue - - print(item) - y_n = input("Do you want to accept this item? (y/n)") - if y_n != "y": - discarded_items.append(item) - json.dump(discarded_items, open("data/discarded_rules_and_conversations.json", "w"), indent=2) - print("\n\n") - continue - - accepted_items.append(item) - json.dump(accepted_items, open("data/accepted_rules_and_conversations.json", "w"), indent=2) - print("\n\n") - - #### CHANGE <|USER|>\n into user: (some of the elements are in the wrong format) diff --git a/datasets/server.py b/datasets/server.py deleted file mode 100644 index 24b296b7..00000000 --- a/datasets/server.py +++ /dev/null @@ -1,86 +0,0 @@ -import json -import os -from flask import Flask, render_template, request - -_path = os.path.dirname(__file__) - -app = Flask( - __name__, - static_url_path="", - static_folder=os.path.join(_path, "./frontend/"), - template_folder=os.path.join(_path, "./frontend/"), -) -filename = "negative_examples.json" -#filename = "positive_examples.json" -items = json.load(open(f"data/{filename}")) -current_item_index = json.load(open("data/current_item_index.json")) - - -@app.route('/') -def hello_world(): - return render_template('index.html') - - -@app.route('/current_item/') -def current_item(): - return f""" - - """ - - -@app.route('/next_item/') -def next_item(): - global current_item_index - current_item_index += 1 - json.dump(current_item_index, open("data/current_item_index.json", "w")) - return f""" - - """ - - -@app.route('/previous_item/') -def previous_item(): - global current_item_index - current_item_index -= 1 - json.dump(current_item_index, open("data/current_item_index.json", "w")) - return f""" - - """ - - -@app.route('/save_item/', methods=['GET']) -def save_item(): - global current_item_index - items[current_item_index] = request.values['editor'] - json.dump(items, open(f"data/{filename}", "w"), indent=2) - return "" - - -@app.route('/delete_item/') -def delete_item(): - global current_item_index - items.pop(current_item_index) - json.dump(items, open(f"data/{filename}", "w"), indent=2) - return f""" - - """ - - -if __name__ == '__main__': - app.run(debug=True) diff --git a/datasets/tinker_with_llm.py b/datasets/tinker_with_llm.py deleted file mode 100644 index 0c5097ff..00000000 --- a/datasets/tinker_with_llm.py +++ /dev/null @@ -1,31 +0,0 @@ -import pandas as pd -from datasets import Dataset -from transformers import AutoTokenizer, AutoModelForCausalLM - -tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1") -model = AutoModelForCausalLM.from_pretrained("./wafl_functions") - - -def create_train_dataset(df): - 
prompts = [] - chosen = [] - rejected = [] - for _, row in df.iterrows(): - prompts.append("This is the summary of the bot's knowledge: \n" + row["memory"].strip() \ - + "\n\nThe rules are as follows:\n" + row[ - "rules"].strip() + "\n\nThe conversation goes as follows:\n") - chosen.append(row["positive_conversation"].strip()) - rejected.append(row["negative_conversation"].strip()) - - return Dataset.from_dict({"prompt": prompts, "chosen": chosen, "rejected": rejected}) - - -if __name__ == '__main__': - df = pd.read_parquet("data/wafl_functions.parquet") - train_dataset = create_train_dataset(df) - user_string = "\nuser: write a haiku about cherry blossom.\nbot:" - input_ids = tokenizer(train_dataset[0]["prompt"] + user_string, return_tensors="pt") - #outputs = model.generate(**input_ids, max_length=input_ids.input_ids.shape[1] + 2, use_cache=True) - outputs = model.generate(**input_ids, do_sample=True, temperature=0.4, max_length=1024, - pad_token_id=tokenizer.eos_token_id) - print(tokenizer.decode(outputs[0])) diff --git a/datasets/train_dpo_supervised.py b/datasets/train_dpo_supervised.py deleted file mode 100644 index 3bc77ac8..00000000 --- a/datasets/train_dpo_supervised.py +++ /dev/null @@ -1,72 +0,0 @@ -import os - -import pandas as pd -import torch -from datasets import Dataset -from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer -from svd_training.svd_model import SVDMistralForCausalLM - -model_name = "mistralai/Mistral-7B-Instruct-v0.1" -tokenizer = AutoTokenizer.from_pretrained(model_name) -tokenizer.pad_token = tokenizer.eos_token -_filename = "mistral_svd_model.psd" -if os.path.exists(_filename): - model = SVDMistralForCausalLM.create_from_state_dict(torch.load(_filename)) - -else: - model = SVDMistralForCausalLM.create_from_model(AutoModelForCausalLM.from_pretrained(model_name), - rank_fraction=0.1) - torch.save(model.state_dict(), _filename) - - -def normalize_conversations(text): - # text = text.replace("\nuser: ", "<|endoftext|>\n<|user|>\n") - # text = text.replace("user: ", "<|user|>\n") - # text = text.replace("\nbot: ", "<|endoftext|>\n<|assistant|>\n") - # text = text + "<|endoftext|>" - return text - - -def normalize_prompt(text): - # text = "<|system|>\n" + text - return text - - -def create_train_dataset(df): - prompts = [] - chosen = [] - rejected = [] - for _, row in df.iterrows(): - prompts.append(normalize_prompt("This is the summary of the bot's knowledge: \n" + row["memory"].strip() \ - + "\n\nThe rules are as follows:\n" + row[ - "rules"].strip() + "\n\nThe conversation goes as follows:\n")) - chosen.append(normalize_conversations(row["positive_conversation"])) - rejected.append(normalize_conversations(row["negative_conversation"])) - - return Dataset.from_dict({"prompt": prompts, "chosen": chosen, "rejected": rejected}) - - -def my_loss(logits, original_logits, target): - loss = torch.mean((output - target)**2) - - - return loss - - - -if __name__ == '__main__': - df = pd.read_parquet("data/wafl_functions.parquet") - train_dataset = create_train_dataset(df) - sample_text = train_dataset[0]["prompt"] + train_dataset[0]["chosen"] - optimizer = torch.optim.Adam(model.parameters(), lr=1e-5) - input_ids = tokenizer(sample_text, return_tensors="pt").input_ids - output = model(input_ids) - loss = output.loss - loss.backward() - optimizer.step() - print("Done") - - input_ids = tokenizer(train_dataset[1]["prompt"], return_tensors="pt").input_ids - output = model.generate(input_ids, max_length=input_ids.shape[1] + 2, 
use_cache=True) - print(tokenizer.decode(output[0], skip_special_tokens=True)) - diff --git a/datasets/train_dpo_with_wafl_dataset.py b/datasets/train_dpo_with_wafl_dataset.py deleted file mode 100644 index 619c6cbb..00000000 --- a/datasets/train_dpo_with_wafl_dataset.py +++ /dev/null @@ -1,89 +0,0 @@ -import os - -import pandas as pd -import torch -from datasets import Dataset -from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments -from svd_training.svd_model import SVDMistralForCausalLM -from trl import DPOTrainer - -model_name = "mistralai/Mistral-7B-Instruct-v0.1" -tokenizer = AutoTokenizer.from_pretrained(model_name) -tokenizer.pad_token = tokenizer.eos_token -original_model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True) -_filename = "mistral_svd_model.psd" -if os.path.exists(_filename): - model = SVDMistralForCausalLM.create_from_state_dict(torch.load(_filename)) - -else: - model = SVDMistralForCausalLM.create_from_model(AutoModelForCausalLM.from_pretrained(model_name), - rank_fraction=0.1) - torch.save(model.state_dict(), _filename) - -#model.half().cuda() - - -def normalize_conversations(text): - # text = text.replace("\nuser: ", "<|endoftext|>\n<|user|>\n") - # text = text.replace("user: ", "<|user|>\n") - # text = text.replace("\nbot: ", "<|endoftext|>\n<|assistant|>\n") - # text = text + "<|endoftext|>" - return text - - -def normalize_prompt(text): - # text = "<|system|>\n" + text - return text - - -def create_train_dataset(df): - prompts = [] - chosen = [] - rejected = [] - for _, row in df.iterrows(): - prompts.append(normalize_prompt("This is the summary of the bot's knowledge: \n" + row["memory"].strip() \ - + "\n\nThe rules are as follows:\n" + row[ - "rules"].strip() + "\n\nThe conversation goes as follows:\n")) - chosen.append(normalize_conversations(row["positive_conversation"])) - rejected.append(normalize_conversations(row["negative_conversation"])) - - max_prompt_length = max(len(tokenizer.encode(prompt)) for prompt in prompts) - max_chosen_length = max(len(tokenizer.encode(chosen_text)) for chosen_text in chosen) - max_rejected_length = max(len(tokenizer.encode(rejected_text)) for rejected_text in rejected) - - return Dataset.from_dict({"prompt": prompts, "chosen": chosen, "rejected": rejected}), max_prompt_length, max_chosen_length, max_rejected_length - - -if __name__ == '__main__': - df = pd.read_parquet("data/wafl_functions.parquet") - train_dataset, max_prompt_length, max_chosen_length, max_rejected_length = create_train_dataset(df) - training_args = TrainingArguments(output_dir="./output", - use_cpu=True, - num_train_epochs=0.3, #### Should be 1 - gradient_checkpointing=True, - per_device_train_batch_size=2, #### Try with 4 - learning_rate=1e-7, - logging_steps=10, - ) - trainer = DPOTrainer( - model, - original_model, - args=training_args, - beta=0.5, - train_dataset=train_dataset, - tokenizer=tokenizer, - max_prompt_length=max_prompt_length, - max_length=max(max_prompt_length + max_chosen_length, max_prompt_length + max_rejected_length), - ) - trainer.train() - model.merge_all() - #input_ids = tokenizer("The capital of England is", return_tensors="pt").input_ids - #output = model.generate(input_ids.cuda(), max_length=input_ids.shape[1] + 10, use_cache=True) - #print(tokenizer.decode(output[0], skip_special_tokens=True)) - model.save_pretrained("wafl_functions") - -#### SFT does not have -int -#### DPO introduces -inf after one step with 1e-8 -#### Does SFTTrainer have a similar issue? 
If so, it is likely a problem with the Trainer class - -#### TRY WIL LONGER CONTEXT WINDOW: DEFAULT IS 512 (LOOK FOR RIGHT ARGUMENT NAMES, THERE ARE TWO AT LEAST) diff --git a/datasets/train_llm_on_rules_dataset.py b/datasets/train_llm_on_rules_dataset.py deleted file mode 100644 index 251bbe9d..00000000 --- a/datasets/train_llm_on_rules_dataset.py +++ /dev/null @@ -1,122 +0,0 @@ -import random - -import pandas as pd -from datasets import Dataset -from transformers import ( - AutoTokenizer, - AutoModelForCausalLM, - TrainingArguments, - Trainer, - DataCollatorForLanguageModeling, -) - -model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.1" -max_length = 1024 + 512 - - -def get_prompts(df): - prompts = [] - for _, row in df.sample(frac=1).iterrows(): - memory = "" - if memory == "": - memory = "The user has no memory." - - current_rule = row["Rules"] - rules = df.sample(random.choice([1, 2]))["Rules"].tolist() + [current_rule] - random.shuffle(rules) - rules = "\n".join(rules) - prompt = ( - f""" -The user is talking with a chatbot about the theme \"{row["Theme"]}\" based on the following summary. - -{memory} - - -The rules are as follows: - -{rules} - - -The conversation goes as follows: -{row["Conversation"]} - """.strip() - + "\n\n" - ) - prompts.append(prompt) - - return prompts - - -def preprocess_function(sample): - model_inputs = tokenizer( - sample["prompt"], - return_tensors="pt", - max_length=max_length, - padding="max_length", - ) - labels = tokenizer( - sample["prompt"], - return_tensors="pt", - max_length=max_length, - padding="max_length", - ) - - model_inputs["labels"] = labels["input_ids"] - return model_inputs - - -def model_init(): - model = AutoModelForCausalLM.from_pretrained(model_name_or_path) - parameters = model.parameters() - for parameter in parameters: - parameter.requires_grad = False - - model.model.enable_input_require_grads() - model.lm_head.training = True - for index in range(len(model.model.layers)): - model.model.layers[index].self_attn.k_proj.training = True - - return model - - -def create_dataset_from_file(filepath): - df = pd.read_csv(filepath) - prompts = get_prompts(df) - return Dataset.from_dict({"prompt": prompts}) - - -if __name__ == "__main__": - tokenizer = AutoTokenizer.from_pretrained(model_name_or_path) - tokenizer.pad_token = tokenizer.eos_token - dataset = create_dataset_from_file("data/complex_instructions.csv") - train_dataset = dataset.map( - preprocess_function, batched=True, batch_size=1, num_proc=4 - ) - data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) - learning_rate = 1e-6 - output_dir_name = f"checkpoint_lr{learning_rate}" - training_args = TrainingArguments( - output_dir=output_dir_name, - per_device_train_batch_size=1, - per_device_eval_batch_size=1, - evaluation_strategy="steps", - use_cpu=True, - learning_rate=learning_rate, - num_train_epochs=2, - logging_steps=200, - eval_steps=200, - save_total_limit=1, - ) - model = model_init() - trainer = Trainer( - model=model, - args=training_args, - tokenizer=tokenizer, - data_collator=data_collator, - train_dataset=train_dataset, - ) - trainer.train() - trainer.save_model("wafl-mistral") - model = trainer.model - model.push_to_hub("fractalego/wafl-mistral") - tokenizer.push_to_hub("fractalego/wafl-mistral") diff --git a/datasets/train_sft_with_wafl_dataset.py b/datasets/train_sft_with_wafl_dataset.py deleted file mode 100644 index 01ba28ed..00000000 --- a/datasets/train_sft_with_wafl_dataset.py +++ /dev/null @@ -1,79 +0,0 @@ -import os - -import pandas as pd 
-import torch -from datasets import Dataset -from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments -from svd_training.svd_model import SVDMistralForCausalLM -from trl import DPOTrainer, SFTTrainer - -model_name = "mistralai/Mistral-7B-Instruct-v0.1" -tokenizer = AutoTokenizer.from_pretrained(model_name) -tokenizer.pad_token = tokenizer.eos_token -_filename = "mistral_svd_model.psd" -if os.path.exists(_filename): - model = SVDMistralForCausalLM.create_from_state_dict(torch.load(_filename)) -else: - model = SVDMistralForCausalLM.create_from_model(AutoModelForCausalLM.from_pretrained(model_name), - rank_fraction=0.1) - torch.save(model.state_dict(), _filename) - - -def normalize_conversations(text): - # text = text.replace("\nuser: ", "<|endoftext|>\n<|user|>\n") - # text = text.replace("user: ", "<|user|>\n") - # text = text.replace("\nbot: ", "<|endoftext|>\n<|assistant|>\n") - # text = text + "<|endoftext|>" - return text - - -def normalize_prompt(text): - # text = "<|system|>\n" + text - return text - - -def create_train_dataset(df): - texts = [] - for _, row in df.iterrows(): - texts.append(normalize_prompt("This is the summary of the bot's knowledge: \n" + row["memory"].strip() \ - + "\n\nThe rules are as follows:\n" + row[ - "rules"].strip() + "\n\nThe conversation goes as follows:\n") - + normalize_conversations(row["positive_conversation"]) - ) - - return Dataset.from_dict({"text": texts}) - - -if __name__ == '__main__': - df = pd.read_parquet("data/wafl_functions.parquet") - train_dataset = create_train_dataset(df) - training_args = TrainingArguments(output_dir="./output", - #fp16_full_eval=True, - use_cpu=True, - num_train_epochs=0.001, - gradient_checkpointing=True, - per_device_train_batch_size=1, - learning_rate=1e-7, - save_strategy="steps", - logging_steps=10, - ) - trainer = SFTTrainer( - model, - train_dataset=train_dataset, - dataset_text_field="text", - tokenizer=tokenizer, - max_seq_length=512, - args=training_args, - ) - trainer.train() - model.merge_all() - # input_ids = tokenizer("The capital of England is", return_tensors="pt").input_ids - # output = model.generate(input_ids.cuda(), max_length=input_ids.shape[1] + 10, use_cache=True) - # print(tokenizer.decode(output[0], skip_special_tokens=True)) - model.save_pretrained("wafl_functions") - -#### SFT does not have -int -#### DPO introduces -inf after one step with 1e-8 -#### Does SFTTrainer have a similar issue? 
If so, it is likely a problem with the Trainer class - -#### TRY WIL LONGER CONTEXT WINDOW: DEFAULT IS 512 (LOOK FOR RIGHT ARGUMENT NAMES, THERE ARE TWO AT LEAST) diff --git a/datasets/train_supervised.py b/datasets/train_supervised.py deleted file mode 100644 index 206da284..00000000 --- a/datasets/train_supervised.py +++ /dev/null @@ -1,64 +0,0 @@ -import os - -import pandas as pd -import torch -from datasets import Dataset -from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer -from svd_training.svd_model import SVDMistralForCausalLM - -model_name = "mistralai/Mistral-7B-Instruct-v0.1" -tokenizer = AutoTokenizer.from_pretrained(model_name) -tokenizer.pad_token = tokenizer.eos_token -_filename = "mistral_svd_model.psd" -if os.path.exists(_filename): - model = SVDMistralForCausalLM.create_from_state_dict(torch.load(_filename)) - -else: - model = SVDMistralForCausalLM.create_from_model(AutoModelForCausalLM.from_pretrained(model_name), - rank_fraction=0.1) - torch.save(model.state_dict(), _filename) - - -def normalize_conversations(text): - # text = text.replace("\nuser: ", "<|endoftext|>\n<|user|>\n") - # text = text.replace("user: ", "<|user|>\n") - # text = text.replace("\nbot: ", "<|endoftext|>\n<|assistant|>\n") - # text = text + "<|endoftext|>" - return text - - -def normalize_prompt(text): - # text = "<|system|>\n" + text - return text - - -def create_train_dataset(df): - prompts = [] - chosen = [] - rejected = [] - for _, row in df.iterrows(): - prompts.append(normalize_prompt("This is the summary of the bot's knowledge: \n" + row["memory"].strip() \ - + "\n\nThe rules are as follows:\n" + row[ - "rules"].strip() + "\n\nThe conversation goes as follows:\n")) - chosen.append(normalize_conversations(row["positive_conversation"])) - rejected.append(normalize_conversations(row["negative_conversation"])) - - return Dataset.from_dict({"prompt": prompts, "chosen": chosen, "rejected": rejected}) - - -if __name__ == '__main__': - df = pd.read_parquet("data/wafl_functions.parquet") - train_dataset = create_train_dataset(df) - sample_text = train_dataset[0]["prompt"] + train_dataset[0]["chosen"] - optimizer = torch.optim.Adam(model.parameters(), lr=1e-5) - input_ids = tokenizer(sample_text, return_tensors="pt").input_ids - output = model(input_ids, labels=input_ids) - loss = output.loss - loss.backward() - optimizer.step() - print("Done") - - input_ids = tokenizer(train_dataset[1]["prompt"], return_tensors="pt").input_ids - output = model.generate(input_ids, max_length=input_ids.shape[1] + 2, use_cache=True) - print(tokenizer.decode(output[0], skip_special_tokens=True)) - diff --git a/todo.txt b/todo.txt index 3a5275fd..0beacad2 100644 --- a/todo.txt +++ b/todo.txt @@ -1,3 +1,7 @@ +* on wafl_llm make it so only some LLMs are supported +* change speaker model with newer one + + 1) train on more steps a) try 3 epochs, save each b) use lr=1e-6
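
### Implementation sketches for the notes above

Items 1a)-1d) map onto concrete trainer settings. A minimal sketch, assuming the same transformers setup as the training scripts above; none of the values are tuned beyond what the list says:

    # Sketch for todo items 1a)-1d); output_dir and the model name follow the existing scripts.
    import torch
    from transformers import AutoModelForCausalLM, TrainingArguments

    training_args = TrainingArguments(
        output_dir="./output",
        num_train_epochs=3,             # 1a) three epochs ...
        save_strategy="epoch",          # ... keeping a checkpoint after each one
        learning_rate=1e-6,             # 1b)
        per_device_train_batch_size=4,  # 1c)
        gradient_checkpointing=True,
        logging_steps=10,
    )

    # 1d) load the reference model in 16 bit on the GPU instead of load_in_4bit=True
    original_model = AutoModelForCausalLM.from_pretrained(
        "mistralai/Mistral-7B-Instruct-v0.1",
        torch_dtype=torch.float16,
    ).cuda()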
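
For item 4), one way to carve out the 50-element test set is to split data/wafl_functions.parquet before training and then score generations against the positive conversation, for instance with the same fuzz.ratio comparison already used when building the negative examples. A rough sketch; the two output file names are placeholders:

    # Hold out 50 rows of the dataset for evaluation; the output file names are hypothetical.
    import pandas as pd

    df = pd.read_parquet("data/wafl_functions.parquet").sample(frac=1, random_state=0)
    test_df, train_df = df.iloc[:50], df.iloc[50:]
    train_df.to_parquet("data/wafl_functions_train.parquet", index=False)
    test_df.to_parquet("data/wafl_functions_test.parquet", index=False)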
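
On the note about trying a longer context window in the SFT/DPO scripts (the default is 512): the truncation arguments those scripts already pass are max_seq_length on SFTTrainer and max_prompt_length/max_length on DPOTrainer. A sketch that raises them, reusing the objects built in those scripts (model, original_model, tokenizer, train_dataset, training_args); 2048 is an assumed budget, not a measured one:

    # Sketch only: raise the truncation limits so whole prompts and conversations fit.
    from trl import SFTTrainer, DPOTrainer

    sft_trainer = SFTTrainer(
        model,
        train_dataset=train_dataset,       # the "text" dataset from train_sft_with_wafl_dataset.py
        dataset_text_field="text",
        tokenizer=tokenizer,
        args=training_args,
        max_seq_length=2048,               # the SFT script currently hard-codes 512
    )

    dpo_trainer = DPOTrainer(
        model,
        original_model,
        args=training_args,
        train_dataset=train_dataset,       # the prompt/chosen/rejected dataset from train_dpo_with_wafl_dataset.py
        tokenizer=tokenizer,
        max_prompt_length=1024,            # prompt part only
        max_length=2048,                   # prompt plus completion
    )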
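
The observation that DPO introduces -inf after one step is easier to localise with an explicit finiteness check around a single optimisation step. A generic torch sketch in the spirit of train_supervised.py (sample_text, optimizer, tokenizer and model as defined there):

    # Run one manual step and report whether the loss or any parameter went non-finite.
    import torch

    def non_finite_parameters(model):
        return [name for name, p in model.named_parameters() if not torch.isfinite(p).all()]

    input_ids = tokenizer(sample_text, return_tensors="pt").input_ids
    output = model(input_ids, labels=input_ids)
    print("loss finite:", torch.isfinite(output.loss).item())
    output.loss.backward()
    optimizer.step()
    print("non-finite parameters after step:", non_finite_parameters(model))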