diff --git a/dalm/datasets/reading_comprehension_generation/README.md b/dalm/datasets/reading_comprehension_generation/README.md
new file mode 100644
index 0000000..01fb30d
--- /dev/null
+++ b/dalm/datasets/reading_comprehension_generation/README.md
@@ -0,0 +1,49 @@
+## A note about reading comprehension
+
+This approach to adapting LLMs is based on this [paper](https://arxiv.org/abs/2309.09530) by Microsoft.
+The idea espoused by the paper is that generating reading comprehension questions and answers from the raw corpora
+and training an LLM on the generated dataset can enhance its domain adaptiveness.
+
+We have two ways of generating reading comprehension data:
+
+1. Via regex-based methods that comb the input data for matches and align them into questions and answers
+2. Via prompting a large language model to come up with questions and answers
+
+To see the prompt behind LLM-based reading-comprehension dataset generation, please go [here](https://github.com/arcee-ai/DALM/blob/4d93d4a198cc64ce5d19ee98786b70f579dbef0c/dalm/datasets/reading_comprehension_generation/synthetic_based.py#L22)
+
+## How to get started
+
+For the input, either a single CSV file or a directory of individual files, each containing raw text, will do.
+
+### LLM based
+
+Assuming you have your dataset as a CSV file with the column `text` containing the raw texts
+(chunking based on the context length of the model is enabled by default):
+
+```bash
+python dalm/datasets/reading_comprehension_generation/synthetic_based.py \
+    --model_name HuggingFaceH4/zephyr-7b-alpha \
+    --context_length 4096 \
+    --input input_dataset.csv --csv_column text \
+    --output_directory synth_data --dataset_name llm_generated_dataset
+```
+
+The output directory serves as a temporary holding place for all generated data before it is assembled into a dataset.
+The generation process is time consuming and expensive. Because it relies on an LLM, it takes about 20-30 minutes to produce 10 questions with the recommended 13B Llama 2 model (numbers may vary depending on the content of your dataset and the unpredictability of the model). The script therefore saves its progress so that, if interrupted, it will pick up where it left off when restarted.
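+
+Once generation and parsing finish, the saved dataset can be loaded back like any other Hugging Face dataset stored on disk. A minimal sketch, assuming the `--dataset_name llm_generated_dataset` from the example above:
+
+```python
+from datasets import load_from_disk
+
+# The path is whatever was passed as --dataset_name
+dataset = load_from_disk("llm_generated_dataset")
+
+# Each record carries a "messages" list of chat-style question/answer exchanges
+print(dataset)
+print(dataset[0]["messages"])
+```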
+
+Chunking of data is enabled by default and requires the context length to be passed, which is why it is passed in the example above.
+
+### Regex based
+
+(Same as above, i.e. assuming you have your dataset as a CSV file with the column `text` containing the raw texts)
+
+Please note that a domain sentencepiece model may optionally be passed in, but this is not required as
+the script will train a domain-specific sentencepiece model on the input corpus
+
+```bash
+python dalm/datasets/reading_comprehension_generation/regex_based.py --input input.csv \
+    --csv_column text --general_spm_path resources/general.spm \
+    --output_dataset_name regex_dataset
+```
\ No newline at end of file
diff --git a/dalm/datasets/reading_comprehension_generation/regex_based.py b/dalm/datasets/reading_comprehension_generation/regex_based.py
new file mode 100644
index 0000000..adb4d8e
--- /dev/null
+++ b/dalm/datasets/reading_comprehension_generation/regex_based.py
@@ -0,0 +1,1265 @@
+# Modified version of code from https://github.com/microsoft/LMOps/blob/main/adaptllm/utils/read.py
+
+# ruff: noqa: E501
+
+import argparse
+import copy
+import json
+import logging
+import os
+import random
+import re
+import typing
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+import datasets
+import numpy as np
+import sentencepiece as spm  # type: ignore[import-untyped]
+from pysbd import Segmenter  # type: ignore[import-untyped]
+from tqdm.contrib.concurrent import process_map
+
+from dalm.datasets.reading_comprehension_generation.utils import (
+    create_domain_tokenizer,
+    create_domain_tokenizer_from_files,
+    input_generator,
+)
+
+logger = logging.getLogger(__name__)
+
+TYPES = ["nli", "common_reason", "paraphrase", "word2text", "summarize", "text_completion"]
+
+
+def remove_double_space(string: str) -> str:
+    return re.sub("[ ]{2,}", " ", string)
+
+
+class App:
+    def __init__(self) -> None:
+        self.cls_dic: Dict[str, Any] = {}
+
+    @typing.no_type_check
+    def add(self, key: str):
+        def adder(cls: Any):
+            self.cls_dic[key] = cls
+            return cls
+
+        return adder
+
+
+type_map = App()
+
+
+def chatml_format(question: str, answer: str | None = None) -> List[Dict[str, str]]:
+    result = [{"role": "user", "content": question}]
+    if answer is not None:
+        result.append({"role": "assistant", "content": answer})
+    return result
+
+
+@type_map.add("basetype")
+class BaseType(object):
+    def __init__(self) -> None:
+        self.max_subcategory_num = 2  # limit the number of examples per subcategory
+        self.max_seq_len = 2048
+        self.mine_regex: Dict[str, Any] = {}
+
+    def collect_mined(self, tup: List[str], class_name: str) -> Dict[str, Any]:
+        raise NotImplementedError
+
+    def get_all_templates(self, entry: Dict[str, Any], random_seed: int) -> List[Tuple[str, str]] | List[Tuple[str]]:
+        raise NotImplementedError
+
+    @typing.no_type_check
+    def get_template(self, entry: Dict[str, Any], random_seed: int) -> Tuple[str] | Tuple[str, str]:
+        """
+        random sample a template for each entry
+        """
+        random.seed(random_seed)  # fix random seed for reproduction
+        template = random.choice(self.get_all_templates(entry, random_seed))
+        return template
+
+    # TODO: refactor
+    def fill_in_the_template(
+        self, template: Tuple[str] | Tuple[str, str], kw_dic: Dict[str, Any]
+    ) -> List[Dict[str, str]]:
+        """
+        Account for:
+        1. length 1 template and no qa_demos
+        2. length 1 template and qa_demos
+        3. length 2 template and no qa_demos
+        4. 
length 2 template and qa_demos + + """ + qa_demos = kw_dic.get("qa_demos", []) + + if "qa_demos" in kw_dic.keys(): + qa_demos = kw_dic["qa_demos"] + + question = template[0].format(**kw_dic) + + if len(template) == 1 and len(qa_demos) > 1: + qa_demos[0]["content"] = question + qa_demos[0]["content"] + return qa_demos + elif len(template) == 1 and len(qa_demos) == 0: + return chatml_format(question) + elif len(template) == 2: + answer = template[1].format(**kw_dic) + + result = chatml_format(question, answer) + if qa_demos is not None: + result = qa_demos + result + + return result + else: + raise ValueError("template length must be 1 or 2") + + def truncate_sentence(self, text: str, max_len: int) -> List[str]: + tokenized_example = self.ori_spm.encode(text) + example_length = len(tokenized_example) + + if example_length > max_len: + truncated_text_list = [] + chunked_list = [tokenized_example[i : i + max_len] for i in range(0, len(tokenized_example), max_len)] + # input_ids = tokenized_example[:max_len] + # truncated_text = self.ori_spm.decode(input_ids) + for truncated_tokens in chunked_list: + truncated_text_list.append(self.ori_spm.decode(truncated_tokens)) + return truncated_text_list + else: + return [text] + + def init_spm(self, ori_spm: spm.SentencePieceProcessor, domain_spm: spm.SentencePieceProcessor) -> None: + self.ori_spm = ori_spm + self.domain_spm = domain_spm + + ori_tokens = set([self.ori_spm.id_to_piece(i) for i in range(len(self.ori_spm))]) + domain_tokens = set([self.domain_spm.id_to_piece(i) for i in range(len(self.domain_spm))]) + specific_tokens_set = domain_tokens - (ori_tokens & domain_tokens) + specific_tokens = [token for token in list(specific_tokens_set) if (token.startswith("▁") and len(token) > 10)] + self.specific_token_set = set(specific_tokens) + + def compile_regex(self) -> None: + """ + Does nothing more than compile regexes + """ + self.regex_dic = {} + for class_name, pattern in self.mine_regex.items(): + self.regex_dic[class_name] = re.compile(pattern, re.IGNORECASE) + + def mine(self, text: str, **kwargs: Dict[str, Any]) -> Tuple[Dict[str, Any], int]: + mined_dic: Dict[str, Any] = {} + mined_num = 0 + for class_name, regex in self.regex_dic.items(): + mined_dic[class_name] = [] + x = regex.findall(text) + if len(x) > 0: + for tup in x: + collected = self.collect_mined(tup, class_name) + mined_dic[class_name].append(collected) + mined_num += len(mined_dic[class_name]) + return mined_dic, mined_num + + +@type_map.add("nli") +class nli(BaseType): + def __init__(self) -> None: + super().__init__() + # init regex + self.mine_regex = { + "Entail": r"([.!?]+[\s]+)([^.!?\n,]{50,}[.!?]+)([\s]+)(Yes|Therefore|Thus|Accordingly|Hence|For this reason)([\s]*,[\s]+)([^.!?\n,]{50,}[.!?]+)([\s]+)", + "Contradict": r"([.!?]+[\s]+)([^.!?\n,]{50,}[.!?]+)([\s]+)(No|However|But|On the contrary|In contrast|Whereas)([\s]*,[\s]+)([^.!?\n,]{50,}[.!?]+)([\s]+)", + "Neutral": r"([.!?]+[\s]+)([^.!?\n,]{50,}[.!?]+)([\s]+)(Maybe|Also|Furthermore|Secondly|Additionally|Moreover|In addition)([\s]*,[\s]+)([^.!?\n,]{50,}[.!?]+)([\s]+)", + } + self.compile_regex() + + def collect_mined(self, tup: List[str], class_name: str) -> Dict[str, Any]: + dic = { + "label": class_name, + "verbalizer": tup[3], + "premise": tup[1], + "hypothesis": tup[-2], + } + return dic + + def get_all_templates(self, entry: Dict[str, Any], random_seed: int) -> List[Tuple[str, str]]: + np.random.seed(random_seed) + type = np.random.choice(["generate", "classify"], p=[0.2, 0.8]) + if type == "classify": + return [ 
+ # Basic Templates + ('{premise}\nBased on the sentence above can we infer that "{hypothesis}"?', "{answer}"), + ( + "{premise}\nBased on this sentence can we infer that the following sentence is true?\n{hypothesis}\nAnswer:", + "{answer}", + ), + ("{premise}\nCan we draw the following hypothesis?\n{hypothesis}\n{options_}", "{answer}"), + ( + "{premise}\nDoes this next sentence follow, given the preceding text?\n{hypothesis}\nAnswer:", + "{answer}", + ), + ( + "Can we draw the following hypothesis from the context?\nContext: {premise}\nHypothesis: {hypothesis}\nAnswer:", + "{answer}", + ), + ( + "{hypothesis}\nDetermine if the sentence is true based on the text below:\n{premise}\nAnswer:", + "{answer}", + ), + ("Premise: {premise}\nHypothesis: {hypothesis}\nDoes the premise entail the hypothesis?", "{answer}"), + ( + "Premise: {premise}\nHypothesis: {hypothesis}\nIs the hypothesis entailed by the premise?", + "{answer}", + ), + ( + "Here is a premise:\n{premise}\nHere is a hypothesis:\n{hypothesis}\nIs it possible to infer that if the premise is true, then so is the hypothesis?", + "{answer}", + ), + ( + "Sentence 1: {premise}\nSentence 2: {hypothesis}\nIs this second sentence entailed by the first sentence?\n{options_}", + "{answer}", + ), + ('Based on the premise "{premise}", can we infer the hypothesis "{hypothesis}" is true?', "{answer}"), + ( + 'Premise:\n"{premise}" Based on this premise, is the hypothesis "{hypothesis}" true?\n{options_}', + "{answer}", + ), + ('If {premise}, can we infer that "{hypothesis}"?', "{answer}"), + ('{premise}\nDoes it follow that "{hypothesis}"?\n{options_}', "{answer}"), + ('Question: If "{premise}", does this mean that "{hypothesis}"?\nAnswer:', "{answer}"), + ('If "{premise}", can we infer "{hypothesis}"?', "{answer}"), + ('If "{premise}", does it logically follow that "{hypothesis}"?', "{answer}"), + ('Based on the sentence "{premise}", is the sentence "{hypothesis}" a true sentence?', "{answer}"), + ( + "Premise: {premise}\nHypothesis: {hypothesis}\nCan we infer that the hypothesis is true if the premise is true?", + "{answer}", + ), + ( + 'Here is a premise: "{premise}"\nHere is a hypothesis: "{hypothesis}"\nDoes the premise tell us whether the hypothesis is true?', + "{answer}", + ), + ('Is the premise "{premise}" true if "{hypothesis}"?\n{options_}', "{answer}"), + ('If "{premise}", can we infer that "{hypothesis}"?\n{options_}', "{answer}"), + ('If "{premise}", is "{hypothesis}" correct?', "{answer}"), + ('Let\'s say that "{premise}"\nCan we now say that "{hypothesis}"?', "{answer}"), + ('Does "{hypothesis}" appear to be an accurate statement based on "{premise}"?', "{answer}"), + ('Is it possible to draw the statement that "{hypothesis}" if "{premise}"?', "{answer}"), + ('Is "{hypothesis}" true if "{premise}"?\n{options_}', "{answer}"), + ( + 'Sentence 1: "{premise}"\nSentence 2: "{hypothesis}"\nIs sentence 2 true, based on sentence 1?', + "{answer}", + ), + # fill-in-the-blank: + ( + 'Sentence 1: "{premise}"\nSentence 2: "{hypothesis}"\nWhich word is the best to connect them? Therefore, However, or Moreover?', + "{connect_answer}", + ), + ( + "Choose the most suitable word to link the following sentences:\n1. {premise}\n2. 
{hypothesis}\nOptions:\n- Therefore\n- However\n- Moreover", + "{connect_answer}", + ), + ( + 'Connect the following sentence: {premise}\nChoose the appropriate word to link it with: "{hypothesis}"\nOptions: Therefore, However, Moreover', + "{connect_answer}", + ), + ( + 'Given the sentence: {premise}\nChoose the appropriate word from the options (Therefore, However, Moreover) to connect it with: "{hypothesis}"\nWord:', + "{connect_answer}", + ), + ( + 'Connect the sentence: {premise}\nFrom the choices (Therefore, However, Moreover), select the word that best links it to: "{hypothesis}"\nAnswer:', + "{connect_answer}", + ), + # relation classification + ( + 'Assess the relationship between Sentence 1: "{premise}"\nSentence 2: "{hypothesis}"\nIs it characterized as Entailment, Neutral, or Contradictory?', + "{relation_answer}", + ), + ( + 'Given Sentence 1: "{premise}"\nSentence 2: "{hypothesis}"\nHow would you describe the relationship between these two sentences? Entailment, Neutral, or Contradictory?', + "{relation_answer}", + ), + ( + 'Considering Sentence 1: "{premise}"\nSentence 2: "{hypothesis}"\nHow do you perceive the connection between these two sentences in terms of their relationship?', + "{relation_answer}", + ), + ( + 'Assess the relationship between Sentence 1: "{premise}"\nSentence 2: "{hypothesis}"\nWould you categorize their connection as Entailment, Neutral, or Contradictory?', + "{relation_answer}", + ), + ] + elif type == "generate": + if entry["label"] == "Entail": + return [ + ("Complete the following sentence\n{premise} Accordingly,", "{hypothesis}"), + ("{premise} Therefore:", "{hypothesis}"), + ("{premise} Thus?", "{hypothesis}"), + ( + 'Based on the statement "{premise}", provide a continuation using the word "Hence" to express the following idea.\nContinuation:', + "{hypothesis}", + ), + ( + 'Question: Complete the following statement using the word "Therefore" in relation to "{premise}"\nAnswer:', + "{hypothesis}", + ), + ("{premise} {verbalizer}?", "{hypothesis}"), + ("{premise} {verbalizer}:", "{hypothesis}"), + # more variations + ( + "{premise}\nProduce a sentence that encompasses the concept from the above statement. 
Sentence:", + "{hypothesis}", + ), + ( + '"{premise}" Generate a sentence that follows from the notion presented in the previous statement.', + "{hypothesis}", + ), + ( + "{premise}\nCraft a sentence that develops the idea put forth in the preceding statement.", + "{hypothesis}", + ), + ( + "{premise}\nCreate a sentence that is a logical extension of the idea in the previous statement.\nAnswer:", + "{hypothesis}", + ), + ( + '"{premise}" Formulate a sentence that is consistent with the concept presented in the prior statement.', + "{hypothesis}", + ), + ( + "{premise}\nDevelop a sentence that builds upon the thought conveyed in the above statement.", + "{hypothesis}", + ), + ] + elif entry["label"] == "Neutral": + return [ + ("Complete the following sentence: {premise} {verbalizer},", "{hypothesis}"), + ("Complete the following sentence\n{premise} {verbalizer}:", "{hypothesis}"), + ("{premise} {verbalizer}?", "{hypothesis}"), + ( + 'Based on the statement {premise}, provide a continuation using the word "{verbalizer}" to express the following idea.\nContinuation:', + "{hypothesis}", + ), + ( + 'Question: Complete the following statement using the word "{verbalizer}" in relation to "{premise}"\nAnswer:', + "{hypothesis}", + ), + ] + elif entry["label"] == "Contradict": + return [ + ("Complete the following sentence: {premise} On the contrary,", "{hypothesis}"), + ("{premise} But,\nWhat is a completion for it?", "{hypothesis}"), + ("Complete the following sentence\n{premise} However?", "{hypothesis}"), + ("Sentence: {premise} {verbalizer},\nHow do you finish this sentence?", "{hypothesis}"), + ("{premise} {verbalizer}:", "{hypothesis}"), + ( + 'Based on the statement {premise}, provide a continuation using "In contrast" to express the following idea.', + "{hypothesis}", + ), + ( + 'Complete the following statement using the word "But" in relation to "{premise}".', + "{hypothesis}", + ), + ] + else: + raise ValueError("label must be Entail, Neutral or Contradict") + else: + raise ValueError("type must be generate or classify") + + def format_single_demo(self, entry: Dict[str, Any], random_seed: int) -> List[Dict[str, str]]: + kw_dic = {} + kw_dic["premise"] = entry["premise"] + hypothesis = entry["hypothesis"] + kw_dic["hypothesis"] = hypothesis[0].upper() + hypothesis[1:] + kw_dic["options_"] = "- Yes\n- No\n- Maybe" + + kw_dic["verbalizer"] = entry["verbalizer"] + if entry["label"] == "Entail": + kw_dic["answer"] = "Yes" + kw_dic["connect_answer"] = "Therefore" + kw_dic["relation_answer"] = "Entailment" + elif entry["label"] == "Contradict": + kw_dic["answer"] = "No" + kw_dic["connect_answer"] = "However" + kw_dic["relation_answer"] = "Contradictory" + elif entry["label"] == "Neutral": + kw_dic["answer"] = "Maybe" + kw_dic["connect_answer"] = "Moreover" + kw_dic["relation_answer"] = "Neutral" + + template = self.get_template(entry, random_seed) + return self.fill_in_the_template(template, kw_dic) + + +@type_map.add("common_reason") +class common_reason(BaseType): + def __init__(self) -> None: + super().__init__() + self.mine_regex = { + "Cause-effect": r"([.!?]+[\s]+)([^.!?\n,]{50,}[.!?]+)([\s]+)(Thus|Therefore|Accordingly|Hence|For this reason)([\s]*,[\s]+)([^.!?\n,]{50,}[.!?]+)([\s]+)", + "Effect-cause": r"([.!?]+[\s]+)([^.!?;\n,]{50,}[.!?]+)([\s]+)(due to|on account of|owing to)([\s]+)([^.!?;\n,]{50,}[.!?]+)([\s]+)", + } + self.compile_regex() + + def collect_mined(self, tup: List[str], class_name: str) -> Dict[str, Any]: + dic = { + "relation": class_name, + "verbalizer": tup[3], + 
"sentence1": tup[1], + "sentence2": tup[-2], + } + return dic + + def get_all_templates(self, entry: Dict[str, Any], random_seed: int) -> List[Tuple[str, str]]: + if entry["relation"] == "Cause-effect": + return [ + # Basic templates + ('Question: What is the effect of "{cause}"? Answer:', "{effect}"), + ("Here is a premise: {cause}\nWhat is the effect?", "{effect}"), + ('Q: What is the result of "{cause}"? A:', "{effect}"), + ('What is a plausible effect of "{cause}"?', "{effect}"), + ('Based on "{cause}", what is the result?', "{effect}"), + ("{cause}\nEffect:", "{effect}"), + ("What is the result of the following sentence?\n{cause}\nResult:", "{effect}"), + ('Q: What happens after "{cause}"? A:', "{effect}"), + ("{cause}\nWhat happens next?", "{effect}"), + # More varaiations + ("Considering the cause: {cause}\nWhat could be the resulting effect?", "{effect}"), + ("Given that: {cause}\nWhat do you anticipate as the outcome?", "{effect}"), + ('What could stem from "{cause}"?', "{effect}"), + ("Explore the consequences of: {cause}\nAnswer:", "{effect}"), + ('What might follow from "{cause}"?', "{effect}"), + ('Based on the cause: "{cause}"\nWhat is likely to be the effect?', "{effect}"), + ('If "{cause}" occurs, what is the probable effect?', "{effect}"), + ('Imagine "{cause}" taking place; what would be the resultant effect?', "{effect}"), + ("Given the scenario: {cause}\nWhat effect could be expected?", "{effect}"), + ('Examine the potential outcomes of "{cause}"\nOutcome:', "{effect}"), + ("Anticipating the result of: {cause}\nWhat could be the effect?", "{effect}"), + ('What is the expected effect of "{cause}"?', "{effect}"), + ("Considering the event: {cause}\nWhat could be an outcome?", "{effect}"), + ('If "{cause}" happens, what could be the subsequent effect?', "{effect}"), + ('Explore the aftermath of: "{cause}"\nWhat could be the effect?', "{effect}"), + ] + elif entry["relation"] == "Effect-cause": + return [ + # Basic templates + ('Q: "{effect}" What is the cause? 
A:', "{cause}"), + ("Here is a result: {effect}\nWhat is the cause?", "{cause}"), + ('What is the reason of "{effect}"?', "{cause}"), + ('What is a plausible reason for "{effect}"?', "{cause}"), + ('what is the cause of "{effect}"?', "{cause}"), + ("{effect}\nCause:", "{cause}"), + ("Question: What is the reason of the following sentence?\n{effect}\nAnswer:", "{cause}"), + ('What happens before "{effect}"?', "{cause}"), + ("{effect}\nWhat happens before?", "{cause}"), + # More variations: + ("Given the outcome: {effect}\nWhat could have led to this result?", "{cause}"), + ('Uncover the cause behind: "{effect}".', "{cause}"), + ("What might be responsible for {effect}?", "{cause}"), + ("Identify a probable cause for: {effect}\nCause:", "{cause}"), + ('What event or circumstance could explain "{effect}"?', "{cause}"), + ("When observing: {effect}\nWhat should we consider as the cause?", "{cause}"), + ("What events or factors contributed to: {effect}?", "{cause}"), + ('Considering the effect: "{effect}"\nWhat could be the underlying cause?', "{cause}"), + ('Before "{effect}" occurred, what factor might have caused it?', "{cause}"), + ('What do you think led to the occurrence of: "{effect}"?', "{cause}"), + ("Analyze the occurrence of: {effect}\nWhat could be identified as the cause?", "{cause}"), + ("Given that: {effect}\nWhat was the triggering cause?", "{cause}"), + ('Explore the background of: "{effect}"\nWhat could have instigated it?', "{cause}"), + ("What played a role in bringing about: {effect}?", "{cause}"), + ( + 'Delve into the circumstances behind "{effect}"\nWhat could be the originating cause? Answer:', + "{cause}", + ), + ("Complete the following sentence\n{effect} because of", "{cause}"), + ("Your task is to complete the following sentence: {effect} due to", "{cause}"), + ("{effect} owing to\nHow would you complete it:", "{cause}"), + ( + 'Based on the statement {effect}, provide a continuation using "{verbalizer}" to express the following idea.\nContinuation:', + "{cause}", + ), + ( + 'Question: Complete the following statement using "{verbalizer}" in relation to "{effect}".', + "{cause}", + ), + ("Answer the question...{effect} {verbalizer}?", "{cause}"), + ("{effect} {verbalizer}:", "{cause}"), + ] + else: + raise ValueError("relation must be Cause-effect or Effect-cause") + + def format_single_demo(self, entry: Dict[str, Any], random_seed: int) -> List[Dict[str, str]]: + kw_dic = {} + kw_dic["verbalizer"] = entry["verbalizer"] + if entry["relation"] == "Cause-effect": + kw_dic["cause"] = entry["sentence1"] + kw_dic["effect"] = entry["sentence2"][0].upper() + entry["sentence2"][1:] + elif entry["relation"] == "Effect-cause": + kw_dic["cause"] = entry["sentence2"][0].upper() + entry["sentence2"][1:] + kw_dic["effect"] = entry["sentence1"] + elif entry["relation"] == "Explanantion": + kw_dic["sentence1"] = entry["sentence1"] + kw_dic["sentence2"] = entry["sentence2"][0].upper() + entry["sentence2"][1:] + + template = self.get_template(entry, random_seed) + return self.fill_in_the_template(template, kw_dic) + + +@type_map.add("paraphrase") +class paraphrase(BaseType): + def __init__(self) -> None: + super().__init__() + self.mine_regex = { + "Similar": r"([.!?]+[\s]+)([^.!?\n]{50,}[.!?]+)([\s]+)(In other words|In other word|Namely|That is to say|i.e.|Scilicet|Similarly|Equally)([\s]*,[\s]+)([^.!?\n]{50,}[.!?]+)([\s]+)", + "Different": r"([.!?]+[\s]+)([^.!?\n]{50,}[.!?]+)([\s]+)(No|However|But|On the contrary|In contrast|Whereas)([\s]*,[\s]+)([^.!?\n]{50,}[.!?]+)([\s]+)", + } + 
+ self.compile_regex() + + def collect_mined(self, tup: List[str], class_name: str) -> Dict[str, Any]: + dic = { + "label": class_name, + "verbalizer": tup[3], + "sentence1": tup[1], + "sentence2": tup[-2], + } + return dic + + def get_all_templates(self, entry: Dict[str, Any], random_seed: int) -> List[Tuple[str, str]]: + if entry["label"] == "Different": + return [ + ( + '"{sentence1}" Generate a sentence that expresses a contrasting idea to the previous statement.', + "{sentence2}", + ), + ('Can you create a sentence that contradicts the meaning of "{sentence1}"?', "{sentence2}"), + ( + 'Given the sentence "{sentence1}", can you come up with a statement that contradicts its meaning?', + "{sentence2}", + ), + ( + 'Here is a sentence: "{sentence1}". Now, provide a sentence that contradicts its meaning.', + "{sentence2}", + ), + ( + 'Your challenge is to create a sentence that expresses the opposite of "{sentence1}". Answer:', + "{sentence2}", + ), + ('Contradict the meaning of the sentence "{sentence1}" by crafting another sentence.', "{sentence2}"), + ('Compose a sentence that contradicts the idea conveyed in "{sentence1}".', "{sentence2}"), + ( + 'Can you generate a sentence that has a conflicting meaning compared to "{sentence1}"?', + "{sentence2}", + ), + ( + 'In opposition to the sentence "{sentence1}", create a sentence with a contradictory meaning.', + "{sentence2}", + ), + ( + 'Your task is to provide a sentence that negates or contradicts the message of "{sentence1}".', + "{sentence2}", + ), + ( + 'Given the sentence "{sentence1}", come up with a different sentence that contradicts its meaning?', + "{sentence2}", + ), + ('Craft a sentence that goes against the meaning of the sentence "{sentence1}".', "{sentence2}"), + ] + elif entry["label"] == "Similar": + return [ + ("Complete the following sentence: {sentence1} Namely,", "{sentence2}"), + ("{sentence1} In other words\nProvide the missing portion of the above sentence:", "{sentence2}"), + ("Q: {sentence1} That is to say?", "{sentence2}"), + ( + 'Question: Complete the following statement using "{verbalizer}" in relation to "{sentence1}"\nAnswer:', + "{sentence2}", + ), + ("Question: {sentence1} {verbalizer}?", "{sentence2}"), + ("{sentence1} {verbalizer},\nHow do you finish this sentence?", "{sentence2}"), + ("Extend the thought in this sentence: {sentence1} To elaborate further:", "{sentence2}"), + ( + 'Build upon the statement {sentence1} by utilizing "{verbalizer}" to express the following concept.', + "{sentence2}", + ), + ( + '"{sentence1}" Generate a sentence that expresses a further elaboration to the previous statement.', + "{sentence2}", + ), + ('"{sentence1}" Expand on the previous statement:', "{sentence2}"), + ("{sentence1}\nProvide an explanatory sentence:", "{sentence2}"), + ] + else: + raise ValueError("label must be Similar or Different") + + def format_single_demo(self, entry: Dict[str, Any], random_seed: int) -> List[Dict[str, str]]: + kw_dic = {} + kw_dic["verbalizer"] = entry["verbalizer"] + kw_dic["sentence1"] = entry["sentence1"] + kw_dic["sentence2"] = entry["sentence2"][0].upper() + entry["sentence2"][1:] + + template = self.get_template(entry, random_seed) + return self.fill_in_the_template(template, kw_dic) + + +@type_map.add("word2text") +class word2text(BaseType): + def __init__(self) -> None: + super().__init__() + self.mine_regex = { + "definition": r"([\s]+)([^.!?,;\s\"]{10,})([\s]+)(is defined as|\'s definition is)([\s]+)([^.!?\n]{20,}[.!?]+)([\s]+)", + "topic": 
r"([.!?]+[\s]+)([^.!?,;\n]{20,})([\s]+)(was about|talks about|is about|\'s topic is)([\s]+)([^.!?\n]{20,}[.!?]+)([\s]+)", + } + # `topic` is defined as a summaization task in our paper, + # here we categorize it to word2text for simple code implementation + + self.compile_regex() + + self.min_kw_num = 3 # requires at least 3 domain-specific keywords, + self.max_sent_len = 100 # with fewer than 100 sent tokens. + self.max_collect_sent = 2 # early break when find enough task examples. + + def collect_mined(self, tup: List[str], class_name: str) -> Dict[str, Any]: + if class_name == "definition": + dic = { + "relation": class_name, + "verbalizer": tup[3], + "word": tup[1], + "definition": tup[-2], + } + elif class_name == "topic": + dic = { + "relation": class_name, + "verbalizer": tup[3], + "sentence": tup[1], + "topic": tup[-2], + } + return dic + + @typing.no_type_check + def mine(self, text: str, sents: List[str], **kwargs) -> Tuple[Dict[str, Any], int]: + def mine_regex(text): + mined_dic = {} + mined_num = 0 + for class_name, regex in self.regex_dic.items(): + mined_dic[class_name] = [] + x = regex.findall(text) + if len(x) > 0: + for tup in x: + collected = self.collect_mined(tup, class_name) + mined_dic[class_name].append(collected) + mined_num += len(mined_dic[class_name]) + return mined_dic, mined_num + + mined_dic, mined_num = mine_regex(text) + + random.seed(len(text)) # fix random seed for reproduction + random.shuffle(sents) + + mined_dic["word2text"] = [] # wrap as a list to align with other task types + for sent in sents: + if len(mined_dic["word2text"]) == self.max_collect_sent: + break + sent_tokens = set(self.domain_spm.encode(sent, out_type=str)) + specific_tokens_in_sent = list(self.specific_token_set & sent_tokens) + if len(specific_tokens_in_sent) >= self.min_kw_num and len(sent_tokens) <= self.max_sent_len: + tokens = [ + self.domain_spm.decode(token) for token in specific_tokens_in_sent + ] # transfer tokens back to normal words + dic = { + "relation": "word2text", + "token_set": tokens, + "sent": sent.strip(), + } + mined_dic["word2text"].append(dic) + mined_num += len(mined_dic["word2text"]) + return mined_dic, mined_num + + def get_all_templates(self, entry: Dict[str, Any], random_seed: int) -> List[Tuple[str, str]]: + if entry["relation"] == "word2text": + return [ + ("Concepts: {tripleset}\nWrite a sentence that includes all these words.\nSentence:", "{target}"), + ( + "Concepts: {tripleset}\nFind a sentence in the article that includes all these words.\nSentence:", + "{target}", + ), + ("Keywords: {tripleset}\nWhat is a sentence that includes all these keywords?", "{target}"), + ( + "Here are some concepts: {tripleset}\nWhat is a sentence about these concepts in the article?", + "{target}", + ), + ("Produce a sentence which mentions all of these concepts: {tripleset}\nAnswer:", "{target}"), + ("Write a sentence about the following things:\n{tripleset}\nAnswer:", "{target}"), + ("Generate a sentence that includes all the following words: {tripleset}. 
Sentence:", "{target}"), + ("Sentence: {target}\nWhat are the keywords in this sentence?", "{tripleset}"), + ("What are the most important words in the following sentence\n{target}\nWords:", "{tripleset}"), + ("{target}\nIdentify the most salient words in the above sentence.", "{tripleset}"), + ("Concepts: {tripleset}\nWhat would a sentence about these concepts be like?", "{target}"), + ("Here are some words: {tripleset}.\nWrite a sentence that describes them.", "{target}"), + ( + "Here are some words: {tripleset}.\nTell me a sentence that describes them in the article.", + "{target}", + ), + ( + "Here are some concepts: {tripleset}.\nGenerate a detailed description of them.\nDescription:", + "{target}", + ), + ("Generate a sentence about: {tripleset}\nSentence:", "{target}"), + ("Write a sentence about [{tripleset}].", "{target}"), + ("Produce a long descriptive sentence that uses all these words: {tripleset}.\nSentence:", "{target}"), + ("Create a set of three concepts in the following sentence.\n{target}\nConcepts:", "{tripleset}"), + ("{tripleset}\nWhat is the sentence in the article that verbalizes these concepts?", "{target}"), + ( + "Keywords: {tripleset}\nTell me the sentence in the article about these concepts.\nSentence:", + "{target}", + ), + ("Here are some keywords: {tripleset}.\nWrite a sentence that includes them.", "{target}"), + ("Generate a sentence that includes these keywords [{tripleset}].", "{target}"), + ("Find a sentence in the above article that includes the following words: [{tripleset}].", "{target}"), + ("Produce a long descriptive sentence that uses all these words: {tripleset}\nAnswer:", "{target}"), + ("Sentence: {target}\nWhat keywords can be extracted from this sentence?", "{tripleset}"), + ] + elif entry["relation"] == "definition": + return [ + ("Q: {word} {verbalizer}? A:", "{definition}"), + ("Next question: {word} {verbalizer}:", "{definition}"), + ("{word} {verbalizer}?", "{definition}"), + ("{word} {verbalizer}:", "{definition}"), + ("What is the definition of {word}?", "{definition}"), + ("How to define {word}?", "{definition}"), + ('Explain the meaning of "{word}".', "{definition}"), + ('What does "{word}" refer to?', "{definition}"), + ("Please elucidate the concept of {word}\nAnswer:", "{definition}"), + ('What is the meaning of the term "{word}"?', "{definition}"), + ("Could you offer a definition for {word}?", "{definition}"), + ("Could you offer a definition for {word}?\nDefinition:", "{definition}"), + ] + elif entry["relation"] == "topic": + return [ + ("{sentence} {verbalizer}?", "{topic}"), + ("{sentence} {verbalizer}:", "{topic}"), + ("Q: {sentence} {verbalizer}? 
A:", "{topic}"), + ("Answer the question\n{sentence} {verbalizer}?", "{topic}"), + ("Answer the question\n{sentence} {verbalizer}:", "{topic}"), + ("Answer the following question:\n{sentence} {verbalizer}?\nAnswer:", "{topic}"), + ("Answer this question:\n{sentence} {verbalizer}?", "{topic}"), + ("Please answer this question: {sentence} {verbalizer}?\nAnswer:", "{topic}"), + ("Answer the question...{sentence} {verbalizer}?", "{topic}"), + ('Can you tell me the answer to "{sentence} {verbalizer}?"?', "{topic}"), + ("Next question: {sentence} {verbalizer}:", "{topic}"), + ("Q: {sentence} {verbalizer}:", "{topic}"), + ("Please answer this question: {sentence} {verbalizer}:", "{topic}"), + ("Write the answer: {sentence} {verbalizer}?\nAnswer:", "{topic}"), + ('What is the answer to "{sentence} {verbalizer}:"?', "{topic}"), + ("Answer this question.\n{sentence} {verbalizer}:", "{topic}"), + ("Answer the following question. {sentence} {verbalizer}:", "{topic}"), + ("Question: {sentence} {verbalizer}?", "{topic}"), + ("{sentence} {verbalizer}??", "{topic}"), + ] + else: + raise ValueError("relation must be word2text, definition or topic") + + def format_single_demo(self, entry: Dict[str, Any], random_seed: int) -> List[Dict[str, str]]: + kw_dic = {} + if entry["relation"] == "word2text": + kw_dic["tokens"] = entry["token_set"] + kw_dic["tripleset"] = ", ".join(kw_dic["tokens"][: self.min_kw_num]) + kw_dic["target"] = entry["sent"].strip() + elif entry["relation"] == "definition" or entry["relation"] == "topic": + kw_dic = entry + + template = self.get_template(entry, random_seed) + return self.fill_in_the_template(template, kw_dic) + + +@type_map.add("summarize") +class summarize(BaseType): + def __init__(self) -> None: + super().__init__() + + @typing.no_type_check + def mine(self, text: str, title, **kwargs): + # seems redundant but has to do so to align with other task types + mined_dic = {"title": title} + mined_num = 1 if title is not None else 0 + return mined_dic, mined_num + + def get_all_templates(self, entry: Dict[str, Any], random_seed: int) -> List[Tuple[str]]: + # those are templates when summarization is conducted but text completion is NOT conducted + return [ + # summary_templates + ( + "{context_wo_title}\n\nWhat is a potential title for this context in the {domain} domain? \nTitle: {title}", + ), + ("{domain} article: {context_wo_title}{qa_demos}\n\nWhat is the title of this article? {title}",), + ("Article: {context_wo_title}{qa_demos}\n\nGenerate a title for this {domain} paragraph.\nTitle: {title}",), + ("{context_wo_title}\n\nWrite a title for the above {domain} article. {title}",), + ("{context_wo_title}\nBriefly summarize this {domain} text? {title}",), + ( + "Article in the {domain} domain: {context_wo_title}\n\nGenerate a short summary for this article.\nAnswer: {title}", + ), + ( + "{context_wo_title}{qa_demos}\n\nSummarize the aforementioned {domain} text in a single sentence. {title}", + ), + ( + "{context_wo_title}\nCan you generate a short summary of the above {domain} paragraph? {title}{qa_demos}", + ), + ( + "{context_wo_title}\nPlease write a short summary for the above article in the {domain} domain. {title}{qa_demos}", + ), + ("Context: {context_wo_title}{qa_demos}\n\nWhat was this {domain} article about? {title}",), + # write based on title + ( + "Write an article about {domain} domain, using the following title: {title}.\nArticle: {context_wo_title}{qa_demos}", + ), + ( + "Title: {title}\nWrite a an article about {domain} domain based on this title. 
{context_wo_title}{qa_demos}", + ), + ('Use the title "{title}" to write a {domain} article.\nArticle: {context_wo_title}{qa_demos}',), + ( + "Craft an informative article about the {domain} domain, drawing from the following summary: {title}\nArticle: {context_wo_title}{qa_demos}", + ), + ( + "Create a {domain} article inspired by the provided title: {title}\nOutput: {context_wo_title}{qa_demos}", + ), + ('Can you develop an engaging {domain} article using the title "{title}"? {context_wo_title}{qa_demos}',), + ( + "Write an informative piece on the {domain} domain, using the provided title: {title}. {context_wo_title}{qa_demos}", + ), + ( + "Craft an article focused on {domain}, utilizing the provided title: {title}.\nArticle: {context_wo_title}{qa_demos}", + ), + ( + "Compose an in-depth {domain} article based on the title: {title}\nArticle: {context_wo_title}{qa_demos}", + ), + ( + 'Can you create an article delving into the {domain} domain, incorporating the given title "{title}"? {context_wo_title}{qa_demos}', + ), + ] + + def format_single_demo(self, entry: Dict[str, Any], random_seed: int) -> List[Dict[str, str]]: + sents = entry.pop("sents") + template = self.get_template(entry, random_seed) + + entry["context_wo_title"] = "".join(sents).strip() + final_demo = self.fill_in_the_template(template, entry) + return final_demo + + +@type_map.add("text_completion") +class text_completion(BaseType): + def __init__(self) -> None: + super().__init__() + + @typing.no_type_check + def mine(self, sents: Any, **kwargs) -> Tuple[Dict[str, Any], int]: + # seems redundant but has to do so to align with other task types + mined_dic = {"sents": sents} + mined_num = 1 if len(sents) >= 4 else 0 + return mined_dic, mined_num + + def get_all_templates(self, entry: Dict[str, Any], random_seed: int) -> List[Tuple[str, str]]: + # those are templates when text completion is conducted but summarization is NOT conducted + return [ + ("Please complete an article: {context_1st_half}", "{context_2nd_half}"), + ( + "Here is the first part of an article: {context_1st_half}\n\nHow would you continue the article?", + "{context_2nd_half}", + ), + ( + "Explore the initial section of an article: {context_1st_half}\nWhat could be the next part?", + "{context_2nd_half}", + ), + ("Read the beginning of an article {context_1st_half}\n\nWrite the subsequent part?", "{context_2nd_half}"), + ( + "In this article snippet, you will find the first part: {context_1st_half}\nHow would you compose the remaining section?", + "{context_2nd_half}", + ), + ( + "Take a look at the introductory part of an article: {context_1st_half}\n\nYour challenge is to write the following segment", + "{context_2nd_half}", + ), + ( + "Review the initial portion of an article: {context_1st_half}\nWhat would you include in the rest of the article?", + "{context_2nd_half}", + ), + ( + "Consider the first segment of an article: {context_1st_half}\nContinuation of the article:", + "{context_2nd_half}", + ), + ( + "Examine the first segment of an article: {context_1st_half}\n\nQuestion: Complete the article?\nCompletion:", + "{context_2nd_half}", + ), + ( + "Read the beginning of an article: {context_1st_half}\n\nHow would you extend the article?", + "{context_2nd_half}", + ), + ] + + def format_single_demo(self, entry: Dict[str, Any], random_seed: int) -> List[Dict[str, str]]: + sents = entry.pop("sents") + entry["context_1st_half"] = entry["title"] + "\n" if entry["title"] is not None else "" + + cut_index = random.Random(random_seed).randint(1, 
len(sents) - 1) + + entry["context_1st_half"] += "".join(sents[:cut_index]).strip() + entry["context_2nd_half"] = "".join(sents[cut_index:]).strip() + template = self.get_template(entry, random_seed) + final_demo = self.fill_in_the_template(template, entry) + return final_demo + + +# NOTE: useless if we don't have the title +@type_map.add("summarize_completion") +class summarize_completion(BaseType): + def __init__(self) -> None: + super().__init__() + + def get_all_templates(self, entry: Dict[str, Any], random_seed: int) -> List[Tuple[str]]: + # applicable to both text completion and summarization: + return [ + ( + "Please complete an article about {domain}: {context_1st_half} {context_2nd_half}{qa_demos}\n\nWhat was this article about?\nAnswer: {title}", + ), + ( + "Here is the first part of an article about {domain}: {context_1st_half}\n\nPlease complete it.\nCompletion: {context_2nd_half}{qa_demos}\n\nWhat was this article about? {title}", + ), + ( + "Explore the initial section of an article on {domain}: {context_1st_half}\n\nProvide the text ending? {context_2nd_half}\n\nPropose a title for this context? {title}{qa_demos}", + ), + ( + "Read the beginning of an article about {domain}: {context_1st_half}\n\nYour task is to add the subsequent part. {context_2nd_half}\n\nBriefly summarize this text. Summary: {title}{qa_demos}", + ), + ( + "In this article snippet about {domain}, you will find the first half: {context_1st_half}\n\nCompose the remaining section: {context_2nd_half}\n\nWrite a title for it.\nTitle: {title}{qa_demos}", + ), + ( + "Take a look at the first part of an article on {domain}: {context_1st_half}\n\nYour challenge is to write the following segment. {context_2nd_half}\n\nWhat is a very short summary of the above text? {title}{qa_demos}", + ), + ( + "Review the initial portion of an article discussing {domain}: {context_1st_half}\n\nWhat would you include in the rest of the article? {context_2nd_half}\n\nWhat is a shorter version of this article?\nShort version: {title}{qa_demos}", + ), + ( + "Consider the opening of an article centered around {domain}: {context_1st_half}\n\nNow, provide the continuation of the article.\nContinuation: {context_2nd_half}\n\nWhat was this article about? {title}{qa_demos}", + ), + ( + "Examine the first segment of an article exploring {domain}: {context_1st_half}\n\nComplete the article? {context_2nd_half}\nCan you generate a short summary of the above paragraph?\nAnswer: {title}{qa_demos}", + ), + ( + "Read the beginning of an article on {domain}: {context_1st_half}\n\nHow would you extend the article? {context_2nd_half}\n\nPlease write a short summary for the above article. 
{title}{qa_demos}", + ), + ] + + def format_single_demo(self, entry: Dict[str, Any], random_seed: int) -> List[Dict[str, str]]: + sents = entry.pop("sents") + template = self.get_template(entry, random_seed) + cut_index = random.Random(random_seed).randint(1, len(sents) - 1) + + entry["context_1st_half"] = "".join(sents[:cut_index]).strip() + entry["context_2nd_half"] = "".join(sents[cut_index:]).strip() + final_demo = self.fill_in_the_template(template, entry) + return final_demo + + +@type_map.add("no_summarize_completion") +class no_summarize_completion(BaseType): + def __init__(self) -> None: + super().__init__() + + def get_all_templates(self, entry: Dict[str, Any], random_seed: int) -> List[Tuple[str]]: + # applicable to having no summarization and no completion + return [ + ("Please answer some questions about the following article:\n{context}\n",), + ("Read this article and answer questions\n{context}\n",), + ("{context}\n",), + ("Answer some questions about this article:\n{context}\n",), + ("Here are some questions about this article: {context}\n",), + ("Article: {context}\n",), + ("Read this article: {context}\n",), + ("Given the rticle: {context}\n",), + ("Context: {context}\n",), + ("Article: {context}\n",), + ("Use this article to answer the questions: {context}\n",), + ("Answer based on context :\n{context}\n",), + ] + + def format_single_demo(self, entry: Dict[str, Any], random_seed: int) -> List[Dict[str, str]]: + sents = entry.pop("sents") + entry["context"] = entry["title"] + "\n" if entry["title"] is not None else "" + + template = self.get_template(entry, random_seed) + + entry["context"] += "".join(sents).strip() + final_demo = self.fill_in_the_template(template, entry) + return final_demo + + +@type_map.add("overall") +class overall(BaseType): + def __init__(self) -> None: + super().__init__() + self.demo_deliminator = "\n\n" + self.intro_deliminators = [ # connect raw text with the followed QAs + ("\nPlease answer some questions about the above article:\n\n",), + ("\nAnswer some questions about the above article :\n\n",), + ("\n\nWhat are the answers to these questions?\n",), + ("\n\nNow answer these questions:\n\n",), + ("\nNow answer the following questions:\n\n",), + ("\n\nWhat are the answers to the questions or completions:\n",), + ("\nHow would one answer these questions in the domain:\n\n",), + ("\n\nUse evidence from the article to answer these questions:\n\n",), + ("\n\nUse this above article to answer the questions:\n",), + ("\nAnswer the following questions based on the article:\n\n",), + ("\nAnswer these questions:\n",), + ("\n\nBased on the above article, answer questions.\n\n",), + ("\nWrite some question-answer pairs about the above article:\n\n",), + ("\nRespond to the following questions based on the above article:\n\n",), + ("\n\nUpon reading the article, answer the following questions:\n\n",), + ("\nEvaluate your understanding of the article by answering the following questions:\n\n",), + ] + + def format_recomprehension( + self, overall_entry: Dict[str, Any], insert_types: List[str] = TYPES + ) -> Tuple[str, Dict[str, Any]]: + qa_demo_list = [] + seed = overall_entry["text_id"] + count_dict: Dict[str, Any] = {} + for type in list(set(insert_types) & set(["nli", "common_reason", "paraphrase", "word2text"])): + type_cls = type_map.cls_dic[type]() + type_examples = [] + count_dict[type] = {} + for subcategory, examples in overall_entry[type].items(): + if len(examples) == 0: + continue + random.Random(seed).shuffle(examples) + type_examples += 
examples[: type_cls.max_subcategory_num] + count_dict[type][subcategory] = len(examples[: type_cls.max_subcategory_num]) + if len(type_examples) == 0: + continue + # ensure examples of one type altogether, to imitate the few-shot setting + + qa_demo_list += [type_cls.format_single_demo(example, seed) for example in type_examples] + + if len(qa_demo_list) > 0: + random.Random(seed).shuffle(qa_demo_list) + intro = random.Random(seed).choice(self.intro_deliminators)[0] + qa_demos: List[Dict[str, str]] = sum(qa_demo_list, []) + qa_demos[0]["content"] = intro + qa_demos[0]["content"] + else: + qa_demos = [] + + def summaize_only(count_dict: Dict[str, int]) -> Tuple[List[Dict[str, str]], Dict[str, int]]: + count_dict["summarize"] = 1 + count_dict["text_completion"] = 0 + overall_cls = summarize() + entry = overall_entry["summarize"] + entry["sents"] = overall_entry["text_completion"]["sents"] + entry["qa_demos"] = qa_demos + entry["spm"] = self.ori_spm + read_compre_demo = overall_cls.format_single_demo(entry, seed) + return read_compre_demo, count_dict + + def completion_only(count_dict: Dict[str, int]) -> Tuple[List[Dict[str, str]], Dict[str, int]]: + count_dict["summarize"] = 0 + count_dict["text_completion"] = 1 + overall_cls = text_completion() + entry = overall_entry["text_completion"] + entry["qa_demos"] = qa_demos + entry["title"] = overall_entry["summarize"]["title"] + entry["spm"] = self.ori_spm + read_compre_demo = overall_cls.format_single_demo(entry, seed) + return read_compre_demo, count_dict + + def summarize_and_completion(count_dict: Dict[str, int]) -> Tuple[List[Dict[str, str]], Dict[str, int]]: + count_dict["summarize"] = 1 + count_dict["text_completion"] = 1 + overall_cls = summarize_completion() + entry = overall_entry["text_completion"] + entry["qa_demos"] = qa_demos + entry["title"] = overall_entry["summarize"]["title"] + entry["spm"] = self.ori_spm + read_compre_demo = overall_cls.format_single_demo(entry, seed) + return read_compre_demo, count_dict + + def no_summarize_or_completion(count_dict: Dict[str, int]) -> Tuple[List[Dict[str, str]], Dict[str, int]]: + count_dict["summarize"] = 0 + count_dict["text_completion"] = 0 + overall_cls = no_summarize_completion() + entry = overall_entry["text_completion"] + entry["qa_demos"] = qa_demos + entry["title"] = overall_entry["summarize"]["title"] + entry["spm"] = self.ori_spm + read_compre_demo = overall_cls.format_single_demo(entry, seed) + return read_compre_demo, count_dict + + if ("summarize" in insert_types and overall_entry["summarize"]["title"] is not None) and ( + "text_completion" in insert_types and len(overall_entry["text_completion"]["sents"]) >= 2 + ): + np.random.seed(seed) + read_func = np.random.choice( # type: ignore + [summaize_only, completion_only, summarize_and_completion, no_summarize_or_completion], # type: ignore + p=[0.4, 0.1, 0.4, 0.1], # type: ignore + ) # type: ignore + elif "summarize" in insert_types and overall_entry["summarize"]["title"] is not None: + np.random.seed(seed) + read_func = np.random.choice([summaize_only, no_summarize_or_completion], p=[0.5, 0.5]) # type: ignore + if "text_completion" in insert_types and len(overall_entry["text_completion"]["sents"]) >= 2: + np.random.seed(seed) + if len(qa_demos) == 0: + read_func = completion_only + else: + read_func = np.random.choice([completion_only, no_summarize_or_completion], p=[0.5, 0.5]) # type: ignore + else: + read_func = no_summarize_or_completion + + return read_func(count_dict) + + +class RegexBasedReadingComprehension: + def 
__init__(self, general_spm: spm.SentencePieceProcessor, domain_spm: spm.SentencePieceProcessor) -> None: + self.inited_type_map = {} + + for type in TYPES: + type_cls = type_map.cls_dic[type]() + type_cls.init_spm(general_spm, domain_spm) + self.inited_type_map[type] = type_cls + + self.overall_cls = type_map.cls_dic["overall"]() + self.overall_cls.init_spm(general_spm, domain_spm) + + # to chunk text to sentences + self.segmenter = Segmenter(language="en", clean=False) + + def generate(self, entry: Dict[str, Any]) -> Dict[str, Any]: + # NOTE: if the context has no title, use the following code: + title = None + context_wo_title = entry["text"] + + # truncate the context to meet the max_seq_len + # context_wo_title = overall_cls.truncate_sentence(context_wo_title, max_len=overall_cls.max_seq_len-200) + context_wo_title_list = self.overall_cls.truncate_sentence( + context_wo_title, max_len=self.overall_cls.max_seq_len - 200 + ) + + read_compre_list = [] + for context_wo_title in context_wo_title_list: + sents = self.segmenter.segment(context_wo_title) + overall_entry = {"text_id": entry["text_id"]} + for type in TYPES: + type_cls = self.inited_type_map[type] + overall_entry[type], mined_num = type_cls.mine( + text=context_wo_title, title=title, sents=copy.deepcopy(sents) + ) + + # create the reading comprehension text + read_compre, count_dict = self.overall_cls.format_recomprehension(copy.deepcopy(overall_entry)) + # count_dict includes the number of comprehension tasks per task type + # you may use `mined_num` and `count_dict` for data analysis + read_compre_list.append(read_compre) + + return {"read_compre": read_compre_list, "file_name": entry["file_name"]} + + def dataset_generator( + self, input_dir_or_file: str, column: Optional[str], workers: int = 1 + ) -> Iterator[Tuple[int, str, str]]: + generator = input_generator(input_dir_or_file, column) + + raw_texts = [] + for text_id, (filename, content) in enumerate(generator): + text = content.strip() + raw_texts.append({"text": text, "text_id": text_id, "file_name": filename}) + + logger.info("transferring raw texts into reading comprehension...") + read_compre = list(process_map(self.generate, raw_texts, max_workers=workers, chunksize=8192)) + + logger.info("saving reading comprehension texts...") + # sort by text_id to align with the order of raw texts + for entry in read_compre: + for index, read_compre_example in enumerate(entry["read_compre"]): + file_name = entry["file_name"] + yield index, file_name, read_compre_example + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("--input", type=str, required=True, help="Directory containing the input files OR a CSV file") + parser.add_argument("--csv_column", type=str, help="Column to read from the CSV file") + parser.add_argument( + "--debug_output_dir", + type=str, + help="directory of the output reading comprehension texts", + ) + parser.add_argument("--general_spm_path", type=str, help="path of the general sentencepiece model", required=True) + parser.add_argument( + "--domain_spm_path", + type=str, + help="path of the domain sentencepiece model", + ) + parser.add_argument( + "--domain_tokenizer_training_text", + type=str, + help="path of the domain sentencepiece model", + ) + parser.add_argument( + "--output_dataset_name", + type=str, + required=True, + help="name of the output dataset", + ) + + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + if os.path.isfile(args.input) and not args.csv_column: + 
raise ValueError("a CSV column must be specified if the input is a file") + + if not (args.domain_spm_path or args.domain_tokenizer_training_text): + # warn user that the domain tokenizer will be created from the input files + logger.warning( + "No domain tokenizer is provided nor explicit file for training domain tokenizer is provided, " + "the domain tokenizer will be created from the input files, " + ) + + if args.domain_tokenizer_training_text: + # train domain tokenizer + domain_spm = create_domain_tokenizer(args.domain_tokenizer_training_text) + elif args.domain_spm_path: + domain_spm = spm.SentencePieceProcessor(model_file=args.domain_spm_path) + else: + domain_spm = create_domain_tokenizer_from_files(args.input, args.csv_column) + + general_spm = spm.SentencePieceProcessor(model_file=args.general_spm_path) + + # get max worker for multi-process + max_workers = min((os.cpu_count() or 1) // 2, 1) + + logger.info(f"max_workers for generation of regex data: {max_workers}") + + rc = RegexBasedReadingComprehension(general_spm, domain_spm) + dataset_generator = rc.dataset_generator(args.input, column=args.csv_column, workers=max_workers) + + if args.debug_output_dir: + in_memory_dataset = [] + + logger.info("saving debug data...") + os.makedirs(args.debug_output_dir, exist_ok=True) + + for index, filename, read_compre_example in dataset_generator: + with open(os.path.join(args.debug_output_dir, f"{filename}_{index}.json"), "w", encoding="utf8") as f: + json.dump({"messages": read_compre_example}, f) + in_memory_dataset.append({"messages": read_compre_example}) + else: + in_memory_dataset = [{"messages": rc_text} for _, _, rc_text in dataset_generator] + + # make dataset from reading comprehension texts + logger.info("making dataset...") + + regex_dataset = datasets.Dataset.from_list(in_memory_dataset) + regex_dataset.save_to_disk(args.output_dataset_name) + + logger.info(f"Done. Dataset saved to disk at {args.output_dataset_name}") + + +if __name__ == "__main__": + main() diff --git a/dalm/datasets/reading_comprehension_generation/synthetic_based.py b/dalm/datasets/reading_comprehension_generation/synthetic_based.py new file mode 100644 index 0000000..eb6da59 --- /dev/null +++ b/dalm/datasets/reading_comprehension_generation/synthetic_based.py @@ -0,0 +1,223 @@ +import argparse +import json +import logging +import os +import pickle +from typing import Any, Dict, Iterator, List, Optional, Tuple + +import torch +from datasets import Dataset +from transformers import Pipeline, pipeline + +from dalm.datasets.reading_comprehension_generation.utils import ( + input_generator, + question_and_answer_extractor, + text_chunker, +) + +logger = logging.getLogger(__name__) + +# ruff: noqa: B006 + +PROMPT = ( + "There are 4 types of reading comprehension tasks. " + "The point of reading comprehension tasks is to be assigned a text and questions to " + "prompt answers so as to test conceptual and procedural knowledge present in the text. " + "The four types of reading comprehension tasks are : 1. complete-the-sentence Q&A TASK " + "2.true/false Q&A TASK (description: a sentence is posed and the user is asked to state " + "the correctness of the statement) 3. frame a sentence with domain specific keywords" + "(these keywords are required to be present in the text) Q&A TASK " + "4. Normal questions and answer Task (description: longform Q&A to test procedural and " + "conceptual knowledge). 
An example of all four tasks given an example text is as follows: " + "\n EXAMPLE TEXT: The insights into the mechanisms of memory consolidation during the sleep " + "processes in human and animal brain led to other biologically inspired approaches. While " + "declarative memories are in the classical picture consolidated by hippocampo-neocortical " + "dialog during NREM phase of sleep, some types of procedural memories were suggested not " + "to rely on the hippocampus and involve REM phase of the sleep. This inspired models where " + "internal representations (memories) created by previous learning are spontaneously replayed " + "during sleep-like periods in the network itself (i.e. without help of secondary network " + "performed by generative replay approaches mentioned above).\n" + "Question: [type: true/false] Is the following sentence true? all types of procedural " + "memories rely on the hippocampus\n" + "Answer: False. The text clearly states there are some types of procedural memories not " + "reliant on the hippocampus\n--------\n" + "Question [type: complete-the-sentence] Complete the following sentence: The insights into " + "____ in human and animal brain led to other _____ approaches\n" + "Answer: The insights into the mechanisms of memory consolidation during the sleep processes " + "in human and animal brain led to other biologically inspired approaches\n------\n" + "Question [type 3 domain-keywords] Make a sentence with the following keywords " + "'hippocampo-neocortical', 'declarative' and 'NREM'\n" + "Answer: declarative memories are in the classical picture consolidated by " + "hippocampo-neocortical dialog during NREM phase of sleep\n-------\n" + "Question [type: normal q&a] Some types of procedural memories were suggested not to rely on " + "the hippocampus and involve REM phase of the sleep. What did this go on to inspire?\n" + "Answer This inspired models where internal representations (memories) created by previous " + "learning are spontaneously replayed during sleep-like periods in the network itself [END OF " + "EXAMPLE]\n\n " + "Similar to the above, could you craft 4 different reading comprehension tasks (make sure " + "your output is a list of question answer pairs and each question is labelled QUESTION and " + "answer is labelled ANSWER and there is one question and answer per task) based solely and " + "completely focused on the following TEXT: " +) + + +def gen_prompt(text: str) -> List[Dict[str, str]]: + prompt = PROMPT + text + + return [ + { + "role": "system", + "content": ( + "You are a helpful and meticulous instruction following question and answer making chatbot. 
" + "Please refrain from acknowledgments, additions or niceties of any sort" + ), + }, + {"role": "user", "content": prompt}, + ] + + +def generate_synthetic_data(model_pipeline: Pipeline, text: str, generation_params: Dict[str, Any]) -> str: + prompt = gen_prompt(text) + prompt = model_pipeline.tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True) + outputs = model_pipeline(prompt, **generation_params) + + return outputs[0]["generated_text"] + + +def generate_synthetic_dataset( + model_name: str, + input_directory_or_file: str, + csv_column: Optional[str], + processed_files: List[str], + chunk: bool, + context_length: int, + generation_params: Dict[str, Any] = { + "max_new_tokens": 600, + "do_sample": True, + "temperature": 0.7, + "top_k": 5, + "top_p": 0.95, + "return_full_text": False, + }, +) -> Iterator[Tuple[int, str, str, str]]: + model_pipeline = pipeline("text-generation", model=model_name, torch_dtype=torch.bfloat16, device_map="auto") + + input_files = input_generator(input_directory_or_file, csv_column) + + if chunk: + tokenizer = model_pipeline.tokenizer + tokens = tokenizer.apply_chat_template(gen_prompt(""), tokenize=False, add_generation_prompt=True) + CONSTANT = len(tokenizer(tokens)["input_ids"]) + k = context_length - CONSTANT + + for file, text in input_files: + if file in processed_files: + continue + + if chunk: + for index, chunk_ in enumerate(text_chunker(text, tokenizer, k)): + gen_text = generate_synthetic_data(model_pipeline, chunk_, generation_params) + yield index, file, chunk_, gen_text + else: + gen_text = generate_synthetic_data(model_pipeline, text, generation_params) + yield 0, file, text, gen_text + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser("Generate synthetic dataset for reading comprehension") + parser.add_argument("--model_name", type=str, default="HuggingFaceH4/zephyr-7b-beta") + parser.add_argument("--input", type=str, required=True, help="Directory containing the input files OR a CSV file") + parser.add_argument("--csv_column", type=str, help="Column to read from the CSV file") + parser.add_argument( + "--output_directory", + type=str, + required=True, + help="Directory to save the generated files (serves as intermediate step and for debugging purposes)", + ) + parser.add_argument( + "--state_file", + type=str, + required=False, + default="rc_generation_state.pkl", + help="File to save the state of the generation in order to support resume functionality", + ) + parser.add_argument("--context_length", type=int, default=4096, help="context length to calculate the chunk size") + parser.add_argument("--no_chunk", action="store_false") + parser.add_argument( + "--dataset_name", type=str, default="synthetic_rc_dataset", help="name of the dataset to be saved" + ) + + return parser.parse_args() + + +def main() -> None: + args = parse_args() + """ + Pipeline here includes chunking, generation and parsing of question and answer into a list of exchanges + that can be used directly for training + """ + + if os.path.isfile(args.input) and not args.csv_column: + raise ValueError("a CSV column must be specified if the input is a file") + + if args.state_file: + if os.path.exists(args.state_file): + with open(args.state_file, "rb") as f: + state = pickle.load(f) + else: + state = {"processed_files": []} + pickle.dump(state, open(args.state_file, "wb")) + + if not os.path.exists(args.output_directory): + os.makedirs(args.output_directory) + + files_missed = 0 + total_files = 0 + + 
synth_dataset_generator = generate_synthetic_dataset( + model_name=args.model_name, + input_directory_or_file=args.input, + processed_files=state["processed_files"] if args.state_file else [], + chunk=args.no_chunk, + context_length=args.context_length, + csv_column=args.csv_column, + ) + + for index, filename, context, gen_text in synth_dataset_generator: + state["processed_files"].append(filename) + pickle.dump(state, open(args.state_file, "wb")) + qanda = question_and_answer_extractor(gen_text, context) + if qanda: + output_file = f"{filename}_{index}.json" + with open(os.path.join(args.output_directory, output_file), "w") as o: + json.dump(qanda, o) + else: + logger.warning( + (f"No question and answer pairs found for {filename} " f"chunk: {index}" if not args.no_chunk else "") + ) + files_missed += 1 + total_files += 1 + + unit = "files" if args.no_chunk else "chunks" + + logger.info(" Statistics ") + logger.info(f"Total number of successfully extracted q&a {unit}: {total_files - files_missed}") + logger.info(f"Total {unit} missed: {files_missed} out of {total_files}") + + in_memory_dataset = [] + for file in os.listdir(args.output_directory): + with open(os.path.join(args.output_directory, file), "r") as f: + in_memory_dataset.append({"messages": json.load(f)}) + + dataset = Dataset.from_list(in_memory_dataset) + dataset.save_to_disk(args.dataset_name) + + logger.info("Done generating synthetic dataset") + logger.info(f"Dataset saved to {args.dataset_name}") + + if args.state_file: + os.remove(args.state_file) + + +if __name__ == "__main__": + main() diff --git a/dalm/datasets/reading_comprehension_generation/utils.py b/dalm/datasets/reading_comprehension_generation/utils.py new file mode 100644 index 0000000..af019d7 --- /dev/null +++ b/dalm/datasets/reading_comprehension_generation/utils.py @@ -0,0 +1,210 @@ +import csv +import logging +import os +import re +import tempfile +from typing import Dict, Iterator, List, Optional, Tuple + +import sentencepiece as spm # type: ignore[import-untyped] +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +logger = logging.getLogger(__name__) + + +def input_generator(directory_or_file: str, csv_column: Optional[str] = None) -> Iterator[Tuple[str, str]]: + """ + Generator that yields the contents of the files in the directory or the CSV column. 
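+ Yields (identifier, text) tuples: for a CSV the identifier is the file name plus the row index,
+ for a plain text file it is simply the file name.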
+ """ + + if os.path.isfile(directory_or_file): + if directory_or_file.endswith(".csv") and csv_column: + # Process a single CSV file + yield from process_csv_file(directory_or_file, csv_column) + elif not csv_column: + # Process a single non-CSV file + yield from process_plain_file(directory_or_file) + else: + raise ValueError("CSV column specified for non-CSV file") + + elif os.path.isdir(directory_or_file): + # Process each file in the directory + for file in os.listdir(directory_or_file): + file_path = os.path.join(directory_or_file, file) + if file_path.endswith(".csv") and csv_column: + yield from process_csv_file(file_path, csv_column) + elif not file_path.endswith(".csv"): + yield from process_plain_file(file_path) + + else: + raise ValueError("The input should be a directory or a file.") + + +def process_csv_file(file_path: str, csv_column: str) -> Iterator[Tuple[str, str]]: + """Process a single CSV file.""" + with open(file_path, newline="", encoding="utf-8") as csvfile: + reader = csv.DictReader(csvfile) + for index, row in enumerate(reader): + yield os.path.basename(file_path) + str(index), row[csv_column] + + +def process_plain_file(file_path: str) -> Iterator[Tuple[str, str]]: + """Process a single plain text file.""" + try: + with open(file_path, "r", encoding="utf-8") as file_contents: + contents = file_contents.read() + except UnicodeDecodeError: + with open(file_path, "r", encoding="utf-8", errors="replace") as file_contents: + contents = file_contents.read() + yield os.path.basename(file_path), contents + + +def text_chunker(text: str, tokenizer: PreTrainedTokenizerBase, chunk_size: int) -> Iterator[str]: + tokens = tokenizer(text, return_tensors="pt")["input_ids"] + for i in range(0, tokens.shape[1], chunk_size): + chunk = tokens[:, i : i + chunk_size] + chunk = tokenizer.decode(chunk[0], skip_special_tokens=True) + yield chunk + + +# standalone +def files_chunker(input_directory: str, model: str, context_length: int, output_directory: str, prompt: str) -> None: + tokenizer = AutoTokenizer.from_pretrained(model) + + tokens = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True) + CONSTANT = len(tokenizer(tokens)["input_ids"]) + + k = context_length - CONSTANT + + for filename, text in input_generator(input_directory): + extension = filename.split(".")[-1] + output_file_name = filename.split(".")[0] + for index, chunk in enumerate(text_chunker(text, tokenizer, k)): + output_file = f"{output_file_name}_{index}.{extension}" + with open(os.path.join(output_directory, output_file), "w") as o: + o.write(chunk) + + +def create_domain_tokenizer(text_file: str) -> spm.SentencePieceProcessor: + """ + train and return domain tokenizer + """ + with tempfile.TemporaryDirectory() as temp_dir: + # Define the model prefix with the path to the temp directory + model_prefix = f"{temp_dir}/domain" + + # Train the SentencePiece model, the model is saved in the temporary directory + exit_tries = False + vocab_size = 32000 + # hack to get around that the vocab size is too large and has to be set manually + # TODO: please figure out if mathematically a lower number of vocab is suitable for our case + while not exit_tries: + try: + spm.SentencePieceTrainer.train( + input=text_file, model_prefix=model_prefix, vocab_size=vocab_size, character_coverage=1.0 + ) + exit_tries = True + except RuntimeError as e: + error_message = str(e) + if error_message.startswith("Internal: src/trainer_interface.cc(661)"): + logger.warning(f"Vocab size of {vocab_size} is too large, 
decreasing it ...") + vocab_size = int( + error_message.split()[-1][:-1] + ) # error message ends with the recommended vocab and a period + logger.warning(f"Attempting with vocab size of {vocab_size}") + else: + raise e + + sp_model_file = f"{model_prefix}.model" + return spm.SentencePieceProcessor(model_file=sp_model_file) + + +def split_to_sentences(text: str) -> List[str]: + sentences = re.split(r"[.?!]\s+", text) + + return sentences + + +def create_domain_tokenizer_from_files(directory_or_file: str, csv_column: Optional[str]) -> spm.SentencePieceProcessor: + # open a tempfile and add sentences from files in directory_with_files to it + with tempfile.TemporaryDirectory() as temp_dir: + with open(os.path.join(temp_dir, "temp.txt"), "w", encoding="utf-8") as tfile: + generator = input_generator(directory_or_file, csv_column) + for _, text in generator: + sentences = split_to_sentences(text) + + for sentence in sentences: + sentence = sentence.strip() + if sentence and sentence != "": + tfile.write(sentence + "\n") + + return create_domain_tokenizer(os.path.join(temp_dir, "temp.txt")) + + +def fix_first_prompt(text: str, chat_chain: List[Dict[str, str]]) -> List[Dict[str, str]]: + # remove the first prompt + first_prompt = chat_chain.pop(0) + fixed_first_prompt = [ + { + "content": f"Based on the following text: \n {text}, \n I'd like you to answer a few questions\n" + + first_prompt["content"], + "role": "user", + } + ] + return fixed_first_prompt + chat_chain + + +# TODO: add test +# TODO: Address known issues described in #78 +def question_and_answer_extractor(whole_text: str, context: str) -> List[Dict[str, str]] | None: + text_lines = whole_text.split("\n") + question: List[str] = [] + answer: List[str] = [] + + question_context = False + answer_context = False + + result = [] + task_regex = r"^\*?\*?task\s*\d*" + + # question regex + question_regex = r"^question\s*\d*" + + # answer regex + answer_regex = r"^answer\s*\d*" + + for i in text_lines: + raw_text = i.strip() + text = raw_text.lower() + + # ignore empty lines + if text == "": + continue + + # if the line start matches the question regex or the task regex + if re.match(question_regex, text) or re.match(task_regex, text): + if answer_context: + result.append({"content": " ".join(question), "role": "user"}) + result.append({"content": " ".join(answer), "role": "assistant"}) + question = [] + answer = [] + answer_context = False + + question_context = True + answer_context = False + + if re.match(answer_regex, text): + question_context = False + answer_context = True + + if question_context: + # remove (labelled as QUESTION and ANSWER) from the text + raw_text = re.sub(r"\(labelled as QUESTION and ANSWER\)", "", raw_text) + question.append(raw_text) + + if answer_context: + answer.append(raw_text) + + if result == []: + return None + + return fix_first_prompt(context, result) diff --git a/dalm/pipelines/README.md b/dalm/pipelines/README.md new file mode 100644 index 0000000..f43ab39 --- /dev/null +++ b/dalm/pipelines/README.md @@ -0,0 +1,16 @@ +## Reading comprehension Pipeline + +This is an example of how to string together an end to end reaading-comprehension dataset generator and training regiment in one go + +It should be noted that this not the last say, if you the user think there's a better of stringing these together, the components +are atomic enough for you to string them in other ways that you may deem more appropriate + +```bash +python dalm/pipelines/reading_comprehension_pipeline.py --model_name 
HuggingFaceH4/zephyr-7b-alpha \ + --input input.csv --csv_column text \ + --output_dataset_name combined \ + --general_spm_path tokenizers/general.spm \ + --llm_synth_model_name meta-llama/Llama-2-13b-chat-hf \ + --llm_synth_model_context_length 4096 + +``` \ No newline at end of file diff --git a/dalm/pipelines/reading_comprehension_pipeline.py b/dalm/pipelines/reading_comprehension_pipeline.py new file mode 100644 index 0000000..6e366ba --- /dev/null +++ b/dalm/pipelines/reading_comprehension_pipeline.py @@ -0,0 +1,388 @@ +import argparse +import json +import logging +import os +import pickle +import random +from dataclasses import dataclass +from enum import Enum +from typing import Optional + +import datasets +import sentencepiece as spm # type: ignore[import-untyped] + +from dalm.datasets.reading_comprehension_generation.regex_based import RegexBasedReadingComprehension +from dalm.datasets.reading_comprehension_generation.synthetic_based import generate_synthetic_dataset +from dalm.datasets.reading_comprehension_generation.utils import ( + create_domain_tokenizer_from_files, + question_and_answer_extractor, +) +from dalm.training.generator_only.trainer import train_generator + +logger = logging.getLogger(__name__) + + +class SynthMode(Enum): + REGEX = "regex" + LLM = "llm" + BOTH = "both" + + +@dataclass +class LLMKwargs: + model_name: str + context_length: Optional[int] + dataset_output_path: str + chunk: bool + + def __post_init__(self) -> None: + if self.chunk and not self.context_length: + raise ValueError("context_length is required for chunking") + + +@dataclass +class SynthKwargs: + general_spm_path: str + domain_spm_path: Optional[str] + + +def pipeline( + model_name: str, + output_dataset_name: str, + input: str, + model_output_dir: str, + log_with: Optional[str], + llm_kwargs: Optional[LLMKwargs], + synth_kwargs: Optional[SynthKwargs], + csv_column: Optional[str], + size_valid_set: Optional[int], + comprehension_type: SynthMode, + shuffle_buffer: Optional[int], + generation_state_file: str = "generation_state.pkl", + num_train_epochs: int = 1, + split: str = "train", + streaming: bool = False, + seq_length: int = 2600, + num_workers: int = 4, + eval_steps: int = 200, + logging_steps: int = 1000, + per_device_train_batch_size: int = 1, + per_device_eval_batch_size: int = 1, + gradient_accumulation_steps: int = 1, + gradient_checkpointing: bool = True, + group_by_length: bool = False, + packing: bool = True, + lora_alpha: int = 512, + lora_dropout: float = 0.05, + lora_r: int = 256, + learning_rate: float = 5e-5, + lr_scheduler_type: str = "cosine", + num_warmup_steps: int = 0, + weight_decay: float = 0.0, + optimizer_type: str = "paged_adamw_32bit", + neftune_noise_alpha: int = 5, + run_name: str = "rc_pipeline", + validation_split: Optional[float] = 0.05, +) -> None: + if comprehension_type in [SynthMode.LLM, SynthMode.BOTH]: + if not llm_kwargs: + raise ValueError("llm_kwargs is required for LLM based generation") + + if comprehension_type in [SynthMode.REGEX, SynthMode.BOTH]: + if not synth_kwargs: + raise ValueError("synth_kwargs is required for regex based generation") + + if synth_kwargs and synth_kwargs.domain_spm_path: + domain_spm = spm.SentencePieceProcessor(model_file=synth_kwargs.domain_spm_path) + else: + logger.warning("No domain tokenizer provided. 
The domain tokenizer will be created from the input files") + domain_spm = create_domain_tokenizer_from_files(input, csv_column=csv_column) + + general_spm = spm.SentencePieceProcessor(model_file=synth_kwargs.general_spm_path) + + in_memory_dataset = [] + + # generate regex based reading comprehension dataset + if comprehension_type in [SynthMode.REGEX, SynthMode.BOTH]: + # generate regex based reading comprehension dataset + regex_rc_gen = RegexBasedReadingComprehension(general_spm, domain_spm) + + # NOTE: this is a simple check to see if the dataset is already generated + in_memory_dataset.extend( + [{"messages": rc_text} for _, _, rc_text in regex_rc_gen.dataset_generator(input, csv_column)] + ) + + # NOTE: this operation is time consuming and very expensive + # Attention has been paid to try to save intermediate steps in case of failure + # so that the generation can be resumed from the last checkpoint + if comprehension_type in [SynthMode.LLM, SynthMode.BOTH] and llm_kwargs: + if generation_state_file: + if os.path.exists(generation_state_file): + with open(generation_state_file, "rb") as f: + generation_state = pickle.load(f) + else: + generation_state = {"processed_texts": [], "total_texts": 0, "texts_missed": 0} + pickle.dump(generation_state, open(generation_state_file, "wb")) + + if not os.path.exists(llm_kwargs.dataset_output_path): + os.makedirs(llm_kwargs.dataset_output_path) + + llm_rc_dataset_generator = generate_synthetic_dataset( + model_name=llm_kwargs.model_name, + input_directory_or_file=input, + processed_files=generation_state["processed_texts"], + chunk=llm_kwargs.chunk or False, + context_length=llm_kwargs.context_length or 0, + csv_column=csv_column, + ) + + # generate llm based reading comprehension dataset + for index, text_identifier, context, gen_text in llm_rc_dataset_generator: + qanda = question_and_answer_extractor(gen_text, context) + if qanda: + output_file = f"{text_identifier}_{index}.json" + with open(os.path.join(llm_kwargs.dataset_output_path, output_file), "w") as o: + json.dump(qanda, o) + else: + logger.warning( + ( + f"No question and answer pairs found for {text_identifier} " f"chunk: {index}" + if llm_kwargs.chunk + else "" + ) + ) + generation_state["texts_missed"] += 1 + generation_state["processed_texts"].append(text_identifier) + generation_state["total_texts"] += 1 + pickle.dump(generation_state, open(generation_state_file, "wb")) + + logger.info(" Statistics ") + success_files_count = generation_state["total_texts"] - generation_state["texts_missed"] + logger.info(f"Total number of successfully extracted q&a: {success_files_count}") + logger.info(f"Total texts missed: {generation_state['texts_missed']} out of {generation_state['total_texts']}") + logger.info(f"Total texts processed: {generation_state['total_texts']}") + + for file in os.listdir(llm_kwargs.dataset_output_path): + with open(os.path.join(llm_kwargs.dataset_output_path, file), "r") as f: + in_memory_dataset.append({"messages": json.load(f)}) + + if in_memory_dataset == []: + raise ValueError("No dataset generated") + + # shuffle in memory dataset + random.shuffle(in_memory_dataset) + + dataset = datasets.Dataset.from_list(in_memory_dataset) + + dataset.save_to_disk(output_dataset_name) + + train_generator( + model_name=model_name, + dataset_name=output_dataset_name, + num_train_epochs=num_train_epochs, + split=split, + size_valid_set=size_valid_set, + streaming=streaming, + shuffle_buffer=shuffle_buffer, + seq_length=seq_length, + num_workers=num_workers, + 
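+ # the training hyperparameters below feed the TrainingArguments and SFTTrainer
+ # configured in dalm/training/generator_only/trainer.py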
eval_steps=eval_steps, + logging_steps=logging_steps, + per_device_train_batch_size=per_device_train_batch_size, + per_device_eval_batch_size=per_device_eval_batch_size, + gradient_accumulation_steps=gradient_accumulation_steps, + gradient_checkpointing=gradient_checkpointing, + group_by_length=group_by_length, + packing=packing, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + lora_r=lora_r, + learning_rate=learning_rate, + lr_scheduler_type=lr_scheduler_type, + num_warmup_steps=num_warmup_steps, + weight_decay=weight_decay, + optimizer_type=optimizer_type, + output_dir=model_output_dir, + neftune_noise_alpha=neftune_noise_alpha, + log_with=log_with, + local_dataset=True, + validation_split=validation_split, + run_name=run_name, + ) + + if generation_state_file: + os.remove(generation_state_file) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument( + "--model_name", type=str, default="HuggingFaceH4/zephyr-7b-alpha", help="name of the model to be trained" + ) + parser.add_argument( + "--output_dataset_name", + type=str, + required=True, + help="name of the dataset that the generated data will be saved to", + ) + parser.add_argument( + "--comprehension_type", + type=SynthMode, + default=SynthMode.LLM, + choices=list(SynthMode), + help="type of comprehension to be generated", + ) + parser.add_argument( + "--llm_synth_model_name", + type=str, + default="HuggingFaceH4/zephyr-7b-beta", + help="name of the model to be used for LLM based generation", + ) + parser.add_argument( + "--llm_synth_model_context_length", type=int, default=4096, help="context length to calulcate the chunk size" + ) + parser.add_argument( + "--llm_dataset_output_path", + type=str, + default="llm_dataset", + help="path to save the generated LLM based dataset", + ) + parser.add_argument( + "--general_spm_path", + type=str, + default="./resources/general.spm", + help="path to the general tokenizer (needed for regex based generation)", + ) + parser.add_argument( + "--domain_spm_path", + type=str, + default=None, + help=( + "path to the domain tokenizer used for regex based generation." + "If None is provided (default), it will be automatically generated." 
+ ), + ) + parser.add_argument( + "--input", type=str, required=True, help="A CSV file OR a directory containing the CSV input files" + ) + parser.add_argument("--csv_column", type=str, help="Column to read from the CSV file") + parser.add_argument("--no_chunk", action="store_true", help="whether to NOT chunk the input files or not") + parser.add_argument("--num_train_epochs", type=int, default=1, help="number of epochs to train the generator") + parser.add_argument("--split", type=str, default="train", help="split to be used for training") + parser.add_argument("--size_valid_set", type=int, default=1000, help="size of the validation set (STREAMING ONLY)") + parser.add_argument("--validation_split", type=float, default=0.05, help="validation split") + parser.add_argument("--streaming", action="store_true", help="whether to use streaming or not") + parser.add_argument("--shuffle_buffer", type=int, default=10000, help="shuffle buffer size (STREAMING ONLY)") + parser.add_argument("--seq_length", type=int, default=2600, help="sequence length to be used for training") + parser.add_argument( + "--num_workers", type=int, default=4, help="number of workers to be used for data loading during training" + ) + parser.add_argument("--eval_steps", type=int, default=200, help="number of steps to evaluate the model") + parser.add_argument("--logging_steps", type=int, default=1000, help="number of steps to log the model") + parser.add_argument("--per_device_train_batch_size", type=int, default=1, help="batch size to be used for training") + parser.add_argument( + "--per_device_eval_batch_size", type=int, default=1, help="batch size to be used for evaluation" + ) + parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="gradient accumulation steps") + parser.add_argument( + "--no_gradient_checkpointing", action="store_true", help="whether to disable gradient checkpointing" + ) + parser.add_argument("--group_by_length", action="store_true", help="whether to group the dataset by length or not") + parser.add_argument("--no_packing", action="store_true", help="whether to disable packing or not") + parser.add_argument("--lora_alpha", type=int, default=512, help="lora alpha") + parser.add_argument("--lora_dropout", type=float, default=0.05, help="lora dropout") + parser.add_argument("--lora_r", type=int, default=256, help="lora r") + parser.add_argument("--learning_rate", type=float, default=1e-4) + parser.add_argument("--lr_scheduler_type", type=str, default="cosine") + parser.add_argument("--num_warmup_steps", type=int, default=100) + parser.add_argument("--weight_decay", type=float, default=0.05) + parser.add_argument("--optimizer_type", type=str, default="paged_adamw_32bit") + parser.add_argument("--model_output_dir", type=str, default="model_output_dir") + parser.add_argument("--neftune_noise_alpha", type=int, default=5) + parser.add_argument( + "--log_with", + type=str, + default="none", + help="tracker backend to be used", + ) + parser.add_argument( + "--generation_state_file", type=str, default="generation_state.pkl", help="file to save the generation state to" + ) + parser.add_argument("--run_name", type=str, default="rc_pipeline", help="name of the run") + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + if os.path.isfile(args.input) and not args.csv_column: + raise ValueError("a CSV column must be specified if the input is a file") + + llm_kwargs = None + synth_kwargs = None + + if args.comprehension_type in [SynthMode.LLM, SynthMode.BOTH]: + if not 
args.llm_synth_model_name: + raise ValueError("llm_synth_model_name is required for LLM based generation") + + if not args.llm_dataset_output_path: + raise ValueError("llm_dataset_output_path is required for LLM based generation") + + llm_kwargs = LLMKwargs( + model_name=args.llm_synth_model_name, + context_length=args.llm_synth_model_context_length, + dataset_output_path=args.llm_dataset_output_path, + chunk=not args.no_chunk, + ) + + if args.comprehension_type in [SynthMode.REGEX, SynthMode.BOTH]: + if not args.general_spm_path: + raise ValueError("general_spm_path is required for regex based generation") + + synth_kwargs = SynthKwargs( + general_spm_path=args.general_spm_path, + domain_spm_path=args.domain_spm_path, + ) + + pipeline( + model_name=args.model_name, + output_dataset_name=args.output_dataset_name, + comprehension_type=args.comprehension_type, + input=args.input, + num_train_epochs=args.num_train_epochs, + split=args.split, + size_valid_set=args.size_valid_set, + streaming=args.streaming, + shuffle_buffer=args.shuffle_buffer, + seq_length=args.seq_length, + num_workers=args.num_workers, + eval_steps=args.eval_steps, + logging_steps=args.logging_steps, + per_device_train_batch_size=args.per_device_train_batch_size, + per_device_eval_batch_size=args.per_device_eval_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + gradient_checkpointing=not args.no_gradient_checkpointing, + group_by_length=args.group_by_length, + packing=not args.no_packing, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + lora_r=args.lora_r, + learning_rate=args.learning_rate, + lr_scheduler_type=args.lr_scheduler_type, + num_warmup_steps=args.num_warmup_steps, + weight_decay=args.weight_decay, + optimizer_type=args.optimizer_type, + model_output_dir=args.model_output_dir, + neftune_noise_alpha=args.neftune_noise_alpha, + log_with=args.log_with, + generation_state_file=args.generation_state_file, + llm_kwargs=llm_kwargs, + synth_kwargs=synth_kwargs, + validation_split=args.validation_split, + run_name=args.run_name, + csv_column=args.csv_column, + ) + + +if __name__ == "__main__": + main() diff --git a/dalm/training/generator_only/trainer.py b/dalm/training/generator_only/trainer.py new file mode 100644 index 0000000..c18270c --- /dev/null +++ b/dalm/training/generator_only/trainer.py @@ -0,0 +1,300 @@ +import argparse +import logging +import os +from typing import Any, Callable, Dict, Optional, Tuple + +import torch +from accelerate import Accelerator +from datasets import Dataset, load_dataset, load_from_disk +from peft import LoraConfig +from tqdm import tqdm +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + PreTrainedTokenizerBase, + TrainingArguments, +) +from trl import SFTTrainer # type: ignore[import] + +logger = logging.getLogger(__name__) + + +def create_datasets( + dataset_name: str, + split: str, + validation_split: Optional[float], + size_valid_set: Optional[int], + streaming: bool, + shuffle_buffer: Optional[int], + num_workers: int, + local_dataset: bool = False, +) -> Tuple[Dataset, Dataset]: + if local_dataset: + dataset = load_from_disk( + dataset_name, + ) + else: + dataset = load_dataset( + dataset_name, + split=split, + num_proc=num_workers if not streaming else None, + streaming=streaming, + ) + if streaming: + logging.info("Loading the dataset in streaming mode") + if not (shuffle_buffer and size_valid_set): + raise ValueError("size_valid_set must be set when streaming is enabled") + valid_data = 
dataset.take(size_valid_set) + train_data = dataset.skip(size_valid_set) + train_data = train_data.shuffle(buffer_size=shuffle_buffer, seed=None) + else: + if not validation_split: + raise ValueError("validation_split must be set when streaming is disabled") + dataset = dataset.train_test_split(test_size=validation_split, seed=None) + train_data = dataset["train"] + valid_data = dataset["test"] + logging.info(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}") + + return train_data, valid_data + + +def chars_token_ratio( + dataset: Dataset, + tokenizer: PreTrainedTokenizerBase, + formatting_func: Callable[[Dict[str, Any]], str], + sample_size: int = 400, +) -> float: + """ + Estimate the average number of characters per token in the dataset. + """ + total_characters, total_tokens = 0, 0 + for _, example in tqdm(zip(range(sample_size), iter(dataset), strict=False), total=sample_size): + text = formatting_func(example) + total_characters += len(text) + if tokenizer.is_fast: + total_tokens += len(tokenizer(text).tokens()) + else: + total_tokens += len(tokenizer.tokenize(text)) + + return total_characters / total_tokens + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument("--model_name", type=str, default="HuggingFaceH4/zephyr-7b-beta", help="the model name") + parser.add_argument("--log_with", type=str, help="tracker name (wandb, mlflow, ..etc)") + parser.add_argument( + "--dataset_name", + type=str, + required=True, + help=( + "The dataset name corresponding to one sitting on huggingface or a local one" + "If local, be sure to set the local_dataset flag" + ), + ) + parser.add_argument("--local_dataset", action="store_true", help="whether to use a local dataset") + parser.add_argument("--split", type=str, default="train", help="the split to use") + parser.add_argument( + "--size_valid_set", type=int, default=4000, help="the size of the validation set (when streaming is enabled)" + ) + parser.add_argument("--validation_split", type=float, default=0.05, help="the validation split percentage") + parser.add_argument("--stream", action="store_true", help="whether to stream the dataset") + parser.add_argument("--shuffle_buffer", type=int, default=5000, help="the shuffle buffer size") + parser.add_argument("--seq_length", type=int, default=2600, help="the sequence length") + parser.add_argument("--num_workers", type=int, default=4, help="the number of workers") + + parser.add_argument("--eval_steps", type=int, default=200, help="the evaluation frequency") + parser.add_argument("--num_train_epochs", type=int, default=3, help="the number of training epochs") + parser.add_argument("--logging_steps", type=int, default=10, help="the logging frequency") + parser.add_argument("--per_device_train_batch_size", type=int, default=1, help="the per device train batch size") + parser.add_argument("--per_device_eval_batch_size", type=int, default=1, help="the per device eval batch size") + parser.add_argument("--gradient_accumulation_steps", type=int, default=32, help="the gradient accumulation steps") + parser.add_argument( + "--gradient_checkpointing", type=bool, default=True, help="whether to use gradient checkpointing" + ) + parser.add_argument("--group_by_length", type=bool, default=False, help="whether to group by length") + parser.add_argument("--no-packing", action="store_true", help="whether to not pack the sequences") + + parser.add_argument("--lora_alpha", type=float, default=512, help="the lora alpha parameter") 
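+ # LoRA adapter hyperparameters; the defaults (alpha=512, r=256, dropout=0.05) match the ones used by the reading comprehension pipeline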
+ parser.add_argument("--lora_dropout", type=float, default=0.05, help="the lora dropout parameter") + parser.add_argument("--lora_r", type=int, default=256, help="the lora r parameter") + + parser.add_argument("--learning_rate", type=float, default=1e-4, help="the learning rate") + parser.add_argument("--lr_scheduler_type", type=str, default="cosine", help="the lr scheduler type") + parser.add_argument("--num_warmup_steps", type=int, default=100, help="the number of warmup steps") + parser.add_argument("--weight_decay", type=float, default=0.05, help="weight decay") + parser.add_argument("--optimizer_type", type=str, default="paged_adamw_32bit", help="the optimizer type") + + parser.add_argument( + "--output_dir", + type=str, + default="./generator_finetuned_model", + help="the output directory where the model will be saved", + ) + parser.add_argument("--neftune_noise_alpha", type=int, default=5, help="the noise alpha for neftune") + parser.add_argument("--run_name", type=str, default="generator_finetuning", help="the tracker run name") + return parser.parse_args() + + +def train_generator( + model_name: str, + dataset_name: str, + local_dataset: bool, + run_name: str, + output_dir: str, + log_with: Optional[str], + size_valid_set: Optional[int], + validation_split: Optional[float], + shuffle_buffer: Optional[int], + num_train_epochs: int = 1, + split: str = "train", + streaming: bool = False, + seq_length: int = 2600, + num_workers: int = 4, + eval_steps: int = 200, + logging_steps: int = 10, + per_device_train_batch_size: int = 1, + per_device_eval_batch_size: int = 1, + gradient_accumulation_steps: int = 1, + gradient_checkpointing: bool = True, + group_by_length: bool = False, + packing: bool = True, + lora_alpha: int = 512, + lora_dropout: float = 0.05, + lora_r: int = 256, + learning_rate: float = 1e-4, + lr_scheduler_type: str = "cosine", + num_warmup_steps: int = 100, + weight_decay: float = 0.05, + optimizer_type: str = "paged_adamw_32bit", + neftune_noise_alpha: int = 5, +) -> None: + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training + + bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16) + + base_model = AutoModelForCausalLM.from_pretrained( + model_name, + quantization_config=bnb_config, + device_map={"": Accelerator().local_process_index}, + trust_remote_code=True, + ) + + base_model.config.use_cache = False + + peft_config = LoraConfig( + r=lora_r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + target_modules=["q_proj", "v_proj"], + bias="none", + task_type="CAUSAL_LM", + ) + + training_args = TrainingArguments( + output_dir=output_dir, + per_device_train_batch_size=per_device_train_batch_size, + gradient_accumulation_steps=gradient_accumulation_steps, + gradient_checkpointing=gradient_checkpointing, + per_device_eval_batch_size=per_device_eval_batch_size, + learning_rate=learning_rate, + logging_steps=logging_steps, + num_train_epochs=num_train_epochs, + report_to=log_with, + save_strategy="epoch", + evaluation_strategy="steps", + eval_steps=eval_steps, + group_by_length=group_by_length, + lr_scheduler_type=lr_scheduler_type, + warmup_steps=num_warmup_steps, + optim=optimizer_type, + bf16=True, + remove_unused_columns=False, + run_name=run_name, + weight_decay=weight_decay, + neftune_noise_alpha=neftune_noise_alpha, + ) + + def 
prepare_sample_text(example: Dict[str, Any]) -> str: + """Prepare the text from a sample of the dataset.""" + text = tokenizer.apply_chat_template(example["messages"], tokenize=False) + return text + + train_dataset, eval_dataset = create_datasets( + dataset_name=dataset_name, + split=split, + validation_split=validation_split, + size_valid_set=size_valid_set, + streaming=streaming, + shuffle_buffer=shuffle_buffer, + num_workers=num_workers, + local_dataset=local_dataset, + ) + + chars_per_token = chars_token_ratio(train_dataset, tokenizer, prepare_sample_text) + logging.info(f"The character to token ratio of the dataset is: {chars_per_token:.2f}") + + logging.info("Starting the model training") + + trainer = SFTTrainer( + model=base_model, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + peft_config=peft_config, + packing=packing, + max_seq_length=seq_length, + tokenizer=tokenizer, + args=training_args, + chars_per_token=chars_per_token, + formatting_func=prepare_sample_text, + ) + + trainer.train() + trainer.save_model(output_dir) + + output_dir = os.path.join(output_dir, "final_checkpoint") + trainer.model.save_pretrained(output_dir) + + +def main() -> None: + args = parse_args() + train_generator( + model_name=args.model_name, + dataset_name=args.dataset_name, + local_dataset=args.local_dataset, + split=args.split, + size_valid_set=args.size_valid_set, + validation_split=args.validation_split, + streaming=args.stream, + shuffle_buffer=args.shuffle_buffer, + seq_length=args.seq_length, + num_workers=args.num_workers, + num_train_epochs=args.num_train_epochs, + eval_steps=args.eval_steps, + logging_steps=args.logging_steps, + per_device_train_batch_size=args.per_device_train_batch_size, + per_device_eval_batch_size=args.per_device_eval_batch_size, + gradient_accumulation_steps=args.gradient_accumulation_steps, + gradient_checkpointing=args.gradient_checkpointing, + group_by_length=args.group_by_length, + packing=not args.no_packing, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + lora_r=args.lora_r, + learning_rate=args.learning_rate, + lr_scheduler_type=args.lr_scheduler_type, + num_warmup_steps=args.num_warmup_steps, + weight_decay=args.weight_decay, + optimizer_type=args.optimizer_type, + output_dir=args.output_dir, + log_with=args.log_with, + neftune_noise_alpha=args.neftune_noise_alpha, + run_name=args.run_name, + ) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 0fbf96b..fa8c805 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,7 @@ packages = [ ] dependencies = [ "scikit-learn", - "transformers", + "transformers>4.35", "peft", "accelerate", "datasets", @@ -24,10 +24,12 @@ dependencies = [ "evaluate", "tqdm", "peft", - "diffusers", + "trl", "bitsandbytes", "typer>=0.9.0,<1.0", "pydantic==1.10.9", # Sync w/ other platform components + "pysbd", + "sentencepiece" ] [project.scripts] diff --git a/resources/general.spm b/resources/general.spm new file mode 100644 index 0000000..8b787d7 Binary files /dev/null and b/resources/general.spm differ
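As a quick sanity check of whichever dataset the scripts above produce (regex based, LLM based, or the combined pipeline output), the saved dataset can be loaded back and one record rendered with the model's chat template, mirroring what `prepare_sample_text` does inside the trainer. A minimal sketch, assuming a dataset saved under `synthetic_rc_dataset` and the `HuggingFaceH4/zephyr-7b-beta` tokenizer (both are placeholders; substitute whatever `--dataset_name` and `--model_name` you actually used):

```python
from datasets import load_from_disk
from transformers import AutoTokenizer

# placeholder names; use the values passed to the generation and training scripts
dataset = load_from_disk("synthetic_rc_dataset")
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

# each record stores one chat-style exchange under the "messages" key
sample = dataset[0]["messages"]
print(tokenizer.apply_chat_template(sample, tokenize=False))
```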