Commit

prompt formatting for flores, crosssum, xorqa and xquad added - Adithya S K
adithya-s-k committed Jun 4, 2024
1 parent d1fb7bb commit a03c460
Showing 1 changed file with 192 additions and 56 deletions.
248 changes: 192 additions & 56 deletions src/indic_eval/tasks/tasks_prompt_formatting.py
@@ -38,6 +38,193 @@
LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
# fmt: on

# all prompt formatters for indic languages

def get_indic_language(language_code: str):
    """Takes an Indic language code and returns the language name."""
    language_code_mapping = {
        "hi": "hindi",
        "kn": "kannada",
        "ta": "tamil",
        "te": "telugu",
        "mr": "marathi",
        "ml": "malayalam",
        "gu": "gujarati",
    }
    return str(language_code_mapping[language_code])
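
# Quick illustration (hypothetical calls, assuming only the seven codes above are used):
#   get_indic_language("kn") -> "kannada"
#   get_indic_language("te") -> "telugu"
# Any other code would raise a KeyError from the mapping lookup.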

def hellaswag_harness_indic(line, task_name: str = None):
    def preprocess(text):
        """Comes from AiHarness"""
        # text = text.strip()
        # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
        text = text.replace(" [title]", ". ")
        text = re.sub("\\[.*?\\]", "", text)
        text = text.replace("  ", " ")
        return text

    # Reconstruct ctx_b: it wasn't translated from the original dataset, so take the first
    # two words of the correct translated ending and append them to the translated ctx_a.
    choices = [preprocess(ending) for ending in line["translated_endings"]]
    right_label = int(line["label"]) if line["label"] != "" else -1
    ctx_b = choices[right_label]
    ctx_b = " ".join(ctx_b.split()[:2])  # first two words of the correct option as the starting tokens
    ctx = f"{line['translated_ctx_a']} {ctx_b}"
    return Doc(
        task_name=task_name,
        query=preprocess(ctx),
        choices=choices,
        gold_index=right_label,  # -1 for test
        # "metric": "choices_loglikelihood",
    )
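
# Rough sketch of the resulting Doc (hypothetical translated fields, not real data):
#   line = {"translated_ctx_a": "A man is sitting on a roof.", "label": "1",
#           "translated_endings": [..., "He starts pulling up roofing tiles.", ...]}
#   -> query   = "A man is sitting on a roof. He starts"  (ctx_a + first two words of the gold ending)
#   -> choices = all preprocessed translated endings, gold_index = 1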

def crosssum_indic(line, task_name: str = None):
    """
    I will first show a news article in English and then provide a
    summary of it in the [Target Language Name] language.
    Summarize the following article: [Article]
    Summary:
    """
    language = get_indic_language(line["lang"])
    return Doc(
        task_name=task_name,
        query=f"""I will first show a news article in English and then provide a
summary of it in the {language} language.
Summarize the following article: {line["text"]}
Summary: """,
        choices=[line["summary"]],
        gold_index=0,
        instruction=f"I will first show a news article in English and then provide a summary of it in the {language} language.",
    )
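
# Minimal local sanity check (hypothetical row, not a real CrossSum record):
#   row = {"lang": "hi", "text": "Some English news article ...", "summary": "..."}
#   doc = crosssum_indic(row, task_name="crosssum")
#   doc.query ends with "Summary: ", doc.choices == [row["summary"]], doc.gold_index == 0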

def flores_enxx_indic(line, task_name: str = None):
    """
    Translate the following:
    To [Target Language Name]: [Sentence in English]
    Output:
    """
    language = get_indic_language(line["lang"])
    return Doc(
        task_name=task_name,
        query=f"""Translate the following:
{line["source"]} to {language}.
### Translation:
""",
        choices=[line["target"]],
        gold_index=0,
        instruction=f"Translate the following:{line['source']} to {language}",
    )
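
# Hypothetical example: for row = {"lang": "ta", "source": "Good morning", "target": "..."}
# the query renders as:
#   Translate the following:
#   Good morning to tamil.
#   ### Translation:
# and the single gold choice is row["target"].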

def flores_xxen_indic(line, task_name: str = None):
    """
    Translate the following:
    To English: [Sentence in Target Language]
    Output:
    """
    # language = get_indic_language(line["lang"])
    return Doc(
        task_name=task_name,
        query=f"""Translate the following:
{line["source"]} to english.
### Translation:
""",
        choices=[line["target"]],
        gold_index=0,
        instruction=f"Translate the following:{line['source']} to english.",
    )
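
# Same shape as flores_enxx_indic but for the xx->en direction: the source sentence is in
# the Indic language, the prompt asks for english, and the gold choice is the English
# reference in line["target"].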

def xquad_indic(line, task_name: str = None):
    """
    Given the following context:
    [Passage in Target Language]
    Question : [Question in Target Language]
    Answer :
    """
    language = get_indic_language(line["lang"])
    answer = line["answers"][0]["text"]
    return Doc(
        task_name=task_name,
        query=f"""Given the following context:
{line["context"]}
answer to the following
Question: {line["question"]} in {language}
Answer :
""",
        choices=[answer],
        gold_index=0,
        instruction="Given the following context answer to the question",
    )
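
# Hypothetical rendering for a Hindi row: the query shows the passage, then
# "answer to the following", then "Question: <translated question> in hindi" and "Answer :",
# and the only gold choice is the first reference answer text.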

def xorqa_indic(line, task_name: str = None):
    """
    Generate an answer in [Target Language Name] for the question
    based on the given passage:
    [Passage in English]
    Question : [Question in Target Language]
    Answer :
    """
    language = get_indic_language(line["lang"])
    answer = line["translated_answers"][0]["text"]
    return Doc(
        task_name=task_name,
        query=f"""Generate an answer in {language} for the question
based on the given passage:
{line["context"]}
Question: {line["oracle_question"]}
Answer :
""",
        choices=[answer],
        gold_index=0,
        instruction=f"Given the following passage answer to the question in {language}",
    )
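
# Sketch (hypothetical row with lang="ml"): the passage stays in English, the question comes
# from line["oracle_question"], and the gold choice is the first translated answer, so the
# model is expected to reply in malayalam.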


def arc_indic(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=f"Question: {line['translated_question']}\nAnswer:",
        choices=[f" {c}" for c in line["translated_choices"]["text"]],
        gold_index=line["translated_choices"]["label"].index(line["answerKey"]),
    )

def boolq_harness_indic(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=f"{line['translated_passage']}\nQuestion: {line['translated_question']}?\nAnswer:",
        choices=[" no", " yes"],  # False is label 0
        gold_index=int(line["answer"]),
    )

def mmlu_helm_indic(line, task_name: str = None):
    # subject = line["subject"]
    # query = f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\nQuestion: {line['question']}"
    query = f"The following are multiple choice questions (with answers) \n\nQuestion: {line['translated_question']}"
    query += "".join([f"\n{key}. {choice}" for key, choice in zip(LETTER_INDICES, line["translated_choices"])])
    query += "\nAnswer:"

    gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"]

    return Doc(
        task_name=task_name,
        query=query,
        choices=[" A", " B", " C", " D"],
        gold_index=gold_ix,
        # instruction=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n",
        instruction="The following are multiple choice questions (with answers) \n\n",
        target_for_fewshot_sorting=line["translated_choices"][gold_ix],  # specific to HELM evals
    )
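
# Illustrative query layout (hypothetical translated question with four choices):
#   The following are multiple choice questions (with answers)
#
#   Question: <translated question>
#   A. <choice 1>
#   B. <choice 2>
#   C. <choice 3>
#   D. <choice 4>
#   Answer:
# The scored continuations are the letters " A" ... " D"; gold_index selects the correct letter.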



def anli(line, task_name: str = None):
    return Doc(
@@ -76,13 +263,8 @@ def arc(line, task_name: str = None):
        gold_index=line["choices"]["label"].index(line["answerKey"]),
    )

def arc_with_options_letters_predict(line, task_name: str = None):
@@ -474,14 +656,7 @@ def boolq_helm_contrastset(line, task_name: str = None):
][0]


def boolq_harness(line, task_name: str = None):
    return Doc(
        task_name=task_name,
@@ -583,6 +758,8 @@ def covid_dialogue(line, task_name: str = None):
    )




def crows_pair(line, task_name: str = None):
    return Doc(task_name=task_name, query="", choices="", gold_index="", instruction="")

@@ -788,29 +965,6 @@ def preprocess(text):
        gold_index=int(line["label"]) if line["label"] != "" else -1,  # -1 for test
        # "metric": "choices_loglikelihood",
    )


def hellaswag_helm(line, task_name: str = None):
@@ -1663,24 +1817,6 @@ def mmlu_helm(line, task_name: str = None):
        target_for_fewshot_sorting=line["choices"][gold_ix],  # specific to HELM evals
    )



def mmlu_qa_abstract_algebra(line, task_name: str = None):
