oneonlee
diff --git a/‎.gitignore
Lines changed: 4 additions & 0 deletions b/‎.gitignore
Lines changed: 4 additions & 0 deletions
diff --git a/‎README.md
Lines changed: 26 additions & 7 deletions b/‎README.md
Lines changed: 26 additions & 7 deletions
diff --git a/‎config/ca_config.yaml
Lines changed: 23 additions & 0 deletions b/‎config/ca_config.yaml
Lines changed: 23 additions & 0 deletions
diff --git a/‎config/cs_config.yaml
Lines changed: 23 additions & 0 deletions b/‎config/cs_config.yaml
Lines changed: 23 additions & 0 deletions
diff --git a/‎config/eu_config.yaml
Lines changed: 23 additions & 0 deletions b/‎config/eu_config.yaml
Lines changed: 23 additions & 0 deletions
diff --git a/‎config/fa_config.yaml
Lines changed: 23 additions & 0 deletions b/‎config/fa_config.yaml
Lines changed: 23 additions & 0 deletions
diff --git a/‎data/test/scripts/retrieve_contexts.py
Lines changed: 51 additions & 0 deletions b/‎data/test/scripts/retrieve_contexts.py
Lines changed: 51 additions & 0 deletions
diff --git a/‎lib.py
Lines changed: 4 additions & 1 deletion b/‎lib.py
Lines changed: 4 additions & 1 deletion
diff --git a/‎model/FAVA.py
Lines changed: 4 additions & 0 deletions b/‎model/FAVA.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎model/REFIND.py
Lines changed: 72 additions & 7 deletions b/‎model/REFIND.py
Lines changed: 72 additions & 7 deletions
@@ -5,6 +5,10 @@ data/val/*
 !data/val/scripts
 !data/val/README-v2.md
 
+data/test/*
+!data/test/README-v1.md
+!data/test/scripts
+
 logs/*
 
 result/*
 
@@ -5,7 +5,7 @@
 </div>
 <br>
 
-## Task & Dataset Info
+## Task & Dataset Info.
 [SemEval-2025 Task-3 — Mu-SHROOM](https://helsinki-nlp.github.io/shroom/)
 
 
@@ -27,18 +27,37 @@ Download Mu-SHROOM Dataset from [Official Website](https://helsinki-nlp.github.i
 sh scripts/preprocess_wiki.sh
 ```
 
-### Experiment
+### Experiment 
+#### Validation Set
 ```bash
+# Retrieve Contexts
+sh scripts/run_val_retriever.sh
+
+# Our Method
+sh scripts/run_val_REFIND.sh
+
+# Baselines
+sh scripts/run_val_XLM-R.sh
+sh scripts/run_val_FAVA.sh
+
+## Evaluation
+sh scripts/evaluate_val.sh
+```
+
+#### Test Set
+```bash
+# Retrieve Contexts
+sh scripts/run_test_retriever.sh
+
 # Our Method
-sh scripts/run_REFIND.sh
+sh scripts/run_test_REFIND.sh
 
 # Baselines
-sh scripts/run_random_guess.sh
-sh scripts/run_XLM-R.sh
-sh scripts/run_FAVA.sh
+sh scripts/run_test_XLM-R.sh
+sh scripts/run_test_FAVA.sh
 
 ## Evaluation
-sh scripts/evaluate.sh
+sh scripts/evaluate_test.sh
 ```
 
 ## References
 
@@ -0,0 +1,23 @@
+REFIND:
+    retriever: HybridRetriever
+    input_prompt_template: REFIND_PROMPT_TEMPLATE
+    threshold_list: [0.1, 0.2, 0.3, 0.4]
+FAVA:
+    retriever: HybridRetriever
+
+Retriever:
+    language: CA
+    input_file_path: retriever/ca_wiki_corpus.jsonl
+    parameters:
+        retrieval_chunk_size: 600
+        retrieval_chunk_overlap: 30
+        retrieval_top_k: 5
+HybridRetriever:
+    language: CA
+    input_file_path: retriever/ca_wiki_corpus.jsonl
+    embedding_model_path: intfloat/multilingual-e5-large
+    parameters:
+        retrieval_chunk_size: 600
+        retrieval_chunk_overlap: 30
+        retrieval_top_k: 10
+        reranking_top_k: 5
@@ -0,0 +1,23 @@
+REFIND:
+    retriever: HybridRetriever
+    input_prompt_template: REFIND_PROMPT_TEMPLATE
+    threshold_list: [0.1, 0.2, 0.3, 0.4]
+FAVA:
+    retriever: HybridRetriever
+
+Retriever:
+    language: CS
+    input_file_path: retriever/cs_wiki_corpus.jsonl
+    parameters:
+        retrieval_chunk_size: 600
+        retrieval_chunk_overlap: 30
+        retrieval_top_k: 5
+HybridRetriever:
+    language: CS
+    input_file_path: retriever/cs_wiki_corpus.jsonl
+    embedding_model_path: intfloat/multilingual-e5-large
+    parameters:
+        retrieval_chunk_size: 600
+        retrieval_chunk_overlap: 30
+        retrieval_top_k: 10
+        reranking_top_k: 5
@@ -0,0 +1,23 @@
+REFIND:
+    retriever: HybridRetriever
+    input_prompt_template: REFIND_PROMPT_TEMPLATE
+    threshold_list: [0.1, 0.2, 0.3, 0.4]
+FAVA:
+    retriever: HybridRetriever
+
+Retriever:
+    language: EU
+    input_file_path: retriever/eu_wiki_corpus.jsonl
+    parameters:
+        retrieval_chunk_size: 600
+        retrieval_chunk_overlap: 30
+        retrieval_top_k: 5
+HybridRetriever:
+    language: EU
+    input_file_path: retriever/eu_wiki_corpus.jsonl
+    embedding_model_path: intfloat/multilingual-e5-large
+    parameters:
+        retrieval_chunk_size: 600
+        retrieval_chunk_overlap: 30
+        retrieval_top_k: 10
+        reranking_top_k: 5
@@ -0,0 +1,23 @@
+REFIND:
+    retriever: HybridRetriever
+    input_prompt_template: REFIND_PROMPT_TEMPLATE
+    threshold_list: [0.1, 0.2, 0.3, 0.4]
+FAVA:
+    retriever: HybridRetriever
+
+Retriever:
+    language: FA
+    input_file_path: retriever/fa_wiki_corpus.jsonl
+    parameters:
+        retrieval_chunk_size: 600
+        retrieval_chunk_overlap: 30
+        retrieval_top_k: 5
+HybridRetriever:
+    language: FA
+    input_file_path: retriever/fa_wiki_corpus.jsonl
+    embedding_model_path: intfloat/multilingual-e5-large
+    parameters:
+        retrieval_chunk_size: 600
+        retrieval_chunk_overlap: 30
+        retrieval_top_k: 10
+        reranking_top_k: 5
@@ -0,0 +1,51 @@
+import argparse as ap
+import os
+import sys
+
+parent_dir = os.path.dirname(
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+)
+sys.path.append(parent_dir)
+
+import yaml
+from lib import load_jsonl_file, write_jsonl
+from retriever.retriever import HybridRetriever
+from tqdm import tqdm
+
+
+p = ap.ArgumentParser(description="Retrieve contexts for every record of a JSONL file.")
+p.add_argument("--yaml_filepath", type=str, default="config/en_config.yaml", help="YAML config path passed to HybridRetriever.")
+p.add_argument("--input_filepath", type=str, required=True, help="JSONL file whose records receive a 'context' field.")
+args = p.parse_args()
+
+
+def main():
+    """Attach retrieved contexts to every input record and write them out.
+
+    Reads the records at ``args.input_filepath``, queries a ``HybridRetriever``
+    (built from ``args.yaml_filepath``) with each record's ``model_input``, and
+    writes the records, each extended with a ``context`` key, to
+    ``context-<input basename>`` in the input file's directory.
+
+    Raises:
+        RuntimeError: if the retriever returns ``None`` for a record.
+    """
+    records = load_jsonl_file(args.input_filepath)
+    # The retriever is handed the YAML path itself, so the config is not parsed here.
+    retriever = HybridRetriever(args.yaml_filepath)
+
+    for record in tqdm(records, desc="Retrieving contexts"):
+        contexts = retriever.retrieve(query=record["model_input"], return_type="list")
+        if contexts is None:  # explicit raise: an assert would vanish under ``python -O``
+            raise RuntimeError(f"Failed to retrieve contexts for record {record}")
+        record["context"] = contexts
+
+    input_directory, input_filename = os.path.split(args.input_filepath)
+    output_filepath = os.path.join(input_directory, f"context-{input_filename}")
+
+    write_jsonl(records, output_filepath)
+    print(f"Contexts written to {output_filepath}")
+
+
+if __name__ == "__main__":
+    main()
@@ -35,7 +35,10 @@ def load_jsonl_file(filename):
     Performs minor format checks (ensures that soft_labels are present, optionally compute hard_labels on the fly)."""
     df = pd.read_json(filename, lines=True)
     if 'hard_labels' not in df.columns:
-        df['hard_labels'] = df.soft_labels.apply(recompute_hard_labels)
+        try:
+            df['hard_labels'] = df.soft_labels.apply(recompute_hard_labels)
+        except AttributeError:
+            pass
     # adding an extra column for convenience
     df['text_len'] = df.model_output_text.apply(len)
     return df.to_dict(orient='records')
@@ -43,6 +43,8 @@ def _find_hallucination_spans(output_text):
     spans = []
     for match in matches:
         content = match.group(2).strip()
+        if content == "":
+            continue
         start_idx = processed_output_text.find(content)
         if start_idx != -1:
             end_idx = start_idx + len(content)
@@ -162,6 +164,8 @@ def predict_hallucinations(
             exit()
 
         model_output_start_idx = hallucinated_output.find(hallucinated_text)
+        if model_output_start_idx == -1:
+            continue
         model_output_end_idx = model_output_start_idx + len(hallucinated_text)
 
         hard_labels.append([model_output_start_idx, model_output_end_idx])
 
@@ -78,8 +78,15 @@ def _generate_offset_mapping_manually(text, tokenizer):
             offset_mapping = []
             start = 0
             for token in tokens:
-                token = token.replace("▁", " ")
                 start = text.find(token, start)
+                if start == -1:
+                    token = token.replace("▁", " ")
+                    start = text.find(token, start)
+                    if start == -1:
+                        token = token.replace(" ", "")
+                        start = text.find(token, start)
+                        if start == -1:
+                            continue
                 end = start + len(token)
                 offset_mapping.append((start, end))
                 start = end
@@ -228,6 +235,12 @@ def main():
             model = AutoModelForCausalLM.from_pretrained(
                 model_id, device_map="auto", torch_dtype=torch.bfloat16
             )
+        elif model_id.replace("\/", "/") == "CohereForAI/aya-23-35B":
+            from torch.nn import DataParallel
+            tokenizer = AutoTokenizer.from_pretrained(model_id.replace("\/", "/"), trust_remote_code=True)
+            model = AutoModelForCausalLM.from_pretrained(model_id.replace("\/", "/"), trust_remote_code=True)
+            model = DataParallel(model)
+            model.to(args.device)
         else:
             tokenizer = AutoTokenizer.from_pretrained(
                 model_id.replace("\/", "/"), trust_remote_code=True
@@ -276,12 +289,44 @@ def main():
             model_output_token_ids, offsets_mapping = (
                 _get_tokens_ids_and_offsets_mapping(tokenizer, model_output_text)
             )
-            assert offsets_mapping[-1][1] == len(
-                model_output_text
-            ), "Offsets mapping and model output text mismatch!"
-            assert len(model_output_token_ids) == len(
-                offsets_mapping
-            ), f"Token IDs and offsets mapping mismatch! {len(model_output_token_ids)} vs {len(offsets_mapping)}"
+            try:
+                assert offsets_mapping[-1][1] == len(
+                    model_output_text
+                ), "Offsets mapping and model output text mismatch!"
+            except AssertionError as e:
+                print(f"AssertionError: {e}")
+                print(f"offsets_mapping: {offsets_mapping}")
+                print(f"model_output_text: {model_output_text}")
+                
+                # Augment offsets_mapping
+                prev_end_idx = 0
+                # end_idx = len(model_output_text)
+                for i, span in enumerate(offsets_mapping):
+                    start_idx, end_idx = span
+                    if start_idx == prev_end_idx:
+                        prev_end_idx = end_idx
+                        continue
+                    else:
+                        offsets_mapping.insert(i, (prev_end_idx, start_idx))
+                        prev_end_idx = end_idx
+                
+                # Check again
+                assert offsets_mapping[-1][1] == len(
+                    model_output_text
+                ), "Offsets mapping and model output text mismatch!"
+
+            try:
+                assert len(model_output_token_ids) == len(
+                    offsets_mapping
+                ), f"Token IDs and offsets mapping mismatch! {len(model_output_token_ids)} vs {len(offsets_mapping)}"
+            except AssertionError as e:
+                print(f"AssertionError: {e}")
+                print(f"model_output_token_ids: {model_output_token_ids}")
+                print(f"offsets_mapping: {offsets_mapping}")
+                if len(model_output_token_ids) > len(offsets_mapping):
+                    model_output_token_ids = model_output_token_ids[: len(offsets_mapping)]
+                else:
+                    offsets_mapping = offsets_mapping[: len(model_output_token_ids)]
             model_output_probs, model_output_logits = compute_output_probs(
                 model,
                 tokenizer,
@@ -457,6 +502,26 @@ def main():
     for condition in HALLUCINATION_CONDITIONS.keys():
         for threshold in threshold_list:
             predictions = total_predictions[condition][threshold]
+
+            # Check Validity of prediction
+            for prediction in predictions:
+                hard_labels = prediction["hard_labels"]
+                soft_labels = prediction["soft_labels"]
+                assert len(hard_labels) == len(soft_labels), "Hard and soft labels mismatch!"
+
+                for hard_label, soft_label in zip(hard_labels, soft_labels):
+                    if hard_label[0] < 0 or hard_label[1] > len(prediction["model_output_text"]):
+                        # remove invalid spans
+                        hard_labels.remove(hard_label)
+                        soft_labels.remove(soft_label)
+                        continue
+                    
+                    assert (
+                        hard_label[0] == soft_label["start"]
+                    ), "Hard and soft labels mismatch!"
+                    assert hard_label[1] == soft_label["end"], "Hard and soft labels mismatch!"
+                
+
             output_file_directory = os.path.join(
                 args.output_directory,
                 f'{os.path.basename(args.yaml_filepath.replace(".yaml", ""))}',