diff --git a/flexeval/core/metric/bleu.py b/flexeval/core/metric/bleu.py index 7fb54ac..73d7838 100644 --- a/flexeval/core/metric/bleu.py +++ b/flexeval/core/metric/bleu.py @@ -33,7 +33,9 @@ class BLEU(Metric): """ def __init__(self, tokenize_option: str | None = None) -> None: - self._bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option) + self._corpus_bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option) + # For sentence BLEU, we need to set `effective_order=True` as recommended by sacrebleu. + self._sentence_bleu = sacrebleu.metrics.BLEU(tokenize=tokenize_option, effective_order=True) def evaluate( self, @@ -60,16 +62,16 @@ def evaluate( set_of_references.append("") references_for_sacrebleu.append(set_of_references) - bleu = self._bleu.corpus_score([o.strip() for o in lm_outputs], references_for_sacrebleu) + bleu = self._corpus_bleu.corpus_score([o.strip() for o in lm_outputs], references_for_sacrebleu) sentence_bleu_list = [ - self._bleu.sentence_score(o.strip(), refs) for o, refs in zip(lm_outputs, references_list) + self._sentence_bleu.sentence_score(o.strip(), refs) for o, refs in zip(lm_outputs, references_list) ] return MetricResult( { "bleu_score": bleu.score / 100, "bleu_bp": bleu.bp, - "bleu_signature": self._bleu.get_signature(), + "bleu_signature": self._corpus_bleu.get_signature(), }, instance_details=[{"bleu_score": b.score / 100, "bleu_bp": b.bp} for b in sentence_bleu_list], )