Commit

refactor: bleu file to work with 3.12
k4black committed Nov 16, 2023
1 parent 4deda6a commit d95b1d1
Showing 5 changed files with 15 additions and 287 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -45,7 +45,7 @@ jobs:
run: |
python -m pip install -e .[test]
- name: Run tests
run: python -m pytest | true
run: python -m pytest

external-build-workflow:
needs: [fast-tests-python]
156 changes: 9 additions & 147 deletions codebleu/bleu.py
@@ -12,25 +12,11 @@
import sys
import warnings
from collections import Counter
from fractions import Fraction as _Fraction
from typing import Any

from .utils import ngrams


class Fraction(_Fraction):
"""Fraction class with _normalize=False support.
_normalize=False was removed in 3.12, add custom class for back-compatibility
"""

# We're immutable, so use __new__ not __init__
def __new__(cls, numerator: Any = 0, denominator: Any = None, *, _normalize: bool = True) -> "Fraction":
if sys.version_info >= (3, 12):
return super(Fraction, cls).__new__(cls, numerator, denominator)
else:
return super(Fraction, cls).__new__(cls, numerator, denominator, _normalize=False)
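
The class removed here existed only to keep passing _normalize=False, which CPython 3.12 dropped from Fraction.__new__; the rest of the commit switches to plain (numerator, denominator) tuples instead. A minimal sketch of that behaviour gap, with invented values and not taken from this commit:

from fractions import Fraction

# On Python <= 3.11 the private _normalize keyword kept 2/4 unnormalized;
# on 3.12 the keyword no longer exists, so the call raises TypeError.
try:
    unnormalized = Fraction(2, 4, _normalize=False)
except TypeError:
    unnormalized = None

# The replacement used throughout this refactor: carry the raw counts as a
# tuple, so numerator and denominator stay accessible on every Python version.
p_i = (2, 4)
precision = p_i[0] / p_i[1]  # 0.5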


def sentence_bleu(
references,
hypothesis,
@@ -166,9 +152,9 @@ def corpus_bleu(
# For each order of ngram, calculate the numerator and
# denominator for the corpus-level modified precision.
for i, _ in enumerate(weights, start=1):
p_i = modified_precision(references, hypothesis, i)
p_numerators[i] += p_i.numerator
p_denominators[i] += p_i.denominator
p_i_numerator, p_i_denominator = modified_precision(references, hypothesis, i)
p_numerators[i] += p_i_numerator
p_denominators[i] += p_i_denominator

# Calculate the hypothesis length and the closest reference length.
# Adds them to the corpus-level hypothesis and reference counts.
@@ -185,8 +171,8 @@
if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
weights = (1 / hyp_lengths,) * hyp_lengths

# Collects the various precision values for the different ngram orders.
p_n = [Fraction(p_numerators[i], p_denominators[i], _normalize=False) for i, _ in enumerate(weights, start=1)]
# Collects the various recall values for the different ngram orders.
p_n = [(p_numerators[i], p_denominators[i]) for i, _ in enumerate(weights, start=1)]

# Returns 0 if there's no matching n-grams
# We only need to check for p_numerators[1] == 0, since if there's
@@ -202,7 +188,7 @@
# it tries to retain the Fraction object as much as the
# smoothing method allows.
p_n = smoothing_function(p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths)
s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
s = (w_i * math.log(p_i[0] / p_i[1]) for w_i, p_i in zip(weights, p_n))
s = bp * math.exp(math.fsum(s))
return s

@@ -298,7 +284,8 @@ def modified_precision(references, hypothesis, n):
# Usually this happens when the ngram order is > len(reference).
denominator = max(1, sum(counts.values()))

return Fraction(numerator, denominator, _normalize=False)
# return Fraction(numerator, denominator, _normalize=False)
return numerator, denominator
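
For orientation, a small hand-worked unigram example (hypothesis and reference invented, single reference only) of the clipped counts behind the pair that modified_precision now returns:

from collections import Counter

hypothesis = "the the the cat".split()
reference = "the cat sat on the mat".split()

hyp_counts = Counter(hypothesis)                       # {'the': 3, 'cat': 1}
ref_counts = Counter(reference)                        # 'the' occurs twice here
clipped = {w: min(c, ref_counts[w]) for w, c in hyp_counts.items()}

numerator = sum(clipped.values())                      # 2 + 1 = 3
denominator = max(1, sum(hyp_counts.values()))         # 4
# With a single reference, modified_precision would hand back (3, 4)
# as a plain tuple rather than a Fraction.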


def closest_ref_length(references, hyp_len):
@@ -447,133 +434,8 @@ def __init__(self, epsilon=0.1, alpha=5, k=5):
self.alpha = alpha
self.k = k

def method0(self, p_n, *args, **kwargs):
"""
No smoothing.
"""
p_n_new = []
for i, p_i in enumerate(p_n):
if p_i.numerator != 0:
p_n_new.append(p_i)
else:
_msg = str(
"\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
"Therefore the BLEU score evaluates to 0, independently of\n"
"how many N-gram overlaps of lower order it contains.\n"
"Consider using lower n-gram order or use "
"SmoothingFunction()"
).format(i + 1)
warnings.warn(_msg)
# When numerator==0 where denonminator==0 or !=0, the result
# for the precision score should be equal to 0 or undefined.
# Due to BLEU geometric mean computation in logarithm space,
# we we need to take the return sys.float_info.min such that
# math.log(sys.float_info.min) returns a 0 precision score.
p_n_new.append(sys.float_info.min)
return p_n_new

def method1(self, p_n, *args, **kwargs):
"""
Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
"""
return [(p_i.numerator + self.epsilon) / p_i.denominator if p_i.numerator == 0 else p_i for p_i in p_n]

def method2(self, p_n, *args, **kwargs):
"""
Smoothing method 2: Add 1 to both numerator and denominator from
Chin-Yew Lin and Franz Josef Och (2004) Automatic evaluation of
machine translation quality using longest common subsequence and
skip-bigram statistics. In ACL04.
"""
return [Fraction(p_i.numerator + 1, p_i.denominator + 1, _normalize=False) for p_i in p_n]

def method3(self, p_n, *args, **kwargs):
"""
Smoothing method 3: NIST geometric sequence smoothing
The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
precision score whose matching n-gram count is null.
k is 1 for the first 'n' value for which the n-gram match count is null/
For example, if the text contains:
- one 2-gram match
- and (consequently) two 1-gram matches
the n-gram count for each individual precision score would be:
- n=1 => prec_count = 2 (two unigrams)
- n=2 => prec_count = 1 (one bigram)
- n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
- n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
"""
incvnt = 1 # From the mteval-v13a.pl, it's referred to as k.
for i, p_i in enumerate(p_n):
if p_i.numerator == 0:
p_n[i] = 1 / (2**incvnt * p_i.denominator)
incvnt += 1
return p_n

def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
"""
Smoothing method 4:
Shorter translations may have inflated precision values due to having
smaller denominators; therefore, we give them proportionally
smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
suggests dividing by 1/ln(len(T)), where T is the length of the translation.
"""
hyp_len = hyp_len if hyp_len else len(hypothesis)
for i, p_i in enumerate(p_n):
if p_i.numerator == 0 and hyp_len != 0:
incvnt = i + 1 * self.k / math.log(hyp_len) # Note that this K is different from the K from NIST.
p_n[i] = incvnt / p_i.denominator
return p_n

def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
"""
Smoothing method 5:
The matched counts for similar values of n should be similar. To a
calculate the n-gram matched count, it averages the n−1, n and n+1 gram
matched counts.
"""
hyp_len = hyp_len if hyp_len else len(hypothesis)
m = {}
# Requires an precision value for an addition ngram order.
p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
m[-1] = p_n[0] + 1
for i, p_i in enumerate(p_n):
p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
m[i] = p_n[i]
return p_n

def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
"""
Smoothing method 6:
Interpolates the maximum likelihood estimate of the precision *p_n* with
a prior estimate *pi0*. The prior is estimated by assuming that the ratio
between pn and pn−1 will be the same as that between pn−1 and pn−2; from
Gao and He (2013) Training MRF-Based Phrase Translation Models using
Gradient Ascent. In NAACL.
"""
hyp_len = hyp_len if hyp_len else len(hypothesis)
# This smoothing only works when p_1 and p_2 is non-zero.
# Raise an error with an appropriate message when the input is too short
# to use this smoothing technique.
assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
for i, p_i in enumerate(p_n):
if i in [0, 1]: # Skips the first 2 orders of ngrams.
continue
else:
pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
# No. of ngrams in translation that matches the reference.
m = p_i.numerator
# No. of ngrams in translation.
ngrams_count = sum(1 for _ in ngrams(hypothesis, i + 1))
# Calculates the interpolated precision.
p_n[i] = (m + self.alpha * pi0) / (ngrams_count + self.alpha)
return p_n

def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
"""
Smoothing method 7:
Interpolates methods 4 and 5.
"""
hyp_len = hyp_len if hyp_len else len(hypothesis)
p_n = self.method4(p_n, references, hypothesis, hyp_len)
p_n = self.method5(p_n, references, hypothesis, hyp_len)
return p_n
return [((p_i[0] + self.epsilon), p_i[1]) if p_i[0] == 0 else p_i for p_i in p_n]
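
The updated method1 return above adds epsilon only to zero numerators, so the later math.log call never sees zero. A quick made-up example of its effect on the tuple representation:

epsilon = 0.1
p_n = [(3, 4), (0, 3), (1, 2)]            # invented (numerator, denominator) pairs
smoothed = [((num + epsilon), den) if num == 0 else (num, den) for num, den in p_n]
# -> [(3, 4), (0.1, 3), (1, 2)]: the zero-count order now contributes 0.1/3 instead of 0.
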
129 changes: 2 additions & 127 deletions codebleu/weighted_ngram_match.py
@@ -156,8 +156,8 @@ def corpus_bleu(
# For each order of ngram, calculate the numerator and
# denominator for the corpus-level modified precision.
for i, _ in enumerate(weights, start=1):
p_i_numeraotr, p_i_denominator = modified_recall(references, hypothesis, i)
p_numerators[i] += p_i_numeraotr
p_i_numerator, p_i_denominator = modified_recall(references, hypothesis, i)
p_numerators[i] += p_i_numerator
p_denominators[i] += p_i_denominator

# Calculate the hypothesis length and the closest reference length.
@@ -400,133 +400,8 @@ def __init__(self, epsilon=0.1, alpha=5, k=5):
self.alpha = alpha
self.k = k

def method0(self, p_n, *args, **kwargs):
"""
No smoothing.
"""
p_n_new = []
for i, p_i in enumerate(p_n):
if p_i[0] != 0:
p_n_new.append(p_i)
else:
_msg = str(
"\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
"Therefore the BLEU score evaluates to 0, independently of\n"
"how many N-gram overlaps of lower order it contains.\n"
"Consider using lower n-gram order or use "
"SmoothingFunction()"
).format(i + 1)
warnings.warn(_msg)
# When numerator==0 where denonminator==0 or !=0, the result
# for the precision score should be equal to 0 or undefined.
# Due to BLEU geometric mean computation in logarithm space,
# we we need to take the return sys.float_info.min such that
# math.log(sys.float_info.min) returns a 0 precision score.
p_n_new.append(sys.float_info.min)
return p_n_new

def method1(self, p_n, *args, **kwargs):
"""
Smoothing method 1: Add *epsilon* counts to precision with 0 counts.
"""
return [((p_i[0] + self.epsilon), p_i[1]) if p_i[0] == 0 else p_i for p_i in p_n]

def method2(self, p_n, *args, **kwargs):
"""
Smoothing method 2: Add 1 to both numerator and denominator from
Chin-Yew Lin and Franz Josef Och (2004) Automatic evaluation of
machine translation quality using longest common subsequence and
skip-bigram statistics. In ACL04.
"""
return [(p_i[0] + 1, p_i[1] + 1) for p_i in p_n]

def method3(self, p_n, *args, **kwargs):
"""
Smoothing method 3: NIST geometric sequence smoothing
The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
precision score whose matching n-gram count is null.
k is 1 for the first 'n' value for which the n-gram match count is null/
For example, if the text contains:
- one 2-gram match
- and (consequently) two 1-gram matches
the n-gram count for each individual precision score would be:
- n=1 => prec_count = 2 (two unigrams)
- n=2 => prec_count = 1 (one bigram)
- n=3 => prec_count = 1/2 (no trigram, taking 'smoothed' value of 1 / ( 2^k ), with k=1)
- n=4 => prec_count = 1/4 (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)
"""
incvnt = 1 # From the mteval-v13a.pl, it's referred to as k.
for i, p_i in enumerate(p_n):
if p_i.numerator == 0:
p_n[i] = 1 / (2**incvnt * p_i.denominator)
incvnt += 1
return p_n

def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
"""
Smoothing method 4:
Shorter translations may have inflated precision values due to having
smaller denominators; therefore, we give them proportionally
smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
suggests dividing by 1/ln(len(T)), where T is the length of the translation.
"""
hyp_len = hyp_len if hyp_len else len(hypothesis)
for i, p_i in enumerate(p_n):
if p_i.numerator == 0 and hyp_len != 0:
incvnt = i + 1 * self.k / math.log(hyp_len) # Note that this K is different from the K from NIST.
p_n[i] = incvnt / p_i.denominator
return p_n

def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
"""
Smoothing method 5:
The matched counts for similar values of n should be similar. To a
calculate the n-gram matched count, it averages the n−1, n and n+1 gram
matched counts.
"""
hyp_len = hyp_len if hyp_len else len(hypothesis)
m = {}
# Requires an precision value for an addition ngram order.
p_n_plus1 = p_n + [modified_precision(references, hypothesis, 5)]
m[-1] = p_n[0] + 1
for i, p_i in enumerate(p_n):
p_n[i] = (m[i - 1] + p_i + p_n_plus1[i + 1]) / 3
m[i] = p_n[i]
return p_n

def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
"""
Smoothing method 6:
Interpolates the maximum likelihood estimate of the precision *p_n* with
a prior estimate *pi0*. The prior is estimated by assuming that the ratio
between pn and pn−1 will be the same as that between pn−1 and pn−2; from
Gao and He (2013) Training MRF-Based Phrase Translation Models using
Gradient Ascent. In NAACL.
"""
hyp_len = hyp_len if hyp_len else len(hypothesis)
# This smoothing only works when p_1 and p_2 is non-zero.
# Raise an error with an appropriate message when the input is too short
# to use this smoothing technique.
assert p_n[2], "This smoothing method requires non-zero precision for bigrams."
for i, p_i in enumerate(p_n):
if i in [0, 1]: # Skips the first 2 orders of ngrams.
continue
else:
pi0 = 0 if p_n[i - 2] == 0 else p_n[i - 1] ** 2 / p_n[i - 2]
# No. of ngrams in translation that matches the reference.
m = p_i.numerator
# No. of ngrams in translation.
ngrams_count = sum(1 for _ in ngrams(hypothesis, i + 1))
# Calculates the interpolated precision.
p_n[i] = (m + self.alpha * pi0) / (ngrams_count + self.alpha)
return p_n

def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
"""
Smoothing method 7:
Interpolates methods 4 and 5.
"""
hyp_len = hyp_len if hyp_len else len(hypothesis)
p_n = self.method4(p_n, references, hypothesis, hyp_len)
p_n = self.method5(p_n, references, hypothesis, hyp_len)
return p_n
5 changes: 1 addition & 4 deletions pyproject.toml
@@ -40,14 +40,12 @@ exclude = ["tests", "tests.*", "codebleu.parser.tree-sitter"]
"*" = ["py.typed", "*.txt", "*.so", "*.dylib", "*.dll", "keywords/*"]



[project.scripts]
codebleu = "codebleu.__main__:main"

[project.urls]
homepage = "https://github.com/k4black/codebleu"


[project.optional-dependencies]
test = [
"pytest >=7.0.0,<8.0.0",
@@ -60,10 +58,9 @@ test = [
"flake8 >=6.0.0,<7.0.0",
"ruff >=0.0.275,<0.2.0",
"isort >=5.0.0,<6.0.0",
"nltk >=3.0.0,<4.0.0",
]



[tool.setuptools.dynamic]
version = {file = "VERSION"}
