diff --git a/evaluate_app/codebleu.py b/evaluate_app/codebleu.py
index bb365ba..927b73f 100644
--- a/evaluate_app/codebleu.py
+++ b/evaluate_app/codebleu.py
@@ -17,7 +17,6 @@
 import datasets
 import evaluate
 
-
 _CITATION = """\
 @misc{ren2020codebleu,
     title={CodeBLEU: a Method for Automatic Evaluation of Code Synthesis},
diff --git a/pyproject.toml b/pyproject.toml
index 7ad8b5d..ddc4a28 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -106,4 +106,3 @@ target_version=["py38","py39","py310","py311"]
 
 [tool.ruff]
 line-length=120
-
diff --git a/setup.py b/setup.py
index 1b0e285..1c12d40 100644
--- a/setup.py
+++ b/setup.py
@@ -8,14 +8,15 @@
 
 
 subprocess.run(
-    ['bash', 'build.sh'],
-    cwd=ROOT / 'codebleu' / 'parser',
+    ["bash", "build.sh"],
+    cwd=ROOT / "codebleu" / "parser",
     check=True,
 )
 
 
 class PlatformSpecificDistribution(Distribution):
     """Distribution which always forces a binary package with platform name"""
+
     def has_ext_modules(self):
         return True
 
diff --git a/tests/test_codebleu.py b/tests/test_codebleu.py
index cf380e2..9bf4f32 100644
--- a/tests/test_codebleu.py
+++ b/tests/test_codebleu.py
@@ -1,97 +1,114 @@
 import inspect
+import logging
 from typing import Any, List
 
 import pytest
-import logging
 
 from codebleu.codebleu import AVAILABLE_LANGS, calc_codebleu
 
 
-@pytest.mark.parametrize(['predictions', 'references', 'codebleu'], [
-    (['some rannnndom words in length more than 3'],
-     ['def test ( ) :\n pass'], 0.25),  # 'cause data_flow is 0 and considered as 1
-    (['def bar ( y , x ) :\n a = x * x\n return a'], ['def foo ( x ) :\n return x'], 0.4),
-    (['def foo ( x ) :\n return x * x'], ['def bar ( x ) :\n return x'], 0.6),
-    (['def bar ( x ) :\n return x'], ['def foo ( x ) :\n return x'], 0.8),
-    (['def foo ( x ) :\n return x'], ['def foo ( x ) :\n return x'], 1.0),
-])
+@pytest.mark.parametrize(
+    ["predictions", "references", "codebleu"],
+    [
+        (
+            ["some rannnndom words in length more than 3"],
+            ["def test ( ) :\n pass"],
+            0.25,
+        ),  # 'cause data_flow is 0 and considered as 1
+        (["def bar ( y , x ) :\n a = x * x\n return a"], ["def foo ( x ) :\n return x"], 0.4),
+        (["def foo ( x ) :\n return x * x"], ["def bar ( x ) :\n return x"], 0.6),
+        (["def bar ( x ) :\n return x"], ["def foo ( x ) :\n return x"], 0.8),
+        (["def foo ( x ) :\n return x"], ["def foo ( x ) :\n return x"], 1.0),
+    ],
+)
 def test_simple_cases(predictions: List[Any], references: List[Any], codebleu: float) -> None:
-    result = calc_codebleu(references, predictions, 'python')
+    result = calc_codebleu(references, predictions, "python")
     logging.debug(result)
-    assert result['codebleu'] == pytest.approx(codebleu, 0.1)
+    assert result["codebleu"] == pytest.approx(codebleu, 0.1)
 
 
-@pytest.mark.parametrize(['lang'], [(lang,) for lang in AVAILABLE_LANGS])
+@pytest.mark.parametrize(["lang"], [(lang,) for lang in AVAILABLE_LANGS])
 def test_exact_match_works_for_all_langs(lang: str) -> None:
-    predictions = references = ['some matching string a couple of times']
-    assert calc_codebleu(references, predictions, lang)['codebleu'] == 1.0
-
-
-@pytest.mark.parametrize(['lang', 'predictions', 'references'], [
-    ('python', ['def foo ( x ) :\n return x'], ['def bar ( y ) :\n return y']),
-    ('java', ['public function foo ( x ) { return x }'], ['public function bar ( y ) {\n return y\n}']),
-    ('javascript', ['function foo ( x ) { return x }'], ['function bar ( y ) {\n return y\n}']),
-    ('c', ['int foo ( int x ) { return x }'], ['int bar ( int y ) {\n return y\n}']),
-    ('c_sharp', ['public int foo ( int x ) { return x }'], ['public int bar ( int y ) {\n return y\n}']),
-    ('cpp', ['int foo ( int x ) { return x }'], ['int bar ( int y ) {\n return y\n}']),
-    ('php', ['function foo ( x ) { return x }'], ['function bar ( y ) {\n return y\n}']),
-])
+    predictions = references = ["some matching string a couple of times"]
+    assert calc_codebleu(references, predictions, lang)["codebleu"] == 1.0
+
+
+@pytest.mark.parametrize(
+    ["lang", "predictions", "references"],
+    [
+        ("python", ["def foo ( x ) :\n return x"], ["def bar ( y ) :\n return y"]),
+        ("java", ["public function foo ( x ) { return x }"], ["public function bar ( y ) {\n return y\n}"]),
+        ("javascript", ["function foo ( x ) { return x }"], ["function bar ( y ) {\n return y\n}"]),
+        ("c", ["int foo ( int x ) { return x }"], ["int bar ( int y ) {\n return y\n}"]),
+        ("c_sharp", ["public int foo ( int x ) { return x }"], ["public int bar ( int y ) {\n return y\n}"]),
+        ("cpp", ["int foo ( int x ) { return x }"], ["int bar ( int y ) {\n return y\n}"]),
+        ("php", ["function foo ( x ) { return x }"], ["function bar ( y ) {\n return y\n}"]),
+    ],
+)
 def test_simple_cases_work_for_all_langs(lang: str, predictions: List[Any], references: List[Any]) -> None:
     result = calc_codebleu(references, predictions, lang)
     logging.debug(result)
-    assert result['codebleu'] == pytest.approx(0.6, 0.1)
+    assert result["codebleu"] == pytest.approx(0.6, 0.1)
 
 
 def test_error_when_lang_not_supported() -> None:
     with pytest.raises(AssertionError):
-        calc_codebleu(['def foo : pass'], ['def bar : pass'], 'not_supported_lang')
+        calc_codebleu(["def foo : pass"], ["def bar : pass"], "not_supported_lang")
 
 
 def test_error_when_input_length_mismatch() -> None:
     with pytest.raises(AssertionError):
-        calc_codebleu(['def foo : pass'], ['def bar : pass', 'def buz : pass'], 'python')
+        calc_codebleu(["def foo : pass"], ["def bar : pass", "def buz : pass"], "python")
 
 
 # https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-to-code-trans/example.png
-@pytest.mark.parametrize(['predictions', 'references', 'codebleu'], [
-    # (
-    #     ['public static int Sign ( double d ) { return ( float ) ( ( d == 0 ) ? 0 : ( c < 0.0 ) ? - 1 : 1) ; }'],
-    #     ['public static int Sign ( double d ) { return ( int ) ( ( d == 0 ) ? 0 : ( d < 0 ) ? - 1 : 1) ; }'],
-    #     0.7238  # TODO: lol, not working at <3.12
-    # ),
-    # (
-    #     ['public static int Sign ( double c ) { return ( int ) ( ( c == 0 ) ? 0 : ( c < 0 ) ? - 1 : 1) ; }'],
-    #     ['public static int Sign ( double d ) { return ( int ) ( ( d == 0 ) ? 0 : ( d < 0 ) ? - 1 : 1) ; }'],
-    #     0.8397  # TODO: check, lol, not working
-    # ),
-])
+@pytest.mark.parametrize(
+    ["predictions", "references", "codebleu"],
+    [
+        # (
+        #     ['public static int Sign ( double d ) { return ( float ) ( ( d == 0 ) ? 0 : ( c < 0.0 ) ? - 1 : 1) ; }'],
+        #     ['public static int Sign ( double d ) { return ( int ) ( ( d == 0 ) ? 0 : ( d < 0 ) ? - 1 : 1) ; }'],
+        #     0.7238  # TODO: lol, not working at <3.12
+        # ),
+        # (
+        #     ['public static int Sign ( double c ) { return ( int ) ( ( c == 0 ) ? 0 : ( c < 0 ) ? - 1 : 1) ; }'],
+        #     ['public static int Sign ( double d ) { return ( int ) ( ( d == 0 ) ? 0 : ( d < 0 ) ? - 1 : 1) ; }'],
+        #     0.8397  # TODO: check, lol, not working
+        # ),
+    ],
+)
 def test_code_x_glue_readme_examples(predictions: List[Any], references: List[Any], codebleu: float) -> None:
-
-    result = calc_codebleu(references, predictions, 'java')
+    result = calc_codebleu(references, predictions, "java")
     logging.debug(result)
-    assert result['codebleu'] == pytest.approx(codebleu, 0.01)
-
-
-@pytest.mark.parametrize(['predictions', 'references', 'codebleu'], [
-    # ([], [], 1.0),
-    # ([], [[]], 1.0),
-    (['def foo ( x ) : pass'], ['def foo ( x ) : pass'], 1.0),
-    (['def foo ( x ) : pass'], [['def foo ( x ) : pass']], 1.0),
-    (['def foo ( x ) : pass'], [['def bar ( x ) : pass', 'def foo ( x ) : pass']], 0.95),
-    (['def foo ( x ) : pass'], [['def foo ( x ) : pass', 'def bar ( x ) : pass']], 0.95),
-])
+    assert result["codebleu"] == pytest.approx(codebleu, 0.01)
+
+
+@pytest.mark.parametrize(
+    ["predictions", "references", "codebleu"],
+    [
+        # ([], [], 1.0),
+        # ([], [[]], 1.0),
+        (["def foo ( x ) : pass"], ["def foo ( x ) : pass"], 1.0),
+        (["def foo ( x ) : pass"], [["def foo ( x ) : pass"]], 1.0),
+        (["def foo ( x ) : pass"], [["def bar ( x ) : pass", "def foo ( x ) : pass"]], 0.95),
+        (["def foo ( x ) : pass"], [["def foo ( x ) : pass", "def bar ( x ) : pass"]], 0.95),
+    ],
+)
 def test_input_variants(predictions: List[Any], references: List[Any], codebleu: float) -> None:
-    assert calc_codebleu(references, predictions, 'python')['codebleu'] == pytest.approx(codebleu, 0.01)
+    assert calc_codebleu(references, predictions, "python")["codebleu"] == pytest.approx(codebleu, 0.01)
 
 
 # TODO: fix this test
 # @pytest.mark.timeout(1)
 def test_finite_processing_time_in_bug_testcase() -> None:
-    dummy_true_code = inspect.cleandoc('''
+    dummy_true_code = inspect.cleandoc(
+        """
         def foo(n):
             pass
-    ''')
-    generated_code = inspect.cleandoc('''
+    """
+    )
+    generated_code = inspect.cleandoc(
+        """
         def foo(n):
             for i in range(n):
                 for j in range(n):
@@ -130,10 +147,11 @@ def foo(n):
             # for q in range(n):
             # for r in range(n):
             # for s
-    ''')
+    """
+    )
 
     # just test finite processing time
-    calc_codebleu([dummy_true_code], [generated_code], 'python')
+    calc_codebleu([dummy_true_code], [generated_code], "python")
 
 
 # TODO: add tests with direct comparison with XLCoST and CodeXGlue results
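For reference, the API these tests exercise is a single function call; a minimal usage sketch mirroring the calls above (assuming the `codebleu` package built by this `setup.py` is installed):

```python
from codebleu.codebleu import calc_codebleu

# References come first, predictions second, then the language name, as in the tests.
# References may be a flat list of strings or a list of reference lists (see test_input_variants).
result = calc_codebleu(
    ["def foo ( x ) :\n    return x"],  # references
    ["def bar ( x ) :\n    return x"],  # predictions
    "python",
)
print(result["codebleu"])  # overall score; an exact match scores 1.0
```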