
Commit 76cfdef

Merge pull request #116 from sbintuitions/add_correlation
Add `Correlation` metric class
2 parents: 59976fe + 7399579

5 files changed: +224, -7 lines

flexeval/core/metric/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -4,6 +4,7 @@
 from .code_eval import CodeEval
 from .common_prefix_length import CommonPrefixLength
 from .common_string_length import CommonStringLength
+from .correlation import Correlation
 from .exact_match import ExactMatch
 from .llm_label import ChatLLMLabel, LLMLabel
 from .llm_score import ChatLLMScore, LLMScore
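This one-line re-export is what lets the docstring and tests below use the short import path; presumably the package's top-level `flexeval/__init__.py` re-exports these names in turn (that file is not part of this diff). A minimal sanity check under that assumption:

from flexeval import Correlation

# The class itself still lives in the submodule added by this commit.
assert Correlation.__module__ == "flexeval.core.metric.correlation"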

flexeval/core/metric/correlation.py

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
from __future__ import annotations

import functools
import warnings
from typing import Literal

from scipy.stats import kendalltau, pearsonr, spearmanr

from .base import Metric, MetricResult
from .string_processor import StringProcessor


class Correlation(Metric):
    """
    Correlation metric to compute Pearson, Spearman, or Kendall correlation coefficients.
    The lm_outputs and references should be numeric values, optionally preprocessed by StringProcessor.

    Args:
        method: The correlation method to use ('pearson', 'spearman', 'kendall').
        lm_output_processor: StringProcessor or a list of StringProcessor to be applied to the model outputs before
            computing the correlation. If a list is provided, the processors will be applied in order.
        reference_processor: StringProcessor or a list of StringProcessor to be applied to the references before
            computing the correlation. If a list is provided, the processors will be applied in order.

    Examples:
        >>> from flexeval import Correlation
        >>> correlation = Correlation(method='pearson')
        >>> lm_outputs = ["1", "2", "3", "4", "5"]
        >>> references = [["5"], ["4"], ["3"], ["2"], ["1"]]
        >>> result = correlation.evaluate(lm_outputs, references)
        >>> print(result)
        MetricResult(
            summary={"pearson_correlation": -1.0, "pearson_pvalue": 0.0},
            instance_details=[],
        )
    """

    def __init__(
        self,
        method: Literal["pearson", "spearman", "kendall"] = "pearson",
        lm_output_processor: StringProcessor | list[StringProcessor] | None = None,
        reference_processor: StringProcessor | list[StringProcessor] | None = None,
    ) -> None:
        if method not in {"pearson", "spearman", "kendall"}:
            msg = f"Invalid method '{method}'. Choose from 'pearson', 'spearman', 'kendall'."
            raise ValueError(msg)
        self.method = method

        if isinstance(lm_output_processor, StringProcessor):
            lm_output_processor = [lm_output_processor]
        if isinstance(reference_processor, StringProcessor):
            reference_processor = [reference_processor]
        self.lm_output_processors = lm_output_processor
        self.reference_processors = reference_processor

    def evaluate(
        self,
        lm_outputs: list[str],
        references_list: list[list[str]],
        task_inputs_list: list[dict[str, str]] | None = None,
    ) -> MetricResult:
        if len(lm_outputs) != len(references_list):
            msg = (
                f"Number of model outputs ({len(lm_outputs)}) and number of references ({len(references_list)}) "
                "should be the same."
            )
            raise ValueError(msg)

        # We only use the first reference here
        references = [refs[0] for refs in references_list]

        if self.lm_output_processors:
            lm_outputs = [
                functools.reduce(lambda x, norm: norm(x), self.lm_output_processors, output) for output in lm_outputs
            ]

        if self.reference_processors:
            references = [
                functools.reduce(lambda x, norm: norm(x), self.reference_processors, ref) for ref in references
            ]

        # The model output should be converted to float, if fails it will be treated as 0
        lm_outputs_as_float: list[float] = []
        for output in lm_outputs:
            try:
                lm_outputs_as_float.append(float(output))
            except ValueError:  # noqa:PERF203
                warnings.warn(f"Failed to convert model output '{output}' to float. Treating it as 0.", stacklevel=2)
                lm_outputs_as_float.append(0.0)

        # The reference should be converted to float
        references_as_float = [float(ref) for ref in references]

        # Compute correlation
        if self.method == "pearson":
            correlation, pvalue = pearsonr(lm_outputs_as_float, references_as_float)
        elif self.method == "spearman":
            correlation, pvalue = spearmanr(lm_outputs_as_float, references_as_float)
        elif self.method == "kendall":
            correlation, pvalue = kendalltau(lm_outputs_as_float, references_as_float)
        else:
            msg = f"Unsupported method: {self.method}"
            raise ValueError(msg)

        return MetricResult(
            {f"{self.method}_correlation": correlation, f"{self.method}_pvalue": pvalue},
            instance_details=[],
        )
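Taken together, the new class turns two parallel lists of numeric strings into a single correlation coefficient and p-value, keyed by the chosen method. A minimal usage sketch based on the docstring and `evaluate` signature above (the numbers are illustrative, not taken from the commit):

from flexeval import Correlation

metric = Correlation(method="spearman")
lm_outputs = ["10", "20", "30", "40"]           # model predictions, as strings
references_list = [["1"], ["2"], ["3"], ["4"]]  # only the first reference per instance is used

result = metric.evaluate(lm_outputs, references_list)
# The summary keys are prefixed with the chosen method.
print(result.summary["spearman_correlation"])   # 1.0: the rankings match perfectly
print(result.summary["spearman_pvalue"])

Because outputs that fail float conversion are scored as 0.0 (after a warning), pairing the metric with an `lm_output_processor` that extracts the numeric part of a verbose model response is likely the intended workflow.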

poetry.lock

Lines changed: 48 additions & 7 deletions
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ vllm = {version = "^0.6.4.post1", optional = true }
 loguru = "^0.7.2"
 wandb = {version = "^0.17.2", optional = true}
 pyarrow = "16.1.0" # set the version because we get "Unable to find installation candidates" with 17.0.0
+scipy = "1.13.0"

 [tool.poetry.extras]
 vllm = ["vllm"]
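The pinned scipy release supplies the three `scipy.stats` functions that the metric unpacks as `(statistic, p-value)` pairs. A quick standalone check of that unpacking assumption (illustrative data, not from the commit):

from scipy.stats import kendalltau, pearsonr, spearmanr

x = [1.0, 2.0, 3.0, 4.0]
y = [2.0, 4.0, 6.0, 8.0]
for fn in (pearsonr, spearmanr, kendalltau):
    # Each result object supports tuple unpacking into (statistic, p-value),
    # which is exactly how Correlation.evaluate consumes it.
    statistic, pvalue = fn(x, y)
    print(fn.__name__, float(statistic), float(pvalue))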

tests/core/metric/test_correlation.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
from __future__ import annotations

import pytest

from flexeval import Correlation, MetricResult


@pytest.mark.parametrize(
    ("method", "lm_outputs", "references", "expected_correlation"),
    [
        ("pearson", [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], 1.0),
        ("pearson", [1, 2, 3, 4, 5], [5, 4, 3, 2, 1], -1.0),
        ("spearman", [1, 2, 3, 4, 5], [1, 20, 30, 400, 500], 1.0),
        ("spearman", [1, 2, 3, 4, 5], [500, 400, 30, 20, 1], -1.0),
        ("kendall", [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], 1.0),
        ("kendall", [1, 2, 3, 4, 5], [5, 4, 3, 2, 1], -1.0),
    ],
)
def test_correlation(
    method: str, lm_outputs: list[float], references: list[float], expected_correlation: float
) -> None:
    correlation = Correlation(method=method)
    references_list = [[ref] for ref in references]  # Wrap references in a list for each instance

    result = correlation.evaluate(lm_outputs, references_list)

    assert isinstance(result, MetricResult)
    assert f"{method}_correlation" in result.summary
    assert result.summary[f"{method}_correlation"] == pytest.approx(expected_correlation, rel=1e-3)


def test_instantiation_fails_with_invalid_method() -> None:
    with pytest.raises(ValueError, match="Invalid method"):  # Expecting an error for invalid method
        Correlation(method="invalid")


def test_evaluation_fails_with_mismatched_lengths() -> None:
    correlation = Correlation(method="pearson")

    lm_outputs = [1, 2, 3]
    references_list = [[1], [2]]  # Mismatched lengths

    with pytest.raises(ValueError):
        correlation.evaluate(lm_outputs, references_list)


def test_evaluation_does_not_fail_with_non_numeric_lm_outputs() -> None:
    correlation = Correlation(method="pearson")

    lm_outputs = ["1", "a", "3"]
    references_list = [["1.0"], ["2.0"], ["3.0"]]

    with pytest.warns(UserWarning, match="Failed to convert model output 'a' to float"):
        result = correlation.evaluate(lm_outputs, references_list)

    assert result.summary["pearson_correlation"] is not None


def test_evaluation_fails_with_non_numeric_references() -> None:
    correlation = Correlation(method="pearson")

    lm_outputs = ["1", "2", "3"]
    references_list = [["1.0"], ["non-numeric"], ["3.0"]]

    with pytest.raises(ValueError):
        correlation.evaluate(lm_outputs, references_list)
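The last two tests pin down an asymmetry in the implementation: a model output that cannot be parsed as a number is scored as 0.0 and only triggers a `UserWarning`, whereas a non-numeric reference raises `ValueError`. A small sketch of that behaviour outside pytest (hypothetical inputs, standard-library `warnings` only):

import warnings

from flexeval import Correlation

metric = Correlation(method="pearson")

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # "oops" cannot be converted to float, so it is treated as 0.0.
    result = metric.evaluate(["1", "oops", "3"], [["1"], ["2"], ["3"]])
print(len(caught))                            # 1 warning about the failed conversion
print(result.summary["pearson_correlation"])  # computed over [1.0, 0.0, 3.0] vs [1.0, 2.0, 3.0]

try:
    metric.evaluate(["1", "2"], [["a"], ["b"]])  # non-numeric references are not coerced
except ValueError as err:
    print("references must be numeric:", err)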
