Skip to content

Commit ac4a09e

Browse files
committed
Add comprehensive pytest test suite for evaluation module
1 parent 36e9e2d commit ac4a09e

File tree

1 file changed

+199
-40
lines changed

1 file changed

+199
-40
lines changed

evaluation/test_evals.py

Lines changed: 199 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,212 @@
1-
2-
from unittest.mock import patch, MagicMock
1+
#!/usr/bin/env -S uv run --script
2+
# /// script
3+
# requires-python = "==3.11.11"
4+
# dependencies = [
5+
# "pandas==2.2.3",
6+
# "lighteval==0.10.0",
7+
# "openai==1.83.0",
8+
# "spacy==3.8.7",
9+
# "pytest==8.3.3",
10+
# "pytest-asyncio==0.24.0",
11+
# "pip"
12+
# ]
13+
# ///
314

415
import pytest
516
import pandas as pd
17+
from unittest.mock import Mock, patch, AsyncMock
18+
import tempfile
19+
import os
20+
import sys
21+
22+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
23+
from evaluation.evals import evaluate_response, calculate_cost_metrics, load_csv
24+
625

7-
from evals import evaluate_response
26+
@pytest.fixture
def mock_token_usage():
    """Provide a token-usage stand-in with fixed input/output counts.

    Mirrors the object returned by a model handler: 100 input tokens and
    50 output tokens, which the cost/usage assertions below rely on.
    """
    # Mock(**kwargs) sets the attributes directly, same as assigning them.
    return Mock(input_tokens=100, output_tokens=50)
832

9-
class MockTokenUsage:
10-
def __init__(self, input_tokens, output_tokens):
11-
self.input_tokens = input_tokens
12-
self.output_tokens = output_tokens
1333

14-
@patch("evals.ModelFactory.get_handler")
15-
@patch("evals.Extractiveness.compute")
16-
def test_evaluate_response(mock_extractiveness_compute, mock_get_handler):
34+
@pytest.fixture
def temp_csv():
    """Yield a factory that writes ``content`` to a temporary ``.csv`` file.

    The factory returns the path of the file it created. Every file created
    through the factory is removed during fixture teardown — even when the
    test already deleted it, or failed before reaching its own cleanup — so
    no temporary files leak. (The original returned the factory without any
    teardown, leaking files whenever a caller forgot ``os.unlink``.)
    """
    created_paths = []

    def _create_csv(content):
        # delete=False so the file survives the `with` block and can be
        # re-opened by the code under test (required on Windows as well).
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
            f.write(content)
        created_paths.append(f.name)
        return f.name

    yield _create_csv

    # Teardown: best-effort cleanup; tolerate files the test already removed.
    for path in created_paths:
        try:
            os.unlink(path)
        except FileNotFoundError:
            pass
42+
43+
44+
class TestCalculateCostMetrics:
    """Unit tests for calculate_cost_metrics across token/price combinations."""

    @pytest.mark.parametrize(
        "input_tokens,output_tokens,input_price,output_price,expected_input,expected_output,expected_total",
        [
            # Typical small request.
            (1000, 500, 5.0, 15.0, 0.005, 0.0075, 0.0125),
            # Zero usage must cost nothing.
            (0, 0, 5.0, 15.0, 0.0, 0.0, 0.0),
            # Exactly 1M/2M tokens hit the per-million prices directly.
            (1_000_000, 2_000_000, 10.0, 30.0, 10.0, 60.0, 70.0),
        ],
    )
    def test_calculate_cost_metrics(
        self,
        input_tokens,
        output_tokens,
        input_price,
        output_price,
        expected_input,
        expected_output,
        expected_total,
    ):
        """Cost metrics match hand-computed values for each pricing case."""
        usage = Mock(input_tokens=input_tokens, output_tokens=output_tokens)

        metrics = calculate_cost_metrics(
            usage, {"input": input_price, "output": output_price}
        )

        # Compare every cost component against its expected value with
        # float tolerance (pytest.approx) in one table-driven pass.
        expectations = {
            "input_cost": expected_input,
            "output_cost": expected_output,
            "total_cost": expected_total,
        }
        for key, expected in expectations.items():
            assert metrics[key] == pytest.approx(expected)
2871

29-
mock_extractiveness_compute.return_value = {
30-
"summarization_coverage": 0.8,
31-
"summarization_density": 1.5,
32-
"summarization_compression": 2.0,
33-
}
3472

35-
df = evaluate_response(
36-
model_name="mock-model",
37-
query="What is the summary?",
38-
context="This is a long article about something important.",
39-
reference="This is a reference summary.",
73+
class TestLoadCsv:
    """Tests for load_csv: valid files, missing columns, and missing files."""

    @pytest.mark.parametrize(
        "csv_content,required_columns,expected_len",
        [
            # Plain two-row file with both required columns.
            (
                "model,instructions\ngpt-4,Test prompt\ngpt-3.5,Another prompt\n",
                ["MODEL", "INSTRUCTIONS"],
                2,
            ),
            # Header whitespace — presumably stripped/normalized by load_csv.
            (
                " model , instructions \ngpt-4,Test prompt\n",
                ["MODEL", "INSTRUCTIONS"],
                1,
            ),
            # Single required column.
            ("input\nTest input 1\nTest input 2\n", ["INPUT"], 2),
        ],
    )
    def test_load_csv_valid(
        self, temp_csv, csv_content, required_columns, expected_len
    ):
        """A well-formed CSV loads with normalized columns and all rows."""
        csv_path = temp_csv(csv_content)
        try:
            frame = load_csv(csv_path, required_columns)
            assert len(frame) == expected_len
            # Columns come back upper-cased exactly as required_columns names them.
            assert list(frame.columns) == required_columns
        finally:
            os.unlink(csv_path)

    @pytest.mark.parametrize(
        "csv_content,required_columns",
        [
            # One required column missing.
            ("model,prompt\ngpt-4,Test prompt\n", ["MODEL", "INSTRUCTIONS"]),
            # All required columns missing.
            ("wrong,columns\nval1,val2\n", ["MODEL", "INSTRUCTIONS"]),
        ],
    )
    def test_load_csv_missing_columns(self, temp_csv, csv_content, required_columns):
        """A CSV lacking any required column raises a descriptive ValueError."""
        csv_path = temp_csv(csv_content)
        try:
            with pytest.raises(ValueError, match="must contain the following columns"):
                load_csv(csv_path, required_columns)
        finally:
            os.unlink(csv_path)

    def test_load_csv_nonexistent_file(self):
        """A path that does not exist propagates FileNotFoundError."""
        with pytest.raises(FileNotFoundError):
            load_csv("nonexistent_file.csv", ["MODEL"])
119+
120+
121+
class TestEvaluateResponse:
    """Async tests for evaluate_response, with the model handler and
    Extractiveness metric fully mocked out."""

    @pytest.mark.asyncio
    async def test_evaluate_response_success(self, mock_token_usage):
        """Happy path: handler and metrics succeed; verify the result row."""
        # handle_request yields (text, token_usage, pricing, duration);
        # pricing is per-1M-token rates — TODO confirm against evals.py.
        mock_handler = AsyncMock()
        mock_handler.handle_request.return_value = (
            "Generated response text",
            mock_token_usage,
            {"input": 5.0, "output": 15.0},
            1.5,
        )

        # Extractiveness is patched as a class; compute() on the instance
        # returns the three summarization metrics evaluate_response records.
        mock_extractiveness = Mock()
        mock_extractiveness.compute.return_value = {
            "summarization_coverage": 0.8,
            "summarization_density": 0.6,
            "summarization_compression": 0.4,
        }

        with (
            patch(
                "evaluation.evals.ModelFactory.get_handler", return_value=mock_handler
            ),
            patch("evaluation.evals.Extractiveness", return_value=mock_extractiveness),
        ):
            result = await evaluate_response("gpt-4", "Test instructions", "Test input")

        # One-row DataFrame carrying the generated text, metrics, token
        # counts (100/50 from the fixture) and the mocked duration.
        assert isinstance(result, pd.DataFrame)
        assert len(result) == 1
        row = result.iloc[0]
        assert row["Generated Text"] == "Generated response text"
        assert row["Extractiveness Coverage"] == 0.8
        assert row["Input Token Usage"] == 100
        assert row["Output Token Usage"] == 50
        assert row["Duration (s)"] == 1.5

    @pytest.mark.parametrize(
        "exception_side_effect", ["get_handler", "handle_request", "extractiveness"]
    )
    @pytest.mark.asyncio
    async def test_evaluate_response_exceptions(
        self, mock_token_usage, exception_side_effect
    ):
        """Failures at each stage still yield a one-row DataFrame with NaN text.

        Implies evaluate_response swallows exceptions internally rather than
        propagating them — NOTE(review): confirm that is the intended contract.
        """
        # Stage 1: handler lookup itself fails.
        if exception_side_effect == "get_handler":
            with patch(
                "evaluation.evals.ModelFactory.get_handler",
                side_effect=Exception("Test error"),
            ):
                result = await evaluate_response(
                    "invalid-model", "Test instructions", "Test input"
                )

        # Stage 2: handler is obtained, but the request itself fails.
        elif exception_side_effect == "handle_request":
            mock_handler = AsyncMock()
            mock_handler.handle_request.side_effect = Exception("Handler error")
            with patch(
                "evaluation.evals.ModelFactory.get_handler", return_value=mock_handler
            ):
                result = await evaluate_response(
                    "gpt-4", "Test instructions", "Test input"
                )

        # Stage 3: generation succeeds, metric computation fails.
        elif exception_side_effect == "extractiveness":
            mock_handler = AsyncMock()
            mock_handler.handle_request.return_value = (
                "text",
                mock_token_usage,
                {"input": 5.0, "output": 15.0},
                1.5,
            )
            mock_extractiveness = Mock()
            mock_extractiveness.compute.side_effect = Exception("Extractiveness error")

            with (
                patch(
                    "evaluation.evals.ModelFactory.get_handler",
                    return_value=mock_handler,
                ),
                patch(
                    "evaluation.evals.Extractiveness", return_value=mock_extractiveness
                ),
            ):
                result = await evaluate_response(
                    "gpt-4", "Test instructions", "Test input"
                )

        # All three failure modes share one contract: a single placeholder
        # row whose "Generated Text" is NaN.
        assert isinstance(result, pd.DataFrame)
        assert len(result) == 1
        assert pd.isna(result.iloc[0]["Generated Text"])
209+
41210

42-
assert isinstance(df, pd.DataFrame)
43-
assert df.shape == (1, 8)
44-
assert df["Output Text"].iloc[0] == "This is a summary."
45-
assert df["Extractiveness Coverage"].iloc[0] == 0.8
46-
assert df["Extractiveness Density"].iloc[0] == 1.5
47-
assert df["Extractiveness Compression"].iloc[0] == 2.0
48-
assert df["Input Token Usage"].iloc[0] == 100
49-
assert df["Output Token Usage"].iloc[0] == 50
50-
51-
expected_cost = (15.0 / 1_000_000) * 100 + (30.0 / 1_000_000) * 50
52-
assert pytest.approx(df["Cost (USD)"].iloc[0], rel=1e-4) == expected_cost
53-
assert pytest.approx(df["Duration (s)"].iloc[0], rel=1e-4) == 1.23
211+
if __name__ == "__main__":
    # Propagate pytest's exit status: pytest.main() RETURNS the exit code
    # rather than exiting, so the original always exited 0 — CI running this
    # file as a script would report success even when tests failed.
    raise SystemExit(pytest.main([__file__]))

0 commit comments

Comments
 (0)