-
- from unittest.mock import patch, MagicMock
+ #!/usr/bin/env -S uv run --script
+ # /// script
+ # requires-python = "==3.11.11"
+ # dependencies = [
+ #     "pandas==2.2.3",
+ #     "lighteval==0.10.0",
+ #     "openai==1.83.0",
+ #     "spacy==3.8.7",
+ #     "pytest==8.3.3",
+ #     "pytest-asyncio==0.24.0",
+ #     "pip"
+ # ]
+ # ///

  import pytest
  import pandas as pd
+ from unittest.mock import Mock, patch, AsyncMock
+ import tempfile
+ import os
+ import sys
+
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+ from evaluation.evals import evaluate_response, calculate_cost_metrics, load_csv
+

- from evals import evaluate_response
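+ # Minimal stand-in for the token-usage object a model handler returns alongside its response.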
+ @pytest.fixture
+ def mock_token_usage():
+     token_usage = Mock()
+     token_usage.input_tokens = 100
+     token_usage.output_tokens = 50
+     return token_usage

- class MockTokenUsage:
-     def __init__(self, input_tokens, output_tokens):
-         self.input_tokens = input_tokens
-         self.output_tokens = output_tokens

- @patch("evals.ModelFactory.get_handler")
- @patch("evals.Extractiveness.compute")
- def test_evaluate_response(mock_extractiveness_compute, mock_get_handler):
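+ # Factory fixture: writes CSV text to a temp file and returns its path; tests unlink the file themselves.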
+ @pytest.fixture
+ def temp_csv():
+     def _create_csv(content):
+         with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
+             f.write(content)
+             return f.name

-     # Mock BaseModelHandler
-     mock_handler = MagicMock()
-     mock_handler.handle_request.return_value = (
-         "This is a summary.",
-         MockTokenUsage(input_tokens=100, output_tokens=50),
-         {"input": 15.0, "output": 30.0},  # $15 and $30 per 1M tokens
-         1.23,  # duration
+     return _create_csv
+
+
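+ # Pricing is USD per 1M tokens: 1000 input tokens at $5.00/1M should cost $0.005.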
+ class TestCalculateCostMetrics:
+     @pytest.mark.parametrize(
+         "input_tokens,output_tokens,input_price,output_price,expected_input,expected_output,expected_total",
+         [
+             (1000, 500, 5.0, 15.0, 0.005, 0.0075, 0.0125),
+             (0, 0, 5.0, 15.0, 0.0, 0.0, 0.0),
+             (1_000_000, 2_000_000, 10.0, 30.0, 10.0, 60.0, 70.0),
+         ],
    )
+     def test_calculate_cost_metrics(
+         self,
+         input_tokens,
+         output_tokens,
+         input_price,
+         output_price,
+         expected_input,
+         expected_output,
+         expected_total,
+     ):
+         token_usage = Mock(input_tokens=input_tokens, output_tokens=output_tokens)
+         pricing = {"input": input_price, "output": output_price}
+
+         result = calculate_cost_metrics(token_usage, pricing)

-     mock_get_handler.return_value = mock_handler
+         assert pytest.approx(result["input_cost"]) == expected_input
+         assert pytest.approx(result["output_cost"]) == expected_output
+         assert pytest.approx(result["total_cost"]) == expected_total

-     mock_extractiveness_compute.return_value = {
-         "summarization_coverage": 0.8,
-         "summarization_density": 1.5,
-         "summarization_compression": 2.0,
-     }

-     df = evaluate_response(
-         model_name="mock-model",
-         query="What is the summary?",
-         context="This is a long article about something important.",
-         reference="This is a reference summary.",
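+ # load_csv is expected to strip and uppercase header names before validating the required columns.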
+ class TestLoadCsv:
+     @pytest.mark.parametrize(
+         "csv_content,required_columns,expected_len",
+         [
+             (
+                 "model,instructions\ngpt-4,Test prompt\ngpt-3.5,Another prompt\n",
+                 ["MODEL", "INSTRUCTIONS"],
+                 2,
+             ),
+             (
+                 " model , instructions \ngpt-4,Test prompt\n",
+                 ["MODEL", "INSTRUCTIONS"],
+                 1,
+             ),
+             ("input\nTest input 1\nTest input 2\n", ["INPUT"], 2),
+         ],
    )
+     def test_load_csv_valid(
+         self, temp_csv, csv_content, required_columns, expected_len
+     ):
+         temp_path = temp_csv(csv_content)
+         try:
+             df = load_csv(temp_path, required_columns)
+             assert len(df) == expected_len
+             assert list(df.columns) == required_columns
+         finally:
+             os.unlink(temp_path)
+
+     @pytest.mark.parametrize(
+         "csv_content,required_columns",
+         [
+             ("model,prompt\ngpt-4,Test prompt\n", ["MODEL", "INSTRUCTIONS"]),
+             ("wrong,columns\nval1,val2\n", ["MODEL", "INSTRUCTIONS"]),
+         ],
+     )
+     def test_load_csv_missing_columns(self, temp_csv, csv_content, required_columns):
+         temp_path = temp_csv(csv_content)
+         try:
+             with pytest.raises(ValueError, match="must contain the following columns"):
+                 load_csv(temp_path, required_columns)
+         finally:
+             os.unlink(temp_path)
+
+     def test_load_csv_nonexistent_file(self):
+         with pytest.raises(FileNotFoundError):
+             load_csv("nonexistent_file.csv", ["MODEL"])
+
+
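+ # evaluate_response should come back as a one-row DataFrame on success and on failure alike.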
+ class TestEvaluateResponse:
+     @pytest.mark.asyncio
+     async def test_evaluate_response_success(self, mock_token_usage):
+         mock_handler = AsyncMock()
+         mock_handler.handle_request.return_value = (
+             "Generated response text",
+             mock_token_usage,
+             {"input": 5.0, "output": 15.0},
+             1.5,
+         )
+
+         mock_extractiveness = Mock()
+         mock_extractiveness.compute.return_value = {
+             "summarization_coverage": 0.8,
+             "summarization_density": 0.6,
+             "summarization_compression": 0.4,
+         }
+
+         with (
+             patch(
+                 "evaluation.evals.ModelFactory.get_handler", return_value=mock_handler
+             ),
+             patch("evaluation.evals.Extractiveness", return_value=mock_extractiveness),
+         ):
+             result = await evaluate_response("gpt-4", "Test instructions", "Test input")
+
+         assert isinstance(result, pd.DataFrame)
+         assert len(result) == 1
+         row = result.iloc[0]
+         assert row["Generated Text"] == "Generated response text"
+         assert row["Extractiveness Coverage"] == 0.8
+         assert row["Input Token Usage"] == 100
+         assert row["Output Token Usage"] == 50
+         assert row["Duration (s)"] == 1.5
+
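+     # Force a failure at a different stage per case; a one-row DataFrame with NaN text is still expected.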
+     @pytest.mark.parametrize(
+         "exception_side_effect", ["get_handler", "handle_request", "extractiveness"]
+     )
+     @pytest.mark.asyncio
+     async def test_evaluate_response_exceptions(
+         self, mock_token_usage, exception_side_effect
+     ):
+         if exception_side_effect == "get_handler":
+             with patch(
+                 "evaluation.evals.ModelFactory.get_handler",
+                 side_effect=Exception("Test error"),
+             ):
+                 result = await evaluate_response(
+                     "invalid-model", "Test instructions", "Test input"
+                 )
+
+         elif exception_side_effect == "handle_request":
+             mock_handler = AsyncMock()
+             mock_handler.handle_request.side_effect = Exception("Handler error")
+             with patch(
+                 "evaluation.evals.ModelFactory.get_handler", return_value=mock_handler
+             ):
+                 result = await evaluate_response(
+                     "gpt-4", "Test instructions", "Test input"
+                 )
+
+         elif exception_side_effect == "extractiveness":
+             mock_handler = AsyncMock()
+             mock_handler.handle_request.return_value = (
+                 "text",
+                 mock_token_usage,
+                 {"input": 5.0, "output": 15.0},
+                 1.5,
+             )
+             mock_extractiveness = Mock()
+             mock_extractiveness.compute.side_effect = Exception("Extractiveness error")
+
+             with (
+                 patch(
+                     "evaluation.evals.ModelFactory.get_handler",
+                     return_value=mock_handler,
+                 ),
+                 patch(
+                     "evaluation.evals.Extractiveness", return_value=mock_extractiveness
+                 ),
+             ):
+                 result = await evaluate_response(
+                     "gpt-4", "Test instructions", "Test input"
+                 )
+
+         assert isinstance(result, pd.DataFrame)
+         assert len(result) == 1
+         assert pd.isna(result.iloc[0]["Generated Text"])
+

-     assert isinstance(df, pd.DataFrame)
-     assert df.shape == (1, 8)
-     assert df["Output Text"].iloc[0] == "This is a summary."
-     assert df["Extractiveness Coverage"].iloc[0] == 0.8
-     assert df["Extractiveness Density"].iloc[0] == 1.5
-     assert df["Extractiveness Compression"].iloc[0] == 2.0
-     assert df["Input Token Usage"].iloc[0] == 100
-     assert df["Output Token Usage"].iloc[0] == 50
-
-     expected_cost = (15.0 / 1_000_000) * 100 + (30.0 / 1_000_000) * 50
-     assert pytest.approx(df["Cost (USD)"].iloc[0], rel=1e-4) == expected_cost
-     assert pytest.approx(df["Duration (s)"].iloc[0], rel=1e-4) == 1.23
+ if __name__ == "__main__":
+     pytest.main([__file__])