Skip to content

Commit 72971d4

Browse files
committed
run eval script
1 parent f029129 commit 72971d4

15 files changed

+1689
-2
lines changed

CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -219,11 +219,11 @@ pip install -r requirements.txt
219219
If you notice any third-party libraries that are not included in the `requirements.txt` file but used in the `data/process.py` file, please add them with the compatible versions to the `requirements.txt` file.
220220

221221
### How to Validate Data?
222-
We build a GitHub action to validate the data. The action is based on the `script/bash.sh`. Specifically, any refined data will be copied to the `data/clean` folder and then parsed based on `script/parser.py`. The parsed data will be stored in the `data/processed` folder. The parsed data will be separate into two splits for `pytest`. The first split will be validated by running `pytest $FILE_NAME` and the second split will be validated by running `pytest --doctest-modules $FILE_NAME`. Please note that we validate each file separately, as `pytest` may fail unexpectedly when validating all files at once.
222+
We build a GitHub action to validate the data. The action is based on the `script/run.sh`. Specifically, any refined data will be copied to the `data/clean` folder and then parsed based on `script/parser.py`. The parsed data will be stored in the `data/processed` folder. The parsed data will be separated into two splits for `pytest`. The first split will be validated by running `pytest $FILE_NAME` and the second split will be validated by running `pytest --doctest-modules $FILE_NAME`. Please note that we validate each file separately, as `pytest` may fail unexpectedly when validating all files at once.
223223

224224
If you want to validate the data locally, you can run the following command:
225225
```bash
226-
sh script/bash.sh
226+
sh script/run.sh
227227
```
228228

229229
If you find any failed test cases, please fix the data in the `data/raw` folder based on the failed problem IDs. The refinement should be based on the [How to Refine Data?](#how-to-refine-data) section.
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
import re
2+
import os
3+
import glob
4+
5+
6+
def f_657(dir_path):
    """
    Search for occurrences of the word "error" in all text files within a
    specified directory and its subdirectories.

    Parameters:
    dir_path (str): The path of the directory.

    Returns:
    dict: A dictionary with relative file paths as keys and the count of
    occurrences of the word "error" as values.

    Raises:
    - ValueError: If directory in dir_path does not exist.

    Requirements:
    - re: For regex pattern matching.
    - os: For retrieving relative file paths.
    - glob: For fetching all text file paths in the directory.

    The function specifically searches for the word "error" in text files
    (with the extension ".txt").
    This function is NOT case sensitive, e.g. also "ERROr" will be counted.

    Example:
    >>> f_657("/path/to/directory")
    {'file1.txt': 2, 'subdir/file2.txt': 1}
    """
    if not os.path.isdir(dir_path):
        raise ValueError("Specified directory does not exist.")

    # Compile once instead of per file; \b keeps "errors"/"erro" from matching.
    pattern = re.compile(r'\berror\b', re.IGNORECASE)
    # Build the recursive pattern with os.path.join rather than a hard-coded
    # '/' so the glob is portable across path separators.
    search_pattern = os.path.join(dir_path, '**', '*.txt')

    result = {}
    for file_path in glob.glob(search_pattern, recursive=True):
        # Read then close promptly; match outside the `with` block.
        with open(file_path, 'r') as file:
            content = file.read()
        # Always record the file, even when the count is 0.
        result[os.path.relpath(file_path, dir_path)] = len(pattern.findall(content))

    return result
51+
52+
import unittest
53+
import os
54+
import shutil
55+
import tempfile
56+
class TestCases(unittest.TestCase):
    """Unit tests for f_657."""

    def setUp(self):
        # Fresh sandbox directory for every test.
        self.test_dir = tempfile.mkdtemp()

    def tearDown(self):
        # Discard the sandbox and all files created inside it.
        shutil.rmtree(self.test_dir)

    def create_file(self, sub_path, content=""):
        """Write *content* under test_dir/sub_path (creating parent dirs)
        and return the normalized relative path for use as an expected key."""
        target = os.path.join(self.test_dir, sub_path)
        os.makedirs(os.path.dirname(target), exist_ok=True)
        with open(target, 'w') as handle:
            handle.write(content)
        return os.path.normpath(sub_path)

    def test_non_existent(self):
        # A directory that does not exist must raise ValueError.
        missing = os.path.join(self.test_dir, "non_existent")
        with self.assertRaises(ValueError):
            f_657(missing)

    def test_empty_folder(self):
        # No .txt files at all -> an empty mapping.
        self.assertEqual(f_657(self.test_dir), {})

    def test_files_with_errors(self):
        # Files with varying counts of 'error', including an empty file.
        specs = {
            "1.txt": ("error\nERROR\nErrOr", 3),
            "subfolder1/2.txt": ("", 0),
            "subfolder2/3.txt": ("error\nerror error", 3),
        }
        expected = {}
        for rel, (text, count) in specs.items():
            expected[self.create_file(rel, text)] = count
        self.assertEqual(f_657(self.test_dir), expected)

    def test_case_sensitive_and_realistic_text(self):
        # Deeply nested directory; mixed-case hits all count.
        key = self.create_file('nested/folder1/folder2/error_log.txt',
                               'Error\nerror\nERROR')
        self.assertEqual(f_657(self.test_dir), {key: 3})

    def test_exact_word_matching(self):
        # Only the exact word 'error' counts; 'errors'/'erro' do not.
        specs = {
            "file1.txt": ("error error error", 3),
            "subdir/file2.txt": ("errors error erro errors", 1),
            "subdir2/nested/file3.txt": ("an error occurred", 1),
            "subdir3/file4.txt": ("no errors here", 0),
            "subdir3/file5.txt": ("Error and ERROR and error", 3),
        }
        expected = {
            self.create_file(rel, text): count
            for rel, (text, count) in specs.items()
        }
        self.assertEqual(f_657(self.test_dir), expected)
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import pandas as pd
2+
from sklearn.linear_model import LinearRegression
3+
4+
5+
def f_661(file_path, output_path=None, sort_key='title', linear_regression=False, x_column=None, y_column=None):
    """
    Sorts a CSV file by a specific column key using pandas, and optionally writes the sorted data to another CSV file.
    Can also fit a linear regression model to specified columns if required.

    Parameters:
    file_path (str): The path to the input CSV file. This parameter is required.
    output_path (str): The path where the sorted CSV will be saved. If not provided, the function won't save the sorted dataframe.
    sort_key (str): The column name used as a key to sort the CSV file. Defaults to 'title'.
    linear_regression (bool): If True, fits a linear regression model to the specified columns and returns it
        (output_path is ignored in that case). Defaults to False.
    x_column (str): The name of the column to use as the predictor variable for linear regression.
    y_column (str): The name of the column to use as the response variable for linear regression.

    Returns: DataFrame, str, or LinearRegression model: The sorted pandas DataFrame if 'output_path' is None and
    'linear_regression' is False, otherwise the path to the saved output file. If 'linear_regression' is True,
    returns the fitted model.

    Requirements:
    - pandas
    - scikit-learn

    Example:
    >>> model = f_661('data.csv', sort_key='title', linear_regression=True, x_column='age', y_column='salary')
    >>> # Returns a fitted LinearRegression model based on 'age' and 'salary' columns.

    Raises:
    ValueError: If the columns requested for linear regression are missing from the data.
    Exception: If there is any other error in reading, sorting the data, or fitting the model.
    """
    try:
        df = pd.read_csv(file_path)
        # Raises KeyError (wrapped below) when sort_key is not a column.
        df.sort_values(by=[sort_key], inplace=True)

        if linear_regression:
            if x_column not in df.columns or y_column not in df.columns:
                raise ValueError("Specified columns for linear regression do not exist in the dataframe")
            # X must be 2-D (a one-column frame); y is a 1-D series.
            model = LinearRegression().fit(df[[x_column]], df[y_column])
            return model

        if output_path:
            df.to_csv(output_path, index=False)
            return output_path
        return df
    except ValueError:
        # Let the column-validation error propagate unwrapped so callers can
        # catch ValueError specifically (it is still an Exception subclass,
        # so existing broad handlers keep working).
        raise
    except Exception as e:
        # Chain with `from e` so the original traceback is preserved.
        raise Exception(f"Error while processing the file: {str(e)}") from e
53+
54+
import unittest
55+
import pandas as pd
56+
import numpy as np
57+
import os
58+
import shutil
59+
import tempfile
60+
class TestCases(unittest.TestCase):
    """Unit tests for f_661."""

    def setUp(self):
        # Temporary workspace plus a small unsorted CSV fixture.
        self.test_dir = tempfile.mkdtemp()
        self.test_csv_path = os.path.join(self.test_dir, 'test_data.csv')
        frame = pd.DataFrame({
            'title': ['Book C', 'Book A', 'Book B'],
            'x': [1, 2, 3],
            'y': [5, 7, 9],
        })
        frame.to_csv(self.test_csv_path, index=False)

    def tearDown(self):
        # Discard the workspace and all files created inside it.
        shutil.rmtree(self.test_dir)

    def test_valid_input_no_output_path(self):
        # No output path -> a DataFrame sorted ascending by the key.
        result = f_661(self.test_csv_path, sort_key='title')
        self.assertIsInstance(result, pd.DataFrame)
        self.assertTrue(result['title'].is_monotonic_increasing)

    def test_invalid_file_path(self):
        # A missing input file must raise.
        bogus = os.path.join(self.test_dir, 'non_existent.csv')
        with self.assertRaises(Exception):
            f_661(bogus)

    def test_invalid_sort_key(self):
        # Sorting by a nonexistent column must raise.
        with self.assertRaises(Exception):
            f_661(self.test_csv_path, sort_key='non_existent_column')

    def test_output_data_saving(self):
        # With an output path, the path is returned and the file is written.
        destination = os.path.join(self.test_dir, 'sorted_data.csv')
        returned = f_661(self.test_csv_path, output_path=destination, sort_key='title')
        self.assertEqual(returned, destination)
        self.assertTrue(os.path.exists(destination))
        self.assertGreater(os.stat(destination).st_size, 0)

    def test_linear_regression_functionality(self):
        # y = 2x + 3 exactly, so the fit should recover those coefficients.
        model = f_661(self.test_csv_path, linear_regression=True, x_column='x', y_column='y')
        self.assertIsInstance(model, LinearRegression)
        np.testing.assert_almost_equal(model.coef_, [2], decimal=1)
        np.testing.assert_almost_equal(model.intercept_, 3, decimal=1)

    def test_linear_regression_error_on_invalid_columns(self):
        # Missing regression columns surface the validation message.
        with self.assertRaises(Exception) as context:
            f_661(self.test_csv_path, linear_regression=True,
                  x_column='nonexistent', y_column='title')
        self.assertIn("Specified columns for linear regression do not exist in the dataframe",
                      str(context.exception))

0 commit comments

Comments
 (0)