Skip to content

Commit 72971d4

Browse files
committed
run eval script
1 parent f029129 commit 72971d4

15 files changed

+1689
-2
lines changed

CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -219,11 +219,11 @@ pip install -r requirements.txt
219219
If you notice any third-party libraries that are not included in the `requirements.txt` file but used in the `data/process.py` file, please add them with the compatible versions to the `requirements.txt` file.
220220

221221
### How to Validate Data?
222-
We build a GitHub action to validate the data. The action is based on the `script/bash.sh`. Specifically, any refined data will be copied to the `data/clean` folder and then parsed based on `script/parser.py`. The parsed data will be stored in the `data/processed` folder. The parsed data will be separate into two splits for `pytest`. The first split will be validated by running `pytest $FILE_NAME` and the second split will be validated by running `pytest --doctest-modules $FILE_NAME`. Please note that we validate each file separately, as `pytest` may fail unexpectedly when validating all files at once.
222+
We build a GitHub action to validate the data. The action is based on the `script/run.sh`. Specifically, any refined data will be copied to the `data/clean` folder and then parsed based on `script/parser.py`. The parsed data will be stored in the `data/processed` folder. The parsed data will be separated into two splits for `pytest`. The first split will be validated by running `pytest $FILE_NAME` and the second split will be validated by running `pytest --doctest-modules $FILE_NAME`. Please note that we validate each file separately, as `pytest` may fail unexpectedly when validating all files at once.
223223

224224
If you want to validate the data locally, you can run the following command:
225225
```bash
226-
sh script/bash.sh
226+
sh script/run.sh
227227
```
228228

229229
If you find any failed test cases, please fix the data in the `data/raw` folder based on the failed problem IDs. The refinement should be based on the [How to Refine Data?](#how-to-refine-data) section.
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
import re
2+
import os
3+
import glob
4+
5+
6+
def f_657(dir_path):
    """
    Search for occurrences of the word "error" in all text files within a
    specified directory and its subdirectories.

    Parameters:
    dir_path (str): The path of the directory.

    Returns:
    dict: A dictionary with relative file paths as keys and the count of
    occurrences of the word "error" as values.

    Raises:
    - ValueError: If directory in dir_path does not exist.

    Requirements:
    - re: For regex pattern matching.
    - os: For retrieving relative file paths.
    - glob: For fetching all text file paths in the directory.

    The function specifically searches for the word "error" in text files
    (with the extension ".txt").
    This function is NOT case sensitive, e.g. also "ERROr" will be counted.

    Example:
    >>> f_657("/path/to/directory")
    {'file1.txt': 2, 'subdir/file2.txt': 1}
    """
    if not os.path.isdir(dir_path):
        raise ValueError("Specified directory does not exist.")

    # Compile once instead of per file; \b keeps "errors"/"erro" from matching.
    pattern = re.compile(r'\berror\b', re.IGNORECASE)
    # Build the recursive pattern with os.path.join rather than a hard-coded
    # '/' so the glob is portable across path separators.
    search_pattern = os.path.join(dir_path, '**', '*.txt')

    result = {}
    for file_path in glob.glob(search_pattern, recursive=True):
        # Read then close promptly; match outside the `with` block.
        with open(file_path, 'r') as file:
            content = file.read()
        # Always record the file, even when the count is 0.
        result[os.path.relpath(file_path, dir_path)] = len(pattern.findall(content))

    return result
51+
52+
import unittest
53+
import os
54+
import shutil
55+
import tempfile
56+
class TestCases(unittest.TestCase):
    """Unit tests for f_657."""

    def setUp(self):
        # Fresh sandbox directory for every test.
        self.test_dir = tempfile.mkdtemp()

    def tearDown(self):
        # Discard the sandbox and all files created inside it.
        shutil.rmtree(self.test_dir)

    def create_file(self, sub_path, content=""):
        """Write *content* under test_dir/sub_path (creating parent dirs)
        and return the normalized relative path for use as an expected key."""
        target = os.path.join(self.test_dir, sub_path)
        os.makedirs(os.path.dirname(target), exist_ok=True)
        with open(target, 'w') as handle:
            handle.write(content)
        return os.path.normpath(sub_path)

    def test_non_existent(self):
        # A directory that does not exist must raise ValueError.
        missing = os.path.join(self.test_dir, "non_existent")
        with self.assertRaises(ValueError):
            f_657(missing)

    def test_empty_folder(self):
        # No .txt files at all -> an empty mapping.
        self.assertEqual(f_657(self.test_dir), {})

    def test_files_with_errors(self):
        # Files with varying counts of 'error', including an empty file.
        specs = {
            "1.txt": ("error\nERROR\nErrOr", 3),
            "subfolder1/2.txt": ("", 0),
            "subfolder2/3.txt": ("error\nerror error", 3),
        }
        expected = {}
        for rel, (text, count) in specs.items():
            expected[self.create_file(rel, text)] = count
        self.assertEqual(f_657(self.test_dir), expected)

    def test_case_sensitive_and_realistic_text(self):
        # Deeply nested directory; mixed-case hits all count.
        key = self.create_file('nested/folder1/folder2/error_log.txt',
                               'Error\nerror\nERROR')
        self.assertEqual(f_657(self.test_dir), {key: 3})

    def test_exact_word_matching(self):
        # Only the exact word 'error' counts; 'errors'/'erro' do not.
        specs = {
            "file1.txt": ("error error error", 3),
            "subdir/file2.txt": ("errors error erro errors", 1),
            "subdir2/nested/file3.txt": ("an error occurred", 1),
            "subdir3/file4.txt": ("no errors here", 0),
            "subdir3/file5.txt": ("Error and ERROR and error", 3),
        }
        expected = {
            self.create_file(rel, text): count
            for rel, (text, count) in specs.items()
        }
        self.assertEqual(f_657(self.test_dir), expected)
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import pandas as pd
2+
from sklearn.linear_model import LinearRegression
3+
4+
5+
def f_661(file_path, output_path=None, sort_key='title', linear_regression=False, x_column=None, y_column=None):
    """
    Sorts a CSV file by a specific column key using pandas, and optionally writes the sorted data to another CSV file.
    Can also fit a linear regression model to specified columns if required.

    Parameters:
    file_path (str): The path to the input CSV file. This parameter is required.
    output_path (str): The path where the sorted CSV will be saved. If not provided, the function won't save the sorted dataframe.
    sort_key (str): The column name used as a key to sort the CSV file. Defaults to 'title'.
    linear_regression (bool): If True, fits a linear regression model to the specified columns and returns it
        (output_path is ignored in that case). Defaults to False.
    x_column (str): The name of the column to use as the predictor variable for linear regression.
    y_column (str): The name of the column to use as the response variable for linear regression.

    Returns: DataFrame, str, or LinearRegression model: The sorted pandas DataFrame if 'output_path' is None and
    'linear_regression' is False, otherwise the path to the saved output file. If 'linear_regression' is True,
    returns the fitted model.

    Requirements:
    - pandas
    - scikit-learn

    Example:
    >>> model = f_661('data.csv', sort_key='title', linear_regression=True, x_column='age', y_column='salary')
    >>> # Returns a fitted LinearRegression model based on 'age' and 'salary' columns.

    Raises:
    ValueError: If the columns requested for linear regression are missing from the data.
    Exception: If there is any other error in reading, sorting the data, or fitting the model.
    """
    try:
        df = pd.read_csv(file_path)
        # Raises KeyError (wrapped below) when sort_key is not a column.
        df.sort_values(by=[sort_key], inplace=True)

        if linear_regression:
            if x_column not in df.columns or y_column not in df.columns:
                raise ValueError("Specified columns for linear regression do not exist in the dataframe")
            # X must be 2-D (a one-column frame); y is a 1-D series.
            model = LinearRegression().fit(df[[x_column]], df[y_column])
            return model

        if output_path:
            df.to_csv(output_path, index=False)
            return output_path
        return df
    except ValueError:
        # Let the column-validation error propagate unwrapped so callers can
        # catch ValueError specifically (it is still an Exception subclass,
        # so existing broad handlers keep working).
        raise
    except Exception as e:
        # Chain with `from e` so the original traceback is preserved.
        raise Exception(f"Error while processing the file: {str(e)}") from e
53+
54+
import unittest
55+
import pandas as pd
56+
import numpy as np
57+
import os
58+
import shutil
59+
import tempfile
60+
class TestCases(unittest.TestCase):
    """Unit tests for f_661."""

    def setUp(self):
        # Temporary workspace plus a small unsorted CSV fixture.
        self.test_dir = tempfile.mkdtemp()
        self.test_csv_path = os.path.join(self.test_dir, 'test_data.csv')
        frame = pd.DataFrame({
            'title': ['Book C', 'Book A', 'Book B'],
            'x': [1, 2, 3],
            'y': [5, 7, 9],
        })
        frame.to_csv(self.test_csv_path, index=False)

    def tearDown(self):
        # Discard the workspace and all files created inside it.
        shutil.rmtree(self.test_dir)

    def test_valid_input_no_output_path(self):
        # No output path -> a DataFrame sorted ascending by the key.
        result = f_661(self.test_csv_path, sort_key='title')
        self.assertIsInstance(result, pd.DataFrame)
        self.assertTrue(result['title'].is_monotonic_increasing)

    def test_invalid_file_path(self):
        # A missing input file must raise.
        bogus = os.path.join(self.test_dir, 'non_existent.csv')
        with self.assertRaises(Exception):
            f_661(bogus)

    def test_invalid_sort_key(self):
        # Sorting by a nonexistent column must raise.
        with self.assertRaises(Exception):
            f_661(self.test_csv_path, sort_key='non_existent_column')

    def test_output_data_saving(self):
        # With an output path, the path is returned and the file is written.
        destination = os.path.join(self.test_dir, 'sorted_data.csv')
        returned = f_661(self.test_csv_path, output_path=destination, sort_key='title')
        self.assertEqual(returned, destination)
        self.assertTrue(os.path.exists(destination))
        self.assertGreater(os.stat(destination).st_size, 0)

    def test_linear_regression_functionality(self):
        # y = 2x + 3 exactly, so the fit should recover those coefficients.
        model = f_661(self.test_csv_path, linear_regression=True, x_column='x', y_column='y')
        self.assertIsInstance(model, LinearRegression)
        np.testing.assert_almost_equal(model.coef_, [2], decimal=1)
        np.testing.assert_almost_equal(model.intercept_, 3, decimal=1)

    def test_linear_regression_error_on_invalid_columns(self):
        # Missing regression columns surface the validation message.
        with self.assertRaises(Exception) as context:
            f_661(self.test_csv_path, linear_regression=True,
                  x_column='nonexistent', y_column='title')
        self.assertIn("Specified columns for linear regression do not exist in the dataframe",
                      str(context.exception))

0 commit comments

Comments
 (0)