-
Notifications
You must be signed in to change notification settings - Fork 541
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Enhance: Fix Data Quality with Outlier Handling and Improved Missing …
…Value Treatment (#207) * add OutlierTransformer, disable DiscreteTransformer * update NonValueTransformer * add docstring for OutlierTransformer * add new docstring for NonValueTransformer * Update data_transformer.py * add testcases for OutlierTransformer * Update outlier.py * update outlier testcases * update param name in regx inspector * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix func name typo * fix NonValueTransformer fit bug in testcases * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
- Loading branch information
1 parent
31eefae
commit 16825af
Showing
9 changed files
with
265 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Any | ||
|
||
from pandas import DataFrame | ||
|
||
from sdgx.data_models.metadata import Metadata | ||
from sdgx.data_processors.extension import hookimpl | ||
from sdgx.data_processors.transformers.base import Transformer | ||
from sdgx.utils import logger | ||
|
||
|
||
class OutlierTransformer(Transformer):
    """
    A transformer class to handle outliers in the data by converting them to specified fill values.

    An "outlier" here is any cell in a declared integer/float column that cannot be
    cast with ``int()`` / ``float()`` (for example a stray string or ``None``).

    Attributes:
        int_columns (set): A set of column names that contain integer values.
        int_outlier_fill_value (int): The value to fill in for outliers in integer columns. Default is 0.
        float_columns (set): A set of column names that contain float values.
        float_outlier_fill_value (float): The value to fill in for outliers in float columns. Default is 0.
    """

    int_columns: set = set()
    """
    set: A set of column names that contain integer values. These columns will have their outliers replaced by `int_outlier_fill_value`.
    """

    int_outlier_fill_value = 0
    """
    int: The value to fill in for outliers in integer columns. Default is 0.
    """

    float_columns: set = set()
    """
    set: A set of column names that contain float values. These columns will have their outliers replaced by `float_outlier_fill_value`.
    """

    float_outlier_fill_value = float(0)
    """
    float: The value to fill in for outliers in float columns. Default is 0.
    """

    def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
        """
        Fit method for the transformer.

        Records the names of integer and float columns from the metadata.

        Args:
            metadata (Metadata | None): The metadata object containing column type information.
            **kwargs: Additional keyword arguments (unused).

        Raises:
            ValueError: If ``metadata`` is None; the column lists cannot be determined without it.
        """
        if metadata is None:
            # Fail with an explicit message instead of an AttributeError on None.
            raise ValueError("OutlierTransformer.fit requires a Metadata object, got None.")

        self.int_columns = metadata.int_columns
        self.float_columns = metadata.float_columns

        self.fitted = True

        logger.info("OutlierTransformer Fitted.")

    @staticmethod
    def _cast_or_fill(cast, fill_value):
        """
        Build a per-cell converter that applies ``cast`` and falls back to ``fill_value``.

        Args:
            cast: Callable used to convert one cell (``int`` or ``float``).
            fill_value: Value returned when ``cast`` rejects the cell.

        Returns:
            A one-argument function suitable for ``Series.apply``.
        """

        def _convert(value):
            try:
                return cast(value)
            # ValueError: non-numeric strings, int(NaN).
            # TypeError: None and other non-castable objects.
            # OverflowError: int(inf), float() of an integer too large for a double.
            except (ValueError, TypeError, OverflowError):
                return fill_value

        return _convert

    def convert(self, raw_data: DataFrame) -> DataFrame:
        """
        Convert method to handle outliers in the input data by replacing them with specified fill values.

        Note that the columns are assigned back onto ``raw_data`` itself, so the
        caller's DataFrame is modified and the same object is returned.

        Args:
            raw_data (DataFrame): The input DataFrame containing the data to be processed.

        Returns:
            DataFrame: The processed DataFrame with outliers replaced by fill values.
        """
        res = raw_data

        logger.info("Converting data using OutlierTransformer...")

        # Dealing with the integer value columns
        convert_to_int = self._cast_or_fill(int, self.int_outlier_fill_value)
        for each_col in self.int_columns:
            res[each_col] = res[each_col].apply(convert_to_int)

        # Dealing with the float value columns
        convert_to_float = self._cast_or_fill(float, self.float_outlier_fill_value)
        for each_col in self.float_columns:
            res[each_col] = res[each_col].apply(convert_to_float)

        logger.info("Converting data using OutlierTransformer... Finished.")

        return res

    def reverse_convert(self, processed_data: DataFrame) -> DataFrame:
        """
        Reverse_convert method for the transformer (No action for OutlierTransformer).

        Args:
            processed_data (DataFrame): The processed DataFrame.

        Returns:
            DataFrame: The same processed DataFrame.
        """
        logger.info("Data reverse-converted by OutlierTransformer (No Action).")

        return processed_data
|
||
|
||
@hookimpl
def register(manager):
    """
    Plugin hook: make OutlierTransformer available through the given manager.

    Args:
        manager: The manager object; its ``register`` method is called with the
            transformer's name and class.
    """
    transformer_name = "OutlierTransformer"
    transformer_cls = OutlierTransformer
    manager.register(transformer_name, transformer_cls)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
75 changes: 75 additions & 0 deletions
75
tests/data_processors/transformers/test_transformers_outlier.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import numpy as np | ||
import pandas as pd | ||
import pytest | ||
|
||
from sdgx.data_models.metadata import Metadata | ||
from sdgx.data_processors.transformers.outlier import OutlierTransformer | ||
|
||
|
||
@pytest.fixture
def outlier_test_df():
    """
    Build a 1000-row DataFrame whose numeric columns contain non-numeric outliers.

    Columns: ``int_id`` (0..999), ``str_id`` ("id_<n>"), ``int_random`` (ints in
    [0, 100)) and ``float_random`` (floats in [0, 100)). In 10% of the rows both
    random columns are overwritten with the string "not_number_outlier".

    Yields:
        pd.DataFrame: The generated frame.
    """
    row_cnt = 1000
    outlier_marker = "not_number_outlier"

    # Local seeded generator: reproducible data without touching numpy's global state.
    rng = np.random.default_rng(seed=0)

    df = pd.DataFrame(
        {
            "int_id": list(range(row_cnt)),
            "str_id": [f"id_{i}" for i in range(row_cnt)],
            "int_random": rng.integers(100, size=row_cnt),
            "float_random": rng.uniform(0, 100, size=row_cnt),
        }
    )

    # Writing a string into an int64/float64 column relies on silent upcasting,
    # which pandas deprecated; make the target columns object dtype up front.
    df = df.astype({"int_random": object, "float_random": object})

    # Introduce outliers (the default RangeIndex makes positions valid labels).
    outlier_indices = rng.choice(row_cnt, size=int(row_cnt * 0.1), replace=False)
    df.loc[outlier_indices, ["int_random", "float_random"]] = outlier_marker

    yield df
|
||
|
||
def test_outlier_handling_test_df(outlier_test_df: pd.DataFrame):
    """
    Test that OutlierTransformer replaces non-numeric cells with its fill values.

    Parameters:
        outlier_test_df (pd.DataFrame): Frame whose ``int_random`` and
            ``float_random`` columns contain "not_number_outlier" strings.

    Raises:
        AssertionError: If the DataFrame does not handle outliers as expected.
    """
    outlier_marker = "not_number_outlier"

    # Record where the outliers are BEFORE converting, so the replacement can be
    # checked at those exact positions afterwards.
    int_outlier_mask = outlier_test_df["int_random"] == outlier_marker
    float_outlier_mask = outlier_test_df["float_random"] == outlier_marker
    assert int_outlier_mask.any()
    assert float_outlier_mask.any()

    # Initialize the OutlierTransformer.
    outlier_transformer = OutlierTransformer()
    # Check if the transformer has not been fitted yet.
    assert outlier_transformer.fitted is False

    # Fit the transformer with the DataFrame.
    metadata = Metadata.from_dataframe(outlier_test_df)
    metadata.int_columns = {"int_id", "int_random"}
    metadata.float_columns = {"float_random"}
    outlier_transformer.fit(metadata=metadata)
    # Check if the transformer has been fitted after the fit operation.
    assert outlier_transformer.fitted

    # Transform the DataFrame using the transformer.
    transformed_df = outlier_transformer.convert(outlier_test_df)

    # Check if the transformed DataFrame does not contain any outliers.
    assert outlier_marker not in transformed_df["int_random"].to_list()
    assert outlier_marker not in transformed_df["float_random"].to_list()

    # Check if the outliers have been replaced with the specified fill values.
    # A bare `0 in column` check could pass by chance on random data, so verify
    # every former outlier position explicitly.
    int_fill = outlier_transformer.int_outlier_fill_value
    float_fill = outlier_transformer.float_outlier_fill_value
    assert (transformed_df.loc[int_outlier_mask, "int_random"] == int_fill).all()
    assert (transformed_df.loc[float_outlier_mask, "float_random"] == float_fill).all()