Skip to content

Commit

Permalink
Enhance: Fix Data Quality with Outlier Handling and Improved Missing …
Browse files Browse the repository at this point in the history
…Value Treatment (#207)

* add OutlierTransformer, disable DiscreteTransformer

* update NonValueTransformer

* add docstring for OutlierTransformer

* add new docstring for NonValueTransformer

* Update data_transformer.py

* add testcases for OutlierTransformer

* Update outlier.py

* update outlier testcases

* update param name in regex inspector

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix func name typo

* fix NonValueTransformer fit bug in testcases

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
MooooCat and pre-commit-ci[bot] committed Jul 29, 2024
1 parent 31eefae commit 16825af
Show file tree
Hide file tree
Showing 9 changed files with 265 additions and 24 deletions.
6 changes: 3 additions & 3 deletions sdgx/data_models/inspectors/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,16 +82,16 @@ def __init__(
if match_percentage:
self.match_percentage = match_percentage

def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
def fit(self, input_raw_data: pd.DataFrame, *args, **kwargs):
"""Fit the inspector.
Finds the list of regex columns from the tabular data (in pd.DataFrame).
Args:
raw_data (pd.DataFrame): Raw data
"""
for each_col in raw_data.columns:
each_match_rate = self._fit_column(raw_data[each_col])
for each_col in input_raw_data.columns:
each_match_rate = self._fit_column(input_raw_data[each_col])
if each_match_rate > self.match_percentage:
self.regex_columns.add(each_col)

Expand Down
1 change: 1 addition & 0 deletions sdgx/data_processors/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class DataProcessorManager(Manager):
p.lower()
for p in [
"NonValueTransformer",
"OutlierTransformer",
"EmailGenerator",
"ChnPiiGenerator",
"IntValueFormatter",
Expand Down
2 changes: 2 additions & 0 deletions sdgx/data_processors/transformers/discrete.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,8 @@ def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame:
pass


"""
@hookimpl
def register(manager):
manager.register("DiscreteTransformer", DiscreteTransformer)
"""
75 changes: 56 additions & 19 deletions sdgx/data_processors/transformers/nan.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,46 @@

class NonValueTransformer(Transformer):
"""
A transformer class for handling missing values in a DataFrame.
This class provides functionality to either drop rows with missing values or fill them with a specified value.
A transformer class designed to handle missing values in a DataFrame. It can either drop rows with missing values or fill them with specified values.
Attributes:
fill_na_value (int): The value to fill missing values in the data.
drop_na (bool): A boolean flag indicating whether to drop rows with missing values or fill them with `fill_na_value`.
int_columns (set): A set of column names that contain integer values.
float_columns (set): A set of column names that contain float values.
column_list (list): A list of all column names in the DataFrame.
fill_na_value_int (int): The value to fill missing integer values with. Default is 0.
fill_na_value_float (float): The value to fill missing float values with. Default is 0.0.
fill_na_value_default (str): The value to fill missing values for non-numeric columns with. Default is 'NAN_VALUE'.
drop_na (bool): A flag indicating whether to drop rows with missing values. If True, rows with missing values are dropped. If False, missing values are filled with specified values. Default is False.
"""

Methods:
fit(metadata: Metadata | None = None, **kwargs: dict[str, Any]): Fit method for the transformer.
convert(raw_data: DataFrame) -> DataFrame: Convert method to handle missing values in the input data.
reverse_convert(processed_data: DataFrame) -> DataFrame: Reverse_convert method for the transformer.
int_columns: set = set()
"""
A set of column names that contain integer values.
"""

fill_na_value = 0
float_columns: set = set()
"""
A set of column names that contain float values.
"""
The value to fill missing values in the data.

If `drop_na` is set to `False`, this value will be used to fill missing values in the data.
column_list: list = []
"""
A list of all column names in the DataFrame.
"""

fill_na_value_int = 0
"""
The value to fill missing integer values with. Default is 0.
"""

fill_na_value_float = 0.0
"""
The value to fill missing float values with. Default is 0.0.
"""

fill_na_value_default = "NAN_VALUE"
"""
The value to fill missing values for non-numeric columns with. Default is 'NAN_VALUE'.
"""

drop_na = False
Expand All @@ -46,16 +67,19 @@ class NonValueTransformer(Transformer):
def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
"""
Fit method for the transformer.
Does not require any action.
"""
logger.info("NonValueTransformer Fitted.")

for key, value in kwargs.items():
if key == "fill_na_value":
if key == "drop_na":
if not isinstance(value, str):
raise ValueError("fill_na_value must be of type <str>")
self.fill_na_value = value
self.drop_na = value

# record numeric columns
self.int_columns = metadata.int_columns
self.float_columns = metadata.float_columns
self.column_list = metadata.column_list

self.fitted = True

Expand All @@ -67,9 +91,22 @@ def convert(self, raw_data: DataFrame) -> DataFrame:
logger.info("Converting data using NonValueTransformer...")

if self.drop_na:
res = raw_data.dropna()
else:
res = raw_data.fillna(value=self.fill_na_value)
logger.info("Converting data using NonValueTransformer... Finished (Drop NA).")
return raw_data.dropna()

res = raw_data

# fill numeric nan value
for each_col in self.int_columns:
res[each_col] = res[each_col].fillna(self.fill_na_value_int)
for each_col in self.float_columns:
res[each_col] = res[each_col].fillna(self.fill_na_value_float)

# fill other non-numeric nan value
for each_col in self.column_list:
if each_col in self.int_columns or each_col in self.float_columns:
continue
res[each_col] = res[each_col].fillna(self.fill_na_value_default)

logger.info("Converting data using NonValueTransformer... Finished.")

Expand Down
122 changes: 122 additions & 0 deletions sdgx/data_processors/transformers/outlier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
from __future__ import annotations

from typing import Any

from pandas import DataFrame

from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.extension import hookimpl
from sdgx.data_processors.transformers.base import Transformer
from sdgx.utils import logger


class OutlierTransformer(Transformer):
"""
A transformer class to handle outliers in the data by converting them to specified fill values.
Attributes:
int_columns (set): A set of column names that contain integer values.
int_outlier_fill_value (int): The value to fill in for outliers in integer columns. Default is 0.
float_columns (set): A set of column names that contain float values.
float_outlier_fill_value (float): The value to fill in for outliers in float columns. Default is 0.
"""

int_columns: set = set()
"""
set: A set of column names that contain integer values. These columns will have their outliers replaced by `int_outlier_fill_value`.
"""

int_outlier_fill_value = 0
"""
int: The value to fill in for outliers in integer columns. Default is 0.
"""

float_columns: set = set()
"""
set: A set of column names that contain float values. These columns will have their outliers replaced by `float_outlier_fill_value`.
"""

float_outlier_fill_value = float(0)
"""
float: The value to fill in for outliers in float columns. Default is 0.
"""

def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
"""
Fit method for the transformer.
Records the names of integer and float columns from the metadata.
Args:
metadata (Metadata | None): The metadata object containing column type information.
**kwargs: Additional keyword arguments.
"""
self.int_columns = metadata.int_columns
self.float_columns = metadata.float_columns

self.fitted = True

logger.info("OutlierTransformer Fitted.")

def convert(self, raw_data: DataFrame) -> DataFrame:
"""
Convert method to handle outliers in the input data by replacing them with specified fill values.
Args:
raw_data (DataFrame): The input DataFrame containing the data to be processed.
Returns:
DataFrame: The processed DataFrame with outliers replaced by fill values.
"""
res = raw_data

logger.info("Converting data using OutlierTransformer...")

# Dealing with the integer value columns
def convert_to_int(value):
try:
return int(value)
except ValueError:
return self.int_outlier_fill_value

for each_col in self.int_columns:
res[each_col] = res[each_col].apply(convert_to_int)

# Dealing with the float value columns
def convert_to_float(value):
try:
return float(value)
except ValueError:
return self.float_outlier_fill_value

for each_col in self.float_columns:
res[each_col] = res[each_col].apply(convert_to_float)

logger.info("Converting data using OutlierTransformer... Finished.")

return res

def reverse_convert(self, processed_data: DataFrame) -> DataFrame:
"""
Reverse_convert method for the transformer (No action for OutlierTransformer).
Args:
processed_data (DataFrame): The processed DataFrame.
Returns:
DataFrame: The same processed DataFrame.
"""
logger.info("Data reverse-converted by OutlierTransformer (No Action).")

return processed_data


@hookimpl
def register(manager):
    """
    Plugin hook that exposes OutlierTransformer through the given manager.

    Args:
        manager: Object whose ``register(name, cls)`` method is called with the
            processor name and the OutlierTransformer class.
    """
    processor_name = "OutlierTransformer"
    manager.register(processor_name, OutlierTransformer)
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ def transform(self, dataloader: DataLoader) -> NDArrayLoader:
def _inverse_transform_continuous(self, column_transform_info, column_data, sigmas, st):
gm = column_transform_info.transform
data = pd.DataFrame(column_data[:, :2], columns=list(gm.get_output_sdtypes()))
data = data.astype(float)
data.iloc[:, 1] = np.argmax(column_data[:, 1:], axis=1)
if sigmas is not None:
selected_normalized_value = np.random.normal(data.iloc[:, 0], sigmas[st])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def test_empty_data(raw_data: pd.DataFrame):
yield raw_data


def test_nan_handling_test_df(test_empty_data: pd.DataFrame):
def test_empty_handling_test_df(test_empty_data: pd.DataFrame):
"""
Test the handling of empty columns in a DataFrame.
This function tests the behavior of a DataFrame when it contains empty columns.
Expand Down
5 changes: 4 additions & 1 deletion tests/data_processors/transformers/test_transformers_nan.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
import pytest

from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.transformers.nan import NonValueTransformer


Expand Down Expand Up @@ -73,8 +74,10 @@ def test_nan_handling_test_df(nan_test_df: pd.DataFrame):
# Check if the transformer has not been fitted yet.
assert nan_transformer.fitted is False

nan_csv_metadata = Metadata.from_dataframe(nan_test_df)

# Fit the transformer with the DataFrame.
nan_transformer.fit(nan_test_df)
nan_transformer.fit(nan_csv_metadata)
# Check if the transformer has been fitted after the fit operation.
assert nan_transformer.fitted

Expand Down
75 changes: 75 additions & 0 deletions tests/data_processors/transformers/test_transformers_outlier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import numpy as np
import pandas as pd
import pytest

from sdgx.data_models.metadata import Metadata
from sdgx.data_processors.transformers.outlier import OutlierTransformer


@pytest.fixture
def outlier_test_df():
    """
    Build a 1000-row DataFrame whose numeric columns contain string outliers.

    Columns are ``int_id``, ``str_id``, ``int_random`` and ``float_random``. In a random
    10% of the rows, ``int_random`` and ``float_random`` are overwritten with the string
    ``"not_number_outlier"``.
    """
    row_cnt = 1000

    df = pd.DataFrame(
        {
            "int_id": list(range(row_cnt)),
            "str_id": [f"id_{i}" for i in range(row_cnt)],
            "int_random": np.random.randint(100, size=row_cnt),
            "float_random": np.random.uniform(0, 100, size=row_cnt),
        }
    )

    # Strings cannot be stored in int64/float64 columns without an implicit upcast,
    # which pandas deprecated (PDEP-6). Make the two target columns object-typed
    # explicitly before injecting the outliers.
    df = df.astype({"int_random": object, "float_random": object})

    # Introduce outliers in 10% of the rows (labels equal positions on a RangeIndex).
    outlier_indices = np.random.choice(row_cnt, size=int(row_cnt * 0.1), replace=False)
    df.loc[outlier_indices, ["int_random", "float_random"]] = "not_number_outlier"

    yield df


def test_outlier_handling_test_df(outlier_test_df: pd.DataFrame):
    """
    Check that OutlierTransformer replaces string outliers in numeric columns.

    The fixture frame must contain the outlier marker in both ``int_random`` and
    ``float_random`` before conversion; afterwards the marker must be gone and the
    fill values (0 and 0.0) must be present instead.

    Parameters:
        outlier_test_df (pd.DataFrame): The DataFrame to test.
    """
    marker = "not_number_outlier"

    # Precondition: the fixture really contains outliers.
    assert marker in outlier_test_df["int_random"].to_list()
    assert marker in outlier_test_df["float_random"].to_list()

    # A fresh transformer starts out unfitted.
    outlier_transformer = OutlierTransformer()
    assert outlier_transformer.fitted is False

    # Column types are set by hand on the metadata built from the frame.
    metadata = Metadata.from_dataframe(outlier_test_df)
    metadata.int_columns = {"int_id", "int_random"}
    metadata.float_columns = {"float_random"}
    outlier_transformer.fit(metadata=metadata)
    assert outlier_transformer.fitted

    transformed_df = outlier_transformer.convert(outlier_test_df)
    int_values = transformed_df["int_random"].to_list()
    float_values = transformed_df["float_random"].to_list()

    # The outlier marker is gone from both columns ...
    assert marker not in int_values
    assert marker not in float_values

    # ... and has been replaced by the default fill values.
    assert 0 in int_values
    assert 0.0 in float_values

0 comments on commit 16825af

Please sign in to comment.