-
Notifications
You must be signed in to change notification settings - Fork 545
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Enhance Numeric Data Inspection and Introduce Positive/Negative Filte…
…ring (#217) * add new py files * update numeric inspector to support pos and neg * Delete positive.py * Create positive_negative.py * Update positive_negative.py * add test cases in test_filters_pos_neg.py * Update manager.py * translate comments * translate comments * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix fitted flag in PositiveNegativeFilter * Update numeric.py include zero * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix error in testcase * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update NumericInspector according to Wayland's Review * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
- Loading branch information
1 parent
4a28d1a
commit 9a34789
Showing
7 changed files
with
348 additions
and
65 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from __future__ import annotations | ||
|
||
from sdgx.data_processors.base import DataProcessor | ||
|
||
|
||
class Filter(DataProcessor):
    """
    Common parent class of every data filter.

    A filter applies rules to sampled data and removes the records that do
    not conform to those rules.
    """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Any | ||
|
||
import pandas as pd | ||
|
||
from sdgx.data_models.metadata import Metadata | ||
from sdgx.data_processors.extension import hookimpl | ||
from sdgx.data_processors.filter.base import Filter | ||
from sdgx.utils import logger | ||
|
||
|
||
class PositiveNegativeFilter(Filter):
    """
    A data processor for filtering positive and negative values.

    This filter is used to ensure that values in specific columns remain positive or negative.
    During the reverse conversion process, rows that do not meet the expected positivity or
    negativity will be removed.

    Attributes:
        int_columns (set): A set of column names containing integer values.
        float_columns (set): A set of column names containing float values.
        positive_columns (set): A set of column names that should contain positive values.
        negative_columns (set): A set of column names that should contain negative values.
    """

    int_columns: set = set()
    """
    A set of column names that contain integer values.
    """

    float_columns: set = set()
    """
    A set of column names that contain float values.
    """

    positive_columns: set = set()
    """
    A set of column names that are identified as containing positive numeric values.
    """

    negative_columns: set = set()
    """
    A set of column names that are identified as containing negative numeric values.
    """

    @staticmethod
    def _sign_columns(numeric_format, key: str) -> set:
        """
        Return the column names stored under ``key`` in ``numeric_format`` as a new set.

        A missing key means that no column carries that sign constraint, so an
        empty set is returned instead of letting ``KeyError`` abort the fit.
        """
        try:
            return set(numeric_format[key])
        except KeyError:
            return set()

    def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
        """
        Fit method for the data filter.

        Records the integer/float column sets and the positive/negative column
        lists found in ``metadata``; every attribute is rebound on the instance,
        so the class-level defaults are never mutated.

        Args:
            metadata: Metadata describing the table. Required, even though the
                signature allows ``None``.
            **kwargs: Accepted for interface compatibility; unused here.

        Raises:
            ValueError: If ``metadata`` is ``None``.
        """
        if metadata is None:
            # Fail with an explicit message rather than an AttributeError on None.
            raise ValueError("PositiveNegativeFilter.fit() requires a Metadata object, got None.")

        # record int and float data
        self.int_columns = metadata.int_columns
        self.float_columns = metadata.float_columns

        # record pos and neg (a missing entry is treated as "no such columns")
        self.positive_columns = self._sign_columns(metadata.numeric_format, "positive")
        self.negative_columns = self._sign_columns(metadata.numeric_format, "negative")

        self.fitted = True

        # Log only once fitting has actually succeeded.
        logger.info("PositiveNegativeFilter Fitted.")

    def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame:
        """
        Convert method for data filter (No Action).

        Returns ``raw_data`` unchanged; filtering only happens in ``reverse_convert``.
        """

        logger.info("Converting data using PositiveNegativeFilter... Finished (No Action)")

        return raw_data

    def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame:
        """
        Reverse_convert method for the pos_neg data filter.

        Keeps only the rows where every column in ``positive_columns`` is ``>= 0``
        and every column in ``negative_columns`` is ``<= 0`` (zero satisfies both).
        Constrained columns that are absent from ``processed_data`` are ignored.

        NOTE(review): with NumPy-backed numeric columns a NaN compares False on
        both checks, so rows with a missing value in a constrained column are
        dropped as well - confirm this is intended.

        Args:
            processed_data: The sampled data to filter.

        Returns:
            The rows of ``processed_data`` that satisfy all sign constraints.
        """
        logger.info(
            f"Data reverse-converted by PositiveNegativeFilter Start with Shape: {processed_data.shape}."
        )

        # Create a boolean mask to mark the rows that need to be retained
        mask = pd.Series(True, index=processed_data.index)

        # Check positive_columns
        for col in self.positive_columns:
            if col in processed_data.columns:
                mask &= processed_data[col] >= 0

        # Check negative_columns
        for col in self.negative_columns:
            if col in processed_data.columns:
                mask &= processed_data[col] <= 0

        # Apply the mask to filter the data
        filtered_data = processed_data[mask]

        logger.info(
            f"Data reverse-converted by PositiveNegativeFilter with Output Shape: {filtered_data.shape}."
        )

        return filtered_data
|
||
|
||
@hookimpl
def register(manager):
    """
    Plugin hook that registers ``PositiveNegativeFilter`` with ``manager``
    under the name ``"PositiveNegativeFilter"``.
    """
    manager.register("PositiveNegativeFilter", PositiveNegativeFilter)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.