Enhance Numeric Data Inspection and Introduce Positive/Negative Filte…

…ring (#217) * add new py files * update numeric inspector to support pos and neg * Delete positive.py * Create positive_negative.py * Update positive_negative.py * add test cases in test_filters_pos_neg.py * Update manager.py * translate comments * translate comments * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix fitted flat in PositiveNegativeFilter * Update numeric.py include zero * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix error in testcase * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update NumericInspector according to Wayland's Review * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
hitsz-ids · Aug 27, 2024 · 9a34789 · 9a34789
1 parent 4a28d1a
commit 9a34789
Show file tree

Hide file tree

Showing 7 changed files with 348 additions and 65 deletions.
diff --git a/sdgx/data_models/inspectors/numeric.py b/sdgx/data_models/inspectors/numeric.py
@@ -14,69 +14,132 @@ class NumericInspector(Inspector):
 
     This class is a subclass of `Inspector` and is designed to provide methods for inspecting
     and analyzing numeric data. It includes methods for detecting int or float data type.
+
+    Since August 2024, after determining the numeric type the inspector also checks whether each
+    column is predominantly positive or negative, which helps improve the quality of synthetic
+    data in subsequent processing.
+    """
+
+    int_columns: set = set()
+    """
+    A set of column names that contain integer values.
+    """
+
+    float_columns: set = set()
+    """
+    A set of column names that contain float values.
+    """
+
+    positive_columns: set = set()
+    """
+    A set of column names whose numeric values are predominantly positive (see `pos_threshold`).
+    """
+
+    negative_columns: set = set()
+    """
+    A set of column names whose numeric values are predominantly negative (see `negative_threshold`).
+    """
+
+    pos_threshold: float = 0.95
+    """
+    The threshold proportion of positive values in a column to consider it as a positive column.
+    """
+
+    negative_threshold: float = 0.95
+    """
+    The threshold proportion of negative values in a column to consider it as a negative column.
     """
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.int_columns: set[str] = set()
-        self.float_columns: set[str] = set()
         self._int_rate = 0.9
         self.df_length = 0
 
-    def _is_int_column(self, col_series: pd.Series):
+    def _is_int_column(self, col_series: pd.Series) -> bool:
         """
-        Determine whether a column of pd.DataFrame is of type int
-        In the original pd.DataFrame automatically updated dtype, some int types will be marked as float.
-        In fact, we can make an accurate result by getting the decimal part of the value.
+        Determine if a column contains predominantly integer values.
+
+        This method checks if the proportion of integer values in the given column
+        exceeds a predefined threshold.
 
         Args:
-            col_series (pd.Series): One single column of the raw data.
+            col_series (pd.Series): The column series to be inspected.
+
+        Returns:
+            bool: True if the column is predominantly integer, False otherwise.
         """
+        # Convert the column series to numeric values, coercing errors to NaN and dropping them
+        numeric_values = pd.to_numeric(col_series, errors="coerce").dropna()
 
-        def is_decimal_part_zero(num: float):
-            """
-            Is the decimal part == 0.0 ?
-
-            Args:
-                col_series (float): The number.
-            """
-            try:
-                decimal_part = num - int(num)
-            except ValueError:
-                return None
-            if decimal_part == 0.0:
-                return True
-            else:
-                return False
-
-        # Initialize the counter for values with zero decimal part
-        int_cnt = 0
-        col_length = self.df_length
-
-        # Iterate over each value in the series
-        for each_val in col_series:
-            decimal_zer0 = is_decimal_part_zero(each_val)
-            # If the decimal part is zero, increment the counter and continue to the next value
-            if decimal_zer0 is True:
-                int_cnt += 1
-                continue
-            # If the decimal part is not zero or not a decimal number
-            # decrease the length of the series and continue to the next value
-            if decimal_zer0 is None:
-                col_length -= 1
-                continue
-
-        # Calculate the rate of values with zero decimal part
-        if col_length <= 0:
-            int_rate = 0
-        else:
-            int_rate = int_cnt / col_length
-
-        # Check if the rate is greater than the predefined rate
-        if int_rate > self._int_rate:
-            return True
-        else:
-            return False
+        # Count how many of the numeric values are integers
+        int_cnt = (numeric_values == numeric_values.astype(int)).sum()
+
+        # Calculate the ratio of integer values to the total numeric values
+        int_rate = int_cnt / len(numeric_values)
+
+        # Return True if the integer rate is greater than the predefined threshold
+        return int_rate > self._int_rate
+
+    def _is_positive_or_negative_column(
+        self, col_series: pd.Series, threshold: float, comparison_func
+    ) -> bool:
+        """
+        Determine if a column contains predominantly positive or negative values.
+
+        This method checks if the proportion of values that satisfy a given comparison
+        function meets or exceeds the given threshold.
+
+        Args:
+            col_series (pd.Series): The column series to be inspected.
+            threshold (float): The proportion threshold for considering the column as positive or negative.
+            comparison_func (function): A function that takes a numeric value and returns a boolean.
+
+        Returns:
+            bool: True if the column satisfies the condition, False otherwise.
+        """
+        # Convert the column series to numeric values, coercing errors to NaN and dropping NaN values
+        numeric_values = pd.to_numeric(col_series, errors="coerce").dropna()
+
+        # Apply the comparison function to the numeric values and sum the results
+        count = comparison_func(numeric_values).sum()
+
+        # Calculate the proportion of values that meet the comparison criteria
+        proportion = count / len(numeric_values)
+
+        # Return True if the proportion meets or exceeds the threshold, otherwise False
+        return proportion >= threshold
+
+    def _is_positive_column(self, col_series: pd.Series) -> bool:
+        """
+        Determine if a column contains predominantly positive values.
+
+        This method checks if the proportion of positive values in the given column
+        meets or exceeds `pos_threshold`.
+
+        Args:
+            col_series (pd.Series): The column series to be inspected.
+
+        Returns:
+            bool: True if the column is predominantly positive, False otherwise.
+        """
+        return self._is_positive_or_negative_column(col_series, self.pos_threshold, lambda x: x > 0)
+
+    def _is_negative_column(self, col_series: pd.Series) -> bool:
+        """
+        Determine if a column contains predominantly negative values.
+
+        This method checks if the proportion of negative values in the given column
+        meets or exceeds `negative_threshold`.
+
+        Args:
+            col_series (pd.Series): The column series to be inspected.
+
+        Returns:
+            bool: True if the column is predominantly negative, False otherwise.
+        """
+        return self._is_positive_or_negative_column(
+            col_series, self.negative_threshold, lambda x: x < 0
+        )
 
     def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
         """Fit the inspector.
@@ -87,33 +150,48 @@ def fit(self, raw_data: pd.DataFrame, *args, **kwargs):
             raw_data (pd.DataFrame): Raw data
         """
 
+        # Initialize sets for integer and float columns
         self.int_columns = set()
         self.float_columns = set()
 
-        self.df_length = len(raw_data)
+        # Initialize sets for positive and negative columns
+        self.positive_columns = set()
+        self.negative_columns = set()
 
-        float_candidate = self.float_columns.union(
-            set(raw_data.select_dtypes(include=["float64"]).columns)
-        )
-
-        for candidate in float_candidate:
-            if self._is_int_column(raw_data[candidate]):
-                self.int_columns.add(candidate)
-            else:
-                self.float_columns.add(candidate)
-
-        self.int_columns = self.int_columns.union(
-            set(raw_data.select_dtypes(include=["int64"]).columns)
-        )
+        # Store the length of the DataFrame
+        self.df_length = len(raw_data)
 
+        # Iterate over all columns and determine the final data type
+        for col in raw_data.columns:
+            if raw_data[col].dtype in ["int64", "float64"]:
+                # float or int
+                if self._is_int_column(raw_data[col]):
+                    self.int_columns.add(col)
+                else:
+                    self.float_columns.add(col)
+
+                # positive? negative?
+                if self._is_positive_column(raw_data[col]):
+                    self.positive_columns.add(col)
+                elif self._is_negative_column(raw_data[col]):
+                    self.negative_columns.add(col)
+
+        # Mark the inspector as ready
         self.ready = True
 
     def inspect(self, *args, **kwargs) -> dict[str, Any]:
         """Inspect raw data and generate metadata."""
 
+        # Positive and negative columns should not be strictly considered as label columns
+        # We use the format dict to inspect and output to metadata
+        numeric_format: dict = {}
+        numeric_format["positive"] = sorted(list(self.positive_columns))
+        numeric_format["negative"] = sorted(list(self.negative_columns))
+
         return {
             "int_columns": list(self.int_columns),
             "float_columns": list(self.float_columns),
+            "numeric_format": numeric_format,
         }
 
 

diff --git a/sdgx/data_models/metadata.py b/sdgx/data_models/metadata.py
@@ -73,6 +73,7 @@ def check_column_list(cls, value) -> Any:
     datetime_columns: Set[str] = set()
     const_columns: Set[str] = set()
     datetime_format: Dict = defaultdict(str)
+    numeric_format: Dict = defaultdict(list)
 
     # version info
     version: str = "1.0"

diff --git a/sdgx/data_processors/filter/base.py b/sdgx/data_processors/filter/base.py
@@ -0,0 +1,11 @@
+from __future__ import annotations
+
+from sdgx.data_processors.base import DataProcessor
+
+
+class Filter(DataProcessor):
+    """
+    Base class for all data filters.
+
+    Filter is a module used to apply rules and remove sampled data that does not conform to the rules.
+    """
diff --git a/sdgx/data_processors/filter/positive_negative.py b/sdgx/data_processors/filter/positive_negative.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+from typing import Any
+
+import pandas as pd
+
+from sdgx.data_models.metadata import Metadata
+from sdgx.data_processors.extension import hookimpl
+from sdgx.data_processors.filter.base import Filter
+from sdgx.utils import logger
+
+
+class PositiveNegativeFilter(Filter):
+    """
+    A data processor for filtering positive and negative values.
+
+    This filter is used to ensure that values in specific columns remain positive or negative.
+    During the reverse conversion process, rows that do not meet the expected positivity or
+    negativity will be removed.
+
+    Attributes:
+        int_columns (set): A set of column names containing integer values.
+        float_columns (set): A set of column names containing float values.
+        positive_columns (set): A set of column names that should contain positive values.
+        negative_columns (set): A set of column names that should contain negative values.
+    """
+
+    int_columns: set = set()
+    """
+    A set of column names that contain integer values.
+    """
+
+    float_columns: set = set()
+    """
+    A set of column names that contain float values.
+    """
+
+    positive_columns: set = set()
+    """
+    A set of column names that are identified as containing positive numeric values.
+    """
+
+    negative_columns: set = set()
+    """
+    A set of column names that are identified as containing negative numeric values.
+    """
+
+    def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
+        """
+        Fit method for the data filter.
+        """
+        logger.info("PositiveNegativeFilter Fitted.")
+
+        # record int and float data
+        self.int_columns = metadata.int_columns
+        self.float_columns = metadata.float_columns
+
+        # record pos and neg
+        self.positive_columns = set(metadata.numeric_format["positive"])
+        self.negative_columns = set(metadata.numeric_format["negative"])
+
+        self.fitted = True
+
+    def convert(self, raw_data: pd.DataFrame) -> pd.DataFrame:
+        """
+        Convert method for data filter (No Action).
+        """
+
+        logger.info("Converting data using PositiveNegativeFilter... Finished (No Action)")
+
+        return raw_data
+
+    def reverse_convert(self, processed_data: pd.DataFrame) -> pd.DataFrame:
+        """
+        Reverse_convert method for the pos_neg data filter.
+
+        Build a boolean row mask that rejects rows with negative values in positive_columns
+        or positive values in negative_columns. Rows that fail either check are discarded.
+        """
+        logger.info(
+            f"Data reverse-converted by PositiveNegativeFilter Start with Shape: {processed_data.shape}."
+        )
+
+        # Create a boolean mask to mark the rows that need to be retained
+        mask = pd.Series(True, index=processed_data.index)
+
+        # Check positive_columns
+        for col in self.positive_columns:
+            if col in processed_data.columns:
+                mask &= processed_data[col] >= 0
+
+        # Check negative_columns
+        for col in self.negative_columns:
+            if col in processed_data.columns:
+                mask &= processed_data[col] <= 0
+
+        # Apply the mask to filter the data
+        filtered_data = processed_data[mask]
+
+        logger.info(
+            f"Data reverse-converted by PositiveNegativeFilter with Output Shape: {filtered_data.shape}."
+        )
+
+        return filtered_data
+
+
+@hookimpl
+def register(manager):
+    manager.register("PositiveNegativeFilter", PositiveNegativeFilter)
diff --git a/sdgx/data_processors/manager.py b/sdgx/data_processors/manager.py
@@ -55,6 +55,7 @@ class DataProcessorManager(Manager):
         ]
     ] + [
         "ConstValueTransformer".lower(),
+        "PositiveNegativeFilter".lower(),
         "EmptyTransformer".lower(),
         "ColumnOrderTransformer".lower(),
     ]

diff --git a/tests/data_models/inspector/test_numeric.py b/tests/data_models/inspector/test_numeric.py
@@ -23,6 +23,9 @@ def test_inspector(inspector: NumericInspector, raw_data):
     )
     assert not inspector.float_columns
     assert inspector.inspect_level == 10
+    assert inspector.negative_columns == set()
+    assert inspector.positive_columns == {"age", "hours-per-week", "fnlwgt", "educational-num"}
+    assert set(inspector.inspect().keys()) == {"int_columns", "float_columns", "numeric_format"}
 
 
 if __name__ == "__main__":