Update documentation and docstrings for DataProcessors (#186)

* update docs for data_processors * update some docstring * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
hitsz-ids · Jun 25, 2024 · a9c5a33 · a9c5a33
1 parent 4101a37
commit a9c5a33
Show file tree

Hide file tree

Showing 16 changed files with 206 additions and 10 deletions.
diff --git a/docs/source/api_reference/data_processors/index.rst b/docs/source/api_reference/data_processors/index.rst
@@ -11,8 +11,9 @@ Built-in DataProcessor
 -----------------------------
 
 .. toctree::
-    :maxdepth: 2
+    :maxdepth: 3
 
+    Built-in DataProcessors <processors/index>
 
 Custom DataProcessor Relevant
 -----------------------------

diff --git a/docs/source/api_reference/data_processors/processors/column_order.rst b/docs/source/api_reference/data_processors/processors/column_order.rst
@@ -0,0 +1,9 @@
+Column Order Transformer
+============================
+
+.. autoclass:: sdgx.data_processors.transformers.column_order.ColumnOrderTransformer
+    :members:
+    :undoc-members:
+    :inherited-members:
+    :show-inheritance:
+    :private-members:
diff --git a/docs/source/api_reference/data_processors/processors/datetime.rst b/docs/source/api_reference/data_processors/processors/datetime.rst
@@ -0,0 +1,9 @@
+Datetime Formatter
+============================
+
+.. autoclass:: sdgx.data_processors.formatters.datetime.DatetimeFormatter
+    :members:
+    :undoc-members:
+    :inherited-members:
+    :show-inheritance:
+    :private-members:
diff --git a/docs/source/api_reference/data_processors/processors/discrete.rst b/docs/source/api_reference/data_processors/processors/discrete.rst
@@ -0,0 +1,9 @@
+Discrete Transformer
+============================
+
+.. autoclass:: sdgx.data_processors.transformers.discrete.DiscreteTransformer
+    :members:
+    :undoc-members:
+    :inherited-members:
+    :show-inheritance:
+    :private-members:
diff --git a/docs/source/api_reference/data_processors/processors/email.rst b/docs/source/api_reference/data_processors/processors/email.rst
@@ -0,0 +1,4 @@
+Email Generator
+============================
+
+TBD
diff --git a/docs/source/api_reference/data_processors/processors/index.rst b/docs/source/api_reference/data_processors/processors/index.rst
@@ -0,0 +1,34 @@
+Built-in DataProcessor
+========================================================
+
+Formatters
+-----------------------------
+
+.. toctree::
+    :maxdepth: 1
+
+    Datetime Formatter <datetime>
+    Int Formatter <int>
+
+
+Transformers
+-----------------------------
+
+
+.. toctree::
+    :maxdepth: 1
+
+    Discrete Transformer <discrete>
+    Column Order Transformer <column_order>
+    Nan Transformer <nan>
+    Numeric Transformer <numeric>
+
+
+
+Generators
+-----------------------------
+
+.. toctree::
+    :maxdepth: 1
+
+    Email Generator <email>
diff --git a/docs/source/api_reference/data_processors/processors/int.rst b/docs/source/api_reference/data_processors/processors/int.rst
@@ -0,0 +1,9 @@
+Int Formatter
+============================
+
+.. autoclass:: sdgx.data_processors.formatters.int.IntValueFormatter
+    :members:
+    :undoc-members:
+    :inherited-members:
+    :show-inheritance:
+    :private-members:
diff --git a/docs/source/api_reference/data_processors/processors/nan.rst b/docs/source/api_reference/data_processors/processors/nan.rst
@@ -0,0 +1,9 @@
+Nan Transformer
+============================
+
+.. autoclass:: sdgx.data_processors.transformers.nan.NonValueTransformer
+    :members:
+    :undoc-members:
+    :inherited-members:
+    :show-inheritance:
+    :private-members:
diff --git a/docs/source/api_reference/data_processors/processors/numeric.rst b/docs/source/api_reference/data_processors/processors/numeric.rst
@@ -0,0 +1,9 @@
+Numeric Transformer
+============================
+
+.. autoclass:: sdgx.data_processors.transformers.numeric.NumericValueTransformer
+    :members:
+    :undoc-members:
+    :inherited-members:
+    :show-inheritance:
+    :private-members:
diff --git a/sdgx/data_processors/base.py b/sdgx/data_processors/base.py
@@ -17,6 +17,11 @@ class DataProcessor:
     fitted = False
 
     def check_fitted(self):
+        """Check if the processor is fitted.
+
+        Raises:
+            SynthesizerProcessorError: If the processor is not fitted.
+        """
         if not self.fitted:
             raise SynthesizerProcessorError("Processor NOT fitted.")
 

diff --git a/sdgx/data_processors/formatters/datetime.py b/sdgx/data_processors/formatters/datetime.py
@@ -14,7 +14,21 @@
 
 class DatetimeFormatter(Formatter):
     """
-    Formatter class for handling Datetime formats in pd.DataFrame.
+    A class for formatting datetime columns in a pandas DataFrame.
+
+    DatetimeFormatter is designed to handle the conversion of datetime columns to timestamp format and vice versa.
+    It uses metadata to identify datetime columns and their corresponding datetime formats.
+
+    Attributes:
+        datetime_columns (list): List of column names that are of datetime type.
+        datetime_formats (dict): Dictionary with column names as keys and datetime formats as values.
+        dead_columns (list): List of column names that are no longer needed or to be removed.
+        fitted (bool): Indicates whether the formatter has been fitted.
+
+    Methods:
+        fit(metadata: Metadata | None = None, **kwargs: dict[str, Any]): Fits the formatter by recording the datetime columns and their formats.
+        convert(raw_data: pd.DataFrame) -> pd.DataFrame: Converts datetime columns in raw_data to timestamp format.
+        reverse_convert(processed_data: pd.DataFrame) -> pd.DataFrame: Converts timestamp columns in processed_data back to datetime format.
     """
 
     datetime_columns: list = []

diff --git a/sdgx/data_processors/formatters/int.py b/sdgx/data_processors/formatters/int.py
@@ -16,6 +16,9 @@ class IntValueFormatter(Formatter):
     """
 
     int_columns: List = []
+    """
+    List of column names that are of type int, populated by the fit method using metadata.
+    """
 
     def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
         """

diff --git a/sdgx/data_processors/transformers/column_order.py b/sdgx/data_processors/transformers/column_order.py
@@ -12,9 +12,16 @@
 
 class ColumnOrderTransformer(Transformer):
     """
-    Transformer class for handling missing values in data.
+    A transformer that rearranges the columns of a DataFrame to a specified order.
 
-    This Transformer is mainly used as a reference for Transformer to facilitate developers to quickly understand the role of Transformer.
+    Attributes:
+        column_list (list): The list of column names in the desired order.
+
+    Methods:
+        fit(metadata: Metadata | None = None, **kwargs: dict[str, Any]): Fits the transformer by remembering the order of the columns.
+        convert(raw_data: pd.DataFrame) -> pd.DataFrame: Converts the input DataFrame by rearranging its columns.
+        reverse_convert(processed_data: pd.DataFrame) -> pd.DataFrame: Reverse-converts the processed DataFrame by rearranging its columns back to their original order.
+        rearrange_columns(column_list, processed_data): Rearranges the columns of a DataFrame according to the provided column list.
     """
 
     column_list: list = None

diff --git a/sdgx/data_processors/transformers/discrete.py b/sdgx/data_processors/transformers/discrete.py
@@ -13,9 +13,22 @@
 
 class DiscreteTransformer(Transformer):
     """
-    DiscreteTransformer is an important component of sdgx, used to handle discrete columns.
-
-    By default, DiscreteTransformer will perform one-hot encoding of discrete columns, and issue a warning message when dimensionality explosion occurs.
+    A transformer class for handling discrete values in the input data.
+
+    This class uses one-hot encoding to convert discrete values into a format that can be used by machine learning models.
+
+    Attributes:
+        discrete_columns (list): A list of column names that are of discrete type.
+        one_hot_warning_cnt (int): The warning count for one-hot encoding. If the number of new columns after one-hot encoding exceeds this count, a warning message will be issued.
+        one_hot_encoders (dict): A dictionary that stores the OneHotEncoder objects for each discrete column. The keys are the column names, and the values are the corresponding OneHotEncoder objects.
+        one_hot_column_names (dict): A dictionary that stores the new column names after one-hot encoding for each discrete column. The keys are the column names, and the values are lists of new column names.
+        onehot_encoder_handle_unknown (str): The parameter to handle unknown categories in the OneHotEncoder. If set to 'ignore', new categories will be ignored. If set to 'error', an error will be raised when new categories are encountered.
+
+    Methods:
+        fit(metadata: Metadata, tabular_data: DataLoader | pd.DataFrame): Fit the transformer to the input data.
+        _fit_column(column_name: str, column_data: pd.DataFrame): Fit a single discrete column.
+        convert(raw_data: pd.DataFrame) -> pd.DataFrame: Convert the input data using one-hot encoding.
+        reverse_convert(processed_data: pd.DataFrame) -> pd.DataFrame: Reverse the one-hot encoding process to get the original data.
     """
 
     discrete_columns: list = []
@@ -24,12 +37,29 @@ class DiscreteTransformer(Transformer):
     """
 
     one_hot_warning_cnt = 512
+    """
+    The warning count for one-hot encoding.
+    If the number of new columns after one-hot encoding exceeds this count, a warning message will be issued.
+    """
 
     one_hot_encoders: dict = {}
+    """
+    A dictionary that stores the OneHotEncoder objects for each discrete column.
+    The keys are the column names, and the values are the corresponding OneHotEncoder objects.
+    """
 
     one_hot_column_names: dict = {}
+    """
+    A dictionary that stores the new column names after one-hot encoding for each discrete column.
+    The keys are the column names, and the values are lists of new column names.
+    """
 
     onehot_encoder_handle_unknown: str = "ignore"
+    """
+    The parameter to handle unknown categories in the OneHotEncoder.
+    If set to 'ignore', new categories will be ignored.
+    If set to 'error', an error will be raised when new categories are encountered.
+    """
 
     def fit(self, metadata: Metadata, tabular_data: DataLoader | pd.DataFrame):
         """

diff --git a/sdgx/data_processors/transformers/nan.py b/sdgx/data_processors/transformers/nan.py
@@ -12,14 +12,33 @@
 
 class NonValueTransformer(Transformer):
     """
-    Transformer class for handling missing values in data.
+    A transformer class for handling missing values in a DataFrame.
 
-    This Transformer is mainly used as a reference for Transformer to facilitate developers to quickly understand the role of Transformer.
+    This class provides functionality to either drop rows with missing values or fill them with a specified value.
+
+    Attributes:
+        fill_na_value (int): The value to fill missing values in the data.
+        drop_na (bool): A boolean flag indicating whether to drop rows with missing values or fill them with `fill_na_value`.
+
+    Methods:
+        fit(metadata: Metadata | None = None, **kwargs: dict[str, Any]): Fit method for the transformer.
+        convert(raw_data: DataFrame) -> DataFrame: Convert method to handle missing values in the input data.
+        reverse_convert(processed_data: DataFrame) -> DataFrame: Reverse-convert method for the transformer.
     """
 
     fill_na_value = 0
+    """
+    The value to fill missing values in the data.
+
+    If `drop_na` is set to `False`, this value will be used to fill missing values in the data.
+    """
 
     drop_na = True
+    """
+    A boolean flag indicating whether to drop rows with missing values or fill them with `fill_na_value`.
+
+    If `True`, rows with missing values will be dropped. If `False`, missing values will be filled with `fill_na_value`.
+    """
 
     def fit(self, metadata: Metadata | None = None, **kwargs: dict[str, Any]):
         """

diff --git a/sdgx/data_processors/transformers/numeric.py b/sdgx/data_processors/transformers/numeric.py
@@ -15,16 +15,41 @@
 
 class NumericValueTransformer(Transformer):
     """
-    Transformer class for handling numeric value (int + float) in data.
+    A transformer class for numeric data.
+
+    This class is used to transform numeric data by scaling it using the StandardScaler from sklearn.
+
+    Attributes:
+        standard_scale (bool): A flag indicating whether to scale the data using StandardScaler.
+        int_columns (Set): A set of column names that are of integer type.
+        float_columns (Set): A set of column names that are of float type.
+        scalers (Dict): A dictionary of scalers for each numeric column.
     """
 
     standard_scale: bool = True
+    """
+    A flag indicating whether to scale the data using StandardScaler.
+    If True, the data will be scaled using StandardScaler.
+    If False, the data will not be scaled.
+    """
 
     int_columns: Set = []
+    """
+    A set of column names that are of integer type.
+    These columns will be considered for scaling if `standard_scale` is True.
+    """
 
     float_columns: Set = []
+    """
+    A set of column names that are of float type.
+    These columns will be considered for scaling if `standard_scale` is True.
+    """
 
     scalers: Dict = {}
+    """
+    A dictionary of scalers for each numeric column.
+    The keys are the column names and the values are the corresponding scalers.
+    """
 
     def fit(
         self,