Memory Optimization to main (#832)

* [WIP] Part 1 fix for categorical mem opt issue (#795) * part_1 of fix for mem optimization for categorical dict creation issue * precommit fix * Separated the update from the check in stop conditions for categorical columns * added tests and accounted for different variables affected by the change made to categories attribute * Modifications to code based on test findings * Fixes for logic and tests to match requirements from PR * Fix for rebase carry over issue * fixes for tests because of changes to variable names in categorical column object * precommit fixes and improvement of code based on testing * added stop_condition_unique_value_ratio and max_sample_size_to_check_stop_condition to CategoricalOptions (#808) * implementation of setting stop conds via options for cat column profiler (#810) * Space time analysis improvement (#809) * Made space time analysis code improvements (detect if dataset is already generated, specify cats to generate) * Modified md file to account for new variable in space time analysis code * fix: cat bug (#816) * hotfix for more conservative stop condition in categorical columns (#817) * [WIP] Fix for histogram merging (#815) * rough draft of merge fix for histograms * final fixes for passing of existing tests * Added option to remove calculations for updating row statistics (#827) * Fix to doc strings (#829) * Preset Option Fix: presets docstring added (#830) * presets docstring added * Update dataprofiler/profilers/profiler_options.py * Update dataprofiler/profilers/profiler_options.py Co-authored-by: Taylor Turner <[email protected]> * Update dataprofiler/profilers/profiler_options.py * Update dataprofiler/profilers/profiler_options.py --------- Co-authored-by: Taylor Turner <[email protected]> --------- Co-authored-by: ksneab7 <[email protected]> Co-authored-by: Michael Davis <[email protected]> Co-authored-by: JGSweets <[email protected]>
capitalone · May 23, 2023 · 672d723 · 672d723
1 parent a94b4e4
commit 672d723
Show file tree

Hide file tree

Showing 17 changed files with 1,476 additions and 98 deletions.
diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
@@ -44,8 +44,22 @@ def __init__(self, name: str | None, options: CategoricalOptions = None) -> None
         self.__calculations: dict = {}
         self._filter_properties_w_options(self.__calculations, options)
         self._top_k_categories: int | None = None
+
+        # Conditions to stop categorical profiling
+        self.max_sample_size_to_check_stop_condition = None
+        self.stop_condition_unique_value_ratio = None
+        self._stop_condition_is_met = False
+
+        self._stopped_at_unique_ratio: float | None = None
+        self._stopped_at_unique_count: int | None = None
         if options:
             self._top_k_categories = options.top_k_categories
+            self.stop_condition_unique_value_ratio = (
+                options.stop_condition_unique_value_ratio
+            )
+            self.max_sample_size_to_check_stop_condition = (
+                options.max_sample_size_to_check_stop_condition
+            )
 
     def __add__(self, other: CategoricalColumn) -> CategoricalColumn:
         """
@@ -64,13 +78,59 @@ def __add__(self, other: CategoricalColumn) -> CategoricalColumn:
             )
 
         merged_profile = CategoricalColumn(None)
-        merged_profile._categories = utils.add_nested_dictionaries(
-            self._categories, other._categories
-        )
         BaseColumnProfiler._add_helper(merged_profile, self, other)
+
         self._merge_calculations(
             merged_profile.__calculations, self.__calculations, other.__calculations
         )
+        # If both profiles have not met stop condition
+        if not (self._stop_condition_is_met or other._stop_condition_is_met):
+            merged_profile._categories = utils.add_nested_dictionaries(
+                self._categories, other._categories
+            )
+
+            # Use the stop-condition settings of whichever profile has the smaller
+            # sample_size * unique_ratio product (the 2nd profile wins ties)
+            profile1_product = self.sample_size * self.unique_ratio
+            profile2_product = other.sample_size * other.unique_ratio
+            if profile1_product < profile2_product:
+                merged_profile.max_sample_size_to_check_stop_condition = (
+                    self.max_sample_size_to_check_stop_condition
+                )
+                merged_profile.stop_condition_unique_value_ratio = (
+                    self.stop_condition_unique_value_ratio
+                )
+            else:
+                merged_profile.stop_condition_unique_value_ratio = (
+                    other.stop_condition_unique_value_ratio
+                )
+                merged_profile.max_sample_size_to_check_stop_condition = (
+                    other.max_sample_size_to_check_stop_condition
+                )
+
+            # Check merged profile w/ stop condition
+            if merged_profile._check_stop_condition_is_met(
+                merged_profile.sample_size, merged_profile.unique_ratio
+            ):
+                merged_profile._stopped_at_unique_ratio = merged_profile.unique_ratio
+                merged_profile._stopped_at_unique_count = merged_profile.unique_count
+                merged_profile._categories = {}
+                merged_profile._stop_condition_is_met = True
+
+        else:
+            if self.sample_size > other.sample_size:
+                merged_profile._stopped_at_unique_ratio = self.unique_ratio
+                merged_profile._stopped_at_unique_count = self.unique_count
+                merged_profile.sample_size = self.sample_size
+            else:
+                merged_profile._stopped_at_unique_ratio = other.unique_ratio
+                merged_profile._stopped_at_unique_count = other.unique_count
+                merged_profile.sample_size = other.sample_size
+
+            # If either profile has hit stop condition, remove categories dict
+            merged_profile._categories = {}
+            merged_profile._stop_condition_is_met = True
+
         return merged_profile
 
     def diff(self, other_profile: BaseColumnProfiler, options: dict = None) -> dict:
@@ -95,7 +155,7 @@ def diff(self, other_profile: BaseColumnProfiler, options: dict = None) -> dict:
                 (
                     "unique_count",
                     utils.find_diff_of_numbers(
-                        len(self.categories), len(other_profile.categories)
+                        self.unique_count, other_profile.unique_count
                     ),
                 ),
                 (
@@ -165,7 +225,7 @@ def profile(self) -> dict:
             categorical=self.is_match,
             statistics=dict(
                 [
-                    ("unique_count", len(self.categories)),
+                    ("unique_count", self.unique_count),
                     ("unique_ratio", self.unique_ratio),
                 ]
             ),
@@ -195,14 +255,27 @@ def categorical_counts(self) -> dict[str, int]:
     @property
     def unique_ratio(self) -> float:
         """Return ratio of unique categories to sample_size."""
-        unique_ratio = 1.0
+        if self._stop_condition_is_met:
+            return cast(float, self._stopped_at_unique_ratio)
+
         if self.sample_size:
-            unique_ratio = len(self.categories) / self.sample_size
-        return unique_ratio
+            return len(self.categories) / self.sample_size
+        return 0
+
+    @property
+    def unique_count(self) -> int:
+        """Return the number of unique categories."""
+        if self._stop_condition_is_met:
+            return cast(int, self._stopped_at_unique_count)
+
+        return len(self.categories)
 
     @property
     def is_match(self) -> bool:
         """Return true if column is categorical."""
+        if self._stop_condition_is_met:
+            return False
+
         is_match = False
         unique = len(self._categories)
         if unique <= self._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL:
@@ -214,6 +287,43 @@ def is_match(self) -> bool:
             is_match = True
         return is_match
 
+    def _check_stop_condition_is_met(self, sample_size: int, unqiue_ratio: float):
+        """Return boolean given stop conditions.
+
+        :param sample_size: Number of samples to check the stop condition
+        :type sample_size: int
+        :param unqiue_ratio: Ratio of unique values to full sample size to
+            check stop condition
+        :type unqiue_ratio: float
+        :return: boolean for stop conditions
+        """
+        if (
+            self.max_sample_size_to_check_stop_condition is not None
+            and self.stop_condition_unique_value_ratio is not None
+            and sample_size >= self.max_sample_size_to_check_stop_condition
+            and unqiue_ratio >= self.stop_condition_unique_value_ratio
+        ):
+            return True
+        return False
+
+    def _update_stop_condition(self, data: DataFrame):
+        """Update the stop condition state from the incoming data.
+
+        :param data: Dataframe currently being processed by categorical profiler
+        :type data: DataFrame
+        :return: None
+        """
+        merged_unique_count = len(self._categories)
+        merged_sample_size = self.sample_size + len(data)
+        merged_unique_ratio = merged_unique_count / merged_sample_size
+
+        self._stop_condition_is_met = self._check_stop_condition_is_met(
+            merged_sample_size, merged_unique_ratio
+        )
+        if self._stop_condition_is_met:
+            self._stopped_at_unique_ratio = merged_unique_ratio
+            self._stopped_at_unique_count = merged_unique_count
+
     @BaseColumnProfiler._timeit(name="categories")
     def _update_categories(
         self,
@@ -240,6 +350,9 @@ def _update_categories(
         self._categories = utils.add_nested_dictionaries(
             self._categories, category_count
         )
+        self._update_stop_condition(df_series)
+        if self._stop_condition_is_met:
+            self._categories = {}
 
     def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
         """
@@ -262,7 +375,8 @@ def update(self, df_series: Series) -> CategoricalColumn:
         :return: updated CategoricalColumn
         :rtype: CategoricalColumn
         """
-        if len(df_series) == 0:
+        # Skip update when there is no data or the stop condition is already met
+        if len(df_series) == 0 or self._stop_condition_is_met:
             return self
 
         profile = dict(sample_size=len(df_series))