Skip to content

Commit

Permalink
Memory Optimization to main (#832)
Browse files Browse the repository at this point in the history
* [WIP] Part 1 fix for categorical mem opt issue (#795)

* part_1 of fix for mem optimization for categorical dict creation issue

* precommit fix

* Separated the update from the check in stop conditions for categorical columns

* added tests and accounted for different variables affected by the change made to categories attribute

* Modifications to code based on test findings

* Fixes for logic and tests to match requirements from PR

* Fix for rebase carry over issue

* fixes for tests because of changes to variable names in categorical column object

* precommit fixes and improvement of code based on testing

* added stop_condition_unique_value_ratio and max_sample_size_to_check_stop_condition to CategoricalOptions (#808)

* implementation of setting stop conds via options for cat column profiler (#810)

* Space time analysis improvement (#809)

* Made space time analysis code improvements (detect if dataset is already generated, specify cats to generate)

* Modified md file to account for new variable in space time analysis code

* fix: cat bug (#816)

* hotfix for more conservative stop condition in categorical columns (#817)

* [WIP] Fix for histogram merging (#815)

* rough draft of merge fix for histograms

* final fixes for passing of existing tests

* Added option to remove calculations for updating row statistics (#827)

* Fix to doc strings (#829)

* Preset Option Fix: presets docstring added (#830)

* presets docstring added

* Update dataprofiler/profilers/profiler_options.py

* Update dataprofiler/profilers/profiler_options.py

Co-authored-by: Taylor Turner <[email protected]>

* Update dataprofiler/profilers/profiler_options.py

* Update dataprofiler/profilers/profiler_options.py

---------

Co-authored-by: Taylor Turner <[email protected]>

---------

Co-authored-by: ksneab7 <[email protected]>
Co-authored-by: Michael Davis <[email protected]>
Co-authored-by: JGSweets <[email protected]>
  • Loading branch information
4 people authored May 23, 2023
1 parent a94b4e4 commit 672d723
Show file tree
Hide file tree
Showing 17 changed files with 1,476 additions and 98 deletions.
132 changes: 123 additions & 9 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,22 @@ def __init__(self, name: str | None, options: CategoricalOptions = None) -> None
self.__calculations: dict = {}
self._filter_properties_w_options(self.__calculations, options)
self._top_k_categories: int | None = None

# Conditions to stop categorical profiling
self.max_sample_size_to_check_stop_condition = None
self.stop_condition_unique_value_ratio = None
self._stop_condition_is_met = False

self._stopped_at_unique_ratio: float | None = None
self._stopped_at_unique_count: int | None = None
if options:
self._top_k_categories = options.top_k_categories
self.stop_condition_unique_value_ratio = (
options.stop_condition_unique_value_ratio
)
self.max_sample_size_to_check_stop_condition = (
options.max_sample_size_to_check_stop_condition
)

def __add__(self, other: CategoricalColumn) -> CategoricalColumn:
"""
Expand All @@ -64,13 +78,59 @@ def __add__(self, other: CategoricalColumn) -> CategoricalColumn:
)

merged_profile = CategoricalColumn(None)
merged_profile._categories = utils.add_nested_dictionaries(
self._categories, other._categories
)
BaseColumnProfiler._add_helper(merged_profile, self, other)

self._merge_calculations(
merged_profile.__calculations, self.__calculations, other.__calculations
)
# If both profiles have not met stop condition
if not (self._stop_condition_is_met or other._stop_condition_is_met):
merged_profile._categories = utils.add_nested_dictionaries(
self._categories, other._categories
)

# Transfer stop condition variables of 1st profile object to merged profile
# if they are not None else set to 2nd profile
profile1_product = self.sample_size * self.unique_ratio
profile2_product = other.sample_size * other.unique_ratio
if profile1_product < profile2_product:
merged_profile.max_sample_size_to_check_stop_condition = (
self.max_sample_size_to_check_stop_condition
)
merged_profile.stop_condition_unique_value_ratio = (
self.stop_condition_unique_value_ratio
)
else:
merged_profile.stop_condition_unique_value_ratio = (
other.stop_condition_unique_value_ratio
)
merged_profile.max_sample_size_to_check_stop_condition = (
other.max_sample_size_to_check_stop_condition
)

# Check merged profile w/ stop condition
if merged_profile._check_stop_condition_is_met(
merged_profile.sample_size, merged_profile.unique_ratio
):
merged_profile._stopped_at_unique_ratio = merged_profile.unique_ratio
merged_profile._stopped_at_unique_count = merged_profile.unique_count
merged_profile._categories = {}
merged_profile._stop_condition_is_met = True

else:
if self.sample_size > other.sample_size:
merged_profile._stopped_at_unique_ratio = self.unique_ratio
merged_profile._stopped_at_unique_count = self.unique_count
merged_profile.sample_size = self.sample_size
else:
merged_profile._stopped_at_unique_ratio = other.unique_ratio
merged_profile._stopped_at_unique_count = other.unique_count
merged_profile.sample_size = other.sample_size

# If either profile has hit stop condition, remove categories dict
merged_profile._categories = {}
merged_profile._stop_condition_is_met = True

return merged_profile

def diff(self, other_profile: BaseColumnProfiler, options: dict = None) -> dict:
Expand All @@ -95,7 +155,7 @@ def diff(self, other_profile: BaseColumnProfiler, options: dict = None) -> dict:
(
"unique_count",
utils.find_diff_of_numbers(
len(self.categories), len(other_profile.categories)
self.unique_count, other_profile.unique_count
),
),
(
Expand Down Expand Up @@ -165,7 +225,7 @@ def profile(self) -> dict:
categorical=self.is_match,
statistics=dict(
[
("unique_count", len(self.categories)),
("unique_count", self.unique_count),
("unique_ratio", self.unique_ratio),
]
),
Expand Down Expand Up @@ -195,14 +255,27 @@ def categorical_counts(self) -> dict[str, int]:
@property
def unique_ratio(self) -> float:
"""Return ratio of unique categories to sample_size."""
unique_ratio = 1.0
if self._stop_condition_is_met:
return cast(float, self._stopped_at_unique_ratio)

if self.sample_size:
unique_ratio = len(self.categories) / self.sample_size
return unique_ratio
return len(self.categories) / self.sample_size
return 0

@property
def unique_count(self) -> int:
    """Return the number of unique categories.

    If the stop condition has been met, the count that was recorded when
    the condition was met is returned; otherwise the count is the current
    number of entries in ``self.categories``.
    """
    if self._stop_condition_is_met:
        # ``cast`` only narrows ``int | None`` for the type checker; it does
        # not convert or validate the stored value at runtime.
        return cast(int, self._stopped_at_unique_count)

    return len(self.categories)

@property
def is_match(self) -> bool:
"""Return true if column is categorical."""
if self._stop_condition_is_met:
return False

is_match = False
unique = len(self._categories)
if unique <= self._MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL:
Expand All @@ -214,6 +287,43 @@ def is_match(self) -> bool:
is_match = True
return is_match

def _check_stop_condition_is_met(
    self, sample_size: int, unqiue_ratio: float
) -> bool:
    """Return whether the configured stop condition is satisfied.

    The condition holds only when both thresholds are set (not None),
    ``sample_size`` has reached ``max_sample_size_to_check_stop_condition``
    and ``unqiue_ratio`` has reached ``stop_condition_unique_value_ratio``.

    :param sample_size: Number of samples to check the stop condition
    :type sample_size: int
    :param unqiue_ratio: Ratio of unique values to full sample size to
        check stop condition (the parameter keeps its original spelling so
        existing keyword callers continue to work)
    :type unqiue_ratio: float
    :return: True if the stop condition is met, otherwise False
    :rtype: bool
    """
    size_threshold = self.max_sample_size_to_check_stop_condition
    ratio_threshold = self.stop_condition_unique_value_ratio

    # An unset threshold disables the stop condition entirely.
    if size_threshold is None or ratio_threshold is None:
        return False

    return bool(
        sample_size >= size_threshold and unqiue_ratio >= ratio_threshold
    )

def _update_stop_condition(self, data: DataFrame) -> None:
    """Re-evaluate the stop condition and store the result on the profile.

    Sets ``self._stop_condition_is_met`` from the number of tracked
    categories and the sample size including ``data``. When the condition
    is met, the unique ratio and unique count at that moment are saved in
    ``self._stopped_at_unique_ratio`` and ``self._stopped_at_unique_count``.

    :param data: Dataframe currently being processed by categorical profiler
    :type data: DataFrame
    :return: None
    """
    merged_unique_count = len(self._categories)
    # ``self.sample_size`` is combined with the incoming batch length here,
    # so the check is made against the post-update sample size.
    merged_sample_size = self.sample_size + len(data)
    # Guard against an empty profile receiving an empty batch, which would
    # otherwise raise ZeroDivisionError.
    merged_unique_ratio = (
        merged_unique_count / merged_sample_size if merged_sample_size else 0.0
    )

    self._stop_condition_is_met = self._check_stop_condition_is_met(
        merged_sample_size, merged_unique_ratio
    )
    if self._stop_condition_is_met:
        self._stopped_at_unique_ratio = merged_unique_ratio
        self._stopped_at_unique_count = merged_unique_count

@BaseColumnProfiler._timeit(name="categories")
def _update_categories(
self,
Expand All @@ -240,6 +350,9 @@ def _update_categories(
self._categories = utils.add_nested_dictionaries(
self._categories, category_count
)
self._update_stop_condition(df_series)
if self._stop_condition_is_met:
self._categories = {}

def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
"""
Expand All @@ -262,7 +375,8 @@ def update(self, df_series: Series) -> CategoricalColumn:
:return: updated CategoricalColumn
:rtype: CategoricalColumn
"""
if len(df_series) == 0:
# If condition for limiting profile calculations
if len(df_series) == 0 or self._stop_condition_is_met:
return self

profile = dict(sample_size=len(df_series))
Expand Down
Loading

0 comments on commit 672d723

Please sign in to comment.