Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Categorical polars update #1135

Open
wants to merge 2 commits into
base: feature/polars
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 15 additions & 8 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ def _check_stop_condition_is_met(self, sample_size: int, unqiue_ratio: float):
return True
return False

def _update_stop_condition(self, data: DataFrame):
def _update_stop_condition(self, data: DataFrame | pl.DataFrame):
"""Return value stop_condition_is_met given stop conditions.

:param data: Dataframe currently being processed by categorical profiler
Expand All @@ -499,8 +499,8 @@ def _get_categories_cms(self, df_series, len_df):
"""Return count min sketch and heavy hitters for both the batch and stream case.

:param df_series: Series currently being processed by categorical profiler
:type df_series: Series
:param len_df: the total number of samples iin df_series
:type df_series: polars.Series
:param len_df: the total number of samples in df_series
:type len_df: int
:return: cms, heavy_hitter_dict, missing_heavy_hitter_dict
"""
Expand Down Expand Up @@ -603,13 +603,13 @@ def _get_categories_full(self, df_series) -> dict:
:return: dict of counts for each unique value
:rtype: dict
"""
category_count: dict = dict(df_series.value_counts(sort=True).iter_rows())
category_count: dict = Series(df_series).value_counts(dropna=False).to_dict()
return category_count

@BaseColumnProfiler._timeit(name="categories")
def _update_categories(
self,
df_series: DataFrame,
df_series: DataFrame | pl.DataFrame,
prev_dependent_properties: dict = None,
subset_properties: dict = None,
) -> None:
Expand Down Expand Up @@ -659,7 +659,9 @@ def _update_categories(
if self._stop_condition_is_met:
self._categories = {}

def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
def _update_helper(
self, df_series_clean: Series | pl.Series, profile: dict
) -> None:
"""
Update col profile properties with clean dataset and its known profile.

Expand All @@ -671,7 +673,7 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None:
"""
self._update_column_base_properties(profile)

def update(self, df_series: Series) -> CategoricalColumn:
def update(self, df_series: pl.Series | Series) -> CategoricalColumn:
"""
Update the column profile.

Expand All @@ -687,12 +689,17 @@ def update(self, df_series: Series) -> CategoricalColumn:
if len(df_series) == 0 or self._stop_condition_is_met:
return self

if isinstance(df_series, pl.Series):
pandas_df = df_series.to_pandas()
else:
pandas_df = df_series

profile = dict(sample_size=len(df_series))
CategoricalColumn._update_categories(self, df_series)
BaseColumnProfiler._perform_property_calcs(
self,
self.__calculations,
df_series=df_series,
df_series=pandas_df,
prev_dependent_properties={},
subset_properties=profile,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1181,7 +1181,7 @@ def test_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL(self):
]

len_unique = len(set(cat_sentence_list))
cat_sentence_df = pd.Series(cat_sentence_list)
cat_sentence_df = pl.Series(cat_sentence_list)
column_profile = StructuredColProfiler(cat_sentence_df)
cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
"category"
Expand All @@ -1206,7 +1206,7 @@ def test_greater_than_CATEGORICAL_THRESHOLD_DEFAULT_identify_as_text(self):
)
cat_sentence_list = list_unique_values * num_sentences

cat_sentence_df = pd.Series(cat_sentence_list)
cat_sentence_df = pl.Series(cat_sentence_list)
column_profile = StructuredColProfiler(cat_sentence_df)
cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
"category"
Expand All @@ -1233,7 +1233,7 @@ def test_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self):
cat_sentence_list = list_unique_values * num_sentences

len_unique = len(set(cat_sentence_list))
cat_sentence_df = pd.Series(cat_sentence_list)
cat_sentence_df = pl.Series(cat_sentence_list)
column_profile = StructuredColProfiler(cat_sentence_df)
cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
"category"
Expand Down Expand Up @@ -1263,7 +1263,7 @@ def test_uppercase_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self):
cat_sentence_list[-3] = self.test_sentence_upper3 + str(num_sentences - 2)

len_unique = len(set(cat_sentence_list))
cat_sentence_df = pd.Series(cat_sentence_list)
cat_sentence_df = pl.Series(cat_sentence_list)
column_profile = StructuredColProfiler(cat_sentence_df)
cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
"category"
Expand All @@ -1288,7 +1288,7 @@ def test_long_sentences_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORI
]

len_unique = len(set(cat_sentence_list))
cat_sentence_df = pd.Series(cat_sentence_list)
cat_sentence_df = pl.Series(cat_sentence_list)
column_profile = StructuredColProfiler(cat_sentence_df)
cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[
"category"
Expand Down
Loading