diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index 03263aa1..5393d5d2 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -476,7 +476,7 @@ def _check_stop_condition_is_met(self, sample_size: int, unqiue_ratio: float): return True return False - def _update_stop_condition(self, data: DataFrame): + def _update_stop_condition(self, data: DataFrame | pl.DataFrame): """Return value stop_condition_is_met given stop conditions. :param data: Dataframe currently being processed by categorical profiler @@ -499,8 +499,8 @@ def _get_categories_cms(self, df_series, len_df): """Return count min sketch and heavy hitters for both the batch and stream case. :param df_series: Series currently being processed by categorical profiler - :type df_series: Series - :param len_df: the total number of samples iin df_series + :type df_series: polars.Series + :param len_df: the total number of samples in df_series :type len_df: int :return: cms, heavy_hitter_dict, missing_heavy_hitter_dict """ @@ -603,13 +603,13 @@ def _get_categories_full(self, df_series) -> dict: :return: dict of counts for each unique value :rtype: dict """ - category_count: dict = dict(df_series.value_counts(sort=True).iter_rows()) + category_count: dict = Series(df_series).value_counts(dropna=False).to_dict() return category_count @BaseColumnProfiler._timeit(name="categories") def _update_categories( self, - df_series: DataFrame, + df_series: DataFrame | pl.DataFrame, prev_dependent_properties: dict = None, subset_properties: dict = None, ) -> None: @@ -659,7 +659,9 @@ def _update_categories( if self._stop_condition_is_met: self._categories = {} - def _update_helper(self, df_series_clean: Series, profile: dict) -> None: + def _update_helper( + self, df_series_clean: Series | pl.Series, profile: dict + ) -> None: """ Update col profile properties with clean dataset and its known profile. @@ -671,7 +673,7 @@ def _update_helper(self, df_series_clean: Series, profile: dict) -> None: """ self._update_column_base_properties(profile) - def update(self, df_series: Series) -> CategoricalColumn: + def update(self, df_series: pl.Series | Series) -> CategoricalColumn: """ Update the column profile. @@ -687,12 +689,17 @@ def update(self, df_series: Series) -> CategoricalColumn: if len(df_series) == 0 or self._stop_condition_is_met: return self + if isinstance(df_series, pl.Series): + pandas_df = df_series.to_pandas() + else: + pandas_df = df_series + profile = dict(sample_size=len(df_series)) CategoricalColumn._update_categories(self, df_series) BaseColumnProfiler._perform_property_calcs( self, self.__calculations, - df_series=df_series, + df_series=pandas_df, prev_dependent_properties={}, subset_properties=profile, ) diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index aac6486e..27ffc913 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -1181,7 +1181,7 @@ def test_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORICAL(self): ] len_unique = len(set(cat_sentence_list)) - cat_sentence_df = pd.Series(cat_sentence_list) + cat_sentence_df = pl.Series(cat_sentence_list) column_profile = StructuredColProfiler(cat_sentence_df) cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[ "category" @@ -1206,7 +1206,7 @@ def test_greater_than_CATEGORICAL_THRESHOLD_DEFAULT_identify_as_text(self): ) cat_sentence_list = list_unique_values * num_sentences - cat_sentence_df = pd.Series(cat_sentence_list) + cat_sentence_df = pl.Series(cat_sentence_list) column_profile = StructuredColProfiler(cat_sentence_df) cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[ "category" @@ -1233,7 +1233,7 @@ def test_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self): cat_sentence_list = list_unique_values * num_sentences len_unique = len(set(cat_sentence_list)) - cat_sentence_df = pd.Series(cat_sentence_list) + cat_sentence_df = pl.Series(cat_sentence_list) column_profile = StructuredColProfiler(cat_sentence_df) cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[ "category" @@ -1263,7 +1263,7 @@ def test_uppercase_less_than_CATEGORICAL_THRESHOLD_DEFAULT(self): cat_sentence_list[-3] = self.test_sentence_upper3 + str(num_sentences - 2) len_unique = len(set(cat_sentence_list)) - cat_sentence_df = pd.Series(cat_sentence_list) + cat_sentence_df = pl.Series(cat_sentence_list) column_profile = StructuredColProfiler(cat_sentence_df) cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[ "category" @@ -1288,7 +1288,7 @@ def test_long_sentences_fewer_than_MAXIMUM_UNIQUE_VALUES_TO_CLASSIFY_AS_CATEGORI ] len_unique = len(set(cat_sentence_list)) - cat_sentence_df = pd.Series(cat_sentence_list) + cat_sentence_df = pl.Series(cat_sentence_list) column_profile = StructuredColProfiler(cat_sentence_df) cat_profiler = column_profile.profiles["data_stats_profile"]._profiles[ "category"