Skip to content

Commit

Permalink
Bug fix for float precision calculation using categorical data with trailing zeros. (capitalone#1125)
Browse files Browse the repository at this point in the history
  • Loading branch information
SchadtJ authored and JGSweets committed Jun 5, 2024
1 parent af9f275 commit 4491b97
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 1 deletion.
5 changes: 4 additions & 1 deletion dataprofiler/profilers/float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,10 @@ def _get_float_precision(

# length of sampled cells after all punctuation removed
len_per_float = (
df_series_clean.sample(sample_size).replace(to_replace=r, value="").map(len)
df_series_clean.sample(sample_size)
.astype(object)
.replace(to_replace=r, value="")
.map(len)
).astype(float)

# Determine statistics precision
Expand Down
7 changes: 7 additions & 0 deletions dataprofiler/tests/profilers/test_float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,13 @@ def test_profiled_precision(self):
msg=f"Errored for: {sample[0]}",
)

# Validate that a categorical series whose values have trailing zeros is supported
categorical_series = pd.Series(
[202209, 202210, 202211], dtype="category"
).apply(str)
float_profiler = FloatColumn("Name")
float_profiler.update(categorical_series)

def test_profiled_min(self):
# test with multiple values
data = np.linspace(-5, 5, 11)
Expand Down

0 comments on commit 4491b97

Please sign in to comment.