Skip to content

Commit

Permalink
Optimization for text column profile ksneab (#791)
Browse files Browse the repository at this point in the history
* changed list to set on line 147 in text_column_profile script

* Added a NumPy-aware encoder class for json dump. Errors would sometimes occur on dump for types that are not JSON-serializable
  • Loading branch information
ksneab7 authored Apr 24, 2023
1 parent ed0fd64 commit c2cee9e
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 6 deletions.
2 changes: 1 addition & 1 deletion dataprofiler/profilers/text_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ def _update_vocab(
:type subset_properties: dict
:return: None
"""
data_flat = list(itertools.chain(*data))
data_flat = set(itertools.chain(*data))
self.vocab = utils._combine_unique_sets(self.vocab, data_flat)

def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None:
Expand Down
6 changes: 3 additions & 3 deletions dataprofiler/profilers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,12 @@ def __missing__(self, key: str) -> str:
return key


def _combine_unique_sets(a: list, b: list) -> list:
def _combine_unique_sets(a: list | set, b: list | set) -> list:
"""
Unify two lists.
:type a: list
:type b: list
:type a: Union [list, set]
:type b: Union[list, set]
:rtype: list
"""
combined_list: set = set()
Expand Down
12 changes: 12 additions & 0 deletions dataprofiler/tests/space_time_analysis/dataset_generation.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import copy
import json
import string
from typing import List, Optional

Expand All @@ -15,6 +16,17 @@
import dataprofiler as dp


class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to built-in Python types.

    Pass it as ``cls=NumpyEncoder`` to ``json.dump`` / ``json.dumps`` so that
    NumPy scalars and arrays nested in the payload are serialized instead of
    raising ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        :param obj: object the stock encoder could not serialize
        :return: ``bool``, ``int``, ``float`` or (nested) ``list`` built
            from the NumPy value
        :raises TypeError: if ``obj`` is not a supported NumPy type
            (raised by the base class implementation)
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own
        # branch; otherwise boolean scalars fall through to the TypeError.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            # tolist() recursively converts elements to Python scalars.
            return obj.tolist()
        # Defer to the base class, which raises TypeError for unknown types.
        return super().default(obj)


def nan_injection(
rng: Generator, df: pd.DataFrame, percent_to_nan: float = 0.0
) -> pd.DataFrame:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
except ImportError:
import dataprofiler as dp

from dataset_generation import generate_dataset_by_class, nan_injection
from dataset_generation import NumpyEncoder, generate_dataset_by_class, nan_injection

from dataprofiler import StructuredProfiler

Expand Down Expand Up @@ -236,7 +236,7 @@ def dp_space_time_analysis(
if not os.path.exists(os.path.dirname(path)):
os.makedirs(os.path.dirname(path))
with open(path, "w") as fp:
json.dump(profile_times, fp, indent=4)
json.dump(profile_times, fp, indent=4, cls=NumpyEncoder)
times_table.to_csv(path)


Expand Down

0 comments on commit c2cee9e

Please sign in to comment.