diff --git a/.github/workflows/test-python-package.yml b/.github/workflows/test-python-package.yml index 6914fdb93..41f7036e9 100644 --- a/.github/workflows/test-python-package.yml +++ b/.github/workflows/test-python-package.yml @@ -8,6 +8,7 @@ on: branches: - 'main' - 'feature/**' + - 'dev' jobs: build: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 55aad1327..ce67fff77 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -63,6 +63,7 @@ repos: networkx>=2.5.1, typing-extensions>=3.10.0.2, HLL>=2.0.3, + datasketches>=4.1.0, # requirements-dev.txt check-manifest>=0.48, @@ -109,7 +110,7 @@ repos: additional_dependencies: ['h5py', 'wheel', 'future', 'numpy', 'pandas', 'python-dateutil', 'pytz', 'pyarrow', 'chardet', 'fastavro', 'python-snappy', 'charset-normalizer', 'psutil', 'scipy', 'requests', - 'networkx','typing-extensions', 'HLL'] + 'networkx', 'typing-extensions', 'HLL', 'datasketches'] # Pyupgrade - standardize and modernize Python syntax for newer versions of the language - repo: https://github.com/asottile/pyupgrade rev: v3.3.0 diff --git a/dataprofiler/data_readers/csv_data.py b/dataprofiler/data_readers/csv_data.py index 4a9368dc9..7e13d4075 100644 --- a/dataprofiler/data_readers/csv_data.py +++ b/dataprofiler/data_readers/csv_data.py @@ -87,6 +87,7 @@ def __init__( self._checked_header: bool = "header" in options and self._header != "auto" self._default_delimiter: str = "," self._default_quotechar: str = '"' + self._sample_nrows: Optional[int] = options.get("sample_nrows", None) if data is not None: self._load_data(data) @@ -115,6 +116,11 @@ def header(self) -> Optional[Union[str, int]]: """Return header.""" return self._header + @property + def sample_nrows(self) -> Optional[int]: + """Return sample_nrows.""" + return self._sample_nrows + @property def is_structured(self) -> bool: """Determine compatibility with StructuredProfiler.""" @@ -168,6 +174,10 @@ def _check_and_return_options(options: Optional[Dict]) -> Dict: raise ValueError( "'record_samples_per_line' must be an int " "more than 0" ) + if "sample_nrows" in options: + value = options["sample_nrows"] + if not isinstance(value, int) or value < 1: + raise ValueError("'sample_nrows' must be an int more than 0") return options @staticmethod @@ -549,6 +559,7 @@ def _load_data_from_str(self, data_as_str: str) -> pd.DataFrame: data_buffered, self.delimiter, cast(Optional[int], self.header), + self.sample_nrows, self.selected_columns, read_in_string=True, ) @@ -595,6 +606,7 @@ def _load_data_from_file(self, input_file_path: str) -> pd.DataFrame: input_file_path, self.delimiter, cast(Optional[int], self.header), + self.sample_nrows, self.selected_columns, read_in_string=True, encoding=self.file_encoding, diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index b01321ecb..3e433d85d 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -1,9 +1,13 @@ """Contains functions for data readers.""" import json +import os +import random import re import urllib from collections import OrderedDict from io import BytesIO, StringIO, TextIOWrapper +from itertools import islice +from math import floor, log, log1p from typing import ( Any, Dict, @@ -24,7 +28,7 @@ from chardet.universaldetector import UniversalDetector from typing_extensions import TypeGuard -from .. import dp_logging +from .. 
import dp_logging, settings from .._typing import JSONType, Url from .filepath_or_buffer import FileOrBufferHandler, is_stream_buffer # NOQA @@ -268,10 +272,106 @@ def read_json( return lines +def reservoir(file: TextIOWrapper, sample_nrows: int) -> list: + """ + Implement the mathematical logic of Reservoir sampling. + + :param file: wrapper of the opened csv file + :type file: TextIOWrapper + :param sample_nrows: number of rows to sample + :type sample_nrows: int + + :raises: ValueError() + + :return: sampled values + :rtype: list + """ + # Copyright 2021 Oscar Benjamin + # + # Permission is hereby granted, free of charge, to any person obtaining a copy + # of this software and associated documentation files (the "Software"), to deal + # in the Software without restriction, including without limitation the rights + # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + # copies of the Software, and to permit persons to whom the Software is + # furnished to do so, subject to the following conditions: + # + # The above copyright notice and this permission notice shall be included in + # all copies or substantial portions of the Software. + # + # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + # SOFTWARE. + # https://gist.github.com/oscarbenjamin/4c1b977181f34414a425f68589e895d1 + + iterator = iter(file) + values = list(islice(iterator, sample_nrows)) + + irange = range(len(values)) + indices = dict(zip(irange, irange)) + + kinv = 1 / sample_nrows + W = 1.0 + rng = random.Random(x=settings._seed) + if "DATAPROFILER_SEED" in os.environ and settings._seed is None: + seed = os.environ.get("DATAPROFILER_SEED") + if seed: + rng = random.Random(int(seed)) + + while True: + W *= rng.random() ** kinv + # random() < 1.0 but random() ** kinv might not be + # W == 1.0 implies "infinite" skips + if W == 1.0: + break + # skip is geometrically distributed with parameter W + skip = floor(log(rng.random()) / log1p(-W)) + try: + newval = next(islice(iterator, skip, skip + 1)) + except StopIteration: + break + # Append new, replace old with dummy, and keep track of order + remove_index = rng.randrange(sample_nrows) + values[indices[remove_index]] = str(None) + indices[remove_index] = len(values) + values.append(newval) + + values = [values[indices[i]] for i in irange] + return values + + +def rsample(file_path: TextIOWrapper, sample_nrows: int, args: dict) -> StringIO: + """ + Implement Reservoir Sampling to sample n rows out of a total of M rows. 
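+ + The reservoir is filled with the first sample_nrows data rows; the remaining + rows are then visited at geometrically distributed skip intervals (Li's + "Algorithm L", implemented in reservoir() above), so the file is read in a + single pass and each row is retained with equal probability.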
+ + :param file_path: wrapper of the opened csv file to be sampled + :type file_path: TextIOWrapper + :param sample_nrows: number of rows being sampled + :type sample_nrows: int + :param args: options to read the csv file + :type args: dict + + :return: sampled rows, as an in-memory text buffer + :rtype: StringIO + """ + header = args["header"] + result = [] + + if header is not None: + result = [[next(file_path) for i in range(header + 1)][-1]] + args["header"] = 0 + + result += reservoir(file_path, sample_nrows) + + fo = StringIO("".join([i if (i[-1] == "\n") else i + "\n" for i in result])) + return fo + + def read_csv_df( file_path: Union[str, BytesIO, TextIOWrapper], delimiter: Optional[str], header: Optional[int], + sample_nrows: Optional[int] = None, selected_columns: List[str] = [], read_in_string: bool = False, encoding: Optional[str] = "utf-8", @@ -314,19 +414,28 @@ # account for py3.6 requirement for pandas, can remove if >= py3.7 is_buf_wrapped = False + is_file_open = False if isinstance(file_path, BytesIO): # a BytesIO stream has to be wrapped in order to properly be detached # in 3.6 this avoids read_csv wrapping the stream and closing too early file_path = TextIOWrapper(file_path, encoding=encoding) is_buf_wrapped = True - - fo = pd.read_csv(file_path, **args) + elif isinstance(file_path, str): + file_path = open(file_path, encoding=encoding) + is_file_open = True + + file_data = file_path + if sample_nrows: + file_data = rsample(file_path, sample_nrows, args) + fo = pd.read_csv(file_data, **args) data = fo.read() # if the buffer was wrapped, detach it before returning if is_buf_wrapped: file_path = cast(TextIOWrapper, file_path) file_path.detach() + elif is_file_open: + file_path.close() fo.close() return data diff --git a/dataprofiler/data_readers/graph_data.py b/dataprofiler/data_readers/graph_data.py index a1bc9b35f..337408a68 100644 --- a/dataprofiler/data_readers/graph_data.py +++ b/dataprofiler/data_readers/graph_data.py @@ -255,7 +255,7 @@ def _format_data_networkx(self) -> nx.Graph: self.input_file_path, self._delimiter, cast(Optional[int], self._header), - [], + selected_columns=[], read_in_string=True, encoding=self.file_encoding, ) diff --git a/dataprofiler/labelers/base_data_labeler.py b/dataprofiler/labelers/base_data_labeler.py index 611d07d8b..201f78998 100644 --- a/dataprofiler/labelers/base_data_labeler.py +++ b/dataprofiler/labelers/base_data_labeler.py @@ -637,7 +637,9 @@ def load_from_library(cls, name: str) -> BaseDataLabeler: :return: DataLabeler class :rtype: BaseDataLabeler """ - return cls(os.path.join(default_labeler_dir, name)) + labeler = cls(os.path.join(default_labeler_dir, name)) + labeler._default_model_loc = name + return labeler @classmethod def load_from_disk(cls, dirpath: str, load_options: dict = None) -> BaseDataLabeler: diff --git a/dataprofiler/labelers/data_labelers.py b/dataprofiler/labelers/data_labelers.py index f2c5aaa63..7172e7472 100644 --- a/dataprofiler/labelers/data_labelers.py +++ b/dataprofiler/labelers/data_labelers.py @@ -102,7 +102,7 @@ def __new__( # type: ignore trainable: bool = False, ) -> BaseDataLabeler: """ - Create structured and unstructred data labeler objects. + Create structured and unstructured data labeler objects. 
:param dirpath: Path to load data labeler :type dirpath: str @@ -143,6 +143,9 @@ def load_from_library(cls, name: str, trainable: bool = False) -> BaseDataLabele """ if trainable: return TrainableDataLabeler.load_from_library(name) + for _, labeler_class_obj in cls.labeler_classes.items(): + if name in labeler_class_obj._default_model_loc: + return labeler_class_obj() return BaseDataLabeler.load_from_library(name) @classmethod diff --git a/dataprofiler/profilers/__init__.py b/dataprofiler/profilers/__init__.py index d2a057e0b..64e33e384 100644 --- a/dataprofiler/profilers/__init__.py +++ b/dataprofiler/profilers/__init__.py @@ -1,12 +1,98 @@ """Package for providing statistics and predictions for a given dataset.""" +from . import json_decoder from .base_column_profilers import BaseColumnProfiler from .categorical_column_profile import CategoricalColumn +from .column_profile_compilers import ( + BaseCompiler, + ColumnDataLabelerCompiler, + ColumnPrimitiveTypeProfileCompiler, + ColumnStatsProfileCompiler, +) from .data_labeler_column_profile import DataLabelerColumn from .datetime_column_profile import DateTimeColumn from .float_column_profile import FloatColumn from .int_column_profile import IntColumn from .numerical_column_stats import NumericStatsMixin from .order_column_profile import OrderColumn -from .profile_builder import Profiler, StructuredProfiler, UnstructuredProfiler +from .profile_builder import ( + Profiler, + StructuredColProfiler, + StructuredProfiler, + UnstructuredProfiler, +) +from .profiler_options import ( + BaseInspectorOptions, + BooleanOption, + CategoricalOptions, + CorrelationOptions, + DataLabelerOptions, + DateTimeOptions, + FloatOptions, + HistogramOption, + HyperLogLogOptions, + IntOptions, + ModeOption, + NumericalOptions, + OrderOptions, + PrecisionOptions, + ProfilerOptions, + RowStatisticsOptions, + StructuredOptions, + TextOptions, + TextProfilerOptions, + UniqueCountOptions, + UnstructuredOptions, +) from .text_column_profile import TextColumn from .unstructured_labeler_profile import UnstructuredLabelerProfile + +# set here to avoid circular imports +json_decoder._profiles = { + CategoricalColumn.__name__: CategoricalColumn, + FloatColumn.__name__: FloatColumn, + IntColumn.__name__: IntColumn, + DateTimeColumn.__name__: DateTimeColumn, + OrderColumn.__name__: OrderColumn, + DataLabelerColumn.__name__: DataLabelerColumn, + TextColumn.__name__: TextColumn, +} + + +json_decoder._compilers = { + ColumnDataLabelerCompiler.__name__: ColumnDataLabelerCompiler, + ColumnPrimitiveTypeProfileCompiler.__name__: ColumnPrimitiveTypeProfileCompiler, + ColumnStatsProfileCompiler.__name__: ColumnStatsProfileCompiler, +} + +json_decoder._options = { + BooleanOption.__name__: BooleanOption, + HistogramOption.__name__: HistogramOption, + ModeOption.__name__: ModeOption, + BaseInspectorOptions.__name__: BaseInspectorOptions, + NumericalOptions.__name__: NumericalOptions, + IntOptions.__name__: IntOptions, + PrecisionOptions.__name__: PrecisionOptions, + FloatOptions.__name__: FloatOptions, + TextOptions.__name__: TextOptions, + DateTimeOptions.__name__: DateTimeOptions, + OrderOptions.__name__: OrderOptions, + CategoricalOptions.__name__: CategoricalOptions, + CorrelationOptions.__name__: CorrelationOptions, + UniqueCountOptions.__name__: UniqueCountOptions, + HyperLogLogOptions.__name__: HyperLogLogOptions, + RowStatisticsOptions.__name__: RowStatisticsOptions, + DataLabelerOptions.__name__: DataLabelerOptions, + TextProfilerOptions.__name__: TextProfilerOptions, + 
StructuredOptions.__name__: StructuredOptions, + UnstructuredOptions.__name__: UnstructuredOptions, + ProfilerOptions.__name__: ProfilerOptions, +} + + +json_decoder._profilers = { + StructuredProfiler.__name__: StructuredProfiler, +} + +json_decoder._structured_col_profiler = { + StructuredColProfiler.__name__: StructuredColProfiler, +} diff --git a/dataprofiler/profilers/base_column_profilers.py b/dataprofiler/profilers/base_column_profilers.py index 3b658d10c..d9c183c99 100644 --- a/dataprofiler/profilers/base_column_profilers.py +++ b/dataprofiler/profilers/base_column_profilers.py @@ -11,9 +11,8 @@ import numpy as np import pandas as pd -from dataprofiler.profilers.profiler_options import BaseInspectorOptions - from . import utils +from .profiler_options import BaseInspectorOptions, BaseOption BaseColumnProfilerT = TypeVar("BaseColumnProfilerT", bound="BaseColumnProfiler") @@ -30,7 +29,7 @@ class BaseColumnProfiler(Generic[BaseColumnProfilerT], metaclass=abc.ABCMeta): _SAMPLING_RATIO = 0.20 _MIN_SAMPLING_COUNT = 500 - def __init__(self, name: str | None) -> None: + def __init__(self, name: str | None, options: BaseOption | None = None): """ Initialize base class properties for the subclass. @@ -249,6 +248,44 @@ def report(self, remove_disabled_flag: bool = False) -> dict: """ raise NotImplementedError() + @classmethod + def load_from_dict( + cls: type[BaseColumnProfilerT], + data: dict[str, Any], + config: dict | None = None, + ) -> BaseColumnProfilerT: + """ + Parse attribute from json dictionary into self. + + :param data: dictionary with attributes and values. + :type data: dict[string, Any] + :param config: config for loading column profiler params from dictionary + :type config: Dict | None + + :return: Profiler with attributes populated. + :rtype: BaseColumnProfiler + """ + if config is None: + config = {} + + class_options = config.get(cls.__name__) + profile: BaseColumnProfilerT = cls(data["name"], class_options) + + time_vals = data.pop("times") + setattr(profile, "times", defaultdict(float, time_vals)) + + for attr, value in data.items(): + if "__calculations" in attr: + for metric, function in value.items(): + if not hasattr(profile, function): + raise AttributeError( + f"Object {type(profile)} has no attribute {function}." + ) + value[metric] = getattr(profile, function).__func__ + setattr(profile, attr, value) + + return profile + BaseColumnPrimitiveTypeProfilerT = TypeVar( "BaseColumnPrimitiveTypeProfilerT", bound="BaseColumnPrimitiveTypeProfiler" @@ -282,7 +319,7 @@ def _update_column_base_properties(self, profile: dict) -> None: :type profile: base data profile dict :return: None """ - self.match_count += profile.pop("match_count") + self.match_count += int(profile.pop("match_count")) BaseColumnProfiler._update_column_base_properties(self, profile) def _add_helper( diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index 58e05a952..caaf3778e 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -5,9 +5,11 @@ from operator import itemgetter from typing import cast +import datasketches from pandas import DataFrame, Series -from . import BaseColumnProfiler, utils +from . 
import utils +from .base_column_profilers import BaseColumnProfiler from .profiler_options import CategoricalOptions @@ -52,6 +54,11 @@ def __init__(self, name: str | None, options: CategoricalOptions = None) -> None self._stopped_at_unique_ratio: float | None = None self._stopped_at_unique_count: int | None = None + + self._cms_max_num_heavy_hitters: int | None = 5000 + self.cms_num_hashes: int | None = None + self.cms_num_buckets: int | None = None + self.cms: datasketches.count_min_sketch | None = None if options: self._top_k_categories = options.top_k_categories self.stop_condition_unique_value_ratio = ( options.stop_condition_unique_value_ratio @@ -61,6 +68,20 @@ options.max_sample_size_to_check_stop_condition ) + if options.cms: + self._cms_max_num_heavy_hitters = options.cms_max_num_heavy_hitters + self.cms_num_hashes = datasketches.count_min_sketch.suggest_num_hashes( + options.cms_confidence + ) + self.cms_num_buckets = ( + datasketches.count_min_sketch.suggest_num_buckets( + options.cms_relative_error + ) + ) + self.cms = datasketches.count_min_sketch( + self.cms_num_hashes, self.cms_num_buckets + ) + def __add__(self, other: CategoricalColumn) -> CategoricalColumn: """ Merge the properties of two CategoricalColumn profiles. @@ -83,56 +104,140 @@ def __add__(self, other: CategoricalColumn) -> CategoricalColumn: self._merge_calculations( merged_profile.__calculations, self.__calculations, other.__calculations ) - # If both profiles have not met stop condition - if not (self._stop_condition_is_met or other._stop_condition_is_met): - merged_profile._categories = utils.add_nested_dictionaries( - self._categories, other._categories + + if self.cms and other.cms: + + assert isinstance(self._cms_max_num_heavy_hitters, int) + assert isinstance(other._cms_max_num_heavy_hitters, int) + cms_max_num_heavy_hitters: int = min( + self._cms_max_num_heavy_hitters, other._cms_max_num_heavy_hitters ) - # Transfer stop condition variables of 1st profile object to merged profile - # if they are not None else set to 2nd profile - profile1_product = self.sample_size * self.unique_ratio - profile2_product = other.sample_size * other.unique_ratio - if profile1_product < profile2_product: - merged_profile.max_sample_size_to_check_stop_condition = ( - self.max_sample_size_to_check_stop_condition - ) - merged_profile.stop_condition_unique_value_ratio = ( - self.stop_condition_unique_value_ratio - ) - else: - merged_profile.stop_condition_unique_value_ratio = ( - other.stop_condition_unique_value_ratio - ) - merged_profile.max_sample_size_to_check_stop_condition = ( - other.max_sample_size_to_check_stop_condition + ( + merged_profile.cms, + merged_profile._categories, + merged_profile._cms_max_num_heavy_hitters, + ) = self._merge_categories_cms( + self.cms, + self._categories, + self.sample_size, + {}, + other.cms, + other._categories, + other.sample_size, + cms_max_num_heavy_hitters, + ) + + elif not self.cms and not other.cms: + # If both profiles have not met stop condition + if not (self._stop_condition_is_met or other._stop_condition_is_met): + merged_profile._categories = utils.add_nested_dictionaries( + self._categories, other._categories ) - # Check merged profile w/ stop condition - if merged_profile._check_stop_condition_is_met( - merged_profile.sample_size, merged_profile.unique_ratio - ): - merged_profile._stopped_at_unique_ratio = merged_profile.unique_ratio - merged_profile._stopped_at_unique_count = merged_profile.unique_count + # Transfer stop condition 
variables of 1st profile object to + # merged profile if they are not None else set to 2nd profile + profile1_product = self.sample_size * self.unique_ratio + profile2_product = other.sample_size * other.unique_ratio + if profile1_product < profile2_product: + merged_profile.max_sample_size_to_check_stop_condition = ( + self.max_sample_size_to_check_stop_condition + ) + merged_profile.stop_condition_unique_value_ratio = ( + self.stop_condition_unique_value_ratio + ) + else: + merged_profile.stop_condition_unique_value_ratio = ( + other.stop_condition_unique_value_ratio + ) + merged_profile.max_sample_size_to_check_stop_condition = ( + other.max_sample_size_to_check_stop_condition + ) + + # Check merged profile w/ stop condition + if merged_profile._check_stop_condition_is_met( + merged_profile.sample_size, merged_profile.unique_ratio + ): + merged_profile._stopped_at_unique_ratio = ( + merged_profile.unique_ratio + ) + merged_profile._stopped_at_unique_count = ( + merged_profile.unique_count + ) + merged_profile._categories = {} + merged_profile._stop_condition_is_met = True + + else: + if self.sample_size > other.sample_size: + merged_profile._stopped_at_unique_ratio = self.unique_ratio + merged_profile._stopped_at_unique_count = self.unique_count + merged_profile.sample_size = self.sample_size + else: + merged_profile._stopped_at_unique_ratio = other.unique_ratio + merged_profile._stopped_at_unique_count = other.unique_count + merged_profile.sample_size = other.sample_size + + # If either profile has hit stop condition, remove categories dict merged_profile._categories = {} merged_profile._stop_condition_is_met = True else: - if self.sample_size > other.sample_size: - merged_profile._stopped_at_unique_ratio = self.unique_ratio - merged_profile._stopped_at_unique_count = self.unique_count - merged_profile.sample_size = self.sample_size - else: - merged_profile._stopped_at_unique_ratio = other.unique_ratio - merged_profile._stopped_at_unique_count = other.unique_count - merged_profile.sample_size = other.sample_size - - # If either profile has hit stop condition, remove categories dict - merged_profile._categories = {} - merged_profile._stop_condition_is_met = True + raise Exception( + "Unable to add two profiles: One is using count min sketch " + "and the other is using full." + ) return merged_profile + @property + def gini_impurity(self) -> float | None: + """ + Return Gini Impurity. + + Gini Impurity is a way to calculate + the likelihood of an incorrect classification of a new instance of + a random variable. + + G = Σ(i=1; J): P(i) * (1 - P(i)), where i is the category classes. + We are traversing through categories and calculating with the column + + :return: None or Gini Impurity probability + """ + if self.sample_size == 0: + return None + gini_sum: float = 0 + for i in self._categories: + gini_sum += (self._categories[i] / self.sample_size) * ( + 1 - (self._categories[i] / self.sample_size) + ) + return gini_sum
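As a quick sanity check of the Gini impurity formula above, the snippet below recomputes it for a hypothetical 6/3/1 category split in plain Python, independent of the profiler class:

```python
# Standalone check of G = sum_i P(i) * (1 - P(i)) over the category classes.
categories = {"a": 6, "b": 3, "c": 1}  # category -> observed count
n = sum(categories.values())

gini = sum((count / n) * (1 - count / n) for count in categories.values())
print(round(gini, 2))  # 0.54 = 0.6*0.4 + 0.3*0.7 + 0.1*0.9
```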
+ + @property + def unalikeability(self) -> float | None: + """ + Return Unalikeability. + + Unalikeability checks for "how often observations differ from one another" + Reference: Perry, M. and Kader, G. Variation as Unalikeability. + Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60. + + U = Σ(i=1,n)Σ(j=1,n): (Cij)/(n**2-n) + Cij = 1 if i!=j, 0 if i=j + + :return: None or unalikeability probability + """ + if self.sample_size == 0: + return None + elif self.sample_size == 1: + return 0 + unalike_sum: int = 0 + for category in self._categories: + unalike_sum += ( + self.sample_size - self._categories[category] + ) * self._categories[category] + unalike: float = unalike_sum / (self.sample_size**2 - self.sample_size) + return unalike + def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: """ Find the differences for CategoricalColumns. @@ -212,6 +317,24 @@ def report(self, remove_disabled_flag: bool = False) -> dict: """ return self.profile + @classmethod + def load_from_dict(cls, data: dict, config: dict | None = None): + """ + Parse attribute from json dictionary into self. + + :param data: dictionary with attributes and values. + :type data: dict[string, Any] + :param config: config for loading column profiler params from dictionary + :type config: Dict | None + + :return: Profiler with attributes populated. + :rtype: CategoricalColumn + """ + value = data.pop("_categories") + profile = super().load_from_dict(data) + setattr(profile, "_categories", defaultdict(int, value)) + return profile + @property def profile(self) -> dict: """ @@ -323,6 +446,118 @@ def _update_stop_condition(self, data: DataFrame): self._stopped_at_unique_ratio = merged_unique_ratio self._stopped_at_unique_count = merged_unique_count + @BaseColumnProfiler._timeit(name="categories") + def _get_categories_cms(self, df_series, len_df): + """Return count min sketch and heavy hitters for both the batch and stream case. + + :param df_series: Series currently being processed by categorical profiler + :type df_series: Series + :param len_df: the total number of samples in df_series + :type len_df: int + :return: cms, heavy_hitter_dict, missing_heavy_hitter_dict + """ + cms = datasketches.count_min_sketch(self.cms_num_hashes, self.cms_num_buckets) + heavy_hitter_dict = defaultdict(int) + missing_heavy_hitter_dict = defaultdict(int) + for i, value in enumerate(df_series): + cms.update(value) + i_count = cms.get_estimate(value) + i_total_count = i_count + self.cms.get_estimate(value) + # approximate heavy-hitters + if i_count >= int(len_df / self._cms_max_num_heavy_hitters): + heavy_hitter_dict[value] = i_count + missing_heavy_hitter_dict.pop(value, None) + elif i_total_count >= int( + (self.sample_size + len_df) / self._cms_max_num_heavy_hitters + ): + missing_heavy_hitter_dict[value] = i_total_count + heavy_hitter_dict.pop(value, None) + + return cms, heavy_hitter_dict, missing_heavy_hitter_dict
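For readers unfamiliar with the datasketches calls used above, this minimal sketch exercises the same count-min primitives in isolation; the confidence and relative-error values here are arbitrary placeholders, not the profiler's defaults:

```python
import datasketches

# Size the sketch from accuracy targets, as the profiler options do.
num_hashes = datasketches.count_min_sketch.suggest_num_hashes(0.95)  # confidence
num_buckets = datasketches.count_min_sketch.suggest_num_buckets(0.01)  # relative error
cms = datasketches.count_min_sketch(num_hashes, num_buckets)

for value in ["a", "a", "b", "a", "c"]:
    cms.update(value)

# Count-min estimates can overcount within the error bound, never undercount.
print(cms.get_estimate("a"))  # >= 3
```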
+ + @BaseColumnProfiler._timeit(name="categories") + def _merge_categories_cms( + self, + cms1, + heavy_hitter_dict1, + len1, + missing_heavy_hitter_dict, + cms2, + heavy_hitter_dict2, + len2, + max_num_heavy_hitters, + ): + """Return the aggregate count min sketch and approximate histogram (categories). + + :param cms1: count min sketch + :type cms1: datasketches.count_min_sketch + :param cms2: count min sketch + :type cms2: datasketches.count_min_sketch + :param heavy_hitter_dict1: Heavy Hitters category count + :type heavy_hitter_dict1: Dict + :param heavy_hitter_dict2: Heavy Hitters category count + :type heavy_hitter_dict2: Dict + :param missing_heavy_hitter_dict: Heavy Hitters category count + considering two batches are two chunks of a data stream + :type missing_heavy_hitter_dict: Dict + :param len1: number of samples in batch 1 + :type len1: int + :param len2: number of samples in batch 2 + :type len2: int + :param max_num_heavy_hitters: value used to define + the threshold for minimum frequency required by a category to be counted + :type max_num_heavy_hitters: int + :return: cms3, categories, max_num_heavy_hitters + """ + try: + cms3 = datasketches.count_min_sketch( + self.cms_num_hashes, self.cms_num_buckets + ) + cms3.merge(cms1) + cms3.merge(cms2) + except ValueError as err: + raise ValueError( + """Incompatible sketch configuration. When merging two sketches, + they must have the same number of buckets and hashes, + which are defined by cms_confidence and cms_relative_error options, + respectively.""" + ) from err + + # re-collecting the estimates of non-intersecting categories before + # re-applying heavy-hitters to the aggregate profile. + heavy_hitter_dict1 = heavy_hitter_dict1.copy() + heavy_hitter_dict2 = heavy_hitter_dict2.copy() + for k in (x for x in heavy_hitter_dict1 if x not in heavy_hitter_dict2): + heavy_hitter_dict2[k] = cms2.get_estimate(k) + for k in (x for x in heavy_hitter_dict2 if x not in heavy_hitter_dict1): + heavy_hitter_dict1[k] = cms1.get_estimate(k) + + categories = utils.add_nested_dictionaries( + heavy_hitter_dict2, heavy_hitter_dict1 + ) + + # This is a catch-all for edge cases where batch heavy hitters underestimate + # frequencies compared to treating the batches as consecutive chunks of + # the same stream. + categories.update(missing_heavy_hitter_dict) + + total_samples = len1 + len2 + for cat in list(categories): + if categories[cat] < (total_samples / max_num_heavy_hitters): + categories.pop(cat) + return cms3, categories, max_num_heavy_hitters + + def _get_categories_full(self, df_series) -> dict: + """Get the unique counts (categories) of a series. 
+ + :param df_series: df series with nulls removed + :type df_series: pandas.core.series.Series + :return: dict of counts for each unique value + :rtype: dict + """ + category_count: dict = df_series.value_counts(dropna=False).to_dict() + return category_count + @BaseColumnProfiler._timeit(name="categories") def _update_categories( self, @@ -345,13 +580,36 @@ def _update_categories( :type df_series: pandas.DataFrame :return: None """ - category_count = df_series.value_counts(dropna=False).to_dict() - self._categories = utils.add_nested_dictionaries( - self._categories, category_count - ) - self._update_stop_condition(df_series) - if self._stop_condition_is_met: - self._categories = {} + if self.cms is not None: + if self._cms_max_num_heavy_hitters is None: + raise ValueError( + "when using CMS, cms_max_num_heavy_hitters must be an integer" + ) + len_df = len(df_series) + ( + cms, + heavy_hitter_dict, + missing_heavy_hitter_dict, + ) = self._get_categories_cms(df_series, len_df) + + self.cms, self._categories, _ = self._merge_categories_cms( + cms, + heavy_hitter_dict, + len_df, + missing_heavy_hitter_dict, + self.cms, + self._categories, + self.sample_size, + self._cms_max_num_heavy_hitters, + ) + else: + category_count = self._get_categories_full(df_series) + self._categories = utils.add_nested_dictionaries( + self._categories, category_count + ) + self._update_stop_condition(df_series) + if self._stop_condition_is_met: + self._categories = {} def _update_helper(self, df_series_clean: Series, profile: dict) -> None: """ @@ -391,52 +649,3 @@ def update(self, df_series: Series) -> CategoricalColumn: self._update_helper(df_series, profile) return self - - @property - def gini_impurity(self) -> float | None: - """ - Return Gini Impurity. - - Gini Impurity is a way to calculate - likelihood of an incorrect classification of a new instance of - a random variable. - - G = Σ(i=1; J): P(i) * (1 - P(i)), where i is the category classes. - We are traversing through categories and calculating with the column - - :return: None or Gini Impurity probability - """ - if self.sample_size == 0: - return None - gini_sum: float = 0 - for i in self._categories: - gini_sum += (self._categories[i] / self.sample_size) * ( - 1 - (self._categories[i] / self.sample_size) - ) - return gini_sum - - @property - def unalikeability(self) -> float | None: - """ - Return Unlikeability. - - Unikeability checks for "how often observations differ from one another" - Reference: Perry, M. and Kader, G. Variation as Unalikeability. - Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60. 
- - U = Σ(i=1,n)Σ(j=1,n): (Cij)/(n**2-n) - Cij = 1 if i!=j, 0 if i=j - - :return: None or unlikeability probability - """ - if self.sample_size == 0: - return None - elif self.sample_size == 1: - return 0 - unalike_sum: int = 0 - for category in self._categories: - unalike_sum += ( - self.sample_size - self._categories[category] - ) * self._categories[category] - unalike: float = unalike_sum / (self.sample_size**2 - self.sample_size) - return unalike diff --git a/dataprofiler/profilers/column_profile_compilers.py b/dataprofiler/profilers/column_profile_compilers.py index 441729f33..e3a8ecb16 100644 --- a/dataprofiler/profilers/column_profile_compilers.py +++ b/dataprofiler/profilers/column_profile_compilers.py @@ -14,6 +14,7 @@ from .datetime_column_profile import DateTimeColumn from .float_column_profile import FloatColumn from .int_column_profile import IntColumn +from .json_decoder import load_column_profile from .order_column_profile import OrderColumn from .profiler_options import BaseOption, StructuredOptions, UnstructuredOptions from .text_column_profile import TextColumn @@ -48,6 +49,7 @@ def __init__( if self._option_class is None: raise NotImplementedError("Must set the expected OptionClass.") + self.name = None self._profiles: dict = OrderedDict() if df_series is not None: self.name = df_series.name @@ -219,6 +221,36 @@ def update_profile( self._profiles[profile_type].update(df_series) return self + @classmethod + def load_from_dict(cls, data, config: dict | None = None) -> BaseCompiler: + """ + Parse attribute from json dictionary into self. + + :param data: dictionary with attributes and values. + :type data: dict[string, Any] + :param config: config for loading column profiler params from dictionary + :type config: Dict | None + + :return: Compiler with attributes populated. + :rtype: BaseCompiler + """ + compiler = cls() + + for attr, value in data.items(): + if "_profiles" in attr: + for col_type, profile_as_dict in value.items(): + value[col_type] = load_column_profile(profile_as_dict, config) + # since needs to be in the same order, use _profilers to enforce + value = OrderedDict( + { + k.type: value[k.type] + for k in compiler._profilers + if k.type in value + } + ) + setattr(compiler, attr, value) + return compiler + class ColumnPrimitiveTypeProfileCompiler( BaseCompiler["ColumnPrimitiveTypeProfileCompiler"] diff --git a/dataprofiler/profilers/data_labeler_column_profile.py b/dataprofiler/profilers/data_labeler_column_profile.py index a40a60dcf..9487278d6 100644 --- a/dataprofiler/profilers/data_labeler_column_profile.py +++ b/dataprofiler/profilers/data_labeler_column_profile.py @@ -9,7 +9,8 @@ from ..labelers.base_data_labeler import BaseDataLabeler from ..labelers.data_labelers import DataLabeler -from . import BaseColumnProfiler, utils +from . import utils +from .base_column_profilers import BaseColumnProfiler from .profiler_options import DataLabelerOptions @@ -306,6 +307,43 @@ def profile(self) -> dict: } return profile + @classmethod + def load_from_dict(cls, data, config: dict | None = None) -> DataLabelerColumn: + """ + Parse attribute from json dictionary into self. + + :param data: dictionary with attributes and values. + :type data: dict[string, Any] + :param config: config for loading column profiler params from dictionary + :type config: Dict | None + + :return: Profiler with attributes populated. 
+ :rtype: DataLabelerColumn + """ + opt = DataLabelerOptions() + data_labeler_object = None + + data_labeler_load_attr = data.pop("data_labeler") + if data_labeler_load_attr: + data_labeler_object = utils.reload_labeler_from_options_or_get_new( + data_labeler_load_attr, config + ) + if data_labeler_object is not None: + opt.data_labeler_object = data_labeler_object + + # This is an ambiguous call to super classes. + # If load_from_dict is part of both super classes there may be issues + profile = super().load_from_dict(data, config={cls.__name__: opt}) + + if profile._reverse_label_mapping is not None: + profile._reverse_label_mapping = { + int(k): v for k, v in profile._reverse_label_mapping.items() + } + if profile._sum_predictions is not None: + profile._sum_predictions = np.array(profile._sum_predictions) + + return profile + def report(self, remove_disabled_flag: bool = False) -> dict: """ Return report. diff --git a/dataprofiler/profilers/datetime_column_profile.py b/dataprofiler/profilers/datetime_column_profile.py index 520b19f28..fc7801dd3 100644 --- a/dataprofiler/profilers/datetime_column_profile.py +++ b/dataprofiler/profilers/datetime_column_profile.py @@ -131,6 +131,31 @@ def report(self, remove_disabled_flag: bool = False) -> dict: """ return self.profile + @classmethod + def load_from_dict(cls, data, config: dict | None = None): + """ + Parse attribute from json dictionary into self. + + :param data: dictionary with attributes and values. + :type data: dict[string, Any] + :param config: config for loading column profiler params from dictionary + :type config: Dict | None + + :return: Profiler with attributes populated. + :rtype: DateTimeColumn + """ + # This is an ambiguous call to super classes. + # If load_from_dict is part of both super classes there may be issues + profile = super().load_from_dict(data) + + if profile._dt_obj_min is not None: + profile._dt_obj_min = pd.Timestamp(profile._dt_obj_min) + + if profile._dt_obj_max is not None: + profile._dt_obj_max = pd.Timestamp(profile._dt_obj_max) + + return profile + @property def profile(self) -> dict: """Return the profile of the column.""" diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py index 231dc1778..bd3a1f3e3 100644 --- a/dataprofiler/profilers/float_column_profile.py +++ b/dataprofiler/profilers/float_column_profile.py @@ -159,6 +159,40 @@ def report(self, remove_disabled_flag: bool = False) -> dict: return profile + @classmethod + def load_from_dict(cls, data, config: dict | None = None): + """ + Parse attribute from json dictionary into self. + + :param data: dictionary with attributes and values. + :type data: dict[string, Any] + :param config: config for loading column profiler params from dictionary + :type config: Dict | None + + :return: Profiler with attributes populated. + :rtype: FloatColumn + """ + # This is an ambiguous call to super classes. 
+ # If load_from_dict is part of both super classes there may be issues + profile = super().load_from_dict(data) + profile._reformat_numeric_stats_types_on_serialized_profiles() + + # Fix float specific typing + if profile._precision["min"] is not None: + profile._precision["min"] = np.float64(profile._precision["min"]) + if profile._precision["max"] is not None: + profile._precision["max"] = np.float64(profile._precision["max"]) + if profile._precision["sum"] is not None: + profile._precision["sum"] = np.float64(profile._precision["sum"]) + if profile._precision["mean"] is not None: + profile._precision["mean"] = np.float64(profile._precision["mean"]) + if profile._precision["biased_var"] is not None: + profile._precision["biased_var"] = np.float64( + profile._precision["biased_var"] + ) + + return profile + @property def profile(self) -> dict: """ @@ -277,11 +311,11 @@ def _get_float_precision( # Determine statistics precision precision_sum = len_per_float.sum() subset_precision = { - "min": len_per_float.min(), - "max": len_per_float.max(), - "biased_var": float(np.var(len_per_float)), - "sum": precision_sum, - "mean": precision_sum / sample_size, + "min": np.float64(len_per_float.min()), + "max": np.float64(len_per_float.max()), + "biased_var": np.var(len_per_float), + "sum": np.float64(precision_sum), + "mean": np.float64(precision_sum / sample_size), "sample_size": sample_size, } diff --git a/dataprofiler/profilers/graph_profiler.py b/dataprofiler/profilers/graph_profiler.py index 4aa592515..123961d88 100644 --- a/dataprofiler/profilers/graph_profiler.py +++ b/dataprofiler/profilers/graph_profiler.py @@ -12,7 +12,8 @@ import scipy.stats as st from ..data_readers.graph_data import GraphData -from . import BaseColumnProfiler, utils +from . import utils +from .base_column_profilers import BaseColumnProfiler from .profiler_options import ProfilerOptions diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py index 7234f26e2..014465c71 100644 --- a/dataprofiler/profilers/int_column_profile.py +++ b/dataprofiler/profilers/int_column_profile.py @@ -72,6 +72,25 @@ def report(self, remove_disabled_flag: bool = False) -> dict: """ return self.profile + @classmethod + def load_from_dict(cls, data, config: dict | None = None): + """ + Parse attribute from json dictionary into self. + + :param data: dictionary with attributes and values. + :type data: dict[string, Any] + :param config: config for loading column profiler params from dictionary + :type config: Dict | None + + :return: Profiler with attributes populated. + :rtype: IntColumn + """ + # This is an ambiguous call to super classes. 
+ # If load_from_dict is part of both super classes there may be issues + profile = super().load_from_dict(data) + profile._reformat_numeric_stats_types_on_serialized_profiles() + return profile + @property def profile(self) -> dict: """ diff --git a/dataprofiler/profilers/json_decoder.py b/dataprofiler/profilers/json_decoder.py index 44b1846f7..16bc2e148 100644 --- a/dataprofiler/profilers/json_decoder.py +++ b/dataprofiler/profilers/json_decoder.py @@ -1,12 +1,25 @@ """Contains methods to decode components of a Profiler.""" +from __future__ import annotations -import json +from typing import TYPE_CHECKING -from .base_column_profilers import BaseColumnProfiler -from .categorical_column_profile import CategoricalColumn +if TYPE_CHECKING: + import column_profile_compilers as col_pro_compiler + from .base_column_profilers import BaseColumnProfiler + from .profile_builder import BaseProfiler, StructuredColProfiler + from .profiler_options import BaseOption -def get_column_profiler_class(class_name: str) -> BaseColumnProfiler: + +# default, but set in the local __init__ to avoid circular imports +_profiles: dict[str, type[BaseColumnProfiler]] = {} +_profilers: dict[str, type[BaseProfiler]] = {} +_compilers: dict[str, type[col_pro_compiler.BaseCompiler]] = {} +_options: dict[str, type[BaseOption]] = {} +_structured_col_profiler: dict[str, type[StructuredColProfiler]] = {} + + +def get_column_profiler_class(class_name: str) -> type[BaseColumnProfiler]: """ Use name of class to return default-constructed version of that class. @@ -18,18 +31,95 @@ def get_column_profiler_class(class_name: str) -> BaseColumnProfiler: :type class_name: str representing name of class :return: subclass of BaseColumnProfiler object """ - profiles = { - CategoricalColumn.__name__: CategoricalColumn, - } - - profile_class = profiles.get(class_name) + profile_class: type[BaseColumnProfiler] | None = _profiles.get(class_name) if profile_class is None: raise ValueError(f"Invalid profiler class {class_name} " f"failed to load.") - profiler: BaseColumnProfiler = profile_class(None) - return profiler + return profile_class + + +def get_compiler_class(class_name: str) -> type[col_pro_compiler.BaseCompiler]: + """ + Use name of class to return default-constructed version of that class. + + Raises ValueError if class_name is not name of a subclass of + BaseCompiler. + + :param class_name: name of BaseCompiler subclass retrieved by + calling type(instance).__name__ + :type class_name: str representing name of class + :return: subclass of BaseCompiler object + """ + compiler_class: type[col_pro_compiler.BaseCompiler] | None = _compilers.get( + class_name + ) + if compiler_class is None: + raise ValueError(f"Invalid compiler class {class_name} " f"failed to load.") + return compiler_class -def load_column_profile(serialized_json: dict) -> BaseColumnProfiler: +def get_option_class(class_name: str) -> type[BaseOption]: + """ + Use name of class to return default-constructed version of that class. + + Raises ValueError if class_name is not name of a subclass of + BaseOptions. 
+ + :param class_name: name of BaseOptions subclass retrieved by + calling type(instance).__name__ + :type class_name: str representing name of class + :return: subclass of BaseOptions object + """ + options_class: type[BaseOption] | None = _options.get(class_name) + if options_class is None: + raise ValueError(f"Invalid option class {class_name} " f"failed to load.") + return options_class + + +def get_profiler_class(class_name: str) -> type[BaseProfiler]: + """ + Use name of class to return default-constructed version of that class. + + Raises ValueError if class_name is not name of a subclass of + BaseProfiler. + + :param class_name: name of BaseProfiler subclass retrieved by + calling type(instance).__name__ + :type class_name: str representing name of class + + :raises: ValueError if the profiler class does not exist + + :return: subclass of BaseProfiler object + """ + profiler_class: type[BaseProfiler] | None = _profilers.get(class_name) + if profiler_class is None: + raise ValueError(f"Invalid profiler class {class_name} " f"failed to load.") + return profiler_class + + +def get_structured_col_profiler_class(class_name: str) -> type[StructuredColProfiler]: + """ + Use name of class to return default-constructed version of that class. + + Raises ValueError if class_name is not name of a subclass of + StructuredColProfiler. + + :param class_name: name of StructuredColProfiler subclass retrieved by + calling type(instance).__name__ + :type class_name: str representing name of class + :return: subclass of StructuredColProfiler object + """ + struct_col_profiler_class: None | ( + type[StructuredColProfiler] + ) = _structured_col_profiler.get(class_name) + if struct_col_profiler_class is None: + raise ValueError( + f"Invalid structured col profiler class {class_name} " f"failed to load." + ) + return struct_col_profiler_class + + +def load_column_profile( + serialized_json: dict, config: dict | None = None +) -> BaseColumnProfiler: """ Construct subclass of BaseColumnProfiler given a serialized JSON. @@ -47,24 +137,131 @@ serialized using the custom encoder in profilers.json_encoder :type serialized_json: a dict that was created by calling json.loads on a JSON representation using the custom encoder + :param config: config for overriding data params when loading from dict + :type config: Dict | None + + :return: subclass of BaseColumnProfiler that has been deserialized from JSON + + """ - column_profiler = get_column_profiler_class(serialized_json["class"]) - for attr, value in serialized_json["data"].items(): - column_profiler.__setattr__(attr, value) + column_profiler_cls: type[ + BaseColumnProfiler[BaseColumnProfiler] + ] = get_column_profiler_class(serialized_json["class"]) + return column_profiler_cls.load_from_dict(serialized_json["data"], config) - return column_profiler
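As a usage sketch, a column profile can now be round-tripped through the encoder/decoder pair; the column name here is hypothetical, and this assumes the ProfileEncoder added in json_encoder.py plus the class registries populated in profilers/__init__.py:

```python
import json

from dataprofiler.profilers.categorical_column_profile import CategoricalColumn
from dataprofiler.profilers.json_decoder import load_column_profile
from dataprofiler.profilers.json_encoder import ProfileEncoder

# Encode a profile to JSON, then rebuild it through the class registry.
profile = CategoricalColumn("example_column")
serialized = json.dumps(profile, cls=ProfileEncoder)
restored = load_column_profile(json.loads(serialized))  # config is optional
```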
+ + +def load_compiler( + serialized_json: dict, config: dict | None = None +) -> col_pro_compiler.BaseCompiler: + """ + Construct subclass of BaseCompiler given a serialized JSON. + + Expected format of serialized_json (see json_encoder): + { + "class": <class name>, + "data": { + <attr1>: <value1>, + <attr2>: <value2>, + ... + } + } + + :param serialized_json: JSON representation of profile compiler that was + serialized using the custom encoder in profilers.json_encoder + :type serialized_json: a dict that was created by calling json.loads on + a JSON representation using the custom encoder + :param config: config for overriding data params when loading from dict + :type config: Dict | None + :return: subclass of BaseCompiler that has been deserialized from + JSON -def decode_column_profiler(serialized: str) -> BaseColumnProfiler: """ - Construct subclass of BaseColumnProfiler given a serialized JSON. + column_profiler_cls: type[col_pro_compiler.BaseCompiler] = get_compiler_class( + serialized_json["class"] + ) + return column_profiler_cls.load_from_dict(serialized_json["data"], config) + + +def load_option(serialized_json: dict, config: dict | None = None) -> BaseOption: + """ + Construct subclass of BaseOption given a serialized JSON. + + Expected format of serialized_json (see json_encoder): + { + "class": <class name>, + "data": { + <attr1>: <value1>, + <attr2>: <value2>, + ... + } + } - :param serialized: JSON representation of column profiler that was + :param serialized_json: JSON representation of option that was serialized using the custom encoder in profilers.json_encoder - :type serialized: a JSON str serialized using the custom decoder - :return: subclass of BaseColumnProfiler that has been deserialized from + :type serialized_json: a dict that was created by calling json.loads on + a JSON representation using the custom encoder + :param config: config for overriding data params when loading from dict + :type config: Dict | None + :return: subclass of BaseOption that has been deserialized from + JSON + + """ + option_cls: type[BaseOption] = get_option_class(serialized_json["class"]) + return option_cls.load_from_dict(serialized_json["data"], config) + + +def load_profiler(serialized_json: dict, config=None) -> BaseProfiler: + """ + Construct subclass of BaseProfiler given a serialized JSON. + + Expected format of serialized_json (see json_encoder): + { + "class": <class name>, + "data": { + <attr1>: <value1>, + <attr2>: <value2>, + ... + } + } + + :param serialized_json: JSON representation of profiler that was + serialized using the custom encoder in profilers.json_encoder + :type serialized_json: a dict that was created by calling json.loads on + a JSON representation using the custom encoder + :param config: config for overriding data params when loading from dict + :type config: Dict | None + :return: subclass of BaseProfiler that has been deserialized from + JSON + """ + profiler_cls: type[BaseProfiler] = get_profiler_class(serialized_json["class"]) + return profiler_cls.load_from_dict(serialized_json["data"], config) + + +def load_structured_col_profiler( + serialized_json: dict, config: dict | None = None +) -> StructuredColProfiler: + """ + Construct subclass of StructuredColProfiler given a serialized JSON. + + Expected format of serialized_json (see json_encoder): + { + "class": <class name>, + "data": { + <attr1>: <value1>, + <attr2>: <value2>, + ... 
+ } + } + + :param serialized_json: JSON representation of column profiler that was + serialized using the custom encoder in profilers.json_encoder + :type serialized_json: a dict that was created by calling json.loads on + a JSON representation using the custom encoder + :param config: config for overriding data params when loading from dict + :type config: Dict | None + :return: subclass of StructuredColProfiler that has been deserialized from JSON """ - return load_column_profile(json.loads(serialized)) + profiler_cls: type[StructuredColProfiler] = get_structured_col_profiler_class( + serialized_json["class"] + ) + return profiler_cls.load_from_dict(serialized_json["data"], config) diff --git a/dataprofiler/profilers/json_encoder.py b/dataprofiler/profilers/json_encoder.py index 15a0d4e5c..4e12eb649 100644 --- a/dataprofiler/profilers/json_encoder.py +++ b/dataprofiler/profilers/json_encoder.py @@ -5,7 +5,14 @@ import numpy as np import pandas as pd -from . import base_column_profilers, numerical_column_stats +from ..labelers.base_data_labeler import BaseDataLabeler +from . import ( + base_column_profilers, + column_profile_compilers, + numerical_column_stats, + profile_builder, + profiler_options, +) class ProfileEncoder(json.JSONEncoder): @@ -17,22 +24,47 @@ def default(self, to_serialize): :param to_serialize: an object to be serialized :type to_serialize: a BaseColumnProfile object + + :raises: NotImplementedError + + :return: a datatype serializable by json.JSONEncoder """ + if isinstance(to_serialize, profile_builder.UnstructuredProfiler): + raise NotImplementedError( + "UnstructuredProfiler serialization not supported." + ) + if isinstance( to_serialize, ( base_column_profilers.BaseColumnProfiler, numerical_column_stats.NumericStatsMixin, + column_profile_compilers.BaseCompiler, + profiler_options.BaseOption, + profile_builder.BaseProfiler, + profile_builder.StructuredColProfiler, ), ): return {"class": type(to_serialize).__name__, "data": to_serialize.__dict__} + elif isinstance(to_serialize, set): + return list(to_serialize) elif isinstance(to_serialize, np.integer): return int(to_serialize) elif isinstance(to_serialize, np.ndarray): return to_serialize.tolist() elif isinstance(to_serialize, pd.Timestamp): return to_serialize.isoformat() + elif isinstance(to_serialize, BaseDataLabeler): + # TODO: This does not allow the user to serialize a model if it is loaded + # "from_disk". Changes to BaseDataLabeler are needed for this feature + if to_serialize._default_model_loc is None: + raise ValueError( + "Serialization cannot be done on labelers with " + "_default_model_loc not set" + ) + + return {"from_library": to_serialize._default_model_loc} + elif callable(to_serialize): return to_serialize.__name__ diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index ae058464e..707e916db 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -56,11 +56,11 @@ def __init__(self, options: NumericalOptions = None) -> None: "NumericalStatsMixin parameter 'options' must be " "of type NumericalOptions." 
) - self.min: int | float | None = None - self.max: int | float | None = None + self.min: int | float | np.float64 | np.int64 | None = None + self.max: int | float | np.float64 | np.int64 | None = None self._top_k_modes: int = 5 # By default, return at max 5 modes - self.sum: int | float = 0 - self._biased_variance: float = np.nan + self.sum: int | float | np.float64 | np.int64 = np.float64(0) + self._biased_variance: float | np.float64 = np.nan self._biased_skewness: float | np.float64 = np.nan self._biased_kurtosis: float | np.float64 = np.nan self._median_is_enabled: bool = True @@ -80,8 +80,10 @@ def __init__(self, options: NumericalOptions = None) -> None: self.user_set_histogram_bin: int | None = None self.bias_correction: bool = True # By default, we correct for bias self._mode_is_enabled: bool = True - self.num_zeros: int = 0 - self.num_negatives: int = 0 + self.num_zeros: int | np.int64 = np.int64(0) + self.num_negatives: int | np.int64 = np.int64(0) + self._num_quantiles: int = 1000 # TODO: add to options + if options: self.bias_correction = options.bias_correction.is_enabled self._top_k_modes = options.mode.top_k_modes @@ -98,23 +100,20 @@ def __init__(self, options: NumericalOptions = None) -> None: self.histogram_bin_method_names = ["custom"] self.histogram_methods: dict = {} self._stored_histogram: dict = { - "total_loss": 0, - "current_loss": 0, + "total_loss": np.float64(0.0), + "current_loss": np.float64(0.0), "suggested_bin_count": self.min_histogram_bin, "histogram": {"bin_counts": None, "bin_edges": None}, } self._batch_history: list = [] for method in self.histogram_bin_method_names: self.histogram_methods[method] = { - "total_loss": 0, - "current_loss": 0, + "total_loss": np.float64(0.0), + "current_loss": np.float64(0.0), "suggested_bin_count": self.min_histogram_bin, "histogram": {"bin_counts": None, "bin_edges": None}, } - num_quantiles: int = 1000 # TODO: add to options - self.quantiles: list[float] | dict = { - bin_num: None for bin_num in range(num_quantiles - 1) - } + self.quantiles: list[float] | None = None self.__calculations = { "min": NumericStatsMixin._get_min, "max": NumericStatsMixin._get_max, @@ -190,8 +189,8 @@ def _add_helper_merge_profile_histograms( for method in self.histogram_bin_method_names: self.histogram_methods[method] = { - "total_loss": 0, - "current_loss": 0, + "total_loss": np.float64(0.0), + "current_loss": np.float64(0.0), "histogram": {"bin_counts": None, "bin_edges": None}, } @@ -425,6 +424,58 @@ def report(self, remove_disabled_flag: bool = False) -> dict: return profile + def _reformat_numeric_stats_types_on_serialized_profiles(self): + """Assistance function in the deserialization of profiler objects. 
+ + This function is to be used to enforce correct typing for attributes + associated with the NumericStatsMixin conversions when loading profiler + objects in from their serialized saved format + """ + + def convert_histogram_key_types_to_np(histogram_info: dict): + if histogram_info["total_loss"] is not None: + histogram_info["total_loss"] = np.float64(histogram_info["total_loss"]) + + if histogram_info["current_loss"] is not None: + histogram_info["current_loss"] = np.float64( + histogram_info["current_loss"] + ) + + # Convert hist lists to numpy arrays + for key in histogram_info["histogram"].keys(): + if histogram_info["histogram"][key] is not None: + histogram_info["histogram"][key] = np.array( + histogram_info["histogram"][key] + ) + return histogram_info + + self._stored_histogram = convert_histogram_key_types_to_np( + self._stored_histogram + ) + + # Convert hist method attributes to correct types + for key in self.histogram_methods.keys(): + self.histogram_methods[key] = convert_histogram_key_types_to_np( + self.histogram_methods[key] + ) + + if self.min is not None: + self.min = np.float64(self.min) + if self.max is not None: + self.max = np.float64(self.max) + if self.sum is not None: + self.sum = np.float64(self.sum) + if self.num_zeros is not None: + self.num_zeros = np.int64(self.num_zeros) + if self.num_negatives is not None: + self.num_negatives = np.int64(self.num_negatives) + if not np.isnan(self._biased_variance): + self._biased_variance = np.float64(self._biased_variance) + if not np.isnan(self._biased_skewness): + self._biased_skewness = np.float64(self._biased_skewness) + if not np.isnan(self._biased_kurtosis): + self._biased_kurtosis = np.float64(self._biased_kurtosis) + def diff( self, other_profile: NumericStatsMixinT, @@ -478,11 +529,11 @@ def diff( return differences @property - def mean(self) -> float: + def mean(self) -> float | np.float64: """Return mean value.""" if self.match_count == 0: return 0.0 - return float(self.sum) / self.match_count + return self.sum / self.match_count @property def mode(self) -> list[float]: @@ -509,7 +560,7 @@ def median(self) -> float: return self._get_percentile([50])[0] @property - def variance(self) -> float: + def variance(self) -> float | np.float64: """Return variance.""" return ( self._biased_variance @@ -544,7 +595,12 @@ def kurtosis(self) -> float | np.float64: @staticmethod def _perform_t_test( - mean1: float, var1: float, n1: int, mean2: float, var2: float, n2: int + mean1: float | np.float64, + var1: float | np.float64, + n1: int, + mean2: float | np.float64, + var2: float | np.float64, + n2: int, ) -> dict: results: dict = { "t-statistic": None, @@ -559,7 +615,9 @@ def _perform_t_test( RuntimeWarning, ) invalid_stats = True - if np.isnan([mean1, mean2, var1, var2]).any() or None in [ + if np.isnan( + [float(mean1), float(mean2), float(var1), float(var2)] + ).any() or None in [ mean1, mean2, var1, @@ -749,7 +807,7 @@ def _update_variance( batch_mean: float, batch_var: float, batch_count: int, - ) -> float: + ) -> float | np.float64: """ Calculate combined biased variance of the current values and new dataset. 
@@ -757,7 +815,7 @@ def _update_variance( :param batch_var: biased variance of new chunk :param batch_count: number of samples in new chunk :return: combined biased variance - :rtype: float + :rtype: float | np.float64 """ return self._merge_biased_variance( self.match_count, @@ -771,12 +829,12 @@ def _update_variance( @staticmethod def _merge_biased_variance( match_count1: int, - biased_variance1: float, - mean1: float, + biased_variance1: float | np.float64, + mean1: float | np.float64, match_count2: int, - biased_variance2: float, - mean2: float, - ) -> float: + biased_variance2: float | np.float64, + mean2: float | np.float64, + ) -> float | np.float64: """ Calculate combined biased variance of the current values and new dataset. @@ -787,7 +845,7 @@ def _merge_biased_variance( :param mean2: mean of chunk 2 :param biased_variance2: variance of chunk 2 without bias correction :return: combined variance - :rtype: float + :rtype: float | np.float64 """ if match_count1 < 1: return biased_variance2 @@ -809,7 +867,9 @@ def _merge_biased_variance( return new_variance @staticmethod - def _correct_bias_variance(match_count: int, biased_variance: float) -> float: + def _correct_bias_variance( + match_count: int, biased_variance: float | np.float64 + ) -> float | np.float64: if match_count is None or biased_variance is None or match_count < 2: warnings.warn( "Insufficient match count to correct bias in variance. Bias correction " @@ -826,12 +886,12 @@ def _correct_bias_variance(match_count: int, biased_variance: float) -> float: def _merge_biased_skewness( match_count1: int, biased_skewness1: float | np.float64, - biased_variance1: float, - mean1: float, + biased_variance1: float | np.float64, + mean1: float | np.float64, match_count2: int, biased_skewness2: float | np.float64, - biased_variance2: float, - mean2: float, + biased_variance2: float | np.float64, + mean2: float | np.float64, ) -> float | np.float64: """ Calculate the combined skewness of two data chunks. @@ -912,13 +972,13 @@ def _merge_biased_kurtosis( match_count1: int, biased_kurtosis1: float | np.float64, biased_skewness1: float | np.float64, - biased_variance1: float, - mean1: float, + biased_variance1: float | np.float64, + mean1: float | np.float64, match_count2: int, biased_kurtosis2: float | np.float64, biased_skewness2: float | np.float64, - biased_variance2: float, - mean2: float, + biased_variance2: float | np.float64, + mean2: float | np.float64, ) -> float | np.float64: """ Calculate the combined kurtosis of two sets of data. @@ -1069,7 +1129,7 @@ def _total_histogram_bin_variance( sum_var += bin_var return sum_var - def _histogram_bin_error(self, input_array: np.ndarray | pd.Series) -> float: + def _histogram_bin_error(self, input_array: np.ndarray | pd.Series) -> np.float64: """ Calculate error of each value from bin of the histogram it falls within. 
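
The error that `_histogram_bin_error` accumulates is the squared distance from each value to the midpoint of the bin it falls in, as the hunk just below computes. A minimal standalone sketch (hypothetical helper, assuming sorted `bin_edges`):

    import numpy as np

    def histogram_bin_error(values: np.ndarray, bin_edges: np.ndarray) -> np.float64:
        # Bin index per value; clip extremes into the first/last bins.
        inds = np.clip(np.searchsorted(bin_edges, values, side="left"), 1, len(bin_edges) - 1)
        midpoints = (bin_edges[inds] + bin_edges[inds - 1]) / 2
        return np.float64(np.sum((values - midpoints) ** 2))
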
@@ -1097,7 +1157,7 @@ def _histogram_bin_error(self, input_array: np.ndarray | pd.Series) -> float: (input_array - (bin_edges[inds] + bin_edges[inds - 1]) / 2) ** 2 ) - return sum_error + return np.float64(sum_error) @staticmethod def _histogram_loss( @@ -1107,7 +1167,7 @@ def _histogram_loss( avg_totalvar: float, run_time: float, avg_runtime: float, - ) -> float: + ) -> np.float64: norm_diff_var: float = 0 norm_total_var: float = 0 @@ -1119,7 +1179,7 @@ def _histogram_loss( penalized_time = 1 # currently set as 1s if (run_time - avg_runtime) >= penalized_time: norm_runtime = float(run_time - avg_runtime) / avg_runtime - return norm_diff_var + norm_total_var + norm_runtime + return np.float64(norm_diff_var + norm_total_var + norm_runtime) def _select_method_for_histogram( self, @@ -1647,7 +1707,9 @@ def _get_quantiles(self) -> None: :return: list of quantiles """ - percentiles: np.ndarray = np.linspace(0, 100, len(self.quantiles) + 2)[1:-1] + percentiles: np.ndarray = np.linspace(0, 100, (self._num_quantiles - 1) + 2)[ + 1:-1 + ] self.quantiles = self._get_percentile(percentiles=percentiles) def _update_helper(self, df_series_clean: pd.Series, profile: dict) -> None: diff --git a/dataprofiler/profilers/order_column_profile.py b/dataprofiler/profilers/order_column_profile.py index 5e38a13c4..c6a369d8d 100644 --- a/dataprofiler/profilers/order_column_profile.py +++ b/dataprofiler/profilers/order_column_profile.py @@ -1,14 +1,33 @@ """Index profile analysis for individual col within structured profiling.""" from __future__ import annotations -from typing import cast +from abc import abstractmethod +from typing import Protocol, Type, TypeVar, cast +import numpy as np from pandas import DataFrame, Series -from . import BaseColumnProfiler, utils +from . import utils +from .base_column_profilers import BaseColumnProfiler from .profiler_options import OrderOptions +class Comparable(Protocol): + """Protocol for ensuring comparable types, in this case both floats or strings.""" + + @abstractmethod + def __lt__(self: CT, other: CT) -> bool: + """Protocol for ensuring comparable values.""" + pass + + +CT = TypeVar("CT", bound=Comparable) + +# bc type in class attr causing issues, need to alias +AliasFloatType = Type[np.float64] +AliasStrType = Type[str] + + class OrderColumn(BaseColumnProfiler["OrderColumn"]): """ Index column profile subclass of BaseColumnProfiler. @@ -32,8 +51,9 @@ def __init__(self, name: str | None, options: OrderOptions = None) -> None: "OrderColumn parameter 'options' must be of type" " OrderOptions." ) self.order: str | None = None - self._last_value: int | None = None - self._first_value: int | None = None + self._last_value: np.float64 | float | str | None = None + self._first_value: np.float64 | float | str | None = None + self._data_store_type: AliasStrType | AliasFloatType = np.float64 self._piecewise: bool | None = False self.__calculations: dict = {} self._filter_properties_w_options(self.__calculations, options) @@ -41,19 +61,22 @@ def __init__(self, name: str | None, options: OrderOptions = None) -> None: @staticmethod def _is_intersecting( - first_value1: int, last_value1: int, first_value2: int, last_value2: int + first_value1: CT, + last_value1: CT, + first_value2: CT, + last_value2: CT, ) -> bool: """ Check to see if the range of the datasets intersect. 
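
The intersection test itself is an ordinary overlapping-intervals check over any pair of `Comparable` values (floats or strings, per the protocol above). An illustrative sketch, not the verbatim body:

    def is_intersecting(first1, last1, first2, last2) -> bool:
        # Normalize each range so lo <= hi, then test for overlap.
        lo1, hi1 = min(first1, last1), max(first1, last1)
        lo2, hi2 = min(first2, last2), max(first2, last2)
        return lo1 <= hi2 and lo2 <= hi1
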
:param first_value1: beginning value of dataset 1 - :type first_value1: Integer + :type first_value1: Float | String :param last_value1: last value of dataset 1 - :type last_value1: Integer + :type last_value1: Float | String :param first_value2: beginning value of dataset 2 - :type first_value2: Integer + :type first_value2: Float | String :param last_value2: last value of dataset 2 - :type last_value2: Integer + :type last_value2: Float | String :return: Whether or not there is an intersection :rtype: Bool """ @@ -77,19 +100,22 @@ def _is_intersecting( @staticmethod def _is_enveloping( - first_value1: int, last_value1: int, first_value2: int, last_value2: int + first_value1: CT, + last_value1: CT, + first_value2: CT, + last_value2: CT, ) -> bool: """ Check to see if the range of the dataset 1 envelopes dataset 2. :param first_value1: beginning value of dataset 1 - :type first_value1: Integer + :type first_value1: Float | String :param last_value1: last value of dataset 1 - :type last_value1: Integer + :type last_value1: Float | String :param first_value2: beginning value of dataset 2 - :type first_value2: Integer + :type first_value2: Float | String :param last_value2: last value of dataset 2 - :type last_value2: Integer + :type last_value2: Float | String :return: Whether or not there is an intersection :rtype: Bool """ @@ -108,41 +134,56 @@ def _is_enveloping( def _merge_order( self, order1: str, - first_value1: int, - last_value1: int, + first_value1: CT, + last_value1: CT, + data_store_type1: AliasStrType | AliasFloatType, piecewise1: bool, order2: str, - first_value2: int, - last_value2: int, + first_value2: CT, + last_value2: CT, + data_store_type2: AliasStrType | AliasFloatType, piecewise2: bool, - ) -> tuple[str, int, int, bool]: + ) -> tuple[str, CT | None, CT | None, bool, AliasStrType | AliasFloatType]: """ Add the order of two datasets together. 
:param order1: order of original dataset :param first_value1: beginning value of original dataset :param last_value1: last value of original dataset + :param data_store_type1: type of value for first_value1 and last_value1 :param piecewise1: original dataset is piecewise or not :param order2: order of new dataset :param first_value2: beginning value of new dataset :param last_value2: last value of new dataset + :param data_store_type2: type of value for first_value2 and last_value2 :param piecewise2: new dataset is piecewise or not :type order1: String - :type first_value1: Integer - :type last_value1: Integer + :type first_value1: Float | String + :type last_value1: Float | String :type piecewise1: Boolean + :type data_store_type1: Type[str] | Type[np.float64] :type order2: String - :type first_value2: Integer - :type last_value2: Integer + :type first_value2: Float | String + :type last_value2: Float | String + :type data_store_type2: Type[str] | Type[np.float64] :type piecewise2: Boolean - :return: order, first_value, last_value, piecewise - :rtype: String, Int, Int, Boolean + :return: order, first_value, last_value, piecewise, merged_data_store_type + :rtype: String, Float | String, Float | String, Boolean, Type[str] + | Type[np.float64] """ # Return either order if one is None if not order1: - return order2, first_value2, last_value2, piecewise2 + return order2, first_value2, last_value2, piecewise2, data_store_type2 elif not order2: - return order1, first_value1, last_value1, piecewise1 + return order1, first_value1, last_value1, piecewise1, data_store_type1 + + merged_data_store_type: AliasStrType | AliasFloatType = np.float64 + if data_store_type1 is str or data_store_type2 is str: + first_value1 = cast(CT, str(first_value1)) + last_value1 = cast(CT, str(last_value1)) + first_value2 = cast(CT, str(first_value2)) + last_value2 = cast(CT, str(last_value2)) + merged_data_store_type = str is_intersecting = self._is_intersecting( first_value1, last_value1, first_value2, last_value2 @@ -156,8 +197,8 @@ def _merge_order( # Default initialization order = "random" - first_value: int | None = None - last_value: int | None = None + first_value: CT | None = None + last_value: CT | None = None if order1 == "random" or order2 == "random": order = "random" @@ -218,7 +259,7 @@ def _merge_order( ) or order == "random": piecewise = False - return order, cast(int, first_value), cast(int, last_value), piecewise + return order, first_value, last_value, piecewise, merged_data_store_type def __add__(self, other: OrderColumn) -> OrderColumn: """ @@ -238,14 +279,16 @@ def __add__(self, other: OrderColumn) -> OrderColumn: ) merged_profile = OrderColumn(None) - order, first_value, last_value, piecewise = self._merge_order( + order, first_value, last_value, piecewise, data_store_type = self._merge_order( self.order, self._first_value, self._last_value, + self._data_store_type, self._piecewise, other.order, other._first_value, other._last_value, + other._data_store_type, other._piecewise, ) @@ -253,6 +296,7 @@ def __add__(self, other: OrderColumn) -> OrderColumn: merged_profile._first_value = first_value merged_profile._last_value = last_value merged_profile._piecewise = piecewise + merged_profile._data_store_type = data_store_type BaseColumnProfiler._add_helper(merged_profile, self, other) self._merge_calculations( @@ -270,6 +314,33 @@ def report(self, remove_disabled_flag: bool = False) -> dict: """ return self.profile + @classmethod + def load_from_dict(cls, data, config: dict | None = None): + """ + Parse 
attribute from json dictionary into self.
+
+        :param data: dictionary with attributes and values.
+        :type data: dict[string, Any]
+        :param config: options for loading column profiler params from dictionary
+        :type config: Dict | None
+
+        :return: Profiler with attributes populated.
+        :rtype: OrderColumn
+        """
+        # Note: super() here resolves across multiple parent classes.
+        data["_data_store_type"] = (
+            str if data["_data_store_type"] == "str" else np.float64
+        )
+        profile = super().load_from_dict(data)
+        try:
+            if profile.sample_size and profile._data_store_type is np.float64:
+                profile._first_value = np.float64(profile._first_value)
+                profile._last_value = np.float64(profile._last_value)
+        except ValueError:
+            profile._first_value = data["_first_value"]
+            profile._last_value = data["_last_value"]
+        return profile
+
     @property
     def profile(self) -> dict:
         """
@@ -298,7 +369,9 @@ def diff(self, other_profile: OrderColumn, options: dict = None) -> dict:
         return differences
 
     @BaseColumnProfiler._timeit(name="order")
-    def _get_data_order(self, df_series: Series) -> tuple[str, float, float]:
+    def _get_data_order(
+        self, df_series: Series, data_store_type: AliasStrType | AliasFloatType
+    ) -> tuple[str, float, float, AliasStrType | AliasFloatType]:
         """
         Retrieve the order profile of a given data series.
 
@@ -307,20 +380,22 @@ def _get_data_order(self, df_series: Series) -> tuple[str, float, float]:
         :param df_series: a given column
         :type df_series: pandas.core.series.Series
-        :return: order, first_value, last_value
-        :rtype: String, Float, Float
+        :param data_store_type: type of value for first_value and last_value
+        :type data_store_type: Type[str] | Type[np.float64]
+        :return: order, first_value, last_value, data_store_type
+        :rtype: String, Float, Float, Type[str] | Type[np.float64]
         """
         try:
-            df_series = df_series.astype(float)
+            if data_store_type is not str:
+                df_series = df_series.astype(float)
         except ValueError:
-            pass
+            data_store_type = str
 
         order = None
         last_value = df_series.iloc[0]
         first_value = df_series.iloc[0]
 
-        for i in range(1, len(df_series)):
-            value = df_series.iloc[i]
+        for value in df_series.values:
             if value < last_value and order == "ascending":
                 order = "random"
                 break
@@ -335,7 +410,7 @@ def _get_data_order(self, df_series: Series) -> tuple[str, float, float]:
         if not order:
             order = "constant value"
 
-        return order, first_value, last_value
+        return order, first_value, last_value, data_store_type
 
     def _update_order(
         self,
@@ -363,21 +438,26 @@ def _update_order(
         """
         if self.order == "random":
             return
-        order, first_value, last_value = self._get_data_order(df_series)
+        order, first_value, last_value, data_store_type = self._get_data_order(
+            df_series, self._data_store_type
+        )
 
         (
             self.order,
             self._first_value,
             self._last_value,
             self._piecewise,
+            self._data_store_type,
         ) = self._merge_order(
             self.order,
             self._first_value,
             self._last_value,
+            self._data_store_type,
             self._piecewise,
             order,
             first_value,
             last_value,
+            data_store_type,
             piecewise2=False,
         )
diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py
index d64eaa175..fc2a2246e 100644
--- a/dataprofiler/profilers/profile_builder.py
+++ b/dataprofiler/profilers/profile_builder.py
@@ -3,7 +3,9 @@
 from __future__ import annotations
 
 import copy
+import json
 import logging
+import os
 import pickle
 import random
 import re
@@ -11,14 +13,14 @@
 from collections import OrderedDict, defaultdict
 from datetime import datetime
 from multiprocessing.pool import Pool
-from typing import Any, Generator, 
List, Optional, cast +from typing import Any, Generator, List, Optional, TypeVar, cast import networkx as nx import numpy as np import pandas as pd from HLL import HyperLogLog -from .. import data_readers, dp_logging +from .. import data_readers, dp_logging, settings from ..data_readers.data import Data from ..labelers.base_data_labeler import BaseDataLabeler from ..labelers.data_labelers import DataLabeler @@ -32,6 +34,13 @@ ) from .graph_profiler import GraphProfiler from .helpers.report_helpers import _prepare_report, calculate_quantiles +from .json_decoder import ( + load_compiler, + load_option, + load_profiler, + load_structured_col_profiler, +) +from .json_encoder import ProfileEncoder from .profiler_options import ( BaseOption, ProfilerOptions, @@ -39,6 +48,8 @@ UnstructuredOptions, ) +BaseProfilerT = TypeVar("BaseProfilerT", bound="BaseProfiler") + logger = dp_logging.get_child_logger(__name__) @@ -379,6 +390,35 @@ def report(self, remove_disabled_flag: bool = False) -> OrderedDict: return report + @classmethod + def load_from_dict(cls, data, config: dict | None = None) -> StructuredColProfiler: + """ + Parse attribute from json dictionary into self. + + :param data: dictionary with attributes and values. + :type data: dict[string, Any] + :param config: config for loading structured column profiler + :type config: Dict | None + + :return: Profiler with attributes populated. + :rtype: StructuredColProfiler + """ + profile = cls() + for attr, value in data.items(): + if attr == "profiles": + for profile_key, profile_value in value.items(): + value[profile_key] = load_compiler(profile_value, config) + if attr == "options" and value is not None: + value = load_option(value, config) + if attr == "_null_values": + value = { + k: (re.RegexFlag(v) if v != 0 else 0) for k, v in value.items() + } + if attr == "null_types_index": + value = {k: set(v) for k, v in value.items()} + setattr(profile, attr, value) + return profile + @property def profile(self) -> dict: """Return a report.""" @@ -614,11 +654,22 @@ def clean_data_and_get_base_stats( df_series = df_series.loc[true_sample_list] total_na = total_sample_size - len(true_sample_list) + rng = np.random.default_rng(settings._seed) + + if "DATAPROFILER_SEED" in os.environ and settings._seed is None: + seed = os.environ.get("DATAPROFILER_SEED") + if isinstance(seed, int): + rng = np.random.default_rng(int(seed)) + else: + warnings.warn("Seed should be an integer", RuntimeWarning) + base_stats = { "sample_size": total_sample_size, "null_count": total_na, "null_types": na_columns, - "sample": random.sample(list(df_series.values), min(len(df_series), 5)), + "sample": rng.choice( + list(df_series.values), (min(len(df_series), 5),), replace=False + ).tolist(), "min_id": min_id, "max_id": max_id, } @@ -835,6 +886,36 @@ def report(self, report_options: dict = None) -> dict: """ raise NotImplementedError() + @classmethod + def load_from_dict( + cls: type[BaseProfilerT], data, config: dict | None = None + ) -> BaseProfilerT: + """ + Parse attribute from json dictionary into self. + + :param data: dictionary with attributes and values. + :type data: dict[string, Any] + :param config: config for overriding data params when loading from dict + :type config: Dict | None + + :return: Profiler with attributes populated. 
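
The sample drawn by `clean_data_and_get_base_stats` above is reproducible: seed the generator through `dataprofiler.settings._seed`, or via the `DATAPROFILER_SEED` environment variable checked in that hunk, e.g.:

    import os

    # Consulted only when settings._seed is None; must parse as an int.
    os.environ["DATAPROFILER_SEED"] = "0"
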
+ :rtype: BaseProfiler + """ + options = load_option(data["options"], config) + profiler = cls(None, options=options) + + for attr, value in data.items(): + if "times" == attr: + value = defaultdict(float, value) + if "_profile" == attr: + for idx, profile in enumerate(value): + value[idx] = load_structured_col_profiler(profile, config) + if "options" == attr: + continue + + setattr(profiler, attr, value) + return profiler + def _update_profile_from_chunk( self, data: pd.Series | pd.DataFrame | list, @@ -1027,7 +1108,7 @@ def _restore_data_labelers(self, data_labeler: BaseDataLabeler = None) -> None: data_labeler_profile = profiler._profiles["data_labeler"] data_labeler_profile.data_labeler = data_labeler - def _save_helper(self, filepath: str | None, data_dict: dict) -> None: + def _pkl_save_helper(self, filepath: str | None, data_dict: dict) -> None: """ Save profiler to disk. @@ -1056,7 +1137,7 @@ def _save_helper(self, filepath: str | None, data_dict: dict) -> None: # Restore all data labelers self._restore_data_labelers(data_labelers) - def save(self, filepath: str = None) -> None: + def _json_save_helper(self, filepath: str | None) -> None: """ Save profiler to disk. @@ -1064,22 +1145,59 @@ def save(self, filepath: str = None) -> None: :type filepath: String :return: None """ + if filepath is None: + filepath = "profile-{}.json".format( + datetime.now().strftime("%d-%b-%Y-%H:%M:%S.%f") + ) + + with open(filepath, "w") as f: + json.dump(self, f, cls=ProfileEncoder) + + def save(self, filepath: str = None, save_method: str = "pickle") -> None: + """ + Save profiler to disk. + + :param filepath: Path of file to save to + :type filepath: String + :param save_method: The desired saving method (must be "pickle" or "json") + :type save_method: String + :return: None + """ raise NotImplementedError() @classmethod - def load(cls, filepath: str) -> BaseProfiler: + def load(cls, filepath: str, load_method: str | None = None) -> BaseProfiler: """ Load profiler from disk. :param filepath: Path of file to load from :type filepath: String + :param load_method: The desired loading method, default = None + :type load_method: Optional[String] :return: Profiler being loaded, StructuredProfiler or UnstructuredProfiler :rtype: BaseProfiler """ # Load profile from disk - with open(filepath, "rb") as infile: - data: dict = pickle.load(infile) + if isinstance(load_method, str): + load_method = load_method.lower() + if load_method not in [None, "pickle", "json"]: + raise ValueError( + "Please specify a valid load_method ('pickle','json' or None)" + ) + + data: dict | None = None + try: + if load_method is None or load_method == "pickle": + with open(filepath, "rb") as infile: + data = pickle.load(infile) + except pickle.UnpicklingError: + if load_method == "pickle": + raise ValueError("File is unable to be loaded as pickle.") + finally: + if data is None or load_method == "json": + with open(filepath) as infile: + return load_profiler(json.load(infile), {}) # remove profiler class if it exists profiler_class: str | None = data.pop("profiler_class", None) @@ -1308,6 +1426,24 @@ def report(self, report_options: dict = None) -> dict: report["data_stats"] = self._profile.report(remove_disabled_flag) return _prepare_report(report, output_format, omit_keys) + @classmethod + def load_from_dict( + cls, + data, + config: dict | None = None, + ): + """ + Parse attribute from json dictionary into self. + + :param data: dictionary with attributes and values. 
+ :type data: dict[string, Any] + :param config: config for loading profiler params from dictionary + :type config: Dict | None + + :raises: NotImplementedError + """ + raise NotImplementedError("UnstructuredProfiler deserialization not supported.") + @utils.method_timeit(name="clean_and_base_stats") def _clean_data_and_get_base_stats( self, data: pd.Series, sample_size: int, min_true_samples: int = None @@ -1449,29 +1585,36 @@ def _update_profile_from_chunk( else: self._profile.update_profile(data, pool=pool) - def save(self, filepath: str = None) -> None: + def save(self, filepath: str = None, save_method: str = "pickle") -> None: """ Save profiler to disk. :param filepath: Path of file to save to :type filepath: String + :param save_method: The desired saving method ("pickle" | "json") + :type save_method: String :return: None """ - # Create dictionary for all metadata, options, and profile - data_dict = { - "total_samples": self.total_samples, - "sample": self.sample, - "encoding": self.encoding, - "file_type": self.file_type, - "_samples_per_update": self._samples_per_update, - "_min_true_samples": self._min_true_samples, - "_empty_line_count": self._empty_line_count, - "memory_size": self.memory_size, - "options": self.options, - "_profile": self.profile, - "times": self.times, - } - self._save_helper(filepath, data_dict) + save_method = save_method.lower() + if save_method == "pickle": + data_dict = { + "total_samples": self.total_samples, + "sample": self.sample, + "encoding": self.encoding, + "file_type": self.file_type, + "_samples_per_update": self._samples_per_update, + "_min_true_samples": self._min_true_samples, + "_empty_line_count": self._empty_line_count, + "memory_size": self.memory_size, + "options": self.options, + "_profile": self.profile, + "times": self.times, + } + self._pkl_save_helper(filepath, data_dict) + elif save_method == "json": + self._json_save_helper(filepath) + else: + raise ValueError('save_method must be "json" or "pickle".') class StructuredProfiler(BaseProfiler): @@ -1582,6 +1725,15 @@ def _add_error_checks( # type: ignore[override] "Attempting to merge two profiles with unique row " "count option enabled on one profile but not the other." ) + # Check null_count options + if ( + self.options.row_statistics.null_count.is_enabled + != other.options.row_statistics.null_count.is_enabled + ): + raise ValueError( + "Attempting to merge two profiles with null row " + "count option enabled on one profile but not the other." + ) # Check hashing_method options if ( self.options.row_statistics.unique_count.hashing_method @@ -1950,6 +2102,42 @@ def report(self, report_options: dict = None) -> dict: return _prepare_report(report, output_format, omit_keys) + @classmethod + def load_from_dict( + cls, + data, + config: dict | None = None, + ) -> StructuredProfiler: + """ + Parse attribute from json dictionary into self. + + :param data: dictionary with attributes and values. + :type data: dict[string, Any] + :param config: config for loading profiler params from dictionary + :type config: Dict | None + + :return: Profiler with attributes populated. 
+ :rtype: StructuredProfiler + """ + if data["chi2_matrix"] is not None: + data["chi2_matrix"] = np.array(data["chi2_matrix"]) + if data["correlation_matrix"] is not None: + data["correlation_matrix"] = np.array(data["correlation_matrix"]) + try: + data["_col_name_to_idx"] = defaultdict( + list, {int(k): v for k, v in data["_col_name_to_idx"].items()} + ) + except Exception: + data["_col_name_to_idx"] = defaultdict(list, data["_col_name_to_idx"]) + + data["hashed_row_object"] = { + int(k): v for k, v in data["hashed_row_object"].items() + } + + structured_profiler = super().load_from_dict(data, config) + + return structured_profiler + def _get_unique_row_ratio(self) -> float | None: """Return unique row ratio.""" if ( @@ -1967,7 +2155,10 @@ def _get_unique_row_ratio(self) -> float | None: def _get_row_is_null_ratio(self) -> float | None: """Return whether row is null ratio.""" - if not self.options.row_statistics.is_enabled: + if ( + not self.options.row_statistics.is_enabled + or not self.options.row_statistics.null_count.is_enabled + ): return None if self._min_col_samples_used: @@ -1976,7 +2167,10 @@ def _get_row_is_null_ratio(self) -> float | None: def _get_row_has_null_ratio(self) -> float | None: """Return whether row has null ratio.""" - if not self.options.row_statistics.is_enabled: + if ( + not self.options.row_statistics.is_enabled + or not self.options.row_statistics.null_count.is_enabled + ): return None if self._min_col_samples_used: @@ -2051,48 +2245,51 @@ def _update_row_statistics( self.hashed_row_object.add(record) # Calculate Null Column Count - null_rows = set() - null_in_row_count = set() - first_col_flag = True - for column in self._profile: - null_type_dict = column.null_types_index - null_row_indices = set() - if null_type_dict: - null_row_indices = set.union(*null_type_dict.values()) - - # If sample ids provided, only consider nulls in rows that - # were fully sampled - if sample_ids is not None: - # This is the amount (integer) indices were shifted by in the - # event of overlap - shift = column._index_shift - if shift is None: - # Shift is None if index is str or if no overlap detected - null_row_indices = null_row_indices.intersection( - data.index[sample_ids[: self._min_sampled_from_batch]] - ) + if self.options.row_statistics.null_count.is_enabled: + null_rows = set() + null_in_row_count = set() + first_col_flag = True + for column in self._profile: + null_type_dict = column.null_types_index + null_row_indices = set() + if null_type_dict: + null_row_indices = set.union(*null_type_dict.values()) + + # If sample ids provided, only consider nulls in rows that + # were fully sampled + if sample_ids is not None: + # This is the amount (integer) indices were shifted by in the + # event of overlap + shift = column._index_shift + if shift is None: + # Shift is None if index is str or if no overlap detected + null_row_indices = null_row_indices.intersection( + data.index[sample_ids[: self._min_sampled_from_batch]] + ) + else: + # Only shift if index shift detected (must be ints) + null_row_indices = null_row_indices.intersection( + data.index[sample_ids[: self._min_sampled_from_batch]] + + shift + ) + + # Find the common null indices between the columns + if first_col_flag: + null_rows = null_row_indices + null_in_row_count = null_row_indices + first_col_flag = False else: - # Only shift if index shift detected (must be ints) - null_row_indices = null_row_indices.intersection( - data.index[sample_ids[: self._min_sampled_from_batch]] + shift - ) + null_rows = 
null_rows.intersection(null_row_indices) + null_in_row_count = null_in_row_count.union(null_row_indices) - # Find the common null indices between the columns - if first_col_flag: - null_rows = null_row_indices - null_in_row_count = null_row_indices - first_col_flag = False + # If sample_ids provided, + # increment since that means only new data read + if sample_ids is not None: + self.row_has_null_count += len(null_in_row_count) + self.row_is_null_count += len(null_rows) else: - null_rows = null_rows.intersection(null_row_indices) - null_in_row_count = null_in_row_count.union(null_row_indices) - - # If sample_ids provided, increment since that means only new data read - if sample_ids is not None: - self.row_has_null_count += len(null_in_row_count) - self.row_is_null_count += len(null_rows) - else: - self.row_has_null_count = len(null_in_row_count) - self.row_is_null_count = len(null_rows) + self.row_has_null_count = len(null_in_row_count) + self.row_is_null_count = len(null_rows) def _get_correlation( self, clean_samples: dict, batch_properties: dict @@ -2409,7 +2606,7 @@ def _update_null_replication_metrics(self, clean_samples: dict) -> None: total_row_sum = np.asarray( [ get_data_type_profiler(profile).sum - if get_data_type(profile) + if get_data_type(profile) not in [None, "datetime"] else np.nan for profile in self._profile ] @@ -2831,32 +3028,38 @@ def tqdm(level: set[int]) -> Generator[int, None, None]: if self.options.null_replication_metrics.is_enabled: self._update_null_replication_metrics(clean_sampled_dict) - def save(self, filepath: str = None) -> None: + def save(self, filepath: str = None, save_method: str = "pickle") -> None: """ Save profiler to disk. :param filepath: Path of file to save to :type filepath: String + :param save_method: The desired saving method (must be "pickle" or "json") + :type save_method: String :return: None """ - # Create dictionary for all metadata, options, and profile - data_dict = { - "total_samples": self.total_samples, - "encoding": self.encoding, - "file_type": self.file_type, - "row_has_null_count": self.row_has_null_count, - "row_is_null_count": self.row_is_null_count, - "hashed_row_object": self.hashed_row_object, - "_samples_per_update": self._samples_per_update, - "_min_true_samples": self._min_true_samples, - "options": self.options, - "chi2_matrix": self.chi2_matrix, - "_profile": self.profile, - "_col_name_to_idx": self._col_name_to_idx, - "times": self.times, - } - - self._save_helper(filepath, data_dict) + save_method = save_method.lower() + if save_method == "pickle": + data_dict = { + "total_samples": self.total_samples, + "encoding": self.encoding, + "file_type": self.file_type, + "row_has_null_count": self.row_has_null_count, + "row_is_null_count": self.row_is_null_count, + "hashed_row_object": self.hashed_row_object, + "_samples_per_update": self._samples_per_update, + "_min_true_samples": self._min_true_samples, + "options": self.options, + "chi2_matrix": self.chi2_matrix, + "_profile": self.profile, + "_col_name_to_idx": self._col_name_to_idx, + "times": self.times, + } + self._pkl_save_helper(filepath, data_dict) + elif save_method == "json": + self._json_save_helper(filepath) + else: + raise ValueError('save_method must be "json" or "pickle".') class Profiler: @@ -2931,14 +3134,17 @@ def __new__( # type: ignore ) @classmethod - def load(cls, filepath: str) -> BaseProfiler: + def load(cls, filepath: str, load_method: str | None = None) -> BaseProfiler: """ Load profiler from disk. 
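
With the `save_method`/`load_method` parameters introduced in this change, a profile can round-trip through JSON instead of pickle. A minimal sketch; "dataset.csv" and "profile.json" are placeholder paths:

    import dataprofiler as dp

    data = dp.Data("dataset.csv")
    profiler = dp.Profiler(data)
    profiler.save("profile.json", save_method="json")
    loaded = dp.Profiler.load("profile.json", load_method="json")
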
:param filepath: Path of file to load from :type filepath: String + :param load_method: The desired loading method, default = "None" + :type load_method: Optional[String] + :return: Profiler being loaded, StructuredProfiler or UnstructuredProfiler :rtype: BaseProfiler """ - return BaseProfiler.load(filepath) + return BaseProfiler.load(filepath, load_method) diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index 76b3654e3..f34876a55 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -6,11 +6,19 @@ import copy import re import warnings +from typing import Any, Generic, TypeVar, cast from ..labelers.base_data_labeler import BaseDataLabeler +from . import utils +from .json_decoder import load_option +BaseOptionT = TypeVar("BaseOptionT", bound="BaseOption") +BooleanOptionT = TypeVar("BooleanOptionT", bound="BooleanOption") +NumericalOptionsT = TypeVar("NumericalOptionsT", bound="NumericalOptions") +BaseInspectorOptionsT = TypeVar("BaseInspectorOptionsT", bound="BaseInspectorOptions") -class BaseOption: + +class BaseOption(Generic[BaseOptionT]): """For configuring options.""" @property @@ -138,6 +146,28 @@ def validate(self, raise_error: bool = True) -> list[str] | None: return errors return None + @classmethod + def load_from_dict(cls, data, config: dict | None = None) -> BaseOption: + """ + Parse attribute from json dictionary into self. + + :param data: dictionary with attributes and values. + :type data: dict[string, Any] + :param config: config to override loading options params from dictionary + :type config: Dict | None + + :return: Options with attributes populated. + :rtype: BaseOption + """ + option = cls() + + for attr, value in data.items(): + if isinstance(value, dict) and "class" in value: + value = load_option(value, config) + setattr(option, attr, value) + + return option + def __eq__(self, other: object) -> bool: """ Determine equality by ensuring equality of all attributes. @@ -150,7 +180,7 @@ def __eq__(self, other: object) -> bool: return self.__dict__ == other.__dict__ -class BooleanOption(BaseOption): +class BooleanOption(BaseOption[BooleanOptionT]): """For setting Boolean options.""" def __init__(self, is_enabled: bool = True) -> None: @@ -180,7 +210,7 @@ def _validate_helper(self, variable_path: str = "BooleanOption") -> list[str]: return errors -class HistogramOption(BooleanOption): +class HistogramOption(BooleanOption["HistogramOption"]): """For setting histogram options.""" def __init__( @@ -233,7 +263,7 @@ def _validate_helper(self, variable_path: str = "HistogramOption") -> list[str]: return errors -class ModeOption(BooleanOption): +class ModeOption(BooleanOption["ModeOption"]): """For setting mode estimation options.""" def __init__(self, is_enabled: bool = True, max_k_modes: int = 5) -> None: @@ -241,8 +271,8 @@ def __init__(self, is_enabled: bool = True, max_k_modes: int = 5) -> None: :ivar is_enabled: boolean option to enable/disable the option. 
:vartype is_enabled: bool - :ivar top_k_modes: the max number of modes to return, if applicable - :vartype top_k_modes: int + :ivar max_k_modes: the max number of modes to return, if applicable + :vartype max_k_modes: int """ self.top_k_modes = max_k_modes super().__init__(is_enabled=is_enabled) @@ -268,7 +298,7 @@ def _validate_helper(self, variable_path: str = "ModeOption") -> list[str]: return errors -class BaseInspectorOptions(BooleanOption): +class BaseInspectorOptions(BooleanOption[BaseInspectorOptionsT]): """For setting Base options.""" def __init__(self, is_enabled: bool = True) -> None: @@ -317,7 +347,7 @@ def is_prop_enabled(self, prop: str) -> bool: return is_enabled -class NumericalOptions(BaseInspectorOptions): +class NumericalOptions(BaseInspectorOptions[NumericalOptionsT]): """For configuring options for Numerican Stats Mixin.""" def __init__(self) -> None: @@ -355,20 +385,20 @@ def __init__(self) -> None: stats :vartype is_numeric_stats_enabled: bool """ - self.min = BooleanOption(is_enabled=True) - self.max = BooleanOption(is_enabled=True) - self.mode = ModeOption(is_enabled=True) - self.median = BooleanOption(is_enabled=True) - self.sum = BooleanOption(is_enabled=True) - self.variance = BooleanOption(is_enabled=True) - self.skewness = BooleanOption(is_enabled=True) - self.kurtosis = BooleanOption(is_enabled=True) - self.median_abs_deviation = BooleanOption(is_enabled=True) - self.num_zeros = BooleanOption(is_enabled=True) - self.num_negatives = BooleanOption(is_enabled=True) - self.histogram_and_quantiles = HistogramOption() + self.min: BooleanOption = BooleanOption(is_enabled=True) + self.max: BooleanOption = BooleanOption(is_enabled=True) + self.mode: ModeOption = ModeOption(is_enabled=True) + self.median: BooleanOption = BooleanOption(is_enabled=True) + self.sum: BooleanOption = BooleanOption(is_enabled=True) + self.variance: BooleanOption = BooleanOption(is_enabled=True) + self.skewness: BooleanOption = BooleanOption(is_enabled=True) + self.kurtosis: BooleanOption = BooleanOption(is_enabled=True) + self.median_abs_deviation: BooleanOption = BooleanOption(is_enabled=True) + self.num_zeros: BooleanOption = BooleanOption(is_enabled=True) + self.num_negatives: BooleanOption = BooleanOption(is_enabled=True) + self.histogram_and_quantiles: HistogramOption = HistogramOption() # By default, we correct for bias - self.bias_correction = BooleanOption(is_enabled=True) + self.bias_correction: BooleanOption = BooleanOption(is_enabled=True) BaseInspectorOptions.__init__(self) @property @@ -534,7 +564,7 @@ def _validate_helper(self, variable_path: str = "NumericalOptions") -> list[str] return errors -class IntOptions(NumericalOptions): +class IntOptions(NumericalOptions["IntOptions"]): """For configuring options for Int Column.""" def __init__(self) -> None: @@ -586,7 +616,7 @@ def _validate_helper(self, variable_path: str = "IntOptions") -> list[str]: return super()._validate_helper(variable_path) -class PrecisionOptions(BooleanOption): +class PrecisionOptions(BooleanOption["PrecisionOptions"]): """For configuring options for precision.""" def __init__(self, is_enabled: bool = True, sample_ratio: float = None) -> None: @@ -631,7 +661,7 @@ def _validate_helper(self, variable_path: str = "PrecisionOptions") -> list[str] return errors -class FloatOptions(NumericalOptions): +class FloatOptions(NumericalOptions["FloatOptions"]): """For configuring options for Float Column.""" def __init__(self) -> None: @@ -688,7 +718,7 @@ def _validate_helper(self, variable_path: str = "FloatOptions") 
-> list[str]: return errors -class TextOptions(NumericalOptions): +class TextOptions(NumericalOptions["TextOptions"]): """For configuring options for Text Column.""" def __init__(self) -> None: @@ -729,9 +759,9 @@ def __init__(self) -> None: :vartype is_numeric_stats_enabled: bool """ NumericalOptions.__init__(self) - self.vocab = BooleanOption(is_enabled=True) - self.num_zeros = BooleanOption(is_enabled=False) - self.num_negatives = BooleanOption(is_enabled=False) + self.vocab: BooleanOption = BooleanOption(is_enabled=True) + self.num_zeros: BooleanOption = BooleanOption(is_enabled=False) + self.num_negatives: BooleanOption = BooleanOption(is_enabled=False) def _validate_helper(self, variable_path: str = "TextOptions") -> list[str]: """ @@ -815,7 +845,7 @@ def is_numeric_stats_enabled(self, value: bool) -> None: self.histogram_and_quantiles.is_enabled = value -class DateTimeOptions(BaseInspectorOptions): +class DateTimeOptions(BaseInspectorOptions["DateTimeOptions"]): """For configuring options for Datetime Column.""" def __init__(self) -> None: @@ -839,7 +869,7 @@ def _validate_helper(self, variable_path: str = "DateTimeOptions") -> list[str]: return super()._validate_helper(variable_path) -class OrderOptions(BaseInspectorOptions): +class OrderOptions(BaseInspectorOptions["OrderOptions"]): """For configuring options for Order Column.""" def __init__(self) -> None: @@ -863,15 +893,19 @@ def _validate_helper(self, variable_path: str = "OrderOptions") -> list[str]: return super()._validate_helper(variable_path) -class CategoricalOptions(BaseInspectorOptions): +class CategoricalOptions(BaseInspectorOptions["CategoricalOptions"]): """For configuring options Categorical Column.""" def __init__( self, is_enabled: bool = True, - top_k_categories: int = None, + top_k_categories: int | None = None, max_sample_size_to_check_stop_condition: int | None = None, stop_condition_unique_value_ratio: float | None = None, + cms: bool = False, + cms_confidence: float | None = 0.95, + cms_relative_error: float | None = 0.01, + cms_max_num_heavy_hitters: int | None = 5000, ) -> None: """ Initialize options for the Categorical Column. @@ -886,6 +920,17 @@ def __init__( :ivar stop_condition_unique_value_ratio: The highest ratio of unique values to dataset size that is to be considered a categorical type :vartype stop_condition_unique_value_ratio: [None, float] + :ivar cms: boolean option for using count min sketch + :vartype cms: bool + :ivar cms_confidence: defines the number of hashes used in CMS. + eg. 
confidence = 1 - failure probability, default 0.95
+        :vartype cms_confidence: [None, float]
+        :ivar cms_relative_error: defines the number of buckets used in CMS,
+            default 0.01
+        :vartype cms_relative_error: [None, float]
+        :ivar cms_max_num_heavy_hitters: threshold on the minimum frequency a
+            category must reach in order to be counted
+        :vartype cms_max_num_heavy_hitters: [None, int]
         """
         BaseInspectorOptions.__init__(self, is_enabled=is_enabled)
         self.top_k_categories = top_k_categories
@@ -893,6 +938,10 @@ def __init__(
             max_sample_size_to_check_stop_condition
         )
         self.stop_condition_unique_value_ratio = stop_condition_unique_value_ratio
+        self.cms = cms
+        self.cms_confidence = cms_confidence
+        self.cms_relative_error = cms_relative_error
+        self.cms_max_num_heavy_hitters = cms_max_num_heavy_hitters
 
     def _validate_helper(self, variable_path: str = "CategoricalOptions") -> list[str]:
         """
@@ -940,10 +989,36 @@ def _validate_helper(self, variable_path: str = "CategoricalOptions") -> list[st
                 "set or not set.".format(variable_path, variable_path)
             )
 
+        if self.cms_confidence is not None and (
+            not isinstance(self.cms_confidence, float)
+            or self.cms_confidence < 0
+            or self.cms_confidence > 1.0
+        ):
+            errors.append(
+                "{}.cms_confidence must be either None"
+                " or a float between 0 and 1".format(variable_path)
+            )
+
+        if self.cms_relative_error is not None and (
+            not isinstance(self.cms_relative_error, float)
+            or self.cms_relative_error < 0
+            or self.cms_relative_error > 1.0
+        ):
+            errors.append(
+                "{}.cms_relative_error must be either None"
+                " or a float between 0 and 1".format(variable_path)
+            )
+
+        if self.cms and not isinstance(self.cms_max_num_heavy_hitters, int):
+            errors.append(
+                "{}.cms_max_num_heavy_hitters must be an integer "
+                "when count min sketch is in use".format(variable_path)
+            )
+
         return errors
 
 
-class CorrelationOptions(BaseInspectorOptions):
+class CorrelationOptions(BaseInspectorOptions["CorrelationOptions"]):
     """For configuring options for Correlation between Columns."""
 
     def __init__(self, is_enabled: bool = False, columns: list[str] = None) -> None:
@@ -982,7 +1057,7 @@ def _validate_helper(self, variable_path: str = "CorrelationOptions") -> list[st
         return errors
 
 
-class HyperLogLogOptions(BaseOption):
+class HyperLogLogOptions(BaseOption["HyperLogLogOptions"]):
     """Options for alternative method of gathering unique row count."""
 
     def __init__(self, seed: int = 0, register_count: int = 15) -> None:
@@ -1026,7 +1101,7 @@ def _validate_helper(self, variable_path: str = "HyperLogLogOptions") -> list[st
         return errors
 
 
-class UniqueCountOptions(BooleanOption):
+class UniqueCountOptions(BooleanOption["UniqueCountOptions"]):
     """For configuring options for unique row count."""
 
     def __init__(self, is_enabled: bool = True, hashing_method: str = "full") -> None:
@@ -1067,18 +1142,30 @@ def _validate_helper(self, variable_path: str = "UniqueCountOptions") -> list[st
         return errors
 
 
-class RowStatisticsOptions(BooleanOption):
+class RowStatisticsOptions(BooleanOption["RowStatisticsOptions"]):
     """For configuring options for row statistics."""
 
-    def __init__(self, is_enabled: bool = True, unique_count: bool = True) -> None:
+    def __init__(
+        self,
+        is_enabled: bool = True,
+        unique_count: bool = True,
+        null_count: bool = True,
+    ) -> None:
        """
         Initialize options for row statistics.
 
         :ivar is_enabled: boolean option to enable/disable.
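
A configuration sketch for the new count-min-sketch knobs (the values shown are just the documented defaults):

    from dataprofiler.profilers.profiler_options import CategoricalOptions

    options = CategoricalOptions(
        cms=True,                        # enable count min sketch
        cms_confidence=0.95,             # 1 - failure probability; sets hash count
        cms_relative_error=0.01,         # sets the number of buckets
        cms_max_num_heavy_hitters=5000,  # minimum-frequency threshold for counting
    )
    assert options._validate_helper() == []  # no validation errors expected
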
:vartype is_enabled: bool
+        :ivar unique_count: boolean option to enable/disable unique_count
+        :vartype unique_count: bool
+        :ivar null_count: boolean option to enable/disable null_count
+        :vartype null_count: bool
         """
         BooleanOption.__init__(self, is_enabled=is_enabled)
-        self.unique_count = UniqueCountOptions(is_enabled=unique_count)
+        self.unique_count: UniqueCountOptions = UniqueCountOptions(
+            is_enabled=unique_count
+        )
+        self.null_count: BooleanOption = BooleanOption(is_enabled=null_count)
 
     def _validate_helper(
         self, variable_path: str = "RowStatisticsOptions"
@@ -1094,13 +1181,18 @@ def _validate_helper(
         errors = super()._validate_helper(variable_path=variable_path)
         if not isinstance(self.unique_count, UniqueCountOptions):
             errors.append(
-                f"{variable_path}.full_hashing must be an UniqueCountOptions."
+                f"{variable_path}.unique_count must be a UniqueCountOptions."
             )
+
+        if not isinstance(self.null_count, BooleanOption):
+            errors.append(f"{variable_path}.null_count must be a BooleanOption.")
+
         errors += self.unique_count._validate_helper(variable_path + ".unique_counts")
+        errors += self.null_count._validate_helper(variable_path + ".null_count")
         return super()._validate_helper(variable_path)
 
 
-class DataLabelerOptions(BaseInspectorOptions):
+class DataLabelerOptions(BaseInspectorOptions["DataLabelerOptions"]):
     """For configuring options for Data Labeler Column."""
 
     def __init__(self) -> None:
@@ -1196,8 +1288,37 @@ def _validate_helper(self, variable_path: str = "DataLabelerOptions") -> list[st
         errors.append(f"{variable_path}.max_sample_size must be greater than 0.")
         return errors
 
+    @classmethod
+    def load_from_dict(
+        cls,
+        data,
+        config: dict | None = None,
+    ) -> DataLabelerOptions:
+        """
+        Parse attribute from json dictionary into self.
+
+        :param data: dictionary with attributes and values.
+        :type data: dict[string, Any]
+        :param config: config to override loading options params from dictionary
+        :type config: Dict | None
 
-class TextProfilerOptions(BaseInspectorOptions):
+        :return: Options with attributes populated.
+ :rtype: DataLabelerOptions + """ + data_labeler_object = None + data_labeler_load_attr = data.pop("data_labeler_object", {}) + if data_labeler_load_attr: + data_labeler_object = utils.reload_labeler_from_options_or_get_new( + data_labeler_load_attr, config + ) + if data_labeler_object: + data["data_labeler_object"] = data_labeler_object + + dl_options = cast(DataLabelerOptions, super().load_from_dict(data)) + return dl_options + + +class TextProfilerOptions(BaseInspectorOptions["TextProfilerOptions"]): """For configuring options for text profiler.""" def __init__( @@ -1231,8 +1352,8 @@ def __init__( self.stop_words = stop_words self.top_k_chars = top_k_chars self.top_k_words = top_k_words - self.vocab = BooleanOption(is_enabled=True) - self.words = BooleanOption(is_enabled=True) + self.vocab: BooleanOption = BooleanOption(is_enabled=True) + self.words: BooleanOption = BooleanOption(is_enabled=True) def _validate_helper(self, variable_path: str = "TextProfilerOptions") -> list[str]: """ @@ -1289,7 +1410,7 @@ def _validate_helper(self, variable_path: str = "TextProfilerOptions") -> list[s return errors -class StructuredOptions(BaseOption): +class StructuredOptions(BaseOption["StructuredOptions"]): """For configuring options for structured profiler.""" def __init__( @@ -1335,18 +1456,18 @@ def __init__( :vartype sampling_ratio: Union[None, float] """ # Option variables - self.multiprocess = BooleanOption() - self.int = IntOptions() - self.float = FloatOptions() - self.datetime = DateTimeOptions() - self.text = TextOptions() - self.order = OrderOptions() - self.category = CategoricalOptions() - self.data_labeler = DataLabelerOptions() - self.correlation = CorrelationOptions() - self.chi2_homogeneity = BooleanOption(is_enabled=True) - self.null_replication_metrics = BooleanOption(is_enabled=False) - self.row_statistics = RowStatisticsOptions() + self.multiprocess: BooleanOption = BooleanOption() + self.int: IntOptions = IntOptions() + self.float: FloatOptions = FloatOptions() + self.datetime: DateTimeOptions = DateTimeOptions() + self.text: TextOptions = TextOptions() + self.order: OrderOptions = OrderOptions() + self.category: CategoricalOptions = CategoricalOptions() + self.data_labeler: DataLabelerOptions = DataLabelerOptions() + self.correlation: CorrelationOptions = CorrelationOptions() + self.chi2_homogeneity: BooleanOption = BooleanOption(is_enabled=True) + self.null_replication_metrics: BooleanOption = BooleanOption(is_enabled=False) + self.row_statistics: RowStatisticsOptions = RowStatisticsOptions() # Non-Option variables self.null_values = null_values self.column_null_values = column_null_values @@ -1487,7 +1608,7 @@ def _validate_helper(self, variable_path: str = "StructuredOptions") -> list[str return errors -class UnstructuredOptions(BaseOption): +class UnstructuredOptions(BaseOption["UnstructuredOptions"]): """For configuring options for unstructured profiler.""" def __init__(self) -> None: @@ -1545,7 +1666,7 @@ def _validate_helper(self, variable_path: str = "UnstructuredOptions") -> list[s return errors -class ProfilerOptions(BaseOption): +class ProfilerOptions(BaseOption["ProfilerOptions"]): """For configuring options for profiler.""" def __init__(self, presets: str = None) -> None: @@ -1557,7 +1678,8 @@ def __init__(self, presets: str = None) -> None: :ivar unstructured_options: option set for unstructured dataset profiling. 
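
Both new switches can be exercised straight from `ProfilerOptions`; a usage sketch combining the preset described below with the `null_count` toggle added earlier:

    import dataprofiler as dp

    options = dp.ProfilerOptions(presets="lower_memory_sketching")
    # Disable row-level null statistics independently of other row statistics;
    # row_is_null_ratio / row_has_null_ratio then report None.
    options.structured_options.row_statistics.null_count.is_enabled = False
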
:vartype unstructured_options: UnstructuredOptions :ivar presets: A pre-configured mapping of a string name to group of options: - "complete", "data_types", and "numeric_stats_disabled". Default: None + "complete", "data_types", "numeric_stats_disabled", + and "lower_memory_sketching". Default: None :vartype presets: Optional[str] """ self.structured_options = StructuredOptions() @@ -1570,6 +1692,10 @@ def __init__(self, presets: str = None) -> None: self._data_types_presets() elif self.presets == "numeric_stats_disabled": self._numeric_stats_disabled_presets() + elif self.presets == "lower_memory_sketching": + self._lower_memory_sketching_presets() + else: + raise ValueError("The preset entered is not a valid preset.") def _complete_presets(self) -> None: self.set({"*.is_enabled": True}) @@ -1583,6 +1709,18 @@ def _numeric_stats_disabled_presets(self) -> None: self.set({"*.float.is_numeric_stats_enabled": False}) self.set({"structured_options.text.is_numeric_stats_enabled": False}) + def _lower_memory_sketching_presets(self) -> None: + self.set({"row_statistics.unique_count.hashing_method": "hll"}) + self.set( + { + ( + "structured_options.category." + "max_sample_size_to_check_stop_condition" + ): 5000 + } + ) + self.set({"structured_options.category.stop_condition_unique_value_ratio": 0.5}) + def _validate_helper(self, variable_path: str = "ProfilerOptions") -> list[str]: """ Validate the options do not conflict and cause errors. @@ -1620,7 +1758,7 @@ def _validate_helper(self, variable_path: str = "ProfilerOptions") -> list[str]: return errors - def set(self, options: dict[str, bool]) -> None: + def set(self, options: dict[str, Any]) -> None: """ Overwrite BaseOption.set. diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py index 9c57b408d..e8446dcb8 100644 --- a/dataprofiler/profilers/text_column_profile.py +++ b/dataprofiler/profilers/text_column_profile.py @@ -194,3 +194,20 @@ def update(self, df_series: pd.Series) -> TextColumn: self._update_helper(df_series, profile) return self + + @classmethod + def load_from_dict(cls, data, config: dict | None = None): + """ + Parse attribute from json dictionary into self. + + :param data: dictionary with attributes and values. + :type data: dict[string, Any] + :param config: config for loading column profiler params from dictionary + :type config: Dict | None + + :return: Profiler with attributes populated. + :rtype: TextColumn + """ + profile = super().load_from_dict(data) + profile._reformat_numeric_stats_types_on_serialized_profiles() + return profile diff --git a/dataprofiler/profilers/unstructured_text_profile.py b/dataprofiler/profilers/unstructured_text_profile.py index 0f663818c..ffc9eb503 100644 --- a/dataprofiler/profilers/unstructured_text_profile.py +++ b/dataprofiler/profilers/unstructured_text_profile.py @@ -9,7 +9,8 @@ from numpy import ndarray from pandas import DataFrame, Series -from . import BaseColumnProfiler, utils +from . 
import utils +from .base_column_profilers import BaseColumnProfiler from .profiler_options import TextProfilerOptions diff --git a/dataprofiler/profilers/utils.py b/dataprofiler/profilers/utils.py index 1dfa7550f..09bfbac18 100644 --- a/dataprofiler/profilers/utils.py +++ b/dataprofiler/profilers/utils.py @@ -14,6 +14,7 @@ from itertools import islice from multiprocessing.pool import Pool from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -30,7 +31,12 @@ import scipy from pandas import DataFrame, Series -from dataprofiler import profilers, settings +from .. import settings +from ..labelers.data_labelers import DataLabeler + +if TYPE_CHECKING: + from ..labelers.base_data_labeler import BaseDataLabeler + from . import profile_builder def recursive_dict_update(d: dict, update_d: dict) -> dict: @@ -293,7 +299,7 @@ def add_nested_dictionaries(first_dict: dict, second_dict: dict) -> dict: return merged_dict -def biased_skew(df_series: Series) -> float: +def biased_skew(df_series: Series) -> np.float64: """ Calculate the biased estimator for skewness of the given data. @@ -302,15 +308,15 @@ def biased_skew(df_series: Series) -> float: :param df_series: data to get skewness of, assuming floats :type df_series: pandas Series :return: biased skewness - :rtype: float + :rtype: np.float64 """ n = len(df_series) if n < 1: - return np.nan + return np.float64(np.nan) mean = sum(df_series) / n if np.isinf(mean) or np.isnan(mean): - return np.nan + return np.float64(np.nan) diffs = df_series - mean squared_diffs = diffs**2 @@ -324,14 +330,14 @@ def biased_skew(df_series: Series) -> float: M3 = 0 if np.abs(M3) < 1e-14 else M3 if M2 == 0: - return 0.0 + return np.float64(0.0) with np.errstate(all="ignore"): - skew: float = np.sqrt(n) * M3 / np.power(M2, 1.5) + skew: np.float64 = np.sqrt(n) * M3 / np.power(M2, 1.5) return skew -def biased_kurt(df_series: Series) -> float: +def biased_kurt(df_series: Series) -> np.float64: """ Calculate the biased estimator for kurtosis of the given data. @@ -340,15 +346,15 @@ def biased_kurt(df_series: Series) -> float: :param df_series: data to get kurtosis of, assuming floats :type df_series: pandas Series :return: biased kurtosis - :rtype: float + :rtype: np.float64 """ n = len(df_series) if n < 1: - return np.nan + return np.float64(np.nan) mean = sum(df_series) / n if np.isinf(mean) or np.isnan(mean): - return np.nan + return np.float64(np.nan) diffs = df_series - mean squared_diffs = diffs**2 @@ -362,10 +368,10 @@ def biased_kurt(df_series: Series) -> float: M4 = 0 if np.abs(M4) < 1e-14 else M4 if M2 == 0: - return -3.0 + return np.float64(-3.0) with np.errstate(all="ignore"): - kurt: float = n * M4 / np.power(M2, 2) - 3 + kurt: np.float64 = n * M4 / np.power(M2, 2) - 3 return kurt @@ -387,7 +393,10 @@ def __sub__(self: T, other: T) -> Any: @overload -def find_diff_of_numbers(stat1: float | np.float64, stat2: float | np.float64) -> Any: +def find_diff_of_numbers( + stat1: int | float | np.float64 | np.int64 | None, + stat2: int | float | np.float64 | np.int64 | None, +) -> Any: ... @@ -404,9 +413,9 @@ def find_diff_of_numbers(stat1, stat2): For ints/floats, returns stat1 - stat2. 
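
Since `biased_skew` and `biased_kurt` now return `np.float64` in every branch, callers can rely on getting a numpy scalar; a quick illustrative check:

    import numpy as np
    import pandas as pd
    from dataprofiler.profilers import utils

    assert isinstance(utils.biased_skew(pd.Series([1.0, 2.0, 3.0])), np.float64)
    assert isinstance(utils.biased_kurt(pd.Series(dtype=float)), np.float64)  # empty -> nan
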
:param stat1: the first statistical input - :type stat1: Union[int, float] + :type stat1: Union[int, float, np.float64, np.int64, None] :param stat2: the second statistical input - :type stat2: Union[int, float] + :type stat2: Union[int, float, np.float64, np.int64, None] :return: the difference of the stats """ if stat1 is None and stat2 is None: @@ -773,9 +782,9 @@ def chunk(lst: list, size: int) -> Iterator[tuple]: def merge( - top_profile: profilers.profile_builder.BaseProfiler, - other_profile: profilers.profile_builder.BaseProfiler = None, -) -> profilers.profile_builder.BaseProfiler: + top_profile: profile_builder.BaseProfiler, + other_profile: profile_builder.BaseProfiler = None, +) -> profile_builder.BaseProfiler: """ Merge two Profiles. @@ -792,9 +801,9 @@ def merge( def merge_profile_list( - list_of_profiles: list[profilers.profile_builder.BaseProfiler], + list_of_profiles: list[profile_builder.BaseProfiler], pool_count: int = 5, -) -> profilers.profile_builder.BaseProfiler: +) -> profile_builder.BaseProfiler: """Merge list of profiles into a single profile. :param list_of_profiles: Categories and respective counts of the second group @@ -818,3 +827,62 @@ def merge_profile_list( list_of_profiles[0]._restore_data_labelers(data_labeler) return list_of_profiles[0] + + +def reload_labeler_from_options_or_get_new( + data_labeler_load_attr: dict, config: dict | None = None +) -> BaseDataLabeler | None: + """ + If required by the load_attr load a data labeler, but reuse from config if possible. + + :param data_labeler_load_attr: dictionary with attributes and values. + :type data_labeler_load_attr: dict[string, dict] + :param config: config for loading classes to reuse an existing labeler + :type config: dict[string, dict] + + :return: Profiler with attributes populated. 
+    :rtype: BaseDataLabeler | None
+    """
+    data_labeler_object: BaseDataLabeler | None = None
+    if "from_library" in data_labeler_load_attr:
+        data_labeler_object = (
+            (
+                # get options from DLOptions first for reuse
+                config.get("DataLabelerOptions", {})
+                .get("from_library", {})
+                .get(data_labeler_load_attr["from_library"])
+                or
+                # get options from DL column second for reuse
+                config.get("DataLabelerColumn", {})
+                .get("from_library", {})
+                .get(data_labeler_load_attr["from_library"])
+            )
+            if config is not None
+            else None
+        )
+        # load from library if not in options
+        if data_labeler_object is None:
+            data_labeler_object = DataLabeler.load_from_library(
+                data_labeler_load_attr["from_library"]
+            )
+        # save labelers so as not to reload if already loaded
+        if data_labeler_object is not None and config is not None:
+            for class_name in ["DataLabelerOptions", "DataLabelerColumn"]:
+                # get each layer of dicts to not overwrite
+                class_options = config.get(class_name, {})
+                library_options = class_options.get("from_library", {})
+                # don't replace the one that already exists
+                if data_labeler_load_attr["from_library"] in library_options:
+                    continue
+                labeler_options = {
+                    data_labeler_load_attr["from_library"]: data_labeler_object
+                }
+                # update each layer of the config dict
+                library_options.update(labeler_options)
+                class_options["from_library"] = library_options
+                config[class_name] = class_options
+    elif "from_disk" in data_labeler_load_attr:
+        raise NotImplementedError(
+            "Models initialized from disk have not yet been made deserializable"
+        )
+    return data_labeler_object
diff --git a/dataprofiler/tests/data_readers/test_csv_data.py b/dataprofiler/tests/data_readers/test_csv_data.py
index 66f986e59..ef20361be 100644
--- a/dataprofiler/tests/data_readers/test_csv_data.py
+++ b/dataprofiler/tests/data_readers/test_csv_data.py
@@ -1,6 +1,7 @@
 import os
 import unittest
 from io import BytesIO, StringIO, TextIOWrapper
+from itertools import islice

 import pandas as pd

@@ -541,8 +542,6 @@ def test_allowed_data_formats(self):
         self.assertEqual(input_data_obj.data_format, data_format)
         data = input_data_obj.data
         if data_format == "dataframe":
-            import pandas as pd
-
             self.assertIsInstance(data, pd.DataFrame)
         elif data_format in ["records", "json"]:
             self.assertIsInstance(data, list)
@@ -605,11 +604,67 @@ def test_set_header(self):
         self.assertEqual(1, csv_data.header)
         self.assertEqual("1", first_value)

+    def test_set_header_with_sample(self):
+        test_dir = os.path.join(test_root_path, "data")
+        filename = "csv/sparse-first-and-last-column-two-headers.txt"
+        filename = os.path.join(test_dir, filename)
+
+        # set bad header setting
+        options = dict(header=-2, sample_nrows=100)
+        with self.assertRaisesRegex(
+            ValueError,
+            "`header` must be one of following: auto, "
+            "none for no header, or a non-negative "
+            "integer for the row that represents the "
+            r"header \(0 based index\)",
+        ):
+            csv_data = CSVData(filename, options=options)
+            first_value = csv_data.data.loc[0][0]
+
+        # set bad header setting
+        options = dict(header="abcdef", sample_nrows=100)
+        with self.assertRaisesRegex(
+            ValueError,
+            "`header` must be one of following: auto, "
+            "none for no header, or a non-negative "
+            "integer for the row that represents the "
+            r"header \(0 based index\)",
+        ):
+            csv_data = CSVData(filename, options=options)
+            first_value = csv_data.data.loc[0][0]
+
+        # set header auto
+        options = dict(header="auto", sample_nrows=100)
+        csv_data = CSVData(filename, options=options)
+        first_value = csv_data.data.loc[0][0]
+        self.assertEqual(1, csv_data.header)
+        self.assertEqual("1", first_value)
+
+        # set header None (no header)
+        options = dict(header=None, sample_nrows=100)
+        csv_data = CSVData(filename, options=options)
+        first_value = csv_data.data.loc[0][0]
+        self.assertIsNone(csv_data.header)  # should be None
+        self.assertEqual("COUNT", first_value)
+
+        # set header 0
+        options = dict(header=0, sample_nrows=100)
+        csv_data = CSVData(filename, options=options)
+        first_value = csv_data.data.loc[0][0]
+        self.assertEqual(0, csv_data.header)
+        self.assertEqual("CONTAR", first_value)
+
+        # set header 1
+        options = dict(header=1, sample_nrows=100)
+        csv_data = CSVData(filename, options=options)
+        first_value = csv_data.data.loc[0][0]
+        self.assertEqual(1, csv_data.header)
+        self.assertEqual("1", first_value)
+
     def test_header_check_files(self):
         """
         Determine if files with no header are properly determined.
         """
-        from itertools import islice

         # add some more files to the list to test the header detection
         # these files have some first lines which are not the header
@@ -691,6 +746,13 @@ def _test_options(option, valid, invalid, expected_error):
             expected_error="'record_samples_per_line' must be an int more than " "0",
         )
+        _test_options(
+            "sample_nrows",
+            valid=[10, 15, 100],
+            invalid=[-1, 0, dict()],
+            expected_error="'sample_nrows' must be an int more than " "0",
+        )
+
         # test edge case for header being set
         file = self.input_file_names[0]
         filepath = file["path"]
@@ -716,6 +778,21 @@ def test_len_data(self):
             self.assertEqual(input_file["count"], len(data), msg=input_file["path"])
             self.assertEqual(input_file["count"], data.length, msg=input_file["path"])

+    def test_len_sampled_data(self):
+        """
+        Validate that length called on sampled CSVData is appropriately
+        determining the length value.
+        """
+
+        for input_file in self.file_or_buf_list:
+            data = Data(input_file["path"], options={"sample_nrows": 100})
+            self.assertEqual(
+                min(100, input_file["count"]), len(data), msg=input_file["path"]
+            )
+            self.assertEqual(
+                min(100, input_file["count"]), data.length, msg=input_file["path"]
+            )
+
     def test_is_structured(self):
         # Default construction
         data = CSVData()
diff --git a/dataprofiler/tests/labelers/test_data_labelers.py b/dataprofiler/tests/labelers/test_data_labelers.py
index b7b2f00f3..bbde1c506 100644
--- a/dataprofiler/tests/labelers/test_data_labelers.py
+++ b/dataprofiler/tests/labelers/test_data_labelers.py
@@ -137,11 +137,15 @@ def test_load_data_labeler(self, *mocks):
     def test_load_from_library(self, *mocks):
         data_labeler = dp.DataLabeler.load_from_library("structured_model")
         self.assertIsInstance(data_labeler, BaseDataLabeler)
+        # Testing to ensure _default_model_loc is set correctly
+        self.assertEqual("structured_model", data_labeler._default_model_loc)

         data_labeler = dp.DataLabeler.load_from_library(
             "structured_model", trainable=True
         )
         self.assertIsInstance(data_labeler, TrainableDataLabeler)
+        # Testing to ensure _default_model_loc is set correctly
+        self.assertEqual("structured_model", data_labeler._default_model_loc)

 @mock.patch("tensorflow.keras.models.load_model")
 def test_load_from_disk(self, *mocks):
diff --git a/dataprofiler/tests/profilers/profiler_options/abstract_test_options.py b/dataprofiler/tests/profilers/profiler_options/abstract_test_options.py
index 44bdb5bb4..1676b3aad 100644
--- a/dataprofiler/tests/profilers/profiler_options/abstract_test_options.py
+++ b/dataprofiler/tests/profilers/profiler_options/abstract_test_options.py
@@ -1,5 +1,11 @@
+import json
+
+from dataprofiler.profilers.json_decoder import 
load_option +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import BaseOption +from .. import utils as test_utils + class AbstractTestOptions: @@ -46,3 +52,13 @@ def test_validate_helper(self): def test_validate(self): raise NotImplementedError + + +class JSONDecodeTestMixin: + def test_json_decode(self): + expected_options = self.get_options() + + serialized = json.dumps(expected_options, cls=ProfileEncoder) + deserialized = load_option(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_options) diff --git a/dataprofiler/tests/profilers/profiler_options/test_base_inspector_options.py b/dataprofiler/tests/profilers/profiler_options/test_base_inspector_options.py index 32a695491..208f9d50a 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_base_inspector_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_base_inspector_options.py @@ -1,3 +1,6 @@ +import json + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import BaseInspectorOptions from dataprofiler.tests.profilers.profiler_options.test_boolean_option import ( TestBooleanOption, @@ -47,3 +50,15 @@ def test_is_prop_enabled(self): def test_eq(self): super().test_eq() + + def test_json_encode(self): + option = BaseInspectorOptions(is_enabled=False) + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "BaseInspectorOptions", + "data": {"is_enabled": False}, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_base_option.py b/dataprofiler/tests/profilers/profiler_options/test_base_option.py index 99bbc0d19..f95ed7ac5 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_base_option.py +++ b/dataprofiler/tests/profilers/profiler_options/test_base_option.py @@ -1,13 +1,16 @@ +import json import unittest +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import BaseOption from dataprofiler.tests.profilers.profiler_options.abstract_test_options import ( AbstractTestOptions, ) +from .. 
import utils as test_utils -class TestBaseOption(AbstractTestOptions, unittest.TestCase): +class TestBaseOption(AbstractTestOptions, unittest.TestCase): option_class = BaseOption def test_init(self): @@ -58,3 +61,16 @@ def test_eq(self): self.assertEqual(options, options) options2 = self.get_options() self.assertEqual(options, options2) + + def test_json_encode(self): + option = self.get_options() + + serialized = json.dumps(option, cls=ProfileEncoder) + expected = json.dumps( + { + "class": "BaseOption", + "data": {}, + } + ) + + self.assertEqual(expected, serialized) diff --git a/dataprofiler/tests/profilers/profiler_options/test_boolean_option.py b/dataprofiler/tests/profilers/profiler_options/test_boolean_option.py index ac5eb20c2..9b82f8112 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_boolean_option.py +++ b/dataprofiler/tests/profilers/profiler_options/test_boolean_option.py @@ -1,10 +1,16 @@ +import json + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import BooleanOption +from dataprofiler.tests.profilers.profiler_options.abstract_test_options import ( + JSONDecodeTestMixin, +) from dataprofiler.tests.profilers.profiler_options.test_base_option import ( TestBaseOption, ) -class TestBooleanOption(TestBaseOption): +class TestBooleanOption(TestBaseOption, JSONDecodeTestMixin): option_class = BooleanOption keys = [] @@ -87,3 +93,15 @@ def test_eq(self): self.assertNotEqual(options, options2) options2.is_enabled = False self.assertEqual(options, options2) + + def test_json_encode(self): + option = self.get_options(is_enabled=False) + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "BooleanOption", + "data": {"is_enabled": False}, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_categorical_options.py b/dataprofiler/tests/profilers/profiler_options/test_categorical_options.py index 954320043..89bb0b149 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_categorical_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_categorical_options.py @@ -1,3 +1,6 @@ +import json + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import CategoricalOptions from dataprofiler.tests.profilers.profiler_options.test_base_inspector_options import ( TestBaseInspectorOptions, @@ -16,6 +19,10 @@ def test_init(self): "top_k_categories": None, "max_sample_size_to_check_stop_condition": None, "stop_condition_unique_value_ratio": None, + "cms": False, + "cms_confidence": 0.95, + "cms_relative_error": 0.01, + "cms_max_num_heavy_hitters": 5000, }, option.properties, ) @@ -26,6 +33,10 @@ def test_init(self): "top_k_categories": None, "max_sample_size_to_check_stop_condition": None, "stop_condition_unique_value_ratio": None, + "cms": False, + "cms_confidence": 0.95, + "cms_relative_error": 0.01, + "cms_max_num_heavy_hitters": 5000, }, option.properties, ) @@ -36,6 +47,10 @@ def test_init(self): "top_k_categories": 2, "max_sample_size_to_check_stop_condition": None, "stop_condition_unique_value_ratio": None, + "cms": False, + "cms_confidence": 0.95, + "cms_relative_error": 0.01, + "cms_max_num_heavy_hitters": 5000, }, option.properties, ) @@ -46,6 +61,10 @@ def test_init(self): "top_k_categories": None, "max_sample_size_to_check_stop_condition": 20, "stop_condition_unique_value_ratio": None, + "cms": False, + "cms_confidence": 0.95, + 
"cms_relative_error": 0.01, + "cms_max_num_heavy_hitters": 5000, }, option.properties, ) @@ -56,6 +75,10 @@ def test_init(self): "top_k_categories": None, "max_sample_size_to_check_stop_condition": None, "stop_condition_unique_value_ratio": 2, + "cms": False, + "cms_confidence": 0.95, + "cms_relative_error": 0.01, + "cms_max_num_heavy_hitters": 5000, }, option.properties, ) @@ -69,6 +92,29 @@ def test_init(self): "top_k_categories": None, "max_sample_size_to_check_stop_condition": 20, "stop_condition_unique_value_ratio": 2, + "cms": False, + "cms_confidence": 0.95, + "cms_relative_error": 0.01, + "cms_max_num_heavy_hitters": 5000, + }, + option.properties, + ) + option = self.get_options( + cms=True, + cms_confidence=0.98, + cms_relative_error=0.1, + cms_max_num_heavy_hitters=5, + ) + self.assertDictEqual( + { + "is_enabled": True, + "top_k_categories": None, + "max_sample_size_to_check_stop_condition": None, + "stop_condition_unique_value_ratio": None, + "cms": True, + "cms_confidence": 0.98, + "cms_relative_error": 0.1, + "cms_max_num_heavy_hitters": 5, }, option.properties, ) @@ -237,3 +283,24 @@ def test_is_prop_enabled(self): def test_eq(self): super().test_eq() + + def test_json_encode(self): + option = CategoricalOptions(is_enabled=False, top_k_categories=5) + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "CategoricalOptions", + "data": { + "cms": False, + "cms_confidence": 0.95, + "cms_max_num_heavy_hitters": 5000, + "cms_relative_error": 0.01, + "is_enabled": False, + "max_sample_size_to_check_stop_condition": None, + "stop_condition_unique_value_ratio": None, + "top_k_categories": 5, + }, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_correlation_options.py b/dataprofiler/tests/profilers/profiler_options/test_correlation_options.py new file mode 100644 index 000000000..08c028f98 --- /dev/null +++ b/dataprofiler/tests/profilers/profiler_options/test_correlation_options.py @@ -0,0 +1,23 @@ +import json + +from dataprofiler.profilers.json_encoder import ProfileEncoder +from dataprofiler.profilers.profiler_options import CorrelationOptions +from dataprofiler.tests.profilers.profiler_options.test_base_inspector_options import ( + TestBaseInspectorOptions, +) + + +class TestCorrelationOptions(TestBaseInspectorOptions): + def test_json_encode(self): + option = CorrelationOptions( + is_enabled=False, columns=["name", "age", "location"] + ) + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "CorrelationOptions", + "data": {"is_enabled": False, "columns": ["name", "age", "location"]}, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_datalabeler_options.py b/dataprofiler/tests/profilers/profiler_options/test_datalabeler_options.py index a74113d19..ef906e084 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_datalabeler_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_datalabeler_options.py @@ -1,7 +1,12 @@ +pass +import json from unittest import mock from dataprofiler.labelers.base_data_labeler import BaseDataLabeler +from dataprofiler.profilers.json_decoder import load_option +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import DataLabelerOptions +from dataprofiler.tests.profilers import utils as test_utils from 
dataprofiler.tests.profilers.profiler_options.test_base_inspector_options import ( TestBaseInspectorOptions, ) @@ -168,3 +173,77 @@ def test_eq(self, *mocks): self.assertNotEqual(options, options2) options2.data_labeler_object._model = 7 self.assertEqual(options, options2) + + def test_json_encode(self): + option = DataLabelerOptions() + + mock_BaseDataLabeler = mock.Mock(spec=BaseDataLabeler) + mock_BaseDataLabeler._default_model_loc = "test_loc" + option.data_labeler_object = mock_BaseDataLabeler + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "DataLabelerOptions", + "data": { + "is_enabled": True, + "data_labeler_dirpath": None, + "max_sample_size": None, + "data_labeler_object": {"from_library": "test_loc"}, + }, + } + + self.assertDictEqual(expected, json.loads(serialized)) + + @mock.patch( + "dataprofiler.profilers.utils.DataLabeler", + spec=BaseDataLabeler, + ) + def test_json_decode(self, mock_BaseDataLabeler): + expected_options = self.get_options() + + serialized = json.dumps(expected_options, cls=ProfileEncoder) + deserialized = load_option(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_options) + + # case where labeler exists but no config + mock_BaseDataLabeler._default_model_loc = "test_loc" + expected_options.data_labeler_object = mock_BaseDataLabeler + mock_BaseDataLabeler.load_from_library.return_value = mock_BaseDataLabeler + config = {} + + serialized = json.dumps(expected_options, cls=ProfileEncoder) + deserialized = load_option(json.loads(serialized), config) + test_utils.assert_profiles_equal(deserialized, expected_options) + + expected_config = { + "DataLabelerOptions": {"from_library": {"test_loc": mock_BaseDataLabeler}}, + "DataLabelerColumn": {"from_library": {"test_loc": mock_BaseDataLabeler}}, + } + self.assertDictEqual(expected_config, config) + + mock_BaseDataLabeler.load_from_library.reset_mock() + mock_BaseDataLabeler.load_from_library.return_value = None + deserialized = load_option(json.loads(serialized), config) + + mock_BaseDataLabeler.load_from_library.assert_not_called() + test_utils.assert_profiles_equal(deserialized, expected_options) + + config = { + "DataLabelerColumn": {"from_library": {"test_loc": mock_BaseDataLabeler}} + } + deserialized = load_option(json.loads(serialized), config) + + mock_BaseDataLabeler.load_from_library.assert_not_called() + test_utils.assert_profiles_equal(deserialized, expected_options) + self.assertDictEqual(expected_config, config) + + config = { + "DataLabelerOptions": {"from_library": {"test_loc": mock_BaseDataLabeler}} + } + deserialized = load_option(json.loads(serialized), config) + + mock_BaseDataLabeler.load_from_library.assert_not_called() + test_utils.assert_profiles_equal(deserialized, expected_options) + self.assertDictEqual(expected_config, config) diff --git a/dataprofiler/tests/profilers/profiler_options/test_datetime_options.py b/dataprofiler/tests/profilers/profiler_options/test_datetime_options.py index 2ab990a6b..8b0b51faa 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_datetime_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_datetime_options.py @@ -1,3 +1,6 @@ +import json + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import DateTimeOptions from dataprofiler.tests.profilers.profiler_options.test_base_inspector_options import ( TestBaseInspectorOptions, @@ -28,3 +31,15 @@ def test_is_prop_enabled(self): def test_eq(self): 
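
The DataLabelerOptions round trip exercised above hinges on the shared config dict: a serialized option stores only {"from_library": <_default_model_loc>}, and load_option consults config["DataLabelerOptions"] / config["DataLabelerColumn"] before falling back to DataLabeler.load_from_library. A minimal sketch of that round trip, inferred from the tests above rather than copied from the patch:

import json

from dataprofiler.profilers.json_decoder import load_option
from dataprofiler.profilers.json_encoder import ProfileEncoder
from dataprofiler.profilers.profiler_options import DataLabelerOptions

option = DataLabelerOptions()
serialized = json.dumps(option, cls=ProfileEncoder)

# Passing one config dict across many load_option calls lets every option
# reuse a single loaded labeler instead of reloading it from the library.
config = {}
restored = load_option(json.loads(serialized), config)
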
super().test_eq() + + def test_json_encode(self): + option = DateTimeOptions() + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "DateTimeOptions", + "data": {"is_enabled": True}, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_float_options.py b/dataprofiler/tests/profilers/profiler_options/test_float_options.py index d226290f1..044faa04e 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_float_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_float_options.py @@ -1,3 +1,7 @@ +import json +from unittest import mock + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import FloatOptions from dataprofiler.tests.profilers.profiler_options.test_numerical_options import ( TestNumericalOptions, @@ -36,3 +40,73 @@ def test_eq(self): self.assertNotEqual(options, options2) options2.precision.is_enabled = False self.assertEqual(options, options2) + + def test_json_encode(self): + option = FloatOptions() + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "FloatOptions", + "data": { + "min": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "max": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "mode": { + "class": "ModeOption", + "data": mock.ANY, + }, + "median": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "sum": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "variance": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "skewness": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "kurtosis": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "median_abs_deviation": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "num_zeros": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "num_negatives": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "histogram_and_quantiles": { + "class": "HistogramOption", + "data": mock.ANY, + }, + "bias_correction": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "is_enabled": True, + "precision": { + "class": "PrecisionOptions", + "data": mock.ANY, + }, + }, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_histogram_option.py b/dataprofiler/tests/profilers/profiler_options/test_histogram_option.py index 8ae53a4d4..4bf3a3b16 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_histogram_option.py +++ b/dataprofiler/tests/profilers/profiler_options/test_histogram_option.py @@ -1,3 +1,6 @@ +import json + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import HistogramOption from .test_boolean_option import TestBooleanOption @@ -170,3 +173,18 @@ def test_eq(self): self.assertNotEqual(options, options2) options2.bin_count_or_method = "sturges" self.assertEqual(options, options2) + + def test_json_encode(self): + option = HistogramOption(is_enabled=False, bin_count_or_method="doane") + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "HistogramOption", + "data": { + "bin_count_or_method": "doane", + "is_enabled": False, + }, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git 
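
Every expected dictionary in these tests follows the same envelope: ProfileEncoder serializes an option as {"class": <type name>, "data": <attribute dict>} and recurses, so nested options such as a FloatOptions field become envelopes of their own. A small way to inspect that shape (FloatOptions is just a convenient example here):

import json

from dataprofiler.profilers.json_encoder import ProfileEncoder
from dataprofiler.profilers.profiler_options import FloatOptions

envelope = json.loads(json.dumps(FloatOptions(), cls=ProfileEncoder))
print(envelope["class"])                 # "FloatOptions"
print(envelope["data"]["min"]["class"])  # "BooleanOption" -- a nested envelope
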
a/dataprofiler/tests/profilers/profiler_options/test_hyper_log_log_options.py b/dataprofiler/tests/profilers/profiler_options/test_hyper_log_log_options.py index 9e2780c8a..8606d9f13 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_hyper_log_log_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_hyper_log_log_options.py @@ -1,3 +1,6 @@ +import json + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import HyperLogLogOptions from dataprofiler.tests.profilers.profiler_options.test_boolean_option import ( TestBaseOption, @@ -171,3 +174,15 @@ def test_eq(self): self.assertNotEqual(options, options2) options2.register_count = 1 self.assertEqual(options, options2) + + def test_json_encode(self): + option = self.get_options() + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "HyperLogLogOptions", + "data": {"seed": 0, "register_count": 15}, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_int_options.py b/dataprofiler/tests/profilers/profiler_options/test_int_options.py index 62f892cec..317d1ff64 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_int_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_int_options.py @@ -1,3 +1,7 @@ +import json +from unittest import mock + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import IntOptions from dataprofiler.tests.profilers.profiler_options.test_numerical_options import ( TestNumericalOptions, @@ -28,3 +32,69 @@ def test_is_numeric_stats_enabled(self): def test_eq(self): super().test_eq() + + def test_json_encode(self): + option = IntOptions() + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "IntOptions", + "data": { + "min": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "max": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "mode": { + "class": "ModeOption", + "data": mock.ANY, + }, + "median": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "sum": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "variance": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "skewness": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "kurtosis": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "median_abs_deviation": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "num_zeros": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "num_negatives": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "histogram_and_quantiles": { + "class": "HistogramOption", + "data": mock.ANY, + }, + "bias_correction": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "is_enabled": True, + }, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_mode_option.py b/dataprofiler/tests/profilers/profiler_options/test_mode_option.py new file mode 100644 index 000000000..0c10eca28 --- /dev/null +++ b/dataprofiler/tests/profilers/profiler_options/test_mode_option.py @@ -0,0 +1,21 @@ +import json + +from dataprofiler.profilers.json_encoder import ProfileEncoder +from dataprofiler.profilers.profiler_options import ModeOption +from 
dataprofiler.tests.profilers.profiler_options.test_boolean_option import ( + TestBooleanOption, +) + + +class TestModeOptions(TestBooleanOption): + def test_json_encode(self): + option = ModeOption(is_enabled=False, max_k_modes=5) + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "ModeOption", + "data": {"is_enabled": False, "top_k_modes": 5}, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py b/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py index 63ed6f04f..03d6c01db 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_numerical_options.py @@ -1,3 +1,7 @@ +import json +from unittest import mock + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import NumericalOptions from dataprofiler.tests.profilers.profiler_options.test_base_inspector_options import ( TestBaseInspectorOptions, @@ -364,3 +368,69 @@ def test_eq(self): self.assertNotEqual(options, options2) options2.min.is_enabled = False self.assertEqual(options, options2) + + def test_json_encode(self): + option = NumericalOptions() + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "NumericalOptions", + "data": { + "min": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "max": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "mode": { + "class": "ModeOption", + "data": mock.ANY, + }, + "median": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "sum": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "variance": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "skewness": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "kurtosis": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "median_abs_deviation": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "num_zeros": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "num_negatives": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "histogram_and_quantiles": { + "class": "HistogramOption", + "data": mock.ANY, + }, + "bias_correction": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "is_enabled": True, + }, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_order_options.py b/dataprofiler/tests/profilers/profiler_options/test_order_options.py index 4445f1497..c188db30b 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_order_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_order_options.py @@ -1,3 +1,6 @@ +import json + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import OrderOptions from dataprofiler.tests.profilers.profiler_options.test_base_inspector_options import ( TestBaseInspectorOptions, @@ -28,3 +31,12 @@ def test_is_prop_enabled(self): def test_eq(self): super().test_eq() + + def test_json_encode(self): + option = OrderOptions() + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = {"class": "OrderOptions", "data": {"is_enabled": True}} + + self.assertDictEqual(expected, json.loads(serialized)) diff --git 
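
Note the asymmetry the ModeOption test above encodes: the constructor argument is max_k_modes, but the value appears in the serialized data under top_k_modes, which suggests it is stored on a differently named attribute. A one-line check of that behavior, assuming the constructor signature shown in the test:

import json

from dataprofiler.profilers.json_encoder import ProfileEncoder
from dataprofiler.profilers.profiler_options import ModeOption

# max_k_modes in, top_k_modes out -- mirroring the expected dict above.
option = ModeOption(is_enabled=False, max_k_modes=5)
assert json.loads(json.dumps(option, cls=ProfileEncoder))["data"]["top_k_modes"] == 5
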
a/dataprofiler/tests/profilers/profiler_options/test_precision_options.py b/dataprofiler/tests/profilers/profiler_options/test_precision_options.py index 41136c73f..9b0ebcf4a 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_precision_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_precision_options.py @@ -1,3 +1,6 @@ +import json + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import PrecisionOptions from dataprofiler.tests.profilers.profiler_options.test_boolean_option import ( TestBooleanOption, @@ -115,3 +118,15 @@ def test_eq(self): self.assertNotEqual(options, options2) options2.sample_ratio = 0.3 self.assertEqual(options, options2) + + def test_json_encode(self): + option = PrecisionOptions(is_enabled=False, sample_ratio=0.5) + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "PrecisionOptions", + "data": {"sample_ratio": 0.5, "is_enabled": False}, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_profiler_class_options.py b/dataprofiler/tests/profilers/profiler_options/test_profiler_class_options.py index 83e9ef57a..1400496a8 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_profiler_class_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_profiler_class_options.py @@ -1,10 +1,17 @@ +import json +from unittest import mock + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import ProfilerOptions +from dataprofiler.tests.profilers.profiler_options.abstract_test_options import ( + JSONDecodeTestMixin, +) from dataprofiler.tests.profilers.profiler_options.test_base_option import ( TestBaseOption, ) -class TestProfilerOptions(TestBaseOption): +class TestProfilerOptions(TestBaseOption, JSONDecodeTestMixin): option_class = ProfilerOptions keys = ["structured_options", "unstructured_options"] @@ -156,3 +163,25 @@ def test_validate(self): with self.assertRaisesRegex(ValueError, "\n".join(expected_error)): option.validate() self.assertListEqual(expected_error, option.validate(raise_error=False)) + + def test_json_encode(self): + option = ProfilerOptions(presets="complete") + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "ProfilerOptions", + "data": { + "structured_options": { + "class": "StructuredOptions", + "data": mock.ANY, + }, + "unstructured_options": { + "class": "UnstructuredOptions", + "data": mock.ANY, + }, + "presets": "complete", + }, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_profiler_options.py b/dataprofiler/tests/profilers/profiler_options/test_profiler_options.py index 83b496502..133257c06 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_profiler_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_profiler_options.py @@ -95,7 +95,7 @@ def test_numerical_stats_option(self, *mocks): self.assertIsNone(profile_column["statistics"]["min"]) self.assertIsNone(profile_column["statistics"]["max"]) self.assertTrue(np.isnan(profile_column["statistics"]["variance"])) - self.assertIsNone(profile_column["statistics"]["quantiles"][0]) + self.assertIsNone(profile_column["statistics"]["quantiles"]) self.assertTrue(np.isnan(profile_column["statistics"]["skewness"])) self.assertTrue(np.isnan(profile_column["statistics"]["kurtosis"])) @@ -242,7 +242,7 @@ 
def test_disabling_all_stats(self, *mocks): self.assertIsNone(profile_column["statistics"]["min"]) self.assertIsNone(profile_column["statistics"]["max"]) self.assertTrue(np.isnan(profile_column["statistics"]["variance"])) - self.assertIsNone(profile_column["statistics"]["quantiles"][0]) + self.assertIsNone(profile_column["statistics"]["quantiles"]) self.assertTrue(profile_column["statistics"]["skewness"] is np.nan) self.assertTrue(profile_column["statistics"]["kurtosis"] is np.nan) self.assertTrue( diff --git a/dataprofiler/tests/profilers/profiler_options/test_profiler_presets.py b/dataprofiler/tests/profilers/profiler_options/test_profiler_presets.py index 8f8bc0d6b..545d9259c 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_profiler_presets.py +++ b/dataprofiler/tests/profilers/profiler_options/test_profiler_presets.py @@ -33,3 +33,21 @@ def test_profiler_preset_numeric_stats_disabled(self, *mocks): self.assertFalse(options.structured_options.null_replication_metrics.is_enabled) self.assertTrue(options.structured_options.category.is_enabled) self.assertTrue(options.structured_options.order.is_enabled) + + def test_profiler_preset_lower_memory_sketching(self, *mocks): + options = ProfilerOptions(presets="lower_memory_sketching") + self.assertEqual( + options.structured_options.row_statistics.unique_count.hashing_method, "hll" + ) + self.assertEqual( + options.structured_options.category.max_sample_size_to_check_stop_condition, + 5000, + ) + self.assertEqual( + options.structured_options.category.stop_condition_unique_value_ratio, 0.5 + ) + + def test_profiler_preset_failure(self, *mocks): + expected_error = "The preset entered is not a valid preset." + with self.assertRaisesRegex(ValueError, expected_error): + ProfilerOptions(presets="failing_preset") diff --git a/dataprofiler/tests/profilers/profiler_options/test_row_statistics_options.py b/dataprofiler/tests/profilers/profiler_options/test_row_statistics_options.py index b32ef4ee8..1e0cd8d43 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_row_statistics_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_row_statistics_options.py @@ -1,3 +1,7 @@ +import json +from unittest import mock + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import RowStatisticsOptions from dataprofiler.tests.profilers.profiler_options.test_boolean_option import ( TestBooleanOption, @@ -7,7 +11,7 @@ class TestRowStatisticsOptions(TestBooleanOption): option_class = RowStatisticsOptions - keys = ["unique_count"] + keys = ["unique_count", "null_count"] def get_options(self, **params): options = RowStatisticsOptions() @@ -75,3 +79,18 @@ def test_eq(self): self.assertNotEqual(options, options2) options2.is_enabled = False self.assertEqual(options, options2) + + def test_json_encode(self): + option = self.get_options(is_enabled=False) + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "RowStatisticsOptions", + "data": { + "is_enabled": False, + "unique_count": {"class": "UniqueCountOptions", "data": mock.ANY}, + "null_count": {"class": "BooleanOption", "data": {"is_enabled": True}}, + }, + } + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_structured_options.py b/dataprofiler/tests/profilers/profiler_options/test_structured_options.py index d03b9a78b..5cda69540 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_structured_options.py +++ 
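
The lower_memory_sketching preset asserted above trades exact counting for bounded memory: unique counting switches to HyperLogLog and the categorical stop-condition checks are capped. Restated as a runnable sketch of the same assertions:

from dataprofiler.profilers.profiler_options import ProfilerOptions

opts = ProfilerOptions(presets="lower_memory_sketching")
structured = opts.structured_options
# HLL-based unique counting plus capped categorical checks, per the test.
assert structured.row_statistics.unique_count.hashing_method == "hll"
assert structured.category.max_sample_size_to_check_stop_condition == 5000
assert structured.category.stop_condition_unique_value_ratio == 0.5
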
b/dataprofiler/tests/profilers/profiler_options/test_structured_options.py
@@ -1,12 +1,18 @@
+import json
 import re
+from unittest import mock

+from dataprofiler.profilers.json_encoder import ProfileEncoder
 from dataprofiler.profilers.profiler_options import StructuredOptions
+from dataprofiler.tests.profilers.profiler_options.abstract_test_options import (
+    JSONDecodeTestMixin,
+)
 from dataprofiler.tests.profilers.profiler_options.test_base_option import (
     TestBaseOption,
 )


-class TestStructuredOptions(TestBaseOption):
+class TestStructuredOptions(TestBaseOption, JSONDecodeTestMixin):

     option_class = StructuredOptions
     other_keys = ["null_values", "column_null_values"]
@@ -350,3 +356,66 @@ def test_eq(self):
         self.assertNotEqual(options, options2)
         options2.float.precision.sample_ratio = 0.1
         self.assertEqual(options, options2)
+
+    def test_json_encode(self):
+        option = StructuredOptions(
+            null_values={"str": 1}, column_null_values={2: {"other_str": 5}}
+        )
+
+        serialized = json.dumps(option, cls=ProfileEncoder)
+
+        expected = {
+            "class": "StructuredOptions",
+            "data": {
+                "sampling_ratio": 0.2,
+                "multiprocess": {
+                    "class": "BooleanOption",
+                    "data": {"is_enabled": True},
+                },
+                "int": {
+                    "class": "IntOptions",
+                    "data": mock.ANY,
+                },
+                "float": {
+                    "class": "FloatOptions",
+                    "data": mock.ANY,
+                },
+                "datetime": {
+                    "class": "DateTimeOptions",
+                    "data": {"is_enabled": True},
+                },
+                "text": {
+                    "class": "TextOptions",
+                    "data": mock.ANY,
+                },
+                "order": {"class": "OrderOptions", "data": {"is_enabled": True}},
+                "category": {
+                    "class": "CategoricalOptions",
+                    "data": mock.ANY,
+                },
+                "data_labeler": {
+                    "class": "DataLabelerOptions",
+                    "data": mock.ANY,
+                },
+                "correlation": {
+                    "class": "CorrelationOptions",
+                    "data": mock.ANY,
+                },
+                "chi2_homogeneity": {
+                    "class": "BooleanOption",
+                    "data": {"is_enabled": True},
+                },
+                "null_replication_metrics": {
+                    "class": "BooleanOption",
+                    "data": {"is_enabled": False},
+                },
+                "row_statistics": {
+                    "class": "RowStatisticsOptions",
+                    "data": mock.ANY,
+                },
+                "null_values": {"str": 1},
+                "column_null_values": {"2": {"other_str": 5}},
+            },
+        }
+
+        self.assertDictEqual(expected, json.loads(serialized))
diff --git a/dataprofiler/tests/profilers/profiler_options/test_text_options.py b/dataprofiler/tests/profilers/profiler_options/test_text_options.py
index f690028e5..57814126d 100644
--- a/dataprofiler/tests/profilers/profiler_options/test_text_options.py
+++ b/dataprofiler/tests/profilers/profiler_options/test_text_options.py
@@ -1,3 +1,7 @@
+import json
+from unittest import mock
+
+from dataprofiler.profilers.json_encoder import ProfileEncoder
 from dataprofiler.profilers.profiler_options import TextOptions
 from dataprofiler.tests.profilers.profiler_options.test_numerical_options import (
     TestNumericalOptions,
@@ -69,3 +74,73 @@ def test_eq(self):
         self.assertNotEqual(options, options2)
         options2.vocab.is_enabled = False
         self.assertEqual(options, options2)
+
+    def test_json_encode(self):
+        option = TextOptions()
+
+        serialized = json.dumps(option, cls=ProfileEncoder)
+
+        expected = {
+            "class": "TextOptions",
+            "data": {
+                "min": {
+                    "class": "BooleanOption",
+                    "data": {"is_enabled": True},
+                },
+                "max": {
+                    "class": "BooleanOption",
+                    "data": {"is_enabled": True},
+                },
+                "mode": {
+                    "class": "ModeOption",
+                    "data": mock.ANY,
+                },
+                "median": {
+                    "class": "BooleanOption",
+                    "data": {"is_enabled": True},
+                },
+                "sum": {
+                    "class": "BooleanOption",
+                    "data": {"is_enabled": True},
+                },
+                "variance": {
+                    "class": "BooleanOption",
+                    "data": 
{"is_enabled": True}, + }, + "skewness": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "kurtosis": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "median_abs_deviation": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "num_zeros": { + "class": "BooleanOption", + "data": {"is_enabled": False}, + }, + "num_negatives": { + "class": "BooleanOption", + "data": {"is_enabled": False}, + }, + "histogram_and_quantiles": { + "class": "HistogramOption", + "data": mock.ANY, + }, + "bias_correction": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + "is_enabled": True, + "vocab": { + "class": "BooleanOption", + "data": {"is_enabled": True}, + }, + }, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_unique_count_options.py b/dataprofiler/tests/profilers/profiler_options/test_unique_count_options.py index 96e02b245..275f087cd 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_unique_count_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_unique_count_options.py @@ -1,3 +1,7 @@ +import json +from unittest import mock + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import UniqueCountOptions from dataprofiler.tests.profilers.profiler_options.test_boolean_option import ( TestBooleanOption, @@ -93,3 +97,18 @@ def test_eq(self): self.assertNotEqual(options, options2) options2.is_enabled = False self.assertEqual(options, options2) + + def test_json_encode(self): + option = self.get_options(is_enabled=False) + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "UniqueCountOptions", + "data": { + "is_enabled": False, + "hashing_method": "full", + "hll": {"class": "HyperLogLogOptions", "data": mock.ANY}, + }, + } + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/profiler_options/test_unstructured_options.py b/dataprofiler/tests/profilers/profiler_options/test_unstructured_options.py index 1546abe16..8e707bcdf 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_unstructured_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_unstructured_options.py @@ -1,10 +1,17 @@ +import json +from unittest import mock + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import BooleanOption, UnstructuredOptions +from dataprofiler.tests.profilers.profiler_options.abstract_test_options import ( + JSONDecodeTestMixin, +) from dataprofiler.tests.profilers.profiler_options.test_base_option import ( TestBaseOption, ) -class TestUnstructuredOptions(TestBaseOption): +class TestUnstructuredOptions(TestBaseOption, JSONDecodeTestMixin): option_class = UnstructuredOptions keys = ["text", "data_labeler"] @@ -148,3 +155,24 @@ def test_eq(self): self.assertNotEqual(options, options2) options2.text.stop_words = ["woah", "stop", "right", "there"] self.assertEqual(options, options2) + + def test_json_encode(self): + option = UnstructuredOptions() + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "UnstructuredOptions", + "data": { + "text": { + "class": "TextProfilerOptions", + "data": mock.ANY, + }, + "data_labeler": { + "class": "DataLabelerOptions", + "data": mock.ANY, + }, + }, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git 
a/dataprofiler/tests/profilers/profiler_options/test_unstructured_text_profile_options.py b/dataprofiler/tests/profilers/profiler_options/test_unstructured_text_profile_options.py index 8c48f4c0a..715213af6 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_unstructured_text_profile_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_unstructured_text_profile_options.py @@ -1,3 +1,6 @@ +import json + +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import BooleanOption, TextProfilerOptions from dataprofiler.tests.profilers.profiler_options.test_base_inspector_options import ( TestBaseInspectorOptions, @@ -244,3 +247,29 @@ def test_eq(self): self.assertNotEqual(options, options2) options2.words.is_enabled = False self.assertEqual(options, options2) + + def test_json_encode(self): + option = TextProfilerOptions( + is_enabled=False, + is_case_sensitive=False, + stop_words=["ab", "aa", "aba"], + top_k_chars=5, + top_k_words=8, + ) + + serialized = json.dumps(option, cls=ProfileEncoder) + + expected = { + "class": "TextProfilerOptions", + "data": { + "is_enabled": False, + "is_case_sensitive": False, + "top_k_chars": 5, + "top_k_words": 8, + "stop_words": ["ab", "aa", "aba"], + "vocab": {"class": "BooleanOption", "data": {"is_enabled": True}}, + "words": {"class": "BooleanOption", "data": {"is_enabled": True}}, + }, + } + + self.assertDictEqual(expected, json.loads(serialized)) diff --git a/dataprofiler/tests/profilers/test_base_column_profilers.py b/dataprofiler/tests/profilers/test_base_column_profilers.py index 3eb83daec..eb2ed764d 100644 --- a/dataprofiler/tests/profilers/test_base_column_profilers.py +++ b/dataprofiler/tests/profilers/test_base_column_profilers.py @@ -163,7 +163,7 @@ def test_json_encode(self): } ) - self.assertEqual(serialized, expected) + self.assertEqual(expected, serialized) class TestBaseColumnPrimitiveTypeProfileClass(unittest.TestCase): diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index e0aca4a3d..10be10c58 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -8,7 +8,7 @@ import pandas as pd from dataprofiler.profilers import CategoricalColumn -from dataprofiler.profilers.json_decoder import decode_column_profiler +from dataprofiler.profilers.json_decoder import load_column_profile from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profile_builder import StructuredColProfiler from dataprofiler.profilers.profiler_options import CategoricalOptions @@ -272,7 +272,6 @@ def test_mixed_categorical_col_integer_string(self): self.assertCountEqual(categories, profile.categories) def test_categorical_mapping(self): - df1 = pd.Series( [ "abcd", @@ -843,6 +842,10 @@ def test_json_encode(self): "_stop_condition_is_met": False, "_stopped_at_unique_ratio": None, "_stopped_at_unique_count": None, + "_cms_max_num_heavy_hitters": 5000, + "cms_num_hashes": None, + "cms_num_buckets": None, + "cms": None, }, } ) @@ -868,7 +871,7 @@ def test_json_encode_after_update(self): ) profile = CategoricalColumn(df_categorical.name) - with patch("time.time", side_effect=lambda: 0.0): + with test_utils.mock_timeit(): profile.update(df_categorical) serialized = json.dumps(profile, cls=ProfileEncoder) @@ -880,7 +883,7 @@ def test_json_encode_after_update(self): "col_index": np.nan, 
"sample_size": 12, "metadata": {}, - "times": {"categories": 0.0}, + "times": {"categories": 1.0}, "thread_safe": True, "_categories": {"c": 5, "b": 4, "a": 3}, "_CategoricalColumn__calculations": {}, @@ -890,6 +893,10 @@ def test_json_encode_after_update(self): "_stop_condition_is_met": False, "_stopped_at_unique_ratio": None, "_stopped_at_unique_count": None, + "_cms_max_num_heavy_hitters": 5000, + "cms_num_hashes": None, + "cms_num_buckets": None, + "cms": None, }, } ) @@ -901,7 +908,7 @@ def test_json_decode(self): expected_profile = CategoricalColumn(fake_profile_name) serialized = json.dumps(expected_profile, cls=ProfileEncoder) - deserialized = decode_column_profiler(serialized) + deserialized = load_column_profile(json.loads(serialized)) test_utils.assert_profiles_equal(deserialized, expected_profile) @@ -928,14 +935,187 @@ def test_json_decode_after_update(self): ) expected_profile = CategoricalColumn(fake_profile_name) - with patch("time.time", side_effect=lambda: 0.0): + with test_utils.mock_timeit(): expected_profile.update(df_categorical) serialized = json.dumps(expected_profile, cls=ProfileEncoder) - deserialized = decode_column_profiler(serialized) + deserialized = load_column_profile(json.loads(serialized)) test_utils.assert_profiles_equal(deserialized, expected_profile) + df_categorical = pd.Series( + [ + "a", # add existing + "d", # add new + ] + ) + + # validating update after deserialization + deserialized.update(df_categorical) + + assert deserialized.sample_size == 14 + assert deserialized.categorical_counts == {"c": 5, "b": 4, "a": 4, "d": 1} + + def test_cms_max_num_heavy_hitters(self): + df_categorical = pd.Series(["a"] * 5 + ["b"] * 5 + ["c"] * 10) + + options = CategoricalOptions() + options.cms = True + options.cms_confidence = 0.95 + options.cms_relative_error = 0.01 + options.cms_max_num_heavy_hitters = 2 + + profile = CategoricalColumn("test_name", options) + profile.update(df_categorical) + + self.assertEqual({"c": 10}, profile._categories) + self.assertTrue(profile.sample_size >= 10) + + def test_cms_update_hybrid_batch_stream(self): + dataset = pd.Series(["a"] * 7 + ["b"] * 9 + ["c"] * 14) + dataset1 = pd.Series(["a"] * 9 + ["b"] * 11 + ["c"] * 9 + ["d"] * 1) + + options = CategoricalOptions() + options.cms = True + options.cms_confidence = 0.95 + options.cms_relative_error = 0.01 + options.cms_max_num_heavy_hitters = 3 + + profile = CategoricalColumn("test_name", options) + profile.update(dataset) + + expected_categories = ["c"] + expected_categories_dict = {"c": 14} + + self.assertEqual(profile.sample_size, len(dataset)) + self.assertEqual(profile._categories, expected_categories_dict) + self.assertCountEqual(expected_categories, profile.categories) + + profile.update(dataset1) + expected_categories = ["b", "c"] + expected_categories_dict = {"b": 20, "c": 23} + + self.assertEqual(profile.sample_size, len(dataset) + len(dataset1)) + self.assertEqual(profile._categories, expected_categories_dict) + self.assertCountEqual(expected_categories, profile.categories) + + def test_cms_profile_merge_via_add(self): + + dataset = pd.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9) + dataset1 = pd.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14) + + expected_categories = ["b", "c"] + expected_categories_dict = {"b": 22, "c": 23} + options = CategoricalOptions() + options.cms = True + options.cms_confidence = 0.95 + options.cms_relative_error = 0.01 + options.cms_max_num_heavy_hitters = 3 + + profile1 = CategoricalColumn("test_name", options) + profile1.update(dataset) + + 
expected_categories = ["b"] + expected_categories_dict = {"b": 12} + + self.assertEqual(profile1._categories, expected_categories_dict) + self.assertCountEqual(expected_categories, profile1.categories) + + profile2 = CategoricalColumn("test_name", options) + profile2.update(dataset1) + + expected_categories = ["b", "c"] + expected_categories_dict = {"b": 10, "c": 14} + + self.assertEqual(profile2._categories, expected_categories_dict) + self.assertCountEqual(expected_categories, profile2.categories) + + # Add profiles + profile3 = profile1 + profile2 + + expected_categories = ["b", "c"] + expected_categories_dict = {"b": 22, "c": 23} + + self.assertEqual( + profile3.sample_size, profile1.sample_size + profile2.sample_size + ) + self.assertEqual(profile3._categories, expected_categories_dict) + self.assertCountEqual(expected_categories, profile3.categories) + + def test_cms_profile_min_max_num_heavy_hitters(self): + + dataset = pd.Series(["a"] * 9 + ["b"] * 12 + ["c"] * 9) + dataset1 = pd.Series(["a"] * 6 + ["b"] * 10 + ["c"] * 14) + + options = CategoricalOptions() + options.cms = True + options.cms_confidence = 0.95 + options.cms_relative_error = 0.01 + options.cms_max_num_heavy_hitters = 3 + + profile1 = CategoricalColumn("test_name", options) + profile1.update(dataset) + + options.cms_max_num_heavy_hitters = 10 + profile2 = CategoricalColumn("test_name", options) + profile2.update(dataset1) + + # Add profiles + profile3 = profile1 + profile2 + + self.assertEqual(profile3._cms_max_num_heavy_hitters, 3) + + def test_cms_catch_overwriting_with_missing_dict(self): + + dataset = pd.Series(["b"] * 2 + ["c"] * 14) + dataset1 = pd.Series(["b"] * 5 + ["c"] * 10) + + options = CategoricalOptions() + options.cms = True + options.cms_confidence = 0.95 + options.cms_relative_error = 0.01 + options.cms_max_num_heavy_hitters = 3 + + profile = CategoricalColumn("test_name", options) + profile.update(dataset) + + expected_categories = ["c"] + expected_categories_dict = {"c": 14} + + self.assertEqual(profile.sample_size, len(dataset)) + self.assertEqual(profile._categories, expected_categories_dict) + self.assertCountEqual(expected_categories, profile.categories) + + profile.update(dataset1) + expected_categories = ["c"] + expected_categories_dict = {"c": 24} + + self.assertEqual(profile.sample_size, len(dataset) + len(dataset1)) + self.assertEqual(profile._categories, expected_categories_dict) + self.assertCountEqual(expected_categories, profile.categories) + + def test_cms_vs_full_mismatch_merge(self): + + dataset = pd.Series(["b"] * 2 + ["c"] * 14) + + options = CategoricalOptions() + options.cms = True + options.cms_confidence = 0.95 + options.cms_relative_error = 0.01 + options.cms_max_num_heavy_hitters = 3 + + profile_cms = CategoricalColumn("test_name", options) + profile_cms.update(dataset) + profile = CategoricalColumn("test_name") + profile.update(dataset) + + with self.assertRaisesRegex( + Exception, + "Unable to add two profiles: One is using count min sketch" + "and the other is using full.", + ): + profile3 = profile_cms + profile + class TestCategoricalSentence(unittest.TestCase): def setUp(self): diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py index 4891985b8..957bb694f 100644 --- a/dataprofiler/tests/profilers/test_column_profile_compilers.py +++ b/dataprofiler/tests/profilers/test_column_profile_compilers.py @@ -1,16 +1,23 @@ +import json import unittest from unittest import mock import numpy as 
np import pandas as pd +from dataprofiler.labelers import BaseDataLabeler from dataprofiler.profilers import column_profile_compilers as col_pro_compilers +from dataprofiler.profilers.base_column_profilers import BaseColumnProfiler +from dataprofiler.profilers.json_decoder import load_compiler +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import ( BaseOption, StructuredOptions, UnstructuredOptions, ) +from . import utils as test_utils + class TestBaseProfileCompilerClass(unittest.TestCase): def test_cannot_instantiate(self): @@ -81,6 +88,61 @@ def test_add_profilers(self): self.assertEqual(3, merged_compiler._profiles["test"]) self.assertEqual("compiler1", merged_compiler.name) + @mock.patch.multiple(col_pro_compilers.BaseCompiler, __abstractmethods__=set()) + def test_no_profilers_error(self): + with self.assertRaises(NotImplementedError) as e: + col_pro_compilers.BaseCompiler() + self.assertEqual("Must add profilers.", str(e.exception)) + + @mock.patch.multiple( + col_pro_compilers.BaseCompiler, __abstractmethods__=set(), _profilers="mock" + ) + def test_no_options_error(self): + with self.assertRaisesRegex( + NotImplementedError, "Must set the expected OptionClass." + ): + col_pro_compilers.BaseCompiler() + + def test_update_match_are_abstract(self): + self.assertCountEqual( + {"report"}, col_pro_compilers.BaseCompiler.__abstractmethods__ + ) + + @mock.patch.multiple(BaseColumnProfiler, __abstractmethods__=set()) + def test_json_encode(self): + with mock.patch.multiple( + col_pro_compilers.BaseCompiler, + __abstractmethods__=set(), + _profilers=[BaseColumnProfiler], + _option_class=BaseOption, + ): + profile = col_pro_compilers.BaseCompiler() + + base_column_profiler = BaseColumnProfiler(name="test") + with mock.patch.object( + profile, "_profiles", {"BaseColumn": base_column_profiler} + ): + serialized = json.dumps(profile, cls=ProfileEncoder) + + dict_of_base_column_profiler = json.loads( + json.dumps(base_column_profiler, cls=ProfileEncoder) + ) + expected = json.dumps( + { + "class": "BaseCompiler", + "data": { + "name": None, + "_profiles": { + "BaseColumn": dict_of_base_column_profiler, + }, + }, + } + ) + + self.assertEqual(expected, serialized) + + +class TestColumnPrimitiveTypeProfileCompiler(unittest.TestCase): def test_primitive_compiler_report(self): structured_options = StructuredOptions() data1 = pd.Series(["2.6", "-1.8"]) @@ -307,7 +369,121 @@ def test_disabling_columns_during_primitive_diff(self): expected_diff = {} self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) - def test_compiler_stats_diff(self): + def test_json_encode(self): + + compiler = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler() + + serialized = json.dumps(compiler, cls=ProfileEncoder) + expected = json.dumps( + { + "class": "ColumnPrimitiveTypeProfileCompiler", + "data": { + "name": None, + "_profiles": {}, + }, + } + ) + self.assertEqual(expected, serialized) + + def test_json_encode_after_update(self): + + data = pd.Series(["-2", "-1", "1", "2"], name="test") + with test_utils.mock_timeit(): + compiler = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data) + + with mock.patch.object(compiler._profiles["datetime"], "__dict__", {}): + with mock.patch.object(compiler._profiles["int"], "__dict__", {}): + with mock.patch.object(compiler._profiles["float"], "__dict__", {}): + with mock.patch.object(compiler._profiles["text"], "__dict__", {}): + serialized = json.dumps(compiler, cls=ProfileEncoder) + + # pop the data 
inside primitive column profiler as we just want to make + # sure generally it is serializing, decode will validate true replication + + expected = json.dumps( + { + "class": "ColumnPrimitiveTypeProfileCompiler", + "data": { + "name": "test", + "_profiles": { + "datetime": {"class": "DateTimeColumn", "data": {}}, + "int": {"class": "IntColumn", "data": {}}, + "float": {"class": "FloatColumn", "data": {}}, + "text": {"class": "TextColumn", "data": {}}, + }, + }, + } + ) + + self.assertEqual(expected, serialized) + + def test_json_decode(self): + expected_compiler = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler() + serialized = json.dumps(expected_compiler, cls=ProfileEncoder) + + deserialized = load_compiler(json.loads(serialized)) + + test_utils.assert_profiles_equal(expected_compiler, deserialized) + + def test_json_decode_after_update(self): + + data = pd.Series(["-2", "-1", "1", "2"], name="test") + with test_utils.mock_timeit(): + expected_compiler = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler( + data + ) + + serialized = json.dumps(expected_compiler, cls=ProfileEncoder) + deserialized = load_compiler(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_compiler) + # assert before update + assert ( + deserialized.report().get("statistics", {}).get("mean") + == sum([-2, -1, 1, 2]) / 4 + ) + + df_float = pd.Series( + [ + 4.0, # add existing + 15.0, # add new + ] + ).apply(str) + + # validating update after deserialization with a few small tests + deserialized.update_profile(df_float) + + for profile in deserialized._profiles.values(): + assert profile.sample_size == 6 + assert ( + deserialized.report().get("statistics", {}).get("mean") + == sum([-2, -1, 1, 2, 4, 15]) / 6 + ) + + +class TestColumnStatsProfileCompiler(unittest.TestCase): + def test_column_stats_profile_compiler_report(self): + structured_options = StructuredOptions() + structured_options.category.is_enabled = False + data1 = pd.Series(["2.6", "-1.8", "-2.3"]) + compiler1 = col_pro_compilers.ColumnStatsProfileCompiler( + data1, structured_options + ) + report = compiler1.report(remove_disabled_flag=True) + self.assertNotIn("categorical", report) + self.assertIn("order", report) + + structured_options = StructuredOptions() + structured_options.order.is_enabled = False + data1 = pd.Series(["2.6", "-1.8", "-2.3"]) + compiler1 = col_pro_compilers.ColumnStatsProfileCompiler( + data1, structured_options + ) + report = compiler1.report(remove_disabled_flag=False) + self.assertIn("categorical", report) + self.assertNotIn("order", report) + + def test_column_stats_profile_compiler_stats_diff(self): data1 = pd.Series(["1", "9", "9"]) data2 = pd.Series(["10", "9", "9", "9"]) options = StructuredOptions() @@ -354,11 +530,131 @@ def test_compiler_stats_diff(self): expected_diff = {} self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) - @mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler") - @mock.patch( - "dataprofiler.profilers.data_labeler_column_profile." 
"DataLabelerColumn.update" - ) - def test_compiler_data_labeler_diff(self, *mocked_datalabeler): + def test_json_encode(self): + + compiler = col_pro_compilers.ColumnStatsProfileCompiler() + + serialized = json.dumps(compiler, cls=ProfileEncoder) + expected = json.dumps( + { + "class": "ColumnStatsProfileCompiler", + "data": { + "name": None, + "_profiles": {}, + }, + } + ) + self.assertEqual(expected, serialized) + + def test_json_encode_after_update(self): + + data = pd.Series(["-2", "-1", "1", "2"], name="test") + with test_utils.mock_timeit(): + compiler = col_pro_compilers.ColumnStatsProfileCompiler(data) + + with mock.patch.object( + compiler._profiles["order"], "__dict__", {"an": "order"} + ): + with mock.patch.object( + compiler._profiles["category"], "__dict__", {"this": "category"} + ): + serialized = json.dumps(compiler, cls=ProfileEncoder) + + expected = json.dumps( + { + "class": "ColumnStatsProfileCompiler", + "data": { + "name": "test", + "_profiles": { + "order": {"class": "OrderColumn", "data": {"an": "order"}}, + "category": { + "class": "CategoricalColumn", + "data": {"this": "category"}, + }, + }, + }, + } + ) + + self.assertEqual(expected, serialized) + + def test_json_decode(self): + expected_compiler = col_pro_compilers.ColumnStatsProfileCompiler() + serialized = json.dumps(expected_compiler, cls=ProfileEncoder) + + deserialized = load_compiler(json.loads(serialized)) + + test_utils.assert_profiles_equal(expected_compiler, deserialized) + + def test_json_decode_after_update(self): + + data = pd.Series(["-2", "-1", "1", "15"], name="test") + with test_utils.mock_timeit(): + expected_compiler = col_pro_compilers.ColumnStatsProfileCompiler(data) + + serialized = json.dumps(expected_compiler, cls=ProfileEncoder) + deserialized = load_compiler(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_compiler) + # assert before update + assert deserialized.report().get("order", None) == "ascending" + assert deserialized.report().get("categorical", None) == True + + df_float = pd.Series( + list(range(100)) # make orer random and not categorical + ).apply(str) + + # validating update after deserialization with a few small tests + deserialized.update_profile(df_float) + assert deserialized.report().get("order", None) == "random" + assert deserialized.report().get("categorical", None) == False + + +@mock.patch( + "dataprofiler.profilers.utils.DataLabeler", + spec=BaseDataLabeler, +) +@mock.patch( + "dataprofiler.profilers.data_labeler_column_profile.DataLabeler", + spec=BaseDataLabeler, +) +class TestColumnDataLabelerCompiler(unittest.TestCase): + @staticmethod + def _setup_data_labeler_mock(mock_instance): + mock_DataLabeler = mock_instance.return_value + mock_DataLabeler.label_mapping = {"a": 0, "b": 1} + mock_DataLabeler.reverse_label_mapping = {0: "a", 1: "b"} + mock_DataLabeler.model.num_labels = 2 + mock_DataLabeler.model.requires_zero_mapping = False + mock_DataLabeler._default_model_loc = "structured_model" + + mock_instance.load_from_library.side_effect = mock_instance + + def mock_predict(data, *args, **kwargs): + len_data = len(data) + output = [[1, 0], [0, 1]] * (len_data // 2) + if len_data % 2: + output += [[1, 0]] + conf = np.array(output) + if mock_DataLabeler.model.requires_zero_mapping: + conf = np.concatenate([[[0]] * len_data, conf], axis=1) + pred = np.argmax(conf, axis=1) + return {"pred": pred, "conf": conf} + + mock_DataLabeler.predict.side_effect = mock_predict + + def test_column_data_labeler_compiler_report(self, 
mock_instance, *mocks): + self._setup_data_labeler_mock(mock_instance) + structured_options = StructuredOptions() + data1 = pd.Series(["2.6", "-1.8", "-2.3"]) + compiler1 = col_pro_compilers.ColumnDataLabelerCompiler( + data1, structured_options + ) + report = compiler1.report(remove_disabled_flag=True) + self.assertIn("data_label", report) + self.assertIn("statistics", report) + + def test_compiler_data_labeler_diff(self, *mocks): # Initialize dummy data data = pd.Series([]) @@ -425,25 +721,123 @@ def test_compiler_data_labeler_diff(self, *mocked_datalabeler): expected_diff = {} self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) - @mock.patch.multiple(col_pro_compilers.BaseCompiler, __abstractmethods__=set()) - def test_no_profilers_error(self): - with self.assertRaises(NotImplementedError) as e: - col_pro_compilers.BaseCompiler() - self.assertEqual("Must add profilers.", str(e.exception)) + def test_json_encode(self, *mocks): + compiler = col_pro_compilers.ColumnDataLabelerCompiler() - @mock.patch.multiple( - col_pro_compilers.BaseCompiler, __abstractmethods__=set(), _profilers="mock" - ) - def test_no_options_error(self): - with self.assertRaisesRegex( - NotImplementedError, "Must set the expected OptionClass." + serialized = json.dumps(compiler, cls=ProfileEncoder) + expected = json.dumps( + { + "class": "ColumnDataLabelerCompiler", + "data": { + "name": None, + "_profiles": {}, + }, + } + ) + self.assertEqual(expected, serialized) + + def test_json_decode(self, *mocks): + expected_compiler = col_pro_compilers.ColumnDataLabelerCompiler() + serialized = json.dumps(expected_compiler, cls=ProfileEncoder) + + deserialized = load_compiler(json.loads(serialized)) + + test_utils.assert_profiles_equal(expected_compiler, deserialized) + + def test_json_encode_after_update(self, mock_instance, *mocks): + self._setup_data_labeler_mock(mock_instance) + + data = pd.Series(["-2", "-1", "1", "2"]) + with test_utils.mock_timeit(): + compiler = col_pro_compilers.ColumnDataLabelerCompiler(data) + + with mock.patch.object( + compiler._profiles["data_labeler"], "__dict__", {"data_label": "INTEGER"} ): - col_pro_compilers.BaseCompiler() + serialized = json.dumps(compiler, cls=ProfileEncoder) + + expected = json.dumps( + { + "class": "ColumnDataLabelerCompiler", + "data": { + "name": None, + "_profiles": { + "data_labeler": { + "class": "DataLabelerColumn", + "data": {"data_label": "INTEGER"}, + }, + }, + }, + } + ) - def test_update_match_are_abstract(self): - self.assertCountEqual( - {"report"}, col_pro_compilers.BaseCompiler.__abstractmethods__ + self.assertEqual(expected, serialized) + + def test_json_decode_after_update(self, mock_instance, mock_utils_DataLabeler): + + self._setup_data_labeler_mock(mock_instance) + mock_instance._default_model_loc = "structured_model" + mock_utils_DataLabeler.load_from_library = mock_instance + + data = pd.Series(["2", "-1", "1", "2"], name="test") + with test_utils.mock_timeit(): + expected_compiler = col_pro_compilers.ColumnDataLabelerCompiler(data) + + serialized = json.dumps(expected_compiler, cls=ProfileEncoder) + deserialized = load_compiler(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_compiler) + # assert before update + assert deserialized.report().get("data_label", None) == "a|b" + assert deserialized.report().get("statistics", {}).get( + "data_label_representation", None + ) == {"a": 0.5, "b": 0.5} + + new_data = pd.Series(["100"]) + + # validating update after deserialization with a few small tests + 
deserialized.update_profile(new_data)
+        assert deserialized.report().get("data_label", None) == "a|b"
+        assert deserialized.report().get("statistics", {}).get(
+            "data_label_representation", None
+        ) == {"a": 0.6, "b": 0.4}
+
+    def test_json_decode_with_options(
+        self, mock_DataLabeler_cls, mock_utils_DataLabeler
+    ):
+        self._setup_data_labeler_mock(mock_DataLabeler_cls)
+        mock_DataLabeler_cls._default_model_loc = "structured_model"
+        mock_utils_DataLabeler.load_from_library = mock_DataLabeler_cls
+
+        data = pd.Series(["2", "-1", "1", "2"], name="test")
+        with test_utils.mock_timeit():
+            expected_compiler = col_pro_compilers.ColumnDataLabelerCompiler(data)
+
+        serialized = json.dumps(expected_compiler, cls=ProfileEncoder)
+
+        # create a new labeler to load instead of from_library
+        new_mock_data_labeler = mock.Mock(spec=BaseDataLabeler)
+        new_mock_data_labeler.name = "new fake data labeler"
+        new_mock_data_labeler._default_model_loc = "my/fake/path"
+        options = {
+            "DataLabelerColumn": {
+                "from_library": {"structured_model": new_mock_data_labeler}
+            }
+        }
+
+        mock_DataLabeler_cls.reset_mock()  # set to 0 calls as option should override
+        deserialized = load_compiler(json.loads(serialized), options)
+
+        # ensure the original is unchanged, while the options override the
+        # labeler on the deserialized copy
+        assert (
+            expected_compiler._profiles.get("data_labeler", mock.Mock()).data_labeler
+            == mock_DataLabeler_cls.return_value
+        )
+        assert (
+            deserialized._profiles.get("data_labeler", mock.Mock()).data_labeler
+            == new_mock_data_labeler
+        )
+        mock_DataLabeler_cls.assert_not_called()


 class TestUnstructuredCompiler(unittest.TestCase):
     def setUp(self):
diff --git a/dataprofiler/tests/profilers/test_data_labeler_column_profile.py b/dataprofiler/tests/profilers/test_data_labeler_column_profile.py
index d2f458b8f..5b25f939c 100644
--- a/dataprofiler/tests/profilers/test_data_labeler_column_profile.py
+++ b/dataprofiler/tests/profilers/test_data_labeler_column_profile.py
@@ -1,3 +1,4 @@
+import json
 import unittest
 from collections import defaultdict
 from unittest import mock
@@ -5,12 +6,20 @@
 import numpy as np
 import pandas as pd
 
+from dataprofiler.labelers import BaseDataLabeler
 from dataprofiler.profilers import utils
 from dataprofiler.profilers.data_labeler_column_profile import DataLabelerColumn
+from dataprofiler.profilers.json_decoder import load_column_profile
+from dataprofiler.profilers.json_encoder import ProfileEncoder
 from dataprofiler.profilers.profiler_options import DataLabelerOptions
 
+from . 
import utils as test_utils -@mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler") + +@mock.patch( + "dataprofiler.profilers.data_labeler_column_profile.DataLabeler", + spec=BaseDataLabeler, +) class TestDataLabelerColumnProfiler(unittest.TestCase): @staticmethod def _setup_data_labeler_mock(mock_instance): @@ -19,6 +28,7 @@ def _setup_data_labeler_mock(mock_instance): mock_DataLabeler.reverse_label_mapping = {0: "a", 1: "b"} mock_DataLabeler.model.num_labels = 2 mock_DataLabeler.model.requires_zero_mapping = False + mock_instance.load_from_library.side_effect = mock_instance def mock_predict(data, *args, **kwargs): len_data = len(data) @@ -372,7 +382,6 @@ def test_diff(self, mock_instance): "avg_predictions": {"a": "unchanged", "b": -0.70, "c": 0.70}, "label_representation": {"a": -0.84, "b": "unchanged", "c": 0.84}, } - self.maxDiff = None self.assertDictEqual(expected_diff, diff) def test_empty_data(self, *mocks): @@ -391,3 +400,154 @@ def test_empty_data(self, *mocks): diff_profile = profiler1.diff(profiler2) self.assertIsNone(merge_profile.data_label) + + def test_json_encode(self, mock_instance): + self._setup_data_labeler_mock(mock_instance) + profiler = DataLabelerColumn("") + + # Validates that error is raised if model loc is not set for labeler + profiler.data_labeler._default_model_loc = None + with self.assertRaisesRegex( + ValueError, + "Serialization cannot be done on labelers with _default_model_loc not set", + ): + _ = json.dumps(profiler, cls=ProfileEncoder) + + # Reset the model loc to its initial value + profiler.data_labeler._default_model_loc = "this is a test model loc" + serialized = json.dumps(profiler, cls=ProfileEncoder) + + expected = json.dumps( + { + "class": "DataLabelerColumn", + "data": { + "name": "", + "col_index": float("nan"), + "sample_size": 0, + "metadata": {}, + "times": {}, + "thread_safe": False, + "_max_sample_size": 1000, + "data_labeler": {"from_library": "this is a test model loc"}, + "_reverse_label_mapping": None, + "_possible_data_labels": None, + "_rank_distribution": None, + "_sum_predictions": None, + "_top_k_voting": 1, + "_min_voting_prob": 0.2, + "_min_prob_differential": 0.2, + "_top_k_labels": 3, + "_min_top_label_prob": 0.35, + "_DataLabelerColumn__calculations": {}, + }, + } + ) + + self.assertEqual(serialized, expected) + + def test_json_encode_after_update(self, mock_instance): + self._setup_data_labeler_mock(mock_instance) + data = pd.Series(["1", "2", "3", "4"], dtype=object) + profiler = DataLabelerColumn(data.name) + profiler.data_labeler._default_model_loc = "this is a test model loc" + with test_utils.mock_timeit(): + profiler.update(data) + + serialized = json.dumps(profiler, cls=ProfileEncoder) + expected = json.dumps( + { + "class": "DataLabelerColumn", + "data": { + "name": None, + "col_index": float("nan"), + "sample_size": 4, + "metadata": {}, + "times": {"data_labeler_predict": 1.0}, + "thread_safe": False, + "_max_sample_size": 1000, + "data_labeler": {"from_library": "this is a test model loc"}, + "_reverse_label_mapping": {0: "a", 1: "b"}, + "_possible_data_labels": ["a", "b"], + "_rank_distribution": { + "a": 2, + "b": 2, + }, + "_sum_predictions": [2.0, 2.0], + "_top_k_voting": 1, + "_min_voting_prob": 0.2, + "_min_prob_differential": 0.2, + "_top_k_labels": 3, + "_min_top_label_prob": 0.35, + "_DataLabelerColumn__calculations": {}, + }, + } + ) + + self.assertEqual(expected, serialized) + + @mock.patch("dataprofiler.profilers.utils.DataLabeler", spec=BaseDataLabeler) + def 
test_json_decode(self, mock_utils_DataLabeler, mock_BaseDataLabeler): + self._setup_data_labeler_mock(mock_BaseDataLabeler) + mock_utils_DataLabeler.load_from_library.side_effect = mock_BaseDataLabeler + + data = pd.Series(["1", "2", "3", "4"], dtype=object) + expected = DataLabelerColumn(data.name) + expected.data_labeler._default_model_loc = "structured_model" + serialized = json.dumps(expected, cls=ProfileEncoder) + + deserialized = load_column_profile(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected) + + # test decode with options to override load labeler + # create a new labeler ot load instead of from_library + new_mock_data_labeler = mock.Mock(spec=BaseDataLabeler) + new_mock_data_labeler.name = "new fake data labeler" + new_mock_data_labeler._default_model_loc = "my/fake/path" + config = { + "DataLabelerColumn": { + "from_library": {"structured_model": new_mock_data_labeler} + } + } + + mock_BaseDataLabeler.reset_mock() # set to 0 calls as option should override + mock_utils_DataLabeler.reset_mock() # set to 0 calls as option should override + deserialized = load_column_profile(json.loads(serialized), config) + assert deserialized.data_labeler == new_mock_data_labeler + mock_BaseDataLabeler.assert_not_called() + mock_utils_DataLabeler.assert_not_called() + + # validate raises error when cannot properly load data. + with self.assertRaisesRegex( + NotImplementedError, + "Models intialized from disk have not yet been made deserializable", + ): + class_as_dict = json.loads(serialized) + class_as_dict["data"]["data_labeler"] = {"from_disk": "test"} + deserialized = load_column_profile(class_as_dict, config) + + @mock.patch("dataprofiler.profilers.utils.DataLabeler", spec=BaseDataLabeler) + def test_json_decode_after_update( + self, mock_utils_DataLabeler, mock_BaseDataLabeler + ): + self._setup_data_labeler_mock(mock_BaseDataLabeler) + mock_utils_DataLabeler.load_from_library.side_effect = mock_BaseDataLabeler + + data = pd.Series(["1", "2", "3", "4"], dtype=object) + expected = DataLabelerColumn(data.name) + expected.data_labeler._default_model_loc = "structured_model" + with test_utils.mock_timeit(): + expected.update(data) + + serialized = json.dumps(expected, cls=ProfileEncoder) + deserialized = load_column_profile(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected) + update_data = pd.Series(["4", "5", "6", "7"], dtype=object) + deserialized.update(update_data) + + assert deserialized.sample_size == 8 + self.assertDictEqual({"a": 4, "b": 4}, deserialized.rank_distribution) + np.testing.assert_array_equal( + np.array([4.0, 4.0]), deserialized.sum_predictions + ) diff --git a/dataprofiler/tests/profilers/test_datetime_column_profile.py b/dataprofiler/tests/profilers/test_datetime_column_profile.py index b74ce60e5..c00ac8e0d 100644 --- a/dataprofiler/tests/profilers/test_datetime_column_profile.py +++ b/dataprofiler/tests/profilers/test_datetime_column_profile.py @@ -10,11 +10,11 @@ import pandas as pd from dataprofiler.profilers import DateTimeColumn +from dataprofiler.profilers.json_decoder import load_column_profile from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import DateTimeOptions -from .. import test_utils -from . import utils +from . import utils as test_utils # This is taken from: https://github.com/rlworkgroup/dowel/pull/36/files # undo when cpython#4800 is merged. 
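Note on the serialization tests in these files: they all pin down the same round-trip contract. `ProfileEncoder` dumps a profile to a `{"class": ..., "data": ...}` payload, and `load_column_profile` (or `load_compiler` / `load_profiler` at the higher levels) rebuilds a live, still-updatable object from it. A minimal standalone sketch of that contract, shown here with `DateTimeColumn`; the column name and sample values are arbitrary:

    import json

    import pandas as pd

    from dataprofiler.profilers import DateTimeColumn
    from dataprofiler.profilers.json_decoder import load_column_profile
    from dataprofiler.profilers.json_encoder import ProfileEncoder

    # Encode: the payload is tagged with the class name so the decoder
    # knows which column profiler to rebuild.
    profile = DateTimeColumn("example")
    profile.update(pd.Series(["2013-03-10 15:43:30", "Mar 11, 2013"]))
    serialized = json.dumps(profile, cls=ProfileEncoder)

    # Decode: the result is a live profile, not a frozen report, so it
    # can keep ingesting data, which is what the tests below verify.
    deserialized = load_column_profile(json.loads(serialized))
    deserialized.update(pd.Series(["March 12, 2014"]))
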
@@ -23,7 +23,7 @@ class TestDateTimeColumnProfiler(unittest.TestCase): def setUp(self): - utils.set_seed(seed=0) + test_utils.set_seed(seed=0) @staticmethod def _generate_datetime_data(date_format): @@ -33,7 +33,7 @@ def _generate_datetime_data(date_format): start_date = pd.Timestamp(1950, 7, 14) end_date = pd.Timestamp(2020, 7, 14) - date_sample = utils.generate_random_date_sample( + date_sample = test_utils.generate_random_date_sample( start_date, end_date, [date_format] ) gen_data.append(date_sample) @@ -500,3 +500,46 @@ def test_json_encode_after_update(self): ) self.assertEqual(serialized, expected) + + def test_json_decode(self): + fake_profile_name = None + expected_profile = DateTimeColumn(fake_profile_name) + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = load_column_profile(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_profile) + + def test_json_decode_after_update(self): + fake_profile_name = "Fake profile name" + + data = [2.5, 12.5, "2013-03-10 15:43:30", 5, "03/10/13 15:43", "Mar 11, 2013"] + df = pd.Series(data) + + expected_profile = DateTimeColumn(fake_profile_name) + expected_profile.update(df) + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = load_column_profile(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_profile) + + expected_formats = [ + "%m/%d/%y %H:%M", + "%Y-%m-%d %H:%M:%S", + "%B %d, %Y", + "%Y-%m-%dT%H:%M:%S", + "%Y%m%dT%H%M%S", + "%b %d, %Y", + ] + + data_new = ["2012-02-10T15:43:30", "20120210T154300", "March 12, 2014"] + df_new = pd.Series(data_new) + + # validating update after deserialization + deserialized.update(df_new) + + assert deserialized._dt_obj_min == pd.Timestamp("2012-02-10 15:43:00") + assert deserialized._dt_obj_max == pd.Timestamp("2014-03-12 00:00:00") + + assert set(deserialized.date_formats) == set(expected_formats) diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index 9c7db90ea..86e721a33 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -9,8 +9,12 @@ import pandas as pd from dataprofiler.profilers import FloatColumn +from dataprofiler.profilers.json_decoder import load_column_profile +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import FloatOptions +from . 
import utils as test_utils + test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) @@ -32,7 +36,7 @@ def test_base_case(self): self.assertTrue(profiler.kurtosis is np.nan) self.assertTrue(profiler.stddev is np.nan) self.assertIsNone(profiler.histogram_selection) - self.assertEqual(len(profiler.quantiles), 999) + self.assertIsNone(profiler.quantiles) self.assertIsNone(profiler.data_type_ratio) def test_single_data_variance_case(self): @@ -1734,3 +1738,264 @@ def test_diff(self): str(exc.exception), "Unsupported operand type(s) for diff: 'FloatColumn' and" " 'str'", ) + + def test_json_encode(self): + profiler = FloatColumn("0.0") + + serialized = json.dumps(profiler, cls=ProfileEncoder) + + # Copy of NumericalStatsMixin code to test serialization of dicts + expected_histogram_bin_method_names = [ + "auto", + "fd", + "doane", + "scott", + "rice", + "sturges", + "sqrt", + ] + expected_min_histogram_bin = 1000 + expected_historam_methods = {} + for method in expected_histogram_bin_method_names: + expected_historam_methods[method] = { + "total_loss": 0.0, + "current_loss": 0.0, + "suggested_bin_count": expected_min_histogram_bin, + "histogram": {"bin_counts": None, "bin_edges": None}, + } + + serialized = json.dumps(profiler, cls=ProfileEncoder) + expected = json.dumps( + { + "class": "FloatColumn", + "data": { + "min": None, + "max": None, + "_top_k_modes": 5, + "sum": 0.0, + "_biased_variance": np.nan, + "_biased_skewness": np.nan, + "_biased_kurtosis": np.nan, + "_median_is_enabled": True, + "_median_abs_dev_is_enabled": True, + "max_histogram_bin": 100000, + "min_histogram_bin": expected_min_histogram_bin, + "histogram_bin_method_names": expected_histogram_bin_method_names, + "histogram_selection": None, + "user_set_histogram_bin": None, + "bias_correction": True, + "_mode_is_enabled": True, + "num_zeros": 0, + "num_negatives": 0, + "_num_quantiles": 1000, + "histogram_methods": expected_historam_methods, + "_stored_histogram": { + "total_loss": 0.0, + "current_loss": 0.0, + "suggested_bin_count": 1000, + "histogram": {"bin_counts": None, "bin_edges": None}, + }, + "_batch_history": [], + "quantiles": None, + "_NumericStatsMixin__calculations": { + "min": "_get_min", + "max": "_get_max", + "sum": "_get_sum", + "variance": "_get_variance", + "skewness": "_get_skewness", + "kurtosis": "_get_kurtosis", + "histogram_and_quantiles": "_get_histogram_and_quantiles", + "num_zeros": "_get_num_zeros", + "num_negatives": "_get_num_negatives", + }, + "name": "0.0", + "col_index": np.nan, + "sample_size": 0, + "metadata": dict(), + "times": defaultdict(), + "thread_safe": True, + "match_count": 0, + "_precision": { + "min": None, + "max": None, + "sum": None, + "mean": None, + "biased_var": None, + "sample_size": None, + "confidence_level": 0.999, + }, + "_FloatColumn__z_value_precision": 3.291, + "_FloatColumn__precision_sample_ratio": None, + "_FloatColumn__calculations": {"precision": "_update_precision"}, + }, + } + ) + self.assertEqual(expected, serialized) + + @mock.patch("time.time", return_value=0.0) + def test_json_encode_after_update(self, time): + data = np.array([0.0, 5.0, 10.0]) + df = pd.Series(data).apply(str) + + int_options = FloatOptions() + int_options.histogram_and_quantiles.bin_count_or_method = 5 + profiler = FloatColumn("0.0", int_options) + + mocked_quantiles = [0.25, 0.50, 0.75] + with mock.patch.object( + profiler, "_get_percentile", return_value=mocked_quantiles + ): + # Mock out complex _get_percentile function. 
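+            # (it would otherwise derive quantiles from the stored histogram)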
+ # Only need to test valid serialization of np.ndarry. + profiler.update(df) + + # Copy of NumericalStatsMixin code to test serialization of dicts + expected_histogram_bin_method_names = ["custom"] + expected_min_histogram_bin = 5 + expected_historam_methods = {} + for method in expected_histogram_bin_method_names: + expected_historam_methods[method] = { + "total_loss": 0.0, + "current_loss": 0.0, + "suggested_bin_count": expected_min_histogram_bin, + "histogram": {"bin_counts": None, "bin_edges": None}, + } + serialized = json.dumps(profiler, cls=ProfileEncoder) + + expected = json.dumps( + { + "class": "FloatColumn", + "data": { + "min": 0.0, + "max": 10.0, + "_top_k_modes": 5, + "sum": 15.0, + "_biased_variance": 16.666666666666668, + "_biased_skewness": 0.0, + "_biased_kurtosis": -1.5, + "_median_is_enabled": True, + "_median_abs_dev_is_enabled": True, + "max_histogram_bin": 100000, + "min_histogram_bin": 1000, + "histogram_bin_method_names": expected_histogram_bin_method_names, + "histogram_selection": None, + "user_set_histogram_bin": 5, + "bias_correction": True, + "_mode_is_enabled": True, + "num_zeros": 1, + "num_negatives": 0, + "_num_quantiles": 1000, + "histogram_methods": expected_historam_methods, + "_stored_histogram": { + "total_loss": 2.0, + "current_loss": 2.0, + "suggested_bin_count": 1000, + "histogram": { + "bin_counts": [1, 0, 1, 0, 1], + "bin_edges": [0.0, 2.0, 4.0, 6.0, 8.0, 10.0], + }, + }, + "_batch_history": [ + { + "match_count": 3, + "sample_size": 3, + "min": 0.0, + "max": 10.0, + "sum": 15.0, + "biased_variance": 16.666666666666668, + "mean": 5.0, + "biased_skewness": 0.0, + "biased_kurtosis": -1.5, + "num_zeros": 1, + "num_negatives": 0, + } + ], + "quantiles": [0.25, 0.5, 0.75], + "_NumericStatsMixin__calculations": { + "min": "_get_min", + "max": "_get_max", + "sum": "_get_sum", + "variance": "_get_variance", + "skewness": "_get_skewness", + "kurtosis": "_get_kurtosis", + "histogram_and_quantiles": "_get_histogram_and_quantiles", + "num_zeros": "_get_num_zeros", + "num_negatives": "_get_num_negatives", + }, + "name": "0.0", + "col_index": np.nan, + "sample_size": 3, + "metadata": dict(), + "times": { + "precision": 0.0, + "min": 0.0, + "max": 0.0, + "sum": 0.0, + "variance": 0.0, + "skewness": 0.0, + "kurtosis": 0.0, + "histogram_and_quantiles": 0.0, + "num_zeros": 0.0, + "num_negatives": 0.0, + }, + "thread_safe": True, + "match_count": 3, + "_precision": { + "min": 0.0, + "max": 2.0, + "sum": 3.0, + "mean": 1.0, + "biased_var": 0.6666666666666666, + "sample_size": 3, + "confidence_level": 0.999, + }, + "_FloatColumn__z_value_precision": 3.291, + "_FloatColumn__precision_sample_ratio": None, + "_FloatColumn__calculations": {"precision": "_update_precision"}, + }, + } + ) + + self.assertEqual(expected, serialized) + + def test_json_decode(self): + fake_profile_name = None + expected_profile = FloatColumn(fake_profile_name) + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = load_column_profile(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_profile) + + def test_json_decode_after_update(self): + fake_profile_name = "Fake profile name" + # Actual deserialization + + # Build expected FloatColumn + df_float = pd.Series([-1.5, 2.2, 5.0, 7.0, 4.0, 3.0, 2.0, 0, 0, 9.0]).apply(str) + expected_profile = FloatColumn(fake_profile_name) + + with test_utils.mock_timeit(): + expected_profile.update(df_float) + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = 
load_column_profile(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_profile) + + df_float = pd.Series( + [ + 4.0, # add existing + 15.0, # add new + ] + ).apply(str) + + # validating update after deserialization + deserialized.update(df_float) + + assert deserialized.sample_size == 12 + assert ( + deserialized.mean + == sum([-1.5, 2.2, 5.0, 7.0, 4.0, 3.0, 2.0, 0, 0, 9.0, 4, 15]) / 12 + ) + assert deserialized.max == 15 diff --git a/dataprofiler/tests/profilers/test_graph_profiler.py b/dataprofiler/tests/profilers/test_graph_profiler.py index ee0d7e722..4edaab2b3 100644 --- a/dataprofiler/tests/profilers/test_graph_profiler.py +++ b/dataprofiler/tests/profilers/test_graph_profiler.py @@ -298,7 +298,6 @@ def test_graph_data_object(self): self.assertDictEqual(self.expected_profile, profile.profile) def test_diff(self): - self.maxDiff = None profile_1 = dp.GraphProfiler(self.graph_1) profile_2 = dp.GraphProfiler(self.graph_2) profile_3 = dp.GraphProfiler(self.graph_3) diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index 50ca859cf..01e624d20 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -9,9 +9,12 @@ import pandas as pd from dataprofiler.profilers import IntColumn +from dataprofiler.profilers.json_decoder import load_column_profile from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import IntOptions +from . import utils as test_utils + test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) @@ -33,10 +36,7 @@ def test_base_case(self): self.assertTrue(profiler.kurtosis is np.nan) self.assertTrue(profiler.stddev is np.nan) self.assertIsNone(profiler.histogram_selection) - self.assertDictEqual( - {k: profiler.quantiles.get(k, "fail") for k in (0, 1, 2)}, - {0: None, 1: None, 2: None}, - ) + self.assertIsNone(profiler.quantiles) self.assertIsNone(profiler.data_type_ratio) def test_single_data_variance_case(self): @@ -1117,8 +1117,8 @@ def test_json_encode(self): expected_historam_methods = {} for method in expected_histogram_bin_method_names: expected_historam_methods[method] = { - "total_loss": 0, - "current_loss": 0, + "total_loss": 0.0, + "current_loss": 0.0, "suggested_bin_count": expected_min_histogram_bin, "histogram": {"bin_counts": None, "bin_edges": None}, } @@ -1131,7 +1131,7 @@ def test_json_encode(self): "min": None, "max": None, "_top_k_modes": 5, - "sum": 0, + "sum": 0.0, "_biased_variance": np.nan, "_biased_skewness": np.nan, "_biased_kurtosis": np.nan, @@ -1146,15 +1146,16 @@ def test_json_encode(self): "_mode_is_enabled": True, "num_zeros": 0, "num_negatives": 0, + "_num_quantiles": 1000, "histogram_methods": expected_historam_methods, "_stored_histogram": { - "total_loss": 0, - "current_loss": 0, + "total_loss": 0.0, + "current_loss": 0.0, "suggested_bin_count": 1000, "histogram": {"bin_counts": None, "bin_edges": None}, }, "_batch_history": [], - "quantiles": {bin_num: None for bin_num in range(999)}, + "quantiles": None, "_NumericStatsMixin__calculations": { "min": "_get_min", "max": "_get_max", @@ -1227,10 +1228,11 @@ def test_json_encode_after_update(self, time): "_mode_is_enabled": True, "num_zeros": 1, "num_negatives": 0, + "_num_quantiles": 1000, "histogram_methods": { "custom": { - "total_loss": 0, - "current_loss": 0, + "total_loss": 0.0, + "current_loss": 0.0, "suggested_bin_count": 5, 
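+                        # computed bins live in _stored_histogram; the
+                        # per-method entries keep None placeholders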
"histogram": {"bin_counts": None, "bin_edges": None}, } @@ -1294,3 +1296,47 @@ def test_json_encode_after_update(self, time): ) self.assertEqual(serialized, expected) + + def test_json_decode(self): + fake_profile_name = None + expected_profile = IntColumn(fake_profile_name) + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = load_column_profile(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_profile) + + def test_json_decode_after_update(self): + fake_profile_name = "Fake profile name" + # Actual deserialization + + # Build expected IntColumn + df_int = pd.Series([-1, 2, 5, 7, 4, 3, 2, 0, 0, 9]) + expected_profile = IntColumn(fake_profile_name) + + with test_utils.mock_timeit(): + expected_profile.update(df_int) + + # Validate reporting before deserialization + expected_profile.report() + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = load_column_profile(json.loads(serialized)) + + # Validate reporting after deserialization + deserialized.report() + test_utils.assert_profiles_equal(deserialized, expected_profile) + + df_int = pd.Series( + [ + 4, # add existing + 15, # add new + ] + ) + + # validating update after deserialization + deserialized.update(df_int) + + assert deserialized.sample_size == 12 + assert deserialized.mean == sum([-1, 2, 5, 7, 4, 3, 2, 0, 0, 9, 4, 15]) / 12 + assert deserialized.max == 15 diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py index 6e4299db6..d01a7c382 100644 --- a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py +++ b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py @@ -13,6 +13,8 @@ from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import NumericalOptions +from . 
import utils as test_utils + test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) @@ -396,6 +398,37 @@ def test_timeit(self): ) self.assertEqual(expected, num_profiler.times) + def test_from_dict_helper(self): + fake_profile_name = "Fake profile name" + + # Build expected CategoricalColumn + actual_profile = TestColumn() + expected_profile = TestColumn() + mock_saved_profile = dict( + { + "quantiles": None, + "_stored_histogram": { + "total_loss": np.float64(0.0), + "current_loss": np.float64(0.0), + "suggested_bin_count": 1000, + "histogram": { + "bin_counts": None, + "bin_edges": None, + }, + }, + } + ) + expected_profile._stored_histogram = mock_saved_profile["_stored_histogram"] + expected_profile.quantiles = None + + expected_profile._stored_histogram["histogram"] = { + "bin_counts": None, + "bin_edges": None, + } + actual_profile._reformat_numeric_stats_types_on_serialized_profiles() + + test_utils.assert_profiles_equal(expected_profile, actual_profile) + def test_histogram_bin_error(self): num_profiler = TestColumn() @@ -1166,8 +1199,8 @@ def test_json_encode(self): expected_historam_methods = {} for method in expected_histogram_bin_method_names: expected_historam_methods[method] = { - "total_loss": 0, - "current_loss": 0, + "total_loss": 0.0, + "current_loss": 0.0, "suggested_bin_count": expected_min_histogram_bin, "histogram": {"bin_counts": None, "bin_edges": None}, } @@ -1180,7 +1213,7 @@ def test_json_encode(self): "min": None, "max": None, "_top_k_modes": 5, - "sum": 0, + "sum": 0.0, "_biased_variance": np.nan, "_biased_skewness": np.nan, "_biased_kurtosis": np.nan, @@ -1195,15 +1228,16 @@ def test_json_encode(self): "_mode_is_enabled": True, "num_zeros": 0, "num_negatives": 0, + "_num_quantiles": 1000, "histogram_methods": expected_historam_methods, "_stored_histogram": { - "total_loss": 0, - "current_loss": 0, + "total_loss": 0.0, + "current_loss": 0.0, "suggested_bin_count": 1000, "histogram": {"bin_counts": None, "bin_edges": None}, }, "_batch_history": [], - "quantiles": {bin_num: None for bin_num in range(999)}, + "quantiles": None, "_NumericStatsMixin__calculations": { "min": "_get_min", "max": "_get_max", diff --git a/dataprofiler/tests/profilers/test_order_column_profile.py b/dataprofiler/tests/profilers/test_order_column_profile.py index dd7c7a059..aefb02883 100644 --- a/dataprofiler/tests/profilers/test_order_column_profile.py +++ b/dataprofiler/tests/profilers/test_order_column_profile.py @@ -8,9 +8,11 @@ import pandas as pd from dataprofiler.profilers import OrderColumn +from dataprofiler.profilers.json_decoder import load_column_profile from dataprofiler.profilers.json_encoder import ProfileEncoder from .. import test_utils +from . import utils # This is taken from: https://github.com/rlworkgroup/dowel/pull/36/files # undo when cpython#4800 is merged. 
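The order-profile hunks below also serialize a new `_data_store_type` field, so a decoded `OrderColumn` keeps comparing incoming values as the right type (`str` vs. `float64`). A minimal sketch of the behavior the new decode tests exercise; sample values are arbitrary:

    import json

    import pandas as pd

    from dataprofiler.profilers import OrderColumn
    from dataprofiler.profilers.json_decoder import load_column_profile
    from dataprofiler.profilers.json_encoder import ProfileEncoder

    profile = OrderColumn("example")
    profile.update(pd.Series(["za", "z", "c"]))  # descending so far

    deserialized = load_column_profile(
        json.loads(json.dumps(profile, cls=ProfileEncoder))
    )

    # Updates consistent with the stored order preserve it...
    deserialized.update(pd.Series(["a"]))
    assert deserialized.order == "descending"

    # ...while inconsistent updates degrade it to "random".
    deserialized.update(pd.Series(["zz"]))
    assert deserialized.order == "random"
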
@@ -318,7 +320,7 @@ def test_merge_timing(self): @mock.patch("dataprofiler.profilers.OrderColumn._get_data_order") def test_random_order_prevents_update_from_occuring(self, mock_get_data_order): - mock_get_data_order.return_value = ["random", 1, 2] + mock_get_data_order.return_value = ["random", 1, 2, str] data = ["a", "b", "ab"] df = pd.Series(data).apply(str) @@ -371,6 +373,7 @@ def test_json_encode(self): "order": None, "_last_value": None, "_first_value": None, + "_data_store_type": "float64", "_piecewise": False, "_OrderColumn__calculations": dict(), "name": "0", @@ -400,6 +403,7 @@ def test_json_encode_after_update(self): "order": "descending", "_last_value": "a", "_first_value": "za", + "_data_store_type": "str", "_piecewise": False, "_OrderColumn__calculations": dict(), "name": "0", @@ -413,3 +417,99 @@ def test_json_encode_after_update(self): ) self.assertEqual(serialized, expected) + + def test_json_decode(self): + fake_profile_name = None + expected_profile = OrderColumn(fake_profile_name) + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = load_column_profile(json.loads(serialized)) + + utils.assert_profiles_equal(deserialized, expected_profile) + + def test_json_decode_after_update_str(self): + fake_profile_name = "Fake profile name" + + # Build expected orderColumn + df_order = pd.Series(["za", "z", "c", "c"]) + expected_profile = OrderColumn(fake_profile_name) + + with utils.mock_timeit(): + expected_profile.update(df_order) + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = load_column_profile(json.loads(serialized)) + + utils.assert_profiles_equal(deserialized, expected_profile) + + # Adding data to update that is in descending order + # (consistent with previous data) + df_order = pd.Series( + [ + "c", # add existing + "a", # add new + ] + ) + + # validating update after deserialization + deserialized.update(df_order) + + assert deserialized.sample_size == 6 + assert deserialized._last_value == "a" + assert deserialized.order == expected_profile.order + + # Adding data to update that is in random order + # (not consistent with previous data) + df_order = pd.Series( + [ + "c", # add existing + "zza", # add new + ] + ) + deserialized.update(df_order) + + assert deserialized.sample_size == 8 + assert deserialized._last_value == "zza" + assert deserialized.order == "random" + + def test_json_decode_after_update_num(self): + fake_profile_name = "Fake profile name" + + # Build expected orderColumn + df_order = pd.Series(["1", "4", "6"]) + expected_profile = OrderColumn(fake_profile_name) + + with utils.mock_timeit(): + expected_profile.update(df_order) + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = load_column_profile(json.loads(serialized)) + utils.assert_profiles_equal(deserialized, expected_profile) + + # Adding data to update that is in descending order + # (consistent with previous data) + df_order = pd.Series( + [ + "6", # add existing + "9", # add new + ] + ) + + # validating update after deserialization + with utils.mock_timeit(): + expected_profile.update(df_order) + deserialized.update(df_order) + utils.assert_profiles_equal(expected_profile, deserialized) + + # Adding data to update that is in random order + # (not consistent with previous data) + df_order = pd.Series( + [ + "3", # add existing + "1", # add new + ] + ) + with utils.mock_timeit(): + expected_profile.update(df_order) + deserialized.update(df_order) + utils.assert_profiles_equal(expected_profile, 
deserialized) diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index 3a5505192..7a309e946 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -14,6 +14,7 @@ import dataprofiler import dataprofiler as dp from dataprofiler import StructuredDataLabeler, UnstructuredDataLabeler +from dataprofiler.labelers.base_data_labeler import BaseDataLabeler from dataprofiler.profilers.column_profile_compilers import ( ColumnDataLabelerCompiler, ColumnPrimitiveTypeProfileCompiler, @@ -21,6 +22,11 @@ ) from dataprofiler.profilers.graph_profiler import GraphProfiler from dataprofiler.profilers.helpers.report_helpers import _prepare_report +from dataprofiler.profilers.json_decoder import ( + load_profiler, + load_structured_col_profiler, +) +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profile_builder import ( Profiler, StructuredColProfiler, @@ -39,13 +45,20 @@ test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) -def setup_save_mock_open(mock_open): +def setup_save_mock_bytes_open(mock_open): mock_file = BytesIO() mock_file.close = lambda: None mock_open.side_effect = lambda *args: mock_file return mock_file +def setup_save_mock_string_open(mock_open): + mock_file = StringIO() + mock_file.close = lambda: None + mock_open.side_effect = lambda *args: mock_file + return mock_file + + class TestStructuredProfiler(unittest.TestCase): @classmethod def setUp(cls): @@ -60,8 +73,9 @@ def setUpClass(cls): ) cls.aws_dataset = pd.read_csv(cls.input_file_path) profiler_options = ProfilerOptions() - profiler_options.set({"data_labeler.is_enabled": False}) - + profiler_options.set( + {"data_labeler.is_enabled": False, "multiprocess.is_enabled": False} + ) profiler_options_hll = ProfilerOptions() profiler_options_hll.set( { @@ -75,10 +89,10 @@ def setUpClass(cls): ) @mock.patch( - "dataprofiler.profilers.profile_builder." "ColumnPrimitiveTypeProfileCompiler" + "dataprofiler.profilers.profile_builder.ColumnPrimitiveTypeProfileCompiler" ) - @mock.patch("dataprofiler.profilers.profile_builder." "ColumnStatsProfileCompiler") - @mock.patch("dataprofiler.profilers.profile_builder." "ColumnDataLabelerCompiler") + @mock.patch("dataprofiler.profilers.profile_builder.ColumnStatsProfileCompiler") + @mock.patch("dataprofiler.profilers.profile_builder.ColumnDataLabelerCompiler") @mock.patch( "dataprofiler.profilers.profile_builder.DataLabeler", spec=StructuredDataLabeler ) @@ -99,16 +113,15 @@ def test_bad_input_data(self, *mocks): StructuredProfiler(data) @mock.patch( - "dataprofiler.profilers.profile_builder." "ColumnPrimitiveTypeProfileCompiler" + "dataprofiler.profilers.profile_builder.ColumnPrimitiveTypeProfileCompiler" ) - @mock.patch("dataprofiler.profilers.profile_builder." "ColumnStatsProfileCompiler") - @mock.patch("dataprofiler.profilers.profile_builder." "ColumnDataLabelerCompiler") + @mock.patch("dataprofiler.profilers.profile_builder.ColumnStatsProfileCompiler") + @mock.patch("dataprofiler.profilers.profile_builder.ColumnDataLabelerCompiler") @mock.patch( "dataprofiler.profilers.profile_builder.DataLabeler", spec=StructuredDataLabeler ) @mock.patch( - "dataprofiler.profilers.profile_builder." 
- "StructuredProfiler._update_correlation" + "dataprofiler.profilers.profile_builder.StructuredProfiler._update_correlation" ) def test_list_data(self, *mocks): data = [[1, 1], [None, None], [3, 3], [4, 4], [5, 5], [None, None], [1, 1]] @@ -128,7 +141,7 @@ def test_list_data(self, *mocks): # validates the sample out maintains the same visual data format as the # input. - self.assertListEqual(["5", "1", "1", "3", "4"], profiler.profile[0].sample) + self.assertListEqual(["1", "4", "5", "1", "3"], profiler.profile[0].sample) @mock.patch( "dataprofiler.profilers.profile_builder." "ColumnPrimitiveTypeProfileCompiler" @@ -233,6 +246,8 @@ def test_add_profilers(self, *mocks): self.assertEqual( "", merged_profile.file_type ) + self.assertTrue(merged_profile.options.row_statistics.null_count.is_enabled) + self.assertTrue(merged_profile.options.row_statistics.unique_count.is_enabled) self.assertEqual(2, merged_profile.row_has_null_count) self.assertEqual(2, merged_profile.row_is_null_count) self.assertEqual(7, merged_profile.total_samples) @@ -1458,7 +1473,7 @@ def test_min_true_samples(self, *mocks): profile = dp.StructuredProfiler(empty_df, min_true_samples=10) self.assertEqual(10, profile._min_true_samples) - def test_save_and_load(self): + def test_save_and_load_pkl_file(self): datapth = "dataprofiler/tests/data/" test_files = ["csv/guns.csv", "csv/iris.csv"] @@ -1474,14 +1489,14 @@ def test_save_and_load(self): # Save and Load profile with Mock IO with mock.patch("builtins.open") as m: - mock_file = setup_save_mock_open(m) + mock_file = setup_save_mock_bytes_open(m) save_profile.save() mock_file.seek(0) with mock.patch( - "dataprofiler.profilers.profile_builder." "DataLabeler", + "dataprofiler.profilers.profile_builder.DataLabeler", return_value=data_labeler, ): - load_profile = dp.StructuredProfiler.load("mock.pkl") + load_profile = dp.StructuredProfiler.load("mock.pkl", "pickle") # validate loaded profile has same data labeler class self.assertIsInstance( @@ -1504,6 +1519,55 @@ def test_save_and_load(self): load_report = test_utils.clean_report(load_profile.report()) np.testing.assert_equal(save_report, load_report) + def test_save_and_load_json_file(self): + datapth = "dataprofiler/tests/data/" + test_files = ["csv/guns.csv", "csv/iris.csv"] + + for test_file in test_files: + # Create Data and StructuredProfiler objects + data = dp.Data(os.path.join(datapth, test_file)) + options = ProfilerOptions() + options.set( + { + "correlation.is_enabled": True, + "null_replication_metrics.is_enabled": True, + "multiprocess.is_enabled": False, + } + ) + save_profile = dp.StructuredProfiler(data, options=options) + + # store the expected data_labeler + data_labeler = save_profile.options.data_labeler.data_labeler_object + + # Save and Load profile with Mock IO + with mock.patch("builtins.open") as m: + mock_file = setup_save_mock_string_open(m) + save_profile.save(save_method="json") + mock_file.seek(0) + with mock.patch( + "dataprofiler.profilers.utils.DataLabeler.load_from_library", + return_value=data_labeler, + ): + load_profile = dp.StructuredProfiler.load("mock.json", "JSON") + + # validate loaded profile has same data labeler class + self.assertIsInstance( + load_profile.options.data_labeler.data_labeler_object, + data_labeler.__class__, + ) + + # only checks first columns + # get first column + first_column_profile = load_profile.profile[0] + self.assertIsInstance( + first_column_profile.profiles["data_label_profile"] + ._profiles["data_labeler"] + .data_labeler, + data_labeler.__class__, + ) + 
+ test_utils.assert_profiles_equal(save_profile, load_profile) + def test_save_and_load_no_labeler(self): # Create Data and UnstructuredProfiler objects data = pd.DataFrame([1, 2, 3], columns=["a"]) @@ -1515,12 +1579,12 @@ def test_save_and_load_no_labeler(self): # Save and Load profile with Mock IO with mock.patch("builtins.open") as m: - mock_file = setup_save_mock_open(m) + mock_file = setup_save_mock_bytes_open(m) save_profile.save() mock_file.seek(0) - with mock.patch("dataprofiler.profilers.profile_builder." "DataLabeler"): - load_profile = dp.StructuredProfiler.load("mock.pkl") + with mock.patch("dataprofiler.profilers.profile_builder.DataLabeler"): + load_profile = dp.StructuredProfiler.load("mock.pkl", "pickle") # Check that reports are equivalent save_report = test_utils.clean_report(save_profile.report()) @@ -1531,6 +1595,130 @@ def test_save_and_load_no_labeler(self): save_profile.update_profile(pd.DataFrame({"a": [4, 5]})) load_profile.update_profile(pd.DataFrame({"a": [4, 5]})) + @mock.patch( + "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn.update" + ) + @mock.patch( + "dataprofiler.profilers.profile_builder.DataLabeler", + spec=BaseDataLabeler, + ) + def test_save_json_file(self, *mocks): + mock_labeler = mocks[0].return_value + mock_labeler._default_model_loc = "structured_model" + mocks[0].load_from_library.return_value = mock_labeler + + df_structured = pd.DataFrame( + [ + [-1.5, 3.0, "nan"], + ["a", "z"], + ] + ).T + + profile_options = dp.ProfilerOptions() + profile_options.set( + { + "correlation.is_enabled": True, + "null_replication_metrics.is_enabled": True, + "multiprocess.is_enabled": False, + } + ) + + # Create Data and StructuredProfiler objects + with test_utils.mock_timeit(): + save_profile = dp.StructuredProfiler(df_structured, options=profile_options) + + # Save and Load profile with Mock IO + with mock.patch("builtins.open") as mock_open, mock.patch( + "dataprofiler.profilers.profile_builder.datetime" + ) as mock_pb_datetime: + mock_pb_datetime.now().strftime.return_value = "now" + mock_file = setup_save_mock_string_open(mock_open) + save_profile.save("output/mock.json", "JSON") + mock_file.seek(0) + + expected_first_path = "output/mock.json" + expected_data = { + "class": "StructuredProfiler", + "data": { + "_profile": [ + mock.ANY, + mock.ANY, + ], + "options": mock.ANY, + "encoding": None, + "file_type": "", + "_samples_per_update": None, + "_min_true_samples": 0, + "total_samples": 3, + "times": {"correlation": 1.0, "row_stats": 1.0}, + "_sampling_ratio": 0.2, + "_min_sample_size": 5000, + "row_has_null_count": 1, + "row_is_null_count": 1, + "_col_name_to_idx": {"0": [0], "1": [1]}, + "correlation_matrix": mock.ANY, + "chi2_matrix": mock.ANY, + "hashed_row_object": { + "3389675549807214348": True, + "3478012351066866062": True, + "5121271752956874941": True, + }, + "_null_replication_metrics": mock.ANY, + }, + } + + actual_data = json.loads(mock_file.read()) + + mock_open.assert_called_with(expected_first_path, "w") + self.assertDictEqual(expected_data, actual_data) + + # do a second call without a specified file path + with mock.patch("builtins.open") as mock_open, mock.patch( + "dataprofiler.profilers.profile_builder.datetime" + ) as mock_pb_datetime: + mock_pb_datetime.now().strftime.return_value = "now" + setup_save_mock_string_open(mock_open) + save_profile.save(save_method="json") + + expected_second_path = "profile-now.json" + + mock_open.assert_called_with(expected_second_path, "w") + + @mock.patch( + 
"dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn.update" + ) + @mock.patch( + "dataprofiler.profilers.profile_builder.DataLabeler", + spec=BaseDataLabeler, + ) + def test_save_value_error(self, *mocks): + mock_labeler = mocks[0].return_value + mock_labeler._default_model_loc = "structured_model" + mocks[0].load_from_library.return_value = mock_labeler + + df_structured = pd.DataFrame( + [ + [-1.5, 3.0, "nan"], + ["a", "z"], + ] + ).T + + profile_options = dp.ProfilerOptions() + profile_options.set( + { + "correlation.is_enabled": True, + "null_replication_metrics.is_enabled": True, + "multiprocess.is_enabled": False, + } + ) + save_profile = dp.StructuredProfiler(df_structured, options=profile_options) + + # Save and Load profile with Mock IO + with self.assertRaisesRegex( + ValueError, 'save_method must be "json" or "pickle".' + ): + save_profile.save(save_method="csv") + @mock.patch( "dataprofiler.profilers.profile_builder." "ColumnPrimitiveTypeProfileCompiler" ) @@ -1725,7 +1913,7 @@ def test_get_and_validate_schema_mapping(self): self.assertDictEqual(actual_schema, expected_schema) @mock.patch( - "dataprofiler.profilers.data_labeler_column_profile." "DataLabelerColumn.update" + "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn.update" ) @mock.patch("dataprofiler.profilers.profile_builder.DataLabeler") @mock.patch( @@ -1850,7 +2038,7 @@ def test_diff(self, *mocks): @mock.patch("dataprofiler.profilers.profile_builder.DataLabeler") @mock.patch( - "dataprofiler.profilers.data_labeler_column_profile." "DataLabelerColumn.update" + "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn.update" ) def test_diff_type_checking(self, *mocks): data = pd.DataFrame([[1, 2], [5, 6]], columns=["a", "b"]) @@ -1924,7 +2112,7 @@ def test_diff_categorical_chi2_test(self, *mocks): ) @mock.patch( - "dataprofiler.profilers.data_labeler_column_profile." 
"DataLabelerColumn.update" + "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn.update" ) @mock.patch("dataprofiler.profilers.profile_builder.DataLabeler") @mock.patch( @@ -2111,6 +2299,30 @@ def test_null_replication_metrics_calculation(self): np.testing.assert_array_almost_equal([[20], [0]], column["class_sum"]) np.testing.assert_array_almost_equal([[10], [0]], column["class_mean"]) + # account for datetime + data = pd.DataFrame( + { + "a": [3, 2, np.nan, 7, None], + "b": [10, 10, 1, 4, 2], + "c": ["2/2/2021", "2/5/2021", "2/9/2021", "2/21/2021", None], + } + ) + profiler = dp.StructuredProfiler(data, options=profile_options) + expected_null_rep = { + 0: { + "class_prior": [0.6, 0.4], + "class_sum": [[24.0, np.nan], [3.0, 0.0]], + "class_mean": [[8.0, np.nan], [1.5, 0.0]], + }, + # 1: has not values bc none to replicate 100% real + 2: { + "class_prior": [0.8, 0.2], + "class_sum": [[12.0, 25.0], [0.0, 2.0]], + "class_mean": [[3.0, 6.25], [0.0, 2.0]], + }, + } + np.testing.assert_equal(expected_null_rep, profiler._null_replication_metrics) + def test_column_level_invalid_values(self): data = pd.DataFrame([[1, 1], [9999999, 2], [3, 3]]) @@ -2146,6 +2358,215 @@ def test_column_level_invalid_values(self): ["1", "2"], sorted(report["data_stats"][1]["samples"]) ) + @mock.patch( + "dataprofiler.profilers.data_labeler_column_profile.DataLabeler", + spec=BaseDataLabeler, + ) + @mock.patch( + "dataprofiler.profilers.profile_builder.DataLabeler", spec=BaseDataLabeler + ) + def test_json_encode(self, mock_DataLabeler, *mocks): + fake_profile_name = None + mock_DataLabeler._default_model_loc = "test" + mock_DataLabeler.return_value = mock_DataLabeler + + with test_utils.mock_timeit(): + profile = StructuredProfiler(fake_profile_name) + + serialized = json.dumps(profile, cls=ProfileEncoder) + expected = { + "class": "StructuredProfiler", + "data": { + "_profile": [], + "options": mock.ANY, + "encoding": None, + "file_type": None, + "_samples_per_update": None, + "_min_true_samples": 0, + "total_samples": 0, + "times": {}, + "_sampling_ratio": 0.2, + "_min_sample_size": 5000, + "row_has_null_count": 0, + "row_is_null_count": 0, + "hashed_row_object": {}, + "_col_name_to_idx": {}, + "correlation_matrix": None, + "chi2_matrix": None, + "_null_replication_metrics": None, + }, + } + + serialized_dict = json.loads(serialized) + + self.assertDictEqual(expected, serialized_dict) + + @mock.patch( + "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn.update" + ) + @mock.patch( + "dataprofiler.profilers.profile_builder.DataLabeler", spec=BaseDataLabeler + ) + def test_json_encode_after_update(self, mock_DataLabeler, *mocks): + mock_DataLabeler._default_model_loc = "test" + mock_DataLabeler.return_value = mock_DataLabeler + df_structured = pd.DataFrame( + [ + [-1.5, 3.0, "nan"], + ["a", "z"], + ] + ).T + with test_utils.mock_timeit(): + profile = StructuredProfiler(df_structured) + + serialized = json.dumps(profile, cls=ProfileEncoder) + + expected = { + "class": "StructuredProfiler", + "data": { + "_profile": [mock.ANY, mock.ANY], + "options": mock.ANY, + "encoding": None, + "file_type": "", + "_samples_per_update": None, + "_min_true_samples": 0, + "total_samples": 3, + "times": {"row_stats": 1.0}, + "_sampling_ratio": 0.2, + "_min_sample_size": 5000, + "row_has_null_count": 1, + "row_is_null_count": 1, + "_col_name_to_idx": {"0": [0], "1": [1]}, + "correlation_matrix": None, + "chi2_matrix": [[1.0, 0.26146412994911117], [0.26146412994911117, 1.0]], + 
"_null_replication_metrics": None, + }, + } + + serialized_dict = json.loads(serialized) + + # Checks for specific dict values + # Chi2 due to floating point + serialized_chi2 = serialized_dict["data"].pop("chi2_matrix") + expected_chi2 = expected["data"].pop("chi2_matrix") + np.testing.assert_array_almost_equal(expected_chi2, serialized_chi2) + + # hashed_row_object due to specificity of values + serialized_hashed_row_object = serialized_dict["data"].pop("hashed_row_object") + self.assertEqual(3, len(serialized_hashed_row_object.keys())) + + self.assertDictEqual(expected, serialized_dict) + + @mock.patch( + "dataprofiler.profilers.profile_builder.DataLabeler", + spec=BaseDataLabeler, + ) + @mock.patch( + "dataprofiler.profilers.utils.DataLabeler", + spec=BaseDataLabeler, + ) + def test_json_decode(self, mock_utils_DataLabeler, mock_DataLabeler, *mocks): + mock_labeler = mock.Mock(spec=BaseDataLabeler) + mock_labeler._default_model_loc = "test" + mock_labeler.return_value = mock_labeler + mock_DataLabeler.load_from_library = mock_labeler + mock_utils_DataLabeler.load_from_library = mock_labeler + mock_DataLabeler.return_value = mock_labeler + + fake_profile_name = None + profile_options = dp.ProfilerOptions() + profile_options.set( + { + "correlation.is_enabled": True, + "null_replication_metrics.is_enabled": True, + "multiprocess.is_enabled": False, + } + ) + expected_profile = StructuredProfiler( + fake_profile_name, options=profile_options + ) + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = load_profiler(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_profile) + + @mock.patch( + "dataprofiler.profilers.profile_builder.DataLabeler", + spec=BaseDataLabeler, + ) + @mock.patch( + "dataprofiler.profilers.utils.DataLabeler", + spec=BaseDataLabeler, + ) + def test_json_decode_after_update( + self, mock_utils_DataLabeler, mock_DataLabeler, *mocks + ): + mock_labeler = mock.Mock(spec=BaseDataLabeler) + mock_labeler._default_model_loc = "test" + mock_labeler.return_value = mock_labeler + mock_DataLabeler.load_from_library = mock_labeler + mock_utils_DataLabeler.load_from_library = mock_labeler + mock_DataLabeler.return_value = mock_labeler + mock_labeler._default_model_loc = "structured_model" + mock_labeler.model.num_labels = 2 + mock_labeler.reverse_label_mapping = {1: "a", 2: "b"} + + fake_profile_name = None + df_structured = pd.DataFrame([["1.5", "a", "4"], ["3.0", "z", 7]]) + + # update mock for 2 confidence values for 2 possible classes + mock_labeler.predict.side_effect = lambda *args, **kwargs: { + "pred": [], + "conf": [[1, 1], [0, 0]], + } + profile_options = dp.ProfilerOptions() + profile_options.set( + { + "correlation.is_enabled": True, + "null_replication_metrics.is_enabled": True, + "multiprocess.is_enabled": False, + } + ) + expected_profile = StructuredProfiler( + fake_profile_name, options=profile_options + ) + + with test_utils.mock_timeit(): + expected_profile.update_profile(df_structured) + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = load_profiler(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_profile) + + # validate passes one labeler all the way. 
+ config = {} + mock_DataLabeler.load_from_library.reset_mock() + deserialized = load_profiler(json.loads(serialized), config) + mock_DataLabeler.load_from_library.assert_called_once() + + expected_config = { + "DataLabelerColumn": {"from_library": {"structured_model": mock_labeler}}, + "DataLabelerOptions": {"from_library": {"structured_model": mock_labeler}}, + } + self.assertDictEqual(expected_config, config) + + # validating update after deserialization + df_structured = pd.DataFrame( + [ + [4.0, "nan", "15.0"], # partial nan row + ["nan", "nan", "nan"], # Full nan row + ["1.5", "a", "4"], # Repeated from previous update + ] + ) + + with test_utils.mock_timeit(): + deserialized.update_profile(df_structured) + expected_profile.update_profile(df_structured) + + test_utils.assert_profiles_equal(deserialized, expected_profile) + class TestStructuredColProfilerClass(unittest.TestCase): def setUp(self): @@ -2202,19 +2623,9 @@ def test_base_props(self): self.assertEqual(2999 * 3, src_profile.sample_size) @mock.patch( - "dataprofiler.profilers.column_profile_compilers." - "ColumnPrimitiveTypeProfileCompiler" - ) - @mock.patch( - "dataprofiler.profilers.column_profile_compilers." "ColumnStatsProfileCompiler" - ) - @mock.patch( - "dataprofiler.profilers.column_profile_compilers." "ColumnDataLabelerCompiler" - ) - @mock.patch( - "dataprofiler.profilers.profile_builder." - "StructuredProfiler._update_correlation" + "dataprofiler.profilers.column_profile_compilers.BaseCompiler.update_profile" ) + @mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler") def test_add_profilers(self, *mocks): data = pd.Series([1, None, 3, 4, 5, None]) profile1 = StructuredColProfiler(data[:2]) @@ -2319,7 +2730,7 @@ def test_clean_data_and_get_base_stats(self, *mocks): self.assertTrue(np.issubdtype(np.object_, df_series.dtype)) self.assertDictEqual( { - "sample": ["4.0", "6.0", "3.0"], + "sample": ["6.0", "3.0", "4.0"], "sample_size": 5, "null_count": 2, "null_types": dict(nan=["e", "b"]), @@ -2336,7 +2747,7 @@ def test_clean_data_and_get_base_stats(self, *mocks): ) self.assertDictEqual( { - "sample": ["nan", "6.0", "4.0", "nan"], + "sample": ["6.0", "nan", "nan", "4.0"], "sample_size": 6, "null_count": 2, "null_types": {"1.0": ["a"], "3.0": ["c"]}, @@ -2353,7 +2764,7 @@ def test_clean_data_and_get_base_stats(self, *mocks): ) self.assertDictEqual( { - "sample": ["3.0", "4.0", "6.0", "nan", "1.0"], + "sample": ["3.0", "4.0", "nan", "6.0", "nan"], "sample_size": 6, "null_count": 0, "null_types": {}, @@ -2506,11 +2917,9 @@ def test_sampling_ratio_passed_to_profile(self): self.assertEqual(10000, profiler.report()["global_stats"]["samples_used"]) @mock.patch( - "dataprofiler.profilers.profile_builder." "ColumnPrimitiveTypeProfileCompiler" + "dataprofiler.profilers.column_profile_compilers.BaseCompiler.update_profile" ) - @mock.patch("dataprofiler.profilers.profile_builder." "ColumnStatsProfileCompiler") - @mock.patch("dataprofiler.profilers.profile_builder." "ColumnDataLabelerCompiler") - @mock.patch("dataprofiler.profilers.profile_builder.DataLabeler") + @mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler") def test_index_overlap_for_update_profile(self, *mocks): data = pd.Series([0, None, 1, 2, None]) profile = StructuredColProfiler(data) @@ -2525,11 +2934,9 @@ def test_index_overlap_for_update_profile(self, *mocks): self.assertDictEqual(profile.null_types_index, {"nan": {1, 4, 6, 9}}) @mock.patch( - "dataprofiler.profilers.profile_builder." 
"ColumnPrimitiveTypeProfileCompiler" + "dataprofiler.profilers.column_profile_compilers.BaseCompiler.update_profile" ) - @mock.patch("dataprofiler.profilers.profile_builder." "ColumnStatsProfileCompiler") - @mock.patch("dataprofiler.profilers.profile_builder." "ColumnDataLabelerCompiler") - @mock.patch("dataprofiler.profilers.profile_builder.DataLabeler") + @mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler") def test_index_overlap_for_merge(self, *mocks): data = pd.Series([0, None, 1, 2, None]) profile1 = StructuredColProfiler(data) @@ -2550,11 +2957,9 @@ def test_index_overlap_for_merge(self, *mocks): self.assertDictEqual(profile2.null_types_index, {"nan": {1, 4}}) @mock.patch( - "dataprofiler.profilers.profile_builder." "ColumnPrimitiveTypeProfileCompiler" + "dataprofiler.profilers.column_profile_compilers.BaseCompiler.update_profile" ) - @mock.patch("dataprofiler.profilers.profile_builder." "ColumnStatsProfileCompiler") - @mock.patch("dataprofiler.profilers.profile_builder." "ColumnDataLabelerCompiler") - @mock.patch("dataprofiler.profilers.profile_builder.DataLabeler") + @mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler") def test_min_max_id_properly_update(self, *mocks): data = pd.Series([1, None, 3, 4, 5, None, 1]) profile1 = StructuredColProfiler(data[:2]) @@ -2577,9 +2982,12 @@ def test_min_max_id_properly_update(self, *mocks): self.assertEqual(0, profile._min_id) self.assertEqual(6, profile._max_id) - @mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler") @mock.patch( - "dataprofiler.profilers.data_labeler_column_profile." "DataLabelerColumn.update" + "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn.update" + ) + @mock.patch( + "dataprofiler.profilers.data_labeler_column_profile.DataLabeler", + spec=BaseDataLabeler, ) @mock.patch( "dataprofiler.profilers.column_profile_compilers." 
@@ -2644,6 +3052,205 @@ def test_diff(self, *mocks): self.assertDictEqual(expected_diff, dict(profile1.diff(profile2))) + @mock.patch( + "dataprofiler.profilers.data_labeler_column_profile.DataLabeler", + spec=BaseDataLabeler, + ) + def test_json_encode(self, mocked_datalabeler, *mocks): + col_profiler = StructuredColProfiler() + + serialized = json.dumps(col_profiler, cls=ProfileEncoder) + expected = json.dumps( + { + "class": "StructuredColProfiler", + "data": { + "name": None, + "options": None, + "_min_sample_size": 5000, + "_sampling_ratio": 0.2, + "_min_true_samples": 0, + "sample_size": 0, + "sample": [], + "null_count": 0, + "null_types": [], + "null_types_index": {}, + "_min_id": None, + "_max_id": None, + "_index_shift": None, + "_last_batch_size": None, + "profiles": {}, + "_null_values": { + "": 0, + "nan": 2, + "none": 2, + "null": 2, + " *": 0, + "--*": 0, + "__*": 0, + }, + }, + } + ) + self.assertEqual(expected, serialized) + + @mock.patch( + "dataprofiler.profilers.data_labeler_column_profile.DataLabeler", + spec=BaseDataLabeler, + ) + def test_json_encode_after_update(self, mock_DataLabeler, *mocks): + mock_labeler = mock_DataLabeler.return_value + mock_labeler._default_model_loc = "test" + mock_labeler.model.num_labels = 2 + mock_labeler.reverse_label_mapping = {1: "a", 2: "b"} + mock_DataLabeler.load_from_library.return_value = mock_labeler + + data = pd.Series(["-2", "Nan", "1", "2"], name="test") + # update mock for 4 values + mock_labeler.predict.return_value = {"pred": [], "conf": np.zeros((4, 2))} + with test_utils.mock_timeit(): + col_profiler = StructuredColProfiler(data) + + serialized = json.dumps(col_profiler, cls=ProfileEncoder) + + expected = { + "class": "StructuredColProfiler", + "data": { + "name": "test", + "options": mock.ANY, + "_min_sample_size": 5000, + "_sampling_ratio": 0.2, + "_min_true_samples": 0, + "sample_size": 4, + "sample": ["2", "-2", "1"], + "null_count": 1, + "null_types": ["Nan"], + "null_types_index": { + "Nan": [ + 1, + ] + }, + "_min_id": 0, + "_max_id": 3, + "_index_shift": None, + "_last_batch_size": 4, + "_null_values": { + "": 0, + "nan": 2, + "none": 2, + "null": 2, + " *": 0, + "--*": 0, + "__*": 0, + }, + "profiles": { + "data_type_profile": { + "class": "ColumnPrimitiveTypeProfileCompiler", + "data": mock.ANY, + }, + "data_stats_profile": { + "class": "ColumnStatsProfileCompiler", + "data": mock.ANY, + }, + "data_label_profile": { + "class": "ColumnDataLabelerCompiler", + "data": mock.ANY, + }, + }, + }, + } + + self.assertDictEqual(expected, json.loads(serialized)) + + @mock.patch( + "dataprofiler.profilers.data_labeler_column_profile.DataLabeler", + spec=BaseDataLabeler, + ) + @mock.patch( + "dataprofiler.profilers.utils.DataLabeler", + spec=BaseDataLabeler, + ) + def test_json_decode(self, mock_utils_DataLabeler, mock_DataLabeler, *mocks): + mock_labeler = mock.Mock(spec=BaseDataLabeler) + mock_labeler._default_model_loc = "test" + mock_DataLabeler.load_from_library = mock_labeler + + fake_profile_name = None + expected_profile = StructuredColProfiler(fake_profile_name) + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = load_structured_col_profiler(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_profile) + + @mock.patch( + "dataprofiler.profilers.data_labeler_column_profile.DataLabeler", + spec=BaseDataLabeler, + ) + @mock.patch( + "dataprofiler.profilers.utils.DataLabeler", + spec=BaseDataLabeler, + ) + def test_json_decode_after_update( + self, 
mock_utils_DataLabeler, mock_DataLabeler, *mocks + ): + mock_labeler = mock_DataLabeler.return_value + mock_labeler._default_model_loc = "test" + mock_labeler.model.num_labels = 2 + mock_labeler.reverse_label_mapping = {1: "a", 2: "b"} + mock_DataLabeler.load_from_library.return_value = mock_labeler + mock_utils_DataLabeler.load_from_library.return_value = mock_labeler + + # Build expected StructuredColProfiler + df_float = pd.Series([-1.5, None, 5.0, 7.0, 4.0, 3.0, "NaN", 0, 0, 9.0]).apply( + str + ) + # update mock for 10 values + mock_labeler.predict.return_value = {"pred": [], "conf": np.zeros((10, 2))} + + expected_profile = StructuredColProfiler(df_float) + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = load_structured_col_profiler(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_profile) + assert deserialized.null_count == 2 + assert deserialized.null_types_index == { + "None": { + 1, + }, + "NaN": { + 6, + }, + } + + df_float = pd.Series( + [ + "NaN", # add existing + "15.0", # add new + "null", # add new + ] + ) + # update mock for 2 Values + mock_labeler.predict.return_value = {"pred": [], "conf": [[1, 1], [0, 0]]} + + # validating update after deserialization + deserialized.update_profile(df_float) + + assert deserialized.sample_size == 13 + assert deserialized.null_count == 4 + assert deserialized.null_types_index == { + "None": { + 1, + }, + "NaN": {6, 10}, + "null": { + 12, + }, + } + assert deserialized.profile["data_label"] == "a" + assert deserialized.profile["statistics"]["max"] == 15 + assert deserialized.profile["statistics"]["min"] == -1.5 + @mock.patch( "dataprofiler.profilers.profile_builder.UnstructuredCompiler", @@ -2945,6 +3552,45 @@ def test_min_true_samples(self, *mocks): profile = dp.UnstructuredProfiler(empty_df, min_true_samples=10) self.assertEqual(10, profile._min_true_samples) + def test_encode(self, *mocks): + profiler = UnstructuredProfiler(None) + with self.assertRaisesRegex( + NotImplementedError, "UnstructuredProfiler serialization not supported." + ): + json.dumps(profiler, cls=ProfileEncoder) + + def test_decode(self, *mocks): + with self.assertRaisesRegex( + ValueError, "Invalid profiler class UnstructuredProfiler failed to load." + ): + load_profiler({"class": "UnstructuredProfiler", "data": {}}) + + def test_load_from_dict(self, *mocks): + with self.assertRaisesRegex( + NotImplementedError, "UnstructuredProfiler deserialization not supported." + ): + UnstructuredProfiler.load_from_dict({}, None) + + @mock.patch("builtins.open") + def test_save_json_file(self, *mocks): + data = pd.Series(["this", "is my", "\n\r", "test"]) + save_profile = UnstructuredProfiler(data) + + with self.assertRaisesRegex( + NotImplementedError, "UnstructuredProfiler serialization not supported." + ): + save_profile.save(save_method="json") + + def test_save_value_error(self, *mocks): + data = pd.Series(["this", "is my", "\n\r", "test"]) + save_profile = UnstructuredProfiler(data) + + # Save and Load profile with Mock IO + with self.assertRaisesRegex( + ValueError, 'save_method must be "json" or "pickle".' + ): + save_profile.save(save_method="csv") + class TestUnstructuredProfilerWData(unittest.TestCase): @classmethod @@ -2954,7 +3600,7 @@ def setUp(cls): @classmethod def setUpClass(cls): test_utils.set_seed(0) - cls.maxDiff = None + cls.input_data = [ "edited 9 hours ago", "6. 
Do not duplicate code.", @@ -2992,7 +3638,6 @@ def setUpClass(cls): cls.report = cls.profiler.report() def test_sample(self): - self.maxDiff = None self.assertCountEqual( [ "Report", @@ -3399,7 +4044,7 @@ def test_report_remove_disabled_flag(self): self.assertIn("vocab", report["data_stats"]["statistics"]) self.assertIn("words", report["data_stats"]["statistics"]) - def test_save_and_load(self): + def test_save_and_load_pkl_file(self): data_folder = "dataprofiler/tests/data/" test_files = ["txt/code.txt", "txt/sentence-10x.txt"] @@ -3418,7 +4063,7 @@ def test_save_and_load(self): # Save and Load profile with Mock IO with mock.patch("builtins.open") as m: - mock_file = setup_save_mock_open(m) + mock_file = setup_save_mock_bytes_open(m) save_profile.save() # make sure data_labeler unchanged @@ -3473,7 +4118,7 @@ def test_save_and_load_no_labeler(self): # Save and Load profile with Mock IO with mock.patch("builtins.open") as m: - mock_file = setup_save_mock_open(m) + mock_file = setup_save_mock_bytes_open(m) save_profile.save() mock_file.seek(0) @@ -3555,6 +4200,57 @@ def setUpClass(cls): cls.data, len(cls.data), options=profiler_options_full ) + def test_adding_profiles_of_mismatched_null_count_options(self): + profiler_options_null_count = ProfilerOptions() + profiler_options_null_count.set( + { + "*.is_enabled": False, + "row_statistics.*.is_enabled": True, + "row_statistics.null_count.is_enabled": True, + } + ) + profiler_options_null_disabled = ProfilerOptions() + profiler_options_null_disabled.set( + { + "*.is_enabled": False, + "row_statistics.*.is_enabled": True, + "row_statistics.null_count.is_enabled": False, + } + ) + data = pd.DataFrame([1, None, 3, 4, 5, None, 1]) + with test_utils.mock_timeit(): + profiler_w_null_count = dp.StructuredProfiler( + data[:2], options=profiler_options_null_count + ) + profiler_w_disabled_null_count = dp.StructuredProfiler( + data[2:], options=profiler_options_null_disabled + ) + + with self.assertRaisesRegex( + ValueError, + "Attempting to merge two profiles with null row " + "count option enabled on one profile but not the other.", + ): + profiler_w_null_count + profiler_w_disabled_null_count + + def test_profile_null_count_not_enabled(self): + profiler_options_null_disabled = ProfilerOptions() + profiler_options_null_disabled.set( + { + "*.is_enabled": False, + "row_statistics.*.is_enabled": True, + "row_statistics.null_count.is_enabled": False, + } + ) + data = pd.DataFrame([1, None, 3, 4, 5, None, 1]) + with test_utils.mock_timeit(): + profiler_w_disabled_null_count = dp.StructuredProfiler( + data[2:], options=profiler_options_null_disabled + ) + + self.assertEqual(0, profiler_w_disabled_null_count.row_has_null_count) + self.assertEqual(0, profiler_w_disabled_null_count.row_is_null_count) + def test_correct_rows_ingested(self): test_dict = { "1": ["nan", "null", None, None, ""], @@ -3618,9 +4314,7 @@ def test_correct_null_row_counts(self): def test_row_is_null_ratio_row_stats_disabled(self): profiler_options_1 = ProfilerOptions() profiler_options_1.set( - { - "*.is_enabled": False, - } + {"*.is_enabled": False, "row_statistics.null_count.is_enabled": False} ) profiler = StructuredProfiler(pd.DataFrame([]), options=profiler_options_1) self.assertIsNone(profiler._get_row_is_null_ratio()) @@ -4085,6 +4779,18 @@ def test_unique_row_ratio_empty_profiler(self): profiler = StructuredProfiler(pd.DataFrame([]), options=profiler_options) self.assertEqual(0, profiler._get_unique_row_ratio()) + def test_null_count_empty_profiler(self): + profiler_options = 
ProfilerOptions() + profiler_options.set( + { + "*.is_enabled": False, + "row_statistics.null_count.is_enabled": False, + } + ) + profiler = StructuredProfiler(pd.DataFrame([]), options=profiler_options) + self.assertIsNone(profiler._get_row_is_null_ratio()) + self.assertIsNone(profiler._get_row_has_null_ratio()) + def test_correct_duplicate_row_count_full_row_hashing(self): self.assertEqual(15, len(self.trained_schema_full.hashed_row_object)) self.assertEqual(20, self.trained_schema_full.total_samples) @@ -4143,7 +4849,7 @@ def test_save_and_load_hll(self): # Save and Load profile with Mock IO with mock.patch("builtins.open") as m: - mock_file = setup_save_mock_open(m) + mock_file = setup_save_mock_bytes_open(m) self.trained_schema_hll.save() mock_file.seek(0) @@ -4278,14 +4984,14 @@ def test_save_and_load_structured(self): # Save and Load profile with Mock IO with mock.patch("builtins.open") as m: - mock_file = setup_save_mock_open(m) + mock_file = setup_save_mock_bytes_open(m) save_profile.save() mock_file.seek(0) with mock.patch( "dataprofiler.profilers.profile_builder." "DataLabeler", return_value=data_labeler, ): - load_profile = dp.Profiler.load("mock.pkl") + load_profile = dp.Profiler.load("mock.pkl", load_method="PICKLE") # validate loaded profile has same data labeler class self.assertIsInstance( @@ -4331,7 +5037,7 @@ def test_save_and_load_unstructured(self): # Save and Load profile with Mock IO with mock.patch("builtins.open") as m: - mock_file = setup_save_mock_open(m) + mock_file = setup_save_mock_bytes_open(m) save_profile.save() # make sure data_labeler unchanged diff --git a/dataprofiler/tests/profilers/test_text_column_profile.py b/dataprofiler/tests/profilers/test_text_column_profile.py index b5ca43997..0d578c6e9 100644 --- a/dataprofiler/tests/profilers/test_text_column_profile.py +++ b/dataprofiler/tests/profilers/test_text_column_profile.py @@ -1,3 +1,4 @@ +import json import os import unittest import warnings @@ -8,6 +9,8 @@ import pandas as pd from dataprofiler.profilers import TextColumn, utils +from dataprofiler.profilers.json_decoder import load_column_profile +from dataprofiler.profilers.json_encoder import ProfileEncoder from dataprofiler.profilers.profiler_options import TextOptions from dataprofiler.tests.profilers import utils as test_utils @@ -607,3 +610,210 @@ def test_diff(self): places=2, ) self.assertDictEqual(expected_diff, profile_diff) + + @mock.patch("time.time", return_value=0.0) + def test_json_encode_after_update(self, time): + df = pd.Series( + [ + "abcd", + "aa", + "abcd", + "lito-potamus", + "b", + "4", + ".098", + "2", + "dfd", + "2", + "12.32", + ] + ).apply(str) + + text_options = TextOptions() + text_options.histogram_and_quantiles.bin_count_or_method = 5 + profiler = TextColumn(df.name, text_options) + with test_utils.mock_timeit(): + profiler.update(df) + + serialized_dict = json.loads(json.dumps(profiler, cls=ProfileEncoder)) + + # popping quantiles and comparing as list below since it is so large + serialized_quantiles = serialized_dict["data"].pop("quantiles") + + # popping vocab and comparing as set below since order is random + serialized_vocab = serialized_dict["data"].pop("vocab") + + serialized = json.dumps(serialized_dict) + + expected = json.dumps( + { + "class": "TextColumn", + "data": { + "min": 1.0, + "max": 12.0, + "_top_k_modes": 5, + "sum": 38.0, + "_biased_variance": 9.33884297520661, + "_biased_skewness": 1.8025833203700588, + "_biased_kurtosis": 2.7208317017777395, + "_median_is_enabled": True, + 
"_median_abs_dev_is_enabled": True, + "max_histogram_bin": 100000, + "min_histogram_bin": 1000, + "histogram_bin_method_names": ["custom"], + "histogram_selection": None, + "user_set_histogram_bin": 5, + "bias_correction": True, + "_mode_is_enabled": True, + "num_zeros": 0, + "num_negatives": 0, + "_num_quantiles": 1000, + "histogram_methods": { + "custom": { + "total_loss": 0.0, + "current_loss": 0.0, + "suggested_bin_count": 5, + "histogram": {"bin_counts": None, "bin_edges": None}, + } + }, + "_stored_histogram": { + "total_loss": 7.63, + "current_loss": 7.63, + "suggested_bin_count": 1000, + "histogram": { + "bin_counts": [6, 4, 0, 0, 1], + "bin_edges": [1.0, 3.2, 5.4, 7.6000000000000005, 9.8, 12.0], + }, + }, + "_batch_history": [ + { + "match_count": 11, + "sample_size": 11, + "min": 1.0, + "max": 12.0, + "sum": 38.0, + "biased_variance": 9.33884297520661, + "mean": 3.4545454545454546, + "biased_skewness": 1.8025833203700588, + "biased_kurtosis": 2.7208317017777395, + } + ], + "_NumericStatsMixin__calculations": { + "min": "_get_min", + "max": "_get_max", + "sum": "_get_sum", + "variance": "_get_variance", + "skewness": "_get_skewness", + "kurtosis": "_get_kurtosis", + "histogram_and_quantiles": "_get_histogram_and_quantiles", + }, + "name": None, + "col_index": np.nan, + "sample_size": 11, + "metadata": {}, + "times": { + "vocab": 1.0, + "min": 1.0, + "max": 1.0, + "sum": 1.0, + "variance": 1.0, + "skewness": 1.0, + "kurtosis": 1.0, + "histogram_and_quantiles": 1.0, + }, + "thread_safe": True, + "match_count": 11, + "_TextColumn__calculations": {"vocab": "_update_vocab"}, + "type": "string", + }, + } + ) + + expected_vocab = profiler.vocab + expected_quantiles = profiler.quantiles + + self.assertEqual(serialized, expected) + self.assertSetEqual(set(serialized_vocab), set(expected_vocab)) + self.assertListEqual(serialized_quantiles, expected_quantiles) + + def test_json_decode(self): + fake_profile_name = None + expected_profile = TextColumn(fake_profile_name) + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = load_column_profile(json.loads(serialized)) + + test_utils.assert_profiles_equal(deserialized, expected_profile) + + def test_json_decode_after_update(self): + fake_profile_name = "Fake profile name" + # Actual deserialization + + # Build expected IntColumn + df_int = pd.Series( + [ + "abcd", + "aa", + "abcd", + "lito-potamus", + "b", + "4", + ".098", + "2", + "dfd", + "2", + "12.32", + ] + ) + expected_profile = TextColumn(fake_profile_name) + + with test_utils.mock_timeit(): + expected_profile.update(df_int) + + # Validate reporting before deserialization + expected_profile.report() + + serialized = json.dumps(expected_profile, cls=ProfileEncoder) + deserialized = load_column_profile(json.loads(serialized)) + + # Validate reporting after deserialization + deserialized.report() + test_utils.assert_profiles_equal(deserialized, expected_profile) + + df_str = pd.Series( + [ + "aa", # add existing + "awsome", # add new + ] + ) + + # validating update after deserialization + deserialized.update(df_str) + + assert deserialized.sample_size == 13 + assert set(deserialized.vocab) == { + ".", + "-", + "1", + "2", + "3", + "4", + "8", + "9", + "0", + "a", + "b", + "c", + "d", + "e", + "f", + "i", + "l", + "m", + "o", + "p", + "s", + "t", + "u", + "w", + } diff --git a/dataprofiler/tests/profilers/utils.py b/dataprofiler/tests/profilers/utils.py index b85b831d9..bef54763d 100644 --- a/dataprofiler/tests/profilers/utils.py +++ 
b/dataprofiler/tests/profilers/utils.py @@ -8,7 +8,11 @@ import dataprofiler as dp from dataprofiler.profilers.base_column_profilers import BaseColumnProfiler -from dataprofiler.profilers.profile_builder import BaseProfiler +from dataprofiler.profilers.column_profile_compilers import BaseCompiler +from dataprofiler.profilers.profile_builder import BaseProfiler, StructuredColProfiler +from dataprofiler.profilers.profiler_options import BaseOption +from dataprofiler.profilers.utils import find_diff_of_dicts +from dataprofiler.tests.test_utils import patched_assert_warns def set_seed(seed=None): @@ -166,37 +170,56 @@ def increment_counter(): return mock.patch("time.time", side_effect=lambda: next(counter)) -def assert_profiles_equal(profile1, profile2): +def assert_profiles_equal(actual, expected): """ Checks if two profile objects are equal. profiles are instances of BaseProfiler or BaseColumnProfiler. Throws exception if not equal - :param profile_1: profile to compare to profile2 - :type profile_1: instance of BaseProfiler or BaseColumnProfiler - :param profile_2: profile to compare to profile1 - :type profile_2: instance of BaseProfiler or BaseColumnProfiler - """ - profile1_dict = profile1.__dict__ - profile2_dict = profile2.__dict__ - - if len(profile1_dict) != len(profile2_dict): - raise ValueError( - f"number of attributes on profile1 ({len(profile1_dict)}) != profile2 ({len(profile2_dict)})" - ) - - for attr1, value1 in profile1_dict.items(): - if attr1 not in profile2_dict: - raise ValueError(f"Profile attributes unmatched {attr1}") - - value2 = profile2_dict[attr1] - if not (isinstance(value2, type(value1)) or isinstance(value1, type(value2))): - raise ValueError(f"Profile value types unmatched: {value1} != {value2}") - - if isinstance(value1, (BaseProfiler, BaseColumnProfiler)): - assert_profiles_equal(value1, value2) - elif isinstance(value1, numbers.Number): - np.testing.assert_equal(value1, value2) - elif value1 != value2: - raise ValueError(f"Profile values unmatched: {value1} != {value2}") + :param actual: profile to compare to expected + :type actual: instance of BaseProfiler or BaseColumnProfiler + :param expected: profile to compare to actual + :type expected: instance of BaseProfiler or BaseColumnProfiler + """ + actual_dict = actual.__dict__ if not isinstance(actual, dict) else actual + expected_dict = expected.__dict__ if not isinstance(expected, dict) else expected + + assert ( + actual_dict.keys() == expected_dict.keys() + ), f"{actual_dict.keys()} != {expected_dict.keys()}" + + for key in expected_dict.keys(): + actual_value = actual_dict.get(key, None) + expected_value = expected_dict.get(key, None) + + assert type(actual_value) == type( + expected_value + ), f"{actual_value} with type {type(actual_value)} and \ + {expected_value} with type {type(expected_value)} \ + do not have the same type for key: {key}" + + if key == "_profile" and isinstance(actual_value, list): + for x in range(len(actual_value)): + assert_profiles_equal(actual_value[x], expected_value[x]) + elif isinstance( + actual_value, + ( + BaseProfiler, + BaseColumnProfiler, + StructuredColProfiler, + BaseCompiler, + BaseOption, + ), + ): + assert_profiles_equal(actual_value, expected_value) + elif isinstance(actual_value, dict): + assert_profiles_equal(actual_value, expected_value) + elif isinstance(actual_value, numbers.Number): + np.testing.assert_equal(actual_value, expected_value, f"{key}") + elif isinstance(actual_value, np.ndarray): + np.testing.assert_array_equal(actual_value, expected_value) + 
else:
+            assert (
+                actual_value == expected_value
+            ), f"Values for key '{key}' do not match: {actual_value} != {expected_value}"
diff --git a/dataprofiler/version.py b/dataprofiler/version.py
index b2ee586af..d0f193ca5 100644
--- a/dataprofiler/version.py
+++ b/dataprofiler/version.py
@@ -1,7 +1,7 @@
 """File contains the version number for the package."""
 
 MAJOR = 0
-MINOR = 9
+MINOR = 10
 MICRO = 0
 POST = None  # otherwise None
diff --git a/examples/data_profiler_demo.ipynb b/examples/data_profiler_demo.ipynb
index 0b95b6781..074d15e7b 100644
--- a/examples/data_profiler_demo.ipynb
+++ b/examples/data_profiler_demo.ipynb
@@ -407,13 +407,17 @@
    "id": "de3e0e30-33e3-4b18-8240-7e5c6029eb97",
    "metadata": {},
    "source": [
-    "Not only can the Profiler create and update profiles, it's also possible to save, load then manipulate profiles."
+    "Not only can the Profiler create and update profiles, it can also save, load, and then manipulate them.\n",
+    "\n",
+    "The currently supported formats for saving and loading profiles are:\n",
+    "- Pickle\n",
+    "- JSON"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "ffdb2126-cc11-49f1-aa5e-a533efc59f25",
+   "id": "2f03c1e5",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -421,8 +425,16 @@
    "data = dp.Data(os.path.join(data_folder, \"csv/diamonds.csv\"))\n",
    "\n",
    "# Generate a profile\n",
-    "profile = dp.Profiler(data)\n",
-    "\n",
+    "profile = dp.Profiler(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ffdb2126-cc11-49f1-aa5e-a533efc59f25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
    "# Save a profile to disk for later (saves as pickle file)\n",
    "profile.save(filepath=\"my_profile.pkl\")\n",
    "\n",
    "# Load a profile from disk\n",
    "loaded_profile = dp.Profiler.load(\"my_profile.pkl\")\n",
    "\n",
    "# Report the compact version of the profile\n",
    "# report = profile.report(report_options={\"output_format\":\"compact\"})\n",
    "# print(json.dumps(report, indent=4))"
   ]
  },
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "id": "8e713fc2",
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "# Save a profile to disk for later (saves as JSON file)\n",
+   "profile.save(filepath=\"my_profile.json\", save_method=\"json\")\n",
+   "\n",
+   "# Load a profile from disk\n",
+   "loaded_profile = dp.Profiler.load(\"my_profile.json\", load_method=\"json\")\n",
+   "\n",
+   "# Report the compact version of the profile\n",
+   "# report = profile.report(report_options={\"output_format\":\"compact\"})\n",
+   "# print(json.dumps(report, indent=4))"
+  ]
+ },
 {
  "cell_type": "markdown",
  "id": "4787d4ff-8bd7-4c91-b197-dc875fb1d2d9",
@@ -746,7 +776,7 @@
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
-  },
+  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
diff --git a/requirements.txt b/requirements.txt
index b4d420d4f..994ec78de 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,8 +10,9 @@ fastavro>=1.0.0.post1
 python-snappy>=0.5.4
 charset-normalizer>=1.3.6
 psutil>=4.0.0
-scipy>=1.4.1
+scipy>=1.4.1,<1.11.0
 requests>=2.28.1
 networkx>=2.5.1
 typing-extensions>=3.10.0.2
 HLL>=2.0.3
+datasketches>=4.1.0
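For orientation, here is a minimal sketch of the JSON round trip that the new serialization tests above exercise. It is illustrative only: the tiny DataFrame is made up, it assumes load_profiler is importable from dataprofiler.profilers.json_decoder (the module the tests import load_column_profile from), and it assumes the default structured data labeler is available locally.

    import json

    import pandas as pd

    import dataprofiler as dp
    from dataprofiler.profilers.json_decoder import load_profiler
    from dataprofiler.profilers.json_encoder import ProfileEncoder

    # Profile a small, illustrative dataset.
    df = pd.DataFrame({"a": [1, None, 3], "b": ["x", "y", "z"]})
    profile = dp.StructuredProfiler(df)

    # Encode to JSON, then rebuild, mirroring the test_json_encode /
    # test_json_decode round trip asserted in the tests.
    serialized = json.dumps(profile, cls=ProfileEncoder)
    deserialized = load_profiler(json.loads(serialized))

    # A rebuilt profile stays usable: the *_after_update tests assert that
    # it can keep profiling new batches of data.
    deserialized.update_profile(pd.DataFrame({"a": [4], "b": ["w"]}))

The notebook cells above wrap the same round trip behind the public API, profile.save(filepath=..., save_method="json") and dp.Profiler.load(..., load_method="json").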