Skip to content

Commit

Permalink
Includes mypy in pre-commit and fixes last needed updates (#696)
Browse files Browse the repository at this point in the history
* feat: add mypy to precommit

* fix: for passing mypy
  • Loading branch information
JGSweets authored Oct 19, 2022
1 parent 7550b4a commit d6bfde3
Show file tree
Hide file tree
Showing 9 changed files with 36 additions and 21 deletions.
10 changes: 10 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,13 @@ repos:
- id: debug-statements
- id: end-of-file-fixer
exclude: (^dataprofiler/tests/data/)
# Mypy: Optional static type checking
# https://github.com/pre-commit/mirrors-mypy
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.982
hooks:
- id: mypy
exclude: (^dataprofiler/tests/|^resources/|^examples|venv*/)
language_version: python3
additional_dependencies: ['types-setuptools', 'types-python-dateutil',
'types-requests', 'types-chardet', 'types-six']
3 changes: 2 additions & 1 deletion dataprofiler/_typing.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
"""Contains typing aliases."""
from typing import Dict, List, Union
from typing import Dict, List, NewType, Union

import numpy as np
import pandas as pd

DataArray = Union[pd.DataFrame, pd.Series, np.ndarray]
JSONType = Union[str, int, float, bool, None, List, Dict]
Url = NewType("Url", str)
7 changes: 4 additions & 3 deletions dataprofiler/data_readers/csv_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def _guess_delimiter_and_quotechar(
vocab = Counter(data_as_str)
if "\n" in vocab:
vocab.pop("\n")
omitted_list: list[str] = omitted
omitted_list: List[str] = omitted
if quotechar is not None:
omitted_list = omitted + [quotechar]
for char in omitted_list:
Expand All @@ -206,7 +206,7 @@ def _guess_delimiter_and_quotechar(

# Sort vocabulary by count
ordered_vocab = []
sorted_keys = sorted(vocab, key=vocab.get, reverse=True)
sorted_keys = sorted(vocab, key=vocab.__getitem__, reverse=True)
for c in sorted_keys:
if c not in preferred:
ordered_vocab.append(c)
Expand All @@ -217,7 +217,8 @@ def _guess_delimiter_and_quotechar(
sniffer.preferred = preferred
try:
# NOTE: Pull the first element, the quote character
quotechar = sniffer._guess_quote_and_delimiter(
# ignoring type b/c error in getting this class's func
quotechar = sniffer._guess_quote_and_delimiter( # type: ignore
data_as_str, ordered_vocab[:20]
)[0]
except csv.Error:
Expand Down
7 changes: 4 additions & 3 deletions dataprofiler/data_readers/data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
"""Contains factory class reading various kinds of data."""
from __future__ import absolute_import, division

from typing import Any, Dict, List, Optional
from io import BytesIO
from typing import Any, Dict, List, Optional, Union, cast

from .. import dp_logging
from .avro_data import AVROData
Expand Down Expand Up @@ -29,7 +30,7 @@ class Data(object):

def __new__(
cls,
input_file_path: Optional[str] = None,
input_file_path: Optional[Union[str, BytesIO]] = None,
data: Optional[Any] = None,
data_type: Optional[str] = None,
options: Optional[Dict] = None,
Expand Down Expand Up @@ -62,7 +63,7 @@ def __new__(
)

if not options:
options = dict()
options = cast(Dict, dict())

if is_valid_url(input_file_path):
input_file_path = url_to_bytes(input_file_path, options)
Expand Down
16 changes: 9 additions & 7 deletions dataprofiler/data_readers/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,10 @@
import pyarrow.parquet as pq
import requests
from chardet.universaldetector import UniversalDetector
from typing_extensions import TypeGuard

from .. import dp_logging
from .._typing import JSONType
from .._typing import JSONType, Url
from .filepath_or_buffer import FileOrBufferHandler, is_stream_buffer # NOQA

logger = dp_logging.get_child_logger(__name__)
Expand Down Expand Up @@ -160,7 +161,7 @@ def read_json_df(
data_generator: Generator,
selected_columns: Optional[List[str]] = None,
read_in_string: bool = False,
) -> Tuple[Iterator[pd.DataFrame], pd.Series]:
) -> Tuple[pd.DataFrame, pd.Series]:
"""
Return an iterator that returns a chunk of data as dataframe in each call.
Expand All @@ -184,7 +185,7 @@ def read_json_df(
:type read_in_string: bool
:return: returns a chunk of the file as a dataframe, as well as the
original dtypes of the dataframe columns.
:rtype: typle(Iterator(pd.DataFrame), pd.Series(dtypes)
:rtype: tuple(pd.DataFrame, pd.Series(dtypes))
"""
lines: List[JSONType] = list()
k = 0
Expand Down Expand Up @@ -214,7 +215,7 @@ def read_json_df(


def read_json(
data_generator: Generator,
data_generator: Iterator,
selected_columns: Optional[List[str]] = None,
read_in_string: bool = False,
) -> List[JSONType]:
Expand Down Expand Up @@ -505,7 +506,8 @@ def detect_cell_type(cell: str) -> str:
else:

try:
if dateutil.parser.parse(cell, fuzzy=False):
# need to ignore type b/c https://github.com/python/mypy/issues/8878
if dateutil.parser.parse(cell, fuzzy=False): # type:ignore
cell_type = "date"
except (ValueError, OverflowError, TypeError):
pass
Expand Down Expand Up @@ -675,7 +677,7 @@ def load_as_str_from_file(
return data_as_str


def is_valid_url(url_as_string: Any) -> bool:
def is_valid_url(url_as_string: Any) -> TypeGuard[Url]:
"""
Determine whether a given string is a valid URL.
Expand All @@ -692,7 +694,7 @@ def is_valid_url(url_as_string: Any) -> bool:
return all([result.scheme, result.netloc])


def url_to_bytes(url_as_string: str, options: Dict) -> BytesIO:
def url_to_bytes(url_as_string: Url, options: Dict) -> BytesIO:
"""
Read in URL and converts it to a byte stream.
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/labelers/data_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1785,7 +1785,7 @@ def convert_to_structured_analysis(
column_labels, sample = label_samples

# get count of all labels in prediction
column_label_counter = Counter(column_labels[: len(str(sample))])
column_label_counter: Counter = Counter(column_labels[: len(str(sample))])
column_label_counter.pop(ignore_value, None)
modes = [
entity_id
Expand Down
6 changes: 3 additions & 3 deletions dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def __init__(
:param options: Options for the structured profiler.
:type options: StructuredOptions Object
"""
self.name: Optional[str] = None
self.name: Optional[Union[int, str]] = None
self.options = options
self._min_sample_size: int = min_sample_size
self._sampling_ratio: float = sampling_ratio
Expand Down Expand Up @@ -257,7 +257,7 @@ def diff(self, other_profile: StructuredColProfiler, options: Dict = None) -> Di

name = self.name
if isinstance(self.name, np.integer):
name = int(name)
name = int(name) # type: ignore

unordered_profile.update(
{
Expand Down Expand Up @@ -321,7 +321,7 @@ def report(self, remove_disabled_flag: bool = False) -> OrderedDict:

name = self.name
if isinstance(self.name, np.integer):
name = int(name)
name = int(name) # type: ignore

unordered_profile.update(
{
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/profilers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,7 +663,7 @@ def get_memory_size(data: Union[list, np.ndarray, DataFrame], unit: str = "M") -
:type unit: string
:return: memory size of the input data
"""
unit_map = collections.defaultdict(B=0, K=1, M=2, G=3)
unit_map: Dict = collections.defaultdict(B=0, K=1, M=2, G=3)
if unit not in unit_map:
raise ValueError(
"Currently only supports the "
Expand Down
4 changes: 2 additions & 2 deletions dataprofiler/reports/utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
"""Contains functions for checking for installations/dependencies."""
import sys
import warnings
from typing import Any, Callable, List, NoReturn, TypeVar, cast
from typing import Any, Callable, List, TypeVar, cast

# Generic type for the return of the function "require_module()"
F = TypeVar("F", bound=Callable[..., Any])


def warn_missing_module(graph_func: str, module_name: str) -> NoReturn:
def warn_missing_module(graph_func: str, module_name: str) -> None:
"""
Return a warning if a given graph module doesn't exist.
Expand Down

0 comments on commit d6bfde3

Please sign in to comment.