From a56f9e356e620be477ef3a72a8abe318e665056a Mon Sep 17 00:00:00 2001 From: Christopher Wallace Date: Wed, 14 Jul 2021 11:03:52 -0400 Subject: [PATCH] Refactor StringIO/ByteIO changes made to is_match functions (#326) * Cleaned up data_utils * Cleaned up csv_data * Cleaned up filepath_or_buffer * Typo * Cleaned up json_data * Cleaned up tests * Streamline functionality formerly in DetachingTextIOWrapper * Forgot closing paren * Consistent naming --- dataprofiler/data_readers/csv_data.py | 10 ++- dataprofiler/data_readers/data_utils.py | 12 +-- .../data_readers/filepath_or_buffer.py | 48 +++++------ dataprofiler/data_readers/json_data.py | 10 +-- .../tests/data_readers/test_avro_data.py | 6 +- .../tests/data_readers/test_csv_data.py | 15 ++-- .../data_readers/test_filepath_or_buffer.py | 79 +++++++++++-------- .../tests/data_readers/test_json_data.py | 15 ++-- .../tests/data_readers/test_parquet_data.py | 6 +- 9 files changed, 108 insertions(+), 93 deletions(-) diff --git a/dataprofiler/data_readers/csv_data.py b/dataprofiler/data_readers/csv_data.py index cc65a1947..fb9175296 100644 --- a/dataprofiler/data_readers/csv_data.py +++ b/dataprofiler/data_readers/csv_data.py @@ -509,8 +509,9 @@ def _load_data_from_file(self, input_file_path): """ self._file_encoding = None - if not data_utils.is_stream_buffer(input_file_path) or isinstance(input_file_path, BytesIO): - self._file_encoding = data_utils.detect_file_encoding(input_file_path) + if not isinstance(input_file_path, StringIO): + self._file_encoding = \ + data_utils.detect_file_encoding(input_file_path) data_as_str = data_utils.load_as_str_from_file(input_file_path, self._file_encoding) @@ -518,7 +519,8 @@ def _load_data_from_file(self, input_file_path): if not self._delimiter or not self._checked_header: delimiter, quotechar = None, None if not self._delimiter or not self._quotechar: - delimiter, quotechar = self._guess_delimiter_and_quotechar(data_as_str) + delimiter, quotechar = \ + self._guess_delimiter_and_quotechar(data_as_str) if not self._delimiter: self._delimiter = delimiter if not self._quotechar: @@ -580,7 +582,7 @@ def is_match(cls, file_path, options=None): options = dict() file_encoding = None - if not data_utils.is_stream_buffer(file_path) or isinstance(file_path, BytesIO): + if not isinstance(file_path, StringIO): file_encoding = data_utils.detect_file_encoding(file_path=file_path) delimiter = options.get("delimiter", None) diff --git a/dataprofiler/data_readers/data_utils.py b/dataprofiler/data_readers/data_utils.py index fee362448..c2582757d 100644 --- a/dataprofiler/data_readers/data_utils.py +++ b/dataprofiler/data_readers/data_utils.py @@ -560,12 +560,11 @@ def load_as_str_from_file(file_path, file_encoding=None, max_lines=10, search_query_value = b'\n' loc, occurance = find_nth_loc(sample_lines, - search_query=search_query_value, - n=remaining_lines) + search_query=search_query_value, + n=remaining_lines) # Add sample_lines to data_as_str no more than max_lines - if (is_stream_buffer(file_path) and isinstance(sample_lines[:loc],\ - bytes)): + if isinstance(sample_lines[:loc], bytes): data_as_str += sample_lines[:loc].decode(file_encoding) else: data_as_str += sample_lines[:loc] @@ -576,16 +575,13 @@ def load_as_str_from_file(file_path, file_encoding=None, max_lines=10, return data_as_str + def is_stream_buffer(filepath_or_buffer): """ Determines whether a given argument is a filepath or buffer. :param filepath_or_buffer: path to the file or buffer :type filepath_or_buffer: str - :param encoding: File encoding - :type encoding: str - :param seek: position to start in buffer - :type seek: int :return: true if string is a buffer or false if string is a filepath :rtype: boolean """ diff --git a/dataprofiler/data_readers/filepath_or_buffer.py b/dataprofiler/data_readers/filepath_or_buffer.py index 0333c0fbf..ea75a31e0 100644 --- a/dataprofiler/data_readers/filepath_or_buffer.py +++ b/dataprofiler/data_readers/filepath_or_buffer.py @@ -3,15 +3,17 @@ from . import data_utils -class FileOrBufferHandler(): +class FileOrBufferHandler: """ - FileOrBufferHandler class to read a filepath or buffer in and always return a readable buffer + FileOrBufferHandler class to read a filepath or buffer in and always + return a readable buffer. """ - def __init__(self, filepath_or_buffer, open_method='r', encoding=None, seek_offset=None, seek_whence=0): + def __init__(self, filepath_or_buffer, open_method='r', encoding=None, + seek_offset=None, seek_whence=0): """ - Context manager class used for inputing a file or buffer and returning a structure - that is always a buffer. + Context manager class used for inputting a file or buffer and returning + a structure that is always a buffer. :param filepath_or_buffer: path to the file being loaded or buffer :type filepath_or_buffer: Union[str, StringIO, BytesIO] @@ -27,19 +29,25 @@ def __init__(self, filepath_or_buffer, open_method='r', encoding=None, seek_offs self.seek_whence = seek_whence self._encoding = encoding self.original_type = type(filepath_or_buffer) + self._is_wrapped = False def __enter__(self): if isinstance(self._filepath_or_buffer, str): self._filepath_or_buffer = open( - self._filepath_or_buffer, self.open_method, encoding=self._encoding) + self._filepath_or_buffer, self.open_method, + encoding=self._encoding) - elif isinstance(self._filepath_or_buffer, BytesIO) and self.open_method == 'r': - self._filepath_or_buffer = DetachingTextIOWrapper(self._filepath_or_buffer, encoding=self._encoding) + elif isinstance(self._filepath_or_buffer, BytesIO) \ + and self.open_method == 'r': + self._filepath_or_buffer = \ + TextIOWrapper(self._filepath_or_buffer, encoding=self._encoding) + self._is_wrapped = True elif not data_utils.is_stream_buffer(self._filepath_or_buffer): # Raise AttributeError if attribute value not found. - raise AttributeError(f'Type {type(self._filepath_or_buffer)} is invalid. \ - filepath_or_buffer must be a string or StringIO/BytesIO object') + raise AttributeError(f'Type {type(self._filepath_or_buffer)} is ' + f'invalid. filepath_or_buffer must be a ' + f'string or StringIO/BytesIO object') if self.seek_offset is not None: self._filepath_or_buffer.seek(self.seek_offset, self.seek_whence) @@ -47,19 +55,13 @@ def __enter__(self): return self._filepath_or_buffer def __exit__(self, exc_type, exc_value, exc_traceback): - if isinstance(self._filepath_or_buffer, (StringIO, BytesIO, DetachingTextIOWrapper)): + # Need to detach buffer if wrapped (i.e. BytesIO opened with 'r') + if self._is_wrapped: + wrapper = self._filepath_or_buffer + self._filepath_or_buffer = wrapper.buffer + wrapper.detach() + + if isinstance(self._filepath_or_buffer, (StringIO, BytesIO)): self._filepath_or_buffer.seek(0) else: self._filepath_or_buffer.close() - -class DetachingTextIOWrapper(TextIOWrapper): - """ - DetachingTextIOWrapper class is used to detach buffer to avoid buffer closing before it's returned - """ - - def close(self): - self.detach() - - def __del__(self): - if self.buffer: - self.detach() \ No newline at end of file diff --git a/dataprofiler/data_readers/json_data.py b/dataprofiler/data_readers/json_data.py index 455b9ae84..2b031a90a 100644 --- a/dataprofiler/data_readers/json_data.py +++ b/dataprofiler/data_readers/json_data.py @@ -1,15 +1,14 @@ from collections import OrderedDict -from dataprofiler.data_readers.filepath_or_buffer import FileOrBufferHandler import json import warnings -import types -from io import BytesIO +from six import StringIO import numpy as np import pandas as pd from . import data_utils from .base_data import BaseData from .structured_mixins import SpreadSheetDataMixin +from .filepath_or_buffer import FileOrBufferHandler class JSONData(SpreadSheetDataMixin, BaseData): @@ -364,10 +363,11 @@ def is_match(cls, file_path, options=None): options = dict() file_encoding = None - if not data_utils.is_stream_buffer(file_path) or isinstance(file_path, BytesIO): + if not isinstance(file_path, StringIO): file_encoding = data_utils.detect_file_encoding(file_path=file_path) - with FileOrBufferHandler(file_path, 'r', encoding=file_encoding) as data_file: + with FileOrBufferHandler(file_path, 'r', encoding=file_encoding) \ + as data_file: try: json.load(data_file) return True diff --git a/dataprofiler/tests/data_readers/test_avro_data.py b/dataprofiler/tests/data_readers/test_avro_data.py index b8da0c0e8..ac5920adc 100644 --- a/dataprofiler/tests/data_readers/test_avro_data.py +++ b/dataprofiler/tests/data_readers/test_avro_data.py @@ -34,14 +34,14 @@ def setUpClass(cls): def test_is_match_for_byte_streams(self): """ - Determine if the avro file can be automatically identified from byte stream + Determine if the avro file can be automatically identified from + byte stream """ for input_file in self.input_file_names: # as BytesIO Stream with open(input_file['path'], 'rb') as fp: byte_string = BytesIO(fp.read()) - input_data_obj = Data(byte_string) - self.assertEqual(input_data_obj.data_type, 'avro') + self.assertTrue(AVROData.is_match(byte_string)) def test_avro_file_identification(self): """ diff --git a/dataprofiler/tests/data_readers/test_csv_data.py b/dataprofiler/tests/data_readers/test_csv_data.py index 2d15f09b7..5b44802e9 100644 --- a/dataprofiler/tests/data_readers/test_csv_data.py +++ b/dataprofiler/tests/data_readers/test_csv_data.py @@ -163,23 +163,24 @@ def setUpClass(cls): def test_is_match_for_string_streams(self): """ - Determine if the csv file can be automatically identified from string stream + Determine if the csv file can be automatically identified from + string stream """ for input_file in self.input_file_names: - with open(input_file['path'], 'r', encoding=input_file['encoding']) as fp: + with open(input_file['path'], 'r', + encoding=input_file['encoding']) as fp: byte_string = StringIO(fp.read()) - input_data_obj = Data(byte_string) - self.assertEqual(input_data_obj.data_type, 'csv') + self.assertTrue(CSVData.is_match(byte_string)) def test_is_match_for_byte_streams(self): """ - Determine if the csv file can be automatically identified from byte stream + Determine if the csv file can be automatically identified from + byte stream """ for input_file in self.input_file_names: with open(input_file['path'], 'rb') as fp: byte_string = BytesIO(fp.read()) - input_data_obj = Data(byte_string) - self.assertEqual(input_data_obj.data_type, 'csv') + self.assertTrue(CSVData.is_match(byte_string)) def test_auto_file_identification(self): """ diff --git a/dataprofiler/tests/data_readers/test_filepath_or_buffer.py b/dataprofiler/tests/data_readers/test_filepath_or_buffer.py index b0abb5133..0745e3667 100644 --- a/dataprofiler/tests/data_readers/test_filepath_or_buffer.py +++ b/dataprofiler/tests/data_readers/test_filepath_or_buffer.py @@ -26,11 +26,12 @@ def setUpClass(cls): def test_make_buffer_from_filepath(self): """ - Make sure FileOrBufferHandler can input a file and read it similarly to open() + Make sure FileOrBufferHandler can input a file and read it similarly + to open() """ for input_file in self.input_file_names: - with FileOrBufferHandler(input_file['path'], 'r') as\ - filepath_or_buffer, open(input_file['path'], 'r') as\ + with FileOrBufferHandler(input_file['path'], 'r') as \ + filepath_or_buffer, open(input_file['path'], 'r') as \ input_file_check: # check first 100 lines @@ -44,12 +45,14 @@ def test_make_buffer_from_filepath(self): def test_pass_in_StringIO_buffer(self): """ - Make sure FileOrBufferHandler can take StringIO and read it similarly to open() + Make sure FileOrBufferHandler can take StringIO and read it similarly + to open() """ for input_file in self.input_file_names: - with FileOrBufferHandler(StringIO(open(input_file['path'], 'r') - .read())) as filepath_or_buffer, open(input_file['path'], 'r')\ - as input_file_check: + with FileOrBufferHandler(StringIO( + open(input_file['path'], 'r').read())) as \ + filepath_or_buffer, open(input_file['path'], 'r') as \ + input_file_check: # check first 100 lines for i in range(0, 100): @@ -58,13 +61,15 @@ def test_pass_in_StringIO_buffer(self): def test_pass_in_StringIO_seek_buffer(self): """ - Make sure FileOrBufferHandler can take StringIO with seek and read it similarly to open() with seek + Make sure FileOrBufferHandler can take StringIO with seek and read it + similarly to open() with seek """ for input_file in self.input_file_names: print(input_file) seek_offset_test = 100 - with FileOrBufferHandler(StringIO(open(input_file['path'], 'rb') - .read().decode()), seek_offset=seek_offset_test) as filepath_or_buffer,\ + with FileOrBufferHandler(StringIO( + open(input_file['path'], 'rb').read().decode()), + seek_offset=seek_offset_test) as filepath_or_buffer, \ open(input_file['path'], 'rb') as input_file_check: input_file_check.seek(seek_offset_test) @@ -76,12 +81,15 @@ def test_pass_in_StringIO_seek_buffer(self): def test_pass_in_BytesIO_buffer(self): """ - Make sure FileOrBufferHandler can take BytesIO and read it similarly to open() + Make sure FileOrBufferHandler can take BytesIO and read it similarly + to open() """ for input_file in self.input_file_names: - with FileOrBufferHandler(BytesIO(open(input_file['path'], 'rb'). - read())) as filepath_or_buffer, TextIOWrapper(open(input_file['path'], 'rb'))\ - as input_file_check: + with FileOrBufferHandler(BytesIO( + open(input_file['path'], 'rb').read())) as \ + filepath_or_buffer, \ + TextIOWrapper(open(input_file['path'], 'rb')) as \ + input_file_check: # check first 100 lines for i in range(0, 100): @@ -90,13 +98,16 @@ def test_pass_in_BytesIO_buffer(self): def test_pass_in_BytesIO_seek_buffer(self): """ - Make sure FileOrBufferHandler can take BytesIO with seek and read it similarly to open() with seek + Make sure FileOrBufferHandler can take BytesIO with seek and read it + similarly to open() with seek """ for input_file in self.input_file_names: seek_offset_test = 500 - with FileOrBufferHandler(BytesIO(open(input_file['path'], 'rb'). - read()), seek_offset=seek_offset_test) as filepath_or_buffer,\ - TextIOWrapper(open(input_file['path'], 'rb')) as input_file_check: + with FileOrBufferHandler(BytesIO( + open(input_file['path'], 'rb').read()), + seek_offset=seek_offset_test) as filepath_or_buffer, \ + TextIOWrapper(open(input_file['path'], 'rb')) as \ + input_file_check: input_file_check.seek(seek_offset_test) @@ -107,36 +118,38 @@ def test_pass_in_BytesIO_seek_buffer(self): def test_make_buffer_from_filepath_and_encoding(self): """ - Make sure FileOrBufferHandler can input a file and read it similarly to open() with encoding + Make sure FileOrBufferHandler can input a file and read it similarly + to open() with encoding """ - file_name =os.path.join(os.path.join(test_root_path, 'data'), \ - 'csv/iris-utf-16.csv') + file_name = os.path.join(os.path.join(test_root_path, 'data'), + 'csv/iris-utf-16.csv') file_encoding='utf-16' - with FileOrBufferHandler(file_name, 'r', encoding=file_encoding) as\ - filepath_or_buffer, open(file_name, 'r', encoding=file_encoding) as\ - input_file_check: + with FileOrBufferHandler(file_name, 'r', encoding=file_encoding) \ + as filepath_or_buffer, \ + open(file_name, 'r', encoding=file_encoding) \ + as input_file_check: # check first 100 lines for i in range(0, 100): self.assertEqual(filepath_or_buffer.readline(), - input_file_check.readline()) + input_file_check.readline()) # check that file was properly closed self.assertEqual(filepath_or_buffer.closed, - input_file_check.closed) + input_file_check.closed) def test_make_buffer_error_message(self): """ Check FileOrBufferHandler asserts proper attribute error """ file_name = dict(not_a_valid="option") - with self.assertRaisesRegex(AttributeError, "Type.*is invalid. \ - filepath_or_buffer must be a string or StringIO/BytesIO object"): - with FileOrBufferHandler(file_name, 'r') as\ - filepath_or_buffer, open(file_name, 'r') as\ - input_file_check: - filepath_or_buffer.readline(), - input_file_check.readline() + msg = (f"Type {type(file_name)} is invalid. filepath_or_buffer must " + f"be a string or StringIO/BytesIO object") + with self.assertRaisesRegex(AttributeError, msg): + with FileOrBufferHandler(file_name, 'r') as filepath_or_buffer, \ + open(file_name, 'r') as input_file_check: + filepath_or_buffer.readline(), + input_file_check.readline() if __name__ == '__main__': diff --git a/dataprofiler/tests/data_readers/test_json_data.py b/dataprofiler/tests/data_readers/test_json_data.py index 512934873..6887873b6 100644 --- a/dataprofiler/tests/data_readers/test_json_data.py +++ b/dataprofiler/tests/data_readers/test_json_data.py @@ -48,24 +48,25 @@ def setUpClass(cls): def test_is_match_for_string_streams(self): """ - Determine if the json file can be automatically identified from string stream + Determine if the json file can be automatically identified from + string stream """ for input_file in self.input_file_names: print(input_file) - with open(input_file['path'], 'r', encoding=input_file['encoding']) as fp: + with open(input_file['path'], 'r', + encoding=input_file['encoding']) as fp: byte_string = StringIO(fp.read()) - input_data_obj = Data(byte_string) - self.assertEqual(input_data_obj.data_type, 'json') + self.assertTrue(JSONData.is_match(byte_string)) def test_is_match_for_byte_streams(self): """ - Determine if the json file can be automatically identified byte stream + Determine if the json file can be automatically identified from + byte stream """ for input_file in self.input_file_names: with open(input_file['path'], 'rb') as fp: byte_string = BytesIO(fp.read()) - input_data_obj = Data(byte_string) - self.assertEqual(input_data_obj.data_type, 'json') + self.assertTrue(JSONData.is_match(byte_string)) def test_json_file_identification(self): """ diff --git a/dataprofiler/tests/data_readers/test_parquet_data.py b/dataprofiler/tests/data_readers/test_parquet_data.py index f07cbaab3..56206de15 100644 --- a/dataprofiler/tests/data_readers/test_parquet_data.py +++ b/dataprofiler/tests/data_readers/test_parquet_data.py @@ -34,13 +34,13 @@ def setUpClass(cls): def test_is_match_for_byte_streams(self): """ - Determine if the parquet file can be automatically identified from byte stream + Determine if the parquet file can be automatically identified from + byte stream """ for input_file in self.input_file_names: with open(input_file['path'], 'rb') as fp: byte_string = BytesIO(fp.read()) - input_data_obj = Data(byte_string) - self.assertEqual(input_data_obj.data_type, 'parquet') + self.assertTrue(ParquetData.is_match(byte_string)) def test_auto_file_identification(self): """