Refactor StringIO/ByteIO changes made to is_match functions (#326)
* Cleaned up data_utils

* Cleaned up csv_data

* Cleaned up filepath_or_buffer

* Typo

* Cleaned up json_data

* Cleaned up tests

* Streamline functionality formerly in DetachingTextIOWrapper

* Forgot closing paren

* Consistent naming
ChrisWallace2020 committed Jul 14, 2021
1 parent 4ae669c commit a56f9e3
Showing 9 changed files with 108 additions and 93 deletions.
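
The "Streamline functionality formerly in DetachingTextIOWrapper" change boils down to wrapping a BytesIO in a plain TextIOWrapper for text-mode reads and detaching the wrapper on exit, instead of subclassing TextIOWrapper. A minimal sketch of that wrap-then-detach pattern (illustrative only, not code from this commit):

from io import BytesIO, TextIOWrapper

raw = BytesIO(b"a,b,c\n1,2,3\n")

# Wrap the binary buffer so it can be read as text.
wrapper = TextIOWrapper(raw, encoding="utf-8")
try:
    first_line = wrapper.readline()   # 'a,b,c\n'
finally:
    # detach() releases the underlying buffer without closing it,
    # so the caller's BytesIO survives the text-mode read.
    wrapper.detach()

raw.seek(0)
assert not raw.closed

Detaching rather than closing is what lets the handler give the original stream back to the caller, rewound to the start.
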
10 changes: 6 additions & 4 deletions dataprofiler/data_readers/csv_data.py
@@ -509,16 +509,18 @@ def _load_data_from_file(self, input_file_path):
"""

self._file_encoding = None
if not data_utils.is_stream_buffer(input_file_path) or isinstance(input_file_path, BytesIO):
self._file_encoding = data_utils.detect_file_encoding(input_file_path)
if not isinstance(input_file_path, StringIO):
self._file_encoding = \
data_utils.detect_file_encoding(input_file_path)

data_as_str = data_utils.load_as_str_from_file(input_file_path,
self._file_encoding)

if not self._delimiter or not self._checked_header:
delimiter, quotechar = None, None
if not self._delimiter or not self._quotechar:
delimiter, quotechar = self._guess_delimiter_and_quotechar(data_as_str)
delimiter, quotechar = \
self._guess_delimiter_and_quotechar(data_as_str)
if not self._delimiter:
self._delimiter = delimiter
if not self._quotechar:
@@ -580,7 +582,7 @@ def is_match(cls, file_path, options=None):
options = dict()

file_encoding = None
if not data_utils.is_stream_buffer(file_path) or isinstance(file_path, BytesIO):
if not isinstance(file_path, StringIO):
file_encoding = data_utils.detect_file_encoding(file_path=file_path)

delimiter = options.get("delimiter", None)
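
With the is_match change above, only StringIO input skips encoding detection; file paths and BytesIO still go through detect_file_encoding before the delimiter and quotechar checks. A hedged sketch of both call paths (class and module names as shown in this diff; the results depend on the detection logic):

from io import BytesIO, StringIO
from dataprofiler.data_readers.csv_data import CSVData

text_stream = StringIO("a,b,c\n1,2,3\n4,5,6\n")
byte_stream = BytesIO(b"a,b,c\n1,2,3\n4,5,6\n")

# StringIO is already decoded, so no encoding detection is attempted.
print(CSVData.is_match(text_stream))
# BytesIO still has its encoding detected before the CSV checks run.
print(CSVData.is_match(byte_stream))
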
12 changes: 4 additions & 8 deletions dataprofiler/data_readers/data_utils.py
@@ -560,12 +560,11 @@ def load_as_str_from_file(file_path, file_encoding=None, max_lines=10,
search_query_value = b'\n'

loc, occurance = find_nth_loc(sample_lines,
search_query=search_query_value,
n=remaining_lines)
search_query=search_query_value,
n=remaining_lines)

# Add sample_lines to data_as_str no more than max_lines
if (is_stream_buffer(file_path) and isinstance(sample_lines[:loc],\
bytes)):
if isinstance(sample_lines[:loc], bytes):
data_as_str += sample_lines[:loc].decode(file_encoding)
else:
data_as_str += sample_lines[:loc]
@@ -576,16 +575,13 @@

return data_as_str


def is_stream_buffer(filepath_or_buffer):
"""
Determines whether a given argument is a filepath or buffer.
:param filepath_or_buffer: path to the file or buffer
:type filepath_or_buffer: str
:param encoding: File encoding
:type encoding: str
:param seek: position to start in buffer
:type seek: int
:return: true if string is a buffer or false if string is a filepath
:rtype: boolean
"""
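
The trimmed docstring now matches the single filepath_or_buffer parameter the function actually takes: it answers whether the input is an in-memory stream rather than a filepath. A sketch of the behavior the docstring describes (the real helper lives in data_utils and may differ in detail):

from io import BytesIO, StringIO

def is_stream_buffer(filepath_or_buffer):
    # In-memory text/byte streams count as buffers; anything else
    # (e.g. a str path) is treated as a filepath.
    return isinstance(filepath_or_buffer, (StringIO, BytesIO))

assert is_stream_buffer(StringIO("a,b\n1,2"))
assert is_stream_buffer(BytesIO(b"a,b\n1,2"))
assert not is_stream_buffer("data/iris.csv")
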
48 changes: 25 additions & 23 deletions dataprofiler/data_readers/filepath_or_buffer.py
@@ -3,15 +3,17 @@
from . import data_utils


class FileOrBufferHandler():
class FileOrBufferHandler:
"""
FileOrBufferHandler class to read a filepath or buffer in and always return a readable buffer
FileOrBufferHandler class to read a filepath or buffer in and always
return a readable buffer.
"""

def __init__(self, filepath_or_buffer, open_method='r', encoding=None, seek_offset=None, seek_whence=0):
def __init__(self, filepath_or_buffer, open_method='r', encoding=None,
seek_offset=None, seek_whence=0):
"""
Context manager class used for inputing a file or buffer and returning a structure
that is always a buffer.
Context manager class used for inputting a file or buffer and returning
a structure that is always a buffer.
:param filepath_or_buffer: path to the file being loaded or buffer
:type filepath_or_buffer: Union[str, StringIO, BytesIO]
@@ -27,39 +29,39 @@ def __init__(self, filepath_or_buffer, open_method='r', encoding=None, seek_offs
self.seek_whence = seek_whence
self._encoding = encoding
self.original_type = type(filepath_or_buffer)
self._is_wrapped = False

def __enter__(self):
if isinstance(self._filepath_or_buffer, str):
self._filepath_or_buffer = open(
self._filepath_or_buffer, self.open_method, encoding=self._encoding)
self._filepath_or_buffer, self.open_method,
encoding=self._encoding)

elif isinstance(self._filepath_or_buffer, BytesIO) and self.open_method == 'r':
self._filepath_or_buffer = DetachingTextIOWrapper(self._filepath_or_buffer, encoding=self._encoding)
elif isinstance(self._filepath_or_buffer, BytesIO) \
and self.open_method == 'r':
self._filepath_or_buffer = \
TextIOWrapper(self._filepath_or_buffer, encoding=self._encoding)
self._is_wrapped = True

elif not data_utils.is_stream_buffer(self._filepath_or_buffer):
# Raise AttributeError if attribute value not found.
raise AttributeError(f'Type {type(self._filepath_or_buffer)} is invalid. \
filepath_or_buffer must be a string or StringIO/BytesIO object')
raise AttributeError(f'Type {type(self._filepath_or_buffer)} is '
f'invalid. filepath_or_buffer must be a '
f'string or StringIO/BytesIO object')

if self.seek_offset is not None:
self._filepath_or_buffer.seek(self.seek_offset, self.seek_whence)

return self._filepath_or_buffer

def __exit__(self, exc_type, exc_value, exc_traceback):
if isinstance(self._filepath_or_buffer, (StringIO, BytesIO, DetachingTextIOWrapper)):
# Need to detach buffer if wrapped (i.e. BytesIO opened with 'r')
if self._is_wrapped:
wrapper = self._filepath_or_buffer
self._filepath_or_buffer = wrapper.buffer
wrapper.detach()

if isinstance(self._filepath_or_buffer, (StringIO, BytesIO)):
self._filepath_or_buffer.seek(0)
else:
self._filepath_or_buffer.close()

class DetachingTextIOWrapper(TextIOWrapper):
"""
DetachingTextIOWrapper class is used to detach buffer to avoid buffer closing before it's returned
"""

def close(self):
self.detach()

def __del__(self):
if self.buffer:
self.detach()
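
Taken together, __enter__ now wraps a BytesIO opened in text mode with a standard TextIOWrapper and records _is_wrapped, and __exit__ detaches that wrapper and rewinds the caller's stream instead of closing it. A usage sketch, assuming the behavior shown in the diff above:

from io import BytesIO
from dataprofiler.data_readers.filepath_or_buffer import FileOrBufferHandler

stream = BytesIO(b"a,b\n1,2\n")

# Text-mode read over a byte stream: the handler wraps it internally.
with FileOrBufferHandler(stream, 'r', encoding='utf-8') as data_file:
    header = data_file.readline()   # 'a,b\n'

# On exit the wrapper is detached, so the caller's BytesIO is still
# open and has been rewound to the start.
assert not stream.closed
assert stream.read() == b"a,b\n1,2\n"
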
10 changes: 5 additions & 5 deletions dataprofiler/data_readers/json_data.py
@@ -1,15 +1,14 @@
from collections import OrderedDict
from dataprofiler.data_readers.filepath_or_buffer import FileOrBufferHandler
import json
import warnings
import types
from io import BytesIO
from six import StringIO

import numpy as np
import pandas as pd
from . import data_utils
from .base_data import BaseData
from .structured_mixins import SpreadSheetDataMixin
from .filepath_or_buffer import FileOrBufferHandler


class JSONData(SpreadSheetDataMixin, BaseData):
@@ -364,10 +363,11 @@ def is_match(cls, file_path, options=None):
options = dict()

file_encoding = None
if not data_utils.is_stream_buffer(file_path) or isinstance(file_path, BytesIO):
if not isinstance(file_path, StringIO):
file_encoding = data_utils.detect_file_encoding(file_path=file_path)

with FileOrBufferHandler(file_path, 'r', encoding=file_encoding) as data_file:
with FileOrBufferHandler(file_path, 'r', encoding=file_encoding) \
as data_file:
try:
json.load(data_file)
return True
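
As in csv_data, only StringIO input skips encoding detection here; the buffer is then handed to FileOrBufferHandler and probed with json.load. A hedged example of checking an in-memory document (import path as in the diff; the exact result also depends on logic not shown in this hunk):

from io import StringIO
from dataprofiler.data_readers.json_data import JSONData

buffer = StringIO('{"name": "iris", "rows": 150}')
# json.load succeeding on the wrapped buffer is what drives the match.
print(JSONData.is_match(buffer))
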
6 changes: 3 additions & 3 deletions dataprofiler/tests/data_readers/test_avro_data.py
@@ -34,14 +34,14 @@ def setUpClass(cls):

def test_is_match_for_byte_streams(self):
"""
Determine if the avro file can be automatically identified from byte stream
Determine if the avro file can be automatically identified from
byte stream
"""
for input_file in self.input_file_names:
# as BytesIO Stream
with open(input_file['path'], 'rb') as fp:
byte_string = BytesIO(fp.read())
input_data_obj = Data(byte_string)
self.assertEqual(input_data_obj.data_type, 'avro')
self.assertTrue(AVROData.is_match(byte_string))

def test_avro_file_identification(self):
"""
15 changes: 8 additions & 7 deletions dataprofiler/tests/data_readers/test_csv_data.py
@@ -163,23 +163,24 @@ def setUpClass(cls):

def test_is_match_for_string_streams(self):
"""
Determine if the csv file can be automatically identified from string stream
Determine if the csv file can be automatically identified from
string stream
"""
for input_file in self.input_file_names:
with open(input_file['path'], 'r', encoding=input_file['encoding']) as fp:
with open(input_file['path'], 'r',
encoding=input_file['encoding']) as fp:
byte_string = StringIO(fp.read())
input_data_obj = Data(byte_string)
self.assertEqual(input_data_obj.data_type, 'csv')
self.assertTrue(CSVData.is_match(byte_string))

def test_is_match_for_byte_streams(self):
"""
Determine if the csv file can be automatically identified from byte stream
Determine if the csv file can be automatically identified from
byte stream
"""
for input_file in self.input_file_names:
with open(input_file['path'], 'rb') as fp:
byte_string = BytesIO(fp.read())
input_data_obj = Data(byte_string)
self.assertEqual(input_data_obj.data_type, 'csv')
self.assertTrue(CSVData.is_match(byte_string))

def test_auto_file_identification(self):
"""
79 changes: 46 additions & 33 deletions dataprofiler/tests/data_readers/test_filepath_or_buffer.py
@@ -26,11 +26,12 @@ def setUpClass(cls):

def test_make_buffer_from_filepath(self):
"""
Make sure FileOrBufferHandler can input a file and read it similarly to open()
Make sure FileOrBufferHandler can input a file and read it similarly
to open()
"""
for input_file in self.input_file_names:
with FileOrBufferHandler(input_file['path'], 'r') as\
filepath_or_buffer, open(input_file['path'], 'r') as\
with FileOrBufferHandler(input_file['path'], 'r') as \
filepath_or_buffer, open(input_file['path'], 'r') as \
input_file_check:

# check first 100 lines
@@ -44,12 +45,14 @@ def test_pass_in_StringIO_buffer(self):

def test_pass_in_StringIO_buffer(self):
"""
Make sure FileOrBufferHandler can take StringIO and read it similarly to open()
Make sure FileOrBufferHandler can take StringIO and read it similarly
to open()
"""
for input_file in self.input_file_names:
with FileOrBufferHandler(StringIO(open(input_file['path'], 'r')
.read())) as filepath_or_buffer, open(input_file['path'], 'r')\
as input_file_check:
with FileOrBufferHandler(StringIO(
open(input_file['path'], 'r').read())) as \
filepath_or_buffer, open(input_file['path'], 'r') as \
input_file_check:

# check first 100 lines
for i in range(0, 100):
@@ -58,13 +61,15 @@ def test_pass_in_StringIO_seek_buffer(self):

def test_pass_in_StringIO_seek_buffer(self):
"""
Make sure FileOrBufferHandler can take StringIO with seek and read it similarly to open() with seek
Make sure FileOrBufferHandler can take StringIO with seek and read it
similarly to open() with seek
"""
for input_file in self.input_file_names:
print(input_file)
seek_offset_test = 100
with FileOrBufferHandler(StringIO(open(input_file['path'], 'rb')
.read().decode()), seek_offset=seek_offset_test) as filepath_or_buffer,\
with FileOrBufferHandler(StringIO(
open(input_file['path'], 'rb').read().decode()),
seek_offset=seek_offset_test) as filepath_or_buffer, \
open(input_file['path'], 'rb') as input_file_check:

input_file_check.seek(seek_offset_test)
Expand All @@ -76,12 +81,15 @@ def test_pass_in_StringIO_seek_buffer(self):

def test_pass_in_BytesIO_buffer(self):
"""
Make sure FileOrBufferHandler can take BytesIO and read it similarly to open()
Make sure FileOrBufferHandler can take BytesIO and read it similarly
to open()
"""
for input_file in self.input_file_names:
with FileOrBufferHandler(BytesIO(open(input_file['path'], 'rb').
read())) as filepath_or_buffer, TextIOWrapper(open(input_file['path'], 'rb'))\
as input_file_check:
with FileOrBufferHandler(BytesIO(
open(input_file['path'], 'rb').read())) as \
filepath_or_buffer, \
TextIOWrapper(open(input_file['path'], 'rb')) as \
input_file_check:

# check first 100 lines
for i in range(0, 100):
Expand All @@ -90,13 +98,16 @@ def test_pass_in_BytesIO_buffer(self):

def test_pass_in_BytesIO_seek_buffer(self):
"""
Make sure FileOrBufferHandler can take BytesIO with seek and read it similarly to open() with seek
Make sure FileOrBufferHandler can take BytesIO with seek and read it
similarly to open() with seek
"""
for input_file in self.input_file_names:
seek_offset_test = 500
with FileOrBufferHandler(BytesIO(open(input_file['path'], 'rb').
read()), seek_offset=seek_offset_test) as filepath_or_buffer,\
TextIOWrapper(open(input_file['path'], 'rb')) as input_file_check:
with FileOrBufferHandler(BytesIO(
open(input_file['path'], 'rb').read()),
seek_offset=seek_offset_test) as filepath_or_buffer, \
TextIOWrapper(open(input_file['path'], 'rb')) as \
input_file_check:

input_file_check.seek(seek_offset_test)

@@ -107,36 +118,38 @@ def test_pass_in_BytesIO_seek_buffer(self):

def test_make_buffer_from_filepath_and_encoding(self):
"""
Make sure FileOrBufferHandler can input a file and read it similarly to open() with encoding
Make sure FileOrBufferHandler can input a file and read it similarly
to open() with encoding
"""
file_name =os.path.join(os.path.join(test_root_path, 'data'), \
'csv/iris-utf-16.csv')
file_name = os.path.join(os.path.join(test_root_path, 'data'),
'csv/iris-utf-16.csv')
file_encoding='utf-16'
with FileOrBufferHandler(file_name, 'r', encoding=file_encoding) as\
filepath_or_buffer, open(file_name, 'r', encoding=file_encoding) as\
input_file_check:
with FileOrBufferHandler(file_name, 'r', encoding=file_encoding) \
as filepath_or_buffer, \
open(file_name, 'r', encoding=file_encoding) \
as input_file_check:

# check first 100 lines
for i in range(0, 100):
self.assertEqual(filepath_or_buffer.readline(),
input_file_check.readline())
input_file_check.readline())

# check that file was properly closed
self.assertEqual(filepath_or_buffer.closed,
input_file_check.closed)
input_file_check.closed)

def test_make_buffer_error_message(self):
"""
Check FileOrBufferHandler asserts proper attribute error
"""
file_name = dict(not_a_valid="option")
with self.assertRaisesRegex(AttributeError, "Type.*is invalid. \
filepath_or_buffer must be a string or StringIO/BytesIO object"):
with FileOrBufferHandler(file_name, 'r') as\
filepath_or_buffer, open(file_name, 'r') as\
input_file_check:
filepath_or_buffer.readline(),
input_file_check.readline()
msg = (f"Type {type(file_name)} is invalid. filepath_or_buffer must "
f"be a string or StringIO/BytesIO object")
with self.assertRaisesRegex(AttributeError, msg):
with FileOrBufferHandler(file_name, 'r') as filepath_or_buffer, \
open(file_name, 'r') as input_file_check:
filepath_or_buffer.readline(),
input_file_check.readline()


if __name__ == '__main__':