Refactor StringIO/ByteIO changes made to is_match functions (#326)
* Cleaned up data_utils

* Cleaned up csv_data

* Cleaned up filepath_or_buffer

* Typo

* Cleaned up json_data

* Cleaned up tests

* Streamline functionality formerly in DetachingTextIOWrapper

* Forgot closing paren

* Consistent naming
ChrisWallace2020 committed Jul 14, 2021
1 parent 4ae669c commit a56f9e3
Showing 9 changed files with 108 additions and 93 deletions.
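
The "Streamline functionality formerly in DetachingTextIOWrapper" change boils down to wrapping a BytesIO in a plain TextIOWrapper for text-mode reads and detaching the wrapper on exit, instead of subclassing TextIOWrapper. A minimal sketch of that wrap-then-detach pattern (illustrative only, not code from this commit):

from io import BytesIO, TextIOWrapper

raw = BytesIO(b"a,b,c\n1,2,3\n")

# Wrap the binary buffer so it can be read as text.
wrapper = TextIOWrapper(raw, encoding="utf-8")
try:
    first_line = wrapper.readline()   # 'a,b,c\n'
finally:
    # detach() releases the underlying buffer without closing it,
    # so the caller's BytesIO survives the text-mode read.
    wrapper.detach()

raw.seek(0)
assert not raw.closed

Detaching rather than closing is what lets the handler give the original stream back to the caller, rewound to the start.
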
10 changes: 6 additions & 4 deletions dataprofiler/data_readers/csv_data.py
@@ -509,16 +509,18 @@ def _load_data_from_file(self, input_file_path):
"""

self._file_encoding = None
if not data_utils.is_stream_buffer(input_file_path) or isinstance(input_file_path, BytesIO):
self._file_encoding = data_utils.detect_file_encoding(input_file_path)
if not isinstance(input_file_path, StringIO):
self._file_encoding = \
data_utils.detect_file_encoding(input_file_path)

data_as_str = data_utils.load_as_str_from_file(input_file_path,
self._file_encoding)

if not self._delimiter or not self._checked_header:
delimiter, quotechar = None, None
if not self._delimiter or not self._quotechar:
delimiter, quotechar = self._guess_delimiter_and_quotechar(data_as_str)
delimiter, quotechar = \
self._guess_delimiter_and_quotechar(data_as_str)
if not self._delimiter:
self._delimiter = delimiter
if not self._quotechar:
@@ -580,7 +582,7 @@ def is_match(cls, file_path, options=None):
options = dict()

file_encoding = None
if not data_utils.is_stream_buffer(file_path) or isinstance(file_path, BytesIO):
if not isinstance(file_path, StringIO):
file_encoding = data_utils.detect_file_encoding(file_path=file_path)

delimiter = options.get("delimiter", None)
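
With the is_match change above, only StringIO input skips encoding detection; file paths and BytesIO still go through detect_file_encoding before the delimiter and quotechar checks. A hedged sketch of both call paths (class and module names as shown in this diff; the results depend on the detection logic):

from io import BytesIO, StringIO
from dataprofiler.data_readers.csv_data import CSVData

text_stream = StringIO("a,b,c\n1,2,3\n4,5,6\n")
byte_stream = BytesIO(b"a,b,c\n1,2,3\n4,5,6\n")

# StringIO is already decoded, so no encoding detection is attempted.
print(CSVData.is_match(text_stream))
# BytesIO still has its encoding detected before the CSV checks run.
print(CSVData.is_match(byte_stream))
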
12 changes: 4 additions & 8 deletions dataprofiler/data_readers/data_utils.py
@@ -560,12 +560,11 @@ def load_as_str_from_file(file_path, file_encoding=None, max_lines=10,
search_query_value = b'\n'

loc, occurance = find_nth_loc(sample_lines,
search_query=search_query_value,
n=remaining_lines)
search_query=search_query_value,
n=remaining_lines)

# Add sample_lines to data_as_str no more than max_lines
if (is_stream_buffer(file_path) and isinstance(sample_lines[:loc],\
bytes)):
if isinstance(sample_lines[:loc], bytes):
data_as_str += sample_lines[:loc].decode(file_encoding)
else:
data_as_str += sample_lines[:loc]
@@ -576,16 +575,13 @@

return data_as_str


def is_stream_buffer(filepath_or_buffer):
"""
Determines whether a given argument is a filepath or buffer.
:param filepath_or_buffer: path to the file or buffer
:type filepath_or_buffer: str
:param encoding: File encoding
:type encoding: str
:param seek: position to start in buffer
:type seek: int
:return: true if string is a buffer or false if string is a filepath
:rtype: boolean
"""
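
The trimmed docstring now matches the single filepath_or_buffer parameter the function actually takes: it answers whether the input is an in-memory stream rather than a filepath. A sketch of the behavior the docstring describes (the real helper lives in data_utils and may differ in detail):

from io import BytesIO, StringIO

def is_stream_buffer(filepath_or_buffer):
    # In-memory text/byte streams count as buffers; anything else
    # (e.g. a str path) is treated as a filepath.
    return isinstance(filepath_or_buffer, (StringIO, BytesIO))

assert is_stream_buffer(StringIO("a,b\n1,2"))
assert is_stream_buffer(BytesIO(b"a,b\n1,2"))
assert not is_stream_buffer("data/iris.csv")
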
48 changes: 25 additions & 23 deletions dataprofiler/data_readers/filepath_or_buffer.py
@@ -3,15 +3,17 @@
from . import data_utils


class FileOrBufferHandler():
class FileOrBufferHandler:
"""
FileOrBufferHandler class to read a filepath or buffer in and always return a readable buffer
FileOrBufferHandler class to read a filepath or buffer in and always
return a readable buffer.
"""

def __init__(self, filepath_or_buffer, open_method='r', encoding=None, seek_offset=None, seek_whence=0):
def __init__(self, filepath_or_buffer, open_method='r', encoding=None,
seek_offset=None, seek_whence=0):
"""
Context manager class used for inputing a file or buffer and returning a structure
that is always a buffer.
Context manager class used for inputting a file or buffer and returning
a structure that is always a buffer.
:param filepath_or_buffer: path to the file being loaded or buffer
:type filepath_or_buffer: Union[str, StringIO, BytesIO]
@@ -27,39 +29,39 @@ def __init__(self, filepath_or_buffer, open_method='r', encoding=None, seek_offs
self.seek_whence = seek_whence
self._encoding = encoding
self.original_type = type(filepath_or_buffer)
self._is_wrapped = False

def __enter__(self):
if isinstance(self._filepath_or_buffer, str):
self._filepath_or_buffer = open(
self._filepath_or_buffer, self.open_method, encoding=self._encoding)
self._filepath_or_buffer, self.open_method,
encoding=self._encoding)

elif isinstance(self._filepath_or_buffer, BytesIO) and self.open_method == 'r':
self._filepath_or_buffer = DetachingTextIOWrapper(self._filepath_or_buffer, encoding=self._encoding)
elif isinstance(self._filepath_or_buffer, BytesIO) \
and self.open_method == 'r':
self._filepath_or_buffer = \
TextIOWrapper(self._filepath_or_buffer, encoding=self._encoding)
self._is_wrapped = True

elif not data_utils.is_stream_buffer(self._filepath_or_buffer):
# Raise AttributeError if attribute value not found.
raise AttributeError(f'Type {type(self._filepath_or_buffer)} is invalid. \
filepath_or_buffer must be a string or StringIO/BytesIO object')
raise AttributeError(f'Type {type(self._filepath_or_buffer)} is '
f'invalid. filepath_or_buffer must be a '
f'string or StringIO/BytesIO object')

if self.seek_offset is not None:
self._filepath_or_buffer.seek(self.seek_offset, self.seek_whence)

return self._filepath_or_buffer

def __exit__(self, exc_type, exc_value, exc_traceback):
if isinstance(self._filepath_or_buffer, (StringIO, BytesIO, DetachingTextIOWrapper)):
# Need to detach buffer if wrapped (i.e. BytesIO opened with 'r')
if self._is_wrapped:
wrapper = self._filepath_or_buffer
self._filepath_or_buffer = wrapper.buffer
wrapper.detach()

if isinstance(self._filepath_or_buffer, (StringIO, BytesIO)):
self._filepath_or_buffer.seek(0)
else:
self._filepath_or_buffer.close()

class DetachingTextIOWrapper(TextIOWrapper):
"""
DetachingTextIOWrapper class is used to detach buffer to avoid buffer closing before it's returned
"""

def close(self):
self.detach()

def __del__(self):
if self.buffer:
self.detach()
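
Taken together, __enter__ now wraps a BytesIO opened in text mode with a standard TextIOWrapper and records _is_wrapped, and __exit__ detaches that wrapper and rewinds the caller's stream instead of closing it. A usage sketch, assuming the behavior shown in the diff above:

from io import BytesIO
from dataprofiler.data_readers.filepath_or_buffer import FileOrBufferHandler

stream = BytesIO(b"a,b\n1,2\n")

# Text-mode read over a byte stream: the handler wraps it internally.
with FileOrBufferHandler(stream, 'r', encoding='utf-8') as data_file:
    header = data_file.readline()   # 'a,b\n'

# On exit the wrapper is detached, so the caller's BytesIO is still
# open and has been rewound to the start.
assert not stream.closed
assert stream.read() == b"a,b\n1,2\n"
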
10 changes: 5 additions & 5 deletions dataprofiler/data_readers/json_data.py
@@ -1,15 +1,14 @@
from collections import OrderedDict
from dataprofiler.data_readers.filepath_or_buffer import FileOrBufferHandler
import json
import warnings
import types
from io import BytesIO
from six import StringIO

import numpy as np
import pandas as pd
from . import data_utils
from .base_data import BaseData
from .structured_mixins import SpreadSheetDataMixin
from .filepath_or_buffer import FileOrBufferHandler


class JSONData(SpreadSheetDataMixin, BaseData):
@@ -364,10 +363,11 @@ def is_match(cls, file_path, options=None):
options = dict()

file_encoding = None
if not data_utils.is_stream_buffer(file_path) or isinstance(file_path, BytesIO):
if not isinstance(file_path, StringIO):
file_encoding = data_utils.detect_file_encoding(file_path=file_path)

with FileOrBufferHandler(file_path, 'r', encoding=file_encoding) as data_file:
with FileOrBufferHandler(file_path, 'r', encoding=file_encoding) \
as data_file:
try:
json.load(data_file)
return True
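
As in csv_data, only StringIO input skips encoding detection here; the buffer is then handed to FileOrBufferHandler and probed with json.load. A hedged example of checking an in-memory document (import path as in the diff; the exact result also depends on logic not shown in this hunk):

from io import StringIO
from dataprofiler.data_readers.json_data import JSONData

buffer = StringIO('{"name": "iris", "rows": 150}')
# json.load succeeding on the wrapped buffer is what drives the match.
print(JSONData.is_match(buffer))
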
6 changes: 3 additions & 3 deletions dataprofiler/tests/data_readers/test_avro_data.py
@@ -34,14 +34,14 @@ def setUpClass(cls):

def test_is_match_for_byte_streams(self):
"""
Determine if the avro file can be automatically identified from byte stream
Determine if the avro file can be automatically identified from
byte stream
"""
for input_file in self.input_file_names:
# as BytesIO Stream
with open(input_file['path'], 'rb') as fp:
byte_string = BytesIO(fp.read())
input_data_obj = Data(byte_string)
self.assertEqual(input_data_obj.data_type, 'avro')
self.assertTrue(AVROData.is_match(byte_string))

def test_avro_file_identification(self):
"""
15 changes: 8 additions & 7 deletions dataprofiler/tests/data_readers/test_csv_data.py
@@ -163,23 +163,24 @@ def setUpClass(cls):

def test_is_match_for_string_streams(self):
"""
Determine if the csv file can be automatically identified from string stream
Determine if the csv file can be automatically identified from
string stream
"""
for input_file in self.input_file_names:
with open(input_file['path'], 'r', encoding=input_file['encoding']) as fp:
with open(input_file['path'], 'r',
encoding=input_file['encoding']) as fp:
byte_string = StringIO(fp.read())
input_data_obj = Data(byte_string)
self.assertEqual(input_data_obj.data_type, 'csv')
self.assertTrue(CSVData.is_match(byte_string))

def test_is_match_for_byte_streams(self):
"""
Determine if the csv file can be automatically identified from byte stream
Determine if the csv file can be automatically identified from
byte stream
"""
for input_file in self.input_file_names:
with open(input_file['path'], 'rb') as fp:
byte_string = BytesIO(fp.read())
input_data_obj = Data(byte_string)
self.assertEqual(input_data_obj.data_type, 'csv')
self.assertTrue(CSVData.is_match(byte_string))

def test_auto_file_identification(self):
"""
79 changes: 46 additions & 33 deletions dataprofiler/tests/data_readers/test_filepath_or_buffer.py
@@ -26,11 +26,12 @@ def setUpClass(cls):

def test_make_buffer_from_filepath(self):
"""
Make sure FileOrBufferHandler can input a file and read it similarly to open()
Make sure FileOrBufferHandler can input a file and read it similarly
to open()
"""
for input_file in self.input_file_names:
with FileOrBufferHandler(input_file['path'], 'r') as\
filepath_or_buffer, open(input_file['path'], 'r') as\
with FileOrBufferHandler(input_file['path'], 'r') as \
filepath_or_buffer, open(input_file['path'], 'r') as \
input_file_check:

# check first 100 lines
@@ -44,12 +45,14 @@ def test_pass_in_StringIO_buffer(self):

def test_pass_in_StringIO_buffer(self):
"""
Make sure FileOrBufferHandler can take StringIO and read it similarly to open()
Make sure FileOrBufferHandler can take StringIO and read it similarly
to open()
"""
for input_file in self.input_file_names:
with FileOrBufferHandler(StringIO(open(input_file['path'], 'r')
.read())) as filepath_or_buffer, open(input_file['path'], 'r')\
as input_file_check:
with FileOrBufferHandler(StringIO(
open(input_file['path'], 'r').read())) as \
filepath_or_buffer, open(input_file['path'], 'r') as \
input_file_check:

# check first 100 lines
for i in range(0, 100):
@@ -58,13 +61,15 @@ def test_pass_in_StringIO_seek_buffer(self):

def test_pass_in_StringIO_seek_buffer(self):
"""
Make sure FileOrBufferHandler can take StringIO with seek and read it similarly to open() with seek
Make sure FileOrBufferHandler can take StringIO with seek and read it
similarly to open() with seek
"""
for input_file in self.input_file_names:
print(input_file)
seek_offset_test = 100
with FileOrBufferHandler(StringIO(open(input_file['path'], 'rb')
.read().decode()), seek_offset=seek_offset_test) as filepath_or_buffer,\
with FileOrBufferHandler(StringIO(
open(input_file['path'], 'rb').read().decode()),
seek_offset=seek_offset_test) as filepath_or_buffer, \
open(input_file['path'], 'rb') as input_file_check:

input_file_check.seek(seek_offset_test)
Expand All @@ -76,12 +81,15 @@ def test_pass_in_StringIO_seek_buffer(self):

def test_pass_in_BytesIO_buffer(self):
"""
Make sure FileOrBufferHandler can take BytesIO and read it similarly to open()
Make sure FileOrBufferHandler can take BytesIO and read it similarly
to open()
"""
for input_file in self.input_file_names:
with FileOrBufferHandler(BytesIO(open(input_file['path'], 'rb').
read())) as filepath_or_buffer, TextIOWrapper(open(input_file['path'], 'rb'))\
as input_file_check:
with FileOrBufferHandler(BytesIO(
open(input_file['path'], 'rb').read())) as \
filepath_or_buffer, \
TextIOWrapper(open(input_file['path'], 'rb')) as \
input_file_check:

# check first 100 lines
for i in range(0, 100):
Expand All @@ -90,13 +98,16 @@ def test_pass_in_BytesIO_buffer(self):

def test_pass_in_BytesIO_seek_buffer(self):
"""
Make sure FileOrBufferHandler can take BytesIO with seek and read it similarly to open() with seek
Make sure FileOrBufferHandler can take BytesIO with seek and read it
similarly to open() with seek
"""
for input_file in self.input_file_names:
seek_offset_test = 500
with FileOrBufferHandler(BytesIO(open(input_file['path'], 'rb').
read()), seek_offset=seek_offset_test) as filepath_or_buffer,\
TextIOWrapper(open(input_file['path'], 'rb')) as input_file_check:
with FileOrBufferHandler(BytesIO(
open(input_file['path'], 'rb').read()),
seek_offset=seek_offset_test) as filepath_or_buffer, \
TextIOWrapper(open(input_file['path'], 'rb')) as \
input_file_check:

input_file_check.seek(seek_offset_test)

@@ -107,36 +118,38 @@ def test_pass_in_BytesIO_seek_buffer(self):

def test_make_buffer_from_filepath_and_encoding(self):
"""
Make sure FileOrBufferHandler can input a file and read it similarly to open() with encoding
Make sure FileOrBufferHandler can input a file and read it similarly
to open() with encoding
"""
file_name =os.path.join(os.path.join(test_root_path, 'data'), \
'csv/iris-utf-16.csv')
file_name = os.path.join(os.path.join(test_root_path, 'data'),
'csv/iris-utf-16.csv')
file_encoding='utf-16'
with FileOrBufferHandler(file_name, 'r', encoding=file_encoding) as\
filepath_or_buffer, open(file_name, 'r', encoding=file_encoding) as\
input_file_check:
with FileOrBufferHandler(file_name, 'r', encoding=file_encoding) \
as filepath_or_buffer, \
open(file_name, 'r', encoding=file_encoding) \
as input_file_check:

# check first 100 lines
for i in range(0, 100):
self.assertEqual(filepath_or_buffer.readline(),
input_file_check.readline())
input_file_check.readline())

# check that file was properly closed
self.assertEqual(filepath_or_buffer.closed,
input_file_check.closed)
input_file_check.closed)

def test_make_buffer_error_message(self):
"""
Check FileOrBufferHandler asserts proper attribute error
"""
file_name = dict(not_a_valid="option")
with self.assertRaisesRegex(AttributeError, "Type.*is invalid. \
filepath_or_buffer must be a string or StringIO/BytesIO object"):
with FileOrBufferHandler(file_name, 'r') as\
filepath_or_buffer, open(file_name, 'r') as\
input_file_check:
filepath_or_buffer.readline(),
input_file_check.readline()
msg = (f"Type {type(file_name)} is invalid. filepath_or_buffer must "
f"be a string or StringIO/BytesIO object")
with self.assertRaisesRegex(AttributeError, msg):
with FileOrBufferHandler(file_name, 'r') as filepath_or_buffer, \
open(file_name, 'r') as input_file_check:
filepath_or_buffer.readline(),
input_file_check.readline()


if __name__ == '__main__':