Skip to content

Commit

Permalink
Filter nested properties based on metadata (#130)
Browse files Browse the repository at this point in the history
* Support filtering of nested fields

Update filter_data_by_metadata function to allow filtering of nested fields - e.g. if property `address` has selected set to True, but property `address.street` has selected set to False, only the street would be excluded.

Processes data recursively.

* Update transform.py

make formatting a little clearer

* Update transform.py

Fix array type breadcrumb name

* Update transform.py

breadcrumb path documentation

* Update transform.py

change based on tests - must remove field from data object, not just set value to None.

* Update transform.py

line length :)

* Add tests for filtering nested fields

* Make pylint happy

* Simplify one line

Co-authored-by: Chris Goddard <[email protected]>
  • Loading branch information
judahrand and chrisgoddard authored Oct 27, 2020
1 parent 6c6c773 commit 9953adb
Show file tree
Hide file tree
Showing 3 changed files with 68 additions and 13 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ install: check_prereqs
python3 -m pip install -e '.[dev]'

test: install
pylint singer -d missing-docstring,broad-except,bare-except,too-many-return-statements,too-many-branches,too-many-arguments,no-else-return,too-few-public-methods,fixme,protected-access
pylint singer --extension-pkg-whitelist=ciso8601 -d missing-docstring,broad-except,bare-except,too-many-return-statements,too-many-branches,too-many-arguments,no-else-return,too-few-public-methods,fixme,protected-access
nosetests --with-doctest -v
36 changes: 24 additions & 12 deletions singer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ def unix_seconds_to_datetime(value):
return strftime(datetime.datetime.fromtimestamp(int(value), datetime.timezone.utc))


def breadcrumb_path(breadcrumb):
    """
    Transform breadcrumb into familiar object dot-notation.

    The breadcrumb is walked token by token instead of being joined and
    patched with ``str.replace``, so field names that merely contain the
    words ``properties`` or ``items`` (``my_properties``, ``items_count``,
    or a nested field literally called ``items``) are rendered unchanged.

    :param breadcrumb: iterable of string tokens, e.g.
        ``('properties', 'addrs', 'items', 'properties', 'amount')``.
    :return: dotted path string, e.g. ``'addrs[].amount'``; an empty
        breadcrumb yields ``''``.

    >>> breadcrumb_path(('properties', 'addrs', 'items', 'properties', 'amount'))
    'addrs[].amount'
    >>> breadcrumb_path(('properties', 'addr', 'properties', 'items_count'))
    'addr.items_count'
    """
    parts = []
    tokens = iter(breadcrumb)
    for token in tokens:
        if token == 'properties':
            # 'properties' is a schema keyword: the token that follows it is
            # the actual field name, whatever that name happens to be.
            field_name = next(tokens, None)
            # A dangling keyword with nothing after it is kept verbatim.
            parts.append(token if field_name is None else field_name)
        elif token == 'items' and parts:
            # 'items' marks "each element of the preceding array".
            parts[-1] += '[]'
        else:
            # Anything else (including a leading 'items', which has no
            # preceding segment to attach to) is emitted as-is.
            parts.append(token)
    return ".".join(parts)


class SchemaMismatch(Exception):
def __init__(self, errors):
if not errors:
Expand All @@ -46,7 +56,7 @@ def __init__(self, errors):
msg = "Errors during transform\n\t{}".format("\n\t".join(estrs))
msg += "\n\n\nErrors during transform: [{}]".format(", ".join(estrs))

super(SchemaMismatch, self).__init__(msg)
super().__init__(msg)

class SchemaKey:
ref = "$ref"
Expand Down Expand Up @@ -110,25 +120,27 @@ def __enter__(self):
def __exit__(self, *args):
self.log_warning()

def filter_data_by_metadata(self, data, metadata):
def filter_data_by_metadata(self, data, metadata, parent=()):
if isinstance(data, dict) and metadata:
for field_name in list(data.keys()):
selected = singer.metadata.get(metadata, ('properties', field_name), 'selected')
inclusion = singer.metadata.get(metadata, ('properties', field_name), 'inclusion')
breadcrumb = parent + ('properties', field_name)
selected = singer.metadata.get(metadata, breadcrumb, 'selected')
inclusion = singer.metadata.get(metadata, breadcrumb, 'inclusion')
if inclusion == 'automatic':
continue

if selected is False:
if (selected is False) or (inclusion == 'unsupported'):
data.pop(field_name, None)
# Track that a field was filtered because the customer
# didn't select it.
self.filtered.add(field_name)
# didn't select it or the tap declared it as unsupported.
self.filtered.add(breadcrumb_path(breadcrumb))
else:
data[field_name] = self.filter_data_by_metadata(
data[field_name], metadata, breadcrumb)

if inclusion == 'unsupported':
data.pop(field_name, None)
# Track that the field was filtered because the tap
# declared it as unsupported.
self.filtered.add(field_name)
if isinstance(data, list) and metadata:
breadcrumb = parent + ('items',)
data = [self.filter_data_by_metadata(d, metadata, breadcrumb) for d in data]

return data

Expand Down
43 changes: 43 additions & 0 deletions tests/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,49 @@ def test_drops_fields_which_are_unsupported(self):
dict_value = {"name": "chicken"}
self.assertEqual({}, transform(dict_value, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata))

def test_drops_nested_object_fields_which_are_unselected(self):
    # The parent object is selected; only the child whose own nested
    # breadcrumb is marked unselected should disappear from the output.
    address_properties = {
        "addr1": {"type": "string"},
        "city": {"type": "string"},
        "state": {"type": "string"},
        "amount": {"type": "integer"},
    }
    schema = {
        "type": "object",
        "properties": {
            "addr": {"type": "object", "properties": address_properties},
        },
    }
    metadata = {
        ("properties", "addr"): {"selected": True},
        ("properties", "addr", "properties", "amount"): {"selected": False},
    }
    record = {
        "addr": {
            "addr1": "address_1",
            "city": "city_1",
            "state": "state_1",
            "amount": "123",
        },
    }
    expected = {
        "addr": {"addr1": "address_1", "city": "city_1", "state": "state_1"},
    }

    actual = transform(record, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata)

    self.assertDictEqual(expected, actual)

def test_drops_nested_array_fields_which_are_unselected(self):
    # The array itself is selected; the unselected child is addressed
    # through the 'items' breadcrumb and must be removed from every element.
    address_properties = {
        "addr1": {"type": "string"},
        "city": {"type": "string"},
        "state": {"type": "string"},
        "amount": {"type": "integer"},
    }
    schema = {
        "type": "object",
        "properties": {
            "addrs": {
                "type": "array",
                "items": {"type": "object", "properties": address_properties},
            },
        },
    }
    metadata = {
        ("properties", "addrs"): {"selected": True},
        ("properties", "addrs", "items", "properties", "amount"): {"selected": False},
    }
    record = {
        "addrs": [
            {"addr1": "address_1", "city": "city_1", "state": "state_1", "amount": "123"},
            {"addr1": "address_2", "city": "city_2", "state": "state_2", "amount": "456"},
        ],
    }
    expected = {
        "addrs": [
            {"addr1": "address_1", "city": "city_1", "state": "state_1"},
            {"addr1": "address_2", "city": "city_2", "state": "state_2"},
        ],
    }

    actual = transform(record, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata)

    self.assertDictEqual(expected, actual)

class TestResolveSchemaReferences(unittest.TestCase):
def test_internal_refs_resolve(self):
schema = {"type": "object",
Expand Down

0 comments on commit 9953adb

Please sign in to comment.