Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter nested properties based on metadata #130

Merged
merged 10 commits into from
Oct 27, 2020
Merged
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ install: check_prereqs
python3 -m pip install -e '.[dev]'

test: install
pylint singer -d missing-docstring,broad-except,bare-except,too-many-return-statements,too-many-branches,too-many-arguments,no-else-return,too-few-public-methods,fixme,protected-access
pylint singer --extension-pkg-whitelist=ciso8601 -d missing-docstring,broad-except,bare-except,too-many-return-statements,too-many-branches,too-many-arguments,no-else-return,too-few-public-methods,fixme,protected-access
nosetests --with-doctest -v
36 changes: 24 additions & 12 deletions singer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ def unix_seconds_to_datetime(value):
return strftime(datetime.datetime.fromtimestamp(int(value), datetime.timezone.utc))


def breadcrumb_path(breadcrumb):
    """
    Transform breadcrumb into familiar object dot-notation

    A breadcrumb is a sequence of strings in which the keyword
    'properties' introduces the name of an object field and the keyword
    'items' marks a step into the elements of an array, e.g.
    ('properties', 'addrs', 'items', 'properties', 'amount') becomes
    'addrs[].amount'.

    The breadcrumb is walked element by element rather than rewritten
    with str.replace, so field names that merely contain a keyword
    ('items_count', 'subproperties', or a field literally called
    'items') are rendered unchanged instead of being mangled.

    Returns the dotted path as a string; an empty breadcrumb yields ''.
    """
    segments = []
    expect_field_name = False
    for part in breadcrumb:
        if expect_field_name:
            # The element right after 'properties' is always a field
            # name, even when it happens to spell 'items' or 'properties'.
            segments.append(part)
            expect_field_name = False
        elif part == 'properties':
            expect_field_name = True
        elif part == 'items' and segments:
            # Array step: decorate the enclosing field instead of adding
            # a new dotted segment.
            segments[-1] += '[]'
        else:
            # Anything unexpected (including a leading 'items') is kept
            # verbatim so no information is lost.
            segments.append(part)

    if expect_field_name:
        # A dangling 'properties' with no field name after it is kept
        # literally.
        segments.append('properties')

    return ".".join(segments)


class SchemaMismatch(Exception):
def __init__(self, errors):
if not errors:
Expand All @@ -46,7 +56,7 @@ def __init__(self, errors):
msg = "Errors during transform\n\t{}".format("\n\t".join(estrs))
msg += "\n\n\nErrors during transform: [{}]".format(", ".join(estrs))

super(SchemaMismatch, self).__init__(msg)
super().__init__(msg)

class SchemaKey:
ref = "$ref"
Expand Down Expand Up @@ -110,25 +120,27 @@ def __enter__(self):
def __exit__(self, *args):
self.log_warning()

def filter_data_by_metadata(self, data, metadata):
def filter_data_by_metadata(self, data, metadata, parent=()):
if isinstance(data, dict) and metadata:
for field_name in list(data.keys()):
selected = singer.metadata.get(metadata, ('properties', field_name), 'selected')
inclusion = singer.metadata.get(metadata, ('properties', field_name), 'inclusion')
breadcrumb = parent + ('properties', field_name)
selected = singer.metadata.get(metadata, breadcrumb, 'selected')
inclusion = singer.metadata.get(metadata, breadcrumb, 'inclusion')
if inclusion == 'automatic':
continue

if selected is False:
if (selected is False) or (inclusion == 'unsupported'):
data.pop(field_name, None)
# Track that a field was filtered because the customer
# didn't select it.
self.filtered.add(field_name)
# didn't select it or the tap declared it as unsupported.
self.filtered.add(breadcrumb_path(breadcrumb))
else:
data[field_name] = self.filter_data_by_metadata(
data[field_name], metadata, breadcrumb)

if inclusion == 'unsupported':
data.pop(field_name, None)
# Track that the field was filtered because the tap
# declared it as unsupported.
self.filtered.add(field_name)
if isinstance(data, list) and metadata:
breadcrumb = parent + ('items',)
data = [self.filter_data_by_metadata(d, metadata, breadcrumb) for d in data]

return data

Expand Down
43 changes: 43 additions & 0 deletions tests/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,49 @@ def test_drops_fields_which_are_unsupported(self):
dict_value = {"name": "chicken"}
self.assertEqual({}, transform(dict_value, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata))

def test_drops_nested_object_fields_which_are_unselected(self):
    """A deselected property inside a selected object is removed from the record."""
    address_properties = {
        "addr1": {"type": "string"},
        "city": {"type": "string"},
        "state": {"type": "string"},
        "amount": {"type": "integer"},
    }
    schema = {
        "type": "object",
        "properties": {
            "addr": {"type": "object", "properties": address_properties},
        },
    }
    # Only the nested 'amount' field is explicitly deselected.
    metadata = {
        ("properties", "addr"): {"selected": True},
        ("properties", "addr", "properties", "amount"): {"selected": False},
    }
    record = {
        "addr": {
            "addr1": "address_1",
            "city": "city_1",
            "state": "state_1",
            "amount": "123",
        },
    }
    expected = {
        "addr": {
            "addr1": "address_1",
            "city": "city_1",
            "state": "state_1",
        },
    }

    actual = transform(record, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata)

    self.assertDictEqual(expected, actual)

def test_drops_nested_array_fields_which_are_unselected(self):
    """A deselected property is removed from every object in a selected array."""
    address_schema = {
        "type": "object",
        "properties": {
            "addr1": {"type": "string"},
            "city": {"type": "string"},
            "state": {"type": "string"},
            "amount": {"type": "integer"},
        },
    }
    schema = {
        "type": "object",
        "properties": {
            "addrs": {"type": "array", "items": address_schema},
        },
    }
    # The breadcrumb steps through 'items' to reach the array elements.
    metadata = {
        ("properties", "addrs"): {"selected": True},
        ("properties", "addrs", "items", "properties", "amount"): {"selected": False},
    }
    record = {
        "addrs": [
            {"addr1": "address_1", "city": "city_1", "state": "state_1", "amount": "123"},
            {"addr1": "address_2", "city": "city_2", "state": "state_2", "amount": "456"},
        ],
    }
    expected = {
        "addrs": [
            {"addr1": "address_1", "city": "city_1", "state": "state_1"},
            {"addr1": "address_2", "city": "city_2", "state": "state_2"},
        ],
    }

    actual = transform(record, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata)

    self.assertDictEqual(expected, actual)

class TestResolveSchemaReferences(unittest.TestCase):
def test_internal_refs_resolve(self):
schema = {"type": "object",
Expand Down