diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f6c6da9 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/sandbox diff --git a/TODO.md b/TODO.md index bf4997f..d3b3b4f 100644 --- a/TODO.md +++ b/TODO.md @@ -1,41 +1,5 @@ - -- every json file has the same structure, there top-level keys signify what - type of data it is, e.g. all raw data would be under a "raw_schema_read" field. - This allows us to merge a directory of cached data, and so that various - tools can have json files just about them on the SGSCHEMA_PATH, e.g.: - - sgevents.json: - - { - 'entity_aliases': { - 'sgevents:EventReceipt': 'CustomNonProjectEntity01' - }, - 'field_aliases': { - 'CustomNonProjectEntity01': { - 'type': 'sg_type' - } - }, - 'field_tags': { - 'PublishEvent': { - 'sg_type': ['sgcache:include'] - } - } - } - - { - 'PublishEvent': { - 'aliases': ['sgpublish:Publish', 'Publish'], - 'fields': { - 'sg_type': { - 'aliases': ['sgpublish:type', 'type'], - 'data_type': 'text', - 'name': 'Type', - 'tags': ['sgcache:include'], - } - } - } - } +- test the loading and resolution of aliases and tags - caches of the raw schema; both public ones and the private one - cache of the reduced schema @@ -60,7 +24,7 @@ e.g.: EntityType.[sgcache.include==true] PublishEvent.[sgpublish.is_core] - Tags: PublishEvent.$sgpublish:core -> {sg_code,sg_type,...} + Tags: PublishEvent.#sgsession:core -> {sg_code,sg_type,...} - Automatic sg_ prefix detection: Publish.type -> PublishEvent.sg_type @@ -72,11 +36,36 @@ This is more in SGSession (or other consumers) -- Are tags/alises forward or backward declared? - - schema.PublishEvent.aliases = ['Publish'] - vs - schema.entity_aliases['Publish'] = 'PublishEvent' +- Public API: + + schema.resolve_entity('$Publish') -> ['PublishEvent'] + schema.resolve_field('PublishEvent', 'type') -> ['sg_type'] + +- Create a standard-ish set of tags and aliases: + $parent pointing to typical parent + +- Include backrefs in reduced schema? 
Known as "inverse_association" in private + schema. + + + + + + + + + + + + + + + + + + + + + + - schema.PublishEvent.sg_type.aliases = ['type'] - schema.field_aliases['PublishEvent']['type'] = 'sg_type' diff --git a/sgschema/entity.py b/sgschema/entity.py new file mode 100644 index 0000000..4690e57 --- /dev/null +++ b/sgschema/entity.py @@ -0,0 +1,79 @@ +from .field import Field +from .utils import cached_property + + +class Entity(object): + + def __init__(self, schema, name): + + self.schema = schema + self.name = name + + self.fields = {} + + self._aliases = set() + self._tags = set() + + self._field_aliases = {} + self._field_tags = {} + + @cached_property + def field_aliases(self): + field_aliases = dict(self._field_aliases) + for field in self.fields.itervalues(): + for alias in field._aliases: + field_aliases[alias] = field.name + return field_aliases + + @cached_property + def field_tags(self): + field_tags = {k: set(v) for k, v in self._field_tags.iteritems()} + for field in self.fields.itervalues(): + for tag in field._tags: + field_tags.setdefault(tag, set()).add(field.name) + return field_tags + + @cached_property + def aliases(self): + aliases = set(self._aliases) + for k, v in self.schema._entity_aliases.iteritems(): + if v == self.name: + aliases.add(k) + return aliases + + @cached_property + def tags(self): + tags = set(self._tags) + for k, v in self.schema._entity_tags.iteritems(): + if self.name in v: + tags.add(k) + return tags + + def _get_or_make_field(self, name): + try: + return self.fields[name] + except KeyError: + return self.fields.setdefault(name, Field(self, name)) + + def _reduce_raw(self, schema, raw_entity): + pass + + def _load(self, raw): + for name, value in raw.pop('fields', {}).iteritems(): + self._get_or_make_field(name)._load(value) + + self._field_aliases.update(raw.pop('field_aliases', {})) + self._field_tags.update(raw.pop('field_tags', {})) + + self._aliases.update(raw.pop('aliases', ())) + 
self._tags.update(raw.pop('tags', ())) + + if raw: + raise ValueError('unknown entity keys: %s' % ', '.join(sorted(raw))) + + def _dump(self): + return {k: v for k, v in ( + ('fields', {field.name: field._dump() for field in self.fields.itervalues()}), + ('tags', sorted(self.tags)), + ('aliases', sorted(self.aliases)), + ) if v} diff --git a/sgschema/field.py b/sgschema/field.py new file mode 100644 index 0000000..e5a3760 --- /dev/null +++ b/sgschema/field.py @@ -0,0 +1,60 @@ +from .utils import cached_property + +class Field(dict): + + def __init__(self, entity, name): + + self.entity = entity + self.name = name + + self.allowed_entity_types = set() + self.data_type = None + + self._aliases = set() + self._tags = set() + + @cached_property + def aliases(self): + aliases = set(self._aliases) + for k, v in self.entity._field_aliases.iteritems(): + if v == self.name: + aliases.add(k) + return aliases + + @cached_property + def tags(self): + tags = set(self._tags) + for k, v in self.entity._field_tags.iteritems(): + if self.name in v: + tags.add(k) + return tags + + def _reduce_raw(self, schema, raw_field): + + self.data_type = raw_field['data_type']['value'] + + raw_private = schema._raw_private['entity_fields'][self.entity.name].get(self.name, {}) + + if raw_private.get('identifier_column'): + self._tags.add('identifier_column') + + if self.data_type in ('entity', 'multi_entity'): + types_ = raw_private['allowed_entity_types'] or [] + self.allowed_entity_types = set(types_[:]) + + def _load(self, raw): + self.allowed_entity_types.update(raw.pop('allowed_entity_types', ())) + self.data_type = raw.pop('data_type', self.data_type) + self._aliases.update(raw.pop('aliases', ())) + self._tags.update(raw.pop('tags', ())) + if raw: + raise ValueError('unknown field tags: %s' % ', '.join(sorted(raw))) + + def _dump(self): + return {k: v for k, v in ( + ('aliases', sorted(self.aliases)), + ('allowed_entity_types', sorted(self.allowed_entity_types)), + ('data_type', 
self.data_type), + ('tags', sorted(self.tags)), + ) if v} + diff --git a/sgschema/schema.py b/sgschema/schema.py index 7a0c34d..692ba15 100644 --- a/sgschema/schema.py +++ b/sgschema/schema.py @@ -1,3 +1,4 @@ +import ast import json import os import re @@ -5,6 +6,9 @@ import requests import yaml +from .entity import Entity +from .field import Field +from .utils import cached_property class Schema(object): @@ -16,11 +20,31 @@ def __init__(self): self._raw_private = None self.entities = {} - self.fields = {} - self.entity_aliases = {} - self.field_aliases = {} - self.field_tags = {} - + + self._entity_aliases = {} + self._entity_tags = {} + + @cached_property + def entity_aliases(self): + entity_aliases = dict(self._entity_aliases) + for entity in self.entities.itervalues(): + for alias in entity._aliases: + entity_aliases[alias] = entity.name + return entity_aliases + + @cached_property + def entity_tags(self): + entity_tags = {k: set(v) for k, v in self._entity_tags.iteritems()} + for entity in self.entities.itervalues(): + for tag in entity._tags: + entity_tags.setdefault(tag, set()).add(entity.name) + return entity_tags + + def _get_or_make_entity(self, name): + try: + return self.entities[name] + except KeyError: + return self.entities.setdefault(name, Entity(self, name)) def read(self, sg): @@ -47,34 +71,17 @@ def read(self, sg): self._reduce_raw() def _reduce_raw(self): - + for type_name, raw_entity in self._raw_entities.iteritems(): - - self.entities[type_name] = entity = {} - for name in ('name', ): - entity[name] = raw_entity[name]['value'] + entity = self._get_or_make_entity(type_name) + entity._reduce_raw(self, raw_entity) for type_name, raw_fields in self._raw_fields.iteritems(): - - raw_fields = self._raw_fields[type_name] - self.fields[type_name] = fields = {} - + entity = self._get_or_make_entity(type_name) for field_name, raw_field in raw_fields.iteritems(): + field = entity._get_or_make_field(field_name) + field._reduce_raw(self, raw_field) - 
fields[field_name] = field = {} - - for key in 'name', 'data_type': - field[key] = raw_field[key]['value'] - - raw_private = self._raw_private['entity_fields'][type_name].get(field_name, {}) - - if raw_private.get('identifier_column'): - field['identifier_column'] = True - self.identifier_columns[type_name] = field_name - - if field['data_type'] in ('entity', 'multi_entity'): - types_ = raw_private['allowed_entity_types'] or [] - field['allowed_entity_types'] = types_[:] def _dump_prep(self, value): if isinstance(value, unicode): @@ -86,36 +93,74 @@ def _dump_prep(self, value): else: return value - def dump(self, dir_path): - for name in 'fields', 'entities', 'private': - value = getattr(self, '_raw_' + name) - if value: - with open(os.path.join(dir_path, 'raw_%s.json' % name), 'w') as fh: - fh.write(json.dumps(value, indent=4, sort_keys=True)) - for name in ('fields',): - value = getattr(self, name) - if value: - with open(os.path.join(dir_path, name + '.json'), 'w') as fh: - fh.write(json.dumps(self._dump_prep(value), indent=4, sort_keys=True)) - - def load(self, dir_path, raw=False): - - if not raw: - for name in ('fields', 'entities'): - path = os.path.join(dir_path, name + '.json') - if os.path.exists(path): - with open(path) as fh: - setattr(self, name, json.load(fh)) - if self.fields: - self._build_associations() + def dump(self, path, raw=False): + if raw: + with open(path, 'w') as fh: + fh.write(json.dumps({ + 'raw_fields': self._raw_fields, + 'raw_entities': self._raw_entities, + 'raw_private': self._raw_private, + }, indent=4, sort_keys=True)) + else: + data = {entity.name: entity._dump() for entity in self.entities.itervalues()} + with open(path, 'w') as fh: + fh.write(json.dumps(data, indent=4, sort_keys=True)) - if raw or not self.fields: - for name in 'fields', 'entities', 'private': - path = os.path.join(dir_path, 'raw_%s.json' % name) - if os.path.exists(path): - with open(path) as fh: - setattr(self, '_raw_' + name, json.load(fh)) - 
self._reduce_raw() + def load_directory(self, dir_path): + for file_name in os.listdir(dir_path): + if file_name.startswith('.') or not file_name.endswith('.json'): + continue + self.load(os.path.join(dir_path, file_name)) + + def load_raw(self, path): + + raw = json.loads(open(path).read()) + keys = 'raw_entities', 'raw_fields', 'raw_private' + + # Make sure we have the right keys, and only the right keys. + missing = [k for k in keys if k not in raw] + if missing: + raise ValueError('missing keys in raw schema: %s' % ', '.join(missing)) + if len(raw) != len(keys): + extra = [k for k in raw if k not in keys] + raise ValueError('extra keys in raw schema: %s' % ', '.join(extra)) + + for k in keys: + setattr(self, '_' + k, raw[k]) + + self._reduce_raw() + + def load(self, path): + + encoded = open(path).read() + raw = json.loads(encoded) + #raw = ast.literal_eval(encoded) + + # If it is a dictionary of entity types, pretend it is in an "entities" key. + title_cased = sum(int(k[:1].isupper()) for k in raw) + if title_cased: + if len(raw) != title_cased: + raise ValueError('mix of direct and indirect entity specifications') + raw = {'entities': raw} + + # Load the two direct fields. + for type_name, value in raw.pop('entities', {}).iteritems(): + self._get_or_make_entity(type_name)._load(value) + self._entity_aliases.update(raw.pop('entity_aliases', {})) + + # Load any indirect fields. 
+ for key, values in raw.iteritems(): + if key.startswith('entity_'): + entity_attr = key[7:] + for type_name, value in values.iteritems(): + self.entities[type_name]._load({entity_attr: value}) + elif key.startswith('field_'): + field_attr = key[6:] + for type_name, fields in values.iteritems(): + for field_name, value in fields.iteritems(): + self.entities[type_name].fields[field_name]._load({field_attr: value}) + else: + raise ValueError('unknown complex field %s' % key) @@ -130,12 +175,23 @@ def load(self, dir_path, raw=False): if False: schema.read(sg) + schema.dump('sandbox/raw.json', raw=True) else: - schema.load('sandbox', raw=True) + schema.load_raw('sandbox/raw.json') + + schema.entities['PublishEvent'].aliases.add('Publish') + schema.entities['PublishEvent'].aliases.add('sgpublish:Publish') + schema.entities['PublishEvent'].fields['sg_type'].aliases.add('type') + schema.entities['PublishEvent'].fields['sg_type'].tags.add('sgcache:include') - schema.dump('sandbox') + schema.dump('sandbox/reduced.json') t = time.time() - schema.load('sandbox') + schema = Schema() + schema.load('sandbox/reduced.json') print 1000 * (time.time() - t) + print schema.entity_aliases['Publish'] + print schema.entities['PublishEvent'].field_aliases['type'] + print schema.entities['PublishEvent'].field_tags['identifier_column'] + \ No newline at end of file diff --git a/sgschema/utils.py b/sgschema/utils.py new file mode 100644 index 0000000..f3b942d --- /dev/null +++ b/sgschema/utils.py @@ -0,0 +1,18 @@ + + +class cached_property(object): + + def __init__(self, func, name=None, doc=None): + self.__name__ = name or func.__name__ + self.__module__ = func.__module__ + self.__doc__ = doc or func.__doc__ + self.func = func + + def __get__(self, obj, type=None): + if obj is None: + return self + try: + return obj.__dict__[self.__name__] + except KeyError: + obj.__dict__[self.__name__] = value = self.func(obj) + return value