diff --git a/api/pls_render_metadata/urls.py b/api/pls_render_metadata/urls.py
new file mode 100644
index 000000000..6d0e6fccc
--- /dev/null
+++ b/api/pls_render_metadata/urls.py
@@ -0,0 +1,8 @@
+from django.urls import path
+
+from . import views
+
+
+urlpatterns = [
+    path('pls-render-metadata', views.pls_render_metadata),
+]
diff --git a/api/pls_render_metadata/views.py b/api/pls_render_metadata/views.py
new file mode 100644
index 000000000..3d7a583b2
--- /dev/null
+++ b/api/pls_render_metadata/views.py
@@ -0,0 +1,49 @@
+"""pls-render-metadata: an amnesiac metadata renderer
+
+a django view that takes a POST request with a raw metadata record in the request body
+
+sidenote: using the pls ("please") prefix to mean we (the server) are:
+  - acting at the explicit request of someone else
+  - aiming to help
+  - behaving politely/discreetly (will forget all data/metadata)
+
+"""
+
+import json
+
+from django.http import HttpResponse
+
+from share.util.extensions import Extensions
+from share.regulate import Regulator
+
+
+def pls_render_metadata(request):
+    if request.method != 'POST':
+        return HttpResponse(
+            status=405,
+            headers={'Allow': 'POST'},
+            content='only POST!'
+        )
+
+    transformer_key = request.GET.get('transformer', 'v2_push')
+    normal_graph = pls_normalize(request.body, transformer_key)
+    requested_formats = request.GET.getlist('formats')
+    formatted_records = [
+        pls_render(normal_graph, format_key)
+        for format_key in requested_formats
+    ]
+    return HttpResponse(
+        content=json.dumps(formatted_records),
+    )
+
+
+def pls_normalize(raw_datum, transformer_key):
+    transformer = Extensions.get('share.transformers', transformer_key)()
+    graph = transformer.transform(raw_datum)
+    Regulator().regulate(graph)  # in-place
+    return graph
+
+
+def pls_render(graph, format_key):
+    formatter = Extensions.get('share.metadata_formats', format_key)()
+    return formatter.format_from_graph(graph)
diff --git a/api/urls.py b/api/urls.py
index bc41ad8b9..d40dc6806 100644
--- a/api/urls.py
+++ b/api/urls.py
@@ -19,6 +19,8 @@
     url('^', include('api.suids.urls')),
     url('^', include('api.users.urls')),
 
+    url('^', include('api.pls_render_metadata.urls')),
+
     url('^schemas?/', include('api.schemas.urls'), name='schema'),
     url('^search/', include('api.search.urls'), name='search'),
 
diff --git a/share/ingest/change_builder.py b/share/ingest/change_builder.py
deleted file mode 100644
index faacd6141..000000000
--- a/share/ingest/change_builder.py
+++ /dev/null
@@ -1,267 +0,0 @@
-import logging
-import pendulum
-import datetime
-
-from django.apps import apps
-from django.conf import settings
-from django.contrib.contenttypes.models import ContentType
-
-from share.disambiguation.matcher import Matcher
-from share.disambiguation.strategies import DatabaseStrategy
-from share.models import Change, ChangeSet
-from share.util import IDObfuscator
-
-
-logger = logging.getLogger(__name__)
-
-
-class ChangeSetBuilder:
-    def __init__(self, graph, normalized_datum, matches=None, disambiguate=False):
-        if matches and disambiguate:
-            raise ValueError('ChangeSetBuilder: Either provide matches or disambiguate=True, not both')
-
-        self.graph = graph
-        self.normalized_datum = normalized_datum
-        self.user = normalized_datum.source  # "source" here is a ShareUser
-        self.source = self.user.source
-        self.matches = matches or {}
-        self.disambiguate = disambiguate
-
-    def build_change_set(self):
-        if self.disambiguate:
-            self.matches = Matcher(DatabaseStrategy(self.source)).find_all_matches(self.graph)
-        change_builders = [ChangeBuilder(n, self.source, self.matches) for n in self.graph.topologically_sorted()]
-        if all(cb.diff is None for cb in change_builders):
-            logger.debug('No changes detected in {!r}, skipping.'.format(self.graph))
-            return None
-
-        change_set = ChangeSet(normalized_data_id=self.normalized_datum.id)
-        change_set.save()
-
-        Change.objects.bulk_create(
-            filter(None, [
-                cb.build_change(change_set, save=False)
-                for cb in change_builders
-            ])
-        )
-
-        return change_set
-
-
-class ChangeBuilder:
-    def __init__(self, node, source=None, matches=None):
-        self.node = node
-        self.source = source
-        self.matches = matches or {}
-
-        self.instance = self._get_match(node)
-        self.diff = self.get_diff() if not self.should_skip() else None
-
-    def build_change(self, change_set, save=True):
-        if self.diff is None:
-            logger.debug('No changes detected in {!r}, skipping.'.format(self.node))
-            return None
-
-        model = apps.get_model('share', self.node.type)
-
-        attrs = {
-            'node_id': self.node.id,
-            'change': self.diff,
-            'change_set': change_set,
-            'model_type': ContentType.objects.get_for_model(model, for_concrete_model=False),
-            'target_type': ContentType.objects.get_for_model(model, for_concrete_model=True),
-            'target_version_type': ContentType.objects.get_for_model(model.VersionModel, for_concrete_model=True),
-        }
-
-        if not self.instance:
-            attrs['type'] = Change.TYPE.create
-        else:
-            attrs['type'] = Change.TYPE.update
-            attrs['target_id'] = self.instance.pk
-            attrs['target_version_id'] = self.instance.version_id
-
-        change = Change(**attrs)
-
-        if save:
-            change.save()
-
-        return change
-
-    def should_skip(self):
-        model = apps.get_model('share', self.node.type)
-        if not hasattr(model, 'VersionModel'):
-            # Non-ShareObjects (e.g. SubjectTaxonomy) cannot be changed.
-            # Shouldn't reach this point...
-            logger.warning('Change node {!r} targets immutable model {}, skipping.'.format(self.node, model))
-            return True
-
-        if self.instance:
-            if (self.node.type == 'subject'
-                    and self.instance.central_synonym is None
-                    and (not self.source or self.source.user.username != settings.APPLICATION_USERNAME)):
-                return True
-
-        return False
-
-    def get_diff(self):
-        attrs = self.node.attrs()
-        relations = self.node.relations(in_edges=False)
-
-        new_extra = attrs.pop('extra', None)
-        if new_extra and self.source and self.source.user:
-            # Only save "extra" data that has a namespace
-            extra_namespace = self.source.user.username
-
-            old_extra = getattr(self.instance, 'extra', None)
-            if old_extra:
-                old_extra = old_extra.data.get(extra_namespace, {})
-
-                # NOTE extra changes are only diffed at the top level
-                extra_diff = {
-                    k: v
-                    for k, v in new_extra.items()
-                    if k not in old_extra or old_extra[k] != v
-                }
-                if extra_diff:
-                    attrs['extra'] = extra_diff
-            else:
-                attrs['extra'] = new_extra
-
-        if self.instance is None:
-            return self._diff_for_create(attrs, relations)
-        return self._diff_for_update(attrs, relations)
-
-    def _diff_for_create(self, attrs, relations):
-        return {
-            **attrs,
-            **self._relations_to_jsonld(relations),
-        }
-
-    def _diff_for_update(self, attrs, relations):
-        if not self.instance:
-            raise ValueError('ChangeBuilder: Do not call _diff_for_update without self.instance set')
-
-        attrs_diff, relations_diff = {}, {}
-
-        ignore_attrs = self._get_ignore_attrs(attrs)
-
-        new_model = apps.get_model('share', self.node.type)
-        old_model = type(self.instance)
-        is_subject = new_model is apps.get_model('share', 'subject')
-
-        if '@type' not in ignore_attrs and old_model is not new_model:
-            if (
-                len(new_model.__mro__) >= len(old_model.__mro__)
-
-                # Special case to allow creators to be downgraded to contributors
-                # This allows OSF users to mark project contributors as bibiliographic or non-bibiliographic
-                # and have that be reflected in SHARE
-                or issubclass(new_model, apps.get_model('share', 'contributor'))
-            ):
-                attrs_diff['@type'] = new_model._meta.label_lower
-
-        for k, v in attrs.items():
-            if k in ignore_attrs:
-                logger.debug('Ignoring potentially conflicting change to "%s"', k)
-                continue
-            old_value = getattr(self.instance, k)
-            if isinstance(old_value, datetime.datetime):
-                v = pendulum.parse(v)
-            if v != old_value:
-                attrs_diff[k] = v.isoformat() if isinstance(v, datetime.datetime) else v
-
-        # TODO Add relationships in for non-subjects. Somehow got omitted first time around
-        if is_subject:
-            for k, v in relations.items():
-                old_value = getattr(self.instance, k)
-                if old_value != self._get_match(v):
-                    relations_diff[k] = v
-
-        diff = {
-            **attrs_diff,
-            **self._relations_to_jsonld(relations_diff),
-        }
-        # If there's nothing to update, return None instead of an empty diff
-        if not diff:
-            new_source = (
-                self.source
-                and not is_subject
-                and hasattr(self.instance, 'sources')
-                and not self.instance.sources.filter(source=self.source).exists()
-            )
-
-            if not new_source:
-                return None
-        return diff
-
-    def _get_match(self, node):
-        return self.matches.get(node) or self.matches.get(node.id)
-
-    def _relations_to_jsonld(self, relations):
-        def refs(n):
-            if isinstance(n, list):
-                return [refs(node) for node in n]
-            instance = self._get_match(n)
-            return {
-                '@id': IDObfuscator.encode(instance) if instance else n.id,
-                '@type': n.type,
-            }
-        return {
-            k: refs(v)
-            for k, v in relations.items()
-        }
-
-    def _get_ignore_attrs(self, attrs):
-        ignore_attrs = set()
-
-        model = apps.get_model('share', self.node.type)
-        if not issubclass(model, apps.get_model('share', 'creativework')):
-            # Only work records get special treatment at the moment
-            return ignore_attrs
-
-        # Hacky fix for SHARE-604
-        # If the given date_updated is older than the current one, don't accept any changes that would overwrite newer changes
-        if 'date_updated' in attrs and self.instance.date_updated:
-            date_updated = pendulum.parse(attrs['date_updated'])
-            if date_updated < self.instance.date_updated:
-                logger.warning('%s appears to be from the past, change date_updated (%s) is older than the current (%s). Ignoring conflicting changes.', self.node, attrs['date_updated'], self.instance.date_updated)
-                # Just in case
-                ignore_attrs.update(self.instance.change.change.keys())
-
-                # Go back until we find a change that is older than us
-                for version in self.instance.versions.select_related('change').iterator():
-                    if not version.date_updated or date_updated > version.date_updated:
-                        break
-                    ignore_attrs.update(version.change.change.keys())
-
-        # If we get changes from a source that hasn't been marked as canonical
-        # don't allow attributes set by canonical sources to be changed.
-        # Stops aggregators from overwriting the most correct information
-        # IE CrossRef sometimes turns preprints into articles/publications
-        # TODO Write a test case for subjects
-        if self.source and not self.source.canonical and hasattr(self.instance, 'sources'):
-            # Only fetch 15. If there are more than 15, it's probably a bug. Even if it is not, the past 15 changes should be enough...
-            prev_changes = list(
-                self.instance.changes.filter(
-                    change_set__normalized_data__source__source__canonical=True
-                ).values_list(
-                    'change',
-                    flat=True
-                )[:15]
-            )
-            canonical_keys = set(key for change in prev_changes for key in change.keys())
-            if prev_changes and set(attrs.keys()) & canonical_keys:
-                canonical_sources = list(
-                    self.instance.sources.filter(
-                        source__canonical=True
-                    ).values_list('username', flat=True)
-                )
-                logger.warning('Recieved changes from a non-canonical source %s that conflict with one of %s. Ignoring conflicting changes', self.source, canonical_sources)
-                ignore_attrs.update(canonical_keys)
-
-            # Appears that type doesn't get added to changes or at least the first change
-            # Safe to assume the type was set by the canonical source
-            ignore_attrs.add('@type')
-
-        return ignore_attrs
diff --git a/share/metadata_formats/base.py b/share/metadata_formats/base.py
index 0714f46ac..0a3050372 100644
--- a/share/metadata_formats/base.py
+++ b/share/metadata_formats/base.py
@@ -3,11 +3,18 @@
 
 from share.models.core import NormalizedData
 from share.models.ingest import SourceUniqueIdentifier
+from share.util.graph import MutableGraph
 
 
 class MetadataFormatter(ABC):
-    @abstractmethod
     def format(self, normalized_data: NormalizedData) -> Optional[str]:
+        """return a string representation of the given metadata in the formatter's format
+        """
+        mgraph = MutableGraph.from_jsonld(normalized_data.data)
+        return self.format_from_graph(mgraph)
+
+    @abstractmethod
+    def format_from_graph(self, mgraph: MutableGraph) -> Optional[str]:
         """return a string representation of the given metadata in the formatter's format
         """
         raise NotImplementedError
diff --git a/share/metadata_formats/oai_dc.py b/share/metadata_formats/oai_dc.py
index a2edf71d3..891943e83 100644
--- a/share/metadata_formats/oai_dc.py
+++ b/share/metadata_formats/oai_dc.py
@@ -16,6 +16,9 @@ class OaiDcFormatter(MetadataFormatter):
 
     def format(self, normalized_datum):
         mgraph = MutableGraph.from_jsonld(normalized_datum.data)
+        return self.format_from_graph(mgraph)
+
+    def format_from_graph(self, mgraph):
         central_work = mgraph.get_central_node(guess=True)
 
         if (
diff --git a/share/metadata_formats/sharev2_elastic.py b/share/metadata_formats/sharev2_elastic.py
index 030ab611b..4c98f2476 100644
--- a/share/metadata_formats/sharev2_elastic.py
+++ b/share/metadata_formats/sharev2_elastic.py
@@ -58,6 +58,10 @@ def format_as_deleted(self, suid):
             'is_deleted': True,
         })
 
+    def format_from_graph(self, mgraph):
+        # HACK because sharev2_elastic is dumb
+        raise NotImplementedError('sharev2_elastic formatter depends on a NormalizedData')
+
     def format(self, normalized_datum):
         mgraph = MutableGraph.from_jsonld(normalized_datum.data)
         central_work = mgraph.get_central_node(guess=True)
diff --git a/share/transform/base.py b/share/transform/base.py
index b8bf94888..ccbd5da61 100644
--- a/share/transform/base.py
+++ b/share/transform/base.py
@@ -6,7 +6,7 @@
 
 class BaseTransformer(metaclass=abc.ABCMeta):
 
-    def __init__(self, source_config):
+    def __init__(self, source_config=None):
         self.config = source_config
 
     @abc.abstractmethod
diff --git a/share/transform/chain/links.py b/share/transform/chain/links.py
index 040f8d4f1..4a6bf3627 100644
--- a/share/transform/chain/links.py
+++ b/share/transform/chain/links.py
@@ -1029,7 +1029,7 @@ def execute(self, obj):
                 break
 
         if not final[0]:
-            if self._urn_fallback:
+            if self._urn_fallback and getattr(Context(), '_config', None):
                 urn = self.FALLBACK_FORMAT.format(source=Context()._config.label, id=urllib.parse.quote(obj))
                 return URNLink().execute(urn)
             else:
diff --git a/share/transformers/oai.py b/share/transformers/oai.py
index d8a47db0f..3bfd213c3 100644
--- a/share/transformers/oai.py
+++ b/share/transformers/oai.py
@@ -310,7 +310,6 @@ class RootParser(OAICreativeWork):
             type_map = root_type_map
 
         if property_list:
-            logger.debug('Attaching addition properties %s to transformer for %s', property_list, self.config.label)
             for prop in property_list:
                 if prop in RootParser._extra:
                     logger.warning('Skipping property %s, it already exists', prop)
diff --git a/tests/api/test_pls_render_metadata.py b/tests/api/test_pls_render_metadata.py
new file mode 100644
index 000000000..6a363295f
--- /dev/null
+++ b/tests/api/test_pls_render_metadata.py
@@ -0,0 +1,26 @@
+import json
+
+from tests.share.metadata_formats.base import FORMATTER_TEST_INPUTS
+from tests.share.metadata_formats.test_oai_dc_formatter import TestOaiDcFormatter as oaidc_test_cases
+
+
+class TestPlsRenderMetadata:
+    def _get_test_keys(self):
+        return FORMATTER_TEST_INPUTS.keys()
+
+    def _get_input(self, test_key):
+        return FORMATTER_TEST_INPUTS[test_key]['normalized_datum_kwargs']['data']
+
+    def _get_expected_output(self, test_key):
+        return oaidc_test_cases.expected_outputs[test_key]
+
+    def test_works(self, client):
+        for test_key in self._get_test_keys():
+            # ask for the oai_dc rendering to compare against the oai_dc formatter's expected output
+            resp = client.post(
+                '/api/v2/pls-render-metadata?formats=oai_dc',
+                json.dumps(self._get_input(test_key)),
+                content_type='application/json',
+            )
+            assert resp.status_code == 200, f'unexpected status for {test_key}: {resp.status_code}'
+            assert resp.json() == [self._get_expected_output(test_key)], f'unexpected rendering for {test_key}'
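
Usage sketch for the endpoint added above. The base URL, the requests dependency, and the input filename are assumptions made for illustration; the path and query parameters come from the view and test in this patch.

    import json

    import requests  # any HTTP client would do; requests is just for illustration

    BASE_URL = 'http://localhost:8000'  # hypothetical local deployment

    # a raw metadata record the chosen transformer understands; the filename is made up
    with open('raw-datum.json') as f:
        raw_datum = f.read()

    resp = requests.post(
        f'{BASE_URL}/api/v2/pls-render-metadata',
        params={
            'transformer': 'v2_push',  # optional; 'v2_push' is the default
            'formats': ['oai_dc'],     # repeatable; one rendered record per requested format
        },
        data=raw_datum,
        headers={'Content-Type': 'application/json'},
    )
    resp.raise_for_status()

    # the view responds with a JSON list of rendered records, one per requested format
    for record in json.loads(resp.text):
        print(record)

Anything other than POST gets a 405 with an Allow: POST header, and nothing is persisted server-side (hence "amnesiac").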
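
Relatedly, a sketch of what the new format_from_graph hook asks of MetadataFormatter subclasses. The class below and its "title only" output are invented for illustration; it leans on the base class's format() to handle the jsonld-to-MutableGraph conversion.

    from share.metadata_formats.base import MetadataFormatter


    class TitleOnlyFormatter(MetadataFormatter):
        """toy formatter: render just the central work's title (or nothing)"""

        def format_from_graph(self, mgraph):
            central_work = mgraph.get_central_node(guess=True)
            if central_work is None:
                return None
            # MutableGraph nodes support dict-style attribute access
            return central_work['title']

Registered under the 'share.metadata_formats' entry point (say, as 'title_only'), it would be reachable through the endpoint above via ?formats=title_only.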