Skip to content

Commit

Permalink
fix: oai-pmh feed
Browse files Browse the repository at this point in the history
- derive `oai_dc` xml from indexcard rdf
- add `notify_index` param to `task__derive` to allow deriving *without*
  hitting search indexes (default `True` to match current behavior)
- update `task__schedule_all_for_deriver` to pass `notify_index=False`
  to `task__derive` by default
  • Loading branch information
aaxelb committed Feb 9, 2024
1 parent c36225d commit 5f0dc3b
Show file tree
Hide file tree
Showing 7 changed files with 171 additions and 23 deletions.
3 changes: 1 addition & 2 deletions share/oaipmh/indexcard_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ class OaiPmhRepository:
# this dictionary's keys are `metadataPrefix` values
FORMATS = {
'oai_dc': {
'formatter_key': 'oai_dc',
'deriver_iri': str(OAI_DC),
'schema': 'http://www.openarchives.org/OAI/2.0/oai_dc.xsd',
'namespace': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
Expand Down Expand Up @@ -64,7 +63,7 @@ def validate_metadata_prefix(self, maybe_prefix):
):
self.errors.append(oai_errors.BadFormat(maybe_prefix))

def resolve_oai_identifier(self, identifier) -> trove_db.Indexcard:
def resolve_oai_identifier(self, identifier) -> trove_db.Indexcard | None:
splid = identifier.split(self.IDENTIFER_DELIMITER)
if len(splid) != 3 or splid[:2] != ['oai', self.REPOSITORY_IDENTIFIER]:
self.errors.append(oai_errors.BadRecordID(identifier))
Expand Down
12 changes: 11 additions & 1 deletion share/oaipmh/util.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
from dateutil import parser

from lxml import etree
from primitive_metadata import primitive_rdf

from trove.vocab.namespaces import OAI, OAI_DC


def format_datetime(dt):
    """Render a datetime-ish value in OAI-PMH's required UTC format.

    Accepts a `primitive_rdf.Literal`, an ISO-8601 string, or a datetime
    object; always returns a `YYYY-MM-DDTHH:MM:SSZ` string.
    """
    _when = dt
    if isinstance(_when, primitive_rdf.Literal):
        # unwrap the rdf literal to its underlying string value
        _when = _when.unicode_value
    if isinstance(_when, str):
        # parse ISO-8601 text into a datetime before formatting
        _when = parser.isoparse(_when)
    return _when.strftime('%Y-%m-%dT%H:%M:%SZ')
Expand All @@ -18,6 +22,7 @@ def format_datetime(dt):
'oai_dc': str(OAI_DC),
'oai-identifier': 'http://www.openarchives.org/OAI/2.0/oai-identifier',
'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
'xml': 'http://www.w3.org/XML/1998/namespace',
}


Expand Down Expand Up @@ -47,6 +52,11 @@ def nsmap(*namespace_prefixes, default=None):
# wrapper for lxml.etree.SubElement, adds `text` kwarg for convenience
def SubEl(parent, tag_name, text=None, **kwargs):
    """Create a child element of `parent`, optionally setting its text.

    `text` may be a plain string or a `primitive_rdf.Literal` -- for a
    literal, any language tag is rendered as an `xml:lang` attribute and
    the literal's unicode value becomes the element text.
    """
    element = etree.SubElement(parent, tag_name, **kwargs)
    if text:
        if isinstance(text, primitive_rdf.Literal):
            _language_tag = text.language
            if _language_tag:
                element.set(ns('xml', 'lang'), _language_tag)
            element.text = text.unicode_value
        else:  # plain truthy string (was a redundant `elif text:`)
            element.text = text
    return element
28 changes: 14 additions & 14 deletions tests/share/test_oaipmh_trove.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

# xpath namespace prefixes used throughout these tests
# (post-image of the scraped diff -- the stale 'ns0' pre-image line removed)
NAMESPACES = {
    'dc': 'http://purl.org/dc/elements/1.1/',
    'oai': 'http://www.openarchives.org/OAI/2.0/',
    'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
}

Expand All @@ -32,7 +32,7 @@ def oai_request(data, request_method, expect_errors=False):
raise NotImplementedError
assert response.status_code == 200
parsed = etree.fromstring(response.content, parser=etree.XMLParser(recover=True))
actual_errors = parsed.xpath('//ns0:error', namespaces=NAMESPACES)
actual_errors = parsed.xpath('//oai:error', namespaces=NAMESPACES)
if expect_errors:
assert actual_errors
return actual_errors
Expand All @@ -57,41 +57,41 @@ def request_method(self, request):

def test_identify(self, request_method):
    # Identify verb reports the repository name
    parsed = oai_request({'verb': 'Identify'}, request_method)
    assert parsed.xpath('//oai:Identify/oai:repositoryName', namespaces=NAMESPACES)[0].text == 'Share/trove'

def test_list_sets(self, request_method):
    # one OAI set per Source in the share db
    parsed = oai_request({'verb': 'ListSets'}, request_method)
    _num_sets = len(parsed.xpath('//oai:ListSets/oai:set', namespaces=NAMESPACES))
    assert _num_sets == share_db.Source.objects.all().count()

def test_list_formats(self, request_method):
    # only `oai_dc` is advertised as a metadata format
    parsed = oai_request({'verb': 'ListMetadataFormats'}, request_method)
    prefixes = parsed.xpath('//oai:ListMetadataFormats/oai:metadataFormat/oai:metadataPrefix', namespaces=NAMESPACES)
    assert len(prefixes) == 1
    assert prefixes[0].text == 'oai_dc'

def test_list_identifiers(self, request_method, oai_indexcard):
    # identifiers are built from the indexcard uuid
    parsed = oai_request({'verb': 'ListIdentifiers', 'metadataPrefix': 'oai_dc'}, request_method)
    identifiers = parsed.xpath('//oai:ListIdentifiers/oai:header/oai:identifier', namespaces=NAMESPACES)
    assert len(identifiers) == 1
    assert identifiers[0].text == 'oai:share.osf.io:{}'.format(oai_indexcard.upriver_indexcard.uuid)

def test_list_records(self, request_method, oai_indexcard, django_assert_num_queries):
    # the whole list should be served with a single query
    with django_assert_num_queries(1):
        parsed = oai_request({'verb': 'ListRecords', 'metadataPrefix': 'oai_dc'}, request_method)
    records = parsed.xpath('//oai:ListRecords/oai:record', namespaces=NAMESPACES)
    assert len(records) == 1
    assert len(records[0].xpath('oai:metadata/oai:foo', namespaces=NAMESPACES)) == 1
    record_id = 'oai:share.osf.io:{}'.format(oai_indexcard.upriver_indexcard.uuid)
    assert record_id == records[0].xpath('oai:header/oai:identifier', namespaces=NAMESPACES)[0].text

def test_get_record(self, request_method, oai_indexcard):
    # GetRecord round-trips the identifier built from the indexcard uuid
    ant_id = 'oai:share.osf.io:{}'.format(oai_indexcard.upriver_indexcard.uuid)
    parsed = oai_request({'verb': 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': ant_id}, request_method)
    records = parsed.xpath('//oai:GetRecord/oai:record', namespaces=NAMESPACES)
    assert len(records) == 1
    assert len(records[0].xpath('oai:metadata/oai:foo', namespaces=NAMESPACES)) == 1
    assert ant_id == records[0].xpath('oai:header/oai:identifier', namespaces=NAMESPACES)[0].text

@pytest.mark.parametrize('verb, params, errors', [
('GetRecord', {}, ['badArgument']),
Expand Down Expand Up @@ -235,10 +235,10 @@ def _assert_full_list(self, verb, params, request_method, expected_count, page_s
parsed = oai_request({'verb': verb, 'resumptionToken': token}, request_method)
else:
parsed = oai_request({'verb': verb, 'metadataPrefix': 'oai_dc', **params}, request_method)
page = parsed.xpath('//ns0:header/ns0:identifier', namespaces=NAMESPACES)
page = parsed.xpath('//oai:header/oai:identifier', namespaces=NAMESPACES)
pages += 1
count += len(page)
token = parsed.xpath('//ns0:resumptionToken', namespaces=NAMESPACES)
token = parsed.xpath('//oai:resumptionToken', namespaces=NAMESPACES)
assert len(token) == 1
token = token[0].text
if token:
Expand Down
5 changes: 3 additions & 2 deletions trove/derive/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
from . import (
sharev2_elastic,
osfmap_json,
oaidc_xml,
)


DERIVER_SET = (
sharev2_elastic.ShareV2ElasticDeriver,
# TODO:
osfmap_json.OsfmapJsonDeriver,
# oaidc_xml,
oaidc_xml.OaiDcXmlDeriver,
# TODO:
# datacite_xml, (from osf.metadata)
# datacite_json, (from osf.metadata)
# property_label?
Expand Down
4 changes: 4 additions & 0 deletions trove/derive/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ def __init__(self, upriver_rdf: IndexcardRdf):
self.focus_iri = upriver_rdf.focus_iri
self.data = primitive_rdf.RdfGraph(upriver_rdf.as_rdf_tripledict())

def q(self, pathset):
    """Convenience for querying `self.data` on `self.focus_iri`.

    Delegates to `RdfGraph.q` (see `self.data`, built from the upriver
    indexcard rdf) with this card's focus iri as the starting subject.
    """
    # convenience for querying self.data on self.focus_iri
    return self.data.q(self.focus_iri, pathset)

###
# for subclasses to implement:

Expand Down
133 changes: 133 additions & 0 deletions trove/derive/oaidc_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
from lxml import etree
from primitive_metadata import primitive_rdf as rdf

from share.oaipmh.util import format_datetime, ns, nsmap, SubEl, OAI_DC

from trove.vocab.namespaces import (
DCMITYPE,
DCTERMS,
FOAF,
OSFMAP,
RDF,
RDFS,
SHAREv2,
SKOS,
)

from ._base import IndexcardDeriver


# predicates whose object iris are rendered as `dc:relation` elements
# (see `OaiDcXmlDeriver._derive_card_as_xml`): general dcterms relations
# plus osfmap-specific resource/containment links
DC_RELATION_PREDICATES = {
    DCTERMS.hasPart,
    DCTERMS.hasVersion,
    DCTERMS.isPartOf,
    DCTERMS.isVersionOf,
    DCTERMS.references,
    OSFMAP.hasAnalyticCodeResource,
    OSFMAP.hasDataResource,
    OSFMAP.hasMaterialsResource,
    OSFMAP.hasPapersResource,
    OSFMAP.hasPreregisteredAnalysisPlan,
    OSFMAP.hasPreregisteredStudyDesign,
    OSFMAP.hasRoot,
    OSFMAP.hasSupplementalResource,
    OSFMAP.isContainedBy,
    OSFMAP.isSupplementedBy,
    OSFMAP.supplements,
}


class OaiDcXmlDeriver(IndexcardDeriver):
    """Derive an `oai_dc` xml metadata record from an indexcard's rdf."""

    # abstract method from IndexcardDeriver
    @staticmethod
    def deriver_iri() -> str:
        return str(OAI_DC)

    # abstract method from IndexcardDeriver
    def should_skip(self) -> bool:
        # only derive oai_dc for cards whose focus has a "work"-like type
        _allowed_focustype_iris = {
            SHAREv2.CreativeWork,
            OSFMAP.Project,
            OSFMAP.ProjectComponent,
            OSFMAP.Registration,
            OSFMAP.RegistrationComponent,
            OSFMAP.Preprint,
        }
        _focustype_iris = self.q(RDF.type)
        return _allowed_focustype_iris.isdisjoint(_focustype_iris)

    # abstract method from IndexcardDeriver
    def derive_card_as_text(self):
        """Serialize the derived `oai_dc:dc` element as a unicode string."""
        _dc_element = self._derive_card_as_xml()
        return etree.tostring(_dc_element, encoding='unicode')

    def _derive_card_as_xml(self) -> etree.Element:
        """Build an `oai_dc:dc` xml element from this card's rdf."""
        dc_element = etree.Element(
            ns('oai_dc', 'dc'),
            attrib={
                ns('xsi', 'schemaLocation'): f'{OAI_DC} http://www.openarchives.org/OAI/2.0/oai_dc.xsd'
            },
            nsmap=nsmap('oai_dc', 'dc', 'xsi'),
        )
        for _title in self.q(DCTERMS.title):
            SubEl(dc_element, ns('dc', 'title'), _title)

        for _creator_name in self.q({DCTERMS.creator: {FOAF.name}}):
            SubEl(dc_element, ns('dc', 'creator'), _creator_name)

        _subject_paths = [
            DCTERMS.subject,  # may use literal subject names
            {DCTERMS.subject: {RDFS.label, SKOS.prefLabel, SKOS.altLabel}},  # or labeled subjects
        ]
        for _subject in self.q(_subject_paths):
            if isinstance(_subject, rdf.Literal):
                SubEl(dc_element, ns('dc', 'subject'), _subject)

        for _description in self.q(DCTERMS.description):
            SubEl(dc_element, ns('dc', 'description'), _description)

        # NOTE(review): creator above uses pathset `{...: {FOAF.name}}` while
        # publisher/contributor use `{...: FOAF.name}` -- presumably both forms
        # are accepted by `primitive_rdf`; confirm and unify
        for _publisher_name in self.q({DCTERMS.publisher: FOAF.name}):
            SubEl(dc_element, ns('dc', 'publisher'), _publisher_name)

        for _contributor_name in self.q({DCTERMS.contributor: FOAF.name}):
            SubEl(dc_element, ns('dc', 'contributor'), _contributor_name)

        try:
            # take the first available date, in order of preference
            _date = next(self.q([
                DCTERMS.date,
                DCTERMS.datePublished,
                DCTERMS.modified,
                DCTERMS.created,
            ]))
        except StopIteration:  # no date
            pass
        else:
            SubEl(dc_element, ns('dc', 'date'), format_datetime(_date))

        for _type_iri in self.q(RDF.type):
            for _type_namespace in (OSFMAP, DCMITYPE, SHAREv2):
                if _type_iri in _type_namespace:
                    SubEl(
                        dc_element,
                        ns('dc', 'type'),
                        # fix: strip the namespace the iri was actually found in
                        # (was hard-coded OSFMAP, wrong for DCMITYPE/SHAREv2 types)
                        rdf.iri_minus_namespace(_type_iri, _type_namespace),
                    )

        for _identifier in self.q(DCTERMS.identifier):
            SubEl(dc_element, ns('dc', 'identifier'), _identifier)

        for _language in self.q(DCTERMS.language):
            SubEl(dc_element, ns('dc', 'language'), _language)

        for _related_iri in self.q(DC_RELATION_PREDICATES):
            SubEl(dc_element, ns('dc', 'relation'), _related_iri)

        for _rights in self.q(DCTERMS.rights):
            # rights may be a literal/string, or a reference with a title
            _value = (
                _rights
                if isinstance(_rights, (str, rdf.Literal))
                else next(self.q({DCTERMS.rights: DCTERMS.title}), None)
            )
            if _value:
                SubEl(dc_element, ns('dc', 'rights'), _value)

        return dc_element
9 changes: 5 additions & 4 deletions trove/digestive_tract.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,12 +208,13 @@ def task__extract_and_derive(task: celery.Task, raw_id: int, urgent=False):


@celery.shared_task(acks_late=True, bind=True)
def task__derive(task: celery.Task, indexcard_id: int, deriver_iri: str, notify_index=True):
    """Run the given deriver on the given indexcard.

    :param notify_index: when True (the default), notify search indexes of
        the indexcard update after deriving.
    """
    _indexcard = trove_db.Indexcard.objects.get(id=indexcard_id)
    derive(_indexcard, deriver_iris=[deriver_iri])
    if notify_index:
        # TODO: avoid unnecessary work; let IndexStrategy subscribe to a specific
        # IndexcardDeriver (perhaps by deriver-specific MessageType?)
        IndexMessenger(celery_app=task.app).notify_indexcard_update([_indexcard])


@celery.shared_task(acks_late=True)
Expand All @@ -231,15 +232,15 @@ def task__schedule_extract_and_derive_for_source_config(source_config_id: int):


@celery.shared_task(acks_late=True)
def task__schedule_all_for_deriver(deriver_iri: str, notify_index=False):
    """Schedule a `task__derive` for every indexcard with the given deriver.

    :param notify_index: when True, each derive task also notifies search
        indexes (off by default -- bulk re-derives need not hit the indexes).
    :raises DigestiveError: if `deriver_iri` matches no known deriver.
    """
    if not get_deriver_classes([deriver_iri]):
        raise DigestiveError(f'unknown deriver_iri: {deriver_iri}')
    _indexcard_id_qs = (
        trove_db.Indexcard.objects
        .values_list('id', flat=True)
    )
    for _indexcard_id in _indexcard_id_qs.iterator():
        task__derive.apply_async((_indexcard_id, deriver_iri, notify_index))


# TODO: remove legacy ingest
Expand Down

0 comments on commit 5f0dc3b

Please sign in to comment.