From 47279682a6618fd4c7800667c6cee82939c15599 Mon Sep 17 00:00:00 2001 From: huberrob Date: Thu, 24 Oct 2024 12:01:00 +0200 Subject: [PATCH] spatial coverage metadata parsing for ISO19XXX, EML and DataCite see #537 #538 --- fuji_server/helper/metadata_collector_xml.py | 22 +++++++- fuji_server/helper/metadata_mapper.py | 55 +++++++++++++------- 2 files changed, 57 insertions(+), 20 deletions(-) diff --git a/fuji_server/helper/metadata_collector_xml.py b/fuji_server/helper/metadata_collector_xml.py index 5aeba72a..092d303e 100644 --- a/fuji_server/helper/metadata_collector_xml.py +++ b/fuji_server/helper/metadata_collector_xml.py @@ -312,7 +312,6 @@ def get_mapped_xml_metadata(self, tree, mapping): res = dict() # make sure related_resources are not listed in the mapping dict instead related_resource_Reltype has to be used res["related_resources"] = [] - for prop in mapping: res[prop] = [] if isinstance(mapping.get(prop).get("path"), list): @@ -402,4 +401,25 @@ def get_mapped_xml_metadata(self, tree, mapping): res.pop("object_content_identifier_size", None) res.pop("object_content_identifier_url", None) res.pop("object_content_identifier_service", None) + if res.get("coverage_spatial_coordinates") or res.get("coverage_spatial_names"): + res["coverage_spatial"] = [] + if not isinstance(res["coverage_spatial_coordinates"], list): + res["coverage_spatial_coordinates"] = [res["coverage_spatial_coordinates"]] + ci = 0 + for spatial_info in res["coverage_spatial_coordinates"] or res.get("coverage_spatial_names"): + spatial_coordinates = None + spatial_name = None + if res.get("coverage_spatial_coordinates"): + if ci < len(res["coverage_spatial_coordinates"]): + spatial_coordinates = res["coverage_spatial_coordinates"][ci] + if res.get("coverage_spatial_name"): + if ci < len(res["coverage_spatial_name"]): + spatial_name = res["coverage_spatial_name"][ci] + res["coverage_spatial"].append( + {"coordinates": str(spatial_coordinates).split(" "), "name": spatial_name} + ) + ci += 1 + res.pop("coverage_spatial_coordinates", None) + res.pop("coverage_spatial_name", None) + return res diff --git a/fuji_server/helper/metadata_mapper.py b/fuji_server/helper/metadata_mapper.py index 831a247a..643b9221 100644 --- a/fuji_server/helper/metadata_mapper.py +++ b/fuji_server/helper/metadata_mapper.py @@ -262,7 +262,8 @@ def flip_dict(dict_to_flip): "submitted_date: dates[?dateType == 'Submitted'].date," "object_content_identifier: {url: contentUrl} , " "access_level: rightsList[*].rightsUri || rightsList[*].rights, " - "language: language }" + "language: language," + "coverage_spatial: geoLocations[*].{coordinates: geoLocationBox.*[] || geoLocationPoint.*[] || geoLocationPolygons[*].polygonPoints[].*[],name: geoLocationPlace }}" ) #'related_resources: relatedIdentifiers[*].[relatedIdentifier,relationType]}' @@ -378,6 +379,20 @@ def flip_dict(dict_to_flip): "license": {"path": ["./{*}rightsList/{*}rights", "./{*}rightsList/{*}rights@@rightsURI"]}, "access_level": {"path": ["./{*}rightsList/{*}rights", "./{*}rightsList/{*}rights@@rightsURI"]}, "language": {"path": "./{*}language"}, + "coverage_spatial_coordinates": { + "path": [ + "./{*}geoLocations/{*}geoLocation/{*}geoLocationPoint", + "./{*}geoLocations/{*}geoLocation/{*}geoLocationBox", + "./{*}geoLocations/{*}geoLocation/{*}geoLocationPolygon", + ] + }, + "coverage_spatial_name": { + "path": [ + "./{*}geoLocations/{*}geoLocationPlace", + "./{*}geoLocations/{*}geoLocationPlace", + "./{*}geoLocations/{*}geoLocationPlace", + ] + }, } XML_MAPPING_METS = { @@ -442,6 +457,10 @@ def flip_dict(dict_to_flip): "path": "./{*}dataset/{*}dataTable/{*}physical/{*}distribution/{*}online/{*}size" }, "language": {"path": "./{*}dataset/{*}language"}, + "coverage_spatial_coordinates": { + "path": "./{*}dataset/{*}coverage/{*}geographicCoverage/{*}boundingCoordinates" + }, + "coverage_spatial_name": {"path": "./{*}dataset/{*}coverage/{*}geographicCoverage/{*}geographicDescription"}, } # CLARIN CMDI XML_MAPPING_CMD = { @@ -562,6 +581,9 @@ def flip_dict(dict_to_flip): "object_content_identifier_type": {"path": ".//{*}fileDscr/{*}fileTxt/{*}fileType"}, "measured_variable": {"path": "./{*}dataDscr/{*}var@@name"}, "language": {"path": ["./{*}codeBook@@lang", "./{*}stdyDscr/{*}citation/{*}titlStmt/{*}titl@@xml:lang"]}, + # https://ddialliance.org/Specification/DDI-Codebook/2.1/DTD/Documentation/version2-1-all.html#2.0 + # spatial_coverage_name: geogCover + # spatial_coverage_coordinates:geoBndBox } XML_MAPPING_DIF = { "object_identifier": {"path": "./{*}Dataset_Citation/{*}Persistent_Identifier"}, @@ -666,24 +688,6 @@ def flip_dict(dict_to_flip): }, ], }, - """ - "object_content_identifier_url": { - "path": [ - "./{*}distributionInfo/{*}MD_Distribution//{*}CI_OnlineResource/{*}linkage/{*}URL", - #"./{*}distributionInfo/{*}MD_Distribution//{*}CI_OnlineResource[{*}protocol]/{*}linkage/{*}URL", - "./{*}distributionInfo/{*}MD_Distribution/{*}transferOptions/{*}MD_DigitalTransferOptions/{*}onLine/{*}CI_OnlineResource/{*}linkage/{*}URL" - ] - }, - "object_content_identifier_type": { - "path": [ - "./{*}distributionInfo/{*}MD_Distribution//{*}CI_OnlineResource/{*}applicationProfile/{*}Anchor", - "./{*}distributionInfo/{*}MD_Distribution/{*}transferOptions/{*}MD_DigitalTransferOptions/{*}onLine/{*}CI_OnlineResource/{*}applicationProfile/{*}Anchor" - ] - }, - "object_content_identifier_service": { - "path": "./{*}distributionInfo/{*}MD_Distribution//{*}CI_OnlineResource/{*}protocol/{*}Anchor@@xlink:href" - }, - """ "measured_variable": { "path": [ "./{*}contentInfo/{*}MD_CoverageDescription/{*}attributeDescription/{*}RecordType", @@ -720,4 +724,17 @@ def flip_dict(dict_to_flip): ] }, "language": {"path": "./{*}language/{*}LanguageCode@@codeListValue"}, + "coverage_spatial_coordinates": { + "path": [ + "./{*}identificationInfo//{*}geographicElement/{*}EX_GeographicBoundingBox", + "./{*}identificationInfo//{*}geographicElement/{*}gmd:EX_BoundingPolygon", + ] + }, + "coverage_spatial_name": { + "path": [ + "./{*}identificationInfo//{*}geographicElement/{*}geographicIdentifier/{*}MD_Identifier/{*}code", + "./{*}identificationInfo//{*}geographicElement/{*}geographicIdentifier/{*}MD_Identifier/{*}code", + ] + }, + # "./{*}identificationInfo//{*}geographicElement//{*}posList"] }