diff --git a/fuji_server/harvester/metadata_harvester.py b/fuji_server/harvester/metadata_harvester.py index 1f2aa22c..cc94c834 100644 --- a/fuji_server/harvester/metadata_harvester.py +++ b/fuji_server/harvester/metadata_harvester.py @@ -146,7 +146,7 @@ def merge_metadata(self, metadict, url, method, format, mimetype, schema="", nam namespaces = [namespaces] test_uris = namespaces if schema != "": - test_uris.append(schema) + test_uris.insert(0, schema) metadata_standard = self.get_metadata_standard_by_uris(test_uris) allow_merge = True if self.allowed_metadata_standards: @@ -797,7 +797,6 @@ def retrieve_metadata_embedded(self): pass # print('EXT META',ext_meta) self.logger.info("FsF-F2-01M : Trying to retrieve schema.org JSON-LD metadata from html page") - # TODO: actually schema.org, dcat and skos metadata is collected from a json-ld graph so this should be renamed schemaorg_collector_embedded = MetaDataCollectorRdf( loggerinst=self.logger, target_url=(self.pid_url or self.landing_url), @@ -1109,7 +1108,7 @@ def retrieve_metadata_external_rdf_negotiated(self, target_url_list=[]): source_rdf, neg_rdf_collector.metadata_format, neg_rdf_collector.getContentType(), - "http://www.w3.org/1999/02/22-rdf-syntax-ns", + neg_rdf_collector.main_entity_format, neg_rdf_collector.getNamespaces(), ) diff --git a/fuji_server/helper/metadata_collector_dublincore.py b/fuji_server/helper/metadata_collector_dublincore.py index 0235839e..be810776 100644 --- a/fuji_server/helper/metadata_collector_dublincore.py +++ b/fuji_server/helper/metadata_collector_dublincore.py @@ -38,6 +38,32 @@ def __init__(self, sourcemetadata, mapping, loggerinst): """ super().__init__(logger=loggerinst, mapping=mapping, sourcemetadata=sourcemetadata) + def parse_coverage(self, coverage, type): + type = type.split(".")[-1] # DCT.Period + cov = {"type": None, "value": [], "name": None} + coordinate_keys = ["east", "north", "northlimit", "eastlimit", "southlimit", "westlimit"] + period_keys = ["start", "end"] 
+ try: + cpts = coverage.split(";") + for cpt in cpts: + cvi = cpt.split("=") + if len(cvi) == 2: + if type in ["Point", "Box", "Location"]: + cov["type"] = "spatial" + if cvi[0].strip() == "name": + cov["name"] = cvi[1] + if cvi[0].strip() in coordinate_keys: + cov["value"].append(cvi[1]) + elif type in ["Period", "PeriodOfTime"]: + cov["type"] = "temporal" + if cvi[0].strip() == "name": + cov["name"] = cvi[1] + if cvi[0].strip() in period_keys: + cov["value"].append(cvi[1]) + except Exception as e: + print("ERROR: ", e) + return cov + def parse_metadata(self): """Parse the Dublin Core metadata from the data @@ -96,7 +122,9 @@ def parse_metadata(self): dc_t = None if len(dc_name_parts) == 3: dc_t = dc_name_parts[2] - meta_dc_matches.append([dc_name_parts[1], dc_t, meta_tag.get("content")]) + meta_dc_matches.append( + [dc_name_parts[1], dc_t, meta_tag.get("content"), meta_tag.get("scheme")] + ) # meta_dc_matches = re.findall(exp, self.source_metadata) except Exception as e: self.logger.exception(f"Parsing error, failed to extract DublinCore -: {e}") @@ -120,7 +148,8 @@ def parse_metadata(self): t = dc_meta[1] # 3 # value v = dc_meta[2] # 5 - + # scheme + s = dc_meta[3] if k.lower() == "date": if t == "dateAccepted": dc_core_metadata["accepted_date"] = v @@ -140,6 +169,18 @@ def parse_metadata(self): except Exception: # nothing found so just continue pass + if elem in ["coverage_spatial", "coverage_temporal"]: + coverage_info = self.parse_coverage(v, s) + v = {"name": coverage_info.get("name"), "reference": coverage_info.get("reference")} + if coverage_info.get("type") == "spatial": + v["coordinates"] = coverage_info.get("value") + elem = "coverage_spatial" + print("DC Spatial Coverage: ", v) + else: + elem = "coverage_temporal" + v["dates"] = coverage_info.get("value") + print("DC Temporal Coverage: ", v) + v = [v] if elem == "related_resources": # dc_core_metadata['related_resources'] = [] # tuple of type and relation diff --git 
a/fuji_server/helper/metadata_collector_rdf.py b/fuji_server/helper/metadata_collector_rdf.py index 55c122e8..0c778f3f 100644 --- a/fuji_server/helper/metadata_collector_rdf.py +++ b/fuji_server/helper/metadata_collector_rdf.py @@ -82,6 +82,7 @@ def __init__(self, loggerinst, target_url=None, source=None, json_ld_content=Non self.resolved_url = target_url self.content_type = None self.source_name = source + self.main_entity_format = str(RDF) # the main entity's format e.g. dcat:Dataset => DCAT etc.. self.metadata_format = MetadataFormats.RDF if self.source_name == MetadataSources.RDFA_EMBEDDED: self.metadata_format = MetadataFormats.RDFA @@ -158,7 +159,7 @@ def get_metadata_from_graph(self, rdf_response_graph): or rdflib.term.URIRef("https://schema.org/") in graph_namespaces.values() ): self.logger.info("FsF-F2-01M : RDF Graph seems to contain schema.org metadata elements") - schema_metadata = self.get_schemaorg_metadata_from_graph(rdf_response_graph) + schema_metadata = self.get_schemaorg_metadata(rdf_response_graph) if bool(set(ontology_indicator) & set(graph_namespaces.values())): self.logger.info("FsF-F2-01M : RDF Graph seems to contain SKOS/OWL metadata elements") skos_metadata = self.get_ontology_metadata(rdf_response_graph) @@ -692,24 +693,29 @@ def get_ontology_metadata(self, graph): return ont_metadata def get_main_entity(self, graph): + main_entity_item, main_entity_type, main_entity_namespace = None, None, None # Finding the main entity of the graph graph_entity_list = [] main_entity = {} creative_work_detected = False - # we aim to only test creative works and subtypes + # we aim to only test creative works and subtypes taking the terms (names) from schema.org creative_work_types = Preprocessor.get_schema_org_creativeworks() try: for cw in list(graph.subjects(predicate=RDF.type)): types = list(graph.objects(predicate=RDF.type, subject=cw)) types_names = [] + namespaces = [] for tp in types: type_name = re.split(r"/|#", str(tp))[-1] if type_name.lower in 
creative_work_types: creative_work_detected = True types_names.append(type_name) + namespaces.append(tp) nsbj = len(list(graph.subjects(object=cw))) nprp = len(list(graph.objects(subject=cw))) - graph_entity_list.append({"item": cw, "nosbj": nsbj, "noprp": nprp, "types": types_names, "score": 0}) + graph_entity_list.append( + {"item": cw, "nosbj": nsbj, "noprp": nprp, "types": types_names, "ns": namespaces, "score": 0} + ) # score max_prp = max(graph_entity_list, key=lambda x: x["noprp"])["noprp"] max_sbj = max(graph_entity_list, key=lambda x: x["nosbj"])["nosbj"] @@ -725,15 +731,20 @@ def get_main_entity(self, graph): score = prp_score + sbj_score / 2 graph_entity_list[gk]["score"] = score gk += 1 - main_entity = (sorted(graph_entity_list, key=lambda d: d["score"], reverse=True))[0] - if not creative_work_detected: - self.logger.info( - "FsF-F2-01M : Detected main entity found in RDF graph seems not to be a creative work type" - ) - return main_entity.get("item"), main_entity.get("types") + main_entity = (sorted(graph_entity_list, key=lambda d: d["score"], reverse=True))[0] + if not creative_work_detected: + self.logger.info( + "FsF-F2-01M : Detected main entity found in RDF graph seems not to be a creative work type" + ) + main_entity_item, main_entity_type, main_entity_namespace = ( + main_entity.get("item"), + main_entity.get("types"), + main_entity.get("ns"), + ) except Exception as ee: self.logger.warning("FsF-F2-01M : Failed to detect main entity in metadata given as RDF Graph") print("MAIN ENTITY IDENTIFICATION ERROR: ", ee) + return main_entity_item, main_entity_type, main_entity_namespace """def find_root_candidates(self, graph, allowed_types=["Dataset"]): allowed_types = [at.lower() for at in allowed_types if isinstance(at, str)] @@ -779,8 +790,8 @@ def get_main_entity(self, graph): return cand_creative_work, object_types_dict""" - def get_schemaorg_metadata_from_graph(self, graph): - main_entity_id, main_entity_type = self.get_main_entity(graph) + def 
get_schemaorg_metadata(self, graph): + main_entity_id, main_entity_type, main_entity_namespace = self.get_main_entity(graph) creative_work_type = "Dataset" if main_entity_id: creative_work = main_entity_id @@ -791,6 +802,7 @@ def get_schemaorg_metadata_from_graph(self, graph): # is e.g. important in case schema.org is encoded as RDFa and variuos namespaces are used # this is tested by namepace elsewhere if creative_work: + self.main_entity_format = str(SDO) schema_metadata = self.get_core_metadata(graph, creative_work, type=creative_work_type) # "access_free" access_free = graph.value(creative_work, SMA.isAccessibleForFree) or graph.value( @@ -891,6 +903,42 @@ def get_schemaorg_metadata_from_graph(self, graph): schema_metadata["object_content_identifier"].append( {"url": service_url, "type": service_type, "service": service_desc} ) + # spatialCoverage + schema_metadata["coverage_spatial"] = [] + for spatial in ( + list(graph.objects(creative_work, SMA.spatialCoverage)) + + list(graph.objects(creative_work, SDO.spatialCoverage)) + + list(graph.objects(creative_work, SMA.spatial)) + + list(graph.objects(creative_work, SDO.spatial)) + ): + spatial_info = {} + if graph.value(spatial, SMA.name) or graph.value(spatial, SDO.name): + # Place name + spatial_info["name"] = graph.value(spatial, SMA.name) or graph.value(spatial, SDO.name) + if graph.value(spatial, SMA.latitude) or graph.value(spatial, SDO.latitude): + spatial_info["coordinates"] = [ + (graph.value(spatial, SMA.latitude) or graph.value(spatial, SDO.latitude)), + (graph.value(spatial, SMA.longitude) or graph.value(spatial, SDO.longitude)), + ] + elif graph.value(spatial, SMA.geo) or graph.value(spatial, SDO.geo): + spatial_geo = graph.value(spatial, SMA.geo) or graph.value(spatial, SDO.geo) + if graph.value(spatial_geo, SMA.latitude) or graph.value(spatial_geo, SDO.latitude): + spatial_info["coordinates"] = [ + (graph.value(spatial_geo, SMA.latitude) or graph.value(spatial_geo, SDO.latitude)), + 
(graph.value(spatial_geo, SMA.longitude) or graph.value(spatial_geo, SDO.longitude)), + ] + else: + spatial_extent = ( + graph.value(spatial_geo, SMA.box) + or graph.value(spatial_geo, SDO.box) + or graph.value(spatial_geo, SMA.polygon) + or graph.value(spatial_geo, SDO.polygon) + or graph.value(spatial_geo, SMA.line) + or graph.value(spatial_geo, SDO.line) + ) + spatial_info["coordinates"] = re.split(r"[\s,]+", str(spatial_extent)) + if spatial_info: + schema_metadata["coverage_spatial"].append(spatial_info) schema_metadata["measured_variable"] = [] for variable in list(graph.objects(creative_work, SMA.variableMeasured)) + list( @@ -957,7 +1005,7 @@ def get_dcat_metadata(self, graph): CSVW = Namespace("http://www.w3.org/ns/csvw#") dcat_root_type = "Dataset" datasets = [] - main_entity_id, main_entity_type = self.get_main_entity(graph) + main_entity_id, main_entity_type, main_entity_namespace = self.get_main_entity(graph) if main_entity_id: # prioritize Dataset type if "Dataset" not in main_entity_type: @@ -969,6 +1017,7 @@ def get_dcat_metadata(self, graph): if len(datasets) > 1: self.logger.info("FsF-F2-01M : Found more than one DCAT Dataset description, will use first one") if len(datasets) > 0: + self.main_entity_format = str(DCAT) dcat_metadata = self.get_core_metadata(graph, datasets[0], type="Dataset") # distribution distribution = graph.objects(datasets[0], DCAT.distribution) diff --git a/fuji_server/helper/metadata_mapper.py b/fuji_server/helper/metadata_mapper.py index c22be5d5..831a247a 100644 --- a/fuji_server/helper/metadata_mapper.py +++ b/fuji_server/helper/metadata_mapper.py @@ -44,9 +44,11 @@ def flip_dict(dict_to_flip): "publication_date": {"label": "Publication Date", "sameAs": "http://purl.org/dc/terms/date"}, "summary": {"label": "Summary", "sameAs": "http://purl.org/dc/terms/abstract"}, "keywords": {"label": "Keywords", "sameAs": "http://purl.org/dc/terms/subject"}, + # object_content_identifier (list) subproperties: 'url', 'type', 'size' 
"object_content_identifier": {"label": "Content (Data) Identifier", "sameAs": "https://schema.org/contentUrl"}, "access_level": {"label": "Access Level", "sameAs": "http://purl.org/dc/terms/accessRights"}, "access_free": {"label": "Free Access", "sameAs": "https://schema.org/isAccessibleForFree"}, + # related_resources (list) subproperties: 'relation_type', 'related_resource' "related_resources": {"label": "Related resources", "sameAs": "http://purl.org/dc/terms/related"}, "provenance_general": {"label": "Provenance", "sameAs": "http://purl.org/dc/terms/provenance"}, "measured_variable": {"label": "Measured Variable", "sameAs": "https://schema.org/variableMeasured"}, @@ -60,8 +62,13 @@ def flip_dict(dict_to_flip): "right_holder": {"label": "License", "sameAs": "http://purl.org/dc/terms/rightsHolder"}, "object_size": {"label": "Object Size", "sameAs": "http://purl.org/dc/terms/extent"}, "language": {"label": "Language", "sameAs": "http://purl.org/dc/terms/language"}, + # required for Github etc. software FAIR assessment "license_path": {"label": "License Path", "sameAs": None}, "metadata_service": {"label": "Metadata Service", "sameAs": None}, + # spatial coverage (list): subproperties: 'name' (string or URI), 'coordinates' (list), 'reference' (string or URI). Either name or coordinates MUST be there + "coverage_spatial": {"label": "Geographical Coverage", "sameAs": "http://purl.org/dc/terms/Location"}, + # temporal coverage (list): subproperties: 'name', 'date' + "coverage_temporal": {"label": "Temporal Coverage", "sameAs": None}, } # core metadata elements (FsF-F2-01M) @@ -177,6 +184,8 @@ def flip_dict(dict_to_flip): "isRequiredBy", ], "language": "language", + "coverage_spatial": ["coverage", "Location", "spatial"], + "coverage_temporal": ["coverage", "PeriodOfTime", "Period", "temporal"], } # https://ogp.me/