started spatial coverage metadata parsing for DC and schema.org #537 #538; RDF main entity detection now also gives a main_entity_format (instead of the default RDF), e.g. schema_org or DCAT; fixed wrong metadata format (schema) detection in case many namespaces are found
huberrob committed Oct 22, 2024
1 parent 6675fb2 commit 95da507
Showing 4 changed files with 115 additions and 17 deletions.
5 changes: 2 additions & 3 deletions fuji_server/harvester/metadata_harvester.py
@@ -146,7 +146,7 @@ def merge_metadata(self, metadict, url, method, format, mimetype, schema="", nam
namespaces = [namespaces]
test_uris = namespaces
if schema != "":
test_uris.append(schema)
test_uris.insert(0, schema)
metadata_standard = self.get_metadata_standard_by_uris(test_uris)
allow_merge = True
if self.allowed_metadata_standards:
@@ -797,7 +797,6 @@ def retrieve_metadata_embedded(self):
pass
# print('EXT META',ext_meta)
self.logger.info("FsF-F2-01M : Trying to retrieve schema.org JSON-LD metadata from html page")
# TODO: actually schema.org, dcat and skos metadata is collected from a json-ld graph so this should be renamed
schemaorg_collector_embedded = MetaDataCollectorRdf(
loggerinst=self.logger,
target_url=(self.pid_url or self.landing_url),
@@ -1109,7 +1108,7 @@ def retrieve_metadata_external_rdf_negotiated(self, target_url_list=[]):
source_rdf,
neg_rdf_collector.metadata_format,
neg_rdf_collector.getContentType(),
"http://www.w3.org/1999/02/22-rdf-syntax-ns",
neg_rdf_collector.main_entity_format,
neg_rdf_collector.getNamespaces(),
)

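The switch from append to insert(0, ...) above makes an explicitly declared schema URI win the metadata-standard lookup over namespace guesses, and the negotiated-RDF harvester now forwards the collector's main_entity_format as that schema hint. A minimal sketch of the resulting lookup order (the URIs are hypothetical examples; get_metadata_standard_by_uris itself is not shown):

# Sketch: ordering of candidate URIs passed to get_metadata_standard_by_uris().
namespaces = ["http://www.w3.org/ns/dcat#", "http://purl.org/dc/terms/"]
schema = "https://schema.org/"  # e.g. neg_rdf_collector.main_entity_format

test_uris = list(namespaces)
test_uris.insert(0, schema)  # the schema hint is now checked first
# before this commit: test_uris.append(schema) -- the hint was checked last,
# so a generic RDF namespace could decide the metadata standard instead
assert test_uris[0] == "https://schema.org/"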
45 changes: 43 additions & 2 deletions fuji_server/helper/metadata_collector_dublincore.py
@@ -38,6 +38,32 @@ def __init__(self, sourcemetadata, mapping, loggerinst):
"""
super().__init__(logger=loggerinst, mapping=mapping, sourcemetadata=sourcemetadata)

def parse_coverage(self, coverage, type):
type = str(type).split(".")[-1]  # scheme values look like e.g. "DCT.Period" or "DCTERMS.Box"
cov = {"type": None, "value": [], "name": None}
coordinate_keys = ["east", "north", "northlimit", "eastlimit", "southlimit", "westlimit"]
period_keys = ["start", "end"]
try:
cpts = coverage.split(";")
for cpt in cpts:
cvi = cpt.split("=")
if len(cvi) == 2:
if type in ["Point", "Box", "Location"]:
cov["type"] = "spatial"
if cvi[0].strip() == "name":
cov["name"] = cvi[1]
if cvi[0].strip() in coordinate_keys:
cov["value"].append(cvi[1])
elif type in ["Period", "PeriodOfTime"]:
cov["type"] = "temporal"
if cvi[0].strip() == "name":
cov["name"] = cvi[1]
if cvi[0].strip() in period_keys:
cov["value"].append(cvi[1])
except Exception as e:
print("ERROR: ", e)
return cov

def parse_metadata(self):
"""Parse the Dublin Core metadata from the data
@@ -96,7 +122,9 @@ def parse_metadata(self):
dc_t = None
if len(dc_name_parts) == 3:
dc_t = dc_name_parts[2]
meta_dc_matches.append([dc_name_parts[1], dc_t, meta_tag.get("content")])
meta_dc_matches.append(
[dc_name_parts[1], dc_t, meta_tag.get("content"), meta_tag.get("scheme")]
)
# meta_dc_matches = re.findall(exp, self.source_metadata)
except Exception as e:
self.logger.exception(f"Parsing error, failed to extract DublinCore -: {e}")
@@ -120,7 +148,8 @@ def parse_metadata(self):
t = dc_meta[1] # 3
# value
v = dc_meta[2] # 5

# scheme
s = dc_meta[3]
if k.lower() == "date":
if t == "dateAccepted":
dc_core_metadata["accepted_date"] = v
@@ -140,6 +169,18 @@ def parse_metadata(self):
except Exception:
# nothing found so just continue
pass
if elem in ["coverage_spatial", "coverage_temporal"]:
coverage_info = self.parse_coverage(v, s)
v = {"name": coverage_info.get("name"), "reference": coverage_info.get("reference")}
if coverage_info.get("type") == "spatial":
v["coordinates"] = coverage_info.get("value")
elem = "coverage_spatial"
print("DC Spatial Coverage: ", v)
else:
elem = "coverage_temporal"
v["dates"] = coverage_info.get("value")
print("DC Temporal Coverage: ", v)
v = [v]
if elem == "related_resources":
# dc_core_metadata['related_resources'] = []
# tuple of type and relation
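For context, the scheme attribute captured above follows the DCMI encoding schemes, which pack key=value pairs separated by semicolons. A minimal sketch of what the new parse_coverage returns for such inputs (the values are made-up examples, and collector stands for a MetaDataCollectorDublinCore instance):

# Hypothetical inputs, e.g. from
# <meta name="DC.coverage" scheme="DCTERMS.Box" content="...">
box = "name=Western Australia; northlimit=-13.5; southlimit=-35.5; westlimit=112.5; eastlimit=129"
period = "name=Holocene; start=-9000; end=2024"

collector.parse_coverage(box, "DCTERMS.Box")
# -> {"type": "spatial", "value": ["-13.5", "-35.5", "112.5", "129"], "name": "Western Australia"}
collector.parse_coverage(period, "DCTERMS.Period")
# -> {"type": "temporal", "value": ["-9000", "2024"], "name": "Holocene"}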
73 changes: 61 additions & 12 deletions fuji_server/helper/metadata_collector_rdf.py
@@ -82,6 +82,7 @@ def __init__(self, loggerinst, target_url=None, source=None, json_ld_content=Non
self.resolved_url = target_url
self.content_type = None
self.source_name = source
self.main_entity_format = str(RDF)  # the main entity's format, e.g. dcat:Dataset => DCAT
self.metadata_format = MetadataFormats.RDF
if self.source_name == MetadataSources.RDFA_EMBEDDED:
self.metadata_format = MetadataFormats.RDFA
@@ -158,7 +159,7 @@ def get_metadata_from_graph(self, rdf_response_graph):
or rdflib.term.URIRef("https://schema.org/") in graph_namespaces.values()
):
self.logger.info("FsF-F2-01M : RDF Graph seems to contain schema.org metadata elements")
schema_metadata = self.get_schemaorg_metadata_from_graph(rdf_response_graph)
schema_metadata = self.get_schemaorg_metadata(rdf_response_graph)
if bool(set(ontology_indicator) & set(graph_namespaces.values())):
self.logger.info("FsF-F2-01M : RDF Graph seems to contain SKOS/OWL metadata elements")
skos_metadata = self.get_ontology_metadata(rdf_response_graph)
@@ -692,24 +693,29 @@ def get_ontology_metadata(self, graph):
return ont_metadata

def get_main_entity(self, graph):
main_entity_item, main_entity_type, main_entity_namespace = None, None, None
# Finding the main entity of the graph
graph_entity_list = []
main_entity = {}
creative_work_detected = False
# we aim to only test creative works and subtypes
# we aim to only test creative works and subtypes taking the terms (names) from schema.org
creative_work_types = Preprocessor.get_schema_org_creativeworks()
try:
for cw in list(graph.subjects(predicate=RDF.type)):
types = list(graph.objects(predicate=RDF.type, subject=cw))
types_names = []
namespaces = []
for tp in types:
type_name = re.split(r"/|#", str(tp))[-1]
if type_name.lower() in creative_work_types:
creative_work_detected = True
types_names.append(type_name)
namespaces.append(tp)
nsbj = len(list(graph.subjects(object=cw)))
nprp = len(list(graph.objects(subject=cw)))
graph_entity_list.append({"item": cw, "nosbj": nsbj, "noprp": nprp, "types": types_names, "score": 0})
graph_entity_list.append(
{"item": cw, "nosbj": nsbj, "noprp": nprp, "types": types_names, "ns": namespaces, "score": 0}
)
# score
max_prp = max(graph_entity_list, key=lambda x: x["noprp"])["noprp"]
max_sbj = max(graph_entity_list, key=lambda x: x["nosbj"])["nosbj"]
@@ -725,15 +731,20 @@ def get_main_entity(self, graph):
score = prp_score + sbj_score / 2
graph_entity_list[gk]["score"] = score
gk += 1
main_entity = (sorted(graph_entity_list, key=lambda d: d["score"], reverse=True))[0]
if not creative_work_detected:
self.logger.info(
"FsF-F2-01M : Detected main entity found in RDF graph seems not to be a creative work type"
)
return main_entity.get("item"), main_entity.get("types")
main_entity = (sorted(graph_entity_list, key=lambda d: d["score"], reverse=True))[0]
if not creative_work_detected:
self.logger.info(
"FsF-F2-01M : Detected main entity found in RDF graph seems not to be a creative work type"
)
main_entity_item, main_entity_type, main_entity_namespace = (
main_entity.get("item"),
main_entity.get("types"),
main_entity.get("ns"),
)
except Exception as ee:
self.logger.warning("FsF-F2-01M : Failed to detect main entity in metadata given as RDF Graph")
print("MAIN ENTITY IDENTIFICATION ERROR: ", ee)
return main_entity_item, main_entity_type, main_entity_namespace
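The scoring above prefers nodes that are richly described (many properties) and often referenced (many incoming statements). A toy re-run of the same arithmetic, under the assumption that prp_score and sbj_score are the normalized counts computed in the lines this hunk hides:

# Toy illustration of the main-entity scoring (hypothetical counts).
entities = [
    {"item": "ex:dataset", "nosbj": 1, "noprp": 24, "score": 0},
    {"item": "ex:person", "nosbj": 3, "noprp": 5, "score": 0},
]
max_prp = max(e["noprp"] for e in entities)
max_sbj = max(e["nosbj"] for e in entities)
for e in entities:
    prp_score = e["noprp"] / max_prp  # assumed normalization
    sbj_score = e["nosbj"] / max_sbj
    e["score"] = prp_score + sbj_score / 2  # same formula as above
main_entity = sorted(entities, key=lambda d: d["score"], reverse=True)[0]
assert main_entity["item"] == "ex:dataset"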

"""def find_root_candidates(self, graph, allowed_types=["Dataset"]):
allowed_types = [at.lower() for at in allowed_types if isinstance(at, str)]
@@ -779,8 +790,8 @@ def get_main_entity(self, graph):
return cand_creative_work, object_types_dict"""

def get_schemaorg_metadata_from_graph(self, graph):
main_entity_id, main_entity_type = self.get_main_entity(graph)
def get_schemaorg_metadata(self, graph):
main_entity_id, main_entity_type, main_entity_namespace = self.get_main_entity(graph)
creative_work_type = "Dataset"
if main_entity_id:
creative_work = main_entity_id
@@ -791,6 +802,7 @@ def get_schemaorg_metadata_from_graph(self, graph):
# is e.g. important in case schema.org is encoded as RDFa and various namespaces are used
# this is tested by namespace elsewhere
if creative_work:
self.main_entity_format = str(SDO)
schema_metadata = self.get_core_metadata(graph, creative_work, type=creative_work_type)
# "access_free"
access_free = graph.value(creative_work, SMA.isAccessibleForFree) or graph.value(
@@ -891,6 +903,42 @@ def get_schemaorg_metadata_from_graph(self, graph):
schema_metadata["object_content_identifier"].append(
{"url": service_url, "type": service_type, "service": service_desc}
)
# spatialCoverage
schema_metadata["coverage_spatial"] = []
for spatial in (
list(graph.objects(creative_work, SMA.spatialCoverage))
+ list(graph.objects(creative_work, SDO.spatialCoverage))
+ list(graph.objects(creative_work, SMA.spatial))
+ list(graph.objects(creative_work, SDO.spatial))
):
spatial_info = {}
if graph.value(spatial, SMA.name) or graph.value(spatial, SDO.name):
# Place name
spatial_info["name"] = graph.value(spatial, SMA.name) or graph.value(spatial, SDO.name)
if graph.value(spatial, SMA.latitude) or graph.value(spatial, SDO.latitude):
spatial_info["coordinates"] = [
(graph.value(spatial, SMA.latitude) or graph.value(spatial, SDO.latitude)),
(graph.value(spatial, SMA.longitude) or graph.value(spatial, SDO.longitude)),
]
elif graph.value(spatial, SMA.geo) or graph.value(spatial, SDO.geo):
spatial_geo = graph.value(spatial, SMA.geo) or graph.value(spatial, SDO.geo)
if graph.value(spatial_geo, SMA.latitude) or graph.value(spatial_geo, SDO.latitude):
spatial_info["coordinates"] = [
(graph.value(spatial_geo, SMA.latitude) or graph.value(spatial_geo, SDO.latitude)),
(graph.value(spatial_geo, SMA.longitude) or graph.value(spatial_geo, SDO.longitude)),
]
else:
spatial_extent = (
graph.value(spatial_geo, SMA.box)
or graph.value(spatial_geo, SDO.box)
or graph.value(spatial_geo, SMA.polygon)
or graph.value(spatial_geo, SDO.polygon)
or graph.value(spatial_geo, SMA.line)
or graph.value(spatial_geo, SDO.line)
)
spatial_info["coordinates"] = re.split(r"[\s,]+", str(spatial_extent))
if spatial_info:
schema_metadata["coverage_spatial"].append(spatial_info)

schema_metadata["measured_variable"] = []
for variable in list(graph.objects(creative_work, SMA.variableMeasured)) + list(
@@ -957,7 +1005,7 @@ def get_dcat_metadata(self, graph):
CSVW = Namespace("http://www.w3.org/ns/csvw#")
dcat_root_type = "Dataset"
datasets = []
main_entity_id, main_entity_type = self.get_main_entity(graph)
main_entity_id, main_entity_type, main_entity_namespace = self.get_main_entity(graph)
if main_entity_id:
# prioritize Dataset type
if "Dataset" not in main_entity_type:
@@ -969,6 +1017,7 @@ def get_dcat_metadata(self, graph):
if len(datasets) > 1:
self.logger.info("FsF-F2-01M : Found more than one DCAT Dataset description, will use first one")
if len(datasets) > 0:
self.main_entity_format = str(DCAT)
dcat_metadata = self.get_core_metadata(graph, datasets[0], type="Dataset")
# distribution
distribution = graph.objects(datasets[0], DCAT.distribution)
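A minimal JSON-LD snippet of the shape the new spatialCoverage branch handles; the values are invented, and SMA/SDO are the collector's http/https schema.org namespaces. Since the Place carries no latitude/longitude, the geo/box branch would apply:

# Hypothetical schema.org input for the spatialCoverage handling above.
doc = """{
  "@context": "https://schema.org/",
  "@type": "Dataset",
  "name": "Example dataset",
  "spatialCoverage": {
    "@type": "Place",
    "name": "North Sea",
    "geo": {"@type": "GeoShape", "box": "51.0 2.0 56.0 9.0"}
  }
}"""
# Expected result after parsing doc into an rdflib graph and running the
# collector (name values are rdflib Literals):
# coverage_spatial == [{"name": Literal("North Sea"),
#                       "coordinates": ["51.0", "2.0", "56.0", "9.0"]}]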
9 changes: 9 additions & 0 deletions fuji_server/helper/metadata_mapper.py
@@ -44,9 +44,11 @@ def flip_dict(dict_to_flip):
"publication_date": {"label": "Publication Date", "sameAs": "http://purl.org/dc/terms/date"},
"summary": {"label": "Summary", "sameAs": "http://purl.org/dc/terms/abstract"},
"keywords": {"label": "Keywords", "sameAs": "http://purl.org/dc/terms/subject"},
# object_content_identifier (list) subproperties: 'url', 'type', 'size'
"object_content_identifier": {"label": "Content (Data) Identifier", "sameAs": "https://schema.org/contentUrl"},
"access_level": {"label": "Access Level", "sameAs": "http://purl.org/dc/terms/accessRights"},
"access_free": {"label": "Free Access", "sameAs": "https://schema.org/isAccessibleForFree"},
# related_resources (list) subproperties: 'relation_type', 'related_resource'
"related_resources": {"label": "Related resources", "sameAs": "http://purl.org/dc/terms/related"},
"provenance_general": {"label": "Provenance", "sameAs": "http://purl.org/dc/terms/provenance"},
"measured_variable": {"label": "Measured Variable", "sameAs": "https://schema.org/variableMeasured"},
@@ -60,8 +62,13 @@ def flip_dict(dict_to_flip):
"right_holder": {"label": "License", "sameAs": "http://purl.org/dc/terms/rightsHolder"},
"object_size": {"label": "Object Size", "sameAs": "http://purl.org/dc/terms/extent"},
"language": {"label": "Language", "sameAs": "http://purl.org/dc/terms/language"},
# required for GitHub etc. software FAIR assessment
"license_path": {"label": "License Path", "sameAs": None},
"metadata_service": {"label": "Metadata Service", "sameAs": None},
# spatial coverage (list): subproperties: 'name' (string or URI), 'coordinates' (list), 'reference' (string or URI). Either 'name' or 'coordinates' MUST be present
"coverage_spatial": {"label": "Geographical Coverage", "sameAs": "http://purl.org/dc/terms/Location"},
# temporal coverage (list): subproperties: 'name', 'dates' (list)
"coverage_temporal": {"label": "Temporal Coverage", "sameAs": None},
}

# core metadata elements (FsF-F2-01M)
Expand Down Expand Up @@ -177,6 +184,8 @@ def flip_dict(dict_to_flip):
"isRequiredBy",
],
"language": "language",
"coverage_spatial": ["coverage", "Location", "spatial"],
"coverage_temporal": ["coverage", "PeriodOfTime", "Period", "temporal"],
}

# https://ogp.me/
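Taken together, the two collectors and these mapper entries let harvested metadata carry the new elements roughly as follows (a sketch assembled from the subproperty comments above and the collector code; all values are invented):

# Hypothetical harvested metadata after this commit.
harvested = {
    "coverage_spatial": [
        {"name": "North Sea", "coordinates": ["51.0", "2.0", "56.0", "9.0"], "reference": None}
    ],
    "coverage_temporal": [
        {"name": "Holocene", "dates": ["-9000", "2024"]}
    ],
}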
