Skip to content

Commit

Permalink
Merge pull request #173 from oemof/feature/infer_metadata
Browse files Browse the repository at this point in the history
Infer metadata from the data *.csv file
  • Loading branch information
Bachibouzouk authored Jun 4, 2024
2 parents 322b05d + ed91085 commit 782118e
Show file tree
Hide file tree
Showing 21 changed files with 338 additions and 114 deletions.
1 change: 1 addition & 0 deletions AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ Authors
* Marie-Claire Gering
* Julian Endres
* Felix Maurer
* Pierre-Francois Duc
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Unreleased
----------

Features
* Improve the function to infer package metadata `#173 <https://github.com/oemof/oemof-tabular/pull/173>`_

Fixes

Expand Down
15 changes: 14 additions & 1 deletion docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,18 @@ This can also be done for sequences and geometries.
To create meta-data `json` file you can use the following code:


.. code-block:: python
from datapackage_utilities import building
building.infer_metadata_from_data(
package_name="my-datapackage",
path="/home/user/datapackages/my-datapackage"
)
Or, if you want to specify manually the relation of the foreign keys, you can use this code:

.. code-block:: python
from datapackage_utilities import building
Expand Down Expand Up @@ -354,7 +366,8 @@ field names in the generators-profile resource.
.. note::

This usage breaks with the datapackage standard and creates
non-valid resources.**
non-valid resources.



Scripting
Expand Down
2 changes: 1 addition & 1 deletion src/oemof/tabular/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.0.5"
__version__ = "0.0.6dev"
__project__ = "oemof.tabular"


Expand Down
5 changes: 5 additions & 0 deletions src/oemof/tabular/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@
with open(FOREIGN_KEY_DESCRIPTORS_FILE, "r") as fk_descriptors_file:
FOREIGN_KEY_DESCRIPTORS = json.load(fk_descriptors_file)

# Map every field name occurring in a foreign-key descriptor to the name
# of the foreign key it belongs to (later descriptors win on collision,
# matching plain-dict assignment order).
SPECIAL_FIELD_NAMES = {
    el["fields"]: fk
    for fk, descriptor in FOREIGN_KEY_DESCRIPTORS.items()
    for el in descriptor
}

supported_oemof_tabular_versions = [
None,
"0.0.1",
Expand Down
204 changes: 204 additions & 0 deletions src/oemof/tabular/datapackage/building.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import sys
import tarfile
import urllib.request
import warnings
import zipfile
from ftplib import FTP
from urllib.parse import urlparse
Expand Down Expand Up @@ -59,6 +60,208 @@ def update_package_descriptor():
p.save("datapackage.json")


def map_sequence_profiles_to_resource_name(
    p, excluded_profiles=("timeindex",)
):
    """Map each field of every sequence resource to that resource's name

    Only resources whose path contains "/sequences/" are inspected.
    Fields listed in ``excluded_profiles`` are ignored. The uniqueness of
    the remaining field names across all sequence resources is checked
    (the field "timeindex" is always tolerated); each duplicate triggers
    a warning and, once all resources are scanned, a ValueError.

    Parameters
    ----------
    p: a :datapackage.Package: instance
        Package whose sequence resources are inspected.
    excluded_profiles: tuple of string
        Field names which are not profiles and may repeat across files.

    Returns
    -------
    dict mapping each profile (sequence field name) to the name of the
    resource it belongs to.

    Raises
    ------
    ValueError
        If a profile name is used in more than one sequence resource.
    """

    def check_sequences_labels_unicity(labels, new_labels):
        # Return the (sorted, for deterministic reporting) labels present
        # in both lists; "timeindex" is tolerated as it may legitimately
        # appear in every sequence file.
        duplicates = sorted(set(labels).intersection(new_labels))
        if "timeindex" in duplicates:
            duplicates.remove("timeindex")

        if duplicates:
            # N.B. join the list of duplicates, not the raw intersection:
            # joining a single popped string would separate its characters
            warnings.warn(
                f"The labels of the profiles are not unique across all "
                f"files within 'sequences' folder: '{','.join(duplicates)}' "
                f"used more than once"
            )
        return duplicates

    sequences = {}
    sequence_labels = []
    duplicated_labels = []
    for r in p.resources:
        if "/sequences/" in r.descriptor["path"]:
            field_labels = [
                f.name
                for f in r.schema.fields
                if f.name not in excluded_profiles
            ]
            sequences[r.descriptor["name"]] = field_labels
            duplicated_labels += check_sequences_labels_unicity(
                sequence_labels, field_labels
            )
            sequence_labels += field_labels

    if duplicated_labels:
        raise ValueError(
            f"The following sequences labels are not unique"
            f" across all sequences files: "
            f"{', '.join(duplicated_labels)}"
        )
    # map each profile to its resource name
    sequences_mapping = {
        value: key for (key, values) in sequences.items() for value in values
    }
    return sequences_mapping


def infer_resource_foreign_keys(resource, sequences_profiles_to_resource):
    """Find out the foreign keys within a resource fields

    Every field of type 'string' is scanned; whenever one of its values
    matches a profile header of a sequences resource, a foreign-key entry
    pointing at that resource is appended to the schema (duplicates are
    skipped).

    Parameters
    ----------
    resource: a :datapackage.Resource: instance
    sequences_profiles_to_resource: the mapping of sequence profile
        headers to their resource name

    Returns
    -------
    The :datapackage.Resource: instance with updated "foreignKeys" field
    """
    records = pd.DataFrame.from_records(resource.read(keyed=True))
    schema = resource.descriptor["schema"]
    # TODO not sure this should be set here
    schema["primaryKey"] = "name"
    foreign_keys = schema.setdefault("foreignKeys", [])

    for field in resource.schema.fields:
        if field.type != "string":
            continue
        for candidate in records[field.name].dropna().unique():
            target = sequences_profiles_to_resource.get(candidate)
            if target is None:
                continue
            # this is actually a wrong format and should be
            # with a "fields" field under the "reference" fields
            fk = {
                "fields": field.name,
                "reference": {
                    "resource": target,
                },
            }
            if fk not in foreign_keys:
                foreign_keys.append(fk)
    resource.commit()
    return resource


def infer_package_foreign_keys(package):
    """Infer the foreign_keys from elements and sequences and update meta data

    Each resource located under the "elements" folder gets its
    "foreignKeys" schema entries inferred against the sequence profiles
    and is then re-registered so the package descriptor is updated.

    Parameters
    ----------
    package: a :datapackage.Package: instance

    Returns
    -------
    None; the package metadata is modified in place.
    """
    profile_mapping = map_sequence_profiles_to_resource_name(package)

    element_marker = os.sep + "elements" + os.sep
    for resource in package.resources:
        if element_marker not in resource.descriptor["path"]:
            continue
        resource = infer_resource_foreign_keys(resource, profile_mapping)
        # sort the foreign-key entries alphabetically by their fields
        resource.descriptor["schema"]["foreignKeys"].sort(
            key=lambda fk: fk["fields"]
        )
        package.remove_resource(resource.name)
        package.add_resource(resource.descriptor)


def infer_metadata_from_data(
    path,
    package_name="default-name",
    metadata_filename="datapackage.json",
):
    """Creates a metadata .json file at the root-folder of datapackage

    The foreign keys are inferred from the csv files within
    "data/elements" and "data/sequences" resources.

    Parameters
    ----------
    path: string
        Absolute path to root-folder of the datapackage
    package_name: string
        Name of the data package
    metadata_filename: string
        Name of the inferred metadata file.

    Returns
    -------
    None. A json metadata file is saved at the root-folder of the
    datapackage under the provided path.
    """
    path = os.path.abspath(path)
    metadata_path = os.path.join(path, metadata_filename)

    # Infer the fields from the package data
    inferred = Package(base_path=path)
    inferred.infer(os.path.join(path, "**" + os.sep + "*.csv"))
    inferred.commit()
    inferred.save(metadata_path)

    foreign_keys = {}

    def infer_resource_basic_foreign_keys(resource):
        """Prepare foreign_keys dict for building.infer_metadata

        Any field whose name is a known foreign key (i.e. listed in
        config.SPECIAL_FIELD_NAMES) registers the resource name under
        the matching foreign-key descriptor in 'foreign_keys'.
        """
        for field in resource.schema.fields:
            fk_descriptor = config.SPECIAL_FIELD_NAMES.get(field.name)
            if fk_descriptor is None:
                continue
            resource_names = foreign_keys.setdefault(fk_descriptor, [])
            if resource.name not in resource_names:
                resource_names.append(resource.name)

    element_marker = os.sep + "elements" + os.sep
    for resource in inferred.resources:
        if element_marker in resource.descriptor["path"]:
            infer_resource_basic_foreign_keys(resource)

    # this function saves the metadata of the package in json format
    infer_metadata(
        package_name=package_name,
        path=path,
        foreign_keys=foreign_keys,
        metadata_filename=metadata_filename,
    )

    # reload the package from the saved json file
    package = Package(metadata_path)
    infer_package_foreign_keys(package)
    package.descriptor["resources"].sort(key=lambda r: (r["path"], r["name"]))
    package.commit()
    package.save(metadata_path)


def infer_metadata(
package_name="default-name",
keep_resources=False,
Expand Down Expand Up @@ -231,6 +434,7 @@ def infer_metadata(
)
p.add_resource(r.descriptor)

p.descriptor["resources"].sort(key=lambda x: (x["path"], x["name"]))
p.commit()
p.save(metadata_filename)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"profile": "tabular-data-package",
"name": "oemof-tabular-dispatch-example",
"name": "dispatch-example",
"oemof_tabular_version": "0.0.6dev",
"resources": [
{
Expand Down Expand Up @@ -435,4 +435,4 @@
}
}
]
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
kwargs = {}

building.infer_metadata(
package_name="oemof-tabular-dispatch-example",
package_name="dispatch-example",
foreign_keys={
"bus": ["volatile", "dispatchable", "storage", "load"],
"profile": ["load", "volatile"],
Expand Down
Loading

0 comments on commit 782118e

Please sign in to comment.