📊 Improve minerals explorer, further changes #3167

Merged on Aug 23, 2024 (25 commits)
Commits
- f99b22c: 📊 Improve minerals explorer, further changes (pabloarosado, Aug 21, 2024)
- ffd925a: Include World aggregate for BGS Uranium data (pabloarosado, Aug 21, 2024)
- ef71130: Add global share of nickel processed production (pabloarosado, Aug 21, 2024)
- 5da0dbf: Create a meta.yml file and improve metadata (pabloarosado, Aug 22, 2024)
- ab0f4c4: Fix missing share columns (pabloarosado, Aug 22, 2024)
- 91a6efd: Add DoDs (pabloarosado, Aug 22, 2024)
- e98bdd4: Fix chromium data (pabloarosado, Aug 22, 2024)
- 6273414: Fix chromium reserves (pabloarosado, Aug 22, 2024)
- 9ec51bb: Combine BGS and USGS data for chromium (pabloarosado, Aug 22, 2024)
- f3164d0: Minor fix in map bracketer (pabloarosado, Aug 22, 2024)
- a1fe5e0: Minor improvement to map bracketer (pabloarosado, Aug 22, 2024)
- a8b85f6: Update map brackets (pabloarosado, Aug 22, 2024)
- a0165fc: Avoid unnecessary warnings in BGS meadow step (pabloarosado, Aug 22, 2024)
- 79bed10: Properly extract metadata from attached word documents, without ignor… (pabloarosado, Aug 23, 2024)
- 3534978: Merge branch 'master' of github.com:owid/etl into improve-minerals-ex… (pabloarosado, Aug 23, 2024)
- da672e1: Merge branch 'master' of github.com:owid/etl into improve-minerals-ex… (pabloarosado, Aug 23, 2024)
- 0664687: Remove unnecessary code about imports and exports (pabloarosado, Aug 23, 2024)
- 57ffbc4: Handle BGS notes and footnotes (pabloarosado, Aug 23, 2024)
- a4374a1: Use yml files instead to edit notes (pabloarosado, Aug 23, 2024)
- 9509eef: Add description processing (pabloarosado, Aug 23, 2024)
- 85a0356: Improve historical USGS snapshot metadata (pabloarosado, Aug 23, 2024)
- 88732ff: Improve unit value metadata (pabloarosado, Aug 23, 2024)
- c26b8d6: Add new metals & minerals tag (pabloarosado, Aug 23, 2024)
- 9957dce: Remove short attribution (pabloarosado, Aug 23, 2024)
- d10ebf0: Some cleaning in BGS notes (pabloarosado, Aug 23, 2024)
6 changes: 5 additions & 1 deletion apps/wizard/app_pages/map_brackets.py
@@ -982,10 +982,14 @@ def _create_maximum_instances_message(mb: MapBracketer) -> str:
if variable_id not in variable_ids_with_brackets_already_defined
]

if len(variable_ids) == 0:
st.error("No variables to choose from. They may have map brackets already defined.")
st.stop()

# Select a variable id from a dropdown menu.
variable_id: str = str(
st.selectbox( # type: ignore
label="Indicator id",
label=f"Indicator id ({len(variable_ids)} variables available)",
options=variable_ids,
index=0,
)
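The new guard stops the wizard page early when every indicator already has map brackets defined, instead of rendering an empty dropdown and failing later. A minimal sketch of the pattern, with hypothetical indicator ids standing in for the ones the wizard loads from the database:

```python
import streamlit as st

# Hypothetical ids; in the wizard these are the indicators that do not yet
# have map brackets defined.
variable_ids = ["12345", "67890"]

if len(variable_ids) == 0:
    # Without this guard, st.selectbox would be rendered with an empty options
    # list and the rest of the page would fail with a less helpful error.
    st.error("No variables to choose from. They may have map brackets already defined.")
    st.stop()  # halt execution of this script run

variable_id = str(
    st.selectbox(
        label=f"Indicator id ({len(variable_ids)} variables available)",
        options=variable_ids,
        index=0,
    )
)
```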
46 changes: 27 additions & 19 deletions etl/helpers.py
@@ -748,29 +748,37 @@ def print_tables_metadata_template(tables: List[Table], fields: Optional[List[st
for column in tb.columns:
dict_values = {}
for field in fields:
value = getattr(tb[column].metadata, field) or ""

# Add some simple rules to simplify some common cases.

# If title is empty, or if title is underscore (probably because it is taken from the column name),
# create a custom title.
if (field == "title") and ((value == "") or ("_" in value)):
value = column.capitalize().replace("_", " ")

# If unit or short_unit is empty, and the column name contains 'pct', set it to '%'.
if (value == "") and (field in ["unit", "short_unit"]) and "pct" in column:
value = "%"

if field == "processing_level":
# Assume a minor processing level (it will be manually overwritten, if needed).
value = "minor"

dict_values[field] = value
if field.startswith("presentation"):
field = field.replace("presentation.", "")
value = getattr(tb[column].metadata.presentation, field) or ""
if "presentation" not in dict_values:
dict_values["presentation"] = {}
dict_values["presentation"][field] = value
else:
value = getattr(tb[column].metadata, field) or ""

# Add some simple rules to simplify some common cases.

# If title is empty, or if title is underscore (probably because it is taken from the column name),
# create a custom title.
if (field == "title") and ((value == "") or ("_" in value)):
value = column.capitalize().replace("_", " ")

# If unit or short_unit is empty, and the column name contains 'pct', set it to '%'.
if (value == "") and (field in ["unit", "short_unit"]) and "pct" in column:
value = "%"

if field == "processing_level":
# Assume a minor processing level (it will be manually overwritten, if needed).
value = "minor"

dict_values[field] = value
dict_variables[column] = dict_values
dict_tables[tb.metadata.short_name] = {"variables": dict_variables}
dict_output = {"tables": dict_tables}

print(yaml.dump(dict_output, default_flow_style=False, sort_keys=False))
# print(yaml.dump(dict_output, default_flow_style=False, sort_keys=False))
print(yaml.dump(dict_output, default_flow_style=False, sort_keys=False, width=float("inf")))


@contextmanager
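This change teaches print_tables_metadata_template to accept fields prefixed with "presentation." and nest them under a presentation key in the generated YAML, matching the structure of garden meta.yml files, and to dump the YAML with unlimited line width so long descriptions are not folded. A toy reproduction of the nesting logic, using a plain dict instead of owid.catalog metadata objects (the column name and field values are illustrative):

```python
import yaml

# Stand-in for tb[column].metadata and tb[column].metadata.presentation.
column = "share_of_global_production_pct"
metadata = {
    "title": "",                                        # empty, so a title is derived from the column name
    "unit": "",                                         # empty and column contains "pct", so it becomes "%"
    "presentation.topic_tags": ["Metals & Minerals"],
}

dict_values = {}
for field, value in metadata.items():
    value = value or ""
    if field.startswith("presentation."):
        # Nest presentation.* fields under a "presentation" mapping.
        dict_values.setdefault("presentation", {})[field.replace("presentation.", "")] = value
        continue
    if field == "title" and (value == "" or "_" in value):
        value = column.capitalize().replace("_", " ")
    if value == "" and field in ("unit", "short_unit") and "pct" in column:
        value = "%"
    dict_values[field] = value

output = {"tables": {"my_table": {"variables": {column: dict_values}}}}
# width=float("inf") keeps long strings on a single line instead of wrapping them.
print(yaml.dump(output, default_flow_style=False, sort_keys=False, width=float("inf")))
```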
632 changes: 632 additions & 0 deletions etl/steps/data/garden/bgs/2024-07-09/notes_edited.yml

Large diffs are not rendered by default.

1,057 changes: 1,057 additions & 0 deletions etl/steps/data/garden/bgs/2024-07-09/notes_original.yml

Large diffs are not rendered by default.

@@ -4,7 +4,6 @@ definitions:
short_unit: t
processing_level: major
presentation:
attribution_short: BGS
topic_tags:
- Energy

@@ -16,10 +15,6 @@ tables:
world_mineral_statistics:
title: World Mineral Statistics
variables:
exports:
title: Exports of commodities
imports:
title: Imports of commodities
production:
title: Production of commodities
world_mineral_statistics_flat:
133 changes: 77 additions & 56 deletions etl/steps/data/garden/bgs/2024-07-09/world_mineral_statistics.py
@@ -1,7 +1,6 @@
"""Load a meadow dataset and create a garden dataset."""

import ast
import json
from typing import Dict, List

import owid.catalog.processing as pr
@@ -10,6 +9,7 @@
from tqdm.auto import tqdm

from etl.data_helpers import geo
from etl.files import ruamel_load
from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
@@ -115,9 +115,11 @@
("Cement", "Portland cement"): None,
("Cement clinker", "Cement, clinker"): None,
("Cement, finished", "Cement, finished"): None,
("Chromium", "Metal"): ("Chromium", "Refinery"),
# NOTE: The following has only imports/exports data.
("Chromium", "Metal"): None,
# NOTE: The following has only imports/exports data.
("Chromium", "Ores & concentrates"): None,
("Chromium ores and concentrates", "Unknown"): None,
("Chromium ores and concentrates", "Unknown"): ("Chromium", "Mine"),
# NOTE: All subcommodities of coal production will be summed up into one.
("Coal", "Anthracite"): ("Coal", "Mine, anthracite"),
("Coal", "Anthracite & Bituminous"): ("Coal", "Mine, anthracite & Bituminous"),
@@ -342,11 +344,13 @@
("Magnesite and magnesia", "Unknown"): None,
("Magnesium metal, primary", "Unknown"): ("Magnesium metal", "Smelter"),
("Manganese", "Metal"): ("Manganese", "Refinery"),
("Manganese", "Ores & Concentrates"): ("Manganese", "Mine, ores & concentrates"),
# NOTE: The following could be mapped to ("Manganese", "Mine, ores & concentrates"), but we decided to discard it.
("Manganese", "Ores & Concentrates"): None,
("Manganese ore", "Chemical"): None,
("Manganese ore", "Manganese ore (ferruginous)"): None,
("Manganese ore", "Metallurgical"): None,
("Manganese ore", "Unknown"): ("Manganese", "Mine, ores & concentrates"),
# NOTE: The following could be mapped to ("Manganese", "Mine, ores & concentrates"), but we decided to discard it.
("Manganese ore", "Unknown"): None,
("Mercury", "Unknown"): ("Mercury", "Mine"),
# NOTE: All mica data below is very sparse. Several have data until 1980, or 2002.
# The sub-commodity with the largest numbers is "Unknown", so it's not clear what it means.
@@ -708,8 +712,8 @@
"production|Potash|Unspecified|tonnes": "Values are reported as tonnes of potassium oxide content.",
"production|Potash|Mine, polyhalite|tonnes": "Values are reported as tonnes of potassium oxide content.",
"production|Potash|Mine, potassic salts|tonnes": "Values are reported as tonnes of potassium oxide content.",
"imports|Potash|Mine, chloride|tonnes": "Values are reported as tonnes of potassium oxide content.",
"exports|Potash|Mine, chloride|tonnes": "Values are reported as tonnes of potassium oxide content.",
# "imports|Potash|Mine, chloride|tonnes": "Values are reported as tonnes of potassium oxide content.",
# "exports|Potash|Mine, chloride|tonnes": "Values are reported as tonnes of potassium oxide content.",
"production|Platinum group metals|Mine, iridium|tonnes": "Values are reported as tonnes of metal content.",
"production|Platinum group metals|Mine, other|tonnes": "Values are reported as tonnes of metal content.",
"production|Platinum group metals|Mine, palladium|tonnes": "Values are reported as tonnes of metal content.",
@@ -729,13 +733,13 @@
ACCEPTED_OVERLAPS = [
# {1991: {"USSR", "Armenia"}},
# {1991: {"USSR", "Belarus"}},
{1991: {"USSR", "Russia"}},
# {1991: {"USSR", "Russia"}},
{1992: {"Czechia", "Czechoslovakia"}},
{1992: {"Slovakia", "Czechoslovakia"}},
{1990: {"Germany", "East Germany"}},
{1990: {"Germany", "West Germany"}},
# {1990: {"Germany", "East Germany"}},
# {1990: {"Germany", "West Germany"}},
# {2010: {"Netherlands Antilles", "Bonaire Sint Eustatius and Saba"}},
{1990: {"Yemen", "Yemen People's Republic"}},
# {1990: {"Yemen", "Yemen People's Republic"}},
]


@@ -926,7 +930,9 @@ def clean_notes(notes):
return notes_clean


def gather_notes(tb: Table, notes_columns: List[str]) -> Dict[str, str]:
def gather_notes(
tb: Table, notes_columns: List[str], notes_original: Dict[str, List[str]], notes_edited: Dict[str, List[str]]
) -> Dict[str, str]:
# Create another table with the same structure, but containing notes.
tb_flat_notes = tb.pivot(
index=["country", "year"],
@@ -940,7 +946,7 @@ def gather_notes(tb: Table, notes_columns: List[str]) -> Dict[str, str]:

# Gather all notes for each column.
notes_dict = {}
for column in tqdm(tb_flat_notes.drop(columns=["country", "year"]).columns):
for column in tqdm(tb_flat_notes.drop(columns=["country", "year"]).columns, disable=True):
_notes = tb_flat_notes[column].dropna().tolist()
if len(_notes) > 0:
# Gather all notes for this column.
@@ -949,10 +955,24 @@
notes = pd.unique(pd.Series(notes)).tolist()
# Join notes.
if len(notes) > 0:
notes_str = "- " + "\n- ".join(notes)
notes_dict[column] = notes_str
notes_dict[column] = notes

return notes_dict
# Check that the notes coincide with the original notes stored in an adjacent file.
error = "Original BGS notes and footnotes have changed."
assert notes_dict == notes_original, error
# To update the original notes:
# from etl.files import ruamel_dump
# (paths.directory / "notes_original.yml").write_text(ruamel_dump(notes_original))

# Load the edited notes, which will overwrite the original notes.
notes_dict.update(notes_edited)

# Join all notes into one string, separated by line breaks.
notes_str_dict = {}
for column, notes in notes_dict.items():
notes_str_dict[column] = "- " + "\n- ".join(notes)

return notes_str_dict
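
The reworked gather_notes returns each column's notes as a list, checks those lists against the notes_original.yml snapshot stored next to the step, overlays the manually curated notes_edited.yml, and only then joins each list into a bulleted string. A minimal sketch of that overlay-and-join step, using PyYAML in place of the repository's ruamel_load/ruamel_dump helpers and made-up column names:

```python
import yaml
from pathlib import Path

# Hypothetical notes gathered from the BGS table, keyed by flat column name.
notes_dict = {
    "production|Chromium|Mine|tonnes": ["Note A.", "Note B."],
    "production|Mercury|Mine|tonnes": ["Note C."],
}

# Sanity check: the gathered notes must match the snapshot committed alongside the step.
notes_original = yaml.safe_load(Path("notes_original.yml").read_text())
assert notes_dict == notes_original, "Original BGS notes and footnotes have changed."
# If they legitimately changed, dump the freshly gathered notes back to notes_original.yml
# (the step hints at ruamel_dump for this) and review the resulting diff.

# Manually edited notes take precedence over the originals.
notes_edited = yaml.safe_load(Path("notes_edited.yml").read_text())
notes_dict.update(notes_edited)

# Join each column's notes into a single markdown-style bullet list.
notes_str_dict = {column: "- " + "\n- ".join(notes) for column, notes in notes_dict.items()}
```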


def add_global_data(tb: Table, ds_regions: Dataset) -> Table:
@@ -986,27 +1006,27 @@ def add_global_data(tb: Table, ds_regions: Dataset) -> Table:
regions_to_remove = [region for region in regions if region != "World"]
tb = tb.loc[~tb["country"].isin(regions_to_remove)].reset_index(drop=True)

# We noticed that imports/exports data have:
# * Only data for European countries (and Turkey) from 2003 onwards. Check this:
regions = ds_regions["regions"]
europe = regions.loc[json.loads(regions[regions["name"] == "Europe"]["members"].item())]["name"].unique().tolist()
error = "Expected only European countries (including Turkey) imports/exports data after 2002."
assert set(tb[(tb["imports"].notnull()) & (tb["year"] > 2002)]["country"]) <= (
set(europe) | set(["United Kingdom", "Turkey", "World"])
), error
assert set(tb[(tb["exports"].notnull()) & (tb["year"] > 2002)]["country"]) <= (
set(europe) | set(["United Kingdom", "Turkey", "World"])
), error
# * Only UK data from 2019 onwards. Check this:
error = "Expected only UK imports/exports data after 2018."
assert set(tb[(tb["imports"].notnull()) & (tb["year"] > 2018)]["country"]) == set(
["United Kingdom", "World"]
), error
assert set(tb[(tb["exports"].notnull()) & (tb["year"] > 2018)]["country"]) == set(
["United Kingdom", "World"]
), error
# Therefore, it only makes sense to have global imports/exports data until 2002.
tb.loc[(tb["year"] > 2002) & (tb["country"] == "World"), ["imports", "exports"]] = None
# # We noticed that imports/exports data have:
# # * Only data for European countries (and Turkey) from 2003 onwards. Check this:
# regions = ds_regions["regions"]
# europe = regions.loc[json.loads(regions[regions["name"] == "Europe"]["members"].item())]["name"].unique().tolist()
# error = "Expected only European countries (including Turkey) imports/exports data after 2002."
# assert set(tb[(tb["imports"].notnull()) & (tb["year"] > 2002)]["country"]) <= (
# set(europe) | set(["United Kingdom", "Turkey", "World"])
# ), error
# assert set(tb[(tb["exports"].notnull()) & (tb["year"] > 2002)]["country"]) <= (
# set(europe) | set(["United Kingdom", "Turkey", "World"])
# ), error
# # * Only UK data from 2019 onwards. Check this:
# error = "Expected only UK imports/exports data after 2018."
# assert set(tb[(tb["imports"].notnull()) & (tb["year"] > 2018)]["country"]) == set(
# ["United Kingdom", "World"]
# ), error
# assert set(tb[(tb["exports"].notnull()) & (tb["year"] > 2018)]["country"]) == set(
# ["United Kingdom", "World"]
# ), error
# # Therefore, it only makes sense to have global imports/exports data until 2002.
# tb.loc[(tb["year"] > 2002) & (tb["country"] == "World"), ["imports", "exports"]] = None

return tb

@@ -1117,6 +1137,13 @@ def run(dest_dir: str) -> None:
# Load regions dataset.
ds_regions = paths.load_dataset("regions")

# Load the adjacent file containing the original BGS notes and footnotes for each data column.
# NOTE: This file is loaded as a sanity check, in case the notes change in a later update.
notes_original = ruamel_load(paths.directory / "notes_original.yml")

# Load the adjacent file containing the edited notes.
notes_edited = ruamel_load(paths.directory / "notes_edited.yml")

#
# Process data.
#
@@ -1125,6 +1152,10 @@
df=tb, countries_file=paths.country_mapping_path, excluded_countries_file=paths.excluded_countries_path
)

# We decided to discard imports and exports data, since (as mentioned in other comments) it includes data for
# non-European countries only until 2002, and it causes many issues.
tb = tb[tb["category"] == "Production"].reset_index(drop=True)

# Remove data for regions that did not exist at the time.
tb = remove_data_from_non_existing_regions(tb=tb)

@@ -1171,45 +1202,40 @@
join_column_levels_with="_",
)
.underscore()
.rename(
columns={"value_production": "production", "value_imports": "imports", "value_exports": "exports"},
errors="raise",
)
.rename(columns={"value_production": "production"}, errors="raise")
)

# Set an appropriate format for value columns.
tb = tb.astype({column: "Float64" for column in ["production", "imports", "exports"]})
tb = tb.astype({column: "Float64" for column in ["production"]})

# Parse notes as lists of strings.
for column in [
"note_production",
"note_imports",
"note_exports",
"general_notes_production",
"general_notes_imports",
"general_notes_exports",
]:
tb[column] = tb[column].fillna("[]").apply(ast.literal_eval)

# Add global data.
tb = add_global_data(tb=tb, ds_regions=ds_regions)

# Clean notes columns, and combine notes at the individual row level with general table notes.
for category in ["exports", "imports", "production"]:
for category in ["production"]:
tb[f"notes_{category}"] = [
clean_notes(note) for note in tb[f"note_{category}"] + tb[f"general_notes_{category}"]
]
# Drop unnecessary columns.
tb = tb.drop(columns=[f"note_{category}", f"general_notes_{category}"])

# Gather all notes in a dictionary.
notes = gather_notes(tb, notes_columns=["notes_exports", "notes_imports", "notes_production"])
notes = gather_notes(
tb, notes_columns=["notes_production"], notes_original=notes_original, notes_edited=notes_edited
)

# Create a wide table.
tb_flat = tb.pivot(
index=["country", "year"],
columns=["commodity", "sub_commodity", "unit"],
values=["exports", "imports", "production"],
values=["production"],
join_column_levels_with="|",
)

@@ -1223,7 +1249,7 @@
tb_flat[column].metadata.description_from_producer = "Notes found in original BGS data:\n" + notes[column]

# To avoid ETL failing when storing the table, convert lists of notes to strings (and add metadata).
for column in ["notes_imports", "notes_exports", "notes_production"]:
for column in ["notes_production"]:
tb[column] = tb[column].copy_metadata(tb["production"]).astype(str)

# Add footnotes.
@@ -1236,12 +1262,7 @@
tb_flat = tb_flat.dropna(axis=1, how="all").reset_index(drop=True)

# Format table conveniently.
# NOTE: All commodities have the same unit for imports, exports and production except one:
# Potash Chloride uses "tonnes" for imports and exports, and "tonnes of K20 content" (which is also misspelled).
# Due to this, the index cannot simply be "country", "year", "commodity", "sub_commodity"; we need also "unit".
# counts = tb.groupby(["commodity", "sub_commodity", "country", "year"], observed=True, as_index=False).nunique()
# counts[counts["unit"] > 1][["commodity", "sub_commodity"]].drop_duplicates()
tb = tb.format(["country", "year", "commodity", "sub_commodity", "unit"])
tb = tb.format(["country", "year", "commodity", "sub_commodity"])
tb_flat = tb_flat.format(["country", "year"], short_name=paths.short_name + "_flat")
tb_flat = tb_flat.astype({column: "Float64" for column in tb_flat.columns})

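Throughout this step, flat-table columns are named like "production|Potash|Mine, chloride|tonnes": the pivot in run() joins the column levels (value name, commodity, sub-commodity, unit) with "|" via owid.catalog's join_column_levels_with argument. A rough plain-pandas approximation of how those names arise, flattening the MultiIndex by hand on a hypothetical slice of data:

```python
import pandas as pd

# Hypothetical long-format slice of the BGS production data.
df = pd.DataFrame(
    {
        "country": ["World", "World"],
        "year": [2000, 2000],
        "commodity": ["Potash", "Chromium"],
        "sub_commodity": ["Mine, chloride", "Mine"],
        "unit": ["tonnes", "tonnes"],
        "production": [1000.0, 500.0],
    }
)

wide = df.pivot(
    index=["country", "year"],
    columns=["commodity", "sub_commodity", "unit"],
    values=["production"],
)

# Join the column levels with "|", mimicking join_column_levels_with="|".
wide.columns = ["|".join(col) for col in wide.columns]
wide = wide.reset_index()
print(wide.columns.tolist())
# Columns now include 'production|Potash|Mine, chloride|tonnes' and 'production|Chromium|Mine|tonnes'.
```

Since imports and exports were dropped, "production" is the only value level left, which is also why the unit no longer needs to be part of the long table's index (country, year, commodity, sub_commodity) at the end of run().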